From db02ddcc903201ad0f5c9297028b794f63d10252 Mon Sep 17 00:00:00 2001
From: Hisayori Noda <nodchip@gmail.com>
Date: Sun, 9 Jun 2019 10:40:12 +0900
Subject: [PATCH 001/583] Added files for NNUE.

---
 .../nnue/architectures/halfkp_256x2-32-32.h   |  35 +
 src/eval/nnue/architectures/k-p_256x2-32-32.h |  35 +
 src/eval/nnue/evaluate_nnue.cpp               | 314 ++++++
 src/eval/nnue/evaluate_nnue.h                 |  64 ++
 src/eval/nnue/evaluate_nnue_learner.cpp       | 230 +++++
 src/eval/nnue/evaluate_nnue_learner.h         |  48 +
 src/eval/nnue/features/feature_set.h          | 249 +++++
 src/eval/nnue/features/features_common.h      |  47 +
 src/eval/nnue/features/half_kp.cpp            |  78 ++
 src/eval/nnue/features/half_kp.h              |  62 ++
 src/eval/nnue/features/half_relative_kp.cpp   |  91 ++
 src/eval/nnue/features/half_relative_kp.h     |  68 ++
 src/eval/nnue/features/index_list.h           |  55 ++
 src/eval/nnue/features/k.cpp                  |  49 +
 src/eval/nnue/features/k.h                    |  48 +
 src/eval/nnue/features/p.cpp                  |  46 +
 src/eval/nnue/features/p.h                    |  48 +
 src/eval/nnue/layers/affine_transform.h       | 170 ++++
 src/eval/nnue/layers/clipped_relu.h           | 140 +++
 src/eval/nnue/layers/input_slice.h            |  74 ++
 src/eval/nnue/layers/sum.h                    | 165 ++++
 src/eval/nnue/nnue_accumulator.h              |  32 +
 src/eval/nnue/nnue_architecture.h             |  36 +
 src/eval/nnue/nnue_common.h                   |  54 ++
 src/eval/nnue/nnue_feature_transformer.h      | 323 +++++++
 src/eval/nnue/nnue_test_command.cpp           | 196 ++++
 src/eval/nnue/nnue_test_command.h             |  23 +
 src/eval/nnue/trainer/features/factorizer.h   | 112 +++
 .../trainer/features/factorizer_feature_set.h | 106 ++
 .../trainer/features/factorizer_half_kp.h     | 105 ++
 src/eval/nnue/trainer/trainer.h               | 127 +++
 .../nnue/trainer/trainer_affine_transform.h   | 303 ++++++
 src/eval/nnue/trainer/trainer_clipped_relu.h  | 144 +++
 .../trainer/trainer_feature_transformer.h     | 379 ++++++++
 src/eval/nnue/trainer/trainer_input_slice.h   | 253 +++++
 src/eval/nnue/trainer/trainer_sum.h           | 192 ++++
 src/evaluate.cpp                              | 909 ------------------
 37 files changed, 4501 insertions(+), 909 deletions(-)
 create mode 100644 src/eval/nnue/architectures/halfkp_256x2-32-32.h
 create mode 100644 src/eval/nnue/architectures/k-p_256x2-32-32.h
 create mode 100644 src/eval/nnue/evaluate_nnue.cpp
 create mode 100644 src/eval/nnue/evaluate_nnue.h
 create mode 100644 src/eval/nnue/evaluate_nnue_learner.cpp
 create mode 100644 src/eval/nnue/evaluate_nnue_learner.h
 create mode 100644 src/eval/nnue/features/feature_set.h
 create mode 100644 src/eval/nnue/features/features_common.h
 create mode 100644 src/eval/nnue/features/half_kp.cpp
 create mode 100644 src/eval/nnue/features/half_kp.h
 create mode 100644 src/eval/nnue/features/half_relative_kp.cpp
 create mode 100644 src/eval/nnue/features/half_relative_kp.h
 create mode 100644 src/eval/nnue/features/index_list.h
 create mode 100644 src/eval/nnue/features/k.cpp
 create mode 100644 src/eval/nnue/features/k.h
 create mode 100644 src/eval/nnue/features/p.cpp
 create mode 100644 src/eval/nnue/features/p.h
 create mode 100644 src/eval/nnue/layers/affine_transform.h
 create mode 100644 src/eval/nnue/layers/clipped_relu.h
 create mode 100644 src/eval/nnue/layers/input_slice.h
 create mode 100644 src/eval/nnue/layers/sum.h
 create mode 100644 src/eval/nnue/nnue_accumulator.h
 create mode 100644 src/eval/nnue/nnue_architecture.h
 create mode 100644 src/eval/nnue/nnue_common.h
 create mode 100644 src/eval/nnue/nnue_feature_transformer.h
 create mode 100644 src/eval/nnue/nnue_test_command.cpp
 create mode 100644 src/eval/nnue/nnue_test_command.h
 create mode 100644 src/eval/nnue/trainer/features/factorizer.h
 create mode 100644 src/eval/nnue/trainer/features/factorizer_feature_set.h
 create mode 100644 src/eval/nnue/trainer/features/factorizer_half_kp.h
 create mode 100644 src/eval/nnue/trainer/trainer.h
 create mode 100644 src/eval/nnue/trainer/trainer_affine_transform.h
 create mode 100644 src/eval/nnue/trainer/trainer_clipped_relu.h
 create mode 100644 src/eval/nnue/trainer/trainer_feature_transformer.h
 create mode 100644 src/eval/nnue/trainer/trainer_input_slice.h
 create mode 100644 src/eval/nnue/trainer/trainer_sum.h
 delete mode 100644 src/evaluate.cpp

diff --git a/src/eval/nnue/architectures/halfkp_256x2-32-32.h b/src/eval/nnue/architectures/halfkp_256x2-32-32.h
new file mode 100644
index 00000000..9b25ee54
--- /dev/null
+++ b/src/eval/nnue/architectures/halfkp_256x2-32-32.h
@@ -0,0 +1,35 @@
+﻿// NNUE評価関数で用いる入力特徴量とネットワーク構造の定義
+
+#include "../features/feature_set.h"
+#include "../features/half_kp.h"
+
+#include "../layers/input_slice.h"
+#include "../layers/affine_transform.h"
+#include "../layers/clipped_relu.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+// 評価関数で用いる入力特徴量
+using RawFeatures = Features::FeatureSet<
+    Features::HalfKP<Features::Side::kFriend>>;
+
+// 変換後の入力特徴量の次元数
+constexpr IndexType kTransformedFeatureDimensions = 256;
+
+namespace Layers {
+
+// ネットワーク構造の定義
+using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+
+}  // namespace Layers
+
+using Network = Layers::OutputLayer;
+
+}  // namespace NNUE
+
+}  // namespace Eval
diff --git a/src/eval/nnue/architectures/k-p_256x2-32-32.h b/src/eval/nnue/architectures/k-p_256x2-32-32.h
new file mode 100644
index 00000000..b77aeaa6
--- /dev/null
+++ b/src/eval/nnue/architectures/k-p_256x2-32-32.h
@@ -0,0 +1,35 @@
+﻿// NNUE評価関数で用いる入力特徴量とネットワーク構造の定義
+
+#include "../features/feature_set.h"
+#include "../features/k.h"
+#include "../features/p.h"
+
+#include "../layers/input_slice.h"
+#include "../layers/affine_transform.h"
+#include "../layers/clipped_relu.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+// 評価関数で用いる入力特徴量
+using RawFeatures = Features::FeatureSet<Features::K, Features::P>;
+
+// 変換後の入力特徴量の次元数
+constexpr IndexType kTransformedFeatureDimensions = 256;
+
+namespace Layers {
+
+// ネットワーク構造の定義
+using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+
+}  // namespace Layers
+
+using Network = Layers::OutputLayer;
+
+}  // namespace NNUE
+
+}  // namespace Eval
diff --git a/src/eval/nnue/evaluate_nnue.cpp b/src/eval/nnue/evaluate_nnue.cpp
new file mode 100644
index 00000000..84707bf9
--- /dev/null
+++ b/src/eval/nnue/evaluate_nnue.cpp
@@ -0,0 +1,314 @@
+﻿// NNUE評価関数の計算に関するコード
+
+#include <fstream>
+
+#include "../../evaluate.h"
+#include "../../position.h"
+#include "../../misc.h"
+
+#include "evaluate_nnue.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+// 入力特徴量変換器
+AlignedPtr<FeatureTransformer> feature_transformer;
+
+// 評価関数
+AlignedPtr<Network> network;
+
+// 評価関数ファイル名
+const char* const kFileName = "nn.bin";
+
+// 評価関数の構造を表す文字列を取得する
+std::string GetArchitectureString() {
+  return "Features=" + FeatureTransformer::GetStructureString() +
+      ",Network=" + Network::GetStructureString();
+}
+
+namespace {
+
+namespace Detail {
+
+// 評価関数パラメータを初期化する
+template <typename T>
+void Initialize(AlignedPtr<T>& pointer) {
+  pointer.reset(reinterpret_cast<T*>(aligned_malloc(sizeof(T), alignof(T))));
+  std::memset(pointer.get(), 0, sizeof(T));
+}
+
+// 評価関数パラメータを読み込む
+template <typename T>
+bool ReadParameters(std::istream& stream, const AlignedPtr<T>& pointer) {
+  std::uint32_t header;
+  stream.read(reinterpret_cast<char*>(&header), sizeof(header));
+  if (!stream || header != T::GetHashValue()) return false;
+  return pointer->ReadParameters(stream);
+}
+
+// 評価関数パラメータを書き込む
+template <typename T>
+bool WriteParameters(std::ostream& stream, const AlignedPtr<T>& pointer) {
+  constexpr std::uint32_t header = T::GetHashValue();
+  stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
+  return pointer->WriteParameters(stream);
+}
+
+}  // namespace Detail
+
+// 評価関数パラメータを初期化する
+void Initialize() {
+  Detail::Initialize(feature_transformer);
+  Detail::Initialize(network);
+}
+
+}  // namespace
+
+// ヘッダを読み込む
+bool ReadHeader(std::istream& stream,
+  std::uint32_t* hash_value, std::string* architecture) {
+  std::uint32_t version, size;
+  stream.read(reinterpret_cast<char*>(&version), sizeof(version));
+  stream.read(reinterpret_cast<char*>(hash_value), sizeof(*hash_value));
+  stream.read(reinterpret_cast<char*>(&size), sizeof(size));
+  if (!stream || version != kVersion) return false;
+  architecture->resize(size);
+  stream.read(&(*architecture)[0], size);
+  return !stream.fail();
+}
+
+// ヘッダを書き込む
+bool WriteHeader(std::ostream& stream,
+  std::uint32_t hash_value, const std::string& architecture) {
+  stream.write(reinterpret_cast<const char*>(&kVersion), sizeof(kVersion));
+  stream.write(reinterpret_cast<const char*>(&hash_value), sizeof(hash_value));
+  const std::uint32_t size = static_cast<std::uint32_t>(architecture.size());
+  stream.write(reinterpret_cast<const char*>(&size), sizeof(size));
+  stream.write(architecture.data(), size);
+  return !stream.fail();
+}
+
+// 評価関数パラメータを読み込む
+bool ReadParameters(std::istream& stream) {
+  std::uint32_t hash_value;
+  std::string architecture;
+  if (!ReadHeader(stream, &hash_value, &architecture)) return false;
+  if (hash_value != kHashValue) return false;
+  if (!Detail::ReadParameters(stream, feature_transformer)) return false;
+  if (!Detail::ReadParameters(stream, network)) return false;
+  return stream && stream.peek() == std::ios::traits_type::eof();
+}
+
+// 評価関数パラメータを書き込む
+bool WriteParameters(std::ostream& stream) {
+  if (!WriteHeader(stream, kHashValue, GetArchitectureString())) return false;
+  if (!Detail::WriteParameters(stream, feature_transformer)) return false;
+  if (!Detail::WriteParameters(stream, network)) return false;
+  return !stream.fail();
+}
+
+// 差分計算ができるなら進める
+static void UpdateAccumulatorIfPossible(const Position& pos) {
+  feature_transformer->UpdateAccumulatorIfPossible(pos);
+}
+
+// 評価値を計算する
+static Value ComputeScore(const Position& pos, bool refresh = false) {
+  auto& accumulator = pos.state()->accumulator;
+  if (!refresh && accumulator.computed_score) {
+    return accumulator.score;
+  }
+
+  alignas(kCacheLineSize) TransformedFeatureType
+      transformed_features[FeatureTransformer::kBufferSize];
+  feature_transformer->Transform(pos, transformed_features, refresh);
+  alignas(kCacheLineSize) char buffer[Network::kBufferSize];
+  const auto output = network->Propagate(transformed_features, buffer);
+
+  // VALUE_MAX_EVALより大きな値が返ってくるとaspiration searchがfail highして
+  // 探索が終わらなくなるのでVALUE_MAX_EVAL以下であることを保証すべき。
+
+  // この現象が起きても、対局時に秒固定などだとそこで探索が打ち切られるので、
+  // 1つ前のiterationのときの最善手がbestmoveとして指されるので見かけ上、
+  // 問題ない。このVALUE_MAX_EVALが返ってくるような状況は、ほぼ詰みの局面であり、
+  // そのような詰みの局面が出現するのは終盤で形勢に大差がついていることが多いので
+  // 勝敗にはあまり影響しない。
+
+  // しかし、教師生成時などdepth固定で探索するときに探索から戻ってこなくなるので
+  // そのスレッドの計算時間を無駄にする。またdepth固定対局でtime-outするようになる。
+
+  auto score = static_cast<Value>(output[0] / FV_SCALE);
+
+  // 1) ここ、下手にclipすると学習時には影響があるような気もするが…。
+  // 2) accumulator.scoreは、差分計算の時に用いないので書き換えて問題ない。
+  score = Math::clamp(score , -VALUE_MAX_EVAL , VALUE_MAX_EVAL);
+
+  accumulator.score = score;
+  accumulator.computed_score = true;
+  return accumulator.score;
+}
+
+}  // namespace NNUE
+
+#if defined(USE_EVAL_HASH)
+// HashTableに評価値を保存するために利用するクラス
+struct alignas(16) ScoreKeyValue {
+#if defined(USE_SSE2)
+  ScoreKeyValue() = default;
+  ScoreKeyValue(const ScoreKeyValue& other) {
+    static_assert(sizeof(ScoreKeyValue) == sizeof(__m128i),
+                  "sizeof(ScoreKeyValue) should be equal to sizeof(__m128i)");
+    _mm_store_si128(&as_m128i, other.as_m128i);
+  }
+  ScoreKeyValue& operator=(const ScoreKeyValue& other) {
+    _mm_store_si128(&as_m128i, other.as_m128i);
+    return *this;
+  }
+#endif
+
+  // evaluate hashでatomicに操作できる必要があるのでそのための操作子
+  void encode() {
+#if defined(USE_SSE2)
+    // ScoreKeyValue は atomic にコピーされるので key が合っていればデータも合っている。
+#else
+    key ^= score;
+#endif
+  }
+  // decode()はencode()の逆変換だが、xorなので逆変換も同じ変換。
+  void decode() { encode(); }
+
+  union {
+    struct {
+      std::uint64_t key;
+      std::uint64_t score;
+    };
+#if defined(USE_SSE2)
+    __m128i as_m128i;
+#endif
+  };
+};
+
+// シンプルなHashTableの実装。
+// Sizeは2のべき乗。
+template <typename T, size_t Size>
+struct HashTable {
+  HashTable() { clear(); }
+  T* operator [] (const Key k) { return entries_ + (static_cast<size_t>(k) & (Size - 1)); }
+  void clear() { memset(entries_, 0, sizeof(T)*Size); }
+
+  // Size が 2のべき乗であることのチェック
+  static_assert((Size & (Size - 1)) == 0, "");
+
+ private:
+  T entries_[Size];
+};
+
+// evaluateしたものを保存しておくHashTable(俗にいうehash)
+
+#if !defined(USE_LARGE_EVAL_HASH)
+// 134MB(魔女のAVX2以外の時の設定)
+struct EvaluateHashTable : HashTable<ScoreKeyValue, 0x800000> {};
+#else
+// prefetch有りなら大きいほうが良いのでは…。
+// →　あまり変わらないし、メモリもったいないのでデフォルトでは↑の設定で良いか…。
+// 1GB(魔女のAVX2の時の設定)
+struct EvaluateHashTable : HashTable<ScoreKeyValue, 0x4000000> {};
+#endif
+
+EvaluateHashTable g_evalTable;
+
+// prefetchする関数も用意しておく。
+void prefetch_evalhash(const Key key) {
+  constexpr auto mask = ~((u64)0x1f);
+  prefetch((void*)((u64)g_evalTable[key] & mask));
+}
+#endif
+
+// 評価関数ファイルを読み込む
+// benchコマンドなどでOptionsを保存して復元するのでこのときEvalDirが変更されたことになって、
+// 評価関数の再読込の必要があるというフラグを立てるため、この関数は2度呼び出されることがある。
+void load_eval() {
+  NNUE::Initialize();
+
+#if defined(EVAL_LEARN)
+  if (!Options["SkipLoadingEval"])
+#endif
+  {
+    const std::string dir_name = Options["EvalDir"];
+    const std::string file_name = Path::Combine(dir_name, NNUE::kFileName);
+    std::ifstream stream(file_name, std::ios::binary);
+    const bool result = NNUE::ReadParameters(stream);
+
+//    ASSERT(result);
+	if (!result)
+	{
+		// 読み込みエラーのとき終了してくれないと困る。
+		std::cout << "Error! : failed to read " << NNUE::kFileName << std::endl;
+		my_exit();
+	}
+  }
+}
+
+// 初期化
+void init() {
+}
+
+// 評価関数。差分計算ではなく全計算する。
+// Position::set()で一度だけ呼び出される。(以降は差分計算)
+// 手番側から見た評価値を返すので注意。(他の評価関数とは設計がこの点において異なる)
+// なので、この関数の最適化は頑張らない。
+Value compute_eval(const Position& pos) {
+  return NNUE::ComputeScore(pos, true);
+}
+
+// 評価関数
+Value evaluate(const Position& pos) {
+  const auto& accumulator = pos.state()->accumulator;
+  if (accumulator.computed_score) {
+    return accumulator.score;
+  }
+
+#if defined(USE_GLOBAL_OPTIONS)
+  // GlobalOptionsでeval hashを用いない設定になっているなら
+  // eval hashへの照会をskipする。
+  if (!GlobalOptions.use_eval_hash) {
+    ASSERT_LV5(pos.state()->materialValue == Eval::material(pos));
+    return NNUE::ComputeScore(pos);
+  }
+#endif
+
+#if defined(USE_EVAL_HASH)
+  // evaluate hash tableにはあるかも。
+  const Key key = pos.state()->key();
+  ScoreKeyValue entry = *g_evalTable[key];
+  entry.decode();
+  if (entry.key == key) {
+    // あった！
+    return Value(entry.score);
+  }
+#endif
+
+  Value score = NNUE::ComputeScore(pos);
+#if defined(USE_EVAL_HASH)
+  // せっかく計算したのでevaluate hash tableに保存しておく。
+  entry.key = key;
+  entry.score = score;
+  entry.encode();
+  *g_evalTable[key] = entry;
+#endif
+
+  return score;
+}
+
+// 差分計算ができるなら進める
+void evaluate_with_no_return(const Position& pos) {
+  NNUE::UpdateAccumulatorIfPossible(pos);
+}
+
+// 現在の局面の評価値の内訳を表示する
+void print_eval_stat(Position& /*pos*/) {
+  std::cout << "--- EVAL STAT: not implemented" << std::endl;
+}
+
+}  // namespace Eval
diff --git a/src/eval/nnue/evaluate_nnue.h b/src/eval/nnue/evaluate_nnue.h
new file mode 100644
index 00000000..a95f2bd9
--- /dev/null
+++ b/src/eval/nnue/evaluate_nnue.h
@@ -0,0 +1,64 @@
+﻿// NNUE評価関数で用いるheader
+
+#ifndef _EVALUATE_NNUE_H_
+#define _EVALUATE_NNUE_H_
+
+#if defined(EVAL_NNUE)
+
+#include "nnue_feature_transformer.h"
+#include "nnue_architecture.h"
+
+#include <memory>
+
+namespace Eval {
+
+namespace NNUE {
+
+// 評価関数の構造のハッシュ値
+constexpr std::uint32_t kHashValue =
+    FeatureTransformer::GetHashValue() ^ Network::GetHashValue();
+
+// メモリ領域の解放を自動化するためのデリータ
+template <typename T>
+struct AlignedDeleter {
+  void operator()(T* ptr) const {
+    ptr->~T();
+    aligned_free(ptr);
+  }
+};
+template <typename T>
+using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>;
+
+// 入力特徴量変換器
+extern AlignedPtr<FeatureTransformer> feature_transformer;
+
+// 評価関数
+extern AlignedPtr<Network> network;
+
+// 評価関数ファイル名
+extern const char* const kFileName;
+
+// 評価関数の構造を表す文字列を取得する
+std::string GetArchitectureString();
+
+// ヘッダを読み込む
+bool ReadHeader(std::istream& stream,
+    std::uint32_t* hash_value, std::string* architecture);
+
+// ヘッダを書き込む
+bool WriteHeader(std::ostream& stream,
+    std::uint32_t hash_value, const std::string& architecture);
+
+// 評価関数パラメータを読み込む
+bool ReadParameters(std::istream& stream);
+
+// 評価関数パラメータを書き込む
+bool WriteParameters(std::ostream& stream);
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
diff --git a/src/eval/nnue/evaluate_nnue_learner.cpp b/src/eval/nnue/evaluate_nnue_learner.cpp
new file mode 100644
index 00000000..cd3ae72a
--- /dev/null
+++ b/src/eval/nnue/evaluate_nnue_learner.cpp
@@ -0,0 +1,230 @@
+﻿// NNUE評価関数の学習時用のコード
+
+#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#include <random>
+#include <fstream>
+
+#include "../../learn/learn.h"
+#include "../../learn/learning_tools.h"
+
+#include "../../position.h"
+#include "../../usi.h"
+#include "../../misc.h"
+
+#include "../evaluate_common.h"
+
+#include "evaluate_nnue.h"
+#include "evaluate_nnue_learner.h"
+#include "trainer/features/factorizer_feature_set.h"
+#include "trainer/features/factorizer_half_kp.h"
+#include "trainer/trainer_feature_transformer.h"
+#include "trainer/trainer_input_slice.h"
+#include "trainer/trainer_affine_transform.h"
+#include "trainer/trainer_clipped_relu.h"
+#include "trainer/trainer_sum.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace {
+
+// 学習データ
+std::vector<Example> examples;
+
+// examplesの排他制御をするMutex
+Mutex examples_mutex;
+
+// ミニバッチのサンプル数
+u64 batch_size;
+
+// 乱数生成器
+std::mt19937 rng;
+
+// 学習器
+std::shared_ptr<Trainer<Network>> trainer;
+
+// 学習率のスケール
+double global_learning_rate_scale;
+
+// 学習率のスケールを取得する
+double GetGlobalLearningRateScale() {
+  return global_learning_rate_scale;
+}
+
+// ハイパーパラメータなどのオプションを学習器に伝える
+void SendMessages(std::vector<Message> messages) {
+  for (auto& message : messages) {
+    trainer->SendMessage(&message);
+    ASSERT_LV3(message.num_receivers > 0);
+  }
+}
+
+}  // namespace
+
+// 学習の初期化を行う
+void InitializeTraining(double eta1, u64 eta1_epoch,
+                        double eta2, u64 eta2_epoch, double eta3) {
+  std::cout << "Initializing NN training for "
+            << GetArchitectureString() << std::endl;
+
+  ASSERT(feature_transformer);
+  ASSERT(network);
+  trainer = Trainer<Network>::Create(network.get(), feature_transformer.get());
+
+  if (Options["SkipLoadingEval"]) {
+    trainer->Initialize(rng);
+  }
+
+  global_learning_rate_scale = 1.0;
+  EvalLearningTools::Weight::init_eta(eta1, eta2, eta3, eta1_epoch, eta2_epoch);
+}
+
+// ミニバッチのサンプル数を設定する
+void SetBatchSize(u64 size) {
+  ASSERT_LV3(size > 0);
+  batch_size = size;
+}
+
+// 学習率のスケールを設定する
+void SetGlobalLearningRateScale(double scale) {
+  global_learning_rate_scale = scale;
+}
+
+// ハイパーパラメータなどのオプションを設定する
+void SetOptions(const std::string& options) {
+  std::vector<Message> messages;
+  for (const auto& option : Split(options, ',')) {
+    const auto fields = Split(option, '=');
+    ASSERT_LV3(fields.size() == 1 || fields.size() == 2);
+    if (fields.size() == 1) {
+      messages.emplace_back(fields[0]);
+    } else {
+      messages.emplace_back(fields[0], fields[1]);
+    }
+  }
+  SendMessages(std::move(messages));
+}
+
+// 学習用評価関数パラメータをファイルから読み直す
+void RestoreParameters(const std::string& dir_name) {
+  const std::string file_name = Path::Combine(dir_name, NNUE::kFileName);
+  std::ifstream stream(file_name, std::ios::binary);
+  bool result = ReadParameters(stream);
+  ASSERT(result);
+
+  SendMessages({{"reset"}});
+}
+
+// 学習データを1サンプル追加する
+void AddExample(Position& pos, Color rootColor,
+                const Learner::PackedSfenValue& psv, double weight) {
+  Example example;
+  if (rootColor == pos.side_to_move()) {
+    example.sign = 1;
+  } else {
+    example.sign = -1;
+  }
+  example.psv = psv;
+  example.weight = weight;
+
+  Features::IndexList active_indices[2];
+  for (const auto trigger : kRefreshTriggers) {
+    RawFeatures::AppendActiveIndices(pos, trigger, active_indices);
+  }
+  if (pos.side_to_move() != BLACK) {
+    active_indices[0].swap(active_indices[1]);
+  }
+  for (const auto color : COLOR) {
+    std::vector<TrainingFeature> training_features;
+    for (const auto base_index : active_indices[color]) {
+      static_assert(Features::Factorizer<RawFeatures>::GetDimensions() <
+                    (1 << TrainingFeature::kIndexBits), "");
+      Features::Factorizer<RawFeatures>::AppendTrainingFeatures(
+          base_index, &training_features);
+    }
+    std::sort(training_features.begin(), training_features.end());
+
+    auto& unique_features = example.training_features[color];
+    for (const auto& feature : training_features) {
+      if (!unique_features.empty() &&
+          feature.GetIndex() == unique_features.back().GetIndex()) {
+        unique_features.back() += feature;
+      } else {
+        unique_features.push_back(feature);
+      }
+    }
+  }
+
+  std::lock_guard<Mutex> lock(examples_mutex);
+  examples.push_back(std::move(example));
+}
+
+// 評価関数パラメーターを更新する
+void UpdateParameters(u64 epoch) {
+  ASSERT_LV3(batch_size > 0);
+
+  EvalLearningTools::Weight::calc_eta(epoch);
+  const auto learning_rate = static_cast<LearnFloatType>(
+      get_eta() / batch_size);
+
+  std::lock_guard<Mutex> lock(examples_mutex);
+  std::shuffle(examples.begin(), examples.end(), rng);
+  while (examples.size() >= batch_size) {
+    std::vector<Example> batch(examples.end() - batch_size, examples.end());
+    examples.resize(examples.size() - batch_size);
+
+    const auto network_output = trainer->Propagate(batch);
+
+    std::vector<LearnFloatType> gradients(batch.size());
+    for (std::size_t b = 0; b < batch.size(); ++b) {
+      const auto shallow = static_cast<Value>(Round<std::int32_t>(
+          batch[b].sign * network_output[b] * kPonanzaConstant));
+      const auto& psv = batch[b].psv;
+      const double gradient = batch[b].sign * Learner::calc_grad(shallow, psv);
+      gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
+    }
+
+    trainer->Backpropagate(gradients.data(), learning_rate);
+  }
+  SendMessages({{"quantize_parameters"}});
+}
+
+// 学習に問題が生じていないかチェックする
+void CheckHealth() {
+  SendMessages({{"check_health"}});
+}
+
+}  // namespace NNUE
+
+// 評価関数パラメーターをファイルに保存する
+void save_eval(std::string dir_name) {
+  auto eval_dir = Path::Combine(Options["EvalSaveDir"], dir_name);
+  std::cout << "save_eval() start. folder = " << eval_dir << std::endl;
+
+  // すでにこのフォルダがあるならmkdir()に失敗するが、
+  // 別にそれは構わない。なければ作って欲しいだけ。
+  // また、EvalSaveDirまでのフォルダは掘ってあるものとする。
+  Dependency::mkdir(eval_dir);
+
+  if (Options["SkipLoadingEval"] && NNUE::trainer) {
+    NNUE::SendMessages({{"clear_unobserved_feature_weights"}});
+  }
+
+  const std::string file_name = Path::Combine(eval_dir, NNUE::kFileName);
+  std::ofstream stream(file_name, std::ios::binary);
+  const bool result = NNUE::WriteParameters(stream);
+  ASSERT(result);
+
+  std::cout << "save_eval() finished. folder = " << eval_dir << std::endl;
+}
+
+// 現在のetaを取得する
+double get_eta() {
+  return NNUE::GetGlobalLearningRateScale() * EvalLearningTools::Weight::eta;
+}
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
diff --git a/src/eval/nnue/evaluate_nnue_learner.h b/src/eval/nnue/evaluate_nnue_learner.h
new file mode 100644
index 00000000..130ce376
--- /dev/null
+++ b/src/eval/nnue/evaluate_nnue_learner.h
@@ -0,0 +1,48 @@
+﻿// NNUE評価関数の学習で用いるインターフェイス
+
+#ifndef _EVALUATE_NNUE_LEARNER_H_
+#define _EVALUATE_NNUE_LEARNER_H_
+
+#include "../../config.h"
+
+#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#include "../../learn/learn.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+// 学習の初期化を行う
+void InitializeTraining(double eta1, u64 eta1_epoch,
+                        double eta2, u64 eta2_epoch, double eta3);
+
+// ミニバッチのサンプル数を設定する
+void SetBatchSize(u64 size);
+
+// 学習率のスケールを設定する
+void SetGlobalLearningRateScale(double scale);
+
+// ハイパーパラメータなどのオプションを設定する
+void SetOptions(const std::string& options);
+
+// 学習用評価関数パラメータをファイルから読み直す
+void RestoreParameters(const std::string& dir_name);
+
+// 学習データを1サンプル追加する
+void AddExample(Position& pos, Color rootColor,
+                const Learner::PackedSfenValue& psv, double weight);
+
+// 評価関数パラメータを更新する
+void UpdateParameters(u64 epoch);
+
+// 学習に問題が生じていないかチェックする
+void CheckHealth();
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#endif
diff --git a/src/eval/nnue/features/feature_set.h b/src/eval/nnue/features/feature_set.h
new file mode 100644
index 00000000..5d312a2e
--- /dev/null
+++ b/src/eval/nnue/features/feature_set.h
@@ -0,0 +1,249 @@
+﻿// NNUE評価関数の入力特徴量セットを表すクラステンプレート
+
+#ifndef _NNUE_FEATURE_SET_H_
+#define _NNUE_FEATURE_SET_H_
+
+#if defined(EVAL_NNUE)
+
+#include "features_common.h"
+#include <array>
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// 値のリストを表すクラステンプレート
+template <typename T, T... Values>
+struct CompileTimeList;
+template <typename T, T First, T... Remaining>
+struct CompileTimeList<T, First, Remaining...> {
+  static constexpr bool Contains(T value) {
+    return value == First || CompileTimeList<T, Remaining...>::Contains(value);
+  }
+  static constexpr std::array<T, sizeof...(Remaining) + 1>
+      kValues = {{First, Remaining...}};
+};
+template <typename T, T First, T... Remaining>
+constexpr std::array<T, sizeof...(Remaining) + 1>
+    CompileTimeList<T, First, Remaining...>::kValues;
+template <typename T>
+struct CompileTimeList<T> {
+  static constexpr bool Contains(T /*value*/) {
+    return false;
+  }
+  static constexpr std::array<T, 0> kValues = {{}};
+};
+
+// リストの先頭への追加を行うクラステンプレート
+template <typename T, typename ListType, T Value>
+struct AppendToList;
+template <typename T, T... Values, T AnotherValue>
+struct AppendToList<T, CompileTimeList<T, Values...>, AnotherValue> {
+  using Result = CompileTimeList<T, AnotherValue, Values...>;
+};
+
+// ソートされた重複のないリストへの追加を行うクラステンプレート
+template <typename T, typename ListType, T Value>
+struct InsertToSet;
+template <typename T, T First, T... Remaining, T AnotherValue>
+struct InsertToSet<T, CompileTimeList<T, First, Remaining...>, AnotherValue> {
+  using Result = std::conditional_t<
+      CompileTimeList<T, First, Remaining...>::Contains(AnotherValue),
+      CompileTimeList<T, First, Remaining...>,
+      std::conditional_t<(AnotherValue < First),
+          CompileTimeList<T, AnotherValue, First, Remaining...>,
+          typename AppendToList<T, typename InsertToSet<
+              T, CompileTimeList<T, Remaining...>, AnotherValue>::Result,
+              First>::Result>>;
+};
+template <typename T, T Value>
+struct InsertToSet<T, CompileTimeList<T>, Value> {
+  using Result = CompileTimeList<T, Value>;
+};
+
+// 特徴量セットの基底クラス
+template <typename Derived>
+class FeatureSetBase {
+ public:
+  // 特徴量のうち、値が1であるインデックスのリストを取得する
+  template <typename IndexListType>
+  static void AppendActiveIndices(
+      const Position& pos, TriggerEvent trigger, IndexListType active[2]) {
+    for (const auto perspective : COLOR) {
+      Derived::CollectActiveIndices(
+          pos, trigger, perspective, &active[perspective]);
+    }
+  }
+
+  // 特徴量のうち、一手前から値が変化したインデックスのリストを取得する
+  template <typename PositionType, typename IndexListType>
+  static void AppendChangedIndices(
+      const PositionType& pos, TriggerEvent trigger,
+      IndexListType removed[2], IndexListType added[2], bool reset[2]) {
+    const auto& dp = pos.state()->dirtyPiece;
+    if (dp.dirty_num == 0) return;
+
+    for (const auto perspective : COLOR) {
+      reset[perspective] = false;
+      switch (trigger) {
+        case TriggerEvent::kNone:
+          break;
+        case TriggerEvent::kFriendKingMoved:
+          reset[perspective] =
+              dp.pieceNo[0] == PIECE_NUMBER_KING + perspective;
+          break;
+        case TriggerEvent::kEnemyKingMoved:
+          reset[perspective] =
+              dp.pieceNo[0] == PIECE_NUMBER_KING + ~perspective;
+          break;
+        case TriggerEvent::kAnyKingMoved:
+          reset[perspective] = dp.pieceNo[0] >= PIECE_NUMBER_KING;
+          break;
+        case TriggerEvent::kAnyPieceMoved:
+          reset[perspective] = true;
+          break;
+        default:
+          ASSERT_LV5(false);
+          break;
+      }
+      if (reset[perspective]) {
+        Derived::CollectActiveIndices(
+            pos, trigger, perspective, &added[perspective]);
+      } else {
+        Derived::CollectChangedIndices(
+            pos, trigger, perspective,
+            &removed[perspective], &added[perspective]);
+      }
+    }
+  }
+};
+
+// 特徴量セットを表すクラステンプレート
+// 実行時の計算量を線形にするために、内部の処理はテンプレート引数の逆順に行う
+template <typename FirstFeatureType, typename... RemainingFeatureTypes>
+class FeatureSet<FirstFeatureType, RemainingFeatureTypes...> :
+    public FeatureSetBase<
+        FeatureSet<FirstFeatureType, RemainingFeatureTypes...>> {
+ private:
+  using Head = FirstFeatureType;
+  using Tail = FeatureSet<RemainingFeatureTypes...>;
+
+ public:
+  // 評価関数ファイルに埋め込むハッシュ値
+  static constexpr std::uint32_t kHashValue =
+      Head::kHashValue ^ (Tail::kHashValue << 1) ^ (Tail::kHashValue >> 31);
+  // 特徴量の次元数
+  static constexpr IndexType kDimensions =
+      Head::kDimensions + Tail::kDimensions;
+  // 特徴量のうち、同時に値が1となるインデックスの数の最大値
+  static constexpr IndexType kMaxActiveDimensions =
+      Head::kMaxActiveDimensions + Tail::kMaxActiveDimensions;
+  // 差分計算の代わりに全計算を行うタイミングのリスト
+  using SortedTriggerSet = typename InsertToSet<TriggerEvent,
+      typename Tail::SortedTriggerSet, Head::kRefreshTrigger>::Result;
+  static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
+
+  // 特徴量名を取得する
+  static std::string GetName() {
+    return std::string(Head::kName) + "+" + Tail::GetName();
+  }
+
+ private:
+  // 特徴量のうち、値が1であるインデックスのリストを取得する
+  template <typename IndexListType>
+  static void CollectActiveIndices(
+      const Position& pos, const TriggerEvent trigger, const Color perspective,
+      IndexListType* const active) {
+    Tail::CollectActiveIndices(pos, trigger, perspective, active);
+    if (Head::kRefreshTrigger == trigger) {
+      const auto start = active->size();
+      Head::AppendActiveIndices(pos, perspective, active);
+      for (auto i = start; i < active->size(); ++i) {
+        (*active)[i] += Tail::kDimensions;
+      }
+    }
+  }
+
+  // 特徴量のうち、一手前から値が変化したインデックスのリストを取得する
+  template <typename IndexListType>
+  static void CollectChangedIndices(
+      const Position& pos, const TriggerEvent trigger, const Color perspective,
+      IndexListType* const removed, IndexListType* const added) {
+    Tail::CollectChangedIndices(pos, trigger, perspective, removed, added);
+    if (Head::kRefreshTrigger == trigger) {
+      const auto start_removed = removed->size();
+      const auto start_added = added->size();
+      Head::AppendChangedIndices(pos, perspective, removed, added);
+      for (auto i = start_removed; i < removed->size(); ++i) {
+        (*removed)[i] += Tail::kDimensions;
+      }
+      for (auto i = start_added; i < added->size(); ++i) {
+        (*added)[i] += Tail::kDimensions;
+      }
+    }
+  }
+
+  // 基底クラスと、自身を再帰的に利用するクラステンプレートをfriendにする
+  friend class FeatureSetBase<FeatureSet>;
+  template <typename... FeatureTypes>
+  friend class FeatureSet;
+};
+
+// 特徴量セットを表すクラステンプレート
+// テンプレート引数が1つの場合の特殊化
+template <typename FeatureType>
+class FeatureSet<FeatureType> : public FeatureSetBase<FeatureSet<FeatureType>> {
+ public:
+  // 評価関数ファイルに埋め込むハッシュ値
+  static constexpr std::uint32_t kHashValue = FeatureType::kHashValue;
+  // 特徴量の次元数
+  static constexpr IndexType kDimensions = FeatureType::kDimensions;
+  // 特徴量のうち、同時に値が1となるインデックスの数の最大値
+  static constexpr IndexType kMaxActiveDimensions =
+      FeatureType::kMaxActiveDimensions;
+  // 差分計算の代わりに全計算を行うタイミングのリスト
+  using SortedTriggerSet =
+      CompileTimeList<TriggerEvent, FeatureType::kRefreshTrigger>;
+  static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
+
+  // 特徴量名を取得する
+  static std::string GetName() {
+    return FeatureType::kName;
+  }
+
+ private:
+  // 特徴量のうち、値が1であるインデックスのリストを取得する
+  static void CollectActiveIndices(
+      const Position& pos, const TriggerEvent trigger, const Color perspective,
+      IndexList* const active) {
+    if (FeatureType::kRefreshTrigger == trigger) {
+      FeatureType::AppendActiveIndices(pos, perspective, active);
+    }
+  }
+
+  // 特徴量のうち、一手前から値が変化したインデックスのリストを取得する
+  static void CollectChangedIndices(
+      const Position& pos, const TriggerEvent trigger, const Color perspective,
+      IndexList* const removed, IndexList* const added) {
+    if (FeatureType::kRefreshTrigger == trigger) {
+      FeatureType::AppendChangedIndices(pos, perspective, removed, added);
+    }
+  }
+
+  // 基底クラスと、自身を再帰的に利用するクラステンプレートをfriendにする
+  friend class FeatureSetBase<FeatureSet>;
+  template <typename... FeatureTypes>
+  friend class FeatureSet;
+};
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
diff --git a/src/eval/nnue/features/features_common.h b/src/eval/nnue/features/features_common.h
new file mode 100644
index 00000000..15ccb8a7
--- /dev/null
+++ b/src/eval/nnue/features/features_common.h
@@ -0,0 +1,47 @@
+﻿// NNUE評価関数の入力特徴量の共通ヘッダ
+
+#ifndef _NNUE_FEATURES_COMMON_H_
+#define _NNUE_FEATURES_COMMON_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../../../evaluate.h"
+#include "../nnue_common.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// インデックスリストの型
+class IndexList;
+
+// 特徴量セットを表すクラステンプレート
+template <typename... FeatureTypes>
+class FeatureSet;
+
+// 差分計算の代わりに全計算を行うタイミングの種類
+enum class TriggerEvent {
+  kNone,             // 可能な場合は常に差分計算する
+  kFriendKingMoved,  // 自玉が移動した場合に全計算する
+  kEnemyKingMoved,   // 敵玉が移動した場合に全計算する
+  kAnyKingMoved,     // どちらかの玉が移動した場合に全計算する
+  kAnyPieceMoved,    // 常に全計算する
+};
+
+// 手番側or相手側
+enum class Side {
+  kFriend,  // 手番側
+  kEnemy,   // 相手側
+};
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
diff --git a/src/eval/nnue/features/half_kp.cpp b/src/eval/nnue/features/half_kp.cpp
new file mode 100644
index 00000000..f1a1f57f
--- /dev/null
+++ b/src/eval/nnue/features/half_kp.cpp
@@ -0,0 +1,78 @@
+﻿// NNUE評価関数の入力特徴量HalfKPの定義
+
+#if defined(EVAL_NNUE)
+
+#include "half_kp.h"
+#include "index_list.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// 玉の位置とBonaPieceから特徴量のインデックスを求める
+template <Side AssociatedKing>
+inline IndexType HalfKP<AssociatedKing>::MakeIndex(Square sq_k, BonaPiece p) {
+  return static_cast<IndexType>(fe_end) * static_cast<IndexType>(sq_k) + p;
+}
+
+// 駒の情報を取得する
+template <Side AssociatedKing>
+inline void HalfKP<AssociatedKing>::GetPieces(
+    const Position& pos, Color perspective,
+    BonaPiece** pieces, Square* sq_target_k) {
+  *pieces = (perspective == BLACK) ?
+      pos.eval_list()->piece_list_fb() :
+      pos.eval_list()->piece_list_fw();
+  const PieceNumber target = (AssociatedKing == Side::kFriend) ?
+      static_cast<PieceNumber>(PIECE_NUMBER_KING + perspective) :
+      static_cast<PieceNumber>(PIECE_NUMBER_KING + ~perspective);
+  *sq_target_k = static_cast<Square>(((*pieces)[target] - f_king) % SQ_NB);
+}
+
+// 特徴量のうち、値が1であるインデックスのリストを取得する
+template <Side AssociatedKing>
+void HalfKP<AssociatedKing>::AppendActiveIndices(
+    const Position& pos, Color perspective, IndexList* active) {
+  // コンパイラの警告を回避するため、配列サイズが小さい場合は何もしない
+  if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
+
+  BonaPiece* pieces;
+  Square sq_target_k;
+  GetPieces(pos, perspective, &pieces, &sq_target_k);
+  for (PieceNumber i = PIECE_NUMBER_ZERO; i < PIECE_NUMBER_KING; ++i) {
+    active->push_back(MakeIndex(sq_target_k, pieces[i]));
+  }
+}
+
+// 特徴量のうち、一手前から値が変化したインデックスのリストを取得する
+template <Side AssociatedKing>
+void HalfKP<AssociatedKing>::AppendChangedIndices(
+    const Position& pos, Color perspective,
+    IndexList* removed, IndexList* added) {
+  BonaPiece* pieces;
+  Square sq_target_k;
+  GetPieces(pos, perspective, &pieces, &sq_target_k);
+  const auto& dp = pos.state()->dirtyPiece;
+  for (int i = 0; i < dp.dirty_num; ++i) {
+    if (dp.pieceNo[i] >= PIECE_NUMBER_KING) continue;
+    const auto old_p = static_cast<BonaPiece>(
+        dp.changed_piece[i].old_piece.from[perspective]);
+    removed->push_back(MakeIndex(sq_target_k, old_p));
+    const auto new_p = static_cast<BonaPiece>(
+        dp.changed_piece[i].new_piece.from[perspective]);
+    added->push_back(MakeIndex(sq_target_k, new_p));
+  }
+}
+
+template class HalfKP<Side::kFriend>;
+template class HalfKP<Side::kEnemy>;
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
diff --git a/src/eval/nnue/features/half_kp.h b/src/eval/nnue/features/half_kp.h
new file mode 100644
index 00000000..ffbc2947
--- /dev/null
+++ b/src/eval/nnue/features/half_kp.h
@@ -0,0 +1,62 @@
+﻿// NNUE評価関数の入力特徴量HalfKPの定義
+
+#ifndef _NNUE_FEATURES_HALF_KP_H_
+#define _NNUE_FEATURES_HALF_KP_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../../../evaluate.h"
+#include "features_common.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// 特徴量HalfKP：自玉または敵玉の位置と、玉以外の駒の位置の組み合わせ
+template <Side AssociatedKing>
+class HalfKP {
+ public:
+  // 特徴量名
+  static constexpr const char* kName =
+      (AssociatedKing == Side::kFriend) ? "HalfKP(Friend)" : "HalfKP(Enemy)";
+  // 評価関数ファイルに埋め込むハッシュ値
+  static constexpr std::uint32_t kHashValue =
+      0x5D69D5B9u ^ (AssociatedKing == Side::kFriend);
+  // 特徴量の次元数
+  static constexpr IndexType kDimensions =
+      static_cast<IndexType>(SQ_NB) * static_cast<IndexType>(fe_end);
+  // 特徴量のうち、同時に値が1となるインデックスの数の最大値
+  static constexpr IndexType kMaxActiveDimensions = PIECE_NUMBER_KING;
+  // 差分計算の代わりに全計算を行うタイミング
+  static constexpr TriggerEvent kRefreshTrigger =
+      (AssociatedKing == Side::kFriend) ?
+      TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
+
+  // 特徴量のうち、値が1であるインデックスのリストを取得する
+  static void AppendActiveIndices(const Position& pos, Color perspective,
+                                  IndexList* active);
+
+  // 特徴量のうち、一手前から値が変化したインデックスのリストを取得する
+  static void AppendChangedIndices(const Position& pos, Color perspective,
+                                   IndexList* removed, IndexList* added);
+
+  // 玉の位置とBonaPieceから特徴量のインデックスを求める
+  static IndexType MakeIndex(Square sq_k, BonaPiece p);
+
+ private:
+  // 駒の情報を取得する
+  static void GetPieces(const Position& pos, Color perspective,
+                        BonaPiece** pieces, Square* sq_target_k);
+};
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
diff --git a/src/eval/nnue/features/half_relative_kp.cpp b/src/eval/nnue/features/half_relative_kp.cpp
new file mode 100644
index 00000000..3ee49ff9
--- /dev/null
+++ b/src/eval/nnue/features/half_relative_kp.cpp
@@ -0,0 +1,91 @@
+﻿// NNUE評価関数の入力特徴量HalfRelativeKPの定義
+
+#if defined(EVAL_NNUE)
+
+#include "half_relative_kp.h"
+#include "index_list.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// 玉の位置とBonaPieceから特徴量のインデックスを求める
+template <Side AssociatedKing>
+inline IndexType HalfRelativeKP<AssociatedKing>::MakeIndex(
+    Square sq_k, BonaPiece p) {
+  constexpr IndexType W = kBoardWidth;
+  constexpr IndexType H = kBoardHeight;
+  const IndexType piece_index = (p - fe_hand_end) / SQ_NB;
+  const Square sq_p = static_cast<Square>((p - fe_hand_end) % SQ_NB);
+  const IndexType relative_file = file_of(sq_p) - file_of(sq_k) + (W / 2);
+  const IndexType relative_rank = rank_of(sq_p) - rank_of(sq_k) + (H / 2);
+  return H * W * piece_index + H * relative_file + relative_rank;
+}
+
+// 駒の情報を取得する
+template <Side AssociatedKing>
+inline void HalfRelativeKP<AssociatedKing>::GetPieces(
+    const Position& pos, Color perspective,
+    BonaPiece** pieces, Square* sq_target_k) {
+  *pieces = (perspective == BLACK) ?
+      pos.eval_list()->piece_list_fb() :
+      pos.eval_list()->piece_list_fw();
+  const PieceNumber target = (AssociatedKing == Side::kFriend) ?
+      static_cast<PieceNumber>(PIECE_NUMBER_KING + perspective) :
+      static_cast<PieceNumber>(PIECE_NUMBER_KING + ~perspective);
+  *sq_target_k = static_cast<Square>(((*pieces)[target] - f_king) % SQ_NB);
+}
+
+// 特徴量のうち、値が1であるインデックスのリストを取得する
+template <Side AssociatedKing>
+void HalfRelativeKP<AssociatedKing>::AppendActiveIndices(
+    const Position& pos, Color perspective, IndexList* active) {
+  // コンパイラの警告を回避するため、配列サイズが小さい場合は何もしない
+  if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
+
+  BonaPiece* pieces;
+  Square sq_target_k;
+  GetPieces(pos, perspective, &pieces, &sq_target_k);
+  for (PieceNumber i = PIECE_NUMBER_ZERO; i < PIECE_NUMBER_KING; ++i) {
+    if (pieces[i] >= fe_hand_end) {
+      active->push_back(MakeIndex(sq_target_k, pieces[i]));
+    }
+  }
+}
+
+// 特徴量のうち、一手前から値が変化したインデックスのリストを取得する
+template <Side AssociatedKing>
+void HalfRelativeKP<AssociatedKing>::AppendChangedIndices(
+    const Position& pos, Color perspective,
+    IndexList* removed, IndexList* added) {
+  BonaPiece* pieces;
+  Square sq_target_k;
+  GetPieces(pos, perspective, &pieces, &sq_target_k);
+  const auto& dp = pos.state()->dirtyPiece;
+  for (int i = 0; i < dp.dirty_num; ++i) {
+    if (dp.pieceNo[i] >= PIECE_NUMBER_KING) continue;
+    const auto old_p = static_cast<BonaPiece>(
+        dp.changed_piece[i].old_piece.from[perspective]);
+    if (old_p >= fe_hand_end) {
+      removed->push_back(MakeIndex(sq_target_k, old_p));
+    }
+    const auto new_p = static_cast<BonaPiece>(
+        dp.changed_piece[i].new_piece.from[perspective]);
+    if (new_p >= fe_hand_end) {
+      added->push_back(MakeIndex(sq_target_k, new_p));
+    }
+  }
+}
+
+template class HalfRelativeKP<Side::kFriend>;
+template class HalfRelativeKP<Side::kEnemy>;
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
diff --git a/src/eval/nnue/features/half_relative_kp.h b/src/eval/nnue/features/half_relative_kp.h
new file mode 100644
index 00000000..f9afd446
--- /dev/null
+++ b/src/eval/nnue/features/half_relative_kp.h
@@ -0,0 +1,68 @@
+﻿// NNUE評価関数の入力特徴量HalfRelativeKPの定義
+
+#ifndef _NNUE_FEATURES_HALF_RELATIVE_KP_H_
+#define _NNUE_FEATURES_HALF_RELATIVE_KP_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../../../evaluate.h"
+#include "features_common.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// 特徴量HalfRelativeKP：自玉または敵玉を基準とした、玉以外の各駒の相対位置
+template <Side AssociatedKing>
+class HalfRelativeKP {
+ public:
+  // 特徴量名
+  static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
+      "HalfRelativeKP(Friend)" : "HalfRelativeKP(Enemy)";
+  // 評価関数ファイルに埋め込むハッシュ値
+  static constexpr std::uint32_t kHashValue =
+      0xF9180919u ^ (AssociatedKing == Side::kFriend);
+  // 玉を除いた駒種
+  static constexpr IndexType kNumPieceKinds = (fe_end - fe_hand_end) / SQ_NB;
+  // 玉を中央に置いた仮想的な盤の幅
+  static constexpr IndexType kBoardWidth = FILE_NB * 2 - 1;
+  // 玉を中央に置いた仮想的な盤の高さ
+  static constexpr IndexType kBoardHeight = RANK_NB * 2 - 1;
+  // 特徴量の次元数
+  static constexpr IndexType kDimensions =
+      kNumPieceKinds * kBoardHeight * kBoardWidth;
+  // 特徴量のうち、同時に値が1となるインデックスの数の最大値
+  static constexpr IndexType kMaxActiveDimensions = PIECE_NUMBER_KING;
+  // 差分計算の代わりに全計算を行うタイミング
+  static constexpr TriggerEvent kRefreshTrigger =
+      (AssociatedKing == Side::kFriend) ?
+      TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
+
+  // 特徴量のうち、値が1であるインデックスのリストを取得する
+  static void AppendActiveIndices(const Position& pos, Color perspective,
+                                  IndexList* active);
+
+  // 特徴量のうち、一手前から値が変化したインデックスのリストを取得する
+  static void AppendChangedIndices(const Position& pos, Color perspective,
+                                   IndexList* removed, IndexList* added);
+
+  // 玉の位置とBonaPieceから特徴量のインデックスを求める
+  static IndexType MakeIndex(Square sq_k, BonaPiece p);
+
+ private:
+  // 駒の情報を取得する
+  static void GetPieces(const Position& pos, Color perspective,
+                        BonaPiece** pieces, Square* sq_target_k);
+};
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
diff --git a/src/eval/nnue/features/index_list.h b/src/eval/nnue/features/index_list.h
new file mode 100644
index 00000000..a5a71011
--- /dev/null
+++ b/src/eval/nnue/features/index_list.h
@@ -0,0 +1,55 @@
+﻿// 入力特徴量のインデックスリストの定義
+
+#ifndef _NNUE_FEATURES_INDEX_LIST_H_
+#define _NNUE_FEATURES_INDEX_LIST_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../../../position.h"
+#include "../nnue_architecture.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// 特徴量のインデックスリストに使うクラステンプレート
+template <typename T, std::size_t MaxSize>
+class ValueList {
+ public:
+  std::size_t size() const { return size_; }
+  void resize(std::size_t size) { size_ = size; }
+  void push_back(const T& value) { values_[size_++] = value; }
+  T& operator[](std::size_t index) { return values_[index]; }
+  T* begin() { return values_; }
+  T* end() { return values_ + size_; }
+  const T& operator[](std::size_t index) const { return values_[index]; }
+  const T* begin() const { return values_; }
+  const T* end() const { return values_ + size_; }
+  void swap(ValueList& other) {
+    const std::size_t max_size = std::max(size_, other.size_);
+    for (std::size_t i = 0; i < max_size; ++i) {
+      std::swap(values_[i], other.values_[i]);
+    }
+    std::swap(size_, other.size_);
+  }
+ private:
+  T values_[MaxSize];
+  std::size_t size_ = 0;
+};
+
+// 特徴量のインデックスリストの型
+class IndexList
+    : public ValueList<IndexType, RawFeatures::kMaxActiveDimensions> {
+};
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
diff --git a/src/eval/nnue/features/k.cpp b/src/eval/nnue/features/k.cpp
new file mode 100644
index 00000000..9c019e08
--- /dev/null
+++ b/src/eval/nnue/features/k.cpp
@@ -0,0 +1,49 @@
+﻿// NNUE評価関数の入力特徴量Kの定義
+
+#if defined(EVAL_NNUE)
+
+#include "k.h"
+#include "index_list.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// 特徴量のうち、値が1であるインデックスのリストを取得する
+void K::AppendActiveIndices(
+    const Position& pos, Color perspective, IndexList* active) {
+  // コンパイラの警告を回避するため、配列サイズが小さい場合は何もしない
+  if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
+
+  const BonaPiece* pieces = (perspective == BLACK) ?
+      pos.eval_list()->piece_list_fb() :
+      pos.eval_list()->piece_list_fw();
+  ASSERT_LV5(pieces[PIECE_NUMBER_BKING] != BONA_PIECE_ZERO);
+  ASSERT_LV5(pieces[PIECE_NUMBER_WKING] != BONA_PIECE_ZERO);
+  for (PieceNumber i = PIECE_NUMBER_KING; i < PIECE_NUMBER_NB; ++i) {
+    active->push_back(pieces[i] - fe_end);
+  }
+}
+
+// 特徴量のうち、一手前から値が変化したインデックスのリストを取得する
+void K::AppendChangedIndices(
+    const Position& pos, Color perspective,
+    IndexList* removed, IndexList* added) {
+  const auto& dp = pos.state()->dirtyPiece;
+  if (dp.pieceNo[0] >= PIECE_NUMBER_KING) {
+    removed->push_back(
+        dp.changed_piece[0].old_piece.from[perspective] - fe_end);
+    added->push_back(
+        dp.changed_piece[0].new_piece.from[perspective] - fe_end);
+  }
+}
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
diff --git a/src/eval/nnue/features/k.h b/src/eval/nnue/features/k.h
new file mode 100644
index 00000000..a5dda8fd
--- /dev/null
+++ b/src/eval/nnue/features/k.h
@@ -0,0 +1,48 @@
+﻿// NNUE評価関数の入力特徴量Kの定義
+
+#ifndef _NNUE_FEATURES_K_H_
+#define _NNUE_FEATURES_K_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../../../evaluate.h"
+#include "features_common.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// 特徴量K：玉の位置
+class K {
+ public:
+  // 特徴量名
+  static constexpr const char* kName = "K";
+  // 評価関数ファイルに埋め込むハッシュ値
+  static constexpr std::uint32_t kHashValue = 0xD3CEE169u;
+  // 特徴量の次元数
+  static constexpr IndexType kDimensions = SQ_NB * 2;
+  // 特徴量のうち、同時に値が1となるインデックスの数の最大値
+  static constexpr IndexType kMaxActiveDimensions = 2;
+  // 差分計算の代わりに全計算を行うタイミング
+  static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+
+  // 特徴量のうち、値が1であるインデックスのリストを取得する
+  static void AppendActiveIndices(const Position& pos, Color perspective,
+                                  IndexList* active);
+
+  // 特徴量のうち、一手前から値が変化したインデックスのリストを取得する
+  static void AppendChangedIndices(const Position& pos, Color perspective,
+                                   IndexList* removed, IndexList* added);
+};
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
diff --git a/src/eval/nnue/features/p.cpp b/src/eval/nnue/features/p.cpp
new file mode 100644
index 00000000..da1481cb
--- /dev/null
+++ b/src/eval/nnue/features/p.cpp
@@ -0,0 +1,46 @@
+﻿// NNUE評価関数の入力特徴量Pの定義
+
+#if defined(EVAL_NNUE)
+
+#include "p.h"
+#include "index_list.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// 特徴量のうち、値が1であるインデックスのリストを取得する
+void P::AppendActiveIndices(
+    const Position& pos, Color perspective, IndexList* active) {
+  // コンパイラの警告を回避するため、配列サイズが小さい場合は何もしない
+  if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
+
+  const BonaPiece* pieces = (perspective == BLACK) ?
+      pos.eval_list()->piece_list_fb() :
+      pos.eval_list()->piece_list_fw();
+  for (PieceNumber i = PIECE_NUMBER_ZERO; i < PIECE_NUMBER_KING; ++i) {
+    active->push_back(pieces[i]);
+  }
+}
+
+// 特徴量のうち、一手前から値が変化したインデックスのリストを取得する
+void P::AppendChangedIndices(
+    const Position& pos, Color perspective,
+    IndexList* removed, IndexList* added) {
+  const auto& dp = pos.state()->dirtyPiece;
+  for (int i = 0; i < dp.dirty_num; ++i) {
+    if (dp.pieceNo[i] >= PIECE_NUMBER_KING) continue;
+    removed->push_back(dp.changed_piece[i].old_piece.from[perspective]);
+    added->push_back(dp.changed_piece[i].new_piece.from[perspective]);
+  }
+}
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
diff --git a/src/eval/nnue/features/p.h b/src/eval/nnue/features/p.h
new file mode 100644
index 00000000..77ea882d
--- /dev/null
+++ b/src/eval/nnue/features/p.h
@@ -0,0 +1,48 @@
+﻿// NNUE評価関数の入力特徴量Pの定義
+
+#ifndef _NNUE_FEATURES_P_H_
+#define _NNUE_FEATURES_P_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../../../evaluate.h"
+#include "features_common.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// 特徴量P：玉以外の駒のBonaPiece
+class P {
+ public:
+  // 特徴量名
+  static constexpr const char* kName = "P";
+  // 評価関数ファイルに埋め込むハッシュ値
+  static constexpr std::uint32_t kHashValue = 0x764CFB4Bu;
+  // 特徴量の次元数
+  static constexpr IndexType kDimensions = fe_end;
+  // 特徴量のうち、同時に値が1となるインデックスの数の最大値
+  static constexpr IndexType kMaxActiveDimensions = PIECE_NUMBER_KING;
+  // 差分計算の代わりに全計算を行うタイミング
+  static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+
+  // 特徴量のうち、値が1であるインデックスのリストを取得する
+  static void AppendActiveIndices(const Position& pos, Color perspective,
+                                  IndexList* active);
+
+  // 特徴量のうち、一手前から値が変化したインデックスのリストを取得する
+  static void AppendChangedIndices(const Position& pos, Color perspective,
+                                   IndexList* removed, IndexList* added);
+};
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
diff --git a/src/eval/nnue/layers/affine_transform.h b/src/eval/nnue/layers/affine_transform.h
new file mode 100644
index 00000000..9b227270
--- /dev/null
+++ b/src/eval/nnue/layers/affine_transform.h
@@ -0,0 +1,170 @@
+﻿// NNUE評価関数の層AffineTransformの定義
+
+#ifndef _NNUE_LAYERS_AFFINE_TRANSFORM_H_
+#define _NNUE_LAYERS_AFFINE_TRANSFORM_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../nnue_common.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Layers {
+
+// アフィン変換層
+template <typename PreviousLayer, IndexType OutputDimensions>
+class AffineTransform {
+ public:
+  // 入出力の型
+  using InputType = typename PreviousLayer::OutputType;
+  using OutputType = std::int32_t;
+  static_assert(std::is_same<InputType, std::uint8_t>::value, "");
+
+  // 入出力の次元数
+  static constexpr IndexType kInputDimensions =
+      PreviousLayer::kOutputDimensions;
+  static constexpr IndexType kOutputDimensions = OutputDimensions;
+  static constexpr IndexType kPaddedInputDimensions =
+      CeilToMultiple<IndexType>(kInputDimensions, kMaxSimdWidth);
+
+  // この層で使用する順伝播用バッファのサイズ
+  static constexpr std::size_t kSelfBufferSize =
+      CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+
+  // 入力層からこの層までで使用する順伝播用バッファのサイズ
+  static constexpr std::size_t kBufferSize =
+      PreviousLayer::kBufferSize + kSelfBufferSize;
+
+  // 評価関数ファイルに埋め込むハッシュ値
+  static constexpr std::uint32_t GetHashValue() {
+    std::uint32_t hash_value = 0xCC03DAE4u;
+    hash_value += kOutputDimensions;
+    hash_value ^= PreviousLayer::GetHashValue() >> 1;
+    hash_value ^= PreviousLayer::GetHashValue() << 31;
+    return hash_value;
+  }
+
+  // 入力層からこの層までの構造を表す文字列
+  static std::string GetStructureString() {
+    return "AffineTransform[" +
+        std::to_string(kOutputDimensions) + "<-" +
+        std::to_string(kInputDimensions) + "](" +
+        PreviousLayer::GetStructureString() + ")";
+  }
+
+  // パラメータを読み込む
+  bool ReadParameters(std::istream& stream) {
+    if (!previous_layer_.ReadParameters(stream)) return false;
+    stream.read(reinterpret_cast<char*>(biases_),
+                kOutputDimensions * sizeof(BiasType));
+    stream.read(reinterpret_cast<char*>(weights_),
+                kOutputDimensions * kPaddedInputDimensions *
+                sizeof(WeightType));
+    return !stream.fail();
+  }
+
+  // パラメータを書き込む
+  bool WriteParameters(std::ostream& stream) const {
+    if (!previous_layer_.WriteParameters(stream)) return false;
+    stream.write(reinterpret_cast<const char*>(biases_),
+                 kOutputDimensions * sizeof(BiasType));
+    stream.write(reinterpret_cast<const char*>(weights_),
+                 kOutputDimensions * kPaddedInputDimensions *
+                 sizeof(WeightType));
+    return !stream.fail();
+  }
+
+  // 順伝播
+  const OutputType* Propagate(
+      const TransformedFeatureType* transformed_features, char* buffer) const {
+    const auto input = previous_layer_.Propagate(
+        transformed_features, buffer + kSelfBufferSize);
+    const auto output = reinterpret_cast<OutputType*>(buffer);
+#if defined(USE_AVX2)
+    constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+    const __m256i kOnes = _mm256_set1_epi16(1);
+    const auto input_vector = reinterpret_cast<const __m256i*>(input);
+#elif defined(USE_SSE41)
+    constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+    const __m128i kOnes = _mm_set1_epi16(1);
+    const auto input_vector = reinterpret_cast<const __m128i*>(input);
+#elif defined(IS_ARM)
+    constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+    const auto input_vector = reinterpret_cast<const int8x8_t*>(input);
+#endif
+    for (IndexType i = 0; i < kOutputDimensions; ++i) {
+      const IndexType offset = i * kPaddedInputDimensions;
+#if defined(USE_AVX2)
+      __m256i sum = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, biases_[i]);
+      const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
+      for (IndexType j = 0; j < kNumChunks; ++j) {
+        __m256i product = _mm256_maddubs_epi16(
+            _mm256_load_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
+        product = _mm256_madd_epi16(product, kOnes);
+        sum = _mm256_add_epi32(sum, product);
+      }
+      sum = _mm256_hadd_epi32(sum, sum);
+      sum = _mm256_hadd_epi32(sum, sum);
+      const __m128i lo = _mm256_extracti128_si256(sum, 0);
+      const __m128i hi = _mm256_extracti128_si256(sum, 1);
+      output[i] = _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi);
+#elif defined(USE_SSE41)
+      __m128i sum = _mm_cvtsi32_si128(biases_[i]);
+      const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
+      for (IndexType j = 0; j < kNumChunks; ++j) {
+        __m128i product = _mm_maddubs_epi16(
+            _mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
+        product = _mm_madd_epi16(product, kOnes);
+        sum = _mm_add_epi32(sum, product);
+      }
+      sum = _mm_hadd_epi32(sum, sum);
+      sum = _mm_hadd_epi32(sum, sum);
+      output[i] = _mm_cvtsi128_si32(sum);
+#elif defined(IS_ARM)
+      int32x4_t sum = {biases_[i]};
+      const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]);
+      for (IndexType j = 0; j < kNumChunks; ++j) {
+        int16x8_t product = vmull_s8(input_vector[j * 2], row[j * 2]);
+        product = vmlal_s8(product, input_vector[j * 2 + 1], row[j * 2 + 1]);
+        sum = vpadalq_s16(sum, product);
+      }
+      output[i] = sum[0] + sum[1] + sum[2] + sum[3];
+#else
+      OutputType sum = biases_[i];
+      for (IndexType j = 0; j < kInputDimensions; ++j) {
+        sum += weights_[offset + j] * input[j];
+      }
+      output[i] = sum;
+#endif
+    }
+    return output;
+  }
+
+ private:
+  // パラメータの型
+  using BiasType = OutputType;
+  using WeightType = std::int8_t;
+
+  // 学習用クラスをfriendにする
+  friend class Trainer<AffineTransform>;
+
+  // この層の直前の層
+  PreviousLayer previous_layer_;
+
+  // パラメータ
+  alignas(kCacheLineSize) BiasType biases_[kOutputDimensions];
+  alignas(kCacheLineSize)
+      WeightType weights_[kOutputDimensions * kPaddedInputDimensions];
+};
+
+}  // namespace Layers
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
diff --git a/src/eval/nnue/layers/clipped_relu.h b/src/eval/nnue/layers/clipped_relu.h
new file mode 100644
index 00000000..f904de74
--- /dev/null
+++ b/src/eval/nnue/layers/clipped_relu.h
@@ -0,0 +1,140 @@
+﻿// NNUE評価関数の層ClippedReLUの定義
+
+#ifndef _NNUE_LAYERS_CLIPPED_RELU_H_
+#define _NNUE_LAYERS_CLIPPED_RELU_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../nnue_common.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Layers {
+
+// Clipped ReLU
+template <typename PreviousLayer>
+class ClippedReLU {
+ public:
+  // 入出力の型
+  using InputType = typename PreviousLayer::OutputType;
+  using OutputType = std::uint8_t;
+  static_assert(std::is_same<InputType, std::int32_t>::value, "");
+
+  // 入出力の次元数
+  static constexpr IndexType kInputDimensions =
+      PreviousLayer::kOutputDimensions;
+  static constexpr IndexType kOutputDimensions = kInputDimensions;
+
+  // この層で使用する順伝播用バッファのサイズ
+  static constexpr std::size_t kSelfBufferSize =
+      CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+
+  // 入力層からこの層までで使用する順伝播用バッファのサイズ
+  static constexpr std::size_t kBufferSize =
+      PreviousLayer::kBufferSize + kSelfBufferSize;
+
+  // 評価関数ファイルに埋め込むハッシュ値
+  static constexpr std::uint32_t GetHashValue() {
+    std::uint32_t hash_value = 0x538D24C7u;
+    hash_value += PreviousLayer::GetHashValue();
+    return hash_value;
+  }
+
+  // 入力層からこの層までの構造を表す文字列
+  static std::string GetStructureString() {
+    return "ClippedReLU[" +
+        std::to_string(kOutputDimensions) + "](" +
+        PreviousLayer::GetStructureString() + ")";
+  }
+
+  // パラメータを読み込む
+  bool ReadParameters(std::istream& stream) {
+    return previous_layer_.ReadParameters(stream);
+  }
+
+  // パラメータを書き込む
+  bool WriteParameters(std::ostream& stream) const {
+    return previous_layer_.WriteParameters(stream);
+  }
+
+  // 順伝播
+  const OutputType* Propagate(
+      const TransformedFeatureType* transformed_features, char* buffer) const {
+    const auto input = previous_layer_.Propagate(
+        transformed_features, buffer + kSelfBufferSize);
+    const auto output = reinterpret_cast<OutputType*>(buffer);
+#if defined(USE_AVX2)
+    constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
+    const __m256i kZero = _mm256_setzero_si256();
+    const __m256i kOffsets = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+    const auto in = reinterpret_cast<const __m256i*>(input);
+    const auto out = reinterpret_cast<__m256i*>(output);
+    for (IndexType i = 0; i < kNumChunks; ++i) {
+      const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
+          _mm256_load_si256(&in[i * 4 + 0]),
+          _mm256_load_si256(&in[i * 4 + 1])), kWeightScaleBits);
+      const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
+          _mm256_load_si256(&in[i * 4 + 2]),
+          _mm256_load_si256(&in[i * 4 + 3])), kWeightScaleBits);
+      _mm256_store_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
+          _mm256_packs_epi16(words0, words1), kZero), kOffsets));
+    }
+    constexpr IndexType kStart = kNumChunks * kSimdWidth;
+#elif defined(USE_SSE41)
+    constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
+    const __m128i kZero = _mm_setzero_si128();
+    const auto in = reinterpret_cast<const __m128i*>(input);
+    const auto out = reinterpret_cast<__m128i*>(output);
+    for (IndexType i = 0; i < kNumChunks; ++i) {
+      const __m128i words0 = _mm_srai_epi16(_mm_packs_epi32(
+          _mm_load_si128(&in[i * 4 + 0]),
+          _mm_load_si128(&in[i * 4 + 1])), kWeightScaleBits);
+      const __m128i words1 = _mm_srai_epi16(_mm_packs_epi32(
+          _mm_load_si128(&in[i * 4 + 2]),
+          _mm_load_si128(&in[i * 4 + 3])), kWeightScaleBits);
+      _mm_store_si128(&out[i], _mm_max_epi8(
+          _mm_packs_epi16(words0, words1), kZero));
+    }
+    constexpr IndexType kStart = kNumChunks * kSimdWidth;
+#elif defined(IS_ARM)
+    constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2);
+    const int8x8_t kZero = {0};
+    const auto in = reinterpret_cast<const int32x4_t*>(input);
+    const auto out = reinterpret_cast<int8x8_t*>(output);
+    for (IndexType i = 0; i < kNumChunks; ++i) {
+      int16x8_t shifted;
+      const auto pack = reinterpret_cast<int16x4_t*>(&shifted);
+      pack[0] = vqshrn_n_s32(in[i * 2 + 0], kWeightScaleBits);
+      pack[1] = vqshrn_n_s32(in[i * 2 + 1], kWeightScaleBits);
+      out[i] = vmax_s8(vqmovn_s16(shifted), kZero);
+    }
+    constexpr IndexType kStart = kNumChunks * (kSimdWidth / 2);
+#else
+    constexpr IndexType kStart = 0;
+#endif
+    for (IndexType i = kStart; i < kInputDimensions; ++i) {
+      output[i] = static_cast<OutputType>(
+          std::max(0, std::min(127, input[i] >> kWeightScaleBits)));
+    }
+    return output;
+  }
+
+ private:
+  // 学習用クラスをfriendにする
+  friend class Trainer<ClippedReLU>;
+
+  // この層の直前の層
+  PreviousLayer previous_layer_;
+};
+
+}  // namespace Layers
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
diff --git a/src/eval/nnue/layers/input_slice.h b/src/eval/nnue/layers/input_slice.h
new file mode 100644
index 00000000..c9c6a7c9
--- /dev/null
+++ b/src/eval/nnue/layers/input_slice.h
@@ -0,0 +1,74 @@
+﻿// NNUE評価関数の層InputSliceの定義
+
+#ifndef _NNUE_LAYERS_INPUT_SLICE_H_
+#define _NNUE_LAYERS_INPUT_SLICE_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../nnue_common.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Layers {
+
+// 入力層
+template <IndexType OutputDimensions, IndexType Offset = 0>
+class InputSlice {
+ public:
+  // アライメントを維持する必要がある
+  static_assert(Offset % kMaxSimdWidth == 0, "");
+
+  // 出力の型
+  using OutputType = TransformedFeatureType;
+
+  // 出力の次元数
+  static constexpr IndexType kOutputDimensions = OutputDimensions;
+
+  // 入力層からこの層までで使用する順伝播用バッファのサイズ
+  static constexpr std::size_t kBufferSize = 0;
+
+  // 評価関数ファイルに埋め込むハッシュ値
+  static constexpr std::uint32_t GetHashValue() {
+    std::uint32_t hash_value = 0xEC42E90Du;
+    hash_value ^= kOutputDimensions ^ (Offset << 10);
+    return hash_value;
+  }
+
+  // 入力層からこの層までの構造を表す文字列
+  static std::string GetStructureString() {
+    return "InputSlice[" + std::to_string(kOutputDimensions) + "(" +
+        std::to_string(Offset) + ":" +
+        std::to_string(Offset + kOutputDimensions) + ")]";
+  }
+
+  // パラメータを読み込む
+  bool ReadParameters(std::istream& /*stream*/) {
+    return true;
+  }
+
+  // パラメータを書き込む
+  bool WriteParameters(std::ostream& /*stream*/) const {
+    return true;
+  }
+
+  // 順伝播
+  const OutputType* Propagate(
+      const TransformedFeatureType* transformed_features,
+      char* /*buffer*/) const {
+    return transformed_features + Offset;
+  }
+
+ private:
+};
+
+}  // namespace Layers
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
diff --git a/src/eval/nnue/layers/sum.h b/src/eval/nnue/layers/sum.h
new file mode 100644
index 00000000..216de458
--- /dev/null
+++ b/src/eval/nnue/layers/sum.h
@@ -0,0 +1,165 @@
+﻿// NNUE評価関数の層Sumの定義
+
+#ifndef _NNUE_LAYERS_SUM_H_
+#define _NNUE_LAYERS_SUM_H_
+
+#include "../../../config.h"
+
+#if defined(EVAL_NNUE)
+
+#include "../nnue_common.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Layers {
+
+// 複数の層の出力の和を取る層
+template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
+class Sum : public Sum<RemainingPreviousLayers...> {
+ private:
+  using Head = FirstPreviousLayer;
+  using Tail = Sum<RemainingPreviousLayers...>;
+
+ public:
+  // 入出力の型
+  using InputType = typename Head::OutputType;
+  using OutputType = InputType;
+  static_assert(std::is_same<InputType, typename Tail::InputType>::value, "");
+
+  // 入出力の次元数
+  static constexpr IndexType kInputDimensions = Head::kOutputDimensions;
+  static constexpr IndexType kOutputDimensions = kInputDimensions;
+  static_assert(kInputDimensions == Tail::kInputDimensions , "");
+
+  // この層で使用する順伝播用バッファのサイズ
+  static constexpr std::size_t kSelfBufferSize =
+      CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+
+  // 入力層からこの層までで使用する順伝播用バッファのサイズ
+  static constexpr std::size_t kBufferSize =
+      std::max(Head::kBufferSize + kSelfBufferSize, Tail::kBufferSize);
+
+  // 評価関数ファイルに埋め込むハッシュ値
+  static constexpr std::uint32_t GetHashValue() {
+    std::uint32_t hash_value = 0xBCE400B4u;
+    hash_value ^= Head::GetHashValue() >> 1;
+    hash_value ^= Head::GetHashValue() << 31;
+    hash_value ^= Tail::GetHashValue() >> 2;
+    hash_value ^= Tail::GetHashValue() << 30;
+    return hash_value;
+  }
+
+  // 入力層からこの層までの構造を表す文字列
+  static std::string GetStructureString() {
+    return "Sum[" +
+        std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
+  }
+
+  // パラメータを読み込む
+  bool ReadParameters(std::istream& stream) {
+    if (!Tail::ReadParameters(stream)) return false;
+    return previous_layer_.ReadParameters(stream);
+  }
+
+  // パラメータを書き込む
+  bool WriteParameters(std::ostream& stream) const {
+    if (!Tail::WriteParameters(stream)) return false;
+    return previous_layer_.WriteParameters(stream);
+  }
+
+  // 順伝播
+  const OutputType* Propagate(
+      const TransformedFeatureType* transformed_features, char* buffer) const {
+    Tail::Propagate(transformed_features, buffer);
+    const auto head_output = previous_layer_.Propagate(
+        transformed_features, buffer + kSelfBufferSize);
+    const auto output = reinterpret_cast<OutputType*>(buffer);
+    for (IndexType i = 0; i < kOutputDimensions; ++i) {
+      output[i] += head_output[i];
+    }
+    return output;
+  }
+
+ protected:
+  // 和を取る対象となる層のリストを表す文字列
+  static std::string GetSummandsString() {
+    return Head::GetStructureString() + "," + Tail::GetSummandsString();
+  }
+
+  // 学習用クラスをfriendにする
+  friend class Trainer<Sum>;
+
+  // この層の直前の層
+  FirstPreviousLayer previous_layer_;
+};
+
+// 複数の層の出力の和を取る層（テンプレート引数が1つの場合）
+template <typename PreviousLayer>
+class Sum<PreviousLayer> {
+ public:
+  // 入出力の型
+  using InputType = typename PreviousLayer::OutputType;
+  using OutputType = InputType;
+
+  // 入出力の次元数
+  static constexpr IndexType kInputDimensions =
+      PreviousLayer::kOutputDimensions;
+  static constexpr IndexType kOutputDimensions = kInputDimensions;
+
+  // 入力層からこの層までで使用する順伝播用バッファのサイズ
+  static constexpr std::size_t kBufferSize = PreviousLayer::kBufferSize;
+
+  // 評価関数ファイルに埋め込むハッシュ値
+  static constexpr std::uint32_t GetHashValue() {
+    std::uint32_t hash_value = 0xBCE400B4u;
+    hash_value ^= PreviousLayer::GetHashValue() >> 1;
+    hash_value ^= PreviousLayer::GetHashValue() << 31;
+    return hash_value;
+  }
+
+  // 入力層からこの層までの構造を表す文字列
+  static std::string GetStructureString() {
+    return "Sum[" +
+        std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
+  }
+
+  // パラメータを読み込む
+  bool ReadParameters(std::istream& stream) {
+    return previous_layer_.ReadParameters(stream);
+  }
+
+  // パラメータを書き込む
+  bool WriteParameters(std::ostream& stream) const {
+    return previous_layer_.WriteParameters(stream);
+  }
+
+  // 順伝播
+  const OutputType* Propagate(
+      const TransformedFeatureType* transformed_features, char* buffer) const {
+    return previous_layer_.Propagate(transformed_features, buffer);
+  }
+
+ protected:
+  // 和を取る対象となる層のリストを表す文字列
+  static std::string GetSummandsString() {
+    return PreviousLayer::GetStructureString();
+  }
+
+  // 学習用クラスをfriendにする
+  friend class Trainer<Sum>;
+
+  // この層の直前の層
+  PreviousLayer previous_layer_;
+};
+
+}  // namespace Layers
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
diff --git a/src/eval/nnue/nnue_accumulator.h b/src/eval/nnue/nnue_accumulator.h
new file mode 100644
index 00000000..c7c43a3e
--- /dev/null
+++ b/src/eval/nnue/nnue_accumulator.h
@@ -0,0 +1,32 @@
+﻿// NNUE評価関数の差分計算用のクラス
+
+#ifndef _NNUE_ACCUMULATOR_H_
+#define _NNUE_ACCUMULATOR_H_
+
+#include "../../config.h"
+
+#if defined(EVAL_NNUE)
+
+#include "nnue_architecture.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+// 入力特徴量をアフィン変換した結果を保持するクラス
+// 最終的な出力である評価値も一緒に持たせておく
+struct alignas(32) Accumulator {
+  std::int16_t
+      accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];
+  Value score = VALUE_ZERO;
+  bool computed_accumulation = false;
+  bool computed_score = false;
+};
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
diff --git a/src/eval/nnue/nnue_architecture.h b/src/eval/nnue/nnue_architecture.h
new file mode 100644
index 00000000..6815ada5
--- /dev/null
+++ b/src/eval/nnue/nnue_architecture.h
@@ -0,0 +1,36 @@
+﻿// NNUE評価関数で用いる入力特徴量とネットワーク構造
+
+#ifndef _NNUE_ARCHITECTURE_H_
+#define _NNUE_ARCHITECTURE_H_
+
+#if defined(EVAL_NNUE)
+
+// 入力特徴量とネットワーク構造が定義されたヘッダをincludeする
+
+// KP256型を使いたいときは、これを事前にdefineする。
+#if defined(EVAL_NNUE_KP256)
+#include "architectures/k-p_256x2-32-32.h"
+#else // #if defined(EVAL_NNUE_HALFKP256)
+
+// NNUE評価関数のデフォルトは、halfKP256
+#include "architectures/halfkp_256x2-32-32.h"
+#endif
+
+namespace Eval {
+
+namespace NNUE {
+
+static_assert(kTransformedFeatureDimensions % kMaxSimdWidth == 0, "");
+static_assert(Network::kOutputDimensions == 1, "");
+static_assert(std::is_same<Network::OutputType, std::int32_t>::value, "");
+
+// 差分計算の代わりに全計算を行うタイミングのリスト
+constexpr auto kRefreshTriggers = RawFeatures::kRefreshTriggers;
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
diff --git a/src/eval/nnue/nnue_common.h b/src/eval/nnue/nnue_common.h
new file mode 100644
index 00000000..ac114b75
--- /dev/null
+++ b/src/eval/nnue/nnue_common.h
@@ -0,0 +1,54 @@
+﻿// NNUE評価関数で用いる定数など
+
+#ifndef _NNUE_COMMON_H_
+#define _NNUE_COMMON_H_
+
+#if defined(EVAL_NNUE)
+
+namespace Eval {
+
+namespace NNUE {
+
+// 評価関数ファイルのバージョンを表す定数
+constexpr std::uint32_t kVersion = 0x7AF32F16u;
+
+// 評価値の計算で利用する定数
+constexpr int FV_SCALE = 16;
+constexpr int kWeightScaleBits = 6;
+
+// キャッシュラインのサイズ（バイト単位）
+constexpr std::size_t kCacheLineSize = 64;
+
+// SIMD幅（バイト単位）
+#if defined(USE_AVX2)
+constexpr std::size_t kSimdWidth = 32;
+#elif defined(USE_SSE2)
+constexpr std::size_t kSimdWidth = 16;
+#elif defined(IS_ARM)
+constexpr std::size_t kSimdWidth = 16;
+#endif
+constexpr std::size_t kMaxSimdWidth = 32;
+
+// 変換後の入力特徴量の型
+using TransformedFeatureType = std::uint8_t;
+
+// インデックスの型
+using IndexType = std::uint32_t;
+
+// 学習用クラステンプレートの前方宣言
+template <typename Layer>
+class Trainer;
+
+// n以上で最小のbaseの倍数を求める
+template <typename IntType>
+constexpr IntType CeilToMultiple(IntType n, IntType base) {
+  return (n + base - 1) / base * base;
+}
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
diff --git a/src/eval/nnue/nnue_feature_transformer.h b/src/eval/nnue/nnue_feature_transformer.h
new file mode 100644
index 00000000..22f5df82
--- /dev/null
+++ b/src/eval/nnue/nnue_feature_transformer.h
@@ -0,0 +1,323 @@
+﻿// NNUE評価関数の入力特徴量の変換を行うクラス
+
+#ifndef _NNUE_FEATURE_TRANSFORMER_H_
+#define _NNUE_FEATURE_TRANSFORMER_H_
+
+#if defined(EVAL_NNUE)
+
+#include "nnue_common.h"
+#include "nnue_architecture.h"
+#include "features/index_list.h"
+
+#include <cstring> // std::memset()
+
+namespace Eval {
+
+namespace NNUE {
+
+// 入力特徴量変換器
+class FeatureTransformer {
+ private:
+  // 片側分の出力の次元数
+  static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;
+
+ public:
+  // 出力の型
+  using OutputType = TransformedFeatureType;
+
+  // 入出力の次元数
+  static constexpr IndexType kInputDimensions = RawFeatures::kDimensions;
+  static constexpr IndexType kOutputDimensions = kHalfDimensions * 2;
+
+  // 順伝播用バッファのサイズ
+  static constexpr std::size_t kBufferSize =
+      kOutputDimensions * sizeof(OutputType);
+
+  // 評価関数ファイルに埋め込むハッシュ値
+  static constexpr std::uint32_t GetHashValue() {
+    return RawFeatures::kHashValue ^ kOutputDimensions;
+  }
+
+  // 構造を表す文字列
+  static std::string GetStructureString() {
+    return RawFeatures::GetName() + "[" +
+        std::to_string(kInputDimensions) + "->" +
+        std::to_string(kHalfDimensions) + "x2]";
+  }
+
+  // パラメータを読み込む
+  bool ReadParameters(std::istream& stream) {
+    stream.read(reinterpret_cast<char*>(biases_),
+                kHalfDimensions * sizeof(BiasType));
+    stream.read(reinterpret_cast<char*>(weights_),
+                kHalfDimensions * kInputDimensions * sizeof(WeightType));
+    return !stream.fail();
+  }
+
+  // パラメータを書き込む
+  bool WriteParameters(std::ostream& stream) const {
+    stream.write(reinterpret_cast<const char*>(biases_),
+                 kHalfDimensions * sizeof(BiasType));
+    stream.write(reinterpret_cast<const char*>(weights_),
+                 kHalfDimensions * kInputDimensions * sizeof(WeightType));
+    return !stream.fail();
+  }
+
+  // 可能なら差分計算を進める
+  bool UpdateAccumulatorIfPossible(const Position& pos) const {
+    const auto now = pos.state();
+    if (now->accumulator.computed_accumulation) {
+      return true;
+    }
+    const auto prev = now->previous;
+    if (prev && prev->accumulator.computed_accumulation) {
+      UpdateAccumulator(pos);
+      return true;
+    }
+    return false;
+  }
+
+  // 入力特徴量を変換する
+  void Transform(const Position& pos, OutputType* output, bool refresh) const {
+    if (refresh || !UpdateAccumulatorIfPossible(pos)) {
+      RefreshAccumulator(pos);
+    }
+    const auto& accumulation = pos.state()->accumulator.accumulation;
+#if defined(USE_AVX2)
+    constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+    constexpr int kControl = 0b11011000;
+    const __m256i kZero = _mm256_setzero_si256();
+#elif defined(USE_SSE41)
+    constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+    const __m128i kZero = _mm_setzero_si128();
+#elif defined(IS_ARM)
+    constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+    const int8x8_t kZero = {0};
+#endif
+    const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
+    for (IndexType p = 0; p < 2; ++p) {
+      const IndexType offset = kHalfDimensions * p;
+#if defined(USE_AVX2)
+      auto out = reinterpret_cast<__m256i*>(&output[offset]);
+      for (IndexType j = 0; j < kNumChunks; ++j) {
+        __m256i sum0 = _mm256_load_si256(&reinterpret_cast<const __m256i*>(
+            accumulation[perspectives[p]][0])[j * 2 + 0]);
+        __m256i sum1 = _mm256_load_si256(&reinterpret_cast<const __m256i*>(
+            accumulation[perspectives[p]][0])[j * 2 + 1]);
+        for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+          sum0 = _mm256_add_epi16(sum0, reinterpret_cast<const __m256i*>(
+              accumulation[perspectives[p]][i])[j * 2 + 0]);
+          sum1 = _mm256_add_epi16(sum1, reinterpret_cast<const __m256i*>(
+              accumulation[perspectives[p]][i])[j * 2 + 1]);
+        }
+        _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
+            _mm256_packs_epi16(sum0, sum1), kZero), kControl));
+      }
+#elif defined(USE_SSE41)
+      auto out = reinterpret_cast<__m128i*>(&output[offset]);
+      for (IndexType j = 0; j < kNumChunks; ++j) {
+        __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
+            accumulation[perspectives[p]][0])[j * 2 + 0]);
+        __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
+            accumulation[perspectives[p]][0])[j * 2 + 1]);
+        for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+          sum0 = _mm_add_epi16(sum0, reinterpret_cast<const __m128i*>(
+              accumulation[perspectives[p]][i])[j * 2 + 0]);
+          sum1 = _mm_add_epi16(sum1, reinterpret_cast<const __m128i*>(
+              accumulation[perspectives[p]][i])[j * 2 + 1]);
+        }
+        _mm_store_si128(&out[j], _mm_max_epi8(
+            _mm_packs_epi16(sum0, sum1), kZero));
+      }
+#elif defined(IS_ARM)
+      const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
+      for (IndexType j = 0; j < kNumChunks; ++j) {
+        int16x8_t sum = reinterpret_cast<const int16x8_t*>(
+            accumulation[perspectives[p]][0])[j];
+        for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+          sum = vaddq_s16(sum, reinterpret_cast<const int16x8_t*>(
+              accumulation[perspectives[p]][i])[j]);
+        }
+        out[j] = vmax_s8(vqmovn_s16(sum), kZero);
+      }
+#else
+      for (IndexType j = 0; j < kHalfDimensions; ++j) {
+        BiasType sum = accumulation[perspectives[p]][0][j];
+        for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+          sum += accumulation[perspectives[p]][i][j];
+        }
+        output[offset + j] = static_cast<OutputType>(
+            std::max<int>(0, std::min<int>(127, sum)));
+      }
+#endif
+    }
+  }
+
+ private:
+  // 差分計算を用いずに累積値を計算する
+  void RefreshAccumulator(const Position& pos) const {
+    auto& accumulator = pos.state()->accumulator;
+    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+      Features::IndexList active_indices[2];
+      RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
+                                       active_indices);
+      for (const auto perspective : COLOR) {
+        if (i == 0) {
+          std::memcpy(accumulator.accumulation[perspective][i], biases_,
+                      kHalfDimensions * sizeof(BiasType));
+        } else {
+          std::memset(accumulator.accumulation[perspective][i], 0,
+                      kHalfDimensions * sizeof(BiasType));
+        }
+        for (const auto index : active_indices[perspective]) {
+          const IndexType offset = kHalfDimensions * index;
+#if defined(USE_AVX2)
+          auto accumulation = reinterpret_cast<__m256i*>(
+              &accumulator.accumulation[perspective][i][0]);
+          auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
+          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+          for (IndexType j = 0; j < kNumChunks; ++j) {
+            accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
+          }
+#elif defined(USE_SSE2)
+          auto accumulation = reinterpret_cast<__m128i*>(
+              &accumulator.accumulation[perspective][i][0]);
+          auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
+          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+          for (IndexType j = 0; j < kNumChunks; ++j) {
+            accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
+          }
+#elif defined(IS_ARM)
+          auto accumulation = reinterpret_cast<int16x8_t*>(
+              &accumulator.accumulation[perspective][i][0]);
+          auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
+          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+          for (IndexType j = 0; j < kNumChunks; ++j) {
+            accumulation[j] = vaddq_s16(accumulation[j], column[j]);
+          }
+#else
+          for (IndexType j = 0; j < kHalfDimensions; ++j) {
+            accumulator.accumulation[perspective][i][j] += weights_[offset + j];
+          }
+#endif
+        }
+      }
+    }
+
+    accumulator.computed_accumulation = true;
+    accumulator.computed_score = false;
+  }
+
+  // 差分計算を用いて累積値を計算する
+  void UpdateAccumulator(const Position& pos) const {
+    const auto prev_accumulator = pos.state()->previous->accumulator;
+    auto& accumulator = pos.state()->accumulator;
+    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+      Features::IndexList removed_indices[2], added_indices[2];
+      bool reset[2];
+      RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
+                                        removed_indices, added_indices, reset);
+      for (const auto perspective : COLOR) {
+#if defined(USE_AVX2)
+        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+        auto accumulation = reinterpret_cast<__m256i*>(
+            &accumulator.accumulation[perspective][i][0]);
+#elif defined(USE_SSE2)
+        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+        auto accumulation = reinterpret_cast<__m128i*>(
+            &accumulator.accumulation[perspective][i][0]);
+#elif defined(IS_ARM)
+        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+        auto accumulation = reinterpret_cast<int16x8_t*>(
+            &accumulator.accumulation[perspective][i][0]);
+#endif
+        if (reset[perspective]) {
+          if (i == 0) {
+            std::memcpy(accumulator.accumulation[perspective][i], biases_,
+                        kHalfDimensions * sizeof(BiasType));
+          } else {
+            std::memset(accumulator.accumulation[perspective][i], 0,
+                        kHalfDimensions * sizeof(BiasType));
+          }
+        } else {  // 1から0に変化した特徴量に関する差分計算
+          std::memcpy(accumulator.accumulation[perspective][i],
+                      prev_accumulator.accumulation[perspective][i],
+                      kHalfDimensions * sizeof(BiasType));
+          for (const auto index : removed_indices[perspective]) {
+            const IndexType offset = kHalfDimensions * index;
+#if defined(USE_AVX2)
+            auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = _mm256_sub_epi16(accumulation[j], column[j]);
+            }
+#elif defined(USE_SSE2)
+            auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]);
+            }
+#elif defined(IS_ARM)
+            auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = vsubq_s16(accumulation[j], column[j]);
+            }
+#else
+            for (IndexType j = 0; j < kHalfDimensions; ++j) {
+              accumulator.accumulation[perspective][i][j] -=
+                  weights_[offset + j];
+            }
+#endif
+          }
+        }
+        {  // 0から1に変化した特徴量に関する差分計算
+          for (const auto index : added_indices[perspective]) {
+            const IndexType offset = kHalfDimensions * index;
+#if defined(USE_AVX2)
+            auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
+            }
+#elif defined(USE_SSE2)
+            auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
+            }
+#elif defined(IS_ARM)
+            auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = vaddq_s16(accumulation[j], column[j]);
+            }
+#else
+            for (IndexType j = 0; j < kHalfDimensions; ++j) {
+              accumulator.accumulation[perspective][i][j] +=
+                  weights_[offset + j];
+            }
+#endif
+          }
+        }
+      }
+    }
+
+    accumulator.computed_accumulation = true;
+    accumulator.computed_score = false;
+  }
+
+  // パラメータの型
+  using BiasType = std::int16_t;
+  using WeightType = std::int16_t;
+
+  // 学習用クラスをfriendにする
+  friend class Trainer<FeatureTransformer>;
+
+  // パラメータ
+  alignas(kCacheLineSize) BiasType biases_[kHalfDimensions];
+  alignas(kCacheLineSize)
+      WeightType weights_[kHalfDimensions * kInputDimensions];
+};
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
diff --git a/src/eval/nnue/nnue_test_command.cpp b/src/eval/nnue/nnue_test_command.cpp
new file mode 100644
index 00000000..a2618b3b
--- /dev/null
+++ b/src/eval/nnue/nnue_test_command.cpp
@@ -0,0 +1,196 @@
+﻿// NNUE評価関数に関するUSI拡張コマンド
+
+#if defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
+
+#include "../../extra/all.h"
+#include "evaluate_nnue.h"
+#include "nnue_test_command.h"
+
+#include <set>
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace {
+
+// 主に差分計算に関するRawFeaturesのテスト
+void TestFeatures(Position& pos) {
+  const std::uint64_t num_games = 1000;
+  StateInfo si;
+  pos.set_hirate(&si,Threads.main());
+  const int MAX_PLY = 256; // 256手までテスト
+
+  StateInfo state[MAX_PLY]; // StateInfoを最大手数分だけ
+  int ply; // 初期局面からの手数
+
+  PRNG prng(20171128);
+
+  std::uint64_t num_moves = 0;
+  std::vector<std::uint64_t> num_updates(kRefreshTriggers.size() + 1);
+  std::vector<std::uint64_t> num_resets(kRefreshTriggers.size());
+  constexpr IndexType kUnknown = -1;
+  std::vector<IndexType> trigger_map(RawFeatures::kDimensions, kUnknown);
+  auto make_index_sets = [&](const Position& pos) {
+    std::vector<std::vector<std::set<IndexType>>> index_sets(
+        kRefreshTriggers.size(), std::vector<std::set<IndexType>>(2));
+    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+      Features::IndexList active_indices[2];
+      RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
+                                       active_indices);
+      for (const auto perspective : COLOR) {
+        for (const auto index : active_indices[perspective]) {
+          ASSERT(index < RawFeatures::kDimensions);
+          ASSERT(index_sets[i][perspective].count(index) == 0);
+          ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
+          index_sets[i][perspective].insert(index);
+          trigger_map[index] = i;
+        }
+      }
+    }
+    return index_sets;
+  };
+  auto update_index_sets = [&](const Position& pos, auto* index_sets) {
+    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+      Features::IndexList removed_indices[2], added_indices[2];
+      bool reset[2];
+      RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
+                                        removed_indices, added_indices, reset);
+      for (const auto perspective : COLOR) {
+        if (reset[perspective]) {
+          (*index_sets)[i][perspective].clear();
+          ++num_resets[i];
+        } else {
+          for (const auto index : removed_indices[perspective]) {
+            ASSERT(index < RawFeatures::kDimensions);
+            ASSERT((*index_sets)[i][perspective].count(index) == 1);
+            ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
+            (*index_sets)[i][perspective].erase(index);
+            ++num_updates.back();
+            ++num_updates[i];
+            trigger_map[index] = i;
+          }
+        }
+        for (const auto index : added_indices[perspective]) {
+          ASSERT(index < RawFeatures::kDimensions);
+          ASSERT((*index_sets)[i][perspective].count(index) == 0);
+          ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
+          (*index_sets)[i][perspective].insert(index);
+          ++num_updates.back();
+          ++num_updates[i];
+          trigger_map[index] = i;
+        }
+      }
+    }
+  };
+
+  std::cout << "feature set: " << RawFeatures::GetName()
+            << "[" << RawFeatures::kDimensions << "]" << std::endl;
+  std::cout << "start testing with random games";
+
+  for (std::uint64_t i = 0; i < num_games; ++i) {
+    auto index_sets = make_index_sets(pos);
+    for (ply = 0; ply < MAX_PLY; ++ply) {
+      MoveList<LEGAL_ALL> mg(pos); // 全合法手の生成
+
+      // 合法な指し手がなかった == 詰み
+      if (mg.size() == 0)
+        break;
+
+      // 生成された指し手のなかからランダムに選び、その指し手で局面を進める。
+      Move m = mg.begin()[prng.rand(mg.size())];
+      pos.do_move(m, state[ply]);
+
+      ++num_moves;
+      update_index_sets(pos, &index_sets);
+      ASSERT(index_sets == make_index_sets(pos));
+    }
+
+    pos.set_hirate(&si,Threads.main());
+
+    // 100回に1回ごとに'.'を出力(進んでいることがわかるように)
+    if ((i % 100) == 0)
+      std::cout << "." << std::flush;
+  }
+  std::cout << "passed." << std::endl;
+  std::cout << num_games << " games, " << num_moves << " moves, "
+            << num_updates.back() << " updates, "
+            << (1.0 * num_updates.back() / num_moves)
+            << " updates per move" << std::endl;
+  std::size_t num_observed_indices = 0;
+  for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+    const auto count = std::count(trigger_map.begin(), trigger_map.end(), i);
+    num_observed_indices += count;
+    std::cout << "TriggerEvent(" << static_cast<int>(kRefreshTriggers[i])
+              << "): " << count << " features ("
+              << (100.0 * count / RawFeatures::kDimensions) << "%), "
+              << num_updates[i] << " updates ("
+              << (1.0 * num_updates[i] / num_moves) << " per move), "
+              << num_resets[i] << " resets ("
+              << (100.0 * num_resets[i] / num_moves) << "%)"
+              << std::endl;
+  }
+  std::cout << "observed " << num_observed_indices << " ("
+            << (100.0 * num_observed_indices / RawFeatures::kDimensions)
+            << "% of " << RawFeatures::kDimensions
+            << ") features" << std::endl;
+}
+
+// 評価関数の構造を表す文字列を出力する
+void PrintInfo(std::istream& stream) {
+  std::cout << "network architecture: " << GetArchitectureString() << std::endl;
+
+  while (true) {
+    std::string file_name;
+    stream >> file_name;
+    if (file_name.empty()) break;
+
+    std::uint32_t hash_value;
+    std::string architecture;
+    const bool success = [&]() {
+      std::ifstream file_stream(file_name, std::ios::binary);
+      if (!file_stream) return false;
+      if (!ReadHeader(file_stream, &hash_value, &architecture)) return false;
+      return true;
+    }();
+
+    std::cout << file_name << ": ";
+    if (success) {
+      if (hash_value == kHashValue) {
+        std::cout << "matches with this binary";
+        if (architecture != GetArchitectureString()) {
+          std::cout << ", but architecture string differs: " << architecture;
+        }
+        std::cout << std::endl;
+      } else {
+        std::cout << architecture << std::endl;
+      }
+    } else {
+      std::cout << "failed to read header" << std::endl;
+    }
+  }
+}
+
+}  // namespace
+
+// NNUE評価関数に関するUSI拡張コマンド
+void TestCommand(Position& pos, std::istream& stream) {
+  std::string sub_command;
+  stream >> sub_command;
+
+  if (sub_command == "test_features") {
+    TestFeatures(pos);
+  } else if (sub_command == "info") {
+    PrintInfo(stream);
+  } else {
+    std::cout << "usage:" << std::endl;
+    std::cout << " test nn test_features" << std::endl;
+    std::cout << " test nn info [path/to/" << kFileName << "...]" << std::endl;
+  }
+}
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
diff --git a/src/eval/nnue/nnue_test_command.h b/src/eval/nnue/nnue_test_command.h
new file mode 100644
index 00000000..bf5894c9
--- /dev/null
+++ b/src/eval/nnue/nnue_test_command.h
@@ -0,0 +1,23 @@
+﻿// NNUE評価関数に関するUSI拡張コマンドのインターフェイス
+
+#ifndef _NNUE_TEST_COMMAND_H_
+#define _NNUE_TEST_COMMAND_H_
+
+#include "../../config.h"
+
+#if defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
+
+namespace Eval {
+
+namespace NNUE {
+
+// NNUE評価関数に関するUSI拡張コマンド
+void TestCommand(Position& pos, std::istream& stream);
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
+
+#endif
diff --git a/src/eval/nnue/trainer/features/factorizer.h b/src/eval/nnue/trainer/features/factorizer.h
new file mode 100644
index 00000000..e31c9976
--- /dev/null
+++ b/src/eval/nnue/trainer/features/factorizer.h
@@ -0,0 +1,112 @@
+﻿// NNUE評価関数の特徴量変換クラステンプレート
+
+#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_H_
+#define _NNUE_TRAINER_FEATURES_FACTORIZER_H_
+
+#include "../../../../config.h"
+
+#if defined(EVAL_NNUE)
+
+#include "../../nnue_common.h"
+#include "../trainer.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// 入力特徴量を学習用特徴量に変換するクラステンプレート
+// デフォルトでは学習用特徴量は元の入力特徴量と同じとし、必要に応じて特殊化する
+template <typename FeatureType>
+class Factorizer {
+ public:
+  // 学習用特徴量の次元数を取得する
+  static constexpr IndexType GetDimensions() {
+    return FeatureType::kDimensions;
+  }
+
+  // 学習用特徴量のインデックスと学習率のスケールを取得する
+  static void AppendTrainingFeatures(
+      IndexType base_index, std::vector<TrainingFeature>* training_features) {
+    ASSERT_LV5(base_index < FeatureType::kDimensions);
+    training_features->emplace_back(base_index);
+  }
+};
+
+// 学習用特徴量の情報
+struct FeatureProperties {
+  bool active;
+  IndexType dimensions;
+};
+
+// 元の入力特徴量を学習用特徴量に追加する
+template <typename FeatureType>
+IndexType AppendBaseFeature(
+    FeatureProperties properties, IndexType base_index,
+    std::vector<TrainingFeature>* training_features) {
+  ASSERT_LV5(properties.dimensions == FeatureType::kDimensions);
+  ASSERT_LV5(base_index < FeatureType::kDimensions);
+  training_features->emplace_back(base_index);
+  return properties.dimensions;
+}
+
+// 学習率のスケールが0でなければ他の種類の学習用特徴量を引き継ぐ
+template <typename FeatureType>
+IndexType InheritFeaturesIfRequired(
+    IndexType index_offset, FeatureProperties properties, IndexType base_index,
+    std::vector<TrainingFeature>* training_features) {
+  if (!properties.active) {
+    return 0;
+  }
+  ASSERT_LV5(properties.dimensions == Factorizer<FeatureType>::GetDimensions());
+  ASSERT_LV5(base_index < FeatureType::kDimensions);
+  const auto start = training_features->size();
+  Factorizer<FeatureType>::AppendTrainingFeatures(
+      base_index, training_features);
+  for (auto i = start; i < training_features->size(); ++i) {
+    auto& feature = (*training_features)[i];
+    ASSERT_LV5(feature.GetIndex() < Factorizer<FeatureType>::GetDimensions());
+    feature.ShiftIndex(index_offset);
+  }
+  return properties.dimensions;
+}
+
+// 学習用特徴量を追加せず、必要に応じてインデックスの差分を返す
+// 対応する特徴量がない場合にInheritFeaturesIfRequired()の代わりに呼ぶ
+IndexType SkipFeatures(FeatureProperties properties) {
+  if (!properties.active) {
+    return 0;
+  }
+  return properties.dimensions;
+}
+
+// 学習用特徴量の次元数を取得する
+template <std::size_t N>
+constexpr IndexType GetActiveDimensions(
+    const FeatureProperties (&properties)[N]) {
+  static_assert(N > 0, "");
+  IndexType dimensions = properties[0].dimensions;
+  for (std::size_t i = 1; i < N; ++i) {
+    if (properties[i].active) {
+      dimensions += properties[i].dimensions;
+    }
+  }
+  return dimensions;
+}
+
+// 配列の要素数を取得する
+template <typename T, std::size_t N>
+constexpr std::size_t GetArrayLength(const T (&/*array*/)[N]) {
+  return N;
+}
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
diff --git a/src/eval/nnue/trainer/features/factorizer_feature_set.h b/src/eval/nnue/trainer/features/factorizer_feature_set.h
new file mode 100644
index 00000000..e2db79b1
--- /dev/null
+++ b/src/eval/nnue/trainer/features/factorizer_feature_set.h
@@ -0,0 +1,106 @@
+﻿// NNUE評価関数の特徴量変換クラステンプレートのFeatureSet用特殊化
+
+#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
+#define _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
+
+#include "../../../../config.h"
+
+#if defined(EVAL_NNUE)
+
+#include "../../features/feature_set.h"
+#include "factorizer.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// 入力特徴量を学習用特徴量に変換するクラステンプレート
+// FeatureSet用特殊化
+template <typename FirstFeatureType, typename... RemainingFeatureTypes>
+class Factorizer<FeatureSet<FirstFeatureType, RemainingFeatureTypes...>> {
+ private:
+  using Head = Factorizer<FeatureSet<FirstFeatureType>>;
+  using Tail = Factorizer<FeatureSet<RemainingFeatureTypes...>>;
+
+ public:
+  // 元の入力特徴量の次元数
+  static constexpr IndexType kBaseDimensions =
+      FeatureSet<FirstFeatureType, RemainingFeatureTypes...>::kDimensions;
+
+  // 学習用特徴量の次元数を取得する
+  static constexpr IndexType GetDimensions() {
+    return Head::GetDimensions() + Tail::GetDimensions();
+  }
+
+  // 学習用特徴量のインデックスと学習率のスケールを取得する
+  static void AppendTrainingFeatures(
+      IndexType base_index, std::vector<TrainingFeature>* training_features,
+      IndexType base_dimensions = kBaseDimensions) {
+    ASSERT_LV5(base_index < kBaseDimensions);
+    constexpr auto boundary = FeatureSet<RemainingFeatureTypes...>::kDimensions;
+    if (base_index < boundary) {
+      Tail::AppendTrainingFeatures(
+          base_index, training_features, base_dimensions);
+    } else {
+      const auto start = training_features->size();
+      Head::AppendTrainingFeatures(
+          base_index - boundary, training_features, base_dimensions);
+      for (auto i = start; i < training_features->size(); ++i) {
+        auto& feature = (*training_features)[i];
+        const auto index = feature.GetIndex();
+        ASSERT_LV5(index < Head::GetDimensions() ||
+                   (index >= base_dimensions &&
+                    index < base_dimensions +
+                            Head::GetDimensions() - Head::kBaseDimensions));
+        if (index < Head::kBaseDimensions) {
+          feature.ShiftIndex(Tail::kBaseDimensions);
+        } else {
+          feature.ShiftIndex(Tail::GetDimensions() - Tail::kBaseDimensions);
+        }
+      }
+    }
+  }
+};
+
+// 入力特徴量を学習用特徴量に変換するクラステンプレート
+// FeatureSetのテンプレート引数が1つの場合の特殊化
+template <typename FeatureType>
+class Factorizer<FeatureSet<FeatureType>> {
+public:
+  // 元の入力特徴量の次元数
+  static constexpr IndexType kBaseDimensions = FeatureType::kDimensions;
+
+  // 学習用特徴量の次元数を取得する
+  static constexpr IndexType GetDimensions() {
+    return Factorizer<FeatureType>::GetDimensions();
+  }
+
+  // 学習用特徴量のインデックスと学習率のスケールを取得する
+  static void AppendTrainingFeatures(
+      IndexType base_index, std::vector<TrainingFeature>* training_features,
+      IndexType base_dimensions = kBaseDimensions) {
+    ASSERT_LV5(base_index < kBaseDimensions);
+    const auto start = training_features->size();
+    Factorizer<FeatureType>::AppendTrainingFeatures(
+        base_index, training_features);
+    for (auto i = start; i < training_features->size(); ++i) {
+      auto& feature = (*training_features)[i];
+      ASSERT_LV5(feature.GetIndex() < Factorizer<FeatureType>::GetDimensions());
+      if (feature.GetIndex() >= kBaseDimensions) {
+        feature.ShiftIndex(base_dimensions - kBaseDimensions);
+      }
+    }
+  }
+};
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
diff --git a/src/eval/nnue/trainer/features/factorizer_half_kp.h b/src/eval/nnue/trainer/features/factorizer_half_kp.h
new file mode 100644
index 00000000..20e4460e
--- /dev/null
+++ b/src/eval/nnue/trainer/features/factorizer_half_kp.h
@@ -0,0 +1,105 @@
+﻿// NNUE評価関数の特徴量変換クラステンプレートのHalfKP用特殊化
+
+#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
+#define _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
+
+#include "../../../../config.h"
+
+#if defined(EVAL_NNUE)
+
+#include "../../features/half_kp.h"
+#include "../../features/p.h"
+#include "../../features/half_relative_kp.h"
+#include "factorizer.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// 入力特徴量を学習用特徴量に変換するクラステンプレート
+// HalfKP用特殊化
+template <Side AssociatedKing>
+class Factorizer<HalfKP<AssociatedKing>> {
+ private:
+  using FeatureType = HalfKP<AssociatedKing>;
+
+  // 特徴量のうち、同時に値が1となるインデックスの数の最大値
+  static constexpr IndexType kMaxActiveDimensions =
+      FeatureType::kMaxActiveDimensions;
+
+  // 学習用特徴量の種類
+  enum TrainingFeatureType {
+    kFeaturesHalfKP,
+    kFeaturesHalfK,
+    kFeaturesP,
+    kFeaturesHalfRelativeKP,
+    kNumTrainingFeatureTypes,
+  };
+
+  // 学習用特徴量の情報
+  static constexpr FeatureProperties kProperties[] = {
+    // kFeaturesHalfKP
+    {true, FeatureType::kDimensions},
+    // kFeaturesHalfK
+    {true, SQ_NB},
+    // kFeaturesP
+    {true, Factorizer<P>::GetDimensions()},
+    // kFeaturesHalfRelativeKP
+    {true, Factorizer<HalfRelativeKP<AssociatedKing>>::GetDimensions()},
+  };
+  static_assert(GetArrayLength(kProperties) == kNumTrainingFeatureTypes, "");
+
+ public:
+  // 学習用特徴量の次元数を取得する
+  static constexpr IndexType GetDimensions() {
+    return GetActiveDimensions(kProperties);
+  }
+
+  // 学習用特徴量のインデックスと学習率のスケールを取得する
+  static void AppendTrainingFeatures(
+      IndexType base_index, std::vector<TrainingFeature>* training_features) {
+    // kFeaturesHalfKP
+    IndexType index_offset = AppendBaseFeature<FeatureType>(
+        kProperties[kFeaturesHalfKP], base_index, training_features);
+
+    const auto sq_k = static_cast<Square>(base_index / fe_end);
+    const auto p = static_cast<BonaPiece>(base_index % fe_end);
+    // kFeaturesHalfK
+    {
+      const auto& properties = kProperties[kFeaturesHalfK];
+      if (properties.active) {
+        training_features->emplace_back(index_offset + sq_k);
+        index_offset += properties.dimensions;
+      }
+    }
+    // kFeaturesP
+    index_offset += InheritFeaturesIfRequired<P>(
+        index_offset, kProperties[kFeaturesP], p, training_features);
+    // kFeaturesHalfRelativeKP
+    if (p >= fe_hand_end) {
+      index_offset += InheritFeaturesIfRequired<HalfRelativeKP<AssociatedKing>>(
+          index_offset, kProperties[kFeaturesHalfRelativeKP],
+          HalfRelativeKP<AssociatedKing>::MakeIndex(sq_k, p),
+          training_features);
+    } else {
+      index_offset += SkipFeatures(kProperties[kFeaturesHalfRelativeKP]);
+    }
+
+    ASSERT_LV5(index_offset == GetDimensions());
+  }
+};
+
+template <Side AssociatedKing>
+constexpr FeatureProperties Factorizer<HalfKP<AssociatedKing>>::kProperties[];
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
diff --git a/src/eval/nnue/trainer/trainer.h b/src/eval/nnue/trainer/trainer.h
new file mode 100644
index 00000000..1b322703
--- /dev/null
+++ b/src/eval/nnue/trainer/trainer.h
@@ -0,0 +1,127 @@
+﻿// NNUE評価関数の学習用クラステンプレートの共通ヘッダ
+
+#ifndef _NNUE_TRAINER_H_
+#define _NNUE_TRAINER_H_
+
+#include "../../../config.h"
+
+#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#include "../nnue_common.h"
+#include "../features/index_list.h"
+
+#include <sstream>
+#if defined(USE_BLAS)
+static_assert(std::is_same<LearnFloatType, float>::value, "");
+#include <cblas.h>
+#endif
+
+namespace Eval {
+
+namespace NNUE {
+
+// 評価値と勝率の関係式で用いるPonanza定数
+constexpr double kPonanzaConstant = 600.0;
+
+// 学習用特徴量のインデックス1つを表すクラス
+class TrainingFeature {
+  using StorageType = std::uint32_t;
+  static_assert(std::is_unsigned<StorageType>::value, "");
+
+ public:
+  static constexpr std::uint32_t kIndexBits = 24;
+  static_assert(kIndexBits < std::numeric_limits<StorageType>::digits, "");
+  static constexpr std::uint32_t kCountBits =
+      std::numeric_limits<StorageType>::digits - kIndexBits;
+
+  explicit TrainingFeature(IndexType index) :
+      index_and_count_((index << kCountBits) | 1) {
+    ASSERT_LV3(index < (1 << kIndexBits));
+  }
+  TrainingFeature& operator+=(const TrainingFeature& other) {
+    ASSERT_LV3(other.GetIndex() == GetIndex());
+    ASSERT_LV3(other.GetCount() + GetCount() < (1 << kCountBits));
+    index_and_count_ += other.GetCount();
+    return *this;
+  }
+  IndexType GetIndex() const {
+    return static_cast<IndexType>(index_and_count_ >> kCountBits);
+  }
+  void ShiftIndex(IndexType offset) {
+    ASSERT_LV3(GetIndex() + offset < (1 << kIndexBits));
+    index_and_count_ += offset << kCountBits;
+  }
+  IndexType GetCount() const {
+    return static_cast<IndexType>(index_and_count_ & ((1 << kCountBits) - 1));
+  }
+  bool operator<(const TrainingFeature& other) const {
+    return index_and_count_ < other.index_and_count_;
+  }
+
+ private:
+  StorageType index_and_count_;
+};
+
+// 学習データ1サンプルを表す構造体
+struct Example {
+  std::vector<TrainingFeature> training_features[2];
+  Learner::PackedSfenValue psv;
+  int sign;
+  double weight;
+};
+
+// ハイパーパラメータの設定などに使用するメッセージ
+struct Message {
+  Message(const std::string& name, const std::string& value = "") :
+      name(name), value(value), num_peekers(0), num_receivers(0) {}
+  const std::string name;
+  const std::string value;
+  std::uint32_t num_peekers;
+  std::uint32_t num_receivers;
+};
+
+// メッセージを受理するかどうかを判定する
+bool ReceiveMessage(const std::string& name, Message* message) {
+  const auto subscript = "[" + std::to_string(message->num_peekers) + "]";
+  if (message->name.substr(0, name.size() + 1) == name + "[") {
+    ++message->num_peekers;
+  }
+  if (message->name == name || message->name == name + subscript) {
+    ++message->num_receivers;
+    return true;
+  }
+  return false;
+}
+
+// 文字列を分割する
+std::vector<std::string> Split(const std::string& input, char delimiter) {
+  std::istringstream stream(input);
+  std::string field;
+  std::vector<std::string> fields;
+  while (std::getline(stream, field, delimiter)) {
+    fields.push_back(field);
+  }
+  return fields;
+}
+
+// 浮動小数点数を整数に丸める
+template <typename IntType>
+IntType Round(double value) {
+  return static_cast<IntType>(std::floor(value + 0.5));
+}
+
+// アライメント付きmake_shared
+template <typename T, typename... ArgumentTypes>
+std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments) {
+  const auto ptr = new(aligned_malloc(sizeof(T), alignof(T)))
+      T(std::forward<ArgumentTypes>(arguments)...);
+  return std::shared_ptr<T>(ptr, AlignedDeleter<T>());
+}
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#endif
diff --git a/src/eval/nnue/trainer/trainer_affine_transform.h b/src/eval/nnue/trainer/trainer_affine_transform.h
new file mode 100644
index 00000000..197beec3
--- /dev/null
+++ b/src/eval/nnue/trainer/trainer_affine_transform.h
@@ -0,0 +1,303 @@
+﻿// NNUE評価関数の学習クラステンプレートのAffineTransform用特殊化
+
+#ifndef _NNUE_TRAINER_AFFINE_TRANSFORM_H_
+#define _NNUE_TRAINER_AFFINE_TRANSFORM_H_
+
+#include "../../../config.h"
+
+#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#include "../../../learn/learn.h"
+#include "../layers/affine_transform.h"
+#include "trainer.h"
+
+#include <random>
+
+namespace Eval {
+
+namespace NNUE {
+
+// 学習：アフィン変換層
+template <typename PreviousLayer, IndexType OutputDimensions>
+class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
+ private:
+  // 学習対象の層の型
+  using LayerType = Layers::AffineTransform<PreviousLayer, OutputDimensions>;
+
+ public:
+  // ファクトリ関数
+  static std::shared_ptr<Trainer> Create(
+      LayerType* target_layer, FeatureTransformer* feature_transformer) {
+    return std::shared_ptr<Trainer>(
+        new Trainer(target_layer, feature_transformer));
+  }
+
+  // ハイパーパラメータなどのオプションを設定する
+  void SendMessage(Message* message) {
+    previous_layer_trainer_->SendMessage(message);
+    if (ReceiveMessage("momentum", message)) {
+      momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
+    }
+    if (ReceiveMessage("learning_rate_scale", message)) {
+      learning_rate_scale_ =
+          static_cast<LearnFloatType>(std::stod(message->value));
+    }
+    if (ReceiveMessage("reset", message)) {
+      DequantizeParameters();
+    }
+    if (ReceiveMessage("quantize_parameters", message)) {
+      QuantizeParameters();
+    }
+  }
+
+  // パラメータを乱数で初期化する
+  template <typename RNG>
+  void Initialize(RNG& rng) {
+    previous_layer_trainer_->Initialize(rng);
+    if (kIsOutputLayer) {
+      // 出力層は0で初期化する
+      std::fill(std::begin(biases_), std::end(biases_),
+                static_cast<LearnFloatType>(0.0));
+      std::fill(std::begin(weights_), std::end(weights_),
+                static_cast<LearnFloatType>(0.0));
+    } else {
+      // 入力の分布が各ユニット平均0.5、等分散であることを仮定し、
+      // 出力の分布が各ユニット平均0.5、入力と同じ等分散になるように初期化する
+      const double kSigma = 1.0 / std::sqrt(kInputDimensions);
+      auto distribution = std::normal_distribution<double>(0.0, kSigma);
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        double sum = 0.0;
+        for (IndexType j = 0; j < kInputDimensions; ++j) {
+          const auto weight = static_cast<LearnFloatType>(distribution(rng));
+          weights_[kInputDimensions * i + j] = weight;
+          sum += weight;
+        }
+        biases_[i] = static_cast<LearnFloatType>(0.5 - 0.5 * sum);
+      }
+    }
+    QuantizeParameters();
+  }
+
+  // 順伝播
+  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+    if (output_.size() < kOutputDimensions * batch.size()) {
+      output_.resize(kOutputDimensions * batch.size());
+      gradients_.resize(kInputDimensions * batch.size());
+    }
+    batch_size_ = static_cast<IndexType>(batch.size());
+    batch_input_ = previous_layer_trainer_->Propagate(batch);
+#if defined(USE_BLAS)
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      cblas_scopy(kOutputDimensions, biases_, 1, &output_[batch_offset], 1);
+    }
+    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
+                kOutputDimensions, batch_size_, kInputDimensions, 1.0,
+                weights_, kInputDimensions,
+                batch_input_, kInputDimensions,
+                1.0, &output_[0], kOutputDimensions);
+#else
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType input_batch_offset = kInputDimensions * b;
+      const IndexType output_batch_offset = kOutputDimensions * b;
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        double sum = biases_[i];
+        for (IndexType j = 0; j < kInputDimensions; ++j) {
+          const IndexType index = kInputDimensions * i + j;
+          sum += weights_[index] * batch_input_[input_batch_offset + j];
+        }
+        output_[output_batch_offset + i] = static_cast<LearnFloatType>(sum);
+      }
+    }
+#endif
+    return output_.data();
+  }
+
+  // 逆伝播
+  void Backpropagate(const LearnFloatType* gradients,
+                     LearnFloatType learning_rate) {
+    const LearnFloatType local_learning_rate =
+        learning_rate * learning_rate_scale_;
+#if defined(USE_BLAS)
+    // backpropagate
+    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+                kInputDimensions, batch_size_, kOutputDimensions, 1.0,
+                weights_, kInputDimensions,
+                gradients, kOutputDimensions,
+                0.0, &gradients_[0], kInputDimensions);
+    // update
+    cblas_sscal(kOutputDimensions, momentum_, biases_diff_, 1);
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      cblas_saxpy(kOutputDimensions, 1.0,
+                  &gradients[batch_offset], 1, biases_diff_, 1);
+    }
+    cblas_saxpy(kOutputDimensions, -local_learning_rate,
+                biases_diff_, 1, biases_, 1);
+    cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
+                kOutputDimensions, kInputDimensions, batch_size_, 1.0,
+                gradients, kOutputDimensions,
+                batch_input_, kInputDimensions,
+                momentum_, weights_diff_, kInputDimensions);
+    cblas_saxpy(kOutputDimensions * kInputDimensions, -local_learning_rate,
+                weights_diff_, 1, weights_, 1);
+#else
+    // backpropagate
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType input_batch_offset = kInputDimensions * b;
+      const IndexType output_batch_offset = kOutputDimensions * b;
+      for (IndexType j = 0; j < kInputDimensions; ++j) {
+        double sum = 0.0;
+        for (IndexType i = 0; i < kOutputDimensions; ++i) {
+          const IndexType index = kInputDimensions * i + j;
+          sum += weights_[index] * gradients[output_batch_offset + i];
+        }
+        gradients_[input_batch_offset + j] = static_cast<LearnFloatType>(sum);
+      }
+    }
+    // update
+    for (IndexType i = 0; i < kOutputDimensions; ++i) {
+      biases_diff_[i] *= momentum_;
+    }
+    for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
+      weights_diff_[i] *= momentum_;
+    }
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType input_batch_offset = kInputDimensions * b;
+      const IndexType output_batch_offset = kOutputDimensions * b;
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        biases_diff_[i] += gradients[output_batch_offset + i];
+      }
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        for (IndexType j = 0; j < kInputDimensions; ++j) {
+          const IndexType index = kInputDimensions * i + j;
+          weights_diff_[index] += gradients[output_batch_offset + i] *
+              batch_input_[input_batch_offset + j];
+        }
+      }
+    }
+    for (IndexType i = 0; i < kOutputDimensions; ++i) {
+      biases_[i] -= local_learning_rate * biases_diff_[i];
+    }
+    for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
+      weights_[i] -= local_learning_rate * weights_diff_[i];
+    }
+#endif
+    previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate);
+  }
+
+ private:
+  // コンストラクタ
+  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer) :
+      batch_size_(0),
+      batch_input_(nullptr),
+      previous_layer_trainer_(Trainer<PreviousLayer>::Create(
+          &target_layer->previous_layer_, feature_transformer)),
+      target_layer_(target_layer),
+      biases_(),
+      weights_(),
+      biases_diff_(),
+      weights_diff_(),
+      momentum_(0.0),
+      learning_rate_scale_(1.0) {
+    DequantizeParameters();
+  }
+
+  // 重みの飽和とパラメータの整数化
+  void QuantizeParameters() {
+    for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
+      weights_[i] = std::max(-kMaxWeightMagnitude,
+                             std::min(+kMaxWeightMagnitude, weights_[i]));
+    }
+    for (IndexType i = 0; i < kOutputDimensions; ++i) {
+      target_layer_->biases_[i] =
+          Round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
+    }
+    for (IndexType i = 0; i < kOutputDimensions; ++i) {
+      const auto offset = kInputDimensions * i;
+      const auto padded_offset = LayerType::kPaddedInputDimensions * i;
+      for (IndexType j = 0; j < kInputDimensions; ++j) {
+        target_layer_->weights_[padded_offset + j] =
+            Round<typename LayerType::WeightType>(
+                weights_[offset + j] * kWeightScale);
+      }
+    }
+  }
+
+  // 整数化されたパラメータの読み込み
+  void DequantizeParameters() {
+    for (IndexType i = 0; i < kOutputDimensions; ++i) {
+      biases_[i] = static_cast<LearnFloatType>(
+          target_layer_->biases_[i] / kBiasScale);
+    }
+    for (IndexType i = 0; i < kOutputDimensions; ++i) {
+      const auto offset = kInputDimensions * i;
+      const auto padded_offset = LayerType::kPaddedInputDimensions * i;
+      for (IndexType j = 0; j < kInputDimensions; ++j) {
+        weights_[offset + j] = static_cast<LearnFloatType>(
+            target_layer_->weights_[padded_offset + j] / kWeightScale);
+      }
+    }
+    std::fill(std::begin(biases_diff_), std::end(biases_diff_),
+              static_cast<LearnFloatType>(0.0));
+    std::fill(std::begin(weights_diff_), std::end(weights_diff_),
+              static_cast<LearnFloatType>(0.0));
+  }
+
+  // 入出力の次元数
+  static constexpr IndexType kInputDimensions = LayerType::kInputDimensions;
+  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+
+  // 出力の次元数が1なら出力層
+  static constexpr bool kIsOutputLayer = kOutputDimensions == 1;
+
+  // パラメータの整数化で用いる係数
+  static constexpr LearnFloatType kActivationScale =
+      std::numeric_limits<std::int8_t>::max();
+  static constexpr LearnFloatType kBiasScale = kIsOutputLayer ?
+      (kPonanzaConstant * FV_SCALE) :
+      ((1 << kWeightScaleBits) * kActivationScale);
+  static constexpr LearnFloatType kWeightScale = kBiasScale / kActivationScale;
+
+  // パラメータの整数化でオーバーフローさせないために用いる重みの絶対値の上限
+  static constexpr LearnFloatType kMaxWeightMagnitude =
+      std::numeric_limits<typename LayerType::WeightType>::max() / kWeightScale;
+
+  // ミニバッチのサンプル数
+  IndexType batch_size_;
+
+  // ミニバッチの入力
+  const LearnFloatType* batch_input_;
+
+  // 直前の層のTrainer
+  const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+
+  // 学習対象の層
+  LayerType* const target_layer_;
+
+  // パラメータ
+  LearnFloatType biases_[kOutputDimensions];
+  LearnFloatType weights_[kOutputDimensions * kInputDimensions];
+
+  // パラメータの更新で用いるバッファ
+  LearnFloatType biases_diff_[kOutputDimensions];
+  LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions];
+
+  // 順伝播用バッファ
+  std::vector<LearnFloatType> output_;
+
+  // 逆伝播用バッファ
+  std::vector<LearnFloatType> gradients_;
+
+  // ハイパーパラメータ
+  LearnFloatType momentum_;
+  LearnFloatType learning_rate_scale_;
+};
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#endif
diff --git a/src/eval/nnue/trainer/trainer_clipped_relu.h b/src/eval/nnue/trainer/trainer_clipped_relu.h
new file mode 100644
index 00000000..d7cc96e6
--- /dev/null
+++ b/src/eval/nnue/trainer/trainer_clipped_relu.h
@@ -0,0 +1,144 @@
+﻿// NNUE評価関数の学習クラステンプレートのClippedReLU用特殊化
+
+#ifndef _NNUE_TRAINER_CLIPPED_RELU_H_
+#define _NNUE_TRAINER_CLIPPED_RELU_H_
+
+#include "../../../config.h"
+
+#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#include "../../../learn/learn.h"
+#include "../layers/clipped_relu.h"
+#include "trainer.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+// 学習：アフィン変換層
+template <typename PreviousLayer>
+class Trainer<Layers::ClippedReLU<PreviousLayer>> {
+ private:
+  // 学習対象の層の型
+  using LayerType = Layers::ClippedReLU<PreviousLayer>;
+
+ public:
+  // ファクトリ関数
+  static std::shared_ptr<Trainer> Create(
+      LayerType* target_layer, FeatureTransformer* feature_transformer) {
+    return std::shared_ptr<Trainer>(
+        new Trainer(target_layer, feature_transformer));
+  }
+
+  // ハイパーパラメータなどのオプションを設定する
+  void SendMessage(Message* message) {
+    previous_layer_trainer_->SendMessage(message);
+    if (ReceiveMessage("check_health", message)) {
+      CheckHealth();
+    }
+  }
+
+  // パラメータを乱数で初期化する
+  template <typename RNG>
+  void Initialize(RNG& rng) {
+    previous_layer_trainer_->Initialize(rng);
+  }
+
+  // 順伝播
+  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+    if (output_.size() < kOutputDimensions * batch.size()) {
+      output_.resize(kOutputDimensions * batch.size());
+      gradients_.resize(kInputDimensions * batch.size());
+    }
+    const auto input = previous_layer_trainer_->Propagate(batch);
+    batch_size_ = static_cast<IndexType>(batch.size());
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        const IndexType index = batch_offset + i;
+        output_[index] = std::max(+kZero, std::min(+kOne, input[index]));
+        min_activations_[i] = std::min(min_activations_[i], output_[index]);
+        max_activations_[i] = std::max(max_activations_[i], output_[index]);
+      }
+    }
+    return output_.data();
+  }
+
+  // 逆伝播
+  void Backpropagate(const LearnFloatType* gradients,
+                     LearnFloatType learning_rate) {
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        const IndexType index = batch_offset + i;
+        gradients_[index] = gradients[index] *
+            (output_[index] > kZero) * (output_[index] < kOne);
+      }
+    }
+    previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate);
+  }
+
+ private:
+  // コンストラクタ
+  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer) :
+      batch_size_(0),
+      previous_layer_trainer_(Trainer<PreviousLayer>::Create(
+          &target_layer->previous_layer_, feature_transformer)),
+      target_layer_(target_layer) {
+    std::fill(std::begin(min_activations_), std::end(min_activations_),
+              std::numeric_limits<LearnFloatType>::max());
+    std::fill(std::begin(max_activations_), std::end(max_activations_),
+              std::numeric_limits<LearnFloatType>::lowest());
+  }
+
+  // 学習に問題が生じていないかチェックする
+  void CheckHealth() {
+    const auto largest_min_activation = *std::max_element(
+        std::begin(min_activations_), std::end(min_activations_));
+    const auto smallest_max_activation = *std::min_element(
+        std::begin(max_activations_), std::end(max_activations_));
+    std::cout << "INFO: largest min activation = " << largest_min_activation
+              << ", smallest max activation = " << smallest_max_activation
+              << std::endl;
+
+    std::fill(std::begin(min_activations_), std::end(min_activations_),
+              std::numeric_limits<LearnFloatType>::max());
+    std::fill(std::begin(max_activations_), std::end(max_activations_),
+              std::numeric_limits<LearnFloatType>::lowest());
+  }
+
+  // 入出力の次元数
+  static constexpr IndexType kInputDimensions = LayerType::kOutputDimensions;
+  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+
+  // LearnFloatTypeの定数
+  static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
+  static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
+
+  // ミニバッチのサンプル数
+  IndexType batch_size_;
+
+  // 直前の層のTrainer
+  const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+
+  // 学習対象の層
+  LayerType* const target_layer_;
+
+  // 順伝播用バッファ
+  std::vector<LearnFloatType> output_;
+
+  // 逆伝播用バッファ
+  std::vector<LearnFloatType> gradients_;
+
+  // ヘルスチェック用統計値
+  LearnFloatType min_activations_[kOutputDimensions];
+  LearnFloatType max_activations_[kOutputDimensions];
+};
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#endif
diff --git a/src/eval/nnue/trainer/trainer_feature_transformer.h b/src/eval/nnue/trainer/trainer_feature_transformer.h
new file mode 100644
index 00000000..ff4da717
--- /dev/null
+++ b/src/eval/nnue/trainer/trainer_feature_transformer.h
@@ -0,0 +1,379 @@
+﻿// NNUE評価関数の学習クラステンプレートのFeatureTransformer用特殊化
+
+#ifndef _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
+#define _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
+
+#include "../../../config.h"
+
+#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#include "../../../learn/learn.h"
+#include "../nnue_feature_transformer.h"
+#include "trainer.h"
+#include "features/factorizer_feature_set.h"
+
+#include <array>
+#include <bitset>
+#include <numeric>
+#include <random>
+#include <set>
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+namespace Eval {
+
+namespace NNUE {
+
+// 学習：入力特徴量変換器
+template <>
+class Trainer<FeatureTransformer> {
+ private:
+  // 学習対象の層の型
+  using LayerType = FeatureTransformer;
+
+ public:
+  template <typename T>
+  friend struct AlignedDeleter;
+  template <typename T, typename... ArgumentTypes>
+  friend std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments);
+
+  // ファクトリ関数
+  static std::shared_ptr<Trainer> Create(LayerType* target_layer) {
+    return MakeAlignedSharedPtr<Trainer>(target_layer);
+  }
+
+  // ハイパーパラメータなどのオプションを設定する
+  void SendMessage(Message* message) {
+    if (ReceiveMessage("momentum", message)) {
+      momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
+    }
+    if (ReceiveMessage("learning_rate_scale", message)) {
+      learning_rate_scale_ =
+          static_cast<LearnFloatType>(std::stod(message->value));
+    }
+    if (ReceiveMessage("reset", message)) {
+      DequantizeParameters();
+    }
+    if (ReceiveMessage("quantize_parameters", message)) {
+      QuantizeParameters();
+    }
+    if (ReceiveMessage("clear_unobserved_feature_weights", message)) {
+      ClearUnobservedFeatureWeights();
+    }
+    if (ReceiveMessage("check_health", message)) {
+      CheckHealth();
+    }
+  }
+
+  // パラメータを乱数で初期化する
+  template <typename RNG>
+  void Initialize(RNG& rng) {
+    std::fill(std::begin(weights_), std::end(weights_), +kZero);
+    const double kSigma = 0.1 / std::sqrt(RawFeatures::kMaxActiveDimensions);
+    auto distribution = std::normal_distribution<double>(0.0, kSigma);
+    for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) {
+      const auto weight = static_cast<LearnFloatType>(distribution(rng));
+      weights_[i] = weight;
+    }
+    for (IndexType i = 0; i < kHalfDimensions; ++i) {
+      biases_[i] = static_cast<LearnFloatType>(0.5);
+    }
+    QuantizeParameters();
+  }
+
+  // 順伝播
+  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+    if (output_.size() < kOutputDimensions * batch.size()) {
+      output_.resize(kOutputDimensions * batch.size());
+      gradients_.resize(kOutputDimensions * batch.size());
+    }
+    batch_ = &batch;
+    // affine transform
+#pragma omp parallel for
+    for (IndexType b = 0; b < batch.size(); ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      for (IndexType c = 0; c < 2; ++c) {
+        const IndexType output_offset = batch_offset + kHalfDimensions * c;
+#if defined(USE_BLAS)
+        cblas_scopy(kHalfDimensions, biases_, 1, &output_[output_offset], 1);
+        for (const auto& feature : batch[b].training_features[c]) {
+          const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
+          cblas_saxpy(kHalfDimensions, (float)feature.GetCount(),
+                      &weights_[weights_offset], 1, &output_[output_offset], 1);
+        }
+#else
+        for (IndexType i = 0; i < kHalfDimensions; ++i) {
+          output_[output_offset + i] = biases_[i];
+        }
+        for (const auto& feature : batch[b].training_features[c]) {
+          const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
+          for (IndexType i = 0; i < kHalfDimensions; ++i) {
+            output_[output_offset + i] +=
+                feature.GetCount() * weights_[weights_offset + i];
+          }
+        }
+#endif
+      }
+    }
+    // clipped ReLU
+    for (IndexType b = 0; b < batch.size(); ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        const IndexType index = batch_offset + i;
+        min_pre_activation_ = std::min(min_pre_activation_, output_[index]);
+        max_pre_activation_ = std::max(max_pre_activation_, output_[index]);
+        output_[index] = std::max(+kZero, std::min(+kOne, output_[index]));
+        const IndexType t = i % kHalfDimensions;
+        min_activations_[t] = std::min(min_activations_[t], output_[index]);
+        max_activations_[t] = std::max(max_activations_[t], output_[index]);
+      }
+    }
+    return output_.data();
+  }
+
+  // 逆伝播
+  void Backpropagate(const LearnFloatType* gradients,
+                     LearnFloatType learning_rate) {
+    const LearnFloatType local_learning_rate =
+        learning_rate * learning_rate_scale_;
+    for (IndexType b = 0; b < batch_->size(); ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        const IndexType index = batch_offset + i;
+        gradients_[index] = gradients[index] *
+            ((output_[index] > kZero) * (output_[index] < kOne));
+      }
+    }
+    // 重み行列は入力に出現した特徴量に対応する列のみを更新するため、
+    // momentumを使用せず、学習率を補正してスケールを合わせる
+    const LearnFloatType effective_learning_rate =
+        static_cast<LearnFloatType>(local_learning_rate / (1.0 - momentum_));
+#if defined(USE_BLAS)
+    cblas_sscal(kHalfDimensions, momentum_, biases_diff_, 1);
+    for (IndexType b = 0; b < batch_->size(); ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      for (IndexType c = 0; c < 2; ++c) {
+        const IndexType output_offset = batch_offset + kHalfDimensions * c;
+        cblas_saxpy(kHalfDimensions, 1.0,
+                    &gradients_[output_offset], 1, biases_diff_, 1);
+      }
+    }
+    cblas_saxpy(kHalfDimensions, -local_learning_rate,
+                biases_diff_, 1, biases_, 1);
+#pragma omp parallel
+    {
+#if defined(_OPENMP)
+      const IndexType num_threads = omp_get_num_threads();
+      const IndexType thread_index = omp_get_thread_num();
+#endif
+      for (IndexType b = 0; b < batch_->size(); ++b) {
+        const IndexType batch_offset = kOutputDimensions * b;
+        for (IndexType c = 0; c < 2; ++c) {
+          const IndexType output_offset = batch_offset + kHalfDimensions * c;
+          for (const auto& feature : (*batch_)[b].training_features[c]) {
+#if defined(_OPENMP)
+            if (feature.GetIndex() % num_threads != thread_index) continue;
+#endif
+            const IndexType weights_offset =
+                kHalfDimensions * feature.GetIndex();
+            const auto scale = static_cast<LearnFloatType>(
+                effective_learning_rate / feature.GetCount());
+            cblas_saxpy(kHalfDimensions, -scale,
+                        &gradients_[output_offset], 1,
+                        &weights_[weights_offset], 1);
+          }
+        }
+      }
+    }
+#else
+    for (IndexType i = 0; i < kHalfDimensions; ++i) {
+      biases_diff_[i] *= momentum_;
+    }
+    for (IndexType b = 0; b < batch_->size(); ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      for (IndexType c = 0; c < 2; ++c) {
+        const IndexType output_offset = batch_offset + kHalfDimensions * c;
+        for (IndexType i = 0; i < kHalfDimensions; ++i) {
+          biases_diff_[i] += gradients_[output_offset + i];
+        }
+      }
+    }
+    for (IndexType i = 0; i < kHalfDimensions; ++i) {
+      biases_[i] -= local_learning_rate * biases_diff_[i];
+    }
+    for (IndexType b = 0; b < batch_->size(); ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      for (IndexType c = 0; c < 2; ++c) {
+        const IndexType output_offset = batch_offset + kHalfDimensions * c;
+        for (const auto& feature : (*batch_)[b].training_features[c]) {
+          const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
+          const auto scale = static_cast<LearnFloatType>(
+              effective_learning_rate / feature.GetCount());
+          for (IndexType i = 0; i < kHalfDimensions; ++i) {
+            weights_[weights_offset + i] -=
+                scale * gradients_[output_offset + i];
+          }
+        }
+      }
+    }
+#endif
+    for (IndexType b = 0; b < batch_->size(); ++b) {
+      for (IndexType c = 0; c < 2; ++c) {
+        for (const auto& feature : (*batch_)[b].training_features[c]) {
+          observed_features.set(feature.GetIndex());
+        }
+      }
+    }
+  }
+
+ private:
+  // コンストラクタ
+  Trainer(LayerType* target_layer) :
+      batch_(nullptr),
+      target_layer_(target_layer),
+      biases_(),
+      weights_(),
+      biases_diff_(),
+      momentum_(0.0),
+      learning_rate_scale_(1.0) {
+    min_pre_activation_ = std::numeric_limits<LearnFloatType>::max();
+    max_pre_activation_ = std::numeric_limits<LearnFloatType>::lowest();
+    std::fill(std::begin(min_activations_), std::end(min_activations_),
+              std::numeric_limits<LearnFloatType>::max());
+    std::fill(std::begin(max_activations_), std::end(max_activations_),
+              std::numeric_limits<LearnFloatType>::lowest());
+    DequantizeParameters();
+  }
+
+  // 重みの飽和とパラメータの整数化
+  void QuantizeParameters() {
+    for (IndexType i = 0; i < kHalfDimensions; ++i) {
+      target_layer_->biases_[i] =
+          Round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
+    }
+    std::vector<TrainingFeature> training_features;
+#pragma omp parallel for private(training_features)
+    for (IndexType j = 0; j < RawFeatures::kDimensions; ++j) {
+      training_features.clear();
+      Features::Factorizer<RawFeatures>::AppendTrainingFeatures(
+          j, &training_features);
+      for (IndexType i = 0; i < kHalfDimensions; ++i) {
+        double sum = 0.0;
+        for (const auto& feature : training_features) {
+          sum += weights_[kHalfDimensions * feature.GetIndex() + i];
+        }
+        target_layer_->weights_[kHalfDimensions * j + i] =
+            Round<typename LayerType::WeightType>(sum * kWeightScale);
+      }
+    }
+  }
+
+  // 整数化されたパラメータの読み込み
+  void DequantizeParameters() {
+    for (IndexType i = 0; i < kHalfDimensions; ++i) {
+      biases_[i] = static_cast<LearnFloatType>(
+          target_layer_->biases_[i] / kBiasScale);
+    }
+    std::fill(std::begin(weights_), std::end(weights_), +kZero);
+    for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) {
+      weights_[i] = static_cast<LearnFloatType>(
+          target_layer_->weights_[i] / kWeightScale);
+    }
+    std::fill(std::begin(biases_diff_), std::end(biases_diff_), +kZero);
+  }
+
+  // 学習データに出現していない特徴量に対応する重みを0にする
+  void ClearUnobservedFeatureWeights() {
+    for (IndexType i = 0; i < kInputDimensions; ++i) {
+      if (!observed_features.test(i)) {
+        std::fill(std::begin(weights_) + kHalfDimensions * i,
+                  std::begin(weights_) + kHalfDimensions * (i + 1), +kZero);
+      }
+    }
+    QuantizeParameters();
+  }
+
+  // 学習に問題が生じていないかチェックする
+  void CheckHealth() {
+    std::cout << "INFO: observed " << observed_features.count()
+              << " (out of " << kInputDimensions << ") features" << std::endl;
+
+    constexpr LearnFloatType kPreActivationLimit =
+        std::numeric_limits<typename LayerType::WeightType>::max() /
+        kWeightScale;
+    std::cout << "INFO: (min, max) of pre-activations = "
+              << min_pre_activation_ << ", "
+              << max_pre_activation_ << " (limit = "
+              << kPreActivationLimit << ")" << std::endl;
+
+    const auto largest_min_activation = *std::max_element(
+        std::begin(min_activations_), std::end(min_activations_));
+    const auto smallest_max_activation = *std::min_element(
+        std::begin(max_activations_), std::end(max_activations_));
+    std::cout << "INFO: largest min activation = " << largest_min_activation
+              << ", smallest max activation = " << smallest_max_activation
+              << std::endl;
+
+    std::fill(std::begin(min_activations_), std::end(min_activations_),
+              std::numeric_limits<LearnFloatType>::max());
+    std::fill(std::begin(max_activations_), std::end(max_activations_),
+              std::numeric_limits<LearnFloatType>::lowest());
+  }
+
+  // 入出力の次元数
+  static constexpr IndexType kInputDimensions =
+      Features::Factorizer<RawFeatures>::GetDimensions();
+  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+  static constexpr IndexType kHalfDimensions = LayerType::kHalfDimensions;
+
+  // パラメータの整数化で用いる係数
+  static constexpr LearnFloatType kActivationScale =
+      std::numeric_limits<std::int8_t>::max();
+  static constexpr LearnFloatType kBiasScale = kActivationScale;
+  static constexpr LearnFloatType kWeightScale = kActivationScale;
+
+  // LearnFloatTypeの定数
+  static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
+  static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
+
+  // ミニバッチ
+  const std::vector<Example>* batch_;
+
+  // 学習対象の層
+  LayerType* const target_layer_;
+
+  // パラメータ
+  alignas(kCacheLineSize) LearnFloatType biases_[kHalfDimensions];
+  alignas(kCacheLineSize)
+      LearnFloatType weights_[kHalfDimensions * kInputDimensions];
+
+  // パラメータの更新で用いるバッファ
+  LearnFloatType biases_diff_[kHalfDimensions];
+  std::vector<LearnFloatType> gradients_;
+
+  // 順伝播用バッファ
+  std::vector<LearnFloatType> output_;
+
+  // 学習データに出現した特徴量
+  std::bitset<kInputDimensions> observed_features;
+
+  // ハイパーパラメータ
+  LearnFloatType momentum_;
+  LearnFloatType learning_rate_scale_;
+
+  // ヘルスチェック用統計値
+  LearnFloatType min_pre_activation_;
+  LearnFloatType max_pre_activation_;
+  LearnFloatType min_activations_[kHalfDimensions];
+  LearnFloatType max_activations_[kHalfDimensions];
+};
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#endif
diff --git a/src/eval/nnue/trainer/trainer_input_slice.h b/src/eval/nnue/trainer/trainer_input_slice.h
new file mode 100644
index 00000000..c6df775f
--- /dev/null
+++ b/src/eval/nnue/trainer/trainer_input_slice.h
@@ -0,0 +1,253 @@
+﻿// NNUE評価関数の学習クラステンプレートのInputSlice用特殊化
+
+#ifndef _NNUE_TRAINER_INPUT_SLICE_H_
+#define _NNUE_TRAINER_INPUT_SLICE_H_
+
+#include "../../../config.h"
+
+#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#include "../../../learn/learn.h"
+#include "../layers/input_slice.h"
+#include "trainer.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+// 学習：入力層
+class SharedInputTrainer {
+ public:
+  // ファクトリ関数
+  static std::shared_ptr<SharedInputTrainer> Create(
+      FeatureTransformer* feature_transformer) {
+    static std::shared_ptr<SharedInputTrainer> instance;
+    if (!instance) {
+      instance.reset(new SharedInputTrainer(feature_transformer));
+    }
+    ++instance->num_referrers_;
+    return instance;
+  }
+
+  // ハイパーパラメータなどのオプションを設定する
+  void SendMessage(Message* message) {
+    if (num_calls_ == 0) {
+      current_operation_ = Operation::kSendMessage;
+      feature_transformer_trainer_->SendMessage(message);
+    }
+    ASSERT_LV3(current_operation_ == Operation::kSendMessage);
+    if (++num_calls_ == num_referrers_) {
+      num_calls_ = 0;
+      current_operation_ = Operation::kNone;
+    }
+  }
+
+  // パラメータを乱数で初期化する
+  template <typename RNG>
+  void Initialize(RNG& rng) {
+    if (num_calls_ == 0) {
+      current_operation_ = Operation::kInitialize;
+      feature_transformer_trainer_->Initialize(rng);
+    }
+    ASSERT_LV3(current_operation_ == Operation::kInitialize);
+    if (++num_calls_ == num_referrers_) {
+      num_calls_ = 0;
+      current_operation_ = Operation::kNone;
+    }
+  }
+
+  // 順伝播
+  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+    if (gradients_.size() < kInputDimensions * batch.size()) {
+      gradients_.resize(kInputDimensions * batch.size());
+    }
+    batch_size_ = static_cast<IndexType>(batch.size());
+    if (num_calls_ == 0) {
+      current_operation_ = Operation::kPropagate;
+      output_ = feature_transformer_trainer_->Propagate(batch);
+    }
+    ASSERT_LV3(current_operation_ == Operation::kPropagate);
+    if (++num_calls_ == num_referrers_) {
+      num_calls_ = 0;
+      current_operation_ = Operation::kNone;
+    }
+    return output_;
+  }
+
+  // 逆伝播
+  void Backpropagate(const LearnFloatType* gradients,
+                     LearnFloatType learning_rate) {
+    if (num_referrers_ == 1) {
+      feature_transformer_trainer_->Backpropagate(gradients, learning_rate);
+      return;
+    }
+    if (num_calls_ == 0) {
+      current_operation_ = Operation::kBackPropagate;
+      for (IndexType b = 0; b < batch_size_; ++b) {
+        const IndexType batch_offset = kInputDimensions * b;
+        for (IndexType i = 0; i < kInputDimensions; ++i) {
+          gradients_[batch_offset + i] = static_cast<LearnFloatType>(0.0);
+        }
+      }
+    }
+    ASSERT_LV3(current_operation_ == Operation::kBackPropagate);
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType batch_offset = kInputDimensions * b;
+      for (IndexType i = 0; i < kInputDimensions; ++i) {
+        gradients_[batch_offset + i] += gradients[batch_offset + i];
+      }
+    }
+    if (++num_calls_ == num_referrers_) {
+      feature_transformer_trainer_->Backpropagate(
+          gradients_.data(), learning_rate);
+      num_calls_ = 0;
+      current_operation_ = Operation::kNone;
+    }
+  }
+
+ private:
+  // コンストラクタ
+  SharedInputTrainer(FeatureTransformer* feature_transformer) :
+      batch_size_(0),
+      num_referrers_(0),
+      num_calls_(0),
+      current_operation_(Operation::kNone),
+      feature_transformer_trainer_(Trainer<FeatureTransformer>::Create(
+          feature_transformer)),
+      output_(nullptr) {
+  }
+
+  // 入出力の次元数
+  static constexpr IndexType kInputDimensions =
+      FeatureTransformer::kOutputDimensions;
+
+  // 処理の種類
+  enum class Operation {
+    kNone,
+    kSendMessage,
+    kInitialize,
+    kPropagate,
+    kBackPropagate,
+  };
+
+  // ミニバッチのサンプル数
+  IndexType batch_size_;
+
+  // この層を入力として共有する層の数
+  std::uint32_t num_referrers_;
+
+  // 現在の処理が呼び出された回数
+  std::uint32_t num_calls_;
+
+  // 現在の処理の種類
+  Operation current_operation_;
+
+  // 入力特徴量変換器のTrainer
+  const std::shared_ptr<Trainer<FeatureTransformer>>
+      feature_transformer_trainer_;
+
+  // 順伝播用に共有する出力のポインタ
+  const LearnFloatType* output_;
+
+  // 逆伝播用バッファ
+  std::vector<LearnFloatType> gradients_;
+};
+
+// 学習：入力層
+template <IndexType OutputDimensions, IndexType Offset>
+class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
+ private:
+  // 学習対象の層の型
+  using LayerType = Layers::InputSlice<OutputDimensions, Offset>;
+
+ public:
+  // ファクトリ関数
+  static std::shared_ptr<Trainer> Create(
+      LayerType* /*target_layer*/, FeatureTransformer* feature_transformer) {
+    return std::shared_ptr<Trainer>(new Trainer(feature_transformer));
+  }
+
+  // ハイパーパラメータなどのオプションを設定する
+  void SendMessage(Message* message) {
+    shared_input_trainer_->SendMessage(message);
+  }
+
+  // パラメータを乱数で初期化する
+  template <typename RNG>
+  void Initialize(RNG& rng) {
+    shared_input_trainer_->Initialize(rng);
+  }
+
+  // 順伝播
+  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+    if (output_.size() < kOutputDimensions * batch.size()) {
+      output_.resize(kOutputDimensions * batch.size());
+      gradients_.resize(kInputDimensions * batch.size());
+    }
+    batch_size_ = static_cast<IndexType>(batch.size());
+    const auto input = shared_input_trainer_->Propagate(batch);
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType input_offset = kInputDimensions * b;
+      const IndexType output_offset = kOutputDimensions * b;
+#if defined(USE_BLAS)
+      cblas_scopy(kOutputDimensions, &input[input_offset + Offset], 1,
+                  &output_[output_offset], 1);
+#else
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        output_[output_offset + i] = input[input_offset + Offset + i];
+      }
+#endif
+    }
+    return output_.data();
+  }
+
+  // 逆伝播
+  void Backpropagate(const LearnFloatType* gradients,
+                     LearnFloatType learning_rate) {
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType input_offset = kInputDimensions * b;
+      const IndexType output_offset = kOutputDimensions * b;
+      for (IndexType i = 0; i < kInputDimensions; ++i) {
+        if (i < Offset || i >= Offset + kOutputDimensions) {
+          gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
+        } else {
+          gradients_[input_offset + i] = gradients[output_offset + i - Offset];
+        }
+      }
+    }
+    shared_input_trainer_->Backpropagate(gradients_.data(), learning_rate);
+  }
+
+ private:
+  // コンストラクタ
+  Trainer(FeatureTransformer* feature_transformer) :
+      batch_size_(0),
+      shared_input_trainer_(SharedInputTrainer::Create(feature_transformer)) {
+  }
+
+  // 入出力の次元数
+  static constexpr IndexType kInputDimensions =
+      FeatureTransformer::kOutputDimensions;
+  static constexpr IndexType kOutputDimensions = OutputDimensions;
+  static_assert(Offset + kOutputDimensions <= kInputDimensions, "");
+
+  // ミニバッチのサンプル数
+  IndexType batch_size_;
+
+  // 共有入力層のTrainer
+  const std::shared_ptr<SharedInputTrainer> shared_input_trainer_;
+
+  // 順伝播用バッファ
+  std::vector<LearnFloatType> output_;
+
+  // 逆伝播用バッファ
+  std::vector<LearnFloatType> gradients_;
+};
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#endif
diff --git a/src/eval/nnue/trainer/trainer_sum.h b/src/eval/nnue/trainer/trainer_sum.h
new file mode 100644
index 00000000..4095482a
--- /dev/null
+++ b/src/eval/nnue/trainer/trainer_sum.h
@@ -0,0 +1,192 @@
+﻿// NNUE評価関数の学習クラステンプレートのSum用特殊化
+
+#ifndef _NNUE_TRAINER_SUM_H_
+#define _NNUE_TRAINER_SUM_H_
+
+#include "../../../config.h"
+
+#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#include "../../../learn/learn.h"
+#include "../layers/sum.h"
+#include "trainer.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+// 学習：複数の層の出力の和を取る層
+template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
+class Trainer<Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>> :
+      Trainer<Layers::Sum<RemainingPreviousLayers...>> {
+ private:
+  // 学習対象の層の型
+  using LayerType = Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>;
+  using Tail = Trainer<Layers::Sum<RemainingPreviousLayers...>>;
+
+ public:
+  // ファクトリ関数
+  static std::shared_ptr<Trainer> Create(
+      LayerType* target_layer, FeatureTransformer* feature_transformer) {
+    return std::shared_ptr<Trainer>(
+        new Trainer(target_layer, feature_transformer));
+  }
+
+  // ハイパーパラメータなどのオプションを設定する
+  void SendMessage(Message* message) {
+    // 他のメンバ関数の結果は処理の順番に依存しないため、
+    // 実装をシンプルにすることを目的としてTailを先に処理するが、
+    // SendMessageは添字の対応を分かりやすくするためにHeadを先に処理する
+    previous_layer_trainer_->SendMessage(message);
+    Tail::SendMessage(message);
+  }
+
+  // パラメータを乱数で初期化する
+  template <typename RNG>
+  void Initialize(RNG& rng) {
+    Tail::Initialize(rng);
+    previous_layer_trainer_->Initialize(rng);
+  }
+
+  // 順伝播
+  /*const*/ LearnFloatType* Propagate(const std::vector<Example>& batch) {
+    batch_size_ = static_cast<IndexType>(batch.size());
+    auto output = Tail::Propagate(batch);
+    const auto head_output = previous_layer_trainer_->Propagate(batch);
+#if defined(USE_BLAS)
+    cblas_saxpy(kOutputDimensions * batch_size_, 1.0,
+                head_output, 1, output, 1);
+#else
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        output[batch_offset + i] += head_output[batch_offset + i];
+      }
+    }
+#endif
+    return output;
+  }
+
+  // 逆伝播
+  void Backpropagate(const LearnFloatType* gradients,
+                     LearnFloatType learning_rate) {
+    Tail::Backpropagate(gradients, learning_rate);
+    previous_layer_trainer_->Backpropagate(gradients, learning_rate);
+  }
+
+ private:
+  // コンストラクタ
+  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer) :
+      Tail(target_layer, feature_transformer),
+      batch_size_(0),
+      previous_layer_trainer_(Trainer<FirstPreviousLayer>::Create(
+          &target_layer->previous_layer_, feature_transformer)),
+      target_layer_(target_layer) {
+  }
+
+  // 入出力の次元数
+  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+
+  // サブクラスをfriendにする
+  template <typename SumLayer>
+  friend class Trainer;
+
+  // ミニバッチのサンプル数
+  IndexType batch_size_;
+
+  // 直前の層のTrainer
+  const std::shared_ptr<Trainer<FirstPreviousLayer>> previous_layer_trainer_;
+
+  // 学習対象の層
+  LayerType* const target_layer_;
+};
+
+
+// 学習：複数の層の出力の和を取る層（テンプレート引数が1つの場合）
+template <typename PreviousLayer>
+class Trainer<Layers::Sum<PreviousLayer>> {
+ private:
+  // 学習対象の層の型
+  using LayerType = Layers::Sum<PreviousLayer>;
+
+ public:
+  // ファクトリ関数
+  static std::shared_ptr<Trainer> Create(
+      LayerType* target_layer, FeatureTransformer* feature_transformer) {
+    return std::shared_ptr<Trainer>(
+        new Trainer(target_layer, feature_transformer));
+  }
+
+  // ハイパーパラメータなどのオプションを設定する
+  void SendMessage(Message* message) {
+    previous_layer_trainer_->SendMessage(message);
+  }
+
+  // パラメータを乱数で初期化する
+  template <typename RNG>
+  void Initialize(RNG& rng) {
+    previous_layer_trainer_->Initialize(rng);
+  }
+
+  // 順伝播
+  /*const*/ LearnFloatType* Propagate(const std::vector<Example>& batch) {
+    if (output_.size() < kOutputDimensions * batch.size()) {
+      output_.resize(kOutputDimensions * batch.size());
+    }
+    batch_size_ = static_cast<IndexType>(batch.size());
+    const auto output = previous_layer_trainer_->Propagate(batch);
+#if defined(USE_BLAS)
+    cblas_scopy(kOutputDimensions * batch_size_, output, 1, &output_[0], 1);
+#else
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        output_[batch_offset + i] = output[batch_offset + i];
+      }
+    }
+#endif
+    return output_.data();
+  }
+
+  // 逆伝播
+  void Backpropagate(const LearnFloatType* gradients,
+                     LearnFloatType learning_rate) {
+    previous_layer_trainer_->Backpropagate(gradients, learning_rate);
+  }
+
+ private:
+  // コンストラクタ
+  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer) :
+      batch_size_(0),
+      previous_layer_trainer_(Trainer<PreviousLayer>::Create(
+          &target_layer->previous_layer_, feature_transformer)),
+      target_layer_(target_layer) {
+  }
+
+  // 入出力の次元数
+  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+
+  // サブクラスをfriendにする
+  template <typename SumLayer>
+  friend class Trainer;
+
+  // ミニバッチのサンプル数
+  IndexType batch_size_;
+
+  // 直前の層のTrainer
+  const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+
+  // 学習対象の層
+  LayerType* const target_layer_;
+
+  // 順伝播用バッファ
+  std::vector<LearnFloatType> output_;
+};
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#endif
diff --git a/src/evaluate.cpp b/src/evaluate.cpp
deleted file mode 100644
index d211db64..00000000
--- a/src/evaluate.cpp
+++ /dev/null
@@ -1,909 +0,0 @@
-/*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2019 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
-
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include <algorithm>
-#include <cassert>
-#include <cstring>   // For std::memset
-#include <iomanip>
-#include <sstream>
-
-#include "bitboard.h"
-#include "evaluate.h"
-#include "material.h"
-#include "pawns.h"
-#include "thread.h"
-
-namespace Trace {
-
-  enum Tracing { NO_TRACE, TRACE };
-
-  enum Term { // The first 8 entries are reserved for PieceType
-    MATERIAL = 8, IMBALANCE, MOBILITY, THREAT, PASSED, SPACE, INITIATIVE, TOTAL, TERM_NB
-  };
-
-  Score scores[TERM_NB][COLOR_NB];
-
-  double to_cp(Value v) { return double(v) / PawnValueEg; }
-
-  void add(int idx, Color c, Score s) {
-    scores[idx][c] = s;
-  }
-
-  void add(int idx, Score w, Score b = SCORE_ZERO) {
-    scores[idx][WHITE] = w;
-    scores[idx][BLACK] = b;
-  }
-
-  std::ostream& operator<<(std::ostream& os, Score s) {
-    os << std::setw(5) << to_cp(mg_value(s)) << " "
-       << std::setw(5) << to_cp(eg_value(s));
-    return os;
-  }
-
-  std::ostream& operator<<(std::ostream& os, Term t) {
-
-    if (t == MATERIAL || t == IMBALANCE || t == INITIATIVE || t == TOTAL)
-        os << " ----  ----"    << " | " << " ----  ----";
-    else
-        os << scores[t][WHITE] << " | " << scores[t][BLACK];
-
-    os << " | " << scores[t][WHITE] - scores[t][BLACK] << "\n";
-    return os;
-  }
-}
-
-using namespace Trace;
-
-namespace {
-
-  // Threshold for lazy and space evaluation
-  constexpr Value LazyThreshold  = Value(1400);
-  constexpr Value SpaceThreshold = Value(12222);
-
-  // KingAttackWeights[PieceType] contains king attack weights by piece type
-  constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 77, 55, 44, 10 };
-
-  // Penalties for enemy's safe checks
-  constexpr int QueenSafeCheck  = 780;
-  constexpr int RookSafeCheck   = 1080;
-  constexpr int BishopSafeCheck = 635;
-  constexpr int KnightSafeCheck = 790;
-
-#define S(mg, eg) make_score(mg, eg)
-
-  // MobilityBonus[PieceType-2][attacked] contains bonuses for middle and end game,
-  // indexed by piece type and number of attacked squares in the mobility area.
-  constexpr Score MobilityBonus[][32] = {
-    { S(-62,-81), S(-53,-56), S(-12,-30), S( -4,-14), S(  3,  8), S( 13, 15), // Knights
-      S( 22, 23), S( 28, 27), S( 33, 33) },
-    { S(-48,-59), S(-20,-23), S( 16, -3), S( 26, 13), S( 38, 24), S( 51, 42), // Bishops
-      S( 55, 54), S( 63, 57), S( 63, 65), S( 68, 73), S( 81, 78), S( 81, 86),
-      S( 91, 88), S( 98, 97) },
-    { S(-58,-76), S(-27,-18), S(-15, 28), S(-10, 55), S( -5, 69), S( -2, 82), // Rooks
-      S(  9,112), S( 16,118), S( 30,132), S( 29,142), S( 32,155), S( 38,165),
-      S( 46,166), S( 48,169), S( 58,171) },
-    { S(-39,-36), S(-21,-15), S(  3,  8), S(  3, 18), S( 14, 34), S( 22, 54), // Queens
-      S( 28, 61), S( 41, 73), S( 43, 79), S( 48, 92), S( 56, 94), S( 60,104),
-      S( 60,113), S( 66,120), S( 67,123), S( 70,126), S( 71,133), S( 73,136),
-      S( 79,140), S( 88,143), S( 88,148), S( 99,166), S(102,170), S(102,175),
-      S(106,184), S(109,191), S(113,206), S(116,212) }
-  };
-
-  // RookOnFile[semiopen/open] contains bonuses for each rook when there is
-  // no (friendly) pawn on the rook file.
-  constexpr Score RookOnFile[] = { S(18, 7), S(44, 20) };
-
-  // ThreatByMinor/ByRook[attacked PieceType] contains bonuses according to
-  // which piece type attacks which one. Attacks on lesser pieces which are
-  // pawn-defended are not considered.
-  constexpr Score ThreatByMinor[PIECE_TYPE_NB] = {
-    S(0, 0), S(0, 31), S(39, 42), S(57, 44), S(68, 112), S(62, 120)
-  };
-
-  constexpr Score ThreatByRook[PIECE_TYPE_NB] = {
-    S(0, 0), S(0, 24), S(38, 71), S(38, 61), S(0, 38), S(51, 38)
-  };
-
-  // PassedRank[Rank] contains a bonus according to the rank of a passed pawn
-  constexpr Score PassedRank[RANK_NB] = {
-    S(0, 0), S(5, 18), S(12, 23), S(10, 31), S(57, 62), S(163, 167), S(271, 250)
-  };
-
-  // PassedFile[File] contains a bonus according to the file of a passed pawn
-  constexpr Score PassedFile[FILE_NB] = {
-    S( -1,  7), S( 0,  9), S(-9, -8), S(-30,-14),
-    S(-30,-14), S(-9, -8), S( 0,  9), S( -1,  7)
-  };
-
-  // Assorted bonuses and penalties
-  constexpr Score BishopPawns        = S(  3,  7);
-  constexpr Score CorneredBishop     = S( 50, 50);
-  constexpr Score FlankAttacks       = S(  8,  0);
-  constexpr Score Hanging            = S( 69, 36);
-  constexpr Score KingProtector      = S(  7,  8);
-  constexpr Score KnightOnQueen      = S( 16, 12);
-  constexpr Score LongDiagonalBishop = S( 45,  0);
-  constexpr Score MinorBehindPawn    = S( 18,  3);
-  constexpr Score Outpost            = S(  9,  3);
-  constexpr Score PawnlessFlank      = S( 17, 95);
-  constexpr Score RestrictedPiece    = S(  7,  7);
-  constexpr Score RookOnPawn         = S( 10, 32);
-  constexpr Score SliderOnQueen      = S( 59, 18);
-  constexpr Score ThreatByKing       = S( 24, 89);
-  constexpr Score ThreatByPawnPush   = S( 48, 39);
-  constexpr Score ThreatByRank       = S( 13,  0);
-  constexpr Score ThreatBySafePawn   = S(173, 94);
-  constexpr Score TrappedRook        = S( 47,  4);
-  constexpr Score WeakQueen          = S( 49, 15);
-  constexpr Score WeakUnopposedPawn  = S( 12, 23);
-
-#undef S
-
-  // Evaluation class computes and stores attacks tables and other working data
-  template<Tracing T>
-  class Evaluation {
-
-  public:
-    Evaluation() = delete;
-    explicit Evaluation(const Position& p) : pos(p) {}
-    Evaluation& operator=(const Evaluation&) = delete;
-    Value value();
-
-  private:
-    template<Color Us> void initialize();
-    template<Color Us, PieceType Pt> Score pieces();
-    template<Color Us> Score king() const;
-    template<Color Us> Score threats() const;
-    template<Color Us> Score passed() const;
-    template<Color Us> Score space() const;
-    ScaleFactor scale_factor(Value eg) const;
-    Score initiative(Value eg) const;
-
-    const Position& pos;
-    Material::Entry* me;
-    Pawns::Entry* pe;
-    Bitboard mobilityArea[COLOR_NB];
-    Score mobility[COLOR_NB] = { SCORE_ZERO, SCORE_ZERO };
-
-    // attackedBy[color][piece type] is a bitboard representing all squares
-    // attacked by a given color and piece type. Special "piece types" which
-    // is also calculated is ALL_PIECES.
-    Bitboard attackedBy[COLOR_NB][PIECE_TYPE_NB];
-
-    // attackedBy2[color] are the squares attacked by at least 2 units of a given
-    // color, including x-rays. But diagonal x-rays through pawns are not computed.
-    Bitboard attackedBy2[COLOR_NB];
-
-    // kingRing[color] are the squares adjacent to the king, plus (only for a
-    // king on its first rank) the squares two ranks in front. For instance,
-    // if black's king is on g8, kingRing[BLACK] is f8, h8, f7, g7, h7, f6, g6
-    // and h6.
-    Bitboard kingRing[COLOR_NB];
-
-    // kingAttackersCount[color] is the number of pieces of the given color
-    // which attack a square in the kingRing of the enemy king.
-    int kingAttackersCount[COLOR_NB];
-
-    // kingAttackersWeight[color] is the sum of the "weights" of the pieces of
-    // the given color which attack a square in the kingRing of the enemy king.
-    // The weights of the individual piece types are given by the elements in
-    // the KingAttackWeights array.
-    int kingAttackersWeight[COLOR_NB];
-
-    // kingAttacksCount[color] is the number of attacks by the given color to
-    // squares directly adjacent to the enemy king. Pieces which attack more
-    // than one square are counted multiple times. For instance, if there is
-    // a white knight on g5 and black's king is on g8, this white knight adds 2
-    // to kingAttacksCount[WHITE].
-    int kingAttacksCount[COLOR_NB];
-  };
-
-
-  // Evaluation::initialize() computes king and pawn attacks, and the king ring
-  // bitboard for a given color. This is done at the beginning of the evaluation.
-  template<Tracing T> template<Color Us>
-  void Evaluation<T>::initialize() {
-
-    constexpr Color     Them = (Us == WHITE ? BLACK : WHITE);
-    constexpr Direction Up   = (Us == WHITE ? NORTH : SOUTH);
-    constexpr Direction Down = (Us == WHITE ? SOUTH : NORTH);
-    constexpr Bitboard LowRanks = (Us == WHITE ? Rank2BB | Rank3BB: Rank7BB | Rank6BB);
-
-    const Square ksq = pos.square<KING>(Us);
-
-    Bitboard dblAttackByPawn = pawn_double_attacks_bb<Us>(pos.pieces(Us, PAWN));
-
-    // Find our pawns that are blocked or on the first two ranks
-    Bitboard b = pos.pieces(Us, PAWN) & (shift<Down>(pos.pieces()) | LowRanks);
-
-    // Squares occupied by those pawns, by our king or queen or controlled by
-    // enemy pawns are excluded from the mobility area.
-    mobilityArea[Us] = ~(b | pos.pieces(Us, KING, QUEEN) | pe->pawn_attacks(Them));
-
-    // Initialize attackedBy[] for king and pawns
-    attackedBy[Us][KING] = pos.attacks_from<KING>(ksq);
-    attackedBy[Us][PAWN] = pe->pawn_attacks(Us);
-    attackedBy[Us][ALL_PIECES] = attackedBy[Us][KING] | attackedBy[Us][PAWN];
-    attackedBy2[Us] = dblAttackByPawn | (attackedBy[Us][KING] & attackedBy[Us][PAWN]);
-
-    // Init our king safety tables
-    kingRing[Us] = attackedBy[Us][KING];
-    if (relative_rank(Us, ksq) == RANK_1)
-        kingRing[Us] |= shift<Up>(kingRing[Us]);
-
-    if (file_of(ksq) == FILE_H)
-        kingRing[Us] |= shift<WEST>(kingRing[Us]);
-
-    else if (file_of(ksq) == FILE_A)
-        kingRing[Us] |= shift<EAST>(kingRing[Us]);
-
-    kingAttackersCount[Them] = popcount(kingRing[Us] & pe->pawn_attacks(Them));
-    kingAttacksCount[Them] = kingAttackersWeight[Them] = 0;
-
-    // Remove from kingRing[] the squares defended by two pawns
-    kingRing[Us] &= ~dblAttackByPawn;
-  }
-
-
-  // Evaluation::pieces() scores pieces of a given color and type
-  template<Tracing T> template<Color Us, PieceType Pt>
-  Score Evaluation<T>::pieces() {
-
-    constexpr Color     Them = (Us == WHITE ? BLACK : WHITE);
-    constexpr Direction Down = (Us == WHITE ? SOUTH : NORTH);
-    constexpr Bitboard OutpostRanks = (Us == WHITE ? Rank4BB | Rank5BB | Rank6BB
-                                                   : Rank5BB | Rank4BB | Rank3BB);
-    const Square* pl = pos.squares<Pt>(Us);
-
-    Bitboard b, bb;
-    Score score = SCORE_ZERO;
-
-    attackedBy[Us][Pt] = 0;
-
-    for (Square s = *pl; s != SQ_NONE; s = *++pl)
-    {
-        // Find attacked squares, including x-ray attacks for bishops and rooks
-        b = Pt == BISHOP ? attacks_bb<BISHOP>(s, pos.pieces() ^ pos.pieces(QUEEN))
-          : Pt ==   ROOK ? attacks_bb<  ROOK>(s, pos.pieces() ^ pos.pieces(QUEEN) ^ pos.pieces(Us, ROOK))
-                         : pos.attacks_from<Pt>(s);
-
-        if (pos.blockers_for_king(Us) & s)
-            b &= LineBB[pos.square<KING>(Us)][s];
-
-        attackedBy2[Us] |= attackedBy[Us][ALL_PIECES] & b;
-        attackedBy[Us][Pt] |= b;
-        attackedBy[Us][ALL_PIECES] |= b;
-
-        if (b & kingRing[Them])
-        {
-            kingAttackersCount[Us]++;
-            kingAttackersWeight[Us] += KingAttackWeights[Pt];
-            kingAttacksCount[Us] += popcount(b & attackedBy[Them][KING]);
-        }
-
-        int mob = popcount(b & mobilityArea[Us]);
-
-        mobility[Us] += MobilityBonus[Pt - 2][mob];
-
-        if (Pt == BISHOP || Pt == KNIGHT)
-        {
-            // Bonus if piece is on an outpost square or can reach one
-            bb = OutpostRanks & ~pe->pawn_attacks_span(Them);
-            if (bb & s)
-                score += Outpost * (Pt == KNIGHT ? 4 : 2)
-                                 * ((attackedBy[Us][PAWN] & s) ? 2 : 1);
-
-            else if (bb &= b & ~pos.pieces(Us))
-                score += Outpost * (Pt == KNIGHT ? 2 : 1)
-                                 * ((attackedBy[Us][PAWN] & bb) ? 2 : 1);
-
-            // Knight and Bishop bonus for being right behind a pawn
-            if (shift<Down>(pos.pieces(PAWN)) & s)
-                score += MinorBehindPawn;
-
-            // Penalty if the piece is far from the king
-            score -= KingProtector * distance(s, pos.square<KING>(Us));
-
-            if (Pt == BISHOP)
-            {
-                // Penalty according to number of pawns on the same color square as the
-                // bishop, bigger when the center files are blocked with pawns.
-                Bitboard blocked = pos.pieces(Us, PAWN) & shift<Down>(pos.pieces());
-
-                score -= BishopPawns * pos.pawns_on_same_color_squares(Us, s)
-                                     * (1 + popcount(blocked & CenterFiles));
-
-                // Bonus for bishop on a long diagonal which can "see" both center squares
-                if (more_than_one(attacks_bb<BISHOP>(s, pos.pieces(PAWN)) & Center))
-                    score += LongDiagonalBishop;
-            }
-
-            // An important Chess960 pattern: A cornered bishop blocked by a friendly
-            // pawn diagonally in front of it is a very serious problem, especially
-            // when that pawn is also blocked.
-            if (   Pt == BISHOP
-                && pos.is_chess960()
-                && (s == relative_square(Us, SQ_A1) || s == relative_square(Us, SQ_H1)))
-            {
-                Direction d = pawn_push(Us) + (file_of(s) == FILE_A ? EAST : WEST);
-                if (pos.piece_on(s + d) == make_piece(Us, PAWN))
-                    score -= !pos.empty(s + d + pawn_push(Us))                ? CorneredBishop * 4
-                            : pos.piece_on(s + d + d) == make_piece(Us, PAWN) ? CorneredBishop * 2
-                                                                              : CorneredBishop;
-            }
-        }
-
-        if (Pt == ROOK)
-        {
-            // Bonus for aligning rook with enemy pawns on the same rank/file
-            if (relative_rank(Us, s) >= RANK_5)
-                score += RookOnPawn * popcount(pos.pieces(Them, PAWN) & PseudoAttacks[ROOK][s]);
-
-            // Bonus for rook on an open or semi-open file
-            if (pos.is_on_semiopen_file(Us, s))
-                score += RookOnFile[bool(pos.is_on_semiopen_file(Them, s))];
-
-            // Penalty when trapped by the king, even more if the king cannot castle
-            else if (mob <= 3)
-            {
-                File kf = file_of(pos.square<KING>(Us));
-                if ((kf < FILE_E) == (file_of(s) < kf))
-                    score -= TrappedRook * (1 + !pos.castling_rights(Us));
-            }
-        }
-
-        if (Pt == QUEEN)
-        {
-            // Penalty if any relative pin or discovered attack against the queen
-            Bitboard queenPinners;
-            if (pos.slider_blockers(pos.pieces(Them, ROOK, BISHOP), s, queenPinners))
-                score -= WeakQueen;
-        }
-    }
-    if (T)
-        Trace::add(Pt, Us, score);
-
-    return score;
-  }
-
-
-  // Evaluation::king() assigns bonuses and penalties to a king of a given color
-  template<Tracing T> template<Color Us>
-  Score Evaluation<T>::king() const {
-
-    constexpr Color    Them = (Us == WHITE ? BLACK : WHITE);
-    constexpr Bitboard Camp = (Us == WHITE ? AllSquares ^ Rank6BB ^ Rank7BB ^ Rank8BB
-                                           : AllSquares ^ Rank1BB ^ Rank2BB ^ Rank3BB);
-
-    Bitboard weak, b1, b2, safe, unsafeChecks = 0;
-    Bitboard rookChecks, queenChecks, bishopChecks, knightChecks;
-    int kingDanger = 0;
-    const Square ksq = pos.square<KING>(Us);
-
-    // Init the score with king shelter and enemy pawns storm
-    Score score = pe->king_safety<Us>(pos);
-
-    // Attacked squares defended at most once by our queen or king
-    weak =  attackedBy[Them][ALL_PIECES]
-          & ~attackedBy2[Us]
-          & (~attackedBy[Us][ALL_PIECES] | attackedBy[Us][KING] | attackedBy[Us][QUEEN]);
-
-    // Analyse the safe enemy's checks which are possible on next move
-    safe  = ~pos.pieces(Them);
-    safe &= ~attackedBy[Us][ALL_PIECES] | (weak & attackedBy2[Them]);
-
-    b1 = attacks_bb<ROOK  >(ksq, pos.pieces() ^ pos.pieces(Us, QUEEN));
-    b2 = attacks_bb<BISHOP>(ksq, pos.pieces() ^ pos.pieces(Us, QUEEN));
-
-    // Enemy rooks checks
-    rookChecks = b1 & safe & attackedBy[Them][ROOK];
-
-    if (rookChecks)
-        kingDanger += RookSafeCheck;
-    else
-        unsafeChecks |= b1 & attackedBy[Them][ROOK];
-
-    // Enemy queen safe checks: we count them only if they are from squares from
-    // which we can't give a rook check, because rook checks are more valuable.
-    queenChecks =  (b1 | b2)
-                 & attackedBy[Them][QUEEN]
-                 & safe
-                 & ~attackedBy[Us][QUEEN]
-                 & ~rookChecks;
-
-    if (queenChecks)
-        kingDanger += QueenSafeCheck;
-
-    // Enemy bishops checks: we count them only if they are from squares from
-    // which we can't give a queen check, because queen checks are more valuable.
-    bishopChecks =  b2
-                  & attackedBy[Them][BISHOP]
-                  & safe
-                  & ~queenChecks;
-
-    if (bishopChecks)
-        kingDanger += BishopSafeCheck;
-    else
-        unsafeChecks |= b2 & attackedBy[Them][BISHOP];
-
-    // Enemy knights checks
-    knightChecks = pos.attacks_from<KNIGHT>(ksq) & attackedBy[Them][KNIGHT];
-
-    if (knightChecks & safe)
-        kingDanger += KnightSafeCheck;
-    else
-        unsafeChecks |= knightChecks;
-
-    // Unsafe or occupied checking squares will also be considered, as long as
-    // the square is in the attacker's mobility area.
-    unsafeChecks &= mobilityArea[Them];
-
-    // Find the squares that opponent attacks in our king flank, and the squares
-    // which are attacked twice in that flank.
-    b1 = attackedBy[Them][ALL_PIECES] & KingFlank[file_of(ksq)] & Camp;
-    b2 = b1 & attackedBy2[Them];
-
-    int kingFlankAttacks = popcount(b1) + popcount(b2);
-
-    kingDanger +=        kingAttackersCount[Them] * kingAttackersWeight[Them]
-                 +  69 * kingAttacksCount[Them]
-                 + 185 * popcount(kingRing[Us] & weak)
-                 - 100 * bool(attackedBy[Us][KNIGHT] & attackedBy[Us][KING])
-                 -  35 * bool(attackedBy[Us][BISHOP] & attackedBy[Us][KING])
-                 + 150 * popcount(pos.blockers_for_king(Us) | unsafeChecks)
-                 - 873 * !pos.count<QUEEN>(Them)
-                 -   6 * mg_value(score) / 8
-                 +       mg_value(mobility[Them] - mobility[Us])
-                 +   5 * kingFlankAttacks * kingFlankAttacks / 16
-                 -   7;
-
-    // Transform the kingDanger units into a Score, and subtract it from the evaluation
-    if (kingDanger > 100)
-        score -= make_score(kingDanger * kingDanger / 4096, kingDanger / 16);
-
-    // Penalty when our king is on a pawnless flank
-    if (!(pos.pieces(PAWN) & KingFlank[file_of(ksq)]))
-        score -= PawnlessFlank;
-
-    // Penalty if king flank is under attack, potentially moving toward the king
-    score -= FlankAttacks * kingFlankAttacks;
-
-    if (T)
-        Trace::add(KING, Us, score);
-
-    return score;
-  }
-
-
-  // Evaluation::threats() assigns bonuses according to the types of the
-  // attacking and the attacked pieces.
-  template<Tracing T> template<Color Us>
-  Score Evaluation<T>::threats() const {
-
-    constexpr Color     Them     = (Us == WHITE ? BLACK   : WHITE);
-    constexpr Direction Up       = (Us == WHITE ? NORTH   : SOUTH);
-    constexpr Bitboard  TRank3BB = (Us == WHITE ? Rank3BB : Rank6BB);
-
-    Bitboard b, weak, defended, nonPawnEnemies, stronglyProtected, safe;
-    Score score = SCORE_ZERO;
-
-    // Non-pawn enemies
-    nonPawnEnemies = pos.pieces(Them) & ~pos.pieces(PAWN);
-
-    // Squares strongly protected by the enemy, either because they defend the
-    // square with a pawn, or because they defend the square twice and we don't.
-    stronglyProtected =  attackedBy[Them][PAWN]
-                       | (attackedBy2[Them] & ~attackedBy2[Us]);
-
-    // Non-pawn enemies, strongly protected
-    defended = nonPawnEnemies & stronglyProtected;
-
-    // Enemies not strongly protected and under our attack
-    weak = pos.pieces(Them) & ~stronglyProtected & attackedBy[Us][ALL_PIECES];
-
-    // Safe or protected squares
-    safe = ~attackedBy[Them][ALL_PIECES] | attackedBy[Us][ALL_PIECES];
-
-    // Bonus according to the kind of attacking pieces
-    if (defended | weak)
-    {
-        b = (defended | weak) & (attackedBy[Us][KNIGHT] | attackedBy[Us][BISHOP]);
-        while (b)
-        {
-            Square s = pop_lsb(&b);
-            score += ThreatByMinor[type_of(pos.piece_on(s))];
-            if (type_of(pos.piece_on(s)) != PAWN)
-                score += ThreatByRank * (int)relative_rank(Them, s);
-        }
-
-        b = weak & attackedBy[Us][ROOK];
-        while (b)
-        {
-            Square s = pop_lsb(&b);
-            score += ThreatByRook[type_of(pos.piece_on(s))];
-            if (type_of(pos.piece_on(s)) != PAWN)
-                score += ThreatByRank * (int)relative_rank(Them, s);
-        }
-
-        if (weak & attackedBy[Us][KING])
-            score += ThreatByKing;
-
-        b =  ~attackedBy[Them][ALL_PIECES]
-           | (nonPawnEnemies & attackedBy2[Us]);
-        score += Hanging * popcount(weak & b);
-    }
-
-    // Bonus for restricting their piece moves
-    b =   attackedBy[Them][ALL_PIECES]
-       & ~stronglyProtected
-       &  attackedBy[Us][ALL_PIECES];
-
-    score += RestrictedPiece * popcount(b);
-
-    // Bonus for enemy unopposed weak pawns
-    if (pos.pieces(Us, ROOK, QUEEN))
-        score += WeakUnopposedPawn * pe->weak_unopposed(Them);
-
-    // Find squares where our pawns can push on the next move
-    b  = shift<Up>(pos.pieces(Us, PAWN)) & ~pos.pieces();
-    b |= shift<Up>(b & TRank3BB) & ~pos.pieces();
-
-    // Keep only the squares which are relatively safe
-    b &= ~attackedBy[Them][PAWN] & safe;
-
-    // Bonus for safe pawn threats on the next move
-    b = pawn_attacks_bb<Us>(b) & pos.pieces(Them);
-    score += ThreatByPawnPush * popcount(b);
-
-    // Our safe or protected pawns
-    b = pos.pieces(Us, PAWN) & safe;
-
-    b = pawn_attacks_bb<Us>(b) & nonPawnEnemies;
-    score += ThreatBySafePawn * popcount(b);
-
-    // Bonus for threats on the next moves against enemy queen
-    if (pos.count<QUEEN>(Them) == 1)
-    {
-        Square s = pos.square<QUEEN>(Them);
-        safe = mobilityArea[Us] & ~stronglyProtected;
-
-        b = attackedBy[Us][KNIGHT] & pos.attacks_from<KNIGHT>(s);
-
-        score += KnightOnQueen * popcount(b & safe);
-
-        b =  (attackedBy[Us][BISHOP] & pos.attacks_from<BISHOP>(s))
-           | (attackedBy[Us][ROOK  ] & pos.attacks_from<ROOK  >(s));
-
-        score += SliderOnQueen * popcount(b & safe & attackedBy2[Us]);
-    }
-
-    if (T)
-        Trace::add(THREAT, Us, score);
-
-    return score;
-  }
-
-  // Evaluation::passed() evaluates the passed pawns and candidate passed
-  // pawns of the given color.
-
-  template<Tracing T> template<Color Us>
-  Score Evaluation<T>::passed() const {
-
-    constexpr Color     Them = (Us == WHITE ? BLACK : WHITE);
-    constexpr Direction Up   = (Us == WHITE ? NORTH : SOUTH);
-
-    auto king_proximity = [&](Color c, Square s) {
-      return std::min(distance(pos.square<KING>(c), s), 5);
-    };
-
-    Bitboard b, bb, squaresToQueen, defendedSquares, unsafeSquares;
-    Score score = SCORE_ZERO;
-
-    b = pe->passed_pawns(Us);
-
-    while (b)
-    {
-        Square s = pop_lsb(&b);
-
-        assert(!(pos.pieces(Them, PAWN) & forward_file_bb(Us, s + Up)));
-
-        int r = relative_rank(Us, s);
-
-        Score bonus = PassedRank[r];
-
-        if (r > RANK_3)
-        {
-            int w = (r-2) * (r-2) + 2;
-            Square blockSq = s + Up;
-
-            // Adjust bonus based on the king's proximity
-            bonus += make_score(0, (  king_proximity(Them, blockSq) * 5
-                                    - king_proximity(Us,   blockSq) * 2) * w);
-
-            // If blockSq is not the queening square then consider also a second push
-            if (r != RANK_7)
-                bonus -= make_score(0, king_proximity(Us, blockSq + Up) * w);
-
-            // If the pawn is free to advance, then increase the bonus
-            if (pos.empty(blockSq))
-            {
-                // If there is a rook or queen attacking/defending the pawn from behind,
-                // consider all the squaresToQueen. Otherwise consider only the squares
-                // in the pawn's path attacked or occupied by the enemy.
-                defendedSquares = unsafeSquares = squaresToQueen = forward_file_bb(Us, s);
-
-                bb = forward_file_bb(Them, s) & pos.pieces(ROOK, QUEEN);
-
-                if (!(pos.pieces(Us) & bb))
-                    defendedSquares &= attackedBy[Us][ALL_PIECES];
-
-                if (!(pos.pieces(Them) & bb))
-                    unsafeSquares &= attackedBy[Them][ALL_PIECES] | pos.pieces(Them);
-
-                // If there aren't any enemy attacks, assign a big bonus. Otherwise
-                // assign a smaller bonus if the block square isn't attacked.
-                int k = !unsafeSquares ? 20 : !(unsafeSquares & blockSq) ? 9 : 0;
-
-                // If the path to the queen is fully defended, assign a big bonus.
-                // Otherwise assign a smaller bonus if the block square is defended.
-                if (defendedSquares == squaresToQueen)
-                    k += 6;
-
-                else if (defendedSquares & blockSq)
-                    k += 4;
-
-                bonus += make_score(k * w, k * w);
-            }
-        } // r > RANK_3
-
-        // Scale down bonus for candidate passers which need more than one
-        // pawn push to become passed, or have a pawn in front of them.
-        if (   !pos.pawn_passed(Us, s + Up)
-            || (pos.pieces(PAWN) & forward_file_bb(Us, s)))
-            bonus = bonus / 2;
-
-        score += bonus + PassedFile[file_of(s)];
-    }
-
-    if (T)
-        Trace::add(PASSED, Us, score);
-
-    return score;
-  }
-
-
-  // Evaluation::space() computes the space evaluation for a given side. The
-  // space evaluation is a simple bonus based on the number of safe squares
-  // available for minor pieces on the central four files on ranks 2--4. Safe
-  // squares one, two or three squares behind a friendly pawn are counted
-  // twice. Finally, the space bonus is multiplied by a weight. The aim is to
-  // improve play on game opening.
-
-  template<Tracing T> template<Color Us>
-  Score Evaluation<T>::space() const {
-
-    if (pos.non_pawn_material() < SpaceThreshold)
-        return SCORE_ZERO;
-
-    constexpr Color Them     = (Us == WHITE ? BLACK : WHITE);
-    constexpr Direction Down = (Us == WHITE ? SOUTH : NORTH);
-    constexpr Bitboard SpaceMask =
-      Us == WHITE ? CenterFiles & (Rank2BB | Rank3BB | Rank4BB)
-                  : CenterFiles & (Rank7BB | Rank6BB | Rank5BB);
-
-    // Find the available squares for our pieces inside the area defined by SpaceMask
-    Bitboard safe =   SpaceMask
-                   & ~pos.pieces(Us, PAWN)
-                   & ~attackedBy[Them][PAWN];
-
-    // Find all squares which are at most three squares behind some friendly pawn
-    Bitboard behind = pos.pieces(Us, PAWN);
-    behind |= shift<Down>(behind);
-    behind |= shift<Down+Down>(behind);
-
-    int bonus = popcount(safe) + popcount(behind & safe);
-    int weight = pos.count<ALL_PIECES>(Us) - 1;
-    Score score = make_score(bonus * weight * weight / 16, 0);
-
-    if (T)
-        Trace::add(SPACE, Us, score);
-
-    return score;
-  }
-
-
-  // Evaluation::initiative() computes the initiative correction value
-  // for the position. It is a second order bonus/malus based on the
-  // known attacking/defending status of the players.
-
-  template<Tracing T>
-  Score Evaluation<T>::initiative(Value eg) const {
-
-    int outflanking =  distance<File>(pos.square<KING>(WHITE), pos.square<KING>(BLACK))
-                     - distance<Rank>(pos.square<KING>(WHITE), pos.square<KING>(BLACK));
-
-    bool pawnsOnBothFlanks =   (pos.pieces(PAWN) & QueenSide)
-                            && (pos.pieces(PAWN) & KingSide);
-
-    // Compute the initiative bonus for the attacking side
-    int complexity =   9 * pe->passed_count()
-                    + 11 * pos.count<PAWN>()
-                    +  9 * outflanking
-                    + 18 * pawnsOnBothFlanks
-                    + 49 * !pos.non_pawn_material()
-                    -103 ;
-
-    // Now apply the bonus: note that we find the attacking side by extracting
-    // the sign of the endgame value, and that we carefully cap the bonus so
-    // that the endgame score will never change sign after the bonus.
-    int v = ((eg > 0) - (eg < 0)) * std::max(complexity, -abs(eg));
-
-    if (T)
-        Trace::add(INITIATIVE, make_score(0, v));
-
-    return make_score(0, v);
-  }
-
-
-  // Evaluation::scale_factor() computes the scale factor for the winning side
-
-  template<Tracing T>
-  ScaleFactor Evaluation<T>::scale_factor(Value eg) const {
-
-    Color strongSide = eg > VALUE_DRAW ? WHITE : BLACK;
-    int sf = me->scale_factor(pos, strongSide);
-
-    // If scale is not already specific, scale down the endgame via general heuristics
-    if (sf == SCALE_FACTOR_NORMAL)
-    {
-        if (   pos.opposite_bishops()
-            && pos.non_pawn_material() == 2 * BishopValueMg)
-            sf = 16 + 4 * pe->passed_count();
-        else
-            sf = std::min(40 + (pos.opposite_bishops() ? 2 : 7) * pos.count<PAWN>(strongSide), sf);
-
-    }
-
-    return ScaleFactor(sf);
-  }
-
-
-  // Evaluation::value() is the main function of the class. It computes the various
-  // parts of the evaluation and returns the value of the position from the point
-  // of view of the side to move.
-
-  template<Tracing T>
-  Value Evaluation<T>::value() {
-
-    assert(!pos.checkers());
-
-    // Probe the material hash table
-    me = Material::probe(pos);
-
-    // If we have a specialized evaluation function for the current material
-    // configuration, call it and return.
-    if (me->specialized_eval_exists())
-        return me->evaluate(pos);
-
-    // Initialize score by reading the incrementally updated scores included in
-    // the position object (material + piece square tables) and the material
-    // imbalance. Score is computed internally from the white point of view.
-    Score score = pos.psq_score() + me->imbalance() + pos.this_thread()->contempt;
-
-    // Probe the pawn hash table
-    pe = Pawns::probe(pos);
-    score += pe->pawn_score(WHITE) - pe->pawn_score(BLACK);
-
-    // Early exit if score is high
-    Value v = (mg_value(score) + eg_value(score)) / 2;
-    if (abs(v) > (LazyThreshold + pos.non_pawn_material() / 64))
-       return pos.side_to_move() == WHITE ? v : -v;
-
-    // Main evaluation begins here
-
-    initialize<WHITE>();
-    initialize<BLACK>();
-
-    // Pieces should be evaluated first (populate attack tables)
-    score +=  pieces<WHITE, KNIGHT>() - pieces<BLACK, KNIGHT>()
-            + pieces<WHITE, BISHOP>() - pieces<BLACK, BISHOP>()
-            + pieces<WHITE, ROOK  >() - pieces<BLACK, ROOK  >()
-            + pieces<WHITE, QUEEN >() - pieces<BLACK, QUEEN >();
-
-    score += mobility[WHITE] - mobility[BLACK];
-
-    score +=  king<   WHITE>() - king<   BLACK>()
-            + threats<WHITE>() - threats<BLACK>()
-            + passed< WHITE>() - passed< BLACK>()
-            + space<  WHITE>() - space<  BLACK>();
-
-    score += initiative(eg_value(score));
-
-    // Interpolate between a middlegame and a (scaled by 'sf') endgame score
-    ScaleFactor sf = scale_factor(eg_value(score));
-    v =  mg_value(score) * int(me->game_phase())
-       + eg_value(score) * int(PHASE_MIDGAME - me->game_phase()) * sf / SCALE_FACTOR_NORMAL;
-
-    v /= PHASE_MIDGAME;
-
-    // In case of tracing add all remaining individual evaluation terms
-    if (T)
-    {
-        Trace::add(MATERIAL, pos.psq_score());
-        Trace::add(IMBALANCE, me->imbalance());
-        Trace::add(PAWN, pe->pawn_score(WHITE), pe->pawn_score(BLACK));
-        Trace::add(MOBILITY, mobility[WHITE], mobility[BLACK]);
-        Trace::add(TOTAL, score);
-    }
-
-    return  (pos.side_to_move() == WHITE ? v : -v) // Side to move point of view
-           + Eval::Tempo;
-  }
-
-} // namespace
-
-
-/// evaluate() is the evaluator for the outer world. It returns a static
-/// evaluation of the position from the point of view of the side to move.
-
-Value Eval::evaluate(const Position& pos) {
-  return Evaluation<NO_TRACE>(pos).value();
-}
-
-
-/// trace() is like evaluate(), but instead of returning a value, it returns
-/// a string (suitable for outputting to stdout) that contains the detailed
-/// descriptions and values of each evaluation term. Useful for debugging.
-
-std::string Eval::trace(const Position& pos) {
-
-  std::memset(scores, 0, sizeof(scores));
-
-  pos.this_thread()->contempt = SCORE_ZERO; // Reset any dynamic contempt
-
-  Value v = Evaluation<TRACE>(pos).value();
-
-  v = pos.side_to_move() == WHITE ? v : -v; // Trace scores are from white's point of view
-
-  std::stringstream ss;
-  ss << std::showpoint << std::noshowpos << std::fixed << std::setprecision(2)
-     << "     Term    |    White    |    Black    |    Total   \n"
-     << "             |   MG    EG  |   MG    EG  |   MG    EG \n"
-     << " ------------+-------------+-------------+------------\n"
-     << "    Material | " << Term(MATERIAL)
-     << "   Imbalance | " << Term(IMBALANCE)
-     << "       Pawns | " << Term(PAWN)
-     << "     Knights | " << Term(KNIGHT)
-     << "     Bishops | " << Term(BISHOP)
-     << "       Rooks | " << Term(ROOK)
-     << "      Queens | " << Term(QUEEN)
-     << "    Mobility | " << Term(MOBILITY)
-     << " King safety | " << Term(KING)
-     << "     Threats | " << Term(THREAT)
-     << "      Passed | " << Term(PASSED)
-     << "       Space | " << Term(SPACE)
-     << "  Initiative | " << Term(INITIATIVE)
-     << " ------------+-------------+-------------+------------\n"
-     << "       Total | " << Term(TOTAL);
-
-  ss << "\nTotal evaluation: " << to_cp(v) << " (white side)\n";
-
-  return ss.str();
-}

From 9964fbbe2526c92804c3a4ec7104cae881aaa4e5 Mon Sep 17 00:00:00 2001
From: Hisayori Noda <nodchip@gmail.com>
Date: Sat, 15 Jun 2019 11:46:54 +0900
Subject: [PATCH 002/583] Reverted evaluate.cpp.

---
 src/evaluate.cpp | 909 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 909 insertions(+)
 create mode 100644 src/evaluate.cpp

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
new file mode 100644
index 00000000..d211db64
--- /dev/null
+++ b/src/evaluate.cpp
@@ -0,0 +1,909 @@
+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
+  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
+  Copyright (C) 2015-2019 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <algorithm>
+#include <cassert>
+#include <cstring>   // For std::memset
+#include <iomanip>
+#include <sstream>
+
+#include "bitboard.h"
+#include "evaluate.h"
+#include "material.h"
+#include "pawns.h"
+#include "thread.h"
+
+namespace Trace {
+
+  enum Tracing { NO_TRACE, TRACE };
+
+  enum Term { // The first 8 entries are reserved for PieceType
+    MATERIAL = 8, IMBALANCE, MOBILITY, THREAT, PASSED, SPACE, INITIATIVE, TOTAL, TERM_NB
+  };
+
+  Score scores[TERM_NB][COLOR_NB];
+
+  double to_cp(Value v) { return double(v) / PawnValueEg; }
+
+  void add(int idx, Color c, Score s) {
+    scores[idx][c] = s;
+  }
+
+  void add(int idx, Score w, Score b = SCORE_ZERO) {
+    scores[idx][WHITE] = w;
+    scores[idx][BLACK] = b;
+  }
+
+  std::ostream& operator<<(std::ostream& os, Score s) {
+    os << std::setw(5) << to_cp(mg_value(s)) << " "
+       << std::setw(5) << to_cp(eg_value(s));
+    return os;
+  }
+
+  std::ostream& operator<<(std::ostream& os, Term t) {
+
+    if (t == MATERIAL || t == IMBALANCE || t == INITIATIVE || t == TOTAL)
+        os << " ----  ----"    << " | " << " ----  ----";
+    else
+        os << scores[t][WHITE] << " | " << scores[t][BLACK];
+
+    os << " | " << scores[t][WHITE] - scores[t][BLACK] << "\n";
+    return os;
+  }
+}
+
+using namespace Trace;
+
+namespace {
+
+  // Threshold for lazy and space evaluation
+  constexpr Value LazyThreshold  = Value(1400);
+  constexpr Value SpaceThreshold = Value(12222);
+
+  // KingAttackWeights[PieceType] contains king attack weights by piece type
+  constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 77, 55, 44, 10 };
+
+  // Penalties for enemy's safe checks
+  constexpr int QueenSafeCheck  = 780;
+  constexpr int RookSafeCheck   = 1080;
+  constexpr int BishopSafeCheck = 635;
+  constexpr int KnightSafeCheck = 790;
+
+#define S(mg, eg) make_score(mg, eg)
+
+  // MobilityBonus[PieceType-2][attacked] contains bonuses for middle and end game,
+  // indexed by piece type and number of attacked squares in the mobility area.
+  constexpr Score MobilityBonus[][32] = {
+    { S(-62,-81), S(-53,-56), S(-12,-30), S( -4,-14), S(  3,  8), S( 13, 15), // Knights
+      S( 22, 23), S( 28, 27), S( 33, 33) },
+    { S(-48,-59), S(-20,-23), S( 16, -3), S( 26, 13), S( 38, 24), S( 51, 42), // Bishops
+      S( 55, 54), S( 63, 57), S( 63, 65), S( 68, 73), S( 81, 78), S( 81, 86),
+      S( 91, 88), S( 98, 97) },
+    { S(-58,-76), S(-27,-18), S(-15, 28), S(-10, 55), S( -5, 69), S( -2, 82), // Rooks
+      S(  9,112), S( 16,118), S( 30,132), S( 29,142), S( 32,155), S( 38,165),
+      S( 46,166), S( 48,169), S( 58,171) },
+    { S(-39,-36), S(-21,-15), S(  3,  8), S(  3, 18), S( 14, 34), S( 22, 54), // Queens
+      S( 28, 61), S( 41, 73), S( 43, 79), S( 48, 92), S( 56, 94), S( 60,104),
+      S( 60,113), S( 66,120), S( 67,123), S( 70,126), S( 71,133), S( 73,136),
+      S( 79,140), S( 88,143), S( 88,148), S( 99,166), S(102,170), S(102,175),
+      S(106,184), S(109,191), S(113,206), S(116,212) }
+  };
+
+  // RookOnFile[semiopen/open] contains bonuses for each rook when there is
+  // no (friendly) pawn on the rook file.
+  constexpr Score RookOnFile[] = { S(18, 7), S(44, 20) };
+
+  // ThreatByMinor/ByRook[attacked PieceType] contains bonuses according to
+  // which piece type attacks which one. Attacks on lesser pieces which are
+  // pawn-defended are not considered.
+  constexpr Score ThreatByMinor[PIECE_TYPE_NB] = {
+    S(0, 0), S(0, 31), S(39, 42), S(57, 44), S(68, 112), S(62, 120)
+  };
+
+  constexpr Score ThreatByRook[PIECE_TYPE_NB] = {
+    S(0, 0), S(0, 24), S(38, 71), S(38, 61), S(0, 38), S(51, 38)
+  };
+
+  // PassedRank[Rank] contains a bonus according to the rank of a passed pawn
+  constexpr Score PassedRank[RANK_NB] = {
+    S(0, 0), S(5, 18), S(12, 23), S(10, 31), S(57, 62), S(163, 167), S(271, 250)
+  };
+
+  // PassedFile[File] contains a bonus according to the file of a passed pawn
+  constexpr Score PassedFile[FILE_NB] = {
+    S( -1,  7), S( 0,  9), S(-9, -8), S(-30,-14),
+    S(-30,-14), S(-9, -8), S( 0,  9), S( -1,  7)
+  };
+
+  // Assorted bonuses and penalties
+  constexpr Score BishopPawns        = S(  3,  7);
+  constexpr Score CorneredBishop     = S( 50, 50);
+  constexpr Score FlankAttacks       = S(  8,  0);
+  constexpr Score Hanging            = S( 69, 36);
+  constexpr Score KingProtector      = S(  7,  8);
+  constexpr Score KnightOnQueen      = S( 16, 12);
+  constexpr Score LongDiagonalBishop = S( 45,  0);
+  constexpr Score MinorBehindPawn    = S( 18,  3);
+  constexpr Score Outpost            = S(  9,  3);
+  constexpr Score PawnlessFlank      = S( 17, 95);
+  constexpr Score RestrictedPiece    = S(  7,  7);
+  constexpr Score RookOnPawn         = S( 10, 32);
+  constexpr Score SliderOnQueen      = S( 59, 18);
+  constexpr Score ThreatByKing       = S( 24, 89);
+  constexpr Score ThreatByPawnPush   = S( 48, 39);
+  constexpr Score ThreatByRank       = S( 13,  0);
+  constexpr Score ThreatBySafePawn   = S(173, 94);
+  constexpr Score TrappedRook        = S( 47,  4);
+  constexpr Score WeakQueen          = S( 49, 15);
+  constexpr Score WeakUnopposedPawn  = S( 12, 23);
+
+#undef S
+
+  // Evaluation class computes and stores attacks tables and other working data
+  template<Tracing T>
+  class Evaluation {
+
+  public:
+    Evaluation() = delete;
+    explicit Evaluation(const Position& p) : pos(p) {}
+    Evaluation& operator=(const Evaluation&) = delete;
+    Value value();
+
+  private:
+    template<Color Us> void initialize();
+    template<Color Us, PieceType Pt> Score pieces();
+    template<Color Us> Score king() const;
+    template<Color Us> Score threats() const;
+    template<Color Us> Score passed() const;
+    template<Color Us> Score space() const;
+    ScaleFactor scale_factor(Value eg) const;
+    Score initiative(Value eg) const;
+
+    const Position& pos;
+    Material::Entry* me;
+    Pawns::Entry* pe;
+    Bitboard mobilityArea[COLOR_NB];
+    Score mobility[COLOR_NB] = { SCORE_ZERO, SCORE_ZERO };
+
+    // attackedBy[color][piece type] is a bitboard representing all squares
+    // attacked by a given color and piece type. Special "piece types" which
+    // is also calculated is ALL_PIECES.
+    Bitboard attackedBy[COLOR_NB][PIECE_TYPE_NB];
+
+    // attackedBy2[color] are the squares attacked by at least 2 units of a given
+    // color, including x-rays. But diagonal x-rays through pawns are not computed.
+    Bitboard attackedBy2[COLOR_NB];
+
+    // kingRing[color] are the squares adjacent to the king, plus (only for a
+    // king on its first rank) the squares two ranks in front. For instance,
+    // if black's king is on g8, kingRing[BLACK] is f8, h8, f7, g7, h7, f6, g6
+    // and h6.
+    Bitboard kingRing[COLOR_NB];
+
+    // kingAttackersCount[color] is the number of pieces of the given color
+    // which attack a square in the kingRing of the enemy king.
+    int kingAttackersCount[COLOR_NB];
+
+    // kingAttackersWeight[color] is the sum of the "weights" of the pieces of
+    // the given color which attack a square in the kingRing of the enemy king.
+    // The weights of the individual piece types are given by the elements in
+    // the KingAttackWeights array.
+    int kingAttackersWeight[COLOR_NB];
+
+    // kingAttacksCount[color] is the number of attacks by the given color to
+    // squares directly adjacent to the enemy king. Pieces which attack more
+    // than one square are counted multiple times. For instance, if there is
+    // a white knight on g5 and black's king is on g8, this white knight adds 2
+    // to kingAttacksCount[WHITE].
+    int kingAttacksCount[COLOR_NB];
+  };
+
+
+  // Evaluation::initialize() computes king and pawn attacks, and the king ring
+  // bitboard for a given color. This is done at the beginning of the evaluation.
+  template<Tracing T> template<Color Us>
+  void Evaluation<T>::initialize() {
+
+    constexpr Color     Them = (Us == WHITE ? BLACK : WHITE);
+    constexpr Direction Up   = (Us == WHITE ? NORTH : SOUTH);
+    constexpr Direction Down = (Us == WHITE ? SOUTH : NORTH);
+    constexpr Bitboard LowRanks = (Us == WHITE ? Rank2BB | Rank3BB: Rank7BB | Rank6BB);
+
+    const Square ksq = pos.square<KING>(Us);
+
+    Bitboard dblAttackByPawn = pawn_double_attacks_bb<Us>(pos.pieces(Us, PAWN));
+
+    // Find our pawns that are blocked or on the first two ranks
+    Bitboard b = pos.pieces(Us, PAWN) & (shift<Down>(pos.pieces()) | LowRanks);
+
+    // Squares occupied by those pawns, by our king or queen or controlled by
+    // enemy pawns are excluded from the mobility area.
+    mobilityArea[Us] = ~(b | pos.pieces(Us, KING, QUEEN) | pe->pawn_attacks(Them));
+
+    // Initialize attackedBy[] for king and pawns
+    attackedBy[Us][KING] = pos.attacks_from<KING>(ksq);
+    attackedBy[Us][PAWN] = pe->pawn_attacks(Us);
+    attackedBy[Us][ALL_PIECES] = attackedBy[Us][KING] | attackedBy[Us][PAWN];
+    attackedBy2[Us] = dblAttackByPawn | (attackedBy[Us][KING] & attackedBy[Us][PAWN]);
+
+    // Init our king safety tables
+    kingRing[Us] = attackedBy[Us][KING];
+    if (relative_rank(Us, ksq) == RANK_1)
+        kingRing[Us] |= shift<Up>(kingRing[Us]);
+
+    if (file_of(ksq) == FILE_H)
+        kingRing[Us] |= shift<WEST>(kingRing[Us]);
+
+    else if (file_of(ksq) == FILE_A)
+        kingRing[Us] |= shift<EAST>(kingRing[Us]);
+
+    kingAttackersCount[Them] = popcount(kingRing[Us] & pe->pawn_attacks(Them));
+    kingAttacksCount[Them] = kingAttackersWeight[Them] = 0;
+
+    // Remove from kingRing[] the squares defended by two pawns
+    kingRing[Us] &= ~dblAttackByPawn;
+  }
+
+
+  // Evaluation::pieces() scores pieces of a given color and type
+  template<Tracing T> template<Color Us, PieceType Pt>
+  Score Evaluation<T>::pieces() {
+
+    constexpr Color     Them = (Us == WHITE ? BLACK : WHITE);
+    constexpr Direction Down = (Us == WHITE ? SOUTH : NORTH);
+    constexpr Bitboard OutpostRanks = (Us == WHITE ? Rank4BB | Rank5BB | Rank6BB
+                                                   : Rank5BB | Rank4BB | Rank3BB);
+    const Square* pl = pos.squares<Pt>(Us);
+
+    Bitboard b, bb;
+    Score score = SCORE_ZERO;
+
+    attackedBy[Us][Pt] = 0;
+
+    for (Square s = *pl; s != SQ_NONE; s = *++pl)
+    {
+        // Find attacked squares, including x-ray attacks for bishops and rooks
+        b = Pt == BISHOP ? attacks_bb<BISHOP>(s, pos.pieces() ^ pos.pieces(QUEEN))
+          : Pt ==   ROOK ? attacks_bb<  ROOK>(s, pos.pieces() ^ pos.pieces(QUEEN) ^ pos.pieces(Us, ROOK))
+                         : pos.attacks_from<Pt>(s);
+
+        if (pos.blockers_for_king(Us) & s)
+            b &= LineBB[pos.square<KING>(Us)][s];
+
+        attackedBy2[Us] |= attackedBy[Us][ALL_PIECES] & b;
+        attackedBy[Us][Pt] |= b;
+        attackedBy[Us][ALL_PIECES] |= b;
+
+        if (b & kingRing[Them])
+        {
+            kingAttackersCount[Us]++;
+            kingAttackersWeight[Us] += KingAttackWeights[Pt];
+            kingAttacksCount[Us] += popcount(b & attackedBy[Them][KING]);
+        }
+
+        int mob = popcount(b & mobilityArea[Us]);
+
+        mobility[Us] += MobilityBonus[Pt - 2][mob];
+
+        if (Pt == BISHOP || Pt == KNIGHT)
+        {
+            // Bonus if piece is on an outpost square or can reach one
+            bb = OutpostRanks & ~pe->pawn_attacks_span(Them);
+            if (bb & s)
+                score += Outpost * (Pt == KNIGHT ? 4 : 2)
+                                 * ((attackedBy[Us][PAWN] & s) ? 2 : 1);
+
+            else if (bb &= b & ~pos.pieces(Us))
+                score += Outpost * (Pt == KNIGHT ? 2 : 1)
+                                 * ((attackedBy[Us][PAWN] & bb) ? 2 : 1);
+
+            // Knight and Bishop bonus for being right behind a pawn
+            if (shift<Down>(pos.pieces(PAWN)) & s)
+                score += MinorBehindPawn;
+
+            // Penalty if the piece is far from the king
+            score -= KingProtector * distance(s, pos.square<KING>(Us));
+
+            if (Pt == BISHOP)
+            {
+                // Penalty according to number of pawns on the same color square as the
+                // bishop, bigger when the center files are blocked with pawns.
+                Bitboard blocked = pos.pieces(Us, PAWN) & shift<Down>(pos.pieces());
+
+                score -= BishopPawns * pos.pawns_on_same_color_squares(Us, s)
+                                     * (1 + popcount(blocked & CenterFiles));
+
+                // Bonus for bishop on a long diagonal which can "see" both center squares
+                if (more_than_one(attacks_bb<BISHOP>(s, pos.pieces(PAWN)) & Center))
+                    score += LongDiagonalBishop;
+            }
+
+            // An important Chess960 pattern: A cornered bishop blocked by a friendly
+            // pawn diagonally in front of it is a very serious problem, especially
+            // when that pawn is also blocked.
+            if (   Pt == BISHOP
+                && pos.is_chess960()
+                && (s == relative_square(Us, SQ_A1) || s == relative_square(Us, SQ_H1)))
+            {
+                Direction d = pawn_push(Us) + (file_of(s) == FILE_A ? EAST : WEST);
+                if (pos.piece_on(s + d) == make_piece(Us, PAWN))
+                    score -= !pos.empty(s + d + pawn_push(Us))                ? CorneredBishop * 4
+                            : pos.piece_on(s + d + d) == make_piece(Us, PAWN) ? CorneredBishop * 2
+                                                                              : CorneredBishop;
+            }
+        }
+
+        if (Pt == ROOK)
+        {
+            // Bonus for aligning rook with enemy pawns on the same rank/file
+            if (relative_rank(Us, s) >= RANK_5)
+                score += RookOnPawn * popcount(pos.pieces(Them, PAWN) & PseudoAttacks[ROOK][s]);
+
+            // Bonus for rook on an open or semi-open file
+            if (pos.is_on_semiopen_file(Us, s))
+                score += RookOnFile[bool(pos.is_on_semiopen_file(Them, s))];
+
+            // Penalty when trapped by the king, even more if the king cannot castle
+            else if (mob <= 3)
+            {
+                File kf = file_of(pos.square<KING>(Us));
+                if ((kf < FILE_E) == (file_of(s) < kf))
+                    score -= TrappedRook * (1 + !pos.castling_rights(Us));
+            }
+        }
+
+        if (Pt == QUEEN)
+        {
+            // Penalty if any relative pin or discovered attack against the queen
+            Bitboard queenPinners;
+            if (pos.slider_blockers(pos.pieces(Them, ROOK, BISHOP), s, queenPinners))
+                score -= WeakQueen;
+        }
+    }
+    if (T)
+        Trace::add(Pt, Us, score);
+
+    return score;
+  }
+
+
+  // Evaluation::king() assigns bonuses and penalties to a king of a given color
+  template<Tracing T> template<Color Us>
+  Score Evaluation<T>::king() const {
+
+    constexpr Color    Them = (Us == WHITE ? BLACK : WHITE);
+    constexpr Bitboard Camp = (Us == WHITE ? AllSquares ^ Rank6BB ^ Rank7BB ^ Rank8BB
+                                           : AllSquares ^ Rank1BB ^ Rank2BB ^ Rank3BB);
+
+    Bitboard weak, b1, b2, safe, unsafeChecks = 0;
+    Bitboard rookChecks, queenChecks, bishopChecks, knightChecks;
+    int kingDanger = 0;
+    const Square ksq = pos.square<KING>(Us);
+
+    // Init the score with king shelter and enemy pawns storm
+    Score score = pe->king_safety<Us>(pos);
+
+    // Attacked squares defended at most once by our queen or king
+    weak =  attackedBy[Them][ALL_PIECES]
+          & ~attackedBy2[Us]
+          & (~attackedBy[Us][ALL_PIECES] | attackedBy[Us][KING] | attackedBy[Us][QUEEN]);
+
+    // Analyse the safe enemy's checks which are possible on next move
+    safe  = ~pos.pieces(Them);
+    safe &= ~attackedBy[Us][ALL_PIECES] | (weak & attackedBy2[Them]);
+
+    b1 = attacks_bb<ROOK  >(ksq, pos.pieces() ^ pos.pieces(Us, QUEEN));
+    b2 = attacks_bb<BISHOP>(ksq, pos.pieces() ^ pos.pieces(Us, QUEEN));
+
+    // Enemy rooks checks
+    rookChecks = b1 & safe & attackedBy[Them][ROOK];
+
+    if (rookChecks)
+        kingDanger += RookSafeCheck;
+    else
+        unsafeChecks |= b1 & attackedBy[Them][ROOK];
+
+    // Enemy queen safe checks: we count them only if they are from squares from
+    // which we can't give a rook check, because rook checks are more valuable.
+    queenChecks =  (b1 | b2)
+                 & attackedBy[Them][QUEEN]
+                 & safe
+                 & ~attackedBy[Us][QUEEN]
+                 & ~rookChecks;
+
+    if (queenChecks)
+        kingDanger += QueenSafeCheck;
+
+    // Enemy bishops checks: we count them only if they are from squares from
+    // which we can't give a queen check, because queen checks are more valuable.
+    bishopChecks =  b2
+                  & attackedBy[Them][BISHOP]
+                  & safe
+                  & ~queenChecks;
+
+    if (bishopChecks)
+        kingDanger += BishopSafeCheck;
+    else
+        unsafeChecks |= b2 & attackedBy[Them][BISHOP];
+
+    // Enemy knights checks
+    knightChecks = pos.attacks_from<KNIGHT>(ksq) & attackedBy[Them][KNIGHT];
+
+    if (knightChecks & safe)
+        kingDanger += KnightSafeCheck;
+    else
+        unsafeChecks |= knightChecks;
+
+    // Unsafe or occupied checking squares will also be considered, as long as
+    // the square is in the attacker's mobility area.
+    unsafeChecks &= mobilityArea[Them];
+
+    // Find the squares that opponent attacks in our king flank, and the squares
+    // which are attacked twice in that flank.
+    b1 = attackedBy[Them][ALL_PIECES] & KingFlank[file_of(ksq)] & Camp;
+    b2 = b1 & attackedBy2[Them];
+
+    int kingFlankAttacks = popcount(b1) + popcount(b2);
+
+    kingDanger +=        kingAttackersCount[Them] * kingAttackersWeight[Them]
+                 +  69 * kingAttacksCount[Them]
+                 + 185 * popcount(kingRing[Us] & weak)
+                 - 100 * bool(attackedBy[Us][KNIGHT] & attackedBy[Us][KING])
+                 -  35 * bool(attackedBy[Us][BISHOP] & attackedBy[Us][KING])
+                 + 150 * popcount(pos.blockers_for_king(Us) | unsafeChecks)
+                 - 873 * !pos.count<QUEEN>(Them)
+                 -   6 * mg_value(score) / 8
+                 +       mg_value(mobility[Them] - mobility[Us])
+                 +   5 * kingFlankAttacks * kingFlankAttacks / 16
+                 -   7;
+
+    // Transform the kingDanger units into a Score, and subtract it from the evaluation
+    if (kingDanger > 100)
+        score -= make_score(kingDanger * kingDanger / 4096, kingDanger / 16);
+
+    // Penalty when our king is on a pawnless flank
+    if (!(pos.pieces(PAWN) & KingFlank[file_of(ksq)]))
+        score -= PawnlessFlank;
+
+    // Penalty if king flank is under attack, potentially moving toward the king
+    score -= FlankAttacks * kingFlankAttacks;
+
+    if (T)
+        Trace::add(KING, Us, score);
+
+    return score;
+  }
+
+
+  // Evaluation::threats() assigns bonuses according to the types of the
+  // attacking and the attacked pieces.
+  template<Tracing T> template<Color Us>
+  Score Evaluation<T>::threats() const {
+
+    constexpr Color     Them     = (Us == WHITE ? BLACK   : WHITE);
+    constexpr Direction Up       = (Us == WHITE ? NORTH   : SOUTH);
+    constexpr Bitboard  TRank3BB = (Us == WHITE ? Rank3BB : Rank6BB);
+
+    Bitboard b, weak, defended, nonPawnEnemies, stronglyProtected, safe;
+    Score score = SCORE_ZERO;
+
+    // Non-pawn enemies
+    nonPawnEnemies = pos.pieces(Them) & ~pos.pieces(PAWN);
+
+    // Squares strongly protected by the enemy, either because they defend the
+    // square with a pawn, or because they defend the square twice and we don't.
+    stronglyProtected =  attackedBy[Them][PAWN]
+                       | (attackedBy2[Them] & ~attackedBy2[Us]);
+
+    // Non-pawn enemies, strongly protected
+    defended = nonPawnEnemies & stronglyProtected;
+
+    // Enemies not strongly protected and under our attack
+    weak = pos.pieces(Them) & ~stronglyProtected & attackedBy[Us][ALL_PIECES];
+
+    // Safe or protected squares
+    safe = ~attackedBy[Them][ALL_PIECES] | attackedBy[Us][ALL_PIECES];
+
+    // Bonus according to the kind of attacking pieces
+    if (defended | weak)
+    {
+        b = (defended | weak) & (attackedBy[Us][KNIGHT] | attackedBy[Us][BISHOP]);
+        while (b)
+        {
+            Square s = pop_lsb(&b);
+            score += ThreatByMinor[type_of(pos.piece_on(s))];
+            if (type_of(pos.piece_on(s)) != PAWN)
+                score += ThreatByRank * (int)relative_rank(Them, s);
+        }
+
+        b = weak & attackedBy[Us][ROOK];
+        while (b)
+        {
+            Square s = pop_lsb(&b);
+            score += ThreatByRook[type_of(pos.piece_on(s))];
+            if (type_of(pos.piece_on(s)) != PAWN)
+                score += ThreatByRank * (int)relative_rank(Them, s);
+        }
+
+        if (weak & attackedBy[Us][KING])
+            score += ThreatByKing;
+
+        b =  ~attackedBy[Them][ALL_PIECES]
+           | (nonPawnEnemies & attackedBy2[Us]);
+        score += Hanging * popcount(weak & b);
+    }
+
+    // Bonus for restricting their piece moves
+    b =   attackedBy[Them][ALL_PIECES]
+       & ~stronglyProtected
+       &  attackedBy[Us][ALL_PIECES];
+
+    score += RestrictedPiece * popcount(b);
+
+    // Bonus for enemy unopposed weak pawns
+    if (pos.pieces(Us, ROOK, QUEEN))
+        score += WeakUnopposedPawn * pe->weak_unopposed(Them);
+
+    // Find squares where our pawns can push on the next move
+    b  = shift<Up>(pos.pieces(Us, PAWN)) & ~pos.pieces();
+    b |= shift<Up>(b & TRank3BB) & ~pos.pieces();
+
+    // Keep only the squares which are relatively safe
+    b &= ~attackedBy[Them][PAWN] & safe;
+
+    // Bonus for safe pawn threats on the next move
+    b = pawn_attacks_bb<Us>(b) & pos.pieces(Them);
+    score += ThreatByPawnPush * popcount(b);
+
+    // Our safe or protected pawns
+    b = pos.pieces(Us, PAWN) & safe;
+
+    b = pawn_attacks_bb<Us>(b) & nonPawnEnemies;
+    score += ThreatBySafePawn * popcount(b);
+
+    // Bonus for threats on the next moves against enemy queen
+    if (pos.count<QUEEN>(Them) == 1)
+    {
+        Square s = pos.square<QUEEN>(Them);
+        safe = mobilityArea[Us] & ~stronglyProtected;
+
+        b = attackedBy[Us][KNIGHT] & pos.attacks_from<KNIGHT>(s);
+
+        score += KnightOnQueen * popcount(b & safe);
+
+        b =  (attackedBy[Us][BISHOP] & pos.attacks_from<BISHOP>(s))
+           | (attackedBy[Us][ROOK  ] & pos.attacks_from<ROOK  >(s));
+
+        score += SliderOnQueen * popcount(b & safe & attackedBy2[Us]);
+    }
+
+    if (T)
+        Trace::add(THREAT, Us, score);
+
+    return score;
+  }
+
+  // Evaluation::passed() evaluates the passed pawns and candidate passed
+  // pawns of the given color.
+
+  template<Tracing T> template<Color Us>
+  Score Evaluation<T>::passed() const {
+
+    constexpr Color     Them = (Us == WHITE ? BLACK : WHITE);
+    constexpr Direction Up   = (Us == WHITE ? NORTH : SOUTH);
+
+    auto king_proximity = [&](Color c, Square s) {
+      return std::min(distance(pos.square<KING>(c), s), 5);
+    };
+
+    Bitboard b, bb, squaresToQueen, defendedSquares, unsafeSquares;
+    Score score = SCORE_ZERO;
+
+    b = pe->passed_pawns(Us);
+
+    while (b)
+    {
+        Square s = pop_lsb(&b);
+
+        assert(!(pos.pieces(Them, PAWN) & forward_file_bb(Us, s + Up)));
+
+        int r = relative_rank(Us, s);
+
+        Score bonus = PassedRank[r];
+
+        if (r > RANK_3)
+        {
+            int w = (r-2) * (r-2) + 2;
+            Square blockSq = s + Up;
+
+            // Adjust bonus based on the king's proximity
+            bonus += make_score(0, (  king_proximity(Them, blockSq) * 5
+                                    - king_proximity(Us,   blockSq) * 2) * w);
+
+            // If blockSq is not the queening square then consider also a second push
+            if (r != RANK_7)
+                bonus -= make_score(0, king_proximity(Us, blockSq + Up) * w);
+
+            // If the pawn is free to advance, then increase the bonus
+            if (pos.empty(blockSq))
+            {
+                // If there is a rook or queen attacking/defending the pawn from behind,
+                // consider all the squaresToQueen. Otherwise consider only the squares
+                // in the pawn's path attacked or occupied by the enemy.
+                defendedSquares = unsafeSquares = squaresToQueen = forward_file_bb(Us, s);
+
+                bb = forward_file_bb(Them, s) & pos.pieces(ROOK, QUEEN);
+
+                if (!(pos.pieces(Us) & bb))
+                    defendedSquares &= attackedBy[Us][ALL_PIECES];
+
+                if (!(pos.pieces(Them) & bb))
+                    unsafeSquares &= attackedBy[Them][ALL_PIECES] | pos.pieces(Them);
+
+                // If there aren't any enemy attacks, assign a big bonus. Otherwise
+                // assign a smaller bonus if the block square isn't attacked.
+                int k = !unsafeSquares ? 20 : !(unsafeSquares & blockSq) ? 9 : 0;
+
+                // If the path to the queen is fully defended, assign a big bonus.
+                // Otherwise assign a smaller bonus if the block square is defended.
+                if (defendedSquares == squaresToQueen)
+                    k += 6;
+
+                else if (defendedSquares & blockSq)
+                    k += 4;
+
+                bonus += make_score(k * w, k * w);
+            }
+        } // r > RANK_3
+
+        // Scale down bonus for candidate passers which need more than one
+        // pawn push to become passed, or have a pawn in front of them.
+        if (   !pos.pawn_passed(Us, s + Up)
+            || (pos.pieces(PAWN) & forward_file_bb(Us, s)))
+            bonus = bonus / 2;
+
+        score += bonus + PassedFile[file_of(s)];
+    }
+
+    if (T)
+        Trace::add(PASSED, Us, score);
+
+    return score;
+  }
+
+
+  // Evaluation::space() computes the space evaluation for a given side. The
+  // space evaluation is a simple bonus based on the number of safe squares
+  // available for minor pieces on the central four files on ranks 2--4. Safe
+  // squares one, two or three squares behind a friendly pawn are counted
+  // twice. Finally, the space bonus is multiplied by a weight. The aim is to
+  // improve play on game opening.
+
+  template<Tracing T> template<Color Us>
+  Score Evaluation<T>::space() const {
+
+    if (pos.non_pawn_material() < SpaceThreshold)
+        return SCORE_ZERO;
+
+    constexpr Color Them     = (Us == WHITE ? BLACK : WHITE);
+    constexpr Direction Down = (Us == WHITE ? SOUTH : NORTH);
+    constexpr Bitboard SpaceMask =
+      Us == WHITE ? CenterFiles & (Rank2BB | Rank3BB | Rank4BB)
+                  : CenterFiles & (Rank7BB | Rank6BB | Rank5BB);
+
+    // Find the available squares for our pieces inside the area defined by SpaceMask
+    Bitboard safe =   SpaceMask
+                   & ~pos.pieces(Us, PAWN)
+                   & ~attackedBy[Them][PAWN];
+
+    // Find all squares which are at most three squares behind some friendly pawn
+    Bitboard behind = pos.pieces(Us, PAWN);
+    behind |= shift<Down>(behind);
+    behind |= shift<Down+Down>(behind);
+
+    int bonus = popcount(safe) + popcount(behind & safe);
+    int weight = pos.count<ALL_PIECES>(Us) - 1;
+    Score score = make_score(bonus * weight * weight / 16, 0);
+
+    if (T)
+        Trace::add(SPACE, Us, score);
+
+    return score;
+  }
+
+
+  // Evaluation::initiative() computes the initiative correction value
+  // for the position. It is a second order bonus/malus based on the
+  // known attacking/defending status of the players.
+
+  template<Tracing T>
+  Score Evaluation<T>::initiative(Value eg) const {
+
+    int outflanking =  distance<File>(pos.square<KING>(WHITE), pos.square<KING>(BLACK))
+                     - distance<Rank>(pos.square<KING>(WHITE), pos.square<KING>(BLACK));
+
+    bool pawnsOnBothFlanks =   (pos.pieces(PAWN) & QueenSide)
+                            && (pos.pieces(PAWN) & KingSide);
+
+    // Compute the initiative bonus for the attacking side
+    int complexity =   9 * pe->passed_count()
+                    + 11 * pos.count<PAWN>()
+                    +  9 * outflanking
+                    + 18 * pawnsOnBothFlanks
+                    + 49 * !pos.non_pawn_material()
+                    -103 ;
+
+    // Now apply the bonus: note that we find the attacking side by extracting
+    // the sign of the endgame value, and that we carefully cap the bonus so
+    // that the endgame score will never change sign after the bonus.
+    int v = ((eg > 0) - (eg < 0)) * std::max(complexity, -abs(eg));
+
+    if (T)
+        Trace::add(INITIATIVE, make_score(0, v));
+
+    return make_score(0, v);
+  }
+
+
+  // Evaluation::scale_factor() computes the scale factor for the winning side
+
+  template<Tracing T>
+  ScaleFactor Evaluation<T>::scale_factor(Value eg) const {
+
+    Color strongSide = eg > VALUE_DRAW ? WHITE : BLACK;
+    int sf = me->scale_factor(pos, strongSide);
+
+    // If scale is not already specific, scale down the endgame via general heuristics
+    if (sf == SCALE_FACTOR_NORMAL)
+    {
+        if (   pos.opposite_bishops()
+            && pos.non_pawn_material() == 2 * BishopValueMg)
+            sf = 16 + 4 * pe->passed_count();
+        else
+            sf = std::min(40 + (pos.opposite_bishops() ? 2 : 7) * pos.count<PAWN>(strongSide), sf);
+
+    }
+
+    return ScaleFactor(sf);
+  }
+
+
+  // Evaluation::value() is the main function of the class. It computes the various
+  // parts of the evaluation and returns the value of the position from the point
+  // of view of the side to move.
+
+  template<Tracing T>
+  Value Evaluation<T>::value() {
+
+    assert(!pos.checkers());
+
+    // Probe the material hash table
+    me = Material::probe(pos);
+
+    // If we have a specialized evaluation function for the current material
+    // configuration, call it and return.
+    if (me->specialized_eval_exists())
+        return me->evaluate(pos);
+
+    // Initialize score by reading the incrementally updated scores included in
+    // the position object (material + piece square tables) and the material
+    // imbalance. Score is computed internally from the white point of view.
+    Score score = pos.psq_score() + me->imbalance() + pos.this_thread()->contempt;
+
+    // Probe the pawn hash table
+    pe = Pawns::probe(pos);
+    score += pe->pawn_score(WHITE) - pe->pawn_score(BLACK);
+
+    // Early exit if score is high
+    Value v = (mg_value(score) + eg_value(score)) / 2;
+    if (abs(v) > (LazyThreshold + pos.non_pawn_material() / 64))
+       return pos.side_to_move() == WHITE ? v : -v;
+
+    // Main evaluation begins here
+
+    initialize<WHITE>();
+    initialize<BLACK>();
+
+    // Pieces should be evaluated first (populate attack tables)
+    score +=  pieces<WHITE, KNIGHT>() - pieces<BLACK, KNIGHT>()
+            + pieces<WHITE, BISHOP>() - pieces<BLACK, BISHOP>()
+            + pieces<WHITE, ROOK  >() - pieces<BLACK, ROOK  >()
+            + pieces<WHITE, QUEEN >() - pieces<BLACK, QUEEN >();
+
+    score += mobility[WHITE] - mobility[BLACK];
+
+    score +=  king<   WHITE>() - king<   BLACK>()
+            + threats<WHITE>() - threats<BLACK>()
+            + passed< WHITE>() - passed< BLACK>()
+            + space<  WHITE>() - space<  BLACK>();
+
+    score += initiative(eg_value(score));
+
+    // Interpolate between a middlegame and a (scaled by 'sf') endgame score
+    ScaleFactor sf = scale_factor(eg_value(score));
+    v =  mg_value(score) * int(me->game_phase())
+       + eg_value(score) * int(PHASE_MIDGAME - me->game_phase()) * sf / SCALE_FACTOR_NORMAL;
+
+    v /= PHASE_MIDGAME;
+
+    // In case of tracing add all remaining individual evaluation terms
+    if (T)
+    {
+        Trace::add(MATERIAL, pos.psq_score());
+        Trace::add(IMBALANCE, me->imbalance());
+        Trace::add(PAWN, pe->pawn_score(WHITE), pe->pawn_score(BLACK));
+        Trace::add(MOBILITY, mobility[WHITE], mobility[BLACK]);
+        Trace::add(TOTAL, score);
+    }
+
+    return  (pos.side_to_move() == WHITE ? v : -v) // Side to move point of view
+           + Eval::Tempo;
+  }
+
+} // namespace
+
+
+/// evaluate() is the evaluator for the outer world. It returns a static
+/// evaluation of the position from the point of view of the side to move.
+
+Value Eval::evaluate(const Position& pos) {
+  return Evaluation<NO_TRACE>(pos).value();
+}
+
+
+/// trace() is like evaluate(), but instead of returning a value, it returns
+/// a string (suitable for outputting to stdout) that contains the detailed
+/// descriptions and values of each evaluation term. Useful for debugging.
+
+std::string Eval::trace(const Position& pos) {
+
+  std::memset(scores, 0, sizeof(scores));
+
+  pos.this_thread()->contempt = SCORE_ZERO; // Reset any dynamic contempt
+
+  Value v = Evaluation<TRACE>(pos).value();
+
+  v = pos.side_to_move() == WHITE ? v : -v; // Trace scores are from white's point of view
+
+  std::stringstream ss;
+  ss << std::showpoint << std::noshowpos << std::fixed << std::setprecision(2)
+     << "     Term    |    White    |    Black    |    Total   \n"
+     << "             |   MG    EG  |   MG    EG  |   MG    EG \n"
+     << " ------------+-------------+-------------+------------\n"
+     << "    Material | " << Term(MATERIAL)
+     << "   Imbalance | " << Term(IMBALANCE)
+     << "       Pawns | " << Term(PAWN)
+     << "     Knights | " << Term(KNIGHT)
+     << "     Bishops | " << Term(BISHOP)
+     << "       Rooks | " << Term(ROOK)
+     << "      Queens | " << Term(QUEEN)
+     << "    Mobility | " << Term(MOBILITY)
+     << " King safety | " << Term(KING)
+     << "     Threats | " << Term(THREAT)
+     << "      Passed | " << Term(PASSED)
+     << "       Space | " << Term(SPACE)
+     << "  Initiative | " << Term(INITIATIVE)
+     << " ------------+-------------+-------------+------------\n"
+     << "       Total | " << Term(TOTAL);
+
+  ss << "\nTotal evaluation: " << to_cp(v) << " (white side)\n";
+
+  return ss.str();
+}

From b330602cdc57c10e169ed0597fdb300a89118256 Mon Sep 17 00:00:00 2001
From: Hisayori Noda <nodchip@gmail.com>
Date: Sat, 15 Jun 2019 17:08:47 +0900
Subject: [PATCH 003/583] Fixed compile errors.

---
 src/eval/nnue/evaluate_nnue.cpp               |   4 +-
 src/eval/nnue/evaluate_nnue.h                 |   2 +
 src/eval/nnue/features/feature_set.h          |   6 +-
 src/eval/nnue/features/half_kp.cpp            |   2 +-
 src/eval/nnue/features/half_kp.h              |   2 +-
 src/eval/nnue/features/half_relative_kp.cpp   |   6 +-
 src/eval/nnue/features/half_relative_kp.h     |   2 +-
 src/eval/nnue/features/k.cpp                  |   4 +-
 src/eval/nnue/features/k.h                    |   2 +-
 src/eval/nnue/nnue_accumulator.h              |   2 -
 src/eval/nnue/nnue_architecture.h             |   1 +
 src/eval/nnue/nnue_feature_transformer.h      |   8 +-
 .../trainer/features/factorizer_half_kp.h     |   2 +-
 src/evaluate.h                                | 175 ++++++++++++++++++
 src/misc.cpp                                  |  16 ++
 src/misc.h                                    |  66 +++++++
 src/position.h                                |  19 ++
 src/types.h                                   |  39 +++-
 18 files changed, 336 insertions(+), 22 deletions(-)

diff --git a/src/eval/nnue/evaluate_nnue.cpp b/src/eval/nnue/evaluate_nnue.cpp
index 84707bf9..de86ebe9 100644
--- a/src/eval/nnue/evaluate_nnue.cpp
+++ b/src/eval/nnue/evaluate_nnue.cpp
@@ -1,10 +1,12 @@
 ﻿// NNUE評価関数の計算に関するコード
 
 #include <fstream>
+#include <iostream>
 
 #include "../../evaluate.h"
 #include "../../position.h"
 #include "../../misc.h"
+#include "../../uci.h"
 
 #include "evaluate_nnue.h"
 
@@ -263,7 +265,7 @@ Value compute_eval(const Position& pos) {
 }
 
 // 評価関数
-Value evaluate(const Position& pos) {
+Value NNUE::evaluate(const Position& pos) {
   const auto& accumulator = pos.state()->accumulator;
   if (accumulator.computed_score) {
     return accumulator.score;
diff --git a/src/eval/nnue/evaluate_nnue.h b/src/eval/nnue/evaluate_nnue.h
index a95f2bd9..1ca48d5b 100644
--- a/src/eval/nnue/evaluate_nnue.h
+++ b/src/eval/nnue/evaluate_nnue.h
@@ -55,6 +55,8 @@ bool ReadParameters(std::istream& stream);
 // 評価関数パラメータを書き込む
 bool WriteParameters(std::ostream& stream);
 
+Value evaluate(const Position& pos);
+
 }  // namespace NNUE
 
 }  // namespace Eval
diff --git a/src/eval/nnue/features/feature_set.h b/src/eval/nnue/features/feature_set.h
index 5d312a2e..919be65d 100644
--- a/src/eval/nnue/features/feature_set.h
+++ b/src/eval/nnue/features/feature_set.h
@@ -71,7 +71,7 @@ class FeatureSetBase {
   template <typename IndexListType>
   static void AppendActiveIndices(
       const Position& pos, TriggerEvent trigger, IndexListType active[2]) {
-    for (const auto perspective : COLOR) {
+    for (const auto perspective : Colors) {
       Derived::CollectActiveIndices(
           pos, trigger, perspective, &active[perspective]);
     }
@@ -85,7 +85,7 @@ class FeatureSetBase {
     const auto& dp = pos.state()->dirtyPiece;
     if (dp.dirty_num == 0) return;
 
-    for (const auto perspective : COLOR) {
+    for (const auto perspective : Colors) {
       reset[perspective] = false;
       switch (trigger) {
         case TriggerEvent::kNone:
@@ -105,7 +105,7 @@ class FeatureSetBase {
           reset[perspective] = true;
           break;
         default:
-          ASSERT_LV5(false);
+          assert(false);
           break;
       }
       if (reset[perspective]) {
diff --git a/src/eval/nnue/features/half_kp.cpp b/src/eval/nnue/features/half_kp.cpp
index f1a1f57f..1741f3ce 100644
--- a/src/eval/nnue/features/half_kp.cpp
+++ b/src/eval/nnue/features/half_kp.cpp
@@ -28,7 +28,7 @@ inline void HalfKP<AssociatedKing>::GetPieces(
   const PieceNumber target = (AssociatedKing == Side::kFriend) ?
       static_cast<PieceNumber>(PIECE_NUMBER_KING + perspective) :
       static_cast<PieceNumber>(PIECE_NUMBER_KING + ~perspective);
-  *sq_target_k = static_cast<Square>(((*pieces)[target] - f_king) % SQ_NB);
+  *sq_target_k = static_cast<Square>(((*pieces)[target] - f_king) % SQUARE_NB);
 }
 
 // 特徴量のうち、値が1であるインデックスのリストを取得する
diff --git a/src/eval/nnue/features/half_kp.h b/src/eval/nnue/features/half_kp.h
index ffbc2947..556127d3 100644
--- a/src/eval/nnue/features/half_kp.h
+++ b/src/eval/nnue/features/half_kp.h
@@ -26,7 +26,7 @@ class HalfKP {
       0x5D69D5B9u ^ (AssociatedKing == Side::kFriend);
   // 特徴量の次元数
   static constexpr IndexType kDimensions =
-      static_cast<IndexType>(SQ_NB) * static_cast<IndexType>(fe_end);
+      static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(fe_end);
   // 特徴量のうち、同時に値が1となるインデックスの数の最大値
   static constexpr IndexType kMaxActiveDimensions = PIECE_NUMBER_KING;
   // 差分計算の代わりに全計算を行うタイミング
diff --git a/src/eval/nnue/features/half_relative_kp.cpp b/src/eval/nnue/features/half_relative_kp.cpp
index 3ee49ff9..d0810df6 100644
--- a/src/eval/nnue/features/half_relative_kp.cpp
+++ b/src/eval/nnue/features/half_relative_kp.cpp
@@ -17,8 +17,8 @@ inline IndexType HalfRelativeKP<AssociatedKing>::MakeIndex(
     Square sq_k, BonaPiece p) {
   constexpr IndexType W = kBoardWidth;
   constexpr IndexType H = kBoardHeight;
-  const IndexType piece_index = (p - fe_hand_end) / SQ_NB;
-  const Square sq_p = static_cast<Square>((p - fe_hand_end) % SQ_NB);
+  const IndexType piece_index = (p - fe_hand_end) / SQUARE_NB;
+  const Square sq_p = static_cast<Square>((p - fe_hand_end) % SQUARE_NB);
   const IndexType relative_file = file_of(sq_p) - file_of(sq_k) + (W / 2);
   const IndexType relative_rank = rank_of(sq_p) - rank_of(sq_k) + (H / 2);
   return H * W * piece_index + H * relative_file + relative_rank;
@@ -35,7 +35,7 @@ inline void HalfRelativeKP<AssociatedKing>::GetPieces(
   const PieceNumber target = (AssociatedKing == Side::kFriend) ?
       static_cast<PieceNumber>(PIECE_NUMBER_KING + perspective) :
       static_cast<PieceNumber>(PIECE_NUMBER_KING + ~perspective);
-  *sq_target_k = static_cast<Square>(((*pieces)[target] - f_king) % SQ_NB);
+  *sq_target_k = static_cast<Square>(((*pieces)[target] - f_king) % SQUARE_NB);
 }
 
 // 特徴量のうち、値が1であるインデックスのリストを取得する
diff --git a/src/eval/nnue/features/half_relative_kp.h b/src/eval/nnue/features/half_relative_kp.h
index f9afd446..99e10c57 100644
--- a/src/eval/nnue/features/half_relative_kp.h
+++ b/src/eval/nnue/features/half_relative_kp.h
@@ -25,7 +25,7 @@ class HalfRelativeKP {
   static constexpr std::uint32_t kHashValue =
       0xF9180919u ^ (AssociatedKing == Side::kFriend);
   // 玉を除いた駒種
-  static constexpr IndexType kNumPieceKinds = (fe_end - fe_hand_end) / SQ_NB;
+  static constexpr IndexType kNumPieceKinds = (fe_end - fe_hand_end) / SQUARE_NB;
   // 玉を中央に置いた仮想的な盤の幅
   static constexpr IndexType kBoardWidth = FILE_NB * 2 - 1;
   // 玉を中央に置いた仮想的な盤の高さ
diff --git a/src/eval/nnue/features/k.cpp b/src/eval/nnue/features/k.cpp
index 9c019e08..03f66ff5 100644
--- a/src/eval/nnue/features/k.cpp
+++ b/src/eval/nnue/features/k.cpp
@@ -20,8 +20,8 @@ void K::AppendActiveIndices(
   const BonaPiece* pieces = (perspective == BLACK) ?
       pos.eval_list()->piece_list_fb() :
       pos.eval_list()->piece_list_fw();
-  ASSERT_LV5(pieces[PIECE_NUMBER_BKING] != BONA_PIECE_ZERO);
-  ASSERT_LV5(pieces[PIECE_NUMBER_WKING] != BONA_PIECE_ZERO);
+  assert(pieces[PIECE_NUMBER_BKING] != BONA_PIECE_ZERO);
+  assert(pieces[PIECE_NUMBER_WKING] != BONA_PIECE_ZERO);
   for (PieceNumber i = PIECE_NUMBER_KING; i < PIECE_NUMBER_NB; ++i) {
     active->push_back(pieces[i] - fe_end);
   }
diff --git a/src/eval/nnue/features/k.h b/src/eval/nnue/features/k.h
index a5dda8fd..1a01c471 100644
--- a/src/eval/nnue/features/k.h
+++ b/src/eval/nnue/features/k.h
@@ -22,7 +22,7 @@ class K {
   // 評価関数ファイルに埋め込むハッシュ値
   static constexpr std::uint32_t kHashValue = 0xD3CEE169u;
   // 特徴量の次元数
-  static constexpr IndexType kDimensions = SQ_NB * 2;
+  static constexpr IndexType kDimensions = SQUARE_NB * 2;
   // 特徴量のうち、同時に値が1となるインデックスの数の最大値
   static constexpr IndexType kMaxActiveDimensions = 2;
   // 差分計算の代わりに全計算を行うタイミング
diff --git a/src/eval/nnue/nnue_accumulator.h b/src/eval/nnue/nnue_accumulator.h
index c7c43a3e..4241edb3 100644
--- a/src/eval/nnue/nnue_accumulator.h
+++ b/src/eval/nnue/nnue_accumulator.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_ACCUMULATOR_H_
 #define _NNUE_ACCUMULATOR_H_
 
-#include "../../config.h"
-
 #if defined(EVAL_NNUE)
 
 #include "nnue_architecture.h"
diff --git a/src/eval/nnue/nnue_architecture.h b/src/eval/nnue/nnue_architecture.h
index 6815ada5..5f11a02b 100644
--- a/src/eval/nnue/nnue_architecture.h
+++ b/src/eval/nnue/nnue_architecture.h
@@ -8,6 +8,7 @@
 // 入力特徴量とネットワーク構造が定義されたヘッダをincludeする
 
 // KP256型を使いたいときは、これを事前にdefineする。
+#define EVAL_NNUE_KP256
 #if defined(EVAL_NNUE_KP256)
 #include "architectures/k-p_256x2-32-32.h"
 #else // #if defined(EVAL_NNUE_HALFKP256)
diff --git a/src/eval/nnue/nnue_feature_transformer.h b/src/eval/nnue/nnue_feature_transformer.h
index 22f5df82..f7c2080f 100644
--- a/src/eval/nnue/nnue_feature_transformer.h
+++ b/src/eval/nnue/nnue_feature_transformer.h
@@ -142,9 +142,9 @@ class FeatureTransformer {
       }
 #else
       for (IndexType j = 0; j < kHalfDimensions; ++j) {
-        BiasType sum = accumulation[perspectives[p]][0][j];
+        BiasType sum = accumulation[static_cast<int>(perspectives[p])][0][j];
         for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-          sum += accumulation[perspectives[p]][i][j];
+          sum += accumulation[static_cast<int>(perspectives[p])][i][j];
         }
         output[offset + j] = static_cast<OutputType>(
             std::max<int>(0, std::min<int>(127, sum)));
@@ -161,7 +161,7 @@ class FeatureTransformer {
       Features::IndexList active_indices[2];
       RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
                                        active_indices);
-      for (const auto perspective : COLOR) {
+      for (const auto perspective : Colors) {
         if (i == 0) {
           std::memcpy(accumulator.accumulation[perspective][i], biases_,
                       kHalfDimensions * sizeof(BiasType));
@@ -217,7 +217,7 @@ class FeatureTransformer {
       bool reset[2];
       RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
                                         removed_indices, added_indices, reset);
-      for (const auto perspective : COLOR) {
+      for (const auto perspective : Colors) {
 #if defined(USE_AVX2)
         constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
         auto accumulation = reinterpret_cast<__m256i*>(
diff --git a/src/eval/nnue/trainer/features/factorizer_half_kp.h b/src/eval/nnue/trainer/features/factorizer_half_kp.h
index 20e4460e..5682e8e6 100644
--- a/src/eval/nnue/trainer/features/factorizer_half_kp.h
+++ b/src/eval/nnue/trainer/features/factorizer_half_kp.h
@@ -43,7 +43,7 @@ class Factorizer<HalfKP<AssociatedKing>> {
     // kFeaturesHalfKP
     {true, FeatureType::kDimensions},
     // kFeaturesHalfK
-    {true, SQ_NB},
+    {true, SQUARE_NB},
     // kFeaturesP
     {true, Factorizer<P>::GetDimensions()},
     // kFeaturesHalfRelativeKP
diff --git a/src/evaluate.h b/src/evaluate.h
index cccdd25d..c96a1288 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -34,6 +34,181 @@ constexpr Value Tempo = Value(28); // Must be visible to search
 std::string trace(const Position& pos);
 
 Value evaluate(const Position& pos);
+
+// --- �]���֐��Ŏg���萔 KPP(�ʂƔC��2��)��P�ɑ�������enum
+
+// (�]���֐��̎����̂Ƃ��ɂ́ABonaPiece�͎��R�ɒ�`�������̂ł����ł͒�`���Ȃ��B)
+
+
+// Bonanza��KKP/KPP�ƌ����Ƃ���P(Piece)��\������^�B
+// �� KPP�����߂�Ƃ��ɁA39�̒n�_�̕��̂悤�ɁA���~���ɑ΂��Ĉ�ӂȔԍ����K�v�ƂȂ�B
+enum BonaPiece : int32_t
+{
+	// f = friend(�����)�̈Ӗ��Be = enemy(�����)�̈Ӗ�
+
+	// ���������̎��̒l
+	BONA_PIECE_NOT_INIT = -1,
+
+	// �����ȋ�B����̂Ƃ��Ȃǂ́A�s�v�ȋ�������Ɉړ�������B
+	BONA_PIECE_ZERO = 0,
+
+	fe_hand_end = BONA_PIECE_ZERO + 1,
+
+    // Bonanza�̂悤�ɔՏ�̂��肦�Ȃ����̕��⍁�̔ԍ����l�߂Ȃ��B
+	// ���R1) �w�K�̂Ƃ��ɑ���PP��1�i�ڂɍ�������Ƃ��������āA������t�ϊ��ɂ����Đ������\������̂�����B
+	// ���R2) �c�^Bitboard����Square����̕ϊ��ɍ���B
+
+	// --- �Տ�̋�
+	f_pawn = fe_hand_end,
+	e_pawn = f_pawn + SQUARE_NB,
+	f_knight = e_pawn + SQUARE_NB,
+	e_knight = f_knight + SQUARE_NB,
+	f_bishop = e_knight + SQUARE_NB,
+	e_bishop = f_bishop + SQUARE_NB,
+	f_rook = e_bishop + SQUARE_NB,
+	e_rook = f_rook + SQUARE_NB,
+	f_queen = e_rook + SQUARE_NB,
+	e_queen = f_queen + SQUARE_NB,
+	fe_end = e_queen + SQUARE_NB,
+	f_king = fe_end,
+	e_king = f_king + SQUARE_NB,
+	fe_end2 = e_king + SQUARE_NB, // �ʂ��܂߂������̔ԍ��B
+};
+
+
+// BonaPiece����肩�猩���Ƃ�(����39�̕�����肩�猩��ƌ���71�̕�)�̔ԍ��Ƃ�
+// �y�A�ɂ������̂�ExtBonaPiece�^�ƌĂԂ��Ƃɂ���B
+union ExtBonaPiece
+{
+	struct {
+		BonaPiece fw; // from white
+		BonaPiece fb; // from black
+	};
+	BonaPiece from[2];
+
+	ExtBonaPiece() {}
+	ExtBonaPiece(BonaPiece fw_, BonaPiece fb_) : fw(fw_), fb(fb_) {}
+};
+
+// �����̎w����ɂ���Ăǂ�����ǂ��Ɉړ������̂��̏��B
+// ���ExtBonaPiece�\���ł���Ƃ���B
+struct ChangedBonaPiece
+{
+	ExtBonaPiece old_piece;
+	ExtBonaPiece new_piece;
+};
+
+// KPP�e�[�u���̔Տ�̋�pc�ɑΉ�����BonaPiece�����߂邽�߂̔z��B
+// ��)
+// BonaPiece fb = kpp_board_index[pc].fb + sq; // ��肩�猩��sq�ɂ���pc�ɑΉ�����BonaPiece
+// BonaPiece fw = kpp_board_index[pc].fw + sq; // ��肩�猩��sq�ɂ���pc�ɑΉ�����BonaPiece
+extern ExtBonaPiece kpp_board_index[PIECE_NB];
+
+// �]���֐��ŗp�����X�g�B�ǂ̋�(PieceNumber)���ǂ��ɂ���̂�(BonaPiece)��ێ����Ă���\����
+struct EvalList
+{
+	// �]���֐�(FV38�^)�ŗp�����ԍ��̃��X�g
+	BonaPiece* piece_list_fw() const { return const_cast<BonaPiece*>(pieceListFw); }
+	BonaPiece* piece_list_fb() const { return const_cast<BonaPiece*>(pieceListFb); }
+
+	// �w�肳�ꂽpiece_no�̋��ExtBonaPiece�^�ɕϊ����ĕԂ��B
+	ExtBonaPiece bona_piece(PieceNumber piece_no) const
+	{
+		ExtBonaPiece bp;
+		bp.fw = pieceListFw[piece_no];
+		bp.fb = pieceListFb[piece_no];
+		return bp;
+	}
+
+	// �Տ��sq�̏���piece_no��pc�̋��z�u����
+	void put_piece(PieceNumber piece_no, Square sq, Piece pc) {
+		set_piece_on_board(piece_no, BonaPiece(kpp_board_index[pc].fw + sq), BonaPiece(kpp_board_index[pc].fb + inverse(sq)), sq);
+	}
+
+	// �Տ�̂��鏡sq�ɑΉ�����PieceNumber��Ԃ��B
+	PieceNumber piece_no_of_board(Square sq) const { return piece_no_list_board[sq]; }
+
+	// pieceList������������B
+	// ����ɑΉ������鎞�̂��߂ɁA���g�p�̋�̒l��BONA_PIECE_ZERO�ɂ��Ă����B
+	// �ʏ�̕]���֐�������̕]���֐��Ƃ��ė��p�ł���B
+	// piece_no_list�̂ق��̓f�o�b�O������悤��PIECE_NUMBER_NB�ŏ������B
+	void clear()
+	{
+
+		for (auto& p : pieceListFw)
+			p = BONA_PIECE_ZERO;
+
+		for (auto& p : pieceListFb)
+			p = BONA_PIECE_ZERO;
+
+		for (auto& v : piece_no_list_board)
+			v = PIECE_NUMBER_NB;
+	}
+
+	// list�����ς̂Ƃ��́Aadd()/remove()���T�|�[�g����B
+	// DirtyPiece�̂ق�����Ăяo�����B
+
+	// list��add()����B
+	void add(BonaPiece fb);
+
+	// list����remove����B
+	void remove(BonaPiece fb);
+
+	// �����ŕێ����Ă���pieceListFb[]��������BonaPiece�ł��邩����������B
+	// �� : �f�o�b�O�p�B�x���B
+	bool is_valid(const Position& pos);
+
+
+protected:
+
+	// �Տ�sq�ɂ���piece_no�̋��BonaPiece��fb,fw�ł��邱�Ƃ�ݒ肷��B
+	inline void set_piece_on_board(PieceNumber piece_no, BonaPiece fw, BonaPiece fb, Square sq)
+	{
+		assert(is_ok(piece_no));
+		pieceListFw[piece_no] = fw;
+		pieceListFb[piece_no] = fb;
+		piece_no_list_board[sq] = piece_no;
+	}
+
+	// ��X�g�B��ԍ�(PieceNumber)�����̋�ǂ��ɂ���̂�(BonaPiece)�������BFV38�Ȃǂŗp����B
+
+	// ��X�g�̒���
+		// 38�Œ�
+public:
+	int length() const { return PIECE_NUMBER_KING; }
+
+	// VPGATHERDD���g���s���A4�̔{���łȂ���΂Ȃ�Ȃ��B
+	// �܂��AKPPT�^�]���֐��Ȃǂ́A39,40�Ԗڂ̗v�f���[���ł��邱�Ƃ�O��Ƃ���
+	// �A�N�Z�X�����Ă���ӏ�������̂Œ��ӂ��邱�ƁB
+	static const int MAX_LENGTH = 40;
+private:
+
+	BonaPiece pieceListFw[MAX_LENGTH];
+	BonaPiece pieceListFb[MAX_LENGTH];
+
+	// �Տ�̋�ɑ΂��āA���̋�ԍ�(PieceNumber)��ێ����Ă���z��
+	// �ʂ�SQ_NB�Ɉړ����Ă���Ƃ��p��+1�܂ŕێ����Ă������A
+	// SQ_NB�̋ʂ��ړ������Ȃ��̂ŁA���̒l���g�����Ƃ͂Ȃ��͂��B
+	PieceNumber piece_no_list_board[SQUARE_NB_PLUS1];
+};
+
+// �]���l�̍����v�Z�̊Ǘ��p
+// �O�̋ǖʂ���ړ�������ԍ����Ǘ����邽�߂̍\����
+// ������́A�ő��2�B
+struct DirtyPiece
+{
+	// ���̋�ԍ��̋�����牽�ɕς�����̂�
+	Eval::ChangedBonaPiece changed_piece[2];
+
+	// dirty�ɂȂ�����ԍ�
+	PieceNumber pieceNo[2];
+
+	// dirty�ɂȂ������B
+	// null move����0�Ƃ������Ƃ����肤��B
+	// ������Ǝ�����Ƃōő��2�B
+	int dirty_num;
+
+};
 }
 
 #endif // #ifndef EVALUATE_H_INCLUDED
diff --git a/src/misc.cpp b/src/misc.cpp
index 8d3b202d..69c6bacc 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -315,3 +315,19 @@ void bindThisThread(size_t idx) {
 #endif
 
 } // namespace WinProcGroup
+
+void sleep(int ms)
+{
+	std::this_thread::sleep_for(std::chrono::milliseconds(ms));
+}
+
+void* aligned_malloc(size_t size, size_t align)
+{
+	void* p = _mm_malloc(size, align);
+	if (p == nullptr)
+	{
+		std::cout << "info string can't allocate memory. sise = " << size << std::endl;
+		exit(1);
+	}
+	return p;
+}
diff --git a/src/misc.h b/src/misc.h
index ddd05e4e..5b63ef1c 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -21,6 +21,7 @@
 #ifndef MISC_H_INCLUDED
 #define MISC_H_INCLUDED
 
+#include <algorithm>
 #include <cassert>
 #include <chrono>
 #include <ostream>
@@ -110,4 +111,69 @@ namespace WinProcGroup {
   void bindThisThread(size_t idx);
 }
 
+// 指定されたミリ秒だけsleepする。
+extern void sleep(int ms);
+
+// 途中での終了処理のためのwrapper
+static void my_exit()
+{
+	sleep(3000); // エラーメッセージが出力される前に終了するのはまずいのでwaitを入れておく。
+	exit(EXIT_FAILURE);
+}
+
+// --------------------
+//       Math
+// --------------------
+
+// 進行度の計算や学習で用いる数学的な関数
+namespace Math {
+	// シグモイド関数
+	//  = 1.0 / (1.0 + std::exp(-x))
+	double sigmoid(double x);
+
+	// シグモイド関数の微分
+	//  = sigmoid(x) * (1.0 - sigmoid(x))
+	double dsigmoid(double x);
+
+	// vを[lo,hi]の間に収まるようにクリップする。
+	// ※　Stockfishではこの関数、bitboard.hに書いてある。
+	template<class T> constexpr const T& clamp(const T& v, const T& lo, const T& hi) {
+		return v < lo ? lo : v > hi ? hi : v;
+	}
+
+}
+
+// --------------------
+//       Path
+// --------------------
+
+// C#にあるPathクラス的なもの。ファイル名の操作。
+// C#のメソッド名に合わせておく。
+struct Path
+{
+	// path名とファイル名を結合して、それを返す。
+	// folder名のほうは空文字列でないときに、末尾に'/'か'\\'がなければそれを付与する。
+	static std::string Combine(const std::string& folder, const std::string& filename)
+	{
+		if (folder.length() >= 1 && *folder.rbegin() != '/' && *folder.rbegin() != '\\')
+			return folder + "/" + filename;
+
+		return folder + filename;
+	}
+
+	// full path表現から、(フォルダ名を除いた)ファイル名の部分を取得する。
+	static std::string GetFileName(const std::string& path)
+	{
+		// "\"か"/"か、どちらを使ってあるかはわからない。
+		auto path_index1 = path.find_last_of("\\") + 1;
+		auto path_index2 = path.find_last_of("/") + 1;
+		auto path_index = std::max(path_index1, path_index2);
+
+		return path.substr(path_index);
+	}
+};
+
+extern void* aligned_malloc(size_t size, size_t align);
+static void aligned_free(void* ptr) { _mm_free(ptr); }
+
 #endif // #ifndef MISC_H_INCLUDED
diff --git a/src/position.h b/src/position.h
index 343751ed..d26b1a63 100644
--- a/src/position.h
+++ b/src/position.h
@@ -29,6 +29,8 @@
 #include "bitboard.h"
 #include "types.h"
 
+#include "eval/nnue/nnue_accumulator.h"
+
 
 /// StateInfo struct stores information needed to restore a Position object to
 /// its previous state when we retract a move. Whenever a move is made on the
@@ -54,6 +56,11 @@ struct StateInfo {
   Bitboard   blockersForKing[COLOR_NB];
   Bitboard   pinners[COLOR_NB];
   Bitboard   checkSquares[PIECE_TYPE_NB];
+
+  Eval::NNUE::Accumulator accumulator;
+
+  // �]���l�̍����v�Z�̊Ǘ��p
+  Eval::DirtyPiece dirtyPiece;
 };
 
 /// A list to keep track of the position states along the setup moves (from the
@@ -165,6 +172,15 @@ public:
   bool pos_is_ok() const;
   void flip();
 
+  // --- StateInfo
+
+  // ���݂̋ǖʂɑΉ�����StateInfo��Ԃ��B
+  // ���Ƃ��΁Astate()->capturedPiece�ł���΁A�O�ǖʂŕߊl���ꂽ��i�[����Ă���B
+  StateInfo* state() const { return st; }
+
+  // �]���֐��Ŏg�����߂́A�ǂ̋�ԍ��̋�ǂ��ɂ��邩�Ȃǂ̏��B
+  const Eval::EvalList* eval_list() const { return &evalList; }
+
 private:
   // Initialization helpers (used while setting up a position)
   void set_castling_right(Color c, Square rfrom);
@@ -194,6 +210,9 @@ private:
   Thread* thisThread;
   StateInfo* st;
   bool chess960;
+
+  // �]���֐��ŗp�����̃��X�g
+  Eval::EvalList evalList;
 };
 
 namespace PSQT {
diff --git a/src/types.h b/src/types.h
index b0758f43..c4458fe4 100644
--- a/src/types.h
+++ b/src/types.h
@@ -131,6 +131,8 @@ enum Color {
   WHITE, BLACK, COLOR_NB = 2
 };
 
+constexpr Color Colors[2] = { WHITE, BLACK };
+
 enum CastlingSide {
   KING_SIDE, QUEEN_SIDE, CASTLING_SIDE_NB = 2
 };
@@ -186,7 +188,10 @@ enum Value : int {
   RookValueMg   = 1289,  RookValueEg   = 1378,
   QueenValueMg  = 2529,  QueenValueEg  = 2687,
 
-  MidgameLimit  = 15258, EndgameLimit  = 3915
+  MidgameLimit  = 15258, EndgameLimit  = 3915,
+
+  // �]���֐��̕Ԃ��l�̍ő�l(2**14���炢�Ɏ��܂��Ă��ė~�����Ƃ��낾��..)
+  VALUE_MAX_EVAL = 27000,
 };
 
 enum PieceType {
@@ -230,7 +235,8 @@ enum Square : int {
   SQ_A8, SQ_B8, SQ_C8, SQ_D8, SQ_E8, SQ_F8, SQ_G8, SQ_H8,
   SQ_NONE,
 
-  SQUARE_NB = 64
+  SQUARE_NB = 64,
+  SQUARE_NB_PLUS1 = SQUARE_NB + 1, // �ʂ����Ȃ��ꍇ�ASQ_NB�Ɉړ��������̂Ƃ��Ĉ������߁A�z���SQ_NB+1�Ŋm�ۂ��Ȃ��Ƃ����Ȃ��Ƃ�������̂ł��̒萔��p����B
 };
 
 enum Direction : int {
@@ -356,6 +362,10 @@ constexpr Square operator~(Square s) {
   return Square(s ^ SQ_A8); // Vertical flip SQ_A1 -> SQ_A8
 }
 
+constexpr Square inverse(Square s) {
+	return static_cast<Square>(static_cast<int>(SQUARE_NB) - s - 1);
+}
+
 constexpr File operator~(File f) {
   return File(f ^ FILE_H); // Horizontal flip FILE_A -> FILE_H
 }
@@ -454,4 +464,29 @@ constexpr bool is_ok(Move m) {
   return from_sq(m) != to_sq(m); // Catch MOVE_NULL and MOVE_NONE
 }
 
+// --------------------
+//        �
+// --------------------
+
+// Position�N���X�ŗp����A��X�g(�ǂ̋�ǂ��ɂ���̂�)���Ǘ�����Ƃ��̔ԍ��B
+enum PieceNumber : int8_t
+{
+	PIECE_NUMBER_PAWN = 0,
+	PIECE_NUMBER_KNIGHT = 16,
+	PIECE_NUMBER_BISHOP = 20,
+	PIECE_NUMBER_ROOK = 24,
+	PIECE_NUMBER_QUEEN = 28,
+	PIECE_NUMBER_KING = 30,
+	PIECE_NUMBER_WKING = 30,
+	PIECE_NUMBER_BKING = 31, // ���A���̋ʂ̔ԍ����K�v�ȏꍇ�͂�������p����
+	PIECE_NUMBER_ZERO = 0,
+	PIECE_NUMBER_NB = 32,
+};
+
+inline PieceNumber& operator++(PieceNumber& d) { return d = PieceNumber(int(d) + 1); }           \
+inline PieceNumber& operator--(PieceNumber& d) { return d = PieceNumber(int(d) - 1); }
+
+// PieceNumber�̐������̌����Bassert�p�B
+constexpr bool is_ok(PieceNumber pn) { return pn < PIECE_NUMBER_NB; }
+
 #endif // #ifndef TYPES_H_INCLUDED

From 48bfe86d274cb7ef42a3139da245db294ad3395d Mon Sep 17 00:00:00 2001
From: Hisayori Noda <nodchip@gmail.com>
Date: Sun, 16 Jun 2019 10:33:53 +0900
Subject: [PATCH 004/583] Implemented the logic to update Eval List and Dirty
 Pieces.

---
 src/eval/nnue/evaluate_nnue.cpp |   6 +-
 src/evaluate.cpp                |  73 +++++++++++++++++-
 src/evaluate.h                  |  35 ++++-----
 src/position.cpp                | 131 +++++++++++++++++++++++++++++++-
 src/position.h                  |   5 ++
 src/types.h                     |  11 ++-
 src/uci.cpp                     |  69 ++++++++++++++++-
 src/uci.h                       |   2 +
 src/ucioption.cpp               |  11 +++
 9 files changed, 313 insertions(+), 30 deletions(-)

diff --git a/src/eval/nnue/evaluate_nnue.cpp b/src/eval/nnue/evaluate_nnue.cpp
index de86ebe9..6009d888 100644
--- a/src/eval/nnue/evaluate_nnue.cpp
+++ b/src/eval/nnue/evaluate_nnue.cpp
@@ -233,12 +233,14 @@ void prefetch_evalhash(const Key key) {
 void load_eval() {
   NNUE::Initialize();
 
-#if defined(EVAL_LEARN)
   if (!Options["SkipLoadingEval"])
-#endif
   {
     const std::string dir_name = Options["EvalDir"];
     const std::string file_name = Path::Combine(dir_name, NNUE::kFileName);
+    //{
+    //  std::ofstream stream(file_name, std::ios::binary);
+    //  NNUE::WriteParameters(stream);
+    //}
     std::ifstream stream(file_name, std::ios::binary);
     const bool result = NNUE::ReadParameters(stream);
 
diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index d211db64..9e85e2ae 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -29,6 +29,7 @@
 #include "material.h"
 #include "pawns.h"
 #include "thread.h"
+#include "eval/nnue/evaluate_nnue.h"
 
 namespace Trace {
 
@@ -864,7 +865,8 @@ namespace {
 /// evaluation of the position from the point of view of the side to move.
 
 Value Eval::evaluate(const Position& pos) {
-  return Evaluation<NO_TRACE>(pos).value();
+  //return Evaluation<NO_TRACE>(pos).value();
+  return Eval::NNUE::evaluate(pos);
 }
 
 
@@ -907,3 +909,72 @@ std::string Eval::trace(const Position& pos) {
 
   return ss.str();
 }
+
+namespace Eval {
+ExtBonaPiece kpp_board_index[PIECE_NB] = {
+    { BONA_PIECE_ZERO, BONA_PIECE_ZERO },
+    { f_pawn, e_pawn },
+    { f_knight, e_knight },
+    { f_bishop, e_bishop },
+    { f_rook, e_rook },
+    { f_queen, e_queen },
+    { f_king, e_king },
+    { BONA_PIECE_ZERO, BONA_PIECE_ZERO },
+
+    // ��肩�猩���ꍇ�Bf��e������ւ��B
+    { BONA_PIECE_ZERO, BONA_PIECE_ZERO },
+    { e_pawn, f_pawn },
+    { e_knight, f_knight },
+    { e_bishop, f_bishop },
+    { e_rook, f_rook },
+    { e_queen, f_queen },
+    { e_king, f_king },
+    { BONA_PIECE_ZERO, BONA_PIECE_ZERO }, // ���̐���͂Ȃ�
+};
+
+// �����ŕێ����Ă���pieceListFw[]��������BonaPiece�ł��邩����������B
+// �� : �f�o�b�O�p�B�x���B
+bool EvalList::is_valid(const Position& pos)
+{
+  for (int i = 0; i < length(); ++i)
+  {
+    BonaPiece fw = pieceListFw[i];
+    // ����fw���{���ɑ��݂��邩��Position�N���X�̂ق��ɒ��ׂɍs���B
+
+    if (fw == Eval::BONA_PIECE_ZERO) {
+      continue;
+    }
+
+    // �͈͊O
+    if (!(0 <= fw && fw < fe_end))
+      return false;
+
+    // �Տ�̋�Ȃ̂ł��̋�{���ɑ��݂��邩���ׂɂ����B
+    for (Piece pc = NO_PIECE; pc < PIECE_NB; ++pc)
+    {
+      auto pt = type_of(pc);
+      if (pt == NO_PIECE || pt == 7) // ���݂��Ȃ���
+        continue;
+
+      // ��pc��BonaPiece�̊J�n�ԍ�
+      auto s = BonaPiece(kpp_board_index[pc].fw);
+      if (s <= fw && fw < s + SQUARE_NB)
+      {
+        // ���������̂ł��̋sq�̒n�_�ɂ��邩�𒲂ׂ�B
+        Square sq = (Square)(fw - s);
+        Piece pc2 = pos.piece_on(sq);
+
+        if (pc2 != pc)
+          return false;
+
+        goto Found;
+      }
+    }
+    // ���̂����݂��Ȃ���ł�����..
+    return false;
+  Found:;
+  }
+
+  return true;
+}
+}
diff --git a/src/evaluate.h b/src/evaluate.h
index c96a1288..e9e13e7d 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -35,6 +35,15 @@ std::string trace(const Position& pos);
 
 Value evaluate(const Position& pos);
 
+// �]���֐��t�@�C����ǂݍ��ށB
+// ����́A"is_ready"�R�}���h�̉�������1�x�����Ăяo�����B2�x�Ăяo�����Ƃ͑z�肵�Ă��Ȃ��B
+// (�������AEvalDir(�]���֐��t�H���_)���ύX�ɂȂ������ƁAisready���ēx�����Ă�����ǂ݂Ȃ����B)
+void load_eval();
+
+static uint64_t calc_check_sum() { return 0; }
+
+static void print_softname(uint64_t check_sum) {}
+
 // --- �]���֐��Ŏg���萔 KPP(�ʂƔC��2��)��P�ɑ�������enum
 
 // (�]���֐��̎����̂Ƃ��ɂ́ABonaPiece�͎��R�ɒ�`�������̂ł����ł͒�`���Ȃ��B)
@@ -145,22 +154,10 @@ struct EvalList
 			v = PIECE_NUMBER_NB;
 	}
 
-	// list�����ς̂Ƃ��́Aadd()/remove()���T�|�[�g����B
-	// DirtyPiece�̂ق�����Ăяo�����B
-
-	// list��add()����B
-	void add(BonaPiece fb);
-
-	// list����remove����B
-	void remove(BonaPiece fb);
-
-	// �����ŕێ����Ă���pieceListFb[]��������BonaPiece�ł��邩����������B
+	// �����ŕێ����Ă���pieceListFw[]��������BonaPiece�ł��邩����������B
 	// �� : �f�o�b�O�p�B�x���B
 	bool is_valid(const Position& pos);
 
-
-protected:
-
 	// �Տ�sq�ɂ���piece_no�̋��BonaPiece��fb,fw�ł��邱�Ƃ�ݒ肷��B
 	inline void set_piece_on_board(PieceNumber piece_no, BonaPiece fw, BonaPiece fb, Square sq)
 	{
@@ -173,7 +170,7 @@ protected:
 	// ��X�g�B��ԍ�(PieceNumber)�����̋�ǂ��ɂ���̂�(BonaPiece)�������BFV38�Ȃǂŗp����B
 
 	// ��X�g�̒���
-		// 38�Œ�
+  // 38�Œ�
 public:
 	int length() const { return PIECE_NUMBER_KING; }
 
@@ -181,15 +178,15 @@ public:
 	// �܂��AKPPT�^�]���֐��Ȃǂ́A39,40�Ԗڂ̗v�f���[���ł��邱�Ƃ�O��Ƃ���
 	// �A�N�Z�X�����Ă���ӏ�������̂Œ��ӂ��邱�ƁB
 	static const int MAX_LENGTH = 40;
+
+  // �Տ�̋�ɑ΂��āA���̋�ԍ�(PieceNumber)��ێ����Ă���z��
+  // �ʂ�SQ_NB�Ɉړ����Ă���Ƃ��p��+1�܂ŕێ����Ă������A
+  // SQ_NB�̋ʂ��ړ������Ȃ��̂ŁA���̒l���g�����Ƃ͂Ȃ��͂��B
+  PieceNumber piece_no_list_board[SQUARE_NB_PLUS1];
 private:
 
 	BonaPiece pieceListFw[MAX_LENGTH];
 	BonaPiece pieceListFb[MAX_LENGTH];
-
-	// �Տ�̋�ɑ΂��āA���̋�ԍ�(PieceNumber)��ێ����Ă���z��
-	// �ʂ�SQ_NB�Ɉړ����Ă���Ƃ��p��+1�܂ŕێ����Ă������A
-	// SQ_NB�̋ʂ��ړ������Ȃ��̂ŁA���̒l���g�����Ƃ͂Ȃ��͂��B
-	PieceNumber piece_no_list_board[SQUARE_NB_PLUS1];
 };
 
 // �]���l�̍����v�Z�̊Ǘ��p
diff --git a/src/position.cpp b/src/position.cpp
index edb40499..43f986f9 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -243,6 +243,20 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th
   std::fill_n(&pieceList[0][0], sizeof(pieceList) / sizeof(Square), SQ_NONE);
   st = si;
 
+  // evalList��clear�B���memset�Ń[���N���A�����Ƃ��ɃN���A����Ă��邪�c�B
+  evalList.clear();
+
+  // PieceList���X�V�����ŁA�ǂ̋�ǂ��ɂ��邩��ݒ肵�Ȃ���΂Ȃ�Ȃ����A
+  // ���ꂼ��̋���ǂ��܂Ŏg�������̃J�E���^�[
+  PieceNumber piece_no_count[KING] = {
+    PIECE_NUMBER_ZERO,
+    PIECE_NUMBER_PAWN,
+    PIECE_NUMBER_KNIGHT,
+    PIECE_NUMBER_BISHOP,
+    PIECE_NUMBER_ROOK,
+    PIECE_NUMBER_QUEEN
+  };
+
   ss >> std::noskipws;
 
   // 1. Piece placement
@@ -256,7 +270,15 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th
 
       else if ((idx = PieceToChar.find(token)) != string::npos)
       {
-          put_piece(Piece(idx), sq);
+          auto pc = Piece(idx);
+          put_piece(pc, sq);
+
+          PieceNumber piece_no =
+            (idx == W_KING) ? PIECE_NUMBER_WKING : // ����
+            (idx == B_KING) ? PIECE_NUMBER_BKING : // ����
+            piece_no_count[type_of(Piece(idx))]++; // ����ȊO
+          evalList.put_piece(piece_no, sq, pc); // sq�̏���pc�̋��z�u����
+
           ++sq;
       }
   }
@@ -319,6 +341,7 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th
   set_state(st);
 
   assert(pos_is_ok());
+  assert(evalList.is_valid(*this));
 
   return *this;
 }
@@ -739,6 +762,9 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
   ++st->rule50;
   ++st->pliesFromNull;
 
+  st->accumulator.computed_accumulation = false;
+  st->accumulator.computed_score = false;
+
   Color us = sideToMove;
   Color them = ~us;
   Square from = from_sq(m);
@@ -750,6 +776,9 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
   assert(captured == NO_PIECE || color_of(captured) == (type_of(m) != CASTLING ? them : us));
   assert(type_of(captured) != KING);
 
+  auto& dp = st->dirtyPiece;
+  dp.dirty_num = 1;
+
   if (type_of(m) == CASTLING)
   {
       assert(pc == make_piece(us, KING));
@@ -766,6 +795,8 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
   {
       Square capsq = to;
 
+      PieceNumber piece_no1;
+
       // If the captured piece is a pawn, update pawn hash key, otherwise
       // update non-pawn material.
       if (type_of(captured) == PAWN)
@@ -780,14 +811,22 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
               assert(piece_on(to) == NO_PIECE);
               assert(piece_on(capsq) == make_piece(them, PAWN));
 
+              piece_no1 = piece_no_of(capsq);
+
               board[capsq] = NO_PIECE; // Not done by remove_piece()
           }
+          else {
+            piece_no1 = piece_no_of(capsq);
+          }
 
           st->pawnKey ^= Zobrist::psq[captured][capsq];
       }
-      else
+      else {
           st->nonPawnMaterial[them] -= PieceValue[MG][captured];
 
+          piece_no1 = piece_no_of(capsq);
+      }
+
       // Update board and piece lists
       remove_piece(captured, capsq);
 
@@ -798,6 +837,19 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
 
       // Reset rule 50 counter
       st->rule50 = 0;
+
+      dp.dirty_num = 2; // ���������2��
+
+      dp.pieceNo[1] = piece_no1;
+      dp.changed_piece[1].old_piece = evalList.bona_piece(piece_no1);
+      // Do not use Eval::EvalList::put_piece() because the piece is removed
+      // from the game, and the corresponding elements of the piece lists
+      // needs to be Eval::BONA_PIECE_ZERO.
+      evalList.set_piece_on_board(piece_no1, Eval::BONA_PIECE_ZERO, Eval::BONA_PIECE_ZERO, capsq);
+      // Set PIECE_NUMBER_NB to piece_no_of_board[capsq] directly because it
+      // will not be overritten to pc if the move type is enpassant.
+      evalList.piece_no_list_board[capsq] = PIECE_NUMBER_NB;
+      dp.changed_piece[1].new_piece = evalList.bona_piece(piece_no1);
   }
 
   // Update hash key
@@ -819,8 +871,16 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
   }
 
   // Move the piece. The tricky Chess960 castling is handled earlier
-  if (type_of(m) != CASTLING)
-      move_piece(pc, from, to);
+  if (type_of(m) != CASTLING) {
+    PieceNumber piece_no0 = piece_no_of(from);
+
+    move_piece(pc, from, to);
+
+    dp.pieceNo[0] = piece_no0;
+    dp.changed_piece[0].old_piece = evalList.bona_piece(piece_no0);
+    evalList.put_piece(piece_no0, to, pc);
+    dp.changed_piece[0].new_piece = evalList.bona_piece(piece_no0);
+  }
 
   // If the moving piece is a pawn do some special extra work
   if (type_of(pc) == PAWN)
@@ -843,6 +903,12 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
           remove_piece(pc, to);
           put_piece(promotion, to);
 
+          PieceNumber piece_no0 = piece_no_of(to);
+          dp.pieceNo[0] = piece_no0;
+          dp.changed_piece[0].old_piece = evalList.bona_piece(piece_no0);
+          evalList.put_piece(piece_no0, to, promotion);
+          dp.changed_piece[0].new_piece = evalList.bona_piece(piece_no0);
+
           // Update hash keys
           k ^= Zobrist::psq[pc][to] ^ Zobrist::psq[promotion][to];
           st->pawnKey ^= Zobrist::psq[pc][to];
@@ -894,6 +960,7 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
   }
 
   assert(pos_is_ok());
+  assert(evalList.is_valid(*this));
 }
 
 
@@ -923,6 +990,9 @@ void Position::undo_move(Move m) {
       remove_piece(pc, to);
       pc = make_piece(us, PAWN);
       put_piece(pc, to);
+
+      PieceNumber piece_no0 = st->dirtyPiece.pieceNo[0];
+      evalList.put_piece(piece_no0, to, pc);
   }
 
   if (type_of(m) == CASTLING)
@@ -932,8 +1002,12 @@ void Position::undo_move(Move m) {
   }
   else
   {
+      
       move_piece(pc, to, from); // Put the piece back at the source square
 
+      PieceNumber piece_no0 = st->dirtyPiece.pieceNo[0];
+      evalList.put_piece(piece_no0, from, pc);
+
       if (st->capturedPiece)
       {
           Square capsq = to;
@@ -950,6 +1024,11 @@ void Position::undo_move(Move m) {
           }
 
           put_piece(st->capturedPiece, capsq); // Restore the captured piece
+
+          PieceNumber piece_no1 = st->dirtyPiece.pieceNo[1];
+          assert(evalList.bona_piece(piece_no1).fw == Eval::BONA_PIECE_ZERO);
+          assert(evalList.bona_piece(piece_no1).fb == Eval::BONA_PIECE_ZERO);
+          evalList.put_piece(piece_no1, capsq, st->capturedPiece);
       }
   }
 
@@ -958,6 +1037,7 @@ void Position::undo_move(Move m) {
   --gamePly;
 
   assert(pos_is_ok());
+  assert(evalList.is_valid(*this));
 }
 
 
@@ -965,18 +1045,50 @@ void Position::undo_move(Move m) {
 /// is a bit tricky in Chess960 where from/to squares can overlap.
 template<bool Do>
 void Position::do_castling(Color us, Square from, Square& to, Square& rfrom, Square& rto) {
+  auto& dp = st->dirtyPiece;
+  // �����v�Z�̂��߂Ɉړ��������StateInfo�ɋL�^���Ă����B
+  dp.dirty_num = 2; // ���������2��
+
+  PieceNumber piece_no0;
+  PieceNumber piece_no1;
+
+  if (Do) {
+    piece_no0 = piece_no_of(from);
+    piece_no1 = piece_no_of(to);
+  }
 
   bool kingSide = to > from;
   rfrom = to; // Castling is encoded as "king captures friendly rook"
   rto = relative_square(us, kingSide ? SQ_F1 : SQ_D1);
   to = relative_square(us, kingSide ? SQ_G1 : SQ_C1);
 
+  if (!Do) {
+    piece_no0 = piece_no_of(to);
+    piece_no1 = piece_no_of(rto);
+  }
+
   // Remove both pieces first since squares could overlap in Chess960
   remove_piece(make_piece(us, KING), Do ? from : to);
   remove_piece(make_piece(us, ROOK), Do ? rfrom : rto);
   board[Do ? from : to] = board[Do ? rfrom : rto] = NO_PIECE; // Since remove_piece doesn't do it for us
   put_piece(make_piece(us, KING), Do ? to : from);
   put_piece(make_piece(us, ROOK), Do ? rto : rfrom);
+
+  if (Do) {
+    dp.pieceNo[0] = piece_no0;
+    dp.changed_piece[0].old_piece = evalList.bona_piece(piece_no0);
+    evalList.put_piece(piece_no0, to, make_piece(us, KING));
+    dp.changed_piece[0].new_piece = evalList.bona_piece(piece_no0);
+
+    dp.pieceNo[1] = piece_no1;
+    dp.changed_piece[1].old_piece = evalList.bona_piece(piece_no1);
+    evalList.put_piece(piece_no1, rto, make_piece(us, ROOK));
+    dp.changed_piece[1].new_piece = evalList.bona_piece(piece_no1);
+  }
+  else {
+    evalList.put_piece(piece_no0, from, make_piece(us, KING));
+    evalList.put_piece(piece_no1, rfrom, make_piece(us, ROOK));
+  }
 }
 
 
@@ -1313,3 +1425,14 @@ bool Position::pos_is_ok() const {
 
   return true;
 }
+
+PieceNumber Position::piece_no_of(Square sq) const
+{
+  if (piece_on(sq) == NO_PIECE) {
+    sync_cout << *this << sync_endl;
+  }
+  assert(piece_on(sq) != NO_PIECE);
+  PieceNumber n = evalList.piece_no_of_board(sq);
+  assert(is_ok(n));
+  return n;
+}
diff --git a/src/position.h b/src/position.h
index d26b1a63..8111c663 100644
--- a/src/position.h
+++ b/src/position.h
@@ -23,10 +23,12 @@
 
 #include <cassert>
 #include <deque>
+#include <iostream>
 #include <memory> // For std::unique_ptr
 #include <string>
 
 #include "bitboard.h"
+#include "misc.h"
 #include "types.h"
 
 #include "eval/nnue/nnue_accumulator.h"
@@ -194,6 +196,9 @@ private:
   template<bool Do>
   void do_castling(Color us, Square from, Square& to, Square& rfrom, Square& rto);
 
+  // �Տ��sq�̏��ɂ�����PieceNumber��Ԃ��B
+  PieceNumber piece_no_of(Square sq) const;
+
   // Data members
   Piece board[SQUARE_NB];
   Bitboard byTypeBB[PIECE_TYPE_NB];
diff --git a/src/types.h b/src/types.h
index c4458fe4..ef6cbb40 100644
--- a/src/types.h
+++ b/src/types.h
@@ -469,7 +469,7 @@ constexpr bool is_ok(Move m) {
 // --------------------
 
 // Position�N���X�ŗp����A��X�g(�ǂ̋�ǂ��ɂ���̂�)���Ǘ�����Ƃ��̔ԍ��B
-enum PieceNumber : int8_t
+enum PieceNumber : uint8_t
 {
 	PIECE_NUMBER_PAWN = 0,
 	PIECE_NUMBER_KNIGHT = 16,
@@ -483,8 +483,13 @@ enum PieceNumber : int8_t
 	PIECE_NUMBER_NB = 32,
 };
 
-inline PieceNumber& operator++(PieceNumber& d) { return d = PieceNumber(int(d) + 1); }           \
-inline PieceNumber& operator--(PieceNumber& d) { return d = PieceNumber(int(d) - 1); }
+inline PieceNumber& operator++(PieceNumber& d) { return d = PieceNumber(int8_t(d) + 1); }
+inline PieceNumber operator++(PieceNumber& d, int) {
+  PieceNumber x = d;
+  d = PieceNumber(int8_t(d) + 1);
+  return x;
+}
+inline PieceNumber& operator--(PieceNumber& d) { return d = PieceNumber(int8_t(d) - 1); }
 
 // PieceNumber�̐������̌����Bassert�p�B
 constexpr bool is_ok(PieceNumber pn) { return pn < PIECE_NUMBER_NB; }
diff --git a/src/uci.cpp b/src/uci.cpp
index 739cf343..bee5acd7 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -177,6 +177,73 @@ namespace {
          << "\nNodes/second    : " << 1000 * nodes / elapsed << endl;
   }
 
+  // check sum���v�Z�����Ƃ��A�����ۑ����Ă����Ă��ƂŎ���ȍ~�A�������̃`�F�b�N���s�Ȃ��B
+  uint64_t eval_sum;
+
+  // is_ready_cmd()���O������Ăяo����悤�ɂ��Ă����B(bench�R�}���h�Ȃǂ���Ăяo����������)
+  // �ǖʂ͏���������Ȃ��̂Œ��ӁB
+  void is_ready(Position& pos, istringstream& is, StateListPtr& states)
+  {
+    // "isready"���󂯎�������ƁA"readyok"��Ԃ��܂�5�b���Ƃɉ��s�𑗂�悤�ɏC������B(keep alive�I�ȏ���)
+    //	USI2.0�̎d�l���B
+    //  -"isready"�̂��Ƃ�time out���Ԃ́A30�b���x�Ƃ���B����𒴂��āA�]���֐��̏������Ahash�e�[�u���̊m�ۂ��������ꍇ�A
+    //  �v�l�G���W�����������I�ɉ��炩�̃��b�Z�[�W(���s��)�𑗂�ׂ��ł���B
+    //  -ShogiGUI�ł͂��łɂ����Ȃ��Ă���̂ŁAMyShogi������ɒǐ�����B
+    //  -�܂��A��˂��牤�̃G���W�����́A"isready"���󂯎�������ƁA"readyok"��Ԃ��܂�5�b���Ƃɉ��s�𑗂�悤�ɏC������B
+
+    auto ended = false;
+    auto th = std::thread([&ended] {
+      int count = 0;
+      while (!ended)
+      {
+        std::this_thread::sleep_for(std::chrono::milliseconds(100));
+        if (++count >= 50 /* 5�b */)
+        {
+          count = 0;
+          sync_cout << sync_endl; // ���s�𑗐M����B
+        }
+      }
+      });
+
+    // �]���֐��̓ǂݍ��݂Ȃǎ��Ԃ̂�����ł��낤�����͂��̃^�C�~���O�ōs�Ȃ��B
+    // �N�����Ɏ��Ԃ̂����鏈�������Ă��܂��Ə��������^�C���A�E�g��������āA�v�l�G���W���Ƃ��Ă̔F�������^�C�A���Ă��܂��B
+    if (!UCI::load_eval_finished)
+    {
+      // �]���֐��̓ǂݍ���
+      Eval::load_eval();
+
+      // �`�F�b�N�T���̌v�Z�ƕۑ�(���̌�̃������j���̃`�F�b�N�̂���)
+      eval_sum = Eval::calc_check_sum();
+
+      // �\�t�g���̕\��
+      Eval::print_softname(eval_sum);
+
+      UCI::load_eval_finished = true;
+
+    }
+    else
+    {
+      // ���������j�󂳂�Ă��Ȃ����𒲂ׂ邽�߂Ƀ`�F�b�N�T���𖈉񒲂ׂ�B
+      // ���Ԃ��������������Ȃ��C�����邪.. 0.1�b���炢�̂��ƂȂ̂ŗǂ��Ƃ���B
+      if (eval_sum != Eval::calc_check_sum())
+        sync_cout << "Error! : EVAL memory is corrupted" << sync_endl;
+    }
+
+    // isready�ɑ΂��Ă�readyok��Ԃ��܂Ŏ��̃R�}���h�����Ȃ����Ƃ͖񑩂���Ă���̂�
+    // ���̃^�C�~���O�Ŋe��ϐ��̏����������Ă����B
+
+    TT.resize(Options["Hash"]);
+    Search::clear();
+    Time.availableNodes = 0;
+
+    Threads.stop = false;
+
+    // keep alive�𑗐M���邽�߂ɐ��������X���b�h���I�������A�ҋ@����B
+    ended = true;
+    th.join();
+
+    sync_cout << "readyok" << sync_endl;
+  }
 } // namespace
 
 
@@ -227,7 +294,7 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "go")         go(pos, is, states);
       else if (token == "position")   position(pos, is, states);
       else if (token == "ucinewgame") Search::clear();
-      else if (token == "isready")    sync_cout << "readyok" << sync_endl;
+      else if (token == "isready")    is_ready(pos, is, states);
 
       // Additional custom non-UCI commands, mainly for debugging
       else if (token == "flip")  pos.flip();
diff --git a/src/uci.h b/src/uci.h
index 31b63e2f..4a7771ca 100644
--- a/src/uci.h
+++ b/src/uci.h
@@ -75,6 +75,8 @@ std::string move(Move m, bool chess960);
 std::string pv(const Position& pos, Depth depth, Value alpha, Value beta);
 Move to_move(const Position& pos, std::string& str);
 
+// �]���֐���ǂݍ��񂾂��̃t���O�B�����evaldir�̕ύX�ɂƂ��Ȃ���false�ɂ���B
+extern bool load_eval_finished; // = false;
 } // namespace UCI
 
 extern UCI::OptionsMap Options;
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 813a0890..e549c6e0 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -42,6 +42,7 @@ void on_hash_size(const Option& o) { TT.resize(o); }
 void on_logger(const Option& o) { start_logger(o); }
 void on_threads(const Option& o) { Threads.set(o); }
 void on_tb_path(const Option& o) { Tablebases::init(o); }
+void on_eval_dir(const Option& o) { load_eval_finished = false; }
 
 
 /// Our case insensitive less() function as required by UCI protocol
@@ -78,6 +79,14 @@ void init(OptionsMap& o) {
   o["SyzygyProbeDepth"]      << Option(1, 1, 100);
   o["Syzygy50MoveRule"]      << Option(true);
   o["SyzygyProbeLimit"]      << Option(7, 0, 7);
+  // �]���֐��t�H���_�B�����ύX�����Ƃ��A�]���֐�������isready�^�C�~���O�œǂݒ����K�v������B
+  o["EvalDir"]               << Option("eval", on_eval_dir);
+  // isready�^�C�~���O�ŕ]���֐���ǂݍ��܂��ƁA�V�����]���֐��̕ϊ��̂��߂�
+  // test evalconvert�R�}���h��@�������̂ɁA���̐V�����]���֐����Ȃ������߂�
+  // ���̃R�}���h�̎��s�O�Ɉُ�I�����Ă��܂��B
+  // �����ł��̉B���I�v�V������isready���̕]���֐��̓ǂݍ��݂�}�����āA
+  // test evalconvert�R�}���h��@���B
+  o["SkipLoadingEval"]       << Option(false);
 }
 
 
@@ -186,4 +195,6 @@ Option& Option::operator=(const string& v) {
   return *this;
 }
 
+// �]���֐���ǂݍ��񂾂��̃t���O�B�����evaldir�̕ύX�ɂƂ��Ȃ���false�ɂ���B
+bool load_eval_finished = false;
 } // namespace UCI

From 87445881ec2862f95b5c49ce637d3cfbe8ba16c1 Mon Sep 17 00:00:00 2001
From: Hisayori Noda <nodchip@gmail.com>
Date: Sun, 16 Jun 2019 11:11:16 +0900
Subject: [PATCH 005/583] Added #ifdef statements to switch the legacy
 evaluation function and NNUE evaluation function.

---
 src/eval/nnue/evaluate_nnue.cpp |  4 +++
 src/evaluate.cpp                |  7 +++++-
 src/evaluate.h                  |  2 ++
 src/position.cpp                | 44 +++++++++++++++++++++++++++++++++
 src/position.h                  |  8 ++++++
 src/types.h                     |  2 ++
 src/uci.cpp                     |  2 ++
 7 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/src/eval/nnue/evaluate_nnue.cpp b/src/eval/nnue/evaluate_nnue.cpp
index 6009d888..15d9194b 100644
--- a/src/eval/nnue/evaluate_nnue.cpp
+++ b/src/eval/nnue/evaluate_nnue.cpp
@@ -1,5 +1,7 @@
 ﻿// NNUE評価関数の計算に関するコード
 
+#if defined(EVAL_NNUE)
+
 #include <fstream>
 #include <iostream>
 
@@ -316,3 +318,5 @@ void print_eval_stat(Position& /*pos*/) {
 }
 
 }  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 9e85e2ae..29ea65dc 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -865,8 +865,11 @@ namespace {
 /// evaluation of the position from the point of view of the side to move.
 
 Value Eval::evaluate(const Position& pos) {
-  //return Evaluation<NO_TRACE>(pos).value();
+#if defined(EVAL_NNUE)
   return Eval::NNUE::evaluate(pos);
+#else
+  return Evaluation<NO_TRACE>(pos).value();
+#endif  // defined(EVAL_NNUE)
 }
 
 
@@ -910,6 +913,7 @@ std::string Eval::trace(const Position& pos) {
   return ss.str();
 }
 
+#if defined(EVAL_NNUE)
 namespace Eval {
 ExtBonaPiece kpp_board_index[PIECE_NB] = {
     { BONA_PIECE_ZERO, BONA_PIECE_ZERO },
@@ -978,3 +982,4 @@ bool EvalList::is_valid(const Position& pos)
   return true;
 }
 }
+#endif  // defined(EVAL_NNUE)
diff --git a/src/evaluate.h b/src/evaluate.h
index e9e13e7d..1b114179 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -35,6 +35,7 @@ std::string trace(const Position& pos);
 
 Value evaluate(const Position& pos);
 
+#if defined(EVAL_NNUE)
 // �]���֐��t�@�C����ǂݍ��ށB
 // ����́A"is_ready"�R�}���h�̉�������1�x�����Ăяo�����B2�x�Ăяo�����Ƃ͑z�肵�Ă��Ȃ��B
 // (�������AEvalDir(�]���֐��t�H���_)���ύX�ɂȂ������ƁAisready���ēx�����Ă�����ǂ݂Ȃ����B)
@@ -206,6 +207,7 @@ struct DirtyPiece
 	int dirty_num;
 
 };
+#endif  // defined(EVAL_NNUE)
 }
 
 #endif // #ifndef EVALUATE_H_INCLUDED
diff --git a/src/position.cpp b/src/position.cpp
index 43f986f9..23ce5168 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -243,6 +243,7 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th
   std::fill_n(&pieceList[0][0], sizeof(pieceList) / sizeof(Square), SQ_NONE);
   st = si;
 
+#if defined(EVAL_NNUE)
   // evalList��clear�B���memset�Ń[���N���A�����Ƃ��ɃN���A����Ă��邪�c�B
   evalList.clear();
 
@@ -256,6 +257,7 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th
     PIECE_NUMBER_ROOK,
     PIECE_NUMBER_QUEEN
   };
+#endif  // defined(EVAL_NNUE)
 
   ss >> std::noskipws;
 
@@ -273,11 +275,13 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th
           auto pc = Piece(idx);
           put_piece(pc, sq);
 
+#if defined(EVAL_NNUE)
           PieceNumber piece_no =
             (idx == W_KING) ? PIECE_NUMBER_WKING : // ����
             (idx == B_KING) ? PIECE_NUMBER_BKING : // ����
             piece_no_count[type_of(Piece(idx))]++; // ����ȊO
           evalList.put_piece(piece_no, sq, pc); // sq�̏���pc�̋��z�u����
+#endif  // defined(EVAL_NNUE)
 
           ++sq;
       }
@@ -341,7 +345,9 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th
   set_state(st);
 
   assert(pos_is_ok());
+#if defined(EVAL_NNUE)
   assert(evalList.is_valid(*this));
+#endif  // defined(EVAL_NNUE)
 
   return *this;
 }
@@ -762,8 +768,10 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
   ++st->rule50;
   ++st->pliesFromNull;
 
+#if defined(EVAL_NNUE)
   st->accumulator.computed_accumulation = false;
   st->accumulator.computed_score = false;
+#endif  // defined(EVAL_NNUE)
 
   Color us = sideToMove;
   Color them = ~us;
@@ -776,8 +784,10 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
   assert(captured == NO_PIECE || color_of(captured) == (type_of(m) != CASTLING ? them : us));
   assert(type_of(captured) != KING);
 
+#if defined(EVAL_NNUE)
   auto& dp = st->dirtyPiece;
   dp.dirty_num = 1;
+#endif  // defined(EVAL_NNUE)
 
   if (type_of(m) == CASTLING)
   {
@@ -795,7 +805,9 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
   {
       Square capsq = to;
 
+#if defined(EVAL_NNUE)
       PieceNumber piece_no1;
+#endif  // defined(EVAL_NNUE)
 
       // If the captured piece is a pawn, update pawn hash key, otherwise
       // update non-pawn material.
@@ -811,12 +823,16 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
               assert(piece_on(to) == NO_PIECE);
               assert(piece_on(capsq) == make_piece(them, PAWN));
 
+#if defined(EVAL_NNUE)
               piece_no1 = piece_no_of(capsq);
+#endif  // defined(EVAL_NNUE)
 
               board[capsq] = NO_PIECE; // Not done by remove_piece()
           }
           else {
+#if defined(EVAL_NNUE)
             piece_no1 = piece_no_of(capsq);
+#endif  // defined(EVAL_NNUE)
           }
 
           st->pawnKey ^= Zobrist::psq[captured][capsq];
@@ -824,7 +840,9 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
       else {
           st->nonPawnMaterial[them] -= PieceValue[MG][captured];
 
+#if defined(EVAL_NNUE)
           piece_no1 = piece_no_of(capsq);
+#endif  // defined(EVAL_NNUE)
       }
 
       // Update board and piece lists
@@ -838,6 +856,7 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
       // Reset rule 50 counter
       st->rule50 = 0;
 
+#if defined(EVAL_NNUE)
       dp.dirty_num = 2; // ���������2��
 
       dp.pieceNo[1] = piece_no1;
@@ -850,6 +869,7 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
       // will not be overritten to pc if the move type is enpassant.
       evalList.piece_no_list_board[capsq] = PIECE_NUMBER_NB;
       dp.changed_piece[1].new_piece = evalList.bona_piece(piece_no1);
+#endif  // defined(EVAL_NNUE)
   }
 
   // Update hash key
@@ -872,14 +892,18 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
 
   // Move the piece. The tricky Chess960 castling is handled earlier
   if (type_of(m) != CASTLING) {
+#if defined(EVAL_NNUE)
     PieceNumber piece_no0 = piece_no_of(from);
+#endif  // defined(EVAL_NNUE)
 
     move_piece(pc, from, to);
 
+#if defined(EVAL_NNUE)
     dp.pieceNo[0] = piece_no0;
     dp.changed_piece[0].old_piece = evalList.bona_piece(piece_no0);
     evalList.put_piece(piece_no0, to, pc);
     dp.changed_piece[0].new_piece = evalList.bona_piece(piece_no0);
+#endif  // defined(EVAL_NNUE)
   }
 
   // If the moving piece is a pawn do some special extra work
@@ -903,11 +927,13 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
           remove_piece(pc, to);
           put_piece(promotion, to);
 
+#if defined(EVAL_NNUE)
           PieceNumber piece_no0 = piece_no_of(to);
           dp.pieceNo[0] = piece_no0;
           dp.changed_piece[0].old_piece = evalList.bona_piece(piece_no0);
           evalList.put_piece(piece_no0, to, promotion);
           dp.changed_piece[0].new_piece = evalList.bona_piece(piece_no0);
+#endif  // defined(EVAL_NNUE)
 
           // Update hash keys
           k ^= Zobrist::psq[pc][to] ^ Zobrist::psq[promotion][to];
@@ -960,7 +986,9 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
   }
 
   assert(pos_is_ok());
+#if defined(EVAL_NNUE)
   assert(evalList.is_valid(*this));
+#endif  // defined(EVAL_NNUE)
 }
 
 
@@ -991,8 +1019,10 @@ void Position::undo_move(Move m) {
       pc = make_piece(us, PAWN);
       put_piece(pc, to);
 
+#if defined(EVAL_NNUE)
       PieceNumber piece_no0 = st->dirtyPiece.pieceNo[0];
       evalList.put_piece(piece_no0, to, pc);
+#endif  // defined(EVAL_NNUE)
   }
 
   if (type_of(m) == CASTLING)
@@ -1005,8 +1035,10 @@ void Position::undo_move(Move m) {
       
       move_piece(pc, to, from); // Put the piece back at the source square
 
+#if defined(EVAL_NNUE)
       PieceNumber piece_no0 = st->dirtyPiece.pieceNo[0];
       evalList.put_piece(piece_no0, from, pc);
+#endif  // defined(EVAL_NNUE)
 
       if (st->capturedPiece)
       {
@@ -1025,10 +1057,12 @@ void Position::undo_move(Move m) {
 
           put_piece(st->capturedPiece, capsq); // Restore the captured piece
 
+#if defined(EVAL_NNUE)
           PieceNumber piece_no1 = st->dirtyPiece.pieceNo[1];
           assert(evalList.bona_piece(piece_no1).fw == Eval::BONA_PIECE_ZERO);
           assert(evalList.bona_piece(piece_no1).fb == Eval::BONA_PIECE_ZERO);
           evalList.put_piece(piece_no1, capsq, st->capturedPiece);
+#endif  // defined(EVAL_NNUE)
       }
   }
 
@@ -1037,7 +1071,9 @@ void Position::undo_move(Move m) {
   --gamePly;
 
   assert(pos_is_ok());
+#if defined(EVAL_NNUE)
   assert(evalList.is_valid(*this));
+#endif  // defined(EVAL_NNUE)
 }
 
 
@@ -1045,6 +1081,7 @@ void Position::undo_move(Move m) {
 /// is a bit tricky in Chess960 where from/to squares can overlap.
 template<bool Do>
 void Position::do_castling(Color us, Square from, Square& to, Square& rfrom, Square& rto) {
+#if defined(EVAL_NNUE)
   auto& dp = st->dirtyPiece;
   // �����v�Z�̂��߂Ɉړ��������StateInfo�ɋL�^���Ă����B
   dp.dirty_num = 2; // ���������2��
@@ -1056,16 +1093,19 @@ void Position::do_castling(Color us, Square from, Square& to, Square& rfrom, Squ
     piece_no0 = piece_no_of(from);
     piece_no1 = piece_no_of(to);
   }
+#endif  // defined(EVAL_NNUE)
 
   bool kingSide = to > from;
   rfrom = to; // Castling is encoded as "king captures friendly rook"
   rto = relative_square(us, kingSide ? SQ_F1 : SQ_D1);
   to = relative_square(us, kingSide ? SQ_G1 : SQ_C1);
 
+#if defined(EVAL_NNUE)
   if (!Do) {
     piece_no0 = piece_no_of(to);
     piece_no1 = piece_no_of(rto);
   }
+#endif  // defined(EVAL_NNUE)
 
   // Remove both pieces first since squares could overlap in Chess960
   remove_piece(make_piece(us, KING), Do ? from : to);
@@ -1074,6 +1114,7 @@ void Position::do_castling(Color us, Square from, Square& to, Square& rfrom, Squ
   put_piece(make_piece(us, KING), Do ? to : from);
   put_piece(make_piece(us, ROOK), Do ? rto : rfrom);
 
+#if defined(EVAL_NNUE)
   if (Do) {
     dp.pieceNo[0] = piece_no0;
     dp.changed_piece[0].old_piece = evalList.bona_piece(piece_no0);
@@ -1089,6 +1130,7 @@ void Position::do_castling(Color us, Square from, Square& to, Square& rfrom, Squ
     evalList.put_piece(piece_no0, from, make_piece(us, KING));
     evalList.put_piece(piece_no1, rfrom, make_piece(us, ROOK));
   }
+#endif  // defined(EVAL_NNUE)
 }
 
 
@@ -1426,6 +1468,7 @@ bool Position::pos_is_ok() const {
   return true;
 }
 
+#if defined(EVAL_NNUE)
 PieceNumber Position::piece_no_of(Square sq) const
 {
   if (piece_on(sq) == NO_PIECE) {
@@ -1436,3 +1479,4 @@ PieceNumber Position::piece_no_of(Square sq) const
   assert(is_ok(n));
   return n;
 }
+#endif  // defined(EVAL_NNUE)
diff --git a/src/position.h b/src/position.h
index 8111c663..c6e4f9c9 100644
--- a/src/position.h
+++ b/src/position.h
@@ -59,10 +59,12 @@ struct StateInfo {
   Bitboard   pinners[COLOR_NB];
   Bitboard   checkSquares[PIECE_TYPE_NB];
 
+#if defined(EVAL_NNUE)
   Eval::NNUE::Accumulator accumulator;
 
   // �]���l�̍����v�Z�̊Ǘ��p
   Eval::DirtyPiece dirtyPiece;
+#endif  // defined(EVAL_NNUE)
 };
 
 /// A list to keep track of the position states along the setup moves (from the
@@ -174,6 +176,7 @@ public:
   bool pos_is_ok() const;
   void flip();
 
+#if defined(EVAL_NNUE)
   // --- StateInfo
 
   // ���݂̋ǖʂɑΉ�����StateInfo��Ԃ��B
@@ -182,6 +185,7 @@ public:
 
   // �]���֐��Ŏg�����߂́A�ǂ̋�ԍ��̋�ǂ��ɂ��邩�Ȃǂ̏��B
   const Eval::EvalList* eval_list() const { return &evalList; }
+#endif  // defined(EVAL_NNUE)
 
 private:
   // Initialization helpers (used while setting up a position)
@@ -196,8 +200,10 @@ private:
   template<bool Do>
   void do_castling(Color us, Square from, Square& to, Square& rfrom, Square& rto);
 
+#if defined(EVAL_NNUE)
   // �Տ��sq�̏��ɂ�����PieceNumber��Ԃ��B
   PieceNumber piece_no_of(Square sq) const;
+#endif  // defined(EVAL_NNUE)
 
   // Data members
   Piece board[SQUARE_NB];
@@ -216,8 +222,10 @@ private:
   StateInfo* st;
   bool chess960;
 
+#if defined(EVAL_NNUE)
   // �]���֐��ŗp�����̃��X�g
   Eval::EvalList evalList;
+#endif  // defined(EVAL_NNUE)
 };
 
 namespace PSQT {
diff --git a/src/types.h b/src/types.h
index ef6cbb40..5270ccd6 100644
--- a/src/types.h
+++ b/src/types.h
@@ -464,6 +464,7 @@ constexpr bool is_ok(Move m) {
   return from_sq(m) != to_sq(m); // Catch MOVE_NULL and MOVE_NONE
 }
 
+#if defined(EVAL_NNUE)
 // --------------------
 //        �
 // --------------------
@@ -493,5 +494,6 @@ inline PieceNumber& operator--(PieceNumber& d) { return d = PieceNumber(int8_t(d
 
 // PieceNumber�̐������̌����Bassert�p�B
 constexpr bool is_ok(PieceNumber pn) { return pn < PIECE_NUMBER_NB; }
+#endif  // defined(EVAL_NNUE)
 
 #endif // #ifndef TYPES_H_INCLUDED
diff --git a/src/uci.cpp b/src/uci.cpp
index bee5acd7..b47398ad 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -184,6 +184,7 @@ namespace {
   // �ǖʂ͏���������Ȃ��̂Œ��ӁB
   void is_ready(Position& pos, istringstream& is, StateListPtr& states)
   {
+#if defined(EVAL_NNUE)
     // "isready"���󂯎�������ƁA"readyok"��Ԃ��܂�5�b���Ƃɉ��s�𑗂�悤�ɏC������B(keep alive�I�ȏ���)
     //	USI2.0�̎d�l���B
     //  -"isready"�̂��Ƃ�time out���Ԃ́A30�b���x�Ƃ���B����𒴂��āA�]���֐��̏������Ahash�e�[�u���̊m�ۂ��������ꍇ�A
@@ -241,6 +242,7 @@ namespace {
     // keep alive�𑗐M���邽�߂ɐ��������X���b�h���I�������A�ҋ@����B
     ended = true;
     th.join();
+#endif  // defined(EVAL_NNUE)
 
     sync_cout << "readyok" << sync_endl;
   }

From bcd6985871977ff06b8bc0fbe5c50c7a6393e9b9 Mon Sep 17 00:00:00 2001
From: Hisayori Noda <nodchip@gmail.com>
Date: Tue, 18 Jun 2019 08:48:05 +0900
Subject: [PATCH 006/583] Merged the training data generator and the machine
 learning logic from YaneuraOu.

---
 src/eval/evaluate_common.h                    |   83 +
 src/eval/evaluate_mir_inv_tools.cpp           |  186 ++
 src/eval/evaluate_mir_inv_tools.h             |   43 +
 src/eval/nnue/evaluate_nnue.cpp               |    6 +-
 src/eval/nnue/evaluate_nnue.h                 |    2 -
 src/eval/nnue/evaluate_nnue_learner.cpp       |   31 +-
 src/eval/nnue/evaluate_nnue_learner.h         |   10 +-
 src/eval/nnue/layers/sum.h                    |    2 -
 src/eval/nnue/trainer/features/factorizer.h   |   14 +-
 .../trainer/features/factorizer_feature_set.h |   10 +-
 .../trainer/features/factorizer_half_kp.h     |    2 -
 src/eval/nnue/trainer/trainer.h               |   10 +-
 .../nnue/trainer/trainer_affine_transform.h   |    2 -
 src/eval/nnue/trainer/trainer_clipped_relu.h  |    2 -
 .../trainer/trainer_feature_transformer.h     |    2 -
 src/eval/nnue/trainer/trainer_input_slice.h   |   10 +-
 src/eval/nnue/trainer/trainer_sum.h           |    2 -
 src/evaluate.cpp                              |    6 +-
 src/evaluate.h                                |   15 +-
 src/extra/sfen_packer.cpp                     |  444 +++
 src/learn/gensfen2019.cpp                     |    1 +
 src/learn/half_float.h                        |  133 +
 src/learn/learn.h                             |  237 ++
 src/learn/learner.cpp                         | 2922 +++++++++++++++++
 src/learn/learning_tools.cpp                  |  256 ++
 src/learn/learning_tools.h                    | 1032 ++++++
 src/learn/multi_think.cpp                     |  123 +
 src/learn/multi_think.h                       |  151 +
 src/misc.cpp                                  |  146 +
 src/misc.h                                    |  100 +
 src/movegen.h                                 |    3 +
 src/position.cpp                              |    9 +
 src/position.h                                |   26 +
 src/search.cpp                                |  280 ++
 src/types.h                                   |   14 +-
 src/uci.cpp                                   |  122 +-
 src/uci.h                                     |    8 +
 37 files changed, 6306 insertions(+), 139 deletions(-)
 create mode 100644 src/eval/evaluate_common.h
 create mode 100644 src/eval/evaluate_mir_inv_tools.cpp
 create mode 100644 src/eval/evaluate_mir_inv_tools.h
 create mode 100644 src/extra/sfen_packer.cpp
 create mode 100644 src/learn/gensfen2019.cpp
 create mode 100644 src/learn/half_float.h
 create mode 100644 src/learn/learn.h
 create mode 100644 src/learn/learner.cpp
 create mode 100644 src/learn/learning_tools.cpp
 create mode 100644 src/learn/learning_tools.h
 create mode 100644 src/learn/multi_think.cpp
 create mode 100644 src/learn/multi_think.h

diff --git a/src/eval/evaluate_common.h b/src/eval/evaluate_common.h
new file mode 100644
index 00000000..889fda7a
--- /dev/null
+++ b/src/eval/evaluate_common.h
@@ -0,0 +1,83 @@
+﻿#ifndef _EVALUATE_COMMON_H_
+#define _EVALUATE_COMMON_H_
+
+// いまどきの手番つき評価関数(EVAL_KPPTとEVAL_KPP_KKPT)の共用header的なもの。
+
+#if defined (EVAL_KPPT) || defined(EVAL_KPP_KKPT) || defined(EVAL_NNUE)
+#include <functional>
+
+// KKファイル名
+#define KK_BIN "KK_synthesized.bin"
+
+// KKPファイル名
+#define KKP_BIN "KKP_synthesized.bin"
+
+// KPPファイル名
+#define KPP_BIN "KPP_synthesized.bin"
+
+namespace Eval
+{
+
+#if defined(USE_EVAL_HASH)
+	// prefetchする関数
+	void prefetch_evalhash(const Key key);
+#endif
+
+	// 評価関数のそれぞれのパラメーターに対して関数fを適用してくれるoperator。
+	// パラメーターの分析などに用いる。
+	// typeは調査対象を表す。
+	//   type = -1 : KK,KKP,KPPすべて
+	//   type = 0  : KK のみ 
+	//   type = 1  : KKPのみ 
+	//   type = 2  : KPPのみ 
+	void foreach_eval_param(std::function<void(int32_t, int32_t)>f, int type = -1);
+
+	// --------------------------
+	//        学習用
+	// --------------------------
+
+#if defined(EVAL_LEARN)
+	// 学習のときの勾配配列の初期化
+	// 学習率を引数に渡しておく。0.0なら、defaultの値を採用する。
+	// update_weights()のepochが、eta_epochまでetaから徐々にeta2に変化する。
+	// eta2_epoch以降は、eta2から徐々にeta3に変化する。
+	void init_grad(double eta1, uint64_t eta_epoch, double eta2, uint64_t eta2_epoch, double eta3);
+
+	// 現在の局面で出現している特徴すべてに対して、勾配の差分値を勾配配列に加算する。
+	// freeze[0]  : kkは学習させないフラグ
+	// freeze[1]  : kkpは学習させないフラグ
+	// freeze[2]  : kppは学習させないフラグ
+	// freeze[3]  : kpppは学習させないフラグ
+	void add_grad(Position& pos, Color rootColor, double delt_grad, const std::array<bool, 4>& freeze);
+
+	// 現在の勾配をもとにSGDかAdaGradか何かする。
+	// epoch      : 世代カウンター(0から始まる)
+	// freeze[0]  : kkは学習させないフラグ
+	// freeze[1]  : kkpは学習させないフラグ
+	// freeze[2]  : kppは学習させないフラグ
+	// freeze[3]  : kpppは学習させないフラグ
+	void update_weights(uint64_t epoch, const std::array<bool,4>& freeze);
+
+	// 評価関数パラメーターをファイルに保存する。
+	// ファイルの末尾につける拡張子を指定できる。
+	void save_eval(std::string suffix);
+
+	// 現在のetaを取得する。
+	double get_eta();
+
+	// -- 学習に関連したコマンド
+
+	// KKを正規化する関数。元の評価関数と完全に等価にはならないので注意。
+	// kkp,kppの値をなるべくゼロに近づけることで、学習中に出現しなかった特徴因子の値(ゼロになっている)が
+	// 妥当であることを保証しようという考え。
+	void regularize_kk();
+
+#endif
+
+
+}
+
+
+#endif
+
+#endif // _EVALUATE_KPPT_COMMON_H_
\ No newline at end of file
diff --git a/src/eval/evaluate_mir_inv_tools.cpp b/src/eval/evaluate_mir_inv_tools.cpp
new file mode 100644
index 00000000..a0cf7461
--- /dev/null
+++ b/src/eval/evaluate_mir_inv_tools.cpp
@@ -0,0 +1,186 @@
+﻿#include "evaluate_mir_inv_tools.h"
+
+namespace Eval
+{
+
+	// --- tables
+
+	// あるBonaPieceを相手側から見たときの値
+	// BONA_PIECE_INITが-1なので符号型で持つ必要がある。
+	// KPPTを拡張しても当面、BonaPieceが2^15を超えることはないのでint16_tで良しとする。
+	int16_t inv_piece_[Eval::fe_end];
+
+	// 盤面上のあるBonaPieceをミラーした位置にあるものを返す。
+	int16_t mir_piece_[Eval::fe_end];
+
+
+	// --- methods
+
+	// あるBonaPieceを相手側から見たときの値を返す
+	Eval::BonaPiece inv_piece(Eval::BonaPiece p) { return (Eval::BonaPiece)inv_piece_[p]; }
+
+	// 盤面上のあるBonaPieceをミラーした位置にあるものを返す。
+	Eval::BonaPiece mir_piece(Eval::BonaPiece p) { return (Eval::BonaPiece)mir_piece_[p]; }
+
+	std::function<void()> mir_piece_init_function;
+
+	void init_mir_inv_tables()
+	{
+		// mirrorとinverseのテーブルの初期化。
+
+		// 初期化は1回に限る。
+		static bool first = true;
+		if (!first) return;
+		first = false;
+
+		// fとeとの交換
+		int t[] = {
+			f_pawn             , e_pawn            ,
+			f_knight           , e_knight          ,
+			f_bishop           , e_bishop          ,
+			f_rook             , e_rook            ,
+			f_queen            , e_queen           ,
+		};
+
+		// 未初期化の値を突っ込んでおく。
+		for (BonaPiece p = BONA_PIECE_ZERO; p < fe_end; ++p)
+		{
+			inv_piece_[p] = BONA_PIECE_NOT_INIT;
+
+			// mirrorは手駒に対しては機能しない。元の値を返すだけ。
+			mir_piece_[p] = (p < f_pawn) ? p : BONA_PIECE_NOT_INIT;
+		}
+
+		for (BonaPiece p = BONA_PIECE_ZERO; p < fe_end; ++p)
+		{
+			for (int i = 0; i < 32 /* t.size() */; i += 2)
+			{
+				if (t[i] <= p && p < t[i + 1])
+				{
+					Square sq = (Square)(p - t[i]);
+
+					// 見つかった!!
+					BonaPiece q = (p < fe_hand_end) ? BonaPiece(sq + t[i + 1]) : (BonaPiece)(Inv(sq) + t[i + 1]);
+					inv_piece_[p] = q;
+					inv_piece_[q] = p;
+
+					/*
+					ちょっとトリッキーだが、pに関して盤上の駒は
+					p >= fe_hand_end
+					のとき。
+
+					このpに対して、nを整数として(上のコードのiは偶数しかとらない)、
+					a)  t[2n + 0] <= p < t[2n + 1] のときは先手の駒
+					b)  t[2n + 1] <= p < t[2n + 2] のときは後手の駒
+					　である。
+
+					 ゆえに、a)の範囲にあるpをq = Inv(p-t[2n+0]) + t[2n+1] とすると180度回転させた升にある後手の駒となる。
+					 そこでpとqをswapさせてinv_piece[ ]を初期化してある。
+					 */
+
+					 // 手駒に関してはmirrorなど存在しない。
+					if (p < fe_hand_end)
+						continue;
+
+					BonaPiece r1 = (BonaPiece)(Mir(sq) + t[i]);
+					mir_piece_[p] = r1;
+					mir_piece_[r1] = p;
+
+					BonaPiece p2 = (BonaPiece)(sq + t[i + 1]);
+					BonaPiece r2 = (BonaPiece)(Mir(sq) + t[i + 1]);
+					mir_piece_[p2] = r2;
+					mir_piece_[r2] = p2;
+
+					break;
+				}
+			}
+		}
+
+		if (mir_piece_init_function)
+			mir_piece_init_function();
+
+		for (BonaPiece p = BONA_PIECE_ZERO; p < fe_end; ++p)
+		{
+			// 未初期化のままになっている。上のテーブルの初期化コードがおかしい。
+			assert(mir_piece_[p] != BONA_PIECE_NOT_INIT && mir_piece_[p] < fe_end);
+			assert(inv_piece_[p] != BONA_PIECE_NOT_INIT && inv_piece_[p] < fe_end);
+
+			// mirとinvは、2回適用したら元の座標に戻る。
+			assert(mir_piece_[mir_piece_[p]] == p);
+			assert(inv_piece_[inv_piece_[p]] == p);
+
+			// mir->inv->mir->invは元の場所でなければならない。
+			assert(p == inv_piece(mir_piece(inv_piece(mir_piece(p)))));
+
+			// inv->mir->inv->mirは元の場所でなければならない。
+			assert(p == mir_piece(inv_piece(mir_piece(inv_piece(p)))));
+		}
+
+#if 0
+		// 評価関数のミラーをしても大丈夫であるかの事前検証
+		// 値を書き込んだときにassertionがあるので、ミラーしてダメである場合、
+		// そのassertに引っかかるはず。
+
+		// AperyのWCSC26の評価関数、kppのp1==0とかp1==20(後手の0枚目の歩)とかの
+		// ところにゴミが入っていて、これを回避しないとassertに引っかかる。
+
+		std::unordered_set<BonaPiece> s;
+		vector<int> a = {
+			f_hand_pawn - 1,e_hand_pawn - 1,
+			f_hand_lance - 1, e_hand_lance - 1,
+			f_hand_knight - 1, e_hand_knight - 1,
+			f_hand_silver - 1, e_hand_silver - 1,
+			f_hand_gold - 1, e_hand_gold - 1,
+			f_hand_bishop - 1, e_hand_bishop - 1,
+			f_hand_rook - 1, e_hand_rook - 1,
+		};
+		for (auto b : a)
+			s.insert((BonaPiece)b);
+
+		// さらに出現しない升の盤上の歩、香、桂も除外(Aperyはここにもゴミが入っている)
+		for (Rank r = RANK_1; r <= RANK_2; ++r)
+			for (File f = FILE_1; f <= FILE_9; ++f)
+			{
+				if (r == RANK_1)
+				{
+					// 1段目の歩
+					BonaPiece b1 = BonaPiece(f_pawn + (f | r));
+					s.insert(b1);
+					s.insert(inv_piece[b1]);
+
+					// 1段目の香
+					BonaPiece b2 = BonaPiece(f_lance + (f | r));
+					s.insert(b2);
+					s.insert(inv_piece[b2]);
+				}
+
+				// 1,2段目の桂
+				BonaPiece b = BonaPiece(f_knight + (f | r));
+				s.insert(b);
+				s.insert(inv_piece[b]);
+			}
+
+		cout << "\nchecking kpp_write()..";
+		for (auto sq : SQ)
+		{
+			cout << sq << ' ';
+			for (BonaPiece p1 = BONA_PIECE_ZERO; p1 < fe_end; ++p1)
+				for (BonaPiece p2 = BONA_PIECE_ZERO; p2 < fe_end; ++p2)
+					if (!s.count(p1) && !s.count(p2))
+						kpp_write(sq, p1, p2, kpp[sq][p1][p2]);
+		}
+		cout << "\nchecking kkp_write()..";
+
+		for (auto sq1 : SQ)
+		{
+			cout << sq1 << ' ';
+			for (auto sq2 : SQ)
+				for (BonaPiece p1 = BONA_PIECE_ZERO; p1 < fe_end; ++p1)
+					if (!s.count(p1))
+						kkp_write(sq1, sq2, p1, kkp[sq1][sq2][p1]);
+		}
+		cout << "..done!" << endl;
+#endif
+	}
+
+}
diff --git a/src/eval/evaluate_mir_inv_tools.h b/src/eval/evaluate_mir_inv_tools.h
new file mode 100644
index 00000000..6e82ce58
--- /dev/null
+++ b/src/eval/evaluate_mir_inv_tools.h
@@ -0,0 +1,43 @@
+﻿#ifndef _EVALUATE_MIR_INV_TOOLS_
+#define _EVALUATE_MIR_INV_TOOLS_
+
+// BonaPieceのmirror(左右反転)やinverse(盤上の180度回転)させた駒を得るためのツール類。
+
+#include "../types.h"
+#include "../evaluate.h"
+#include <functional>
+
+namespace Eval
+{
+	// -------------------------------------------------
+	//                  tables
+	// -------------------------------------------------
+
+	// 	--- BonaPieceに対してMirrorとInverseを提供する。
+
+	// これらの配列は、init()かinit_mir_inv_tables();を呼び出すと初期化される。
+	// このテーブルのみを評価関数のほうから使いたいときは、評価関数の初期化のときに
+	// init_mir_inv_tables()を呼び出すと良い。
+	// これらの配列は、以下のKK/KKP/KPPクラスから参照される。
+
+	// あるBonaPieceを相手側から見たときの値を返す
+	extern Eval::BonaPiece inv_piece(Eval::BonaPiece p);
+
+	// 盤面上のあるBonaPieceをミラーした位置にあるものを返す。
+	extern Eval::BonaPiece mir_piece(Eval::BonaPiece p);
+
+
+	// mir_piece/inv_pieceの初期化のときに呼び出されるcallback
+	// fe_endをユーザー側で拡張するときに用いる。
+	// この初期化のときに必要なのでinv_piece_とinv_piece_を公開している。
+	// mir_piece_init_functionが呼び出されたタイミングで、fe_old_endまでは
+	// これらのテーブルの初期化が完了していることが保証されている。
+	extern std::function<void()> mir_piece_init_function;
+	extern int16_t mir_piece_[Eval::fe_end];
+	extern int16_t inv_piece_[Eval::fe_end];
+
+	// この関数を明示的に呼び出すか、init()を呼び出すかしたときに、上のテーブルが初期化される。
+	extern void init_mir_inv_tables();
+}
+
+#endif
diff --git a/src/eval/nnue/evaluate_nnue.cpp b/src/eval/nnue/evaluate_nnue.cpp
index 15d9194b..a19b2a0e 100644
--- a/src/eval/nnue/evaluate_nnue.cpp
+++ b/src/eval/nnue/evaluate_nnue.cpp
@@ -224,8 +224,8 @@ EvaluateHashTable g_evalTable;
 
 // prefetchする関数も用意しておく。
 void prefetch_evalhash(const Key key) {
-  constexpr auto mask = ~((u64)0x1f);
-  prefetch((void*)((u64)g_evalTable[key] & mask));
+  constexpr auto mask = ~((uint64_t)0x1f);
+  prefetch((void*)((uint64_t)g_evalTable[key] & mask));
 }
 #endif
 
@@ -269,7 +269,7 @@ Value compute_eval(const Position& pos) {
 }
 
 // 評価関数
-Value NNUE::evaluate(const Position& pos) {
+Value evaluate(const Position& pos) {
   const auto& accumulator = pos.state()->accumulator;
   if (accumulator.computed_score) {
     return accumulator.score;
diff --git a/src/eval/nnue/evaluate_nnue.h b/src/eval/nnue/evaluate_nnue.h
index 1ca48d5b..a95f2bd9 100644
--- a/src/eval/nnue/evaluate_nnue.h
+++ b/src/eval/nnue/evaluate_nnue.h
@@ -55,8 +55,6 @@ bool ReadParameters(std::istream& stream);
 // 評価関数パラメータを書き込む
 bool WriteParameters(std::ostream& stream);
 
-Value evaluate(const Position& pos);
-
 }  // namespace NNUE
 
 }  // namespace Eval
diff --git a/src/eval/nnue/evaluate_nnue_learner.cpp b/src/eval/nnue/evaluate_nnue_learner.cpp
index cd3ae72a..0e558f39 100644
--- a/src/eval/nnue/evaluate_nnue_learner.cpp
+++ b/src/eval/nnue/evaluate_nnue_learner.cpp
@@ -9,8 +9,9 @@
 #include "../../learn/learning_tools.h"
 
 #include "../../position.h"
-#include "../../usi.h"
+#include "../../uci.h"
 #include "../../misc.h"
+#include "../../thread_win32_osx.h"
 
 #include "../evaluate_common.h"
 
@@ -37,7 +38,7 @@ std::vector<Example> examples;
 Mutex examples_mutex;
 
 // ミニバッチのサンプル数
-u64 batch_size;
+uint64_t batch_size;
 
 // 乱数生成器
 std::mt19937 rng;
@@ -57,20 +58,20 @@ double GetGlobalLearningRateScale() {
 void SendMessages(std::vector<Message> messages) {
   for (auto& message : messages) {
     trainer->SendMessage(&message);
-    ASSERT_LV3(message.num_receivers > 0);
+    assert(message.num_receivers > 0);
   }
 }
 
 }  // namespace
 
 // 学習の初期化を行う
-void InitializeTraining(double eta1, u64 eta1_epoch,
-                        double eta2, u64 eta2_epoch, double eta3) {
+void InitializeTraining(double eta1, uint64_t eta1_epoch,
+                        double eta2, uint64_t eta2_epoch, double eta3) {
   std::cout << "Initializing NN training for "
             << GetArchitectureString() << std::endl;
 
-  ASSERT(feature_transformer);
-  ASSERT(network);
+  assert(feature_transformer);
+  assert(network);
   trainer = Trainer<Network>::Create(network.get(), feature_transformer.get());
 
   if (Options["SkipLoadingEval"]) {
@@ -82,8 +83,8 @@ void InitializeTraining(double eta1, u64 eta1_epoch,
 }
 
 // ミニバッチのサンプル数を設定する
-void SetBatchSize(u64 size) {
-  ASSERT_LV3(size > 0);
+void SetBatchSize(uint64_t size) {
+  assert(size > 0);
   batch_size = size;
 }
 
@@ -97,7 +98,7 @@ void SetOptions(const std::string& options) {
   std::vector<Message> messages;
   for (const auto& option : Split(options, ',')) {
     const auto fields = Split(option, '=');
-    ASSERT_LV3(fields.size() == 1 || fields.size() == 2);
+    assert(fields.size() == 1 || fields.size() == 2);
     if (fields.size() == 1) {
       messages.emplace_back(fields[0]);
     } else {
@@ -112,7 +113,7 @@ void RestoreParameters(const std::string& dir_name) {
   const std::string file_name = Path::Combine(dir_name, NNUE::kFileName);
   std::ifstream stream(file_name, std::ios::binary);
   bool result = ReadParameters(stream);
-  ASSERT(result);
+  assert(result);
 
   SendMessages({{"reset"}});
 }
@@ -136,7 +137,7 @@ void AddExample(Position& pos, Color rootColor,
   if (pos.side_to_move() != BLACK) {
     active_indices[0].swap(active_indices[1]);
   }
-  for (const auto color : COLOR) {
+  for (const auto color : Colors) {
     std::vector<TrainingFeature> training_features;
     for (const auto base_index : active_indices[color]) {
       static_assert(Features::Factorizer<RawFeatures>::GetDimensions() <
@@ -162,8 +163,8 @@ void AddExample(Position& pos, Color rootColor,
 }
 
 // 評価関数パラメーターを更新する
-void UpdateParameters(u64 epoch) {
-  ASSERT_LV3(batch_size > 0);
+void UpdateParameters(uint64_t epoch) {
+  assert(batch_size > 0);
 
   EvalLearningTools::Weight::calc_eta(epoch);
   const auto learning_rate = static_cast<LearnFloatType>(
@@ -215,7 +216,7 @@ void save_eval(std::string dir_name) {
   const std::string file_name = Path::Combine(eval_dir, NNUE::kFileName);
   std::ofstream stream(file_name, std::ios::binary);
   const bool result = NNUE::WriteParameters(stream);
-  ASSERT(result);
+  assert(result);
 
   std::cout << "save_eval() finished. folder = " << eval_dir << std::endl;
 }
diff --git a/src/eval/nnue/evaluate_nnue_learner.h b/src/eval/nnue/evaluate_nnue_learner.h
index 130ce376..e2e68738 100644
--- a/src/eval/nnue/evaluate_nnue_learner.h
+++ b/src/eval/nnue/evaluate_nnue_learner.h
@@ -3,8 +3,6 @@
 #ifndef _EVALUATE_NNUE_LEARNER_H_
 #define _EVALUATE_NNUE_LEARNER_H_
 
-#include "../../config.h"
-
 #if defined(EVAL_LEARN) && defined(EVAL_NNUE)
 
 #include "../../learn/learn.h"
@@ -14,11 +12,11 @@ namespace Eval {
 namespace NNUE {
 
 // 学習の初期化を行う
-void InitializeTraining(double eta1, u64 eta1_epoch,
-                        double eta2, u64 eta2_epoch, double eta3);
+void InitializeTraining(double eta1, uint64_t eta1_epoch,
+                        double eta2, uint64_t eta2_epoch, double eta3);
 
 // ミニバッチのサンプル数を設定する
-void SetBatchSize(u64 size);
+void SetBatchSize(uint64_t size);
 
 // 学習率のスケールを設定する
 void SetGlobalLearningRateScale(double scale);
@@ -34,7 +32,7 @@ void AddExample(Position& pos, Color rootColor,
                 const Learner::PackedSfenValue& psv, double weight);
 
 // 評価関数パラメータを更新する
-void UpdateParameters(u64 epoch);
+void UpdateParameters(uint64_t epoch);
 
 // 学習に問題が生じていないかチェックする
 void CheckHealth();
diff --git a/src/eval/nnue/layers/sum.h b/src/eval/nnue/layers/sum.h
index 216de458..3fe000cf 100644
--- a/src/eval/nnue/layers/sum.h
+++ b/src/eval/nnue/layers/sum.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_LAYERS_SUM_H_
 #define _NNUE_LAYERS_SUM_H_
 
-#include "../../../config.h"
-
 #if defined(EVAL_NNUE)
 
 #include "../nnue_common.h"
diff --git a/src/eval/nnue/trainer/features/factorizer.h b/src/eval/nnue/trainer/features/factorizer.h
index e31c9976..3bc59260 100644
--- a/src/eval/nnue/trainer/features/factorizer.h
+++ b/src/eval/nnue/trainer/features/factorizer.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_H_
 #define _NNUE_TRAINER_FEATURES_FACTORIZER_H_
 
-#include "../../../../config.h"
-
 #if defined(EVAL_NNUE)
 
 #include "../../nnue_common.h"
@@ -29,7 +27,7 @@ class Factorizer {
   // 学習用特徴量のインデックスと学習率のスケールを取得する
   static void AppendTrainingFeatures(
       IndexType base_index, std::vector<TrainingFeature>* training_features) {
-    ASSERT_LV5(base_index < FeatureType::kDimensions);
+    assert(base_index < FeatureType::kDimensions);
     training_features->emplace_back(base_index);
   }
 };
@@ -45,8 +43,8 @@ template <typename FeatureType>
 IndexType AppendBaseFeature(
     FeatureProperties properties, IndexType base_index,
     std::vector<TrainingFeature>* training_features) {
-  ASSERT_LV5(properties.dimensions == FeatureType::kDimensions);
-  ASSERT_LV5(base_index < FeatureType::kDimensions);
+  assert(properties.dimensions == FeatureType::kDimensions);
+  assert(base_index < FeatureType::kDimensions);
   training_features->emplace_back(base_index);
   return properties.dimensions;
 }
@@ -59,14 +57,14 @@ IndexType InheritFeaturesIfRequired(
   if (!properties.active) {
     return 0;
   }
-  ASSERT_LV5(properties.dimensions == Factorizer<FeatureType>::GetDimensions());
-  ASSERT_LV5(base_index < FeatureType::kDimensions);
+  assert(properties.dimensions == Factorizer<FeatureType>::GetDimensions());
+  assert(base_index < FeatureType::kDimensions);
   const auto start = training_features->size();
   Factorizer<FeatureType>::AppendTrainingFeatures(
       base_index, training_features);
   for (auto i = start; i < training_features->size(); ++i) {
     auto& feature = (*training_features)[i];
-    ASSERT_LV5(feature.GetIndex() < Factorizer<FeatureType>::GetDimensions());
+    assert(feature.GetIndex() < Factorizer<FeatureType>::GetDimensions());
     feature.ShiftIndex(index_offset);
   }
   return properties.dimensions;
diff --git a/src/eval/nnue/trainer/features/factorizer_feature_set.h b/src/eval/nnue/trainer/features/factorizer_feature_set.h
index e2db79b1..111678e4 100644
--- a/src/eval/nnue/trainer/features/factorizer_feature_set.h
+++ b/src/eval/nnue/trainer/features/factorizer_feature_set.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
 #define _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
 
-#include "../../../../config.h"
-
 #if defined(EVAL_NNUE)
 
 #include "../../features/feature_set.h"
@@ -38,7 +36,7 @@ class Factorizer<FeatureSet<FirstFeatureType, RemainingFeatureTypes...>> {
   static void AppendTrainingFeatures(
       IndexType base_index, std::vector<TrainingFeature>* training_features,
       IndexType base_dimensions = kBaseDimensions) {
-    ASSERT_LV5(base_index < kBaseDimensions);
+    assert(base_index < kBaseDimensions);
     constexpr auto boundary = FeatureSet<RemainingFeatureTypes...>::kDimensions;
     if (base_index < boundary) {
       Tail::AppendTrainingFeatures(
@@ -50,7 +48,7 @@ class Factorizer<FeatureSet<FirstFeatureType, RemainingFeatureTypes...>> {
       for (auto i = start; i < training_features->size(); ++i) {
         auto& feature = (*training_features)[i];
         const auto index = feature.GetIndex();
-        ASSERT_LV5(index < Head::GetDimensions() ||
+        assert(index < Head::GetDimensions() ||
                    (index >= base_dimensions &&
                     index < base_dimensions +
                             Head::GetDimensions() - Head::kBaseDimensions));
@@ -81,13 +79,13 @@ public:
   static void AppendTrainingFeatures(
       IndexType base_index, std::vector<TrainingFeature>* training_features,
       IndexType base_dimensions = kBaseDimensions) {
-    ASSERT_LV5(base_index < kBaseDimensions);
+    assert(base_index < kBaseDimensions);
     const auto start = training_features->size();
     Factorizer<FeatureType>::AppendTrainingFeatures(
         base_index, training_features);
     for (auto i = start; i < training_features->size(); ++i) {
       auto& feature = (*training_features)[i];
-      ASSERT_LV5(feature.GetIndex() < Factorizer<FeatureType>::GetDimensions());
+      assert(feature.GetIndex() < Factorizer<FeatureType>::GetDimensions());
       if (feature.GetIndex() >= kBaseDimensions) {
         feature.ShiftIndex(base_dimensions - kBaseDimensions);
       }
diff --git a/src/eval/nnue/trainer/features/factorizer_half_kp.h b/src/eval/nnue/trainer/features/factorizer_half_kp.h
index 5682e8e6..36f53edc 100644
--- a/src/eval/nnue/trainer/features/factorizer_half_kp.h
+++ b/src/eval/nnue/trainer/features/factorizer_half_kp.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
 #define _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
 
-#include "../../../../config.h"
-
 #if defined(EVAL_NNUE)
 
 #include "../../features/half_kp.h"
diff --git a/src/eval/nnue/trainer/trainer.h b/src/eval/nnue/trainer/trainer.h
index 1b322703..630f1a3d 100644
--- a/src/eval/nnue/trainer/trainer.h
+++ b/src/eval/nnue/trainer/trainer.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TRAINER_H_
 #define _NNUE_TRAINER_H_
 
-#include "../../../config.h"
-
 #if defined(EVAL_LEARN) && defined(EVAL_NNUE)
 
 #include "../nnue_common.h"
@@ -36,11 +34,11 @@ class TrainingFeature {
 
   explicit TrainingFeature(IndexType index) :
       index_and_count_((index << kCountBits) | 1) {
-    ASSERT_LV3(index < (1 << kIndexBits));
+    assert(index < (1 << kIndexBits));
   }
   TrainingFeature& operator+=(const TrainingFeature& other) {
-    ASSERT_LV3(other.GetIndex() == GetIndex());
-    ASSERT_LV3(other.GetCount() + GetCount() < (1 << kCountBits));
+    assert(other.GetIndex() == GetIndex());
+    assert(other.GetCount() + GetCount() < (1 << kCountBits));
     index_and_count_ += other.GetCount();
     return *this;
   }
@@ -48,7 +46,7 @@ class TrainingFeature {
     return static_cast<IndexType>(index_and_count_ >> kCountBits);
   }
   void ShiftIndex(IndexType offset) {
-    ASSERT_LV3(GetIndex() + offset < (1 << kIndexBits));
+    assert(GetIndex() + offset < (1 << kIndexBits));
     index_and_count_ += offset << kCountBits;
   }
   IndexType GetCount() const {
diff --git a/src/eval/nnue/trainer/trainer_affine_transform.h b/src/eval/nnue/trainer/trainer_affine_transform.h
index 197beec3..34c4816b 100644
--- a/src/eval/nnue/trainer/trainer_affine_transform.h
+++ b/src/eval/nnue/trainer/trainer_affine_transform.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TRAINER_AFFINE_TRANSFORM_H_
 #define _NNUE_TRAINER_AFFINE_TRANSFORM_H_
 
-#include "../../../config.h"
-
 #if defined(EVAL_LEARN) && defined(EVAL_NNUE)
 
 #include "../../../learn/learn.h"
diff --git a/src/eval/nnue/trainer/trainer_clipped_relu.h b/src/eval/nnue/trainer/trainer_clipped_relu.h
index d7cc96e6..bd894769 100644
--- a/src/eval/nnue/trainer/trainer_clipped_relu.h
+++ b/src/eval/nnue/trainer/trainer_clipped_relu.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TRAINER_CLIPPED_RELU_H_
 #define _NNUE_TRAINER_CLIPPED_RELU_H_
 
-#include "../../../config.h"
-
 #if defined(EVAL_LEARN) && defined(EVAL_NNUE)
 
 #include "../../../learn/learn.h"
diff --git a/src/eval/nnue/trainer/trainer_feature_transformer.h b/src/eval/nnue/trainer/trainer_feature_transformer.h
index ff4da717..742da440 100644
--- a/src/eval/nnue/trainer/trainer_feature_transformer.h
+++ b/src/eval/nnue/trainer/trainer_feature_transformer.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
 #define _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
 
-#include "../../../config.h"
-
 #if defined(EVAL_LEARN) && defined(EVAL_NNUE)
 
 #include "../../../learn/learn.h"
diff --git a/src/eval/nnue/trainer/trainer_input_slice.h b/src/eval/nnue/trainer/trainer_input_slice.h
index c6df775f..0660e987 100644
--- a/src/eval/nnue/trainer/trainer_input_slice.h
+++ b/src/eval/nnue/trainer/trainer_input_slice.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TRAINER_INPUT_SLICE_H_
 #define _NNUE_TRAINER_INPUT_SLICE_H_
 
-#include "../../../config.h"
-
 #if defined(EVAL_LEARN) && defined(EVAL_NNUE)
 
 #include "../../../learn/learn.h"
@@ -35,7 +33,7 @@ class SharedInputTrainer {
       current_operation_ = Operation::kSendMessage;
       feature_transformer_trainer_->SendMessage(message);
     }
-    ASSERT_LV3(current_operation_ == Operation::kSendMessage);
+    assert(current_operation_ == Operation::kSendMessage);
     if (++num_calls_ == num_referrers_) {
       num_calls_ = 0;
       current_operation_ = Operation::kNone;
@@ -49,7 +47,7 @@ class SharedInputTrainer {
       current_operation_ = Operation::kInitialize;
       feature_transformer_trainer_->Initialize(rng);
     }
-    ASSERT_LV3(current_operation_ == Operation::kInitialize);
+    assert(current_operation_ == Operation::kInitialize);
     if (++num_calls_ == num_referrers_) {
       num_calls_ = 0;
       current_operation_ = Operation::kNone;
@@ -66,7 +64,7 @@ class SharedInputTrainer {
       current_operation_ = Operation::kPropagate;
       output_ = feature_transformer_trainer_->Propagate(batch);
     }
-    ASSERT_LV3(current_operation_ == Operation::kPropagate);
+    assert(current_operation_ == Operation::kPropagate);
     if (++num_calls_ == num_referrers_) {
       num_calls_ = 0;
       current_operation_ = Operation::kNone;
@@ -90,7 +88,7 @@ class SharedInputTrainer {
         }
       }
     }
-    ASSERT_LV3(current_operation_ == Operation::kBackPropagate);
+    assert(current_operation_ == Operation::kBackPropagate);
     for (IndexType b = 0; b < batch_size_; ++b) {
       const IndexType batch_offset = kInputDimensions * b;
       for (IndexType i = 0; i < kInputDimensions; ++i) {
diff --git a/src/eval/nnue/trainer/trainer_sum.h b/src/eval/nnue/trainer/trainer_sum.h
index 4095482a..76f6073f 100644
--- a/src/eval/nnue/trainer/trainer_sum.h
+++ b/src/eval/nnue/trainer/trainer_sum.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TRAINER_SUM_H_
 #define _NNUE_TRAINER_SUM_H_
 
-#include "../../../config.h"
-
 #if defined(EVAL_LEARN) && defined(EVAL_NNUE)
 
 #include "../../../learn/learn.h"
diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 29ea65dc..65c7155a 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -864,13 +864,11 @@ namespace {
 /// evaluate() is the evaluator for the outer world. It returns a static
 /// evaluation of the position from the point of view of the side to move.
 
+#if !defined(EVAL_NNUE)
 Value Eval::evaluate(const Position& pos) {
-#if defined(EVAL_NNUE)
-  return Eval::NNUE::evaluate(pos);
-#else
   return Evaluation<NO_TRACE>(pos).value();
-#endif  // defined(EVAL_NNUE)
 }
+#endif  // defined(EVAL_NNUE)
 
 
 /// trace() is like evaluate(), but instead of returning a value, it returns
diff --git a/src/evaluate.h b/src/evaluate.h
index 1b114179..f31ea142 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -35,6 +35,8 @@ std::string trace(const Position& pos);
 
 Value evaluate(const Position& pos);
 
+void evaluate_with_no_return(const Position& pos);
+
 #if defined(EVAL_NNUE)
 // �]���֐��t�@�C����ǂݍ��ށB
 // ����́A"is_ready"�R�}���h�̉�������1�x�����Ăяo�����B2�x�Ăяo�����Ƃ͑z�肵�Ă��Ȃ��B
@@ -85,6 +87,13 @@ enum BonaPiece : int32_t
 	fe_end2 = e_king + SQUARE_NB, // �ʂ��܂߂������̔ԍ��B
 };
 
+#define ENABLE_INCR_OPERATORS_ON(T)                                \
+inline T& operator++(T& d) { return d = T(int(d) + 1); }           \
+inline T& operator--(T& d) { return d = T(int(d) - 1); }
+
+ENABLE_INCR_OPERATORS_ON(BonaPiece)
+
+#undef ENABLE_INCR_OPERATORS_ON
 
 // BonaPiece����肩�猩���Ƃ�(����39�̕�����肩�猩��ƌ���71�̕�)�̔ԍ��Ƃ�
 // �y�A�ɂ������̂�ExtBonaPiece�^�ƌĂԂ��Ƃɂ���B
@@ -132,7 +141,7 @@ struct EvalList
 
 	// �Տ��sq�̏���piece_no��pc�̋��z�u����
 	void put_piece(PieceNumber piece_no, Square sq, Piece pc) {
-		set_piece_on_board(piece_no, BonaPiece(kpp_board_index[pc].fw + sq), BonaPiece(kpp_board_index[pc].fb + inverse(sq)), sq);
+		set_piece_on_board(piece_no, BonaPiece(kpp_board_index[pc].fw + sq), BonaPiece(kpp_board_index[pc].fb + Inv(sq)), sq);
 	}
 
 	// �Տ�̂��鏡sq�ɑΉ�����PieceNumber��Ԃ��B
@@ -181,8 +190,8 @@ public:
 	static const int MAX_LENGTH = 40;
 
   // �Տ�̋�ɑ΂��āA���̋�ԍ�(PieceNumber)��ێ����Ă���z��
-  // �ʂ�SQ_NB�Ɉړ����Ă���Ƃ��p��+1�܂ŕێ����Ă������A
-  // SQ_NB�̋ʂ��ړ������Ȃ��̂ŁA���̒l���g�����Ƃ͂Ȃ��͂��B
+  // �ʂ�SQUARE_NB�Ɉړ����Ă���Ƃ��p��+1�܂ŕێ����Ă������A
+  // SQUARE_NB�̋ʂ��ړ������Ȃ��̂ŁA���̒l���g�����Ƃ͂Ȃ��͂��B
   PieceNumber piece_no_list_board[SQUARE_NB_PLUS1];
 private:
 
diff --git a/src/extra/sfen_packer.cpp b/src/extra/sfen_packer.cpp
new file mode 100644
index 00000000..df095ce1
--- /dev/null
+++ b/src/extra/sfen_packer.cpp
@@ -0,0 +1,444 @@
+﻿#if defined (EVAL_LEARN)
+
+#include "../misc.h"
+#include "../position.h"
+
+#include <sstream>
+#include <fstream>
+#include <cstring>	// std::memset()
+
+using namespace std;
+
+// -----------------------------------
+//        局面の圧縮・解凍
+// -----------------------------------
+
+// ビットストリームを扱うクラス
+// 局面の符号化を行なうときに、これがあると便利
+struct BitStream
+{
+  // データを格納するメモリを事前にセットする。
+  // そのメモリは0クリアされているものとする。
+  void  set_data(uint8_t* data_) { data = data_; reset(); }
+
+  // set_data()で渡されたポインタの取得。
+  uint8_t* get_data() const { return data; }
+
+  // カーソルの取得。
+  int get_cursor() const { return bit_cursor; }
+
+  // カーソルのリセット
+  void reset() { bit_cursor = 0; }
+
+  // ストリームに1bit書き出す。
+  // bは非0なら1を書き出す。0なら0を書き出す。
+  void write_one_bit(int b)
+  {
+    if (b)
+      data[bit_cursor / 8] |= 1 << (bit_cursor & 7);
+
+    ++bit_cursor;
+  }
+
+  // ストリームから1ビット取り出す。
+  int read_one_bit()
+  {
+    int b = (data[bit_cursor / 8] >> (bit_cursor & 7)) & 1;
+    ++bit_cursor;
+
+    return b;
+  }
+
+  // nビットのデータを書き出す
+  // データはdの下位から順に書き出されるものとする。
+  void write_n_bit(int d, int n)
+  {
+    for (int i = 0; i < n; ++i)
+      write_one_bit(d & (1 << i));
+  }
+
+  // nビットのデータを読み込む
+  // write_n_bit()の逆変換。
+  int read_n_bit(int n)
+  {
+    int result = 0;
+    for (int i = 0; i < n; ++i)
+      result |= read_one_bit() ? (1 << i) : 0;
+
+    return result;
+  }
+
+private:
+  // 次に読み書きすべきbit位置。
+  int bit_cursor;
+
+  // データの実体
+  uint8_t* data;
+};
+
+
+//  ハフマン符号化
+//   ※　 なのはminiの符号化から、変換が楽になるように単純化。
+//
+//   盤上の1升(NO_PIECE以外) = 2～6bit ( + 成りフラグ1bit+ 先後1bit )
+//   手駒の1枚               = 1～5bit ( + 成りフラグ1bit+ 先後1bit )
+//
+//    空     xxxxx0 + 0    (none)
+//    歩     xxxx01 + 2    xxxx0 + 2
+//    香     xx0011 + 2    xx001 + 2
+//    桂     xx1011 + 2    xx101 + 2
+//    銀     xx0111 + 2    xx011 + 2
+//    金     x01111 + 1    x0111 + 1 // 金は成りフラグはない。
+//    角     011111 + 2    01111 + 2
+//    飛     111111 + 2    11111 + 2
+//
+// すべての駒が盤上にあるとして、
+//     空 81 - 40駒 = 41升 = 41bit
+//     歩      4bit*18駒   = 72bit
+//     香      6bit* 4駒   = 24bit
+//     桂      6bit* 4駒   = 24bit
+//     銀      6bit* 4駒   = 24bit            
+//     金      6bit* 4駒   = 24bit
+//     角      8bit* 2駒   = 16bit
+//     飛      8bit* 2駒   = 16bit
+//                          -------
+//                          241bit + 1bit(手番) + 7bit×2(王の位置先後) = 256bit
+//
+// 盤上の駒が手駒に移動すると盤上の駒が空になるので盤上のその升は1bitで表現でき、
+// 手駒は、盤上の駒より1bit少なく表現できるので結局、全体のbit数に変化はない。
+// ゆえに、この表現において、どんな局面でもこのbit数で表現できる。
+// 手駒に成りフラグは不要だが、これも含めておくと盤上の駒のbit数-1になるので
+// 全体のbit数が固定化できるのでこれも含めておくことにする。
+
+// Huffman Encoding
+//
+// Empty  xxxxxxx0
+// Pawn   xxxxx001 + 1 bit (Side to move)
+// Knight xxxxx011 + 1 bit (Side to move)
+// Bishop xxxxx101 + 1 bit (Side to move)
+// Rook   xxxxx111 + 1 bit (Side to move)
+
+struct HuffmanedPiece
+{
+  int code; // どうコード化されるか
+  int bits; // 何bit専有するのか
+};
+
+HuffmanedPiece huffman_table[] =
+{
+  {0b000,1}, // NO_PIECE
+  {0b001,3}, // PAWN
+  {0b011,3}, // KNIGHT
+  {0b101,3}, // BISHOP
+  {0b111,3}, // ROOK
+};
+
+// sfenを圧縮/解凍するためのクラス
+// sfenはハフマン符号化をすることで256bit(32bytes)にpackできる。
+// このことはなのはminiにより証明された。上のハフマン符号化である。
+//
+// 内部フォーマット = 手番1bit+王の位置7bit*2 + 盤上の駒(ハフマン符号化) + 手駒(ハフマン符号化)
+// Side to move (White = 0, Black = 1) (1bit)
+// White King Position (6 bits)
+// Black King Position (6 bits)
+// Huffman Encoding of the board
+// Castling availability (1 bit x 4)
+// En passant square (1 or 1 + 6 bits)
+// Rule 50 (6 bits)
+// Game play (8 bits)
+//
+// TODO(someone): Rename SFEN to FEN.
+//
+struct SfenPacker
+{
+  // sfenをpackしてdata[32]に格納する。
+  void pack(const Position& pos)
+  {
+//    cout << pos;
+
+    memset(data, 0, 32 /* 256bit */);
+    stream.set_data(data);
+
+    // 手番
+    // Side to move.
+    stream.write_one_bit((int)(pos.side_to_move()));
+
+    // 先手玉、後手玉の位置、それぞれ7bit
+    // White king and black king, 6 bits for each.
+    for(auto c : Colors)
+      stream.write_n_bit(pos.king_square(c), 6);
+
+    // Write the pieces on the board other than the kings.
+    for (Rank r = RANK_8; r >= RANK_1; --r)
+    {
+      for (File f = FILE_A; f <= FILE_H; ++f)
+      {
+        Piece pc = pos.piece_on(make_square(f, r));
+        if (type_of(pc) == KING)
+          continue;
+        write_board_piece_to_stream(pc);
+      }
+    }
+
+    // TODO(someone): Support chess960.
+    stream.write_one_bit(pos.can_castle(WHITE_OO));
+    stream.write_one_bit(pos.can_castle(WHITE_OOO));
+    stream.write_one_bit(pos.can_castle(BLACK_OO));
+    stream.write_one_bit(pos.can_castle(BLACK_OOO));
+
+    if (pos.ep_square() == SQ_NONE) {
+      stream.write_one_bit(0);
+    }
+    else {
+      stream.write_one_bit(1);
+      stream.write_n_bit(static_cast<int>(pos.ep_square()), 6);
+    }
+
+    stream.write_n_bit(pos.state()->rule50, 6);
+
+    stream.write_n_bit(pos.game_ply(), 8);
+
+    assert(stream.get_cursor() <= 256);
+  }
+
+  // pack()でpackされたsfen(256bit = 32bytes)
+  // もしくはunpack()でdecodeするsfen
+  uint8_t *data; // uint8_t[32];
+
+//private:
+  // Position::set_from_packed_sfen(uint8_t data[32])でこれらの関数を使いたいので筋は悪いがpublicにしておく。
+
+  BitStream stream;
+
+  // 盤面の駒をstreamに出力する。
+  void write_board_piece_to_stream(Piece pc)
+  {
+    // 駒種
+    PieceType pr = type_of(pc);
+    auto c = huffman_table[pr];
+    stream.write_n_bit(c.code, c.bits);
+ 
+    if (pc == NO_PIECE)
+      return;
+
+    // 先後フラグ
+    stream.write_one_bit(color_of(pc));
+  }
+
+  // 盤面の駒を1枚streamから読み込む
+  Piece read_board_piece_from_stream()
+  {
+    PieceType pr = NO_PIECE_TYPE;
+    int code = 0, bits = 0;
+    while (true)
+    {
+      code |= stream.read_one_bit() << bits;
+      ++bits;
+
+      assert(bits <= 6);
+
+      for (pr = NO_PIECE_TYPE; pr < KING; ++pr)
+        if (huffman_table[pr].code == code
+          && huffman_table[pr].bits == bits)
+          goto Found;
+    }
+  Found:;
+    if (pr == NO_PIECE_TYPE)
+      return NO_PIECE;
+
+    // 先後フラグ
+    Color c = (Color)stream.read_one_bit();
+    
+    return make_piece(c, pr);
+  }
+};
+
+
+// -----------------------------------
+//        Positionクラスに追加
+// -----------------------------------
+
+// 高速化のために直接unpackする関数を追加。かなりしんどい。
+// packer::unpack()とPosition::set()とを合体させて書く。
+// 渡された局面に問題があって、エラーのときは非0を返す。
+int Position::set_from_packed_sfen(const PackedSfen& sfen , StateInfo * si, Thread* th, bool mirror)
+{
+	SfenPacker packer;
+	auto& stream = packer.stream;
+	stream.set_data((uint8_t*)&sfen);
+
+	std::memset(this, 0, sizeof(Position));
+	std::memset(si, 0, sizeof(StateInfo));
+	st = si;
+
+	// Active color
+	sideToMove = (Color)stream.read_one_bit();
+
+	// evalListのclear。上でmemsetでゼロクリアしたときにクリアされているが…。
+	evalList.clear();
+
+	// PieceListを更新する上で、どの駒がどこにあるかを設定しなければならないが、
+	// それぞれの駒をどこまで使ったかのカウンター
+	PieceNumber piece_no_count[KING] = {
+    PIECE_NUMBER_ZERO,
+    PIECE_NUMBER_PAWN,
+    PIECE_NUMBER_KNIGHT,
+		PIECE_NUMBER_BISHOP,
+    PIECE_NUMBER_ROOK,
+  };
+
+  pieceList[W_KING][0] = SQUARE_NB;
+  pieceList[B_KING][0] = SQUARE_NB;
+
+	// まず玉の位置
+	if (mirror)
+	{
+		for (auto c : Colors)
+			board[Mir((Square)stream.read_n_bit(7))] = make_piece(c, KING);
+	}
+	else
+	{
+		for (auto c : Colors)
+			board[stream.read_n_bit(7)] = make_piece(c, KING);
+	}
+
+  // Piece placement
+  for (Rank r = RANK_8; r >= RANK_1; --r)
+  {
+    for (File f = FILE_A; f <= FILE_H; ++f)
+    {
+      auto sq = make_square(f, r);
+      if (mirror) {
+        sq = Mir(sq);
+      }
+
+      // すでに玉がいるようだ
+      Piece pc;
+      if (type_of(board[sq]) != KING)
+      {
+        assert(board[sq] == NO_PIECE);
+        pc = packer.read_board_piece_from_stream();
+      }
+      else
+      {
+        pc = board[sq];
+        board[sq] = NO_PIECE; // いっかい取り除いておかないとput_piece()でASSERTに引っかかる。
+      }
+
+      // 駒がない場合もあるのでその場合はスキップする。
+      if (pc == NO_PIECE)
+        continue;
+
+      put_piece(Piece(pc), sq);
+
+      // evalListの更新
+      PieceNumber piece_no =
+        (pc == B_KING) ? PIECE_NUMBER_BKING : // 先手玉
+        (pc == W_KING) ? PIECE_NUMBER_WKING : // 後手玉
+        piece_no_count[type_of(pc)]++; // それ以外
+
+      evalList.put_piece(piece_no, sq, pc); // sqの升にpcの駒を配置する
+
+      //cout << sq << ' ' << board[sq] << ' ' << stream.get_cursor() << endl;
+
+      if (stream.get_cursor() > 256)
+        return 1;
+      //assert(stream.get_cursor() <= 256);
+
+    }
+  }
+
+  // Castling availability.
+  // TODO(someone): Support chess960.
+  st->castlingRights = 0;
+  if (stream.read_one_bit()) {
+    Square rsq;
+    for (rsq = relative_square(WHITE, SQ_H1); piece_on(rsq) != W_ROOK; --rsq) {}
+    set_castling_right(WHITE, rsq);
+  }
+  if (stream.read_one_bit()) {
+    Square rsq;
+    for (rsq = relative_square(WHITE, SQ_A1); piece_on(rsq) != W_ROOK; ++rsq) {}
+    set_castling_right(WHITE, rsq);
+  }
+  if (stream.read_one_bit()) {
+    Square rsq;
+    for (rsq = relative_square(BLACK, SQ_H1); piece_on(rsq) != W_ROOK; --rsq) {}
+    set_castling_right(BLACK, rsq);
+  }
+  if (stream.read_one_bit()) {
+    Square rsq;
+    for (rsq = relative_square(BLACK, SQ_A1); piece_on(rsq) != W_ROOK; ++rsq) {}
+    set_castling_right(BLACK, rsq);
+  }
+
+  // En passant square. Ignore if no pawn capture is possible
+  if (stream.read_one_bit()) {
+    Square ep_square = static_cast<Square>(stream.read_n_bit(6));
+    st->epSquare = ep_square;
+
+    if (!(attackers_to(st->epSquare) & pieces(sideToMove, PAWN))
+      || !(pieces(~sideToMove, PAWN) & (st->epSquare + pawn_push(~sideToMove))))
+      st->epSquare = SQ_NONE;
+  }
+
+  // Halfmove clock
+  st->rule50 = static_cast<Square>(stream.read_n_bit(6));
+
+  // Fullmove number
+  gamePly = static_cast<Square>(stream.read_n_bit(8));
+  // Convert from fullmove starting from 1 to gamePly starting from 0,
+  // handle also common incorrect FEN with fullmove = 0.
+  gamePly = std::max(2 * (gamePly - 1), 0) + (sideToMove == BLACK);
+
+  assert(stream.get_cursor() <= 256);
+
+  chess960 = false;
+  thisThread = th;
+	set_state(st);
+
+  assert(pos_is_ok());
+#if defined(EVAL_NNUE)
+  assert(evalList.is_valid(*this));
+#endif  // defined(EVAL_NNUE)
+
+	return 0;
+}
+
+// 盤面と手駒、手番を与えて、そのsfenを返す。
+//std::string Position::sfen_from_rawdata(Piece board[81], Hand hands[2], Color turn, int gamePly_)
+//{
+//  // 内部的な構造体にコピーして、sfen()を呼べば、変換過程がそこにしか依存していないならば
+//  // これで正常に変換されるのでは…。
+//  Position pos;
+//
+//  memcpy(pos.board, board, sizeof(Piece) * 81);
+//  memcpy(pos.hand, hands, sizeof(Hand) * 2);
+//  pos.sideToMove = turn;
+//  pos.gamePly = gamePly_;
+//
+//  return pos.sfen();
+//
+//  // ↑の実装、美しいが、いかんせん遅い。
+//  // 棋譜を大量に読み込ませて学習させるときにここがボトルネックになるので直接unpackする関数を書く。
+//}
+
+// packされたsfenを得る。引数に指定したバッファに返す。
+void Position::sfen_pack(PackedSfen& sfen)
+{
+  SfenPacker sp;
+  sp.data = (uint8_t*)&sfen;
+  sp.pack(*this);
+}
+
+//// packされたsfenを解凍する。sfen文字列が返る。
+//std::string Position::sfen_unpack(const PackedSfen& sfen)
+//{
+//  SfenPacker sp;
+//  sp.data = (uint8_t*)&sfen;
+//  return sp.unpack();
+//}
+
+
+#endif // USE_SFEN_PACKER
+
diff --git a/src/learn/gensfen2019.cpp b/src/learn/gensfen2019.cpp
new file mode 100644
index 00000000..01293b9c
--- /dev/null
+++ b/src/learn/gensfen2019.cpp
@@ -0,0 +1 @@
+// just a place holder
diff --git a/src/learn/half_float.h b/src/learn/half_float.h
new file mode 100644
index 00000000..31dc5a29
--- /dev/null
+++ b/src/learn/half_float.h
@@ -0,0 +1,133 @@
+﻿#ifndef __HALF_FLOAT_H__
+#define __HALF_FLOAT_H__
+
+// Half Float Library by yaneurao
+// (16-bit float)
+
+// 16bit型による浮動小数点演算
+// コンパイラの生成するfloat型のコードがIEEE 754の形式であると仮定して、それを利用する。
+
+#include "../types.h"
+
+namespace HalfFloat
+{
+	// IEEE 754 float 32 format is :
+	//   sign(1bit) + exponent(8bits) + fraction(23bits) = 32bits
+	//
+	// Our float16 format is :
+	//   sign(1bit) + exponent(5bits) + fraction(10bits) = 16bits
+	union float32_converter
+	{
+		int32_t n;
+		float f;
+	};
+
+
+	// 16-bit float
+	struct float16
+	{
+		// --- constructors
+
+		float16() {}
+		float16(int16_t n) { from_float((float)n);  }
+		float16(int32_t n) { from_float((float)n); }
+		float16(float n) { from_float(n); }
+		float16(double n) { from_float((float)n); }
+
+		// build from a float
+		void from_float(float f) { *this = to_float16(f); }
+
+		// --- implicit converters
+
+		operator int32_t() const { return (int32_t)to_float(*this); }
+		operator float() const { return to_float(*this); }
+		operator double() const { return double(to_float(*this)); }
+
+		// --- operators
+
+		float16 operator += (float16 rhs) { from_float(to_float(*this) + to_float(rhs)); return *this; }
+		float16 operator -= (float16 rhs) { from_float(to_float(*this) - to_float(rhs)); return *this; }
+		float16 operator *= (float16 rhs) { from_float(to_float(*this) * to_float(rhs)); return *this; }
+		float16 operator /= (float16 rhs) { from_float(to_float(*this) / to_float(rhs)); return *this; }
+		float16 operator + (float16 rhs) const { return float16(*this) += rhs; }
+		float16 operator - (float16 rhs) const { return float16(*this) -= rhs; }
+		float16 operator * (float16 rhs) const { return float16(*this) *= rhs; }
+		float16 operator / (float16 rhs) const { return float16(*this) /= rhs; }
+		float16 operator - () const { return float16(-to_float(*this)); }
+		bool operator == (float16 rhs) const { return this->v_ == rhs.v_; }
+		bool operator != (float16 rhs) const { return !(*this == rhs); }
+
+		static void UnitTest() { unit_test(); }
+
+	private:
+
+		// --- entity
+
+		uint16_t v_;
+
+		// --- conversion between float and float16
+
+		static float16 to_float16(float f)
+		{
+			float32_converter c;
+			c.f = f;
+			u32 n = c.n;
+
+			// The sign bit is MSB in common.
+			uint16_t sign_bit = (n >> 16) & 0x8000;
+
+			// The exponent of IEEE 754's float 32 is biased +127 , so we change this bias into +15 and limited to 5-bit.
+			uint16_t exponent = (((n >> 23) - 127 + 15) & 0x1f) << 10;
+
+			// The fraction is limited to 10-bit.
+			uint16_t fraction = (n >> (23-10)) & 0x3ff;
+
+			float16 f_;
+			f_.v_ = sign_bit | exponent | fraction;
+
+			return f_;
+		}
+
+		static float to_float(float16 v)
+		{
+			u32 sign_bit = (v.v_ & 0x8000) << 16;
+			u32 exponent = ((((v.v_ >> 10) & 0x1f) - 15 + 127) & 0xff) << 23;
+			u32 fraction = (v.v_ & 0x3ff) << (23 - 10);
+
+			float32_converter c;
+			c.n = sign_bit | exponent | fraction;
+			return c.f;
+		}
+
+		// unit testになってないが、一応計算が出来ることは確かめた。コードはあとでなおす(かも)。
+		static void unit_test()
+		{
+			float16 a, b, c, d;
+			a = 1;
+			std::cout << (float)a << std::endl;
+			b = -118.625;
+			std::cout << (float)b << std::endl;
+			c = 2.5;
+			std::cout << (float)c << std::endl;
+			d = a + c;
+			std::cout << (float)d << std::endl;
+
+			c *= 1.5;
+			std::cout << (float)c << std::endl;
+
+			b /= 3;
+			std::cout << (float)b << std::endl;
+
+			float f1 = 1.5;
+			a += f1;
+			std::cout << (float)a << std::endl;
+
+			a += f1 * (float)a;
+			std::cout << (float)a << std::endl;
+		}
+
+	};
+
+}
+
+#endif // __HALF_FLOAT_H__
diff --git a/src/learn/learn.h b/src/learn/learn.h
new file mode 100644
index 00000000..58a017bd
--- /dev/null
+++ b/src/learn/learn.h
@@ -0,0 +1,237 @@
+﻿#ifndef _LEARN_H_
+#define _LEARN_H_
+
+#if defined(EVAL_LEARN)
+
+#include <vector>
+
+// =====================
+//  学習時の設定
+// =====================
+
+// 以下のいずれかを選択すれば、そのあとの細々したものは自動的に選択される。
+// いずれも選択しない場合は、そのあとの細々したものをひとつひとつ設定する必要がある。
+
+// elmo方式での学習設定。これをデフォルト設定とする。
+// 標準の雑巾絞りにするためにはlearnコマンドで "lambda 1"を指定してやれば良い。
+#define LEARN_ELMO_METHOD
+
+
+// ----------------------
+//        更新式
+// ----------------------
+
+// AdaGrad。これが安定しているのでお勧め。
+// #define ADA_GRAD_UPDATE
+
+// 勾配の符号だけ見るSGD。省メモリで済むが精度は…。
+// #define SGD_UPDATE
+
+// ----------------------
+//    学習時の設定
+// ----------------------
+
+// mini-batchサイズ。
+// この数だけの局面をまとめて勾配を計算する。
+// 小さくするとupdate_weights()の回数が増えるので収束が速くなる。勾配が不正確になる。
+// 大きくするとupdate_weights()の回数が減るので収束が遅くなる。勾配は正確に出るようになる。
+// 多くの場合において、この値を変更する必要はないと思う。
+
+#define LEARN_MINI_BATCH_SIZE (1000 * 1000 * 1)
+
+// ファイルから1回に読み込む局面数。これだけ読み込んだあとshuffleする。
+// ある程度大きいほうが良いが、この数×40byte×3倍ぐらいのメモリを消費する。10M局面なら400MB*3程度消費する。
+// THREAD_BUFFER_SIZE(=10000)の倍数にすること。
+
+#define LEARN_SFEN_READ_SIZE (1000 * 1000 * 10)
+
+// 学習時の評価関数の保存間隔。この局面数だけ学習させるごとに保存。
+// 当然ながら、保存間隔を長くしたほうが学習時間は短くなる。
+// フォルダ名は 0/ , 1/ , 2/ ...のように保存ごとにインクリメントされていく。
+// デフォルトでは10億局面に1回。
+#define LEARN_EVAL_SAVE_INTERVAL (1000000000ULL)
+
+
+// ----------------------
+//    目的関数の選択
+// ----------------------
+
+// 目的関数が勝率の差の二乗和
+// 詳しい説明は、learner.cppを見ること。
+
+//#define LOSS_FUNCTION_IS_WINNING_PERCENTAGE
+
+// 目的関数が交差エントロピー
+// 詳しい説明は、learner.cppを見ること。
+// いわゆる、普通の「雑巾絞り」
+//#define LOSS_FUNCTION_IS_CROSS_ENTOROPY
+
+// 目的関数が交差エントロピーだが、勝率の関数を通さない版
+// #define LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE
+
+// elmo(WCSC27)の方式
+// #define LOSS_FUNCTION_IS_ELMO_METHOD
+
+// ※　他、色々追加するかも。
+
+
+// ----------------------
+// 学習に関するデバッグ設定
+// ----------------------
+
+// 学習時のrmseの出力をこの回数に1回に減らす。
+// rmseの計算は1スレッドで行なうためそこそこ時間をとられるので出力を減らすと効果がある。
+#define LEARN_RMSE_OUTPUT_INTERVAL 1
+
+
+// ----------------------
+// ゼロベクトルからの学習
+// ----------------------
+
+// 評価関数パラメーターをゼロベクトルから学習を開始する。
+// ゼロ初期化して棋譜生成してゼロベクトルから学習させて、
+// 棋譜生成→学習を繰り返すとプロの棋譜に依らないパラメーターが得られる。(かも)
+// (すごく時間かかる)
+
+//#define RESET_TO_ZERO_VECTOR
+
+
+// ----------------------
+//  学習のときの浮動小数
+// ----------------------
+
+// これをdoubleにしたほうが計算精度は上がるが、重み配列絡みのメモリが倍必要になる。
+// 現状、ここをfloatにした場合、評価関数ファイルに対して、重み配列はその4.5倍のサイズ。(KPPTで4.5GB程度)
+// double型にしても収束の仕方にほとんど差異がなかったのでfloatに固定する。
+
+// floatを使う場合
+typedef float LearnFloatType;
+
+// doubleを使う場合
+//typedef double LearnFloatType;
+
+// float16を使う場合
+//#include "half_float.h"
+//typedef HalfFloat::float16 LearnFloatType;
+
+// ----------------------
+//  省メモリ化
+// ----------------------
+
+// Weight配列(のうちのKPP)に三角配列を用いて省メモリ化する。
+// これを用いると、学習用の重み配列は評価関数ファイルの3倍程度で済むようになる。
+
+#define USE_TRIANGLE_WEIGHT_ARRAY
+
+// ----------------------
+//  次元下げ
+// ----------------------
+
+// ミラー(左右対称性)、インバース(先後対称性)に関して次元下げを行なう。
+// デフォルトではすべてオン。
+
+// KKに対してミラー、インバースを利用した次元下げを行なう。(効果のほどは不明)
+// USE_KK_INVERSE_WRITEをオンにするときはUSE_KK_MIRROR_WRITEもオンでなければならない。
+#define USE_KK_MIRROR_WRITE
+#define USE_KK_INVERSE_WRITE
+
+// KKPに対してミラー、インバースを利用した次元下げを行なう。(インバースのほうは効果のほどは不明)
+// USE_KKP_INVERSE_WRITEをオンにするときは、USE_KKP_MIRROR_WRITEもオンになっていなければならない。
+#define USE_KKP_MIRROR_WRITE
+#define USE_KKP_INVERSE_WRITE
+
+// KPPに対してミラーを利用した次元下げを行なう。(これをオフにすると教師局面が倍ぐらい必要になる)
+// KPPにはインバースはない。(先手側のKしかないので)
+#define USE_KPP_MIRROR_WRITE
+
+// KPPPに対してミラーを利用した次元下げを行なう。(これをオフにすると教師局面が倍ぐらい必要になる)
+// KPPPにもインバースはない。(先手側のKしかないので)
+#define USE_KPPP_MIRROR_WRITE
+
+// KKPP成分に対して学習時にKPPによる次元下げを行なう。
+// 学習、めっちゃ遅くなる。
+// 未デバッグなので使わないこと。
+//#define USE_KKPP_LOWER_DIM
+
+
+// ======================
+//  教師局面生成時の設定
+// ======================
+
+// ----------------------
+//  引き分けを書き出す
+// ----------------------
+
+// 引き分けに至ったとき、それを教師局面として書き出す
+// これをするほうが良いかどうかは微妙。
+// #define LEARN_GENSFEN_USE_DRAW_RESULT
+
+
+// ======================
+//       configure
+// ======================
+
+// ----------------------
+//  elmo(WCSC27)の方法での学習
+// ----------------------
+
+#if defined( LEARN_ELMO_METHOD )
+#define LOSS_FUNCTION_IS_ELMO_METHOD
+#define ADA_GRAD_UPDATE
+#endif
+
+
+// ----------------------
+// Learnerで用いるstructの定義
+// ----------------------
+#include "../position.h"
+
+namespace Learner
+{
+	// PackedSfenと評価値が一体化した構造体
+	// オプションごとに書き出す内容が異なると教師棋譜を再利用するときに困るので
+	// とりあえず、以下のメンバーはオプションによらずすべて書き出しておく。
+	struct PackedSfenValue
+	{
+		// 局面
+		PackedSfen sfen;
+
+		// Learner::search()から返ってきた評価値
+		int16_t score;
+
+		// PVの初手
+		// 教師との指し手一致率を求めるときなどに用いる
+		uint16_t move;
+
+		// 初期局面からの局面の手数。
+		uint16_t gamePly;
+
+		// この局面の手番側が、ゲームを最終的に勝っているなら1。負けているなら-1。
+		// 引き分けに至った場合は、0。
+		// 引き分けは、教師局面生成コマンドgensfenにおいて、
+		// LEARN_GENSFEN_DRAW_RESULTが有効なときにだけ書き出す。
+		int8_t game_result;
+
+		// 教師局面を書き出したファイルを他の人とやりとりするときに
+		// この構造体サイズが不定だと困るため、paddingしてどの環境でも必ず40bytesになるようにしておく。
+		uint8_t padding;
+
+		// 32 + 2 + 2 + 2 + 1 + 1 = 40bytes
+	};
+
+	// 読み筋とそのときの評価値を返す型
+	// Learner::search() , Learner::qsearch()で用いる。
+	typedef std::pair<Value, std::vector<Move> > ValueAndPV;
+
+	// いまのところ、やねうら王2018 Otafukuしか、このスタブを持っていないが
+	// EVAL_LEARNをdefineするなら、このスタブが必須。
+	extern Learner::ValueAndPV  search(Position& pos, int depth , size_t multiPV = 1 , uint64_t NodesLimit = 0);
+	extern Learner::ValueAndPV qsearch(Position& pos);
+
+	double calc_grad(Value shallow, const PackedSfenValue& psv);
+
+}
+
+#endif
+
+#endif // ifndef _LEARN_H_
\ No newline at end of file
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
new file mode 100644
index 00000000..0e904650
--- /dev/null
+++ b/src/learn/learner.cpp
@@ -0,0 +1,2922 @@
+﻿// 学習関係のルーチン
+//
+// 1) 棋譜の自動生成
+//   → "gensfen"コマンド
+// 2) 生成した棋譜からの評価関数パラメーターの学習
+//   → "learn"コマンド
+//   → 教師局面のshuffleもこのコマンドの拡張として行なう。
+//   例) "learn shuffle"
+// 3) 定跡の自動生成
+//   → "makebook think"コマンド
+//   → extra/book/book.cppで実装
+// 4) 局後自動検討モード
+//   →　GUIが補佐すべき問題なのでエンジンでは関与しないことにする。
+// etc..
+
+#if defined(EVAL_LEARN)
+
+#include <random>
+
+#include "learn.h"
+#include "multi_think.h"
+#include "../uci.h"
+
+// 学習用のevaluate絡みのheader
+#include "../eval/evaluate_common.h"
+
+// ----------------------
+// 設定内容に基づく定数文字列
+// ----------------------
+
+// 更新式に応じた文字列。(デバッグ用に出力する。)
+// 色々更新式を実装したがAdaGradが速度面、メモリ面においてベストという結論になった。
+#if defined(ADA_GRAD_UPDATE)
+#define LEARN_UPDATE "AdaGrad"
+#elif defined(SGD_UPDATE)
+#define LEARN_UPDATE "SGD"
+#endif
+
+#if defined(LOSS_FUNCTION_IS_WINNING_PERCENTAGE)
+#define LOSS_FUNCTION "WINNING_PERCENTAGE"
+#elif defined(LOSS_FUNCTION_IS_CROSS_ENTOROPY)
+#define LOSS_FUNCTION "CROSS_ENTOROPY"
+#elif defined(LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE)
+#define LOSS_FUNCTION "CROSS_ENTOROPY_FOR_VALUE"
+#elif defined(LOSS_FUNCTION_IS_ELMO_METHOD)
+#define LOSS_FUNCTION "ELMO_METHOD(WCSC27)"
+#endif
+
+// -----------------------------------
+//    以下、実装部。
+// -----------------------------------
+
+#include <sstream>
+#include <fstream>
+#include <unordered_set>
+#include <iomanip>
+#include <list>
+#include <cmath>	// std::exp(),std::pow(),std::log()
+#include <cstring>	// memcpy()
+
+#if defined (_OPENMP)
+#include <omp.h>
+#endif
+
+#if defined(_MSC_VER)
+// C++のfilesystemは、C++17以降か、MSVCでないと使えないようだ。
+// windows.hを使うようにしたが、msys2のg++だとうまくフォルダ内のファイルが取得できない。
+// 仕方ないのでdirent.hを用いる。
+#include <filesystem>
+#elif defined(__GNUC__)
+#include <dirent.h>
+#endif
+
+#include "../misc.h"
+#include "../thread.h"
+#include "../position.h"
+//#include "../extra/book/book.h"
+#include "../tt.h"
+#include "multi_think.h"
+
+#if defined(EVAL_NNUE)
+#include "../eval/nnue/evaluate_nnue_learner.h"
+#include <shared_mutex>
+#endif
+
+using namespace std;
+
+//// これは探索部で定義されているものとする。
+//extern Book::BookMoveSelector book;
+
+// atomic<T>に対する足し算、引き算の定義
+// Apery/learner.hppにあるatomicAdd()に合わせてある。
+template <typename T>
+T operator += (std::atomic<T>& x, const T rhs)
+{
+	T old = x.load(std::memory_order_consume);
+	// このタイミングで他スレッドから値が書き換えられることは許容する。
+	// 値が破壊されなければ良しという考え。
+	T desired = old + rhs;
+	while (!x.compare_exchange_weak(old, desired, std::memory_order_release, std::memory_order_consume))
+		desired = old + rhs;
+	return desired;
+}
+template <typename T>
+T operator -= (std::atomic<T>& x, const T rhs) { return x += -rhs; }
+
+namespace Learner
+{
+
+// 局面の配列 : PSVector は packed sfen vector の略。
+typedef std::vector<PackedSfenValue> PSVector;
+
+// -----------------------------------
+//    局面のファイルへの書き出し
+// -----------------------------------
+
+// Sfenを書き出して行くためのヘルパクラス
+struct SfenWriter
+{
+	// 書き出すファイル名と生成するスレッドの数
+	SfenWriter(string filename, int thread_num)
+	{
+		sfen_buffers_pool.reserve((size_t)thread_num * 10);
+		sfen_buffers.resize(thread_num);
+
+		// 追加学習するとき、評価関数の学習後も生成される教師の質はあまり変わらず、教師局面数を稼ぎたいので
+		// 古い教師も使うのが好ましいのでこういう仕様にしてある。
+		fs.open(filename, ios::out | ios::binary | ios::app);
+		filename_ = filename;
+
+		finished = false;
+	}
+
+	~SfenWriter()
+	{
+		finished = true;
+		file_worker_thread.join();
+		fs.close();
+
+		// file_worker_threadがすべて書き出したあとなのでbufferはすべて空のはずなのだが..
+		for (auto p : sfen_buffers) { assert(p == nullptr); }
+		assert(sfen_buffers_pool.empty());
+	}
+
+	// 各スレッドについて、この局面数ごとにファイルにflushする。
+	const size_t SFEN_WRITE_SIZE = 5000;
+
+	// 局面と評価値をペアにして1つ書き出す(packされたsfen形式で)
+	void write(size_t thread_id, const PackedSfenValue& psv)
+	{
+		// スレッドごとにbufferを持っていて、そこに追加する。
+		// bufferが溢れたら、ファイルに書き出す。
+
+		// このバッファはスレッドごとに用意されている。
+		auto& buf = sfen_buffers[thread_id];
+
+		// 初回とスレッドバッファを書き出した直後はbufがないので確保する。
+		if (!buf)
+		{
+			buf = new PSVector();
+			buf->reserve(SFEN_WRITE_SIZE);
+		}
+
+		// スレッドごとに用意されており、一つのスレッドが同時にこのwrite()関数を呼び出さないので
+		// この時点では排他する必要はない。
+		buf->push_back(psv);
+
+		if (buf->size() >= SFEN_WRITE_SIZE)
+		{
+			// sfen_buffers_poolに積んでおけばあとはworkerがよきに計らってくれる。
+
+			// sfen_buffers_poolの内容を変更するときはmutexのlockが必要。
+			std::unique_lock<Mutex> lk(mutex);
+			sfen_buffers_pool.push_back(buf);
+
+			buf = nullptr;
+			// buf == nullptrにしておけば次回にこの関数が呼び出されたときにバッファは確保される。
+		}
+	}
+
+	// 自分のスレッド用のバッファに残っている分をファイルに書き出すためのバッファに移動させる。
+	void finalize(size_t thread_id)
+	{
+		std::unique_lock<Mutex> lk(mutex);
+
+		auto& buf = sfen_buffers[thread_id];
+
+		// buf==nullptrであるケースもあるのでそのチェックが必要。
+		if (buf && buf->size() != 0)
+			sfen_buffers_pool.push_back(buf);
+
+		buf = nullptr;
+	}
+
+	// write_workerスレッドを開始する。
+	void start_file_write_worker()
+	{
+		file_worker_thread = std::thread([&] { this->file_write_worker(); });
+	}
+
+	// ファイルに書き出すの専用スレッド
+	void file_write_worker()
+	{
+		auto output_status = [&]()
+		{
+			// 現在時刻も出力
+			sync_cout << endl << sfen_write_count << " sfens , at " << now_string() << sync_endl;
+
+			// flush()はこのタイミングで十分。
+			fs.flush();
+		};
+
+		while (!finished || sfen_buffers_pool.size())
+		{
+			vector<PSVector*> buffers;
+			{
+				std::unique_lock<Mutex> lk(mutex);
+
+				// まるごとコピー
+				buffers = sfen_buffers_pool;
+				sfen_buffers_pool.clear();
+			}
+
+			// 何も取得しなかったならsleep()
+			if (!buffers.size())
+				sleep(100);
+			else
+			{
+				for (auto ptr : buffers)
+				{
+					fs.write((const char*)&((*ptr)[0]), sizeof(PackedSfenValue) * ptr->size());
+
+					sfen_write_count += ptr->size();
+
+#if 1
+					// 処理した件数をここに加算していき、save_everyを超えたら、ファイル名を変更し、このカウンターをリセットする。
+					save_every_counter += ptr->size();
+					if (save_every_counter >= save_every)
+					{
+						save_every_counter = 0;
+						// ファイル名を変更。
+
+						fs.close();
+
+						// ファイルにつける連番
+						int n = (int)(sfen_write_count / save_every);
+						// ファイル名を変更して再度openする。上書き考慮してios::appをつけておく。(運用によっては、ないほうがいいかも..)
+						string filename = filename_ + "_" + std::to_string(n);
+						fs.open(filename, ios::out | ios::binary | ios::app);
+						cout << endl << "output sfen file = " << filename << endl;
+					}
+#endif
+
+					// 棋譜を書き出すごとに'.'を出力。
+					std::cout << ".";
+
+					// 40回ごとに処理した局面数を出力
+					// 最後、各スレッドの教師局面の余りを書き出すので中途半端な数が表示されるが、まあいいか…。
+					// スレッドを論理コアの最大数まで酷使するとコンソールが詰まるのでもう少し間隔甘くてもいいと思う。
+					if ((++time_stamp_count % 40) == 0)
+						output_status();
+
+					// このメモリは不要なのでこのタイミングで開放しておく。
+					delete ptr;
+				}
+			}
+		}
+
+		// 終了前にもう一度、タイムスタンプを出力。
+		output_status();
+	}
+
+	// この単位でファイル名を変更する。
+	uint64_t save_every = UINT64_MAX;
+
+private:
+
+	fstream fs;
+
+	// コンストラクタで渡されたファイル名
+	std::string filename_;
+
+	// 処理した件数をここに加算していき、save_everyを超えたら、ファイル名を変更し、このカウンターをリセットする。
+	uint64_t save_every_counter = 0;
+
+	// ファイルに書き込む用のthread
+	std::thread file_worker_thread;
+	// すべてのスレッドが終了したかのフラグ
+	atomic<bool> finished;
+
+	// タイムスタンプの出力用のカウンター
+	uint64_t time_stamp_count = 0;
+
+	// ファイルに書き出す前のバッファ
+	// sfen_buffersは各スレッドに対するバッファ
+	// sfen_buffers_poolは書き出しのためのバッファ。
+	// 前者のバッファに局面をSFEN_WRITE_SIZEだけ積んだら、後者に積み替える。
+	std::vector<PSVector*> sfen_buffers;
+	std::vector<PSVector*> sfen_buffers_pool;
+
+	// sfen_buffers_poolにアクセスするときに必要なmutex
+	Mutex mutex;
+
+	// 書きだした局面の数
+	uint64_t sfen_write_count = 0;
+};
+
+// -----------------------------------
+//  棋譜を生成するworker(スレッドごと)
+// -----------------------------------
+
+// 複数スレッドでsfenを生成するためのクラス
+struct MultiThinkGenSfen : public MultiThink
+{
+	MultiThinkGenSfen(int search_depth_, int search_depth2_, SfenWriter& sw_)
+		: search_depth(search_depth_), search_depth2(search_depth2_), sw(sw_)
+	{
+		hash.resize(GENSFEN_HASH_SIZE);
+
+		// PCを並列化してgensfenするときに同じ乱数seedを引いていないか確認用の出力。
+		std::cout << prng << std::endl;
+	}
+
+	virtual void thread_worker(size_t thread_id);
+	void start_file_write_worker() { sw.start_file_write_worker(); }
+
+	//  search_depth = 通常探索の探索深さ
+	int search_depth;
+	int search_depth2;
+
+	// 生成する局面の評価値の上限
+	int eval_limit;
+
+	// ランダムムーブを行なう最小ply
+	int random_move_minply;
+	// ランダムムーブを行なう最大ply
+	int random_move_maxply;
+	// 1局のなかでランダムムーブを行なう回数
+	int random_move_count;
+	// Aperyのようにランダムムーブのときに1/Nの確率で玉を動かす。
+	// また玉を動かしたときは1/Nの確率で相手番で1回ランダムムーブする。
+	// AperyはN=2。ここ0を指定するとこの機能を無効化する。
+	int random_move_like_apery;
+
+	// ランダムムーブの代わりにmulti pvを使うとき用。
+	// random_multi_pvは、MultiPVのときの候補手の数。
+	// 候補手の指し手を採択するとき、1位の指し手の評価値とN位の指し手の評価値との差が
+	// random_multi_pv_diffの範囲でなければならない。
+	// random_multi_pv_depthはMultiPVのときの探索深さ。
+	int random_multi_pv;
+	int random_multi_pv_diff;
+	int random_multi_pv_depth;
+
+	// 書き出す局面のply(初期局面からの手数)の最小、最大。
+	int write_minply;
+	int write_maxply;
+
+	// sfenの書き出し器
+	SfenWriter& sw;
+
+	// 同一局面の書き出しを制限するためのhash
+	// hash_indexを求めるためのmaskに使うので、2**Nでなければならない。
+	static const uint64_t GENSFEN_HASH_SIZE = 64 * 1024 * 1024;
+
+	vector<Key> hash; // 64MB*sizeof(HASH_KEY) = 512MB
+};
+
+//  thread_id    = 0..Threads.size()-1
+void MultiThinkGenSfen::thread_worker(size_t thread_id)
+{
+	// とりあえず、書き出す手数の最大のところで引き分け扱いになるものとする。
+	const int MAX_PLY2 = write_maxply;
+
+	// StateInfoを最大手数分 + SearchのPVでleafにまで進めるbuffer
+	std::vector<StateInfo,AlignedAllocator<StateInfo>> states(MAX_PLY2 + MAX_PLY /* == search_depth + α */);
+	StateInfo si;
+
+	// 今回の指し手。この指し手で局面を進める。
+	Move m = MOVE_NONE;
+
+	// 終了フラグ
+	bool quit = false;
+
+	// 規定回数回になるまで繰り返し
+	while (!quit)
+	{
+		// Positionに対して従属スレッドの設定が必要。
+		// 並列化するときは、Threads (これが実体が vector<Thread*>なので、
+		// Threads[0]...Threads[thread_num-1]までに対して同じようにすれば良い。
+		auto th = Threads[thread_id];
+
+		auto& pos = th->rootPos;
+    pos.set(StartFEN, false, &si, th);
+
+		// 探索部で定義されているBookMoveSelectorのメンバを参照する。
+		//auto& book = ::book;
+
+		// 1局分の局面を保存しておき、終局のときに勝敗を含めて書き出す。
+		// 書き出す関数は、この下にあるflush_psv()である。
+		PSVector a_psv;
+		a_psv.reserve(MAX_PLY2 + MAX_PLY);
+
+		// a_psvに積まれている局面をファイルに書き出す。
+		// lastTurnIsWin : a_psvに積まれている最終局面の次の局面での勝敗
+		// 勝ちのときは1。負けのときは-1。引き分けのときは0を渡す。
+		// 返し値 : もう規定局面数に達したので終了する場合にtrue。
+		auto flush_psv = [&](int8_t lastTurnIsWin)
+		{
+			int8_t isWin = lastTurnIsWin;
+
+			// 終局の局面(の一つ前)から初手に向けて、各局面に関して、対局の勝敗の情報を付与しておく。
+			// a_psvに保存されている局面は(手番的に)連続しているものとする。
+			for (auto it = a_psv.rbegin(); it != a_psv.rend(); ++it)
+			{
+				// isWin == 0(引き分け)なら -1を掛けても 0(引き分け)のまま
+				isWin = - isWin;
+				it->game_result = isWin;
+
+				// 局面を書き出そうと思ったら規定回数に達していた。
+				// get_next_loop_count()内でカウンターを加算するので
+				// 局面を出力したときにこれを呼び出さないとカウンターが狂う。
+				auto loop_count = get_next_loop_count();
+				if (loop_count == UINT64_MAX)
+				{
+					// 終了フラグを立てておく。
+					quit = true;
+					return;
+				}
+
+				// 局面を一つ書き出す。
+				sw.write(thread_id, *it);
+
+#if 0
+				pos.set_from_packed_sfen(it->sfen);
+				cout << pos << "Win : " << it->isWin << " , " << it->score << endl;
+#endif
+			}
+		};
+
+		// ply手目でランダムムーブをするかどうかのフラグ
+		vector<bool> random_move_flag;
+		{
+			// ランダムムーブを入れるならrandom_move_maxply手目までに絶対にrandom_move_count回入れる。
+			// そこそこばらけて欲しい。
+			// どれくらいがベストなのかはよくわからない。色々条件を変えて実験中。
+			
+			// a[0] = 0 , a[1] = 1, ... みたいな配列を作って、これを
+			// Fisher-Yates shuffleして先頭のN個を取り出せば良い。
+			// 実際には、N個欲しいだけなので先頭N個分だけFisher-Yatesでshuffleすれば良い。
+
+			vector<int> a;
+			a.reserve((size_t)random_move_maxply);
+
+			// random_move_minply , random_move_maxplyは1 originで指定されるが、
+			// ここでは0 originで扱っているので注意。
+			for (int i = std::max(random_move_minply - 1 , 0) ; i < random_move_maxply; ++i)
+				a.push_back(i);
+
+			// Apery方式のランダムムーブの場合、insert()がrandom_move_count回呼び出される可能性があるので
+			// それを考慮したサイズだけ確保しておく。
+			random_move_flag.resize((size_t)random_move_maxply + random_move_count);
+
+			// a[]のsize()を超える回数のランダムムーブは適用できないので制限する。
+			for (int i = 0 ; i < std::min(random_move_count, (int)a.size()) ; ++i)
+			{
+				swap(a[i], a[prng.rand((uint64_t)a.size() - i) + i]);
+				random_move_flag[a[i]] = true;
+			}
+		}
+
+		// random moveを行なった回数をカウントしておくカウンター
+		// random_move_minply == -1のときに、連続してランダムムーブを行なうので、このときに用いる。
+		int random_move_c = 0;
+
+		// ply : 初期局面からの手数
+		for (int ply = 0; ; ++ply)
+		{
+			//cout << pos << endl;
+
+			// 今回の探索depth
+			// gotoで飛ぶので先に宣言しておく。
+			int depth = search_depth + (int)prng.rand(search_depth2 - search_depth + 1);
+
+			// 長手数に達したのか
+			if (ply >= MAX_PLY2)
+			{
+#if defined (LEARN_GENSFEN_USE_DRAW_RESULT)
+				// 勝敗 = 引き分けとして書き出す。
+				// こうしたほうが自分が入玉したときに、相手の入玉を許しにくい(かも)
+				flush_psv(0);
+#endif
+				break;
+			}
+
+			// 全駒されて詰んでいたりしないか？
+			if (pos.is_mated())
+			{
+        if (pos.checkers()) {
+          // (この局面の一つ前の局面までは書き出す)
+          // Write the packed fens if checkmate.
+          // Do not write if stalemate.
+          flush_psv(-1);
+        }
+				break;
+			}
+
+			//// 定跡
+			//if ((m = book.probe(pos)) != MOVE_NONE)
+			//{
+			//	// 定跡にhitした。
+			//	// その指し手はmに格納された。
+
+			//	// 定跡の局面は学習には用いない。
+			//	a_psv.clear();
+
+			//	if (random_move_minply != -1)
+			//		// 定跡の局面であっても、一定確率でランダムムーブは行なう。
+			//		goto RANDOM_MOVE;
+			//	else
+			//		// random_move_minplyとして-1が指定されているときは定跡を抜けるところまでは定跡に従って指す。
+			//		// 巨大定跡を用いて、ConsiderBookMoveCount trueとして定跡を抜けた局面を無数に用意しておき、
+			//		// そこから5回ランダムムーブを行なう、などの用途に用いる。
+			//		goto DO_MOVE;
+			//}
+
+			{
+				// search_depth～search_depth2 手読みの評価値とPV(最善応手列)
+				// 探索窓を狭めておいても問題ないはず。
+
+				auto pv_value1 = search(pos, depth);
+
+				auto value1 = pv_value1.first;
+				auto& pv1 = pv_value1.second;
+
+				// 評価値の絶対値がこの値以上の局面については
+				// その局面を学習に使うのはあまり意味がないのでこの試合を終了する。
+				// これをもって勝敗がついたという扱いをする。
+
+				// 1手詰め、宣言勝ちならば、ここでmate_in(2)が返るのでeval_limitの上限値と同じ値になり、
+				// このif式は必ず真になる。resignについても同様。
+
+				if (abs(value1) >= eval_limit)
+				{
+//					sync_cout << pos << "eval limit = " << eval_limit << " over , move = " << pv1[0] << sync_endl;
+
+					// この局面でvalue1 >= eval_limitならば、(この局面の手番側の)勝ちである。
+					flush_psv((value1 >= eval_limit) ? 1 : -1);
+					break;
+				}
+
+				// おかしな指し手の検証
+				if (pv1.size() > 0
+					&& (pv1[0] == MOVE_NONE || pv1[0] == MOVE_NULL)
+					)
+				{
+					// MOVE_WINは、この手前で宣言勝ちの局面であるかチェックしているので
+					// ここで宣言勝ちの指し手が返ってくることはないはず。
+					// また、MOVE_RESIGNのときvalue1は1手詰めのスコアであり、eval_limitの最小値(-31998)のはずなのだが…。
+					cout << "Error! : " << pos.fen() << m << value1 << endl;
+					break;
+				}
+
+				// 各千日手に応じた処理。
+
+        if (pos.is_draw(0)) {
+#if defined	(LEARN_GENSFEN_USE_DRAW_RESULT)
+          // 引き分けを書き出すとき
+          flush_psv(is_win);
+#endif
+          break;
+        }
+
+				// PVの指し手でleaf nodeまで進めて、そのleaf nodeでevaluate()を呼び出した値を用いる。
+				auto evaluate_leaf = [&](Position& pos , vector<Move>& pv)
+				{
+					auto rootColor = pos.side_to_move();
+
+					int ply2 = ply;
+					for (auto m : pv)
+					{
+						// デバッグ用の検証として、途中に非合法手が存在しないことを確認する。
+						// NULL_MOVEはこないものとする。
+
+						// 十分にテストしたのでコメントアウトで良い。
+#if 1
+						// 非合法手はやってこないはずなのだが。
+						// 宣言勝ちとmated()でないことは上でテストしているので
+						// 読み筋としてMOVE_WINとMOVE_RESIGNが来ないことは保証されている。(はずだが…)
+						if (!pos.pseudo_legal(m) || !pos.legal(m))
+						{
+							cout << "Error! : " << pos.fen() << m << endl;
+						}
+#endif
+						pos.do_move(m, states[ply2++]);
+						
+						// 毎ノードevaluate()を呼び出さないと、evaluate()の差分計算が出来ないので注意！
+						// depthが8以上だとこの差分計算はしないほうが速いと思われる。
+#if defined(EVAL_NNUE)
+            if (depth < 8)
+              Eval::evaluate_with_no_return(pos);
+#endif  // defined(EVAL_NNUE)
+					}
+
+					// leafに到達
+					//      cout << pos;
+
+					auto v = Eval::evaluate(pos);
+					// evaluate()は手番側の評価値を返すので、
+					// root_colorと違う手番なら、vを反転させて返さないといけない。
+					if (rootColor != pos.side_to_move())
+						v = -v;
+
+					// 巻き戻す。
+					// C++x14にもなって、いまだreverseで回すforeachすらないのか…。
+					//  for (auto it : boost::adaptors::reverse(pv))
+
+					for (auto it = pv.rbegin(); it != pv.rend(); ++it)
+						pos.undo_move(*it);
+
+					return v;
+				};
+
+#if 0
+				dbg_hit_on(pv_value1.first == leaf_value);
+				// gensfen depth 3 eval_limit 32000
+				// Total 217749 Hits 203579 hit rate (%) 93.490
+				// gensfen depth 6 eval_limit 32000
+				// Total 78407 Hits 69190 hit rate (%) 88.245
+				// gensfen depth 6 eval_limit 3000
+				// Total 53879 Hits 43713 hit rate (%) 81.132
+
+				// 置換表の指し手で枝刈りされるなどの問題。
+				// これ、教師としては少し気持ち悪いが…。
+#endif
+
+				// depth 0の場合、pvが得られていないのでdepth 2で探索しなおす。
+				if (search_depth <= 0)
+				{
+					pv_value1 = search(pos, 2);
+					pv1 = pv_value1.second;
+				}
+
+				// 初期局面周辺はは類似局面ばかりなので
+				// 学習に用いると過学習になりかねないから書き出さない。
+				// →　比較実験すべき
+				if (ply < write_minply - 1)
+				{
+					a_psv.clear();
+					goto SKIP_SAVE;
+				}
+
+				// 同一局面を書き出したところか？
+				// これ、複数のPCで並列して生成していると同じ局面が含まれることがあるので
+				// 読み込みのときにも同様の処理をしたほうが良い。
+				{
+					auto key = pos.key();
+					auto hash_index = (size_t)(key & (GENSFEN_HASH_SIZE - 1));
+					auto key2 = hash[hash_index];
+					if (key == key2)
+					{
+						// スキップするときはこれ以前に関する
+						// 勝敗の情報がおかしくなるので保存している局面をクリアする。
+						// どのみち、hashが合致した時点でそこ以前の局面も合致している可能性が高いから
+						// 書き出す価値がない。
+						a_psv.clear();
+						goto SKIP_SAVE;
+					}
+					hash[hash_index] = key; // 今回のkeyに入れ替えておく。
+				}
+
+				// 局面の一時保存。
+				{
+					a_psv.emplace_back(PackedSfenValue());
+					auto &psv = a_psv.back();
+					
+					// packを要求されているならpackされたsfenとそのときの評価値を書き出す。
+					// 最終的な書き出しは、勝敗がついてから。
+					pos.sfen_pack(psv.sfen);
+
+					// PV lineのleaf nodeでのroot colorから見たevaluate()の値を取得。
+					// search()の返し値をそのまま使うのとこうするのとの善悪は良くわからない。
+					psv.score = evaluate_leaf(pos, pv1);
+					psv.gamePly = ply;
+
+					// PVの初手を取り出す。これはdepth 0でない限りは存在するはず。
+					assert(pv_value1.second.size() >= 1);
+					Move pv_move1 = pv_value1.second[0];
+					psv.move = pv_move1;
+				}
+
+			SKIP_SAVE:;
+
+				// 何故かPVが得られなかった(置換表などにhitして詰んでいた？)ので次の対局に行く。
+				// かなりのレアケースなので無視して良いと思う。
+				if (pv1.size() == 0)
+					break;
+				
+				// search_depth手読みの指し手で局面を進める。
+				m = pv1[0];
+			}
+
+		RANDOM_MOVE:;
+
+			// 合法手のなかからランダムに1手選ぶフェーズ
+			if (
+				// 1. random_move_minplyからrandom_move_maxplyの間でrandom_move_count回のランダムムーブを行なうモード
+				(random_move_minply != -1 && ply < (int)random_move_flag.size() && random_move_flag[ply]) ||
+				// 2. 定跡を抜けたあとにまとめてrandom_move_count回のランダムムーブを行なうモード
+				(random_move_minply == -1 && random_move_c < random_move_count))
+			{
+				++random_move_c;
+
+				// mateではないので合法手が1手はあるはず…。
+				if (random_multi_pv == 0)
+				{
+					// 普通のランダムムーブ
+
+					MoveList<LEGAL> list(pos);
+
+					// ここをApery方式にするのとの善悪はよくわからない。
+					if (random_move_like_apery == 0
+						|| prng.rand(random_move_like_apery) != 0
+					)
+					{
+						// 普通に合法手から1手選択
+						m = list.at((size_t)prng.rand((uint64_t)list.size()));
+					}
+					else {
+						// 玉が動かせるなら玉を動かす
+						Move moves[8]; // 8近傍
+						Move* p = &moves[0];
+						for (auto& m : list)
+							if (type_of(pos.moved_piece(m)) == KING)
+								*(p++) = m;
+						size_t n = p - &moves[0];
+						if (n != 0)
+						{
+							// 玉を動かす指し手
+							m = moves[prng.rand(n)];
+
+							// Apery方式ではこのとき1/2の確率で相手もランダムムーブ
+							if (prng.rand(2) == 0)
+							{
+								// random_move_flag[ply]の次のところに"1"を追加するのがシンプルなhackか。
+								random_move_flag.insert(random_move_flag.begin() + ply + 1, 1, true);
+							}
+						}
+						else
+							// 普通に合法手から1手選択
+							m = list.at((size_t)prng.rand((uint64_t)list.size()));
+					}
+
+					// 玉の2手指しのコードを入れていたが、合法手から1手選べばそれに相当するはずで
+					// コードが複雑化するだけだから不要だと判断した。
+				}
+				else {
+					// ロジックが複雑になるので、すまんがここで再度MultiPVで探索する。
+					Learner::search(pos, random_multi_pv_depth, random_multi_pv);
+					// rootMovesの上位N手のなかから一つ選択
+
+					auto& rm = pos.this_thread()->rootMoves;
+
+					uint64_t s = min((uint64_t)rm.size(), (uint64_t)random_multi_pv);
+					for (uint64_t i = 1; i < s; ++i)
+					{
+						// rm[0]の評価値との差がrandom_multi_pv_diffの範囲でなければならない。
+						// rm[x].scoreは、降順に並んでいると仮定できる。 
+						if (rm[0].score > rm[i].score + random_multi_pv_diff)
+						{
+							s = i;
+							break;
+						}
+					}
+
+					m = rm[prng.rand(s)].pv[0];
+
+					// まだ1局面も書き出していないのに終局してたので書き出し処理は端折って次の対局に。
+					if (!is_ok(m))
+						break;
+				}
+
+				// ゲームの勝敗から指し手を評価しようとするとき、
+				// 今回のrandom moveがあるので、ここ以前には及ばないようにする。
+				a_psv.clear(); // 保存していた局面のクリア
+			}
+
+		DO_MOVE:;
+			pos.do_move(m, states[ply]);
+
+			// 差分計算を行なうために毎node evaluate()を呼び出しておく。
+			Eval::evaluate_with_no_return(pos);
+
+		} // for (int ply = 0; ; ++ply)
+	
+	} // while(!quit)
+	
+	sw.finalize(thread_id);
+}
+
+// -----------------------------------
+//    棋譜を生成するコマンド(master thread)
+// -----------------------------------
+
+// 棋譜を生成するコマンド
+void gen_sfen(Position&, istringstream& is)
+{
+	// スレッド数(これは、USIのsetoptionで与えられる)
+	uint32_t thread_num = (uint32_t)Options["Threads"];
+
+	// 生成棋譜の個数 default = 80億局面(Ponanza仕様)
+	uint64_t loop_max = 8000000000UL;
+
+	// 評価値がこの値になったら生成を打ち切る。
+	int eval_limit = 3000;
+
+	// 探索深さ
+	int search_depth = 3;
+	int search_depth2 = INT_MIN;
+
+	// ランダムムーブを行なう最小plyと最大plyと回数
+	int random_move_minply = 1;
+	int random_move_maxply = 24;
+	int random_move_count = 5;
+	// ランダムムーブをAperyのように玉を主に動かす機能
+	// これを例えば3にすると1/3の確率で玉を動かす。
+	int random_move_like_apery = 0;
+	// ランダムムーブの代わりにmultipvで探索してそのなかからランダムに選ぶときはrandom_multi_pv = 1以上の数にする。
+	int random_multi_pv = 0;
+	int random_multi_pv_diff = 32000;
+	int random_multi_pv_depth = INT_MIN;
+
+	// 書き出す局面のply(初期局面からの手数)の最小、最大。
+	int write_minply = 16;
+	int write_maxply = 400;
+
+	// 書き出すファイル名
+	string output_file_name = "generated_kifu.bin";
+
+	string token;
+
+	// eval hashにhitすると初期局面付近の評価値として、hash衝突して大きな値を書き込まれてしまうと
+	// eval_limitが小さく設定されているときに初期局面で毎回eval_limitを超えてしまい局面の生成が進まなくなる。
+	// そのため、eval hashは無効化する必要がある。
+	// あとeval hashのhash衝突したときに、変な値の評価値が使われ、それを教師に使うのが気分が悪いというのもある。
+	bool use_eval_hash = false;
+
+	// この単位でファイルに保存する。
+	// ファイル名は file_1.bin , file_2.binのように連番がつく。
+	uint64_t save_every = UINT64_MAX;
+
+	// ファイル名の末尾にランダムな数値を付与する。
+	bool random_file_name = false;
+
+	while (true)
+	{
+		token = "";
+		is >> token;
+		if (token == "")
+			break;
+
+		if (token == "depth")
+			is >> search_depth;
+		else if (token == "depth2")
+			is >> search_depth2;
+		else if (token == "loop")
+			is >> loop_max;
+		else if (token == "output_file_name")
+			is >> output_file_name;
+		else if (token == "eval_limit")
+		{
+			is >> eval_limit;
+			// 最大値を1手詰みのスコアに制限する。(そうしないとループを終了しない可能性があるので)
+			eval_limit = std::min(eval_limit, (int)mate_in(2));
+		}
+		else if (token == "random_move_minply")
+			is >> random_move_minply;
+		else if (token == "random_move_maxply")
+			is >> random_move_maxply;
+		else if (token == "random_move_count")
+			is >> random_move_count;
+		else if (token == "random_move_like_apery")
+			is >> random_move_like_apery;
+		else if (token == "random_multi_pv")
+			is >> random_multi_pv;
+		else if (token == "random_multi_pv_diff")
+			is >> random_multi_pv_diff;
+		else if (token == "random_multi_pv_depth")
+			is >> random_multi_pv_depth;
+		else if (token == "write_minply")
+			is >> write_minply;
+		else if (token == "write_maxply")
+			is >> write_maxply;
+		else if (token == "use_eval_hash")
+			is >> use_eval_hash;
+		else if (token == "save_every")
+			is >> save_every;
+		else if (token == "random_file_name")
+			is >> random_file_name;
+		else
+			cout << "Error! : Illegal token " << token << endl;
+	}
+
+#if defined(USE_GLOBAL_OPTIONS)
+	// あとで復元するために保存しておく。
+	auto oldGlobalOptions = GlobalOptions;
+	GlobalOptions.use_eval_hash = use_eval_hash;
+#endif
+
+	// search depth2が設定されていないなら、search depthと同じにしておく。
+	if (search_depth2 == INT_MIN)
+		search_depth2 = search_depth;
+	if (random_multi_pv_depth == INT_MIN)
+		random_multi_pv_depth = search_depth;
+
+	if (random_file_name)
+	{
+		// output_file_nameにこの時点でランダムな数値を付与してしまう。
+    std::random_device seed_gen;
+    PRNG r(seed_gen());
+		// 念のため乱数振り直しておく。
+		for(int i=0;i<10;++i)
+			r.rand(1);
+		auto to_hex = [](uint64_t u){
+			std::stringstream ss;
+			ss << std::hex << u;
+			return ss.str();
+		};
+		// 64bitの数値で偶然かぶると嫌なので念のため64bitの数値２つくっつけておく。
+		output_file_name = output_file_name + "_" + to_hex(r.rand<uint64_t>()) + to_hex(r.rand<uint64_t>());
+	}
+
+	std::cout << "gensfen : " << endl
+		<< "  search_depth = " << search_depth << " to " << search_depth2 << endl
+		<< "  loop_max = " << loop_max << endl
+		<< "  eval_limit = " << eval_limit << endl
+		<< "  thread_num (set by USI setoption) = " << thread_num << endl
+		<< "  book_moves (set by USI setoption) = " << Options["BookMoves"] << endl
+		<< "  random_move_minply     = " << random_move_minply << endl
+		<< "  random_move_maxply     = " << random_move_maxply << endl
+		<< "  random_move_count      = " << random_move_count << endl
+		<< "  random_move_like_apery = " << random_move_like_apery << endl
+		<< "  random_multi_pv        = " << random_multi_pv << endl
+		<< "  random_multi_pv_diff   = " << random_multi_pv_diff << endl
+		<< "  random_multi_pv_depth  = " << random_multi_pv_depth << endl
+		<< "  write_minply           = " << write_minply << endl
+		<< "  write_maxply           = " << write_maxply << endl
+		<< "  output_file_name       = " << output_file_name << endl
+		<< "  use_eval_hash          = " << use_eval_hash << endl
+		<< "  save_every             = " << save_every << endl
+		<< "  random_file_name       = " << random_file_name << endl;
+
+	// Options["Threads"]の数だけスレッドを作って実行。
+	{
+		SfenWriter sw(output_file_name, thread_num);
+		sw.save_every = save_every;
+
+		MultiThinkGenSfen multi_think(search_depth, search_depth2, sw);
+		multi_think.set_loop_max(loop_max);
+		multi_think.eval_limit = eval_limit;
+		multi_think.random_move_minply = random_move_minply;
+		multi_think.random_move_maxply = random_move_maxply;
+		multi_think.random_move_count = random_move_count;
+		multi_think.random_move_like_apery = random_move_like_apery;
+		multi_think.random_multi_pv = random_multi_pv;
+		multi_think.random_multi_pv_diff = random_multi_pv_diff;
+		multi_think.random_multi_pv_depth = random_multi_pv_depth;
+		multi_think.write_minply = write_minply;
+		multi_think.write_maxply = write_maxply;
+		multi_think.start_file_write_worker();
+		multi_think.go_think();
+
+		// SfenWriterのデストラクタでjoinするので、joinが終わってから終了したというメッセージを
+		// 表示させるべきなのでここをブロックで囲む。
+	}
+
+	std::cout << "gensfen finished." << endl;
+
+#if defined(USE_GLOBAL_OPTIONS)
+	// GlobalOptionsの復元。
+	GlobalOptions = oldGlobalOptions;
+#endif
+
+}
+
+// -----------------------------------
+// 生成した棋譜から学習させるコマンド(learn)
+// -----------------------------------
+
+// 普通のシグモイド関数
+double sigmoid(double x)
+{
+	return 1.0 / (1.0 + std::exp(-x));
+}
+
+// 評価値を勝率[0,1]に変換する関数
+double winning_percentage(double value)
+{
+	// この600.0という定数は、ponanza定数。(ponanzaがそうしているらしいという意味で)
+	// ゲームの進行度に合わせたものにしたほうがいいかも知れないけども、その効果のほどは不明。
+	return sigmoid(value / 600.0);
+}
+
+// 普通のシグモイド関数の導関数。
+double dsigmoid(double x)
+{
+	// シグモイド関数
+	//    f(x) = 1/(1+exp(-x))
+	// に対して1階微分は、
+	//    f'(x) = df/dx = f(x)・{ 1 - f(x) }
+	// となる。
+
+	return sigmoid(x) * (1.0 - sigmoid(x));
+}
+
+// 目的関数が勝率の差の二乗和のとき
+#if defined (LOSS_FUNCTION_IS_WINNING_PERCENTAGE)
+// 勾配を計算する関数
+double calc_grad(Value deep, Value shallow, PackedSfenValue& psv)
+{
+	// 勝率の差の2乗が目的関数それを最小化する。
+	// 目的関数 J = 1/2m Σ ( win_rate(shallow) - win_rate(deep) ) ^2
+	// ただし、σはシグモイド関数で、評価値を勝率の差に変換するもの。
+	// mはサンプルの件数。shallowは浅い探索(qsearch())のときの評価値。deepは深い探索のときの評価値。
+	// また、Wを特徴ベクトル(評価関数のパラメーター)、Xi,Yiを教師とすると
+	// shallow = W*Xi   // *はアダマール積で、Wの転置・X の意味
+	// f(Xi) = win_rate(W*Xi)
+	// σ(i番目のdeep) = Yi とおくと、
+	// J = m/2 Σ ( f(Xi) - Yi )^2
+	// とよくある式になる。
+	// Wはベクトルで、j番目の要素をWjと書くとすると、連鎖律から
+	// ∂J/∂Wj =            ∂J/∂f     ・  ∂f/∂W   ・ ∂W/∂Wj
+	//          =  1/m Σ ( f(Xi) - y )  ・  f'(Xi)    ・    1
+
+	// 1/mはあとで掛けるとして、勾配の値としてはΣの中身を配列に保持しておけば良い。
+	// f'(Xi) = win_rate'(shallow) = sigmoid'(shallow/600) = dsigmoid(shallow / 600) / 600
+	// この末尾の /600 は学習率で調整するから書かなくていいか..
+	// また1/mという係数も、Adam , AdaGradのような勾配の自動調整機能を持つ更新式を用いるなら不要。
+	// ゆえにメモリ上に保存しておく必要はない。
+
+	double p = winning_percentage(deep);
+	double q = winning_percentage(shallow);
+	return (q - p) * dsigmoid(double(shallow) / 600.0);
+}
+#endif
+
+#if defined (LOSS_FUNCTION_IS_CROSS_ENTOROPY)
+double calc_grad(Value deep, Value shallow, const PackedSfenValue& psv)
+{
+	// 交差エントロピーを用いた目的関数
+
+	// 交差エントロピーの概念と性質については、
+	// http://nnadl-ja.github.io/nnadl_site_ja/chap3.html#the_cross-entropy_cost_function
+	// http://postd.cc/visual-information-theory-3/
+	// などを参考に。
+
+	// 目的関数の設計)
+	// pの分布をqの分布に近づけたい → pとqの確率分布間の交差エントロピーの最小化問題と考える。
+	// J = H(p,q) = - Σ p(x) log(q(x)) = -p log q - (1-p) log(1-q)
+	//                 x
+
+	// pは定数、qはWiの関数(q = σ(W・Xi) )としてWiに対する偏微分を求める。
+	// ∂J/∂Wi = -p・q'/q - (1-p)(1-q)'/(1-q)
+	//          = ...
+	//          = q - p.
+
+	double p = winning_percentage(deep);
+	double q = winning_percentage(shallow);
+
+	return q - p;
+}
+#endif
+
+#if defined ( LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE )
+double calc_grad(Value deep, Value shallow, const PackedSfenValue& psv)
+{
+	// 勝率の関数を通さない版
+	// これ、EVAL_LIMITを低くしておかないと、終盤の形に対して評価値を一致させようとして
+	// evalがevalの範囲を超えかねない。
+	return shallow - deep;
+}
+#endif
+
+#if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
+
+// elmo(WCSC27)で使われている定数。要調整。
+// elmoのほうは式を内分していないので値が違う。
+// learnコマンドでこの値を設定できる。
+// 0.33は、elmo(WCSC27)で使われていた定数(0.5)相当
+double ELMO_LAMBDA = 0.33;
+double ELMO_LAMBDA2 = 0.33;
+double ELMO_LAMBDA_LIMIT = 32000;
+
+double calc_grad(Value deep, Value shallow , const PackedSfenValue& psv)
+{
+	// elmo(WCSC27)方式
+	// 実際のゲームの勝敗で補正する。
+
+	const double eval_winrate = winning_percentage(shallow);
+	const double teacher_winrate = winning_percentage(deep);
+
+	// 期待勝率を勝っていれば1、負けていれば 0、引き分けなら0.5として補正項として用いる。
+	// game_result = 1,0,-1なので1足して2で割る。
+	const double t = double(psv.game_result + 1) / 2;
+
+	// 深い探索での評価値がELMO_LAMBDA_LIMITを超えているならELMO_LAMBDAではなくELMO_LAMBDA2を適用する。
+	const double lambda = (abs(deep) >= ELMO_LAMBDA_LIMIT) ? ELMO_LAMBDA2 : ELMO_LAMBDA;
+
+	// 実際の勝率を補正項として使っている。
+	// これがelmo(WCSC27)のアイデアで、現代のオーパーツ。
+	const double grad = (1 - lambda) * (eval_winrate - t) + lambda * (eval_winrate - teacher_winrate);
+
+	return grad;
+}
+
+// 学習時の交差エントロピーの計算
+// elmo式の勝敗項と勝率項との個別の交差エントロピーが引数であるcross_entropy_evalとcross_entropy_winに返る。
+void calc_cross_entropy(Value deep, Value shallow, const PackedSfenValue& psv,
+	double& cross_entropy_eval, double& cross_entropy_win, double& cross_entropy,
+	double& entropy_eval, double& entropy_win, double& entropy)
+{
+	const double p /* teacher_winrate */ = winning_percentage(deep);
+	const double q /* eval_winrate    */ = winning_percentage(shallow);
+	const double t = double(psv.game_result + 1) / 2;
+
+	constexpr double epsilon = 0.000001;
+
+	// 深い探索での評価値がELMO_LAMBDA_LIMITを超えているならELMO_LAMBDAではなくELMO_LAMBDA2を適用する。
+	const double lambda = (abs(deep) >= ELMO_LAMBDA_LIMIT) ? ELMO_LAMBDA2 : ELMO_LAMBDA;
+
+	const double m = (1.0 - lambda) * t + lambda * p;
+
+	cross_entropy_eval =
+		(-p * std::log(q + epsilon) - (1.0 - p) * std::log(1.0 - q + epsilon));
+	cross_entropy_win =
+		(-t * std::log(q + epsilon) - (1.0 - t) * std::log(1.0 - q + epsilon));
+	entropy_eval =
+		(-p * std::log(p + epsilon) - (1.0 - p) * std::log(1.0 - p + epsilon));
+	entropy_win =
+		(-t * std::log(t + epsilon) - (1.0 - t) * std::log(1.0 - t + epsilon));
+
+	cross_entropy =
+		(-m * std::log(q + epsilon) - (1.0 - m) * std::log(1.0 - q + epsilon));
+	entropy =
+		(-m * std::log(m + epsilon) - (1.0 - m) * std::log(1.0 - m + epsilon));
+}
+
+#endif
+
+
+// 目的関数として他のバリエーションも色々用意するかも..
+
+double calc_grad(Value shallow, const PackedSfenValue& psv) {
+	return calc_grad((Value)psv.score, shallow, psv);
+}
+
+// Sfenの読み込み機
+struct SfenReader
+{
+	SfenReader(int thread_num) : prng((std::random_device())())
+	{
+		packed_sfens.resize(thread_num);
+		total_read = 0;
+		total_done = 0;
+		last_done = 0;
+		next_update_weights = 0;
+		save_count = 0;
+		end_of_files = false;
+		no_shuffle = false;
+		stop_flag = false;
+
+		hash.resize(READ_SFEN_HASH_SIZE);
+	}
+
+	~SfenReader()
+	{
+		if (file_worker_thread.joinable())
+			file_worker_thread.join();
+
+		for (auto p : packed_sfens)
+			delete p;
+		for (auto p : packed_sfens_pool)
+			delete p;
+	}
+
+	// mseなどの計算用に用いる局面数
+	// mini-batch size = 1Mが標準的なので、その0.2%程度なら時間的には無視できるはず。
+	// 指し手一致率の計算でdepth = 1でsearch()をするので、単純比較はできないが…。
+	const uint64_t sfen_for_mse_size = 2000;
+
+	// mseなどの計算用に局面を読み込んでおく。
+	void read_for_mse()
+	{
+		auto th = Threads.main();
+		Position& pos = th->rootPos;
+		for (uint64_t i = 0; i < sfen_for_mse_size; ++i)
+		{
+			PackedSfenValue ps;
+			if (!read_to_thread_buffer(0, ps))
+			{
+				cout << "Error! read packed sfen , failed." << endl;
+				break;
+			}
+			sfen_for_mse.push_back(ps);
+
+			// hash keyを求める。
+			StateInfo si;
+			pos.set_from_packed_sfen(ps.sfen,&si,th);
+			sfen_for_mse_hash.insert(pos.key());
+		}
+	}
+
+	void read_validation_set(const string file_name, int eval_limit)
+	{
+		ifstream fs(file_name, ios::binary);
+
+		while (fs)
+		{
+			PackedSfenValue p;
+			if (fs.read((char*)&p, sizeof(PackedSfenValue)))
+			{
+				if (eval_limit < abs(p.score))
+					continue;
+#if !defined (LEARN_GENSFEN_USE_DRAW_RESULT)
+				if (p.game_result == 0)
+					continue;
+#endif
+
+				sfen_for_mse.push_back(p);
+			} else {
+				break;
+			}
+		}
+	}
+
+	// 各スレッドがバッファリングしている局面数 0.1M局面。40HTで4M局面
+	const size_t THREAD_BUFFER_SIZE = 10 * 1000;
+
+	// ファイル読み込み用のバッファ(これ大きくしたほうが局面がshuffleが大きくなるので局面がバラけていいと思うが
+	// あまり大きいとメモリ消費量も上がる。
+	// SFEN_READ_SIZEはTHREAD_BUFFER_SIZEの倍数であるものとする。
+	const size_t SFEN_READ_SIZE = LEARN_SFEN_READ_SIZE;
+
+	// [ASYNC] スレッドが局面を一つ返す。なければfalseが返る。
+	bool read_to_thread_buffer(size_t thread_id, PackedSfenValue& ps)
+	{
+		// スレッドバッファに局面が残っているなら、それを1つ取り出して返す。
+		auto& thread_ps = packed_sfens[thread_id];
+
+		// バッファに残りがなかったらread bufferから充填するが、それすらなかったらもう終了。
+		if ((thread_ps == nullptr || thread_ps->size() == 0) // バッファが空なら充填する。
+			&& !read_to_thread_buffer_impl(thread_id))
+			return false;
+
+		// read_to_thread_buffer_impl()がtrueを返したというこは、
+		// スレッドバッファへの局面の充填が無事完了したということなので
+		// thread_ps->rbegin()は健在。
+
+		ps = *(thread_ps->rbegin());
+		thread_ps->pop_back();
+		
+		// バッファを使いきったのであれば自らdeleteを呼び出してこのバッファを開放する。
+		if (thread_ps->size() == 0)
+		{
+			delete thread_ps;
+			thread_ps = nullptr;
+		}
+
+		return true;
+	}
+
+	// [ASYNC] スレッドバッファに局面をある程度読み込む。
+	bool read_to_thread_buffer_impl(size_t thread_id)
+	{
+		while (true)
+		{
+			{
+				std::unique_lock<Mutex> lk(mutex);
+				// ファイルバッファから充填できたなら、それで良し。
+				if (packed_sfens_pool.size() != 0)
+				{
+					// 充填可能なようなので充填して終了。
+
+					packed_sfens[thread_id] = packed_sfens_pool.front();
+					packed_sfens_pool.pop_front();
+
+					total_read += THREAD_BUFFER_SIZE;
+
+					return true;
+				}
+			}
+
+			// もうすでに読み込むファイルは無くなっている。もうダメぽ。
+			if (end_of_files)
+				return false;
+
+			// file workerがpacked_sfens_poolに充填してくれるのを待っている。
+			// mutexはlockしていないのでいずれ充填してくれるはずだ。
+			sleep(1);
+		}
+
+	}
+	
+	// 局面ファイルをバックグラウンドで読み込むスレッドを起動する。
+	void start_file_read_worker()
+	{
+		file_worker_thread = std::thread([&] { this->file_read_worker(); });
+	}
+
+	// ファイルの読み込み専用スレッド用
+	void file_read_worker()
+	{
+		auto open_next_file = [&]()
+		{
+			if (fs.is_open())
+				fs.close();
+
+			// もう無い
+			if (filenames.size() == 0)
+				return false;
+
+			// 次のファイル名ひとつ取得。
+			string filename = *filenames.rbegin();
+			filenames.pop_back();
+
+			fs.open(filename, ios::in | ios::binary);
+			cout << "open filename = " << filename << endl;
+			assert(fs);
+
+			return true;
+		};
+
+		while (true)
+		{
+			// バッファが減ってくるのを待つ。
+			// このsize()の読み取りはread onlyなのでlockしなくていいだろう。
+			while (!stop_flag && packed_sfens_pool.size() >= SFEN_READ_SIZE / THREAD_BUFFER_SIZE)
+				sleep(100);
+			if (stop_flag)
+				return;
+
+			PSVector sfens;
+			sfens.reserve(SFEN_READ_SIZE);
+
+			// ファイルバッファにファイルから読み込む。
+			while (sfens.size() < SFEN_READ_SIZE)
+			{
+				PackedSfenValue p;
+				if (fs.read((char*)&p, sizeof(PackedSfenValue)))
+				{
+					sfens.push_back(p);
+				} else
+				{
+					// 読み込み失敗
+					if (!open_next_file())
+					{
+						// 次のファイルもなかった。あぼーん。
+						cout << "..end of files." << endl;
+						end_of_files = true;
+						return;
+					}
+				}
+			}
+
+			// この読み込んだ局面データをshuffleする。
+			// random shuffle by Fisher-Yates algorithm
+
+			if (!no_shuffle)
+			{
+				auto size = sfens.size();
+				for (size_t i = 0; i < size; ++i)
+					swap(sfens[i], sfens[(size_t)(prng.rand((uint64_t)size - i) + i)]);
+			}
+
+			// これをTHREAD_BUFFER_SIZEごとの細切れにする。それがsize個あるはず。
+			// SFEN_READ_SIZEはTHREAD_BUFFER_SIZEの倍数であるものとする。
+			assert((SFEN_READ_SIZE % THREAD_BUFFER_SIZE)==0);
+
+			auto size = size_t(SFEN_READ_SIZE / THREAD_BUFFER_SIZE);
+			std::vector<PSVector*> ptrs;
+			ptrs.reserve(size);
+
+			for (size_t i = 0; i < size; ++i)
+			{
+				// このポインターのdeleteは、受け側で行なう。
+				PSVector* ptr = new PSVector();
+				ptr->resize(THREAD_BUFFER_SIZE);
+				memcpy(&((*ptr)[0]), &sfens[i * THREAD_BUFFER_SIZE], sizeof(PackedSfenValue) * THREAD_BUFFER_SIZE);
+
+				ptrs.push_back(ptr);
+			}
+
+			// sfensの用意が出来たので、折を見てコピー
+			{
+				std::unique_lock<Mutex> lk(mutex);
+
+				// ポインタをコピーするだけなのでこの時間は無視できるはず…。
+				// packed_sfens_poolの内容を変更するのでmutexのlockが必要。
+
+				for (size_t i = 0; i < size; ++i)
+					packed_sfens_pool.push_back(ptrs[i]);
+			}
+		}
+	}
+
+	// sfenファイル群
+	vector<string> filenames;
+
+	// 読み込んだ局面数(ファイルからメモリ上のバッファへ)
+	atomic<uint64_t> total_read;
+
+	// 処理した局面数
+	atomic<uint64_t> total_done;
+
+	// 前回までに処理した件数
+	uint64_t last_done;
+
+	// total_readがこの値を超えたらupdate_weights()してmseの計算をする。
+	uint64_t next_update_weights;
+
+	uint64_t save_count;
+
+	// 局面読み込み時のシャッフルを行わない。
+	bool no_shuffle;
+
+	bool stop_flag;
+
+	// rmseの計算用の局面であるかどうかを判定する。
+	// (rmseの計算用の局面は学習のために使うべきではない。)
+	bool is_for_rmse(Key key) const
+	{
+		return sfen_for_mse_hash.count(key) != 0;
+	}
+
+	// 同一局面の読み出しを制限するためのhash
+	// 6400万局面って多すぎるか？そうでもないか..
+	// hash_indexを求めるためのmaskに使うので、2**Nでなければならない。
+	static const uint64_t READ_SFEN_HASH_SIZE = 64 * 1024 * 1024;
+	vector<Key> hash; // 64MB*8 = 512MB
+
+	// mse計算用のtest局面
+	PSVector sfen_for_mse;
+
+protected:
+
+	// fileをバックグラウンドで読み込みしているworker thread
+	std::thread file_worker_thread;
+
+	// 局面の読み込み時にshuffleするための乱数
+	PRNG prng;
+
+	// ファイル群を読み込んでいき、最後まで到達したか。
+	atomic<bool> end_of_files;
+
+
+	// sfenファイルのハンドル
+	std::fstream fs;
+
+	// 各スレッド用のsfen
+	// (使いきったときにスレッドが自らdeleteを呼び出して開放すべし。)
+	std::vector<PSVector*> packed_sfens;
+
+	// packed_sfens_poolにアクセスするときのmutex
+	Mutex mutex;
+
+	// sfenのpool。fileから読み込むworker threadはここに補充する。
+	// 各worker threadはここから自分のpacked_sfens[thread_id]に充填する。
+	// ※　mutexをlockしてアクセスすること。
+	std::list<PSVector*> packed_sfens_pool;
+
+	// mse計算用の局面を学習に用いないためにhash keyを保持しておく。
+	std::unordered_set<Key> sfen_for_mse_hash;
+};
+
+// 複数スレッドでsfenを生成するためのクラス
+struct LearnerThink: public MultiThink
+{
+	LearnerThink(SfenReader& sr_):sr(sr_),stop_flag(false), save_only_once(false)
+	{
+#if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
+		learn_sum_cross_entropy_eval = 0.0;
+		learn_sum_cross_entropy_win = 0.0;
+		learn_sum_cross_entropy = 0.0;
+		learn_sum_entropy_eval = 0.0;
+		learn_sum_entropy_win = 0.0;
+		learn_sum_entropy = 0.0;
+#endif
+#if defined(EVAL_NNUE)
+		newbob_scale = 1.0;
+		newbob_decay = 1.0;
+		newbob_num_trials = 2;
+		best_loss = std::numeric_limits<double>::infinity();
+		latest_loss_sum = 0.0;
+		latest_loss_count = 0;
+#endif
+	}
+
+	virtual void thread_worker(size_t thread_id);
+
+	// 局面ファイルをバックグラウンドで読み込むスレッドを起動する。
+	void start_file_read_worker() { sr.start_file_read_worker(); }
+
+	// 評価関数パラメーターをファイルに保存
+	bool save(bool is_final=false);
+
+	// sfenの読み出し器
+	SfenReader& sr;
+
+	// 学習の反復回数のカウンター
+	uint64_t epoch = 0;
+
+	// ミニバッチサイズのサイズ。必ずこのclassを使う側で設定すること。
+	uint64_t mini_batch_size = 1000*1000;
+
+	bool stop_flag;
+
+	// 割引率
+	double discount_rate;
+
+	// 序盤を学習対象から外すオプション
+	int reduction_gameply;
+
+	// kk/kkp/kpp/kpppを学習させないオプション
+	std::array<bool,4> freeze;
+
+	// 教師局面の深い探索の評価値の絶対値がこの値を超えていたらその教師局面を捨てる。
+	int eval_limit;
+
+	// 評価関数の保存するときに都度フォルダを掘るかのフラグ。
+	// trueだとフォルダを掘らない。
+	bool save_only_once;
+
+	// --- lossの計算
+
+#if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
+	// 学習用データのロスの計算用
+	atomic<double> learn_sum_cross_entropy_eval;
+	atomic<double> learn_sum_cross_entropy_win;
+	atomic<double> learn_sum_cross_entropy;
+	atomic<double> learn_sum_entropy_eval;
+	atomic<double> learn_sum_entropy_win;
+	atomic<double> learn_sum_entropy;
+#endif
+
+#if defined(EVAL_NNUE)
+	shared_timed_mutex nn_mutex;
+	double newbob_scale;
+	double newbob_decay;
+	int newbob_num_trials;
+	double best_loss;
+	double latest_loss_sum;
+	uint64_t latest_loss_count;
+	std::string best_nn_directory;
+#endif
+
+	uint64_t eval_save_interval;
+	uint64_t loss_output_interval;
+	uint64_t mirror_percentage;
+
+	// ロスの計算。
+	// done : 今回対象とした局面数
+	void calc_loss(size_t thread_id , uint64_t done);
+
+	// ↑のlossの計算をタスクとして定義してやり、それを実行する
+	TaskDispatcher task_dispatcher;
+};
+
+void LearnerThink::calc_loss(size_t thread_id, uint64_t done)
+{
+	// 置換表にhitされてもかなわんので、このタイミングで置換表の世代を新しくする。
+	// 置換表を無効にしているなら関係ないのだが。
+	TT.new_search();
+
+#if defined(EVAL_NNUE)
+	std::cout << "PROGRESS: " << now_string() << ", ";
+	std::cout << sr.total_done << " sfens";
+	std::cout << ", iteration " << epoch;
+	std::cout << ", eta = " << Eval::get_eta() << ", ";
+#endif
+
+#if !defined(LOSS_FUNCTION_IS_ELMO_METHOD)
+	double sum_error = 0;
+	double sum_error2 = 0;
+	double sum_error3 = 0;
+#endif
+
+#if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
+	// 検証用データのロスの計算用
+	atomic<double> test_sum_cross_entropy_eval,test_sum_cross_entropy_win,test_sum_cross_entropy;
+	atomic<double> test_sum_entropy_eval,test_sum_entropy_win,test_sum_entropy;
+	test_sum_cross_entropy_eval = 0;
+	test_sum_cross_entropy_win = 0;
+	test_sum_cross_entropy = 0;
+	test_sum_entropy_eval = 0;
+	test_sum_entropy_win = 0;
+	test_sum_entropy = 0;
+
+	// 学習時のnorm
+	atomic<double> sum_norm;
+	sum_norm = 0;
+#endif
+
+	// 深い探索のpvの初手と、search(1)のpvの初手の指し手が一致した回数。
+	atomic<int> move_accord_count;
+	move_accord_count = 0;
+
+	// 平手の初期局面のeval()の値を表示させて、揺れを見る。
+	auto th = Threads[thread_id];
+	auto& pos = th->rootPos;
+	StateInfo si;
+  pos.set(StartFEN, false, &si, th);
+  std::cout << "hirate eval = " << Eval::evaluate(pos);
+
+	//Eval::print_eval_stat(pos);
+
+	// ここ、並列化したほうが良いのだがslaveの前の探索が終わってなかったりしてちょっと面倒。
+	// taskを呼び出すための仕組みを作ったのでそれを用いる。
+
+	// こなすべきtaskの数。
+	atomic<int> task_count;
+	task_count = (int)sr.sfen_for_mse.size();
+	task_dispatcher.task_reserve(task_count);
+
+	// 局面の探索をするtaskを生成して各スレッドに振ってやる。
+	for (const auto& ps : sr.sfen_for_mse)
+	{
+		// TaskDispatcherを用いて各スレッドに作業を振る。
+		// そのためのタスクの定義。
+		// ↑で使っているposをcaptureされるとたまらんのでcaptureしたい変数は一つずつ指定しておく。
+		auto task = [&ps,&test_sum_cross_entropy_eval,&test_sum_cross_entropy_win,&test_sum_cross_entropy,&test_sum_entropy_eval,&test_sum_entropy_win,&test_sum_entropy, &sum_norm,&task_count ,&move_accord_count](size_t thread_id)
+		{
+			// これ、C++ではループごとに新たなpsのインスタンスをちゃんとcaptureするのだろうか.. →　するようだ。
+			auto th = Threads[thread_id];
+			auto& pos = th->rootPos;
+			StateInfo si;
+			if (pos.set_from_packed_sfen(ps.sfen ,&si, th) != 0)
+			{
+				// 運悪くrmse計算用のsfenとして、不正なsfenを引いてしまっていた。
+				cout << "Error! : illegal packed sfen " << pos.fen() << endl;
+			}
+
+			// 浅い探索の評価値
+			// evaluate()の値を用いても良いのだが、ロスを計算するときにlearn_cross_entropyと
+			// 値が比較しにくくて困るのでqsearch()を用いる。
+			// EvalHashは事前に無効化してある。(そうしないと毎回同じ値が返ってしまう)
+			auto r = qsearch(pos);
+
+			auto shallow_value = r.first;
+			{
+				const auto rootColor = pos.side_to_move();
+				const auto pv = r.second;
+				std::vector<StateInfo,AlignedAllocator<StateInfo>> states(pv.size());
+				for (size_t i = 0; i < pv.size(); ++i)
+				{
+					pos.do_move(pv[i], states[i]);
+					Eval::evaluate_with_no_return(pos);
+				}
+				shallow_value = (rootColor == pos.side_to_move()) ? Eval::evaluate(pos) : -Eval::evaluate(pos);
+				for (auto it = pv.rbegin(); it != pv.rend(); ++it)
+					pos.undo_move(*it);
+			}
+
+			// 深い探索の評価値
+			auto deep_value = (Value)ps.score;
+
+			// 注) このコードは、learnコマンドでeval_limitを指定しているときのことを考慮してない。
+
+			// --- 誤差の計算
+
+#if !defined(LOSS_FUNCTION_IS_ELMO_METHOD)
+			auto grad = calc_grad(deep_value, shallow_value, ps);
+
+			// rmse的なもの
+			sum_error += grad*grad;
+			// 勾配の絶対値を足したもの
+			sum_error2 += abs(grad);
+			// 評価値の差の絶対値を足したもの
+			sum_error3 += abs(shallow_value - deep_value);
+#endif
+
+			// --- 交差エントロピーの計算
+
+			// とりあえずelmo methodの時だけ勝率項と勝敗項に関して
+			// 交差エントロピーを計算して表示させる。
+
+#if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
+			double test_cross_entropy_eval, test_cross_entropy_win, test_cross_entropy;
+			double test_entropy_eval, test_entropy_win, test_entropy;
+			calc_cross_entropy(deep_value, shallow_value, ps, test_cross_entropy_eval, test_cross_entropy_win, test_cross_entropy, test_entropy_eval, test_entropy_win, test_entropy);
+			// 交差エントロピーの合計は定義的にabs()をとる必要がない。
+			test_sum_cross_entropy_eval += test_cross_entropy_eval;
+			test_sum_cross_entropy_win += test_cross_entropy_win;
+			test_sum_cross_entropy += test_cross_entropy;
+			test_sum_entropy_eval += test_entropy_eval;
+			test_sum_entropy_win += test_entropy_win;
+			test_sum_entropy += test_entropy;
+			sum_norm += (double)abs(shallow_value);
+#endif
+
+			// 教師の指し手と浅い探索のスコアが一致するかの判定
+			{
+				auto r = search(pos,1);
+				if ((uint16_t)r.second[0] == ps.move)
+					move_accord_count.fetch_add(1, std::memory_order_relaxed);
+			}
+
+			// こなしたのでタスク一つ減る
+			--task_count;
+		};
+
+		// 定義したタスクをslaveに投げる。
+		task_dispatcher.push_task_async(task);
+	}
+
+	// 自分自身もslaveとして参加する
+	task_dispatcher.on_idle(thread_id);
+
+	// すべてのtaskの完了を待つ
+	while (task_count)
+		sleep(1);
+
+#if !defined(LOSS_FUNCTION_IS_ELMO_METHOD)
+	// rmse = root mean square error : 平均二乗誤差
+	// mae  = mean absolute error    : 平均絶対誤差
+	auto dsig_rmse = std::sqrt(sum_error / (sfen_for_mse.size() + epsilon));
+	auto dsig_mae = sum_error2 / (sfen_for_mse.size() + epsilon);
+	auto eval_mae = sum_error3 / (sfen_for_mse.size() + epsilon);
+	cout << " , dsig rmse = " << dsig_rmse << " , dsig mae = " << dsig_mae
+		<< " , eval mae = " << eval_mae;
+#endif
+
+#if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
+#if defined(EVAL_NNUE)
+	latest_loss_sum += test_sum_cross_entropy - test_sum_entropy;
+	latest_loss_count += sr.sfen_for_mse.size();
+#endif
+
+	// learn_cross_entropyは、機械学習の世界ではtrain cross entropyと呼ぶべきかも知れないが、
+	// 頭文字を略するときに、lceと書いて、test cross entropy(tce)と区別出来たほうが嬉しいのでこうしてある。
+
+	if (sr.sfen_for_mse.size() && done)
+	{
+		cout
+			<< " , test_cross_entropy_eval = "  << test_sum_cross_entropy_eval / sr.sfen_for_mse.size()
+			<< " , test_cross_entropy_win = "   << test_sum_cross_entropy_win / sr.sfen_for_mse.size()
+			<< " , test_entropy_eval = "        << test_sum_entropy_eval / sr.sfen_for_mse.size()
+			<< " , test_entropy_win = "         << test_sum_entropy_win / sr.sfen_for_mse.size()
+			<< " , test_cross_entropy = "       << test_sum_cross_entropy / sr.sfen_for_mse.size()
+			<< " , test_entropy = "             << test_sum_entropy / sr.sfen_for_mse.size()
+			<< " , norm = "						<< sum_norm
+			<< " , move accuracy = "			<< (move_accord_count * 100.0 / sr.sfen_for_mse.size()) << "%";
+		if (done != static_cast<uint64_t>(-1))
+		{
+			cout
+				<< " , learn_cross_entropy_eval = " << learn_sum_cross_entropy_eval / done
+				<< " , learn_cross_entropy_win = "  << learn_sum_cross_entropy_win / done
+				<< " , learn_entropy_eval = "       << learn_sum_entropy_eval / done
+				<< " , learn_entropy_win = "        << learn_sum_entropy_win / done
+				<< " , learn_cross_entropy = "      << learn_sum_cross_entropy / done
+				<< " , learn_entropy = "            << learn_sum_entropy / done;
+		}
+		cout << endl;
+	}
+	else {
+		cout << "Error! : sr.sfen_for_mse.size() = " << sr.sfen_for_mse.size() << " ,  done = " << done << endl;
+	}
+
+	// 次回のために0クリアしておく。
+	learn_sum_cross_entropy_eval = 0.0;
+	learn_sum_cross_entropy_win = 0.0;
+	learn_sum_cross_entropy = 0.0;
+	learn_sum_entropy_eval = 0.0;
+	learn_sum_entropy_win = 0.0;
+	learn_sum_entropy = 0.0;
+#else
+	<< endl;
+#endif
+}
+
+
+void LearnerThink::thread_worker(size_t thread_id)
+{
+#if defined(_OPENMP)
+	omp_set_num_threads((int)Options["Threads"]);
+#endif
+
+	auto th = Threads[thread_id];
+	auto& pos = th->rootPos;
+
+	while (true)
+	{
+		// mseの表示(これはthread 0のみときどき行う)
+		// ファイルから読み込んだ直後とかでいいような…。
+
+#if defined(EVAL_NNUE)
+		// 更新中に評価関数を使わないようにロックする。
+		shared_lock<shared_timed_mutex> read_lock(nn_mutex, defer_lock);
+		if (sr.next_update_weights <= sr.total_done ||
+		    (thread_id != 0 && !read_lock.try_lock()))
+#else
+		if (sr.next_update_weights <= sr.total_done)
+#endif
+		{
+			if (thread_id != 0)
+			{
+				// thread_id == 0以外は、待機。
+
+				if (stop_flag)
+					break;
+
+				// rmseの計算などを並列化したいのでtask()が積まれていればそれを処理する。
+				task_dispatcher.on_idle(thread_id);
+				continue;
+			}
+			else
+			{
+				// thread_id == 0だけが以下の更新処理を行なう。
+
+				// 初回はweight配列の更新は行わない。
+				if (sr.next_update_weights == 0)
+				{
+					sr.next_update_weights += mini_batch_size;
+					continue;
+				}
+
+#if !defined(EVAL_NNUE)
+				// 現在時刻を出力。毎回出力する。
+				std::cout << sr.total_done << " sfens , at " << now_string() << std::endl;
+
+				// このタイミングで勾配をweight配列に反映。勾配の計算も1M局面ごとでmini-batch的にはちょうどいいのでは。
+				Eval::update_weights(epoch , freeze);
+
+				// デバッグ用にepochと現在のetaを表示してやる。
+				std::cout << "epoch = " << epoch << " , eta = " << Eval::get_eta() << std::endl;
+#else
+				{
+					// パラメータの更新
+
+					// 更新中に評価関数を使わないようにロックする。
+					lock_guard<shared_timed_mutex> write_lock(nn_mutex);
+					Eval::NNUE::UpdateParameters(epoch);
+				}
+#endif
+				++epoch;
+
+				// 10億局面ごとに1回保存、ぐらいの感じで。
+
+				// ただし、update_weights(),calc_rmse()している間の時間経過は無視するものとする。
+				if (++sr.save_count * mini_batch_size >= eval_save_interval)
+				{
+					sr.save_count = 0;
+
+					// この間、gradientの計算が進むと値が大きくなりすぎて困る気がするので他のスレッドを停止させる。
+					const bool converged = save();
+					if (converged)
+					{
+						stop_flag = true;
+						sr.stop_flag = true;
+						break;
+					}
+				}
+
+				// rmseを計算する。1万局面のサンプルに対して行う。
+				// 40コアでやると100万局面ごとにupdate_weightsするとして、特定のスレッドが
+				// つきっきりになってしまうのあまりよくないような気も…。
+				static uint64_t loss_output_count = 0;
+				if (++loss_output_count * mini_batch_size >= loss_output_interval)
+				{
+					loss_output_count = 0;
+
+					// 今回処理した件数
+					uint64_t done = sr.total_done - sr.last_done;
+
+					// lossの計算
+					calc_loss(thread_id , done);
+
+#if defined(EVAL_NNUE)
+					Eval::NNUE::CheckHealth();
+#endif
+
+					// どこまで集計したかを記録しておく。
+					sr.last_done = sr.total_done;
+				}
+
+				// 次回、この一連の処理は、次回、mini_batch_sizeだけ処理したときに再度やって欲しい。
+				sr.next_update_weights += mini_batch_size;
+
+				// main thread以外は、このsr.next_update_weightsの更新を待っていたので
+				// この値が更新されると再度動き始める。				
+			}
+		}
+
+		PackedSfenValue ps;
+	RetryRead:;
+		if (!sr.read_to_thread_buffer(thread_id, ps))
+		{
+			// 自分のスレッド用の局面poolを使い尽くした。
+			// 局面がもうほとんど残っていないということだから、
+			// 他のスレッドもすべて終了させる。
+
+			stop_flag = true;
+			break;
+		}
+
+		// 評価値が学習対象の値を超えている。
+		// この局面情報を無視する。
+		if (eval_limit < abs(ps.score))
+			goto RetryRead;
+
+#if !defined (LEARN_GENSFEN_USE_DRAW_RESULT)
+		if (ps.game_result == 0)
+			goto RetryRead;
+#endif
+
+		// 序盤局面に関する読み飛ばし
+		if (ps.gamePly < prng.rand(reduction_gameply))
+			goto RetryRead;
+
+#if 0
+		auto sfen = pos.sfen_unpack(ps.data);
+		pos.set(sfen);
+#endif
+		// ↑sfenを経由すると遅いので専用の関数を作った。
+		StateInfo si;
+		const bool mirror = prng.rand(100) < mirror_percentage;
+		if (pos.set_from_packed_sfen(ps.sfen,&si,th,mirror) != 0)
+		{
+			// 変なsfenを掴かまされた。デバッグすべき！
+			// 不正なsfenなのでpos.sfen()で表示できるとは限らないが、しないよりマシ。
+			cout << "Error! : illigal packed sfen = " << pos.fen() << endl;
+			goto RetryRead;
+		}
+#if !defined(EVAL_NNUE)
+		{
+			auto key = pos.key();
+			// rmseの計算用に使っている局面なら除外する。
+			if (sr.is_for_rmse(key))
+				goto RetryRead;
+
+			// 直近で用いた局面も除外する。
+			auto hash_index = size_t(key & (sr.READ_SFEN_HASH_SIZE - 1));
+			auto key2 = sr.hash[hash_index];
+			if (key == key2)
+				goto RetryRead;
+			sr.hash[hash_index] = key; // 今回のkeyに入れ替えておく。
+		}
+#endif
+
+		// 全駒されて詰んでいる可能性がある。
+		// また宣言勝ちの局面はPVの指し手でleafに行けないので学習から除外しておく。
+		// (そのような教師局面自体を書き出すべきではないのだが古い生成ルーチンで書き出しているかも知れないので)
+		if (pos.is_mated())
+			goto RetryRead;
+
+		// 読み込めたので試しに表示してみる。
+		//		cout << pos << value << endl;
+
+		// 浅い探索(qsearch)の評価値
+		auto r = qsearch(pos);
+		auto pv = r.second;
+
+		// 深い探索の評価値
+		auto deep_value = (Value)ps.score;
+
+		// mini batchのほうが勾配が出ていいような気がする。
+		// このままleaf nodeに行って、勾配配列にだけ足しておき、あとでrmseの集計のときにAdaGradしてみる。
+
+		auto rootColor = pos.side_to_move();
+
+		// PVの初手が異なる場合は学習に用いないほうが良いのでは…。
+		// 全然違うところを探索した結果だとそれがノイズに成りかねない。
+		// 評価値の差が大きすぎるところも学習対象としないほうがいいかも…。
+
+#if 0
+		// これやると13%程度の局面が学習対象から外れてしまう。善悪は微妙。
+		if (pv.size() >= 1 && (uint16_t)pv[0] != ps.move)
+		{
+//			dbg_hit_on(false);
+			continue;
+		}
+#endif
+
+#if 0
+		// 評価値の差が大きすぎるところも学習対象としないほうがいいかも…。
+		// →　勝率の関数を通すのでまあいいか…。30%ぐらいの局面が学習対象から外れてしまうしな…。
+		if (abs((int16_t)r.first - ps.score) >= Eval::PawnValue * 4)
+		{
+//			dbg_hit_on(false);
+			continue;
+		}
+		//		dbg_hit_on(true);
+#endif
+
+		int ply = 0;
+
+		// 現在の局面に対して勾配を加算するヘルパー関数。
+		auto pos_add_grad = [&]() {
+			// shallow_valueとして、leafでのevaluateの値を用いる。
+			// qsearch()の戻り値をshallow_valueとして用いると、
+			// PVが途中で途切れている場合、勾配を計算するのにevaluate()を呼び出した局面と、
+			// その勾配を与える局面とが異なることになるので、これはあまり好ましい性質ではないと思う。
+			// 置換表をオフにはしているのだが、1手詰みなどはpv配列を更新していないので…。
+
+			Value shallow_value = (rootColor == pos.side_to_move()) ? Eval::evaluate(pos) : -Eval::evaluate(pos);
+
+#if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
+			// 学習データに対するロスの計算
+			double learn_cross_entropy_eval, learn_cross_entropy_win, learn_cross_entropy;
+			double learn_entropy_eval, learn_entropy_win, learn_entropy;
+			calc_cross_entropy(deep_value, shallow_value, ps, learn_cross_entropy_eval, learn_cross_entropy_win, learn_cross_entropy, learn_entropy_eval, learn_entropy_win, learn_entropy);
+			learn_sum_cross_entropy_eval += learn_cross_entropy_eval;
+			learn_sum_cross_entropy_win += learn_cross_entropy_win;
+			learn_sum_cross_entropy += learn_cross_entropy;
+			learn_sum_entropy_eval += learn_entropy_eval;
+			learn_sum_entropy_win += learn_entropy_win;
+			learn_sum_entropy += learn_entropy;
+#endif
+
+#if !defined(EVAL_NNUE)
+			// 勾配
+			double dj_dw = calc_grad(deep_value, shallow_value, ps);
+
+			// 現在、leaf nodeで出現している特徴ベクトルに対する勾配(∂J/∂Wj)として、jd_dwを加算する。
+
+			// PV終端でなければ割引率みたいなものを適用。
+			if (discount_rate != 0 && ply != (int)pv.size())
+				dj_dw *= discount_rate;
+
+			// leafに到達したのでこの局面に出現している特徴に勾配を加算しておく。
+			// 勾配に基づくupdateはのちほど行なう。
+			Eval::add_grad(pos, rootColor, dj_dw, freeze);
+#else
+			const double example_weight =
+			    (discount_rate != 0 && ply != (int)pv.size()) ? discount_rate : 1.0;
+			Eval::NNUE::AddExample(pos, rootColor, ps, example_weight);
+#endif
+
+			// 処理が終了したので処理した件数のカウンターをインクリメント
+			sr.total_done++;
+		};
+
+		StateInfo state[MAX_PLY]; // qsearchのPVがそんなに長くなることはありえない。
+		for (auto m : pv)
+		{
+			// 非合法手はやってこないはずなのだが。
+			if (!pos.pseudo_legal(m) || !pos.legal(m))
+			{
+				cout << pos << m << endl;
+				assert(false);
+			}
+
+			// 各PV上のnodeでも勾配を加算する場合の処理。
+			// discount_rateが0のときはこの処理は行わない。
+			if (discount_rate != 0)
+				pos_add_grad();
+
+			pos.do_move(m, state[ply++]);
+			
+			// leafでのevaluateの値を用いるので差分更新していく。
+			Eval::evaluate_with_no_return(pos);
+		}
+
+		// PVの終端局面に達したので、ここで勾配を加算する。
+		pos_add_grad();
+
+		// 局面を巻き戻す
+		for (auto it = pv.rbegin(); it != pv.rend(); ++it)
+			pos.undo_move(*it);
+
+#if 0
+		// rootの局面にも勾配を加算する場合
+		shallow_value = (rootColor == pos.side_to_move()) ? Eval::evaluate(pos) : -Eval::evaluate(pos);
+		dj_dw = calc_grad(deep_value, shallow_value, ps);
+		Eval::add_grad(pos, rootColor, dj_dw , without_kpp);
+#endif
+
+	}
+
+}
+
+// 評価関数ファイルの書き出し。
+bool LearnerThink::save(bool is_final)
+{
+	// 保存前にcheck sumを計算して出力しておく。(次に読み込んだときに合致するか調べるため)
+	std::cout << "Check Sum = " << std::hex << Eval::calc_check_sum() << std::dec << std::endl;
+
+	// 保存ごとにファイル名の拡張子部分を"0","1","2",..のように変えていく。
+	// (あとでそれぞれの評価関数パラメーターにおいて勝率を比較したいため)
+
+	if (save_only_once)
+	{
+		// EVAL_SAVE_ONLY_ONCEが定義されているときは、
+		// 1度だけの保存としたいのでサブフォルダを掘らない。
+		Eval::save_eval("");
+	}
+	else if (is_final) {
+		Eval::save_eval("final");
+		return true;
+	}
+	else {
+		static int dir_number = 0;
+		const std::string dir_name = std::to_string(dir_number++);
+		Eval::save_eval(dir_name);
+#if defined(EVAL_NNUE)
+		if (newbob_decay != 1.0 && latest_loss_count > 0) {
+			static int trials = newbob_num_trials;
+			const double latest_loss = latest_loss_sum / latest_loss_count;
+			latest_loss_sum = 0.0;
+			latest_loss_count = 0;
+			cout << "loss: " << latest_loss;
+			if (latest_loss < best_loss) {
+				cout << " < best (" << best_loss << "), accepted" << endl;
+				best_loss = latest_loss;
+				best_nn_directory = Path::Combine((std::string)Options["EvalSaveDir"], dir_name);
+				trials = newbob_num_trials;
+			} else {
+				cout << " >= best (" << best_loss << "), rejected" << endl;
+				if (best_nn_directory.empty()) {
+					cout << "WARNING: no improvement from initial model" << endl;
+				} else {
+					cout << "restoring parameters from " << best_nn_directory << endl;
+					Eval::NNUE::RestoreParameters(best_nn_directory);
+				}
+				if (--trials > 0 && !is_final) {
+					cout << "reducing learning rate scale from " << newbob_scale
+					     << " to " << (newbob_scale * newbob_decay)
+					     << " (" << trials << " more trials)" << endl;
+					newbob_scale *= newbob_decay;
+					Eval::NNUE::SetGlobalLearningRateScale(newbob_scale);
+				}
+			}
+			if (trials == 0) {
+				cout << "converged" << endl;
+				return true;
+			}
+		}
+#endif
+	}
+	return false;
+}
+
+// shuffle_files() , shuffle_files_quick()の下請けで、書き出し部分。
+// output_file_name : 書き出すファイル名
+// prng : 乱数
+// afs  : それぞれの教師局面ファイルのfstream
+// a_count : それぞれのファイルに内在する教師局面の数。
+void shuffle_write(const string& output_file_name , PRNG& prng , vector<fstream>& afs , vector<uint64_t>& a_count)
+{
+	uint64_t total_sfen_count = 0;
+	for (auto c : a_count)
+		total_sfen_count += c;
+
+	// 書き出した局面数
+	uint64_t write_sfen_count = 0;
+
+	// 進捗をこの局面数ごとに画面に出力する。
+	const uint64_t buffer_size = 10000000;
+
+	auto print_status = [&]()
+	{
+		// 10M局面ごと、もしくは、すべての書き出しが終わったときに進捗を出力する
+		if (((write_sfen_count % buffer_size) == 0) ||
+			(write_sfen_count == total_sfen_count))
+			cout << write_sfen_count << " / " << total_sfen_count << endl;
+	};
+
+
+	cout << endl <<  "write : " << output_file_name << endl;
+
+	fstream fs(output_file_name, ios::out | ios::binary);
+
+	// 教師局面の合計
+	uint64_t sum = 0;
+	for (auto c : a_count)
+		sum += c;
+
+	while (sum != 0)
+	{
+		auto r = prng.rand(sum);
+
+		// fs[0]のファイルに格納されている局面 ... fs[1]のファイルに格納されている局面 ...
+		// のようにひと続きになっているものと考えて、rがどのファイルに格納されている局面を指しているかを確定させる。
+		// ファイルの中身はシャッフルされているので、そのファイルから次の要素を1つ取ってくれば良い。
+		// それぞれのファイルにはa_count[x]ずつ局面が残っているので、この処理は以下のように書ける。
+
+		uint64_t n = 0;
+		while (a_count[n] <= r)
+			r -= a_count[n++];
+
+		// これでnが確定した。忘れないうちに残り件数を減らしておく。
+
+		--a_count[n];
+		--sum;
+
+		PackedSfenValue psv;
+		// これ、パフォーマンスあんまりよくないまでまとめて読み書きしたほうが良いのだが…。
+		if (afs[n].read((char*)&psv, sizeof(PackedSfenValue)))
+		{
+			fs.write((char*)&psv, sizeof(PackedSfenValue));
+			++write_sfen_count;
+			print_status();
+		}
+	}
+	print_status();
+	fs.close();
+	cout << "done!" << endl;
+}
+
+// 教師局面のシャッフル "learn shuffle"コマンドの下請け。
+// output_file_name : シャッフルされた教師局面が書き出される出力ファイル名
+void shuffle_files(const vector<string>& filenames , const string& output_file_name , uint64_t buffer_size )
+{
+	// 出力先のフォルダは
+	// tmp/               一時書き出し用
+
+	// テンポラリファイルはbuffer_size局面ずつtmp/フォルダにいったん書き出す。
+	// 例えば、buffer_size = 20Mならば 20M*40bytes = 800MBのバッファが必要。
+	// メモリが少ないPCでは、ここを減らすと良いと思う。
+	// ただし、あまりファイル数が増えるとOSの制限などから同時にopen出来なくなる。
+	// Windowsだと1プロセス512という制約があったはずなので、ここでopen出来るのが500として、
+	// 現在の設定で500ファイル×20M = 10G = 100億局面が限度。
+
+	PSVector buf;
+	buf.resize(buffer_size);
+	// ↑のバッファ、どこまで使ったかを示すマーカー
+	uint64_t buf_write_marker = 0;
+
+	// 書き出すファイル名(連番なのでインクリメンタルカウンター)
+	uint64_t write_file_count = 0;
+
+	// シャッフルするための乱数
+	PRNG prng((std::random_device())());
+
+	// テンポラリファイルの名前を生成する
+	auto make_filename = [](uint64_t i)
+	{
+		return "tmp/" + to_string(i) + ".bin";
+	};
+
+	// 書き出したtmp/フォルダのファイル、それぞれに格納されている教師局面の数
+	vector<uint64_t> a_count;
+
+	auto write_buffer = [&](uint64_t size)
+	{
+		// buf[0]～buf[size-1]までをshuffle
+		for (uint64_t i = 0; i < size; ++i)
+			swap(buf[i], buf[(uint64_t)(prng.rand(size - i) + i)]);
+
+		// ファイルに書き出す
+		fstream fs;
+		fs.open(make_filename(write_file_count++), ios::out | ios::binary);
+		fs.write((char*)&buf[0], size * sizeof(PackedSfenValue));
+		fs.close();
+		a_count.push_back(size);
+
+		buf_write_marker = 0;
+		cout << ".";
+	};
+
+	Dependency::mkdir("tmp");
+
+	// 10M局面の細切れファイルとしてシャッフルして書き出す。
+	for (auto filename : filenames)
+	{
+		fstream fs(filename, ios::in | ios::binary);
+		cout << endl << "open file = " << filename;
+		while (fs.read((char*)&buf[buf_write_marker], sizeof(PackedSfenValue)))
+			if (++buf_write_marker == buffer_size)
+				write_buffer(buffer_size);
+
+		// sizeof(PackedSfenValue)単位で読み込んでいき、
+		// 最後に残っている端数は無視する。(fs.readで失敗するのでwhileを抜ける)
+		// (最後に残っている端数は、教師生成時に途中で停止させたために出来た中途半端なデータだと思われる。)
+
+	}
+
+	if (buf_write_marker != 0)
+		write_buffer(buf_write_marker);
+
+	// シャッフルされたファイルがwrite_file_count個だけ書き出された。
+	// 2pass目として、これをすべて同時にオープンし、ランダムに1つずつ選択して1局面ずつ読み込めば
+	// これにてシャッフルされたことになる。
+
+	// シャツフルする元ファイル+tmpファイル+書き出すファイルで元ファイルの3倍のストレージ容量が必要になる。
+	// 100億局面400GBなのでシャッフルするために1TBのSSDでは足りない。
+	// tmpに書き出しが終わったこのタイミングで元ファイルを消す(あるいは手で削除してしまう)なら、
+	// 元ファイルの2倍程度のストレージ容量で済む。
+	// だから、元ファイルを消すためのオプションを用意すべきかも知れない。
+
+	// ファイルの同時openをしている。これがFOPEN_MAXなどを超える可能性は高い。
+	// その場合、buffer_sizeを調整して、ファイルの数を減らすよりない。
+
+	vector<fstream> afs;
+	for (uint64_t i = 0; i < write_file_count; ++i)
+		afs.emplace_back(fstream(make_filename(i),ios::in | ios::binary));
+
+	// 下請け関数に丸投げして終わり。
+	shuffle_write(output_file_name, prng, afs, a_count);
+}
+
+// 教師局面のシャッフル "learn shuffleq"コマンドの下請け。
+// こちらは1passで書き出す。
+// output_file_name : シャッフルされた教師局面が書き出される出力ファイル名
+void shuffle_files_quick(const vector<string>& filenames, const string& output_file_name)
+{
+	// 読み込んだ局面数
+	uint64_t read_sfen_count = 0;
+
+	// シャッフルするための乱数
+	PRNG prng((std::random_device())());
+
+	// ファイルの数
+	size_t file_count = filenames.size();
+
+	// filenamesのファイルそれぞれに格納されている教師局面の数
+	vector<uint64_t> a_count(file_count);
+
+	// それぞれのファイルの教師局面の数をカウントする。
+	vector<fstream> afs(file_count);
+
+	for (size_t i = 0; i < file_count ; ++i)
+	{
+		auto filename = filenames[i];
+		auto& fs = afs[i];
+
+		fs.open(filename, ios::in | ios::binary);
+		fs.seekg(0, fstream::end);
+		uint64_t eofPos = (uint64_t)fs.tellg();
+		fs.clear(); // これをしないと次のseekに失敗することがある。
+		fs.seekg(0, fstream::beg);
+		uint64_t begPos = (uint64_t)fs.tellg();
+		uint64_t file_size = eofPos - begPos;
+		uint64_t sfen_count = file_size / sizeof(PackedSfenValue);
+		a_count[i] = sfen_count;
+
+		// 各ファイルに格納されていたsfenの数を出力する。
+		cout << filename << " = " << sfen_count << " sfens." << endl;
+	}
+
+	// それぞれのファイルのファイルサイズがわかったので、
+	// これらをすべて同時にオープンし(すでにオープンされている)、
+	// ランダムに1つずつ選択して1局面ずつ読み込めば
+	// これにてシャッフルされたことになる。
+
+	// 下請け関数に丸投げして終わり。
+	shuffle_write(output_file_name, prng, afs, a_count);
+}
+
+// 教師局面のシャッフル "learn shufflem"コマンドの下請け。
+// メモリに丸読みして指定ファイル名で書き出す。
+void shuffle_files_on_memory(const vector<string>& filenames,const string output_file_name)
+{
+	PSVector buf;
+
+	for (auto filename : filenames)
+	{
+		std::cout << "read : " << filename << std::endl;
+		read_file_to_memory(filename, [&buf](uint64_t size) {
+			assert((size % sizeof(PackedSfenValue)) == 0);
+			// バッファを拡充して、前回の末尾以降に読み込む。
+			uint64_t last = buf.size();
+			buf.resize(last + size / sizeof(PackedSfenValue));
+			return (void*)&buf[last];
+		});
+	}
+
+	// buf[0]～buf[size-1]までをshuffle
+	PRNG prng((std::random_device())());
+	uint64_t size = (uint64_t)buf.size();
+	std::cout << "shuffle buf.size() = " << size << std::endl;
+	for (uint64_t i = 0; i < size; ++i)
+		swap(buf[i], buf[(uint64_t)(prng.rand(size - i) + i)]);
+
+	std::cout << "write : " << output_file_name << endl;
+
+	// 書き出すファイルが2GBを超えるとfstream::write一発では書き出せないのでwrapperを用いる。
+	write_memory_to_file(output_file_name, (void*)&buf[0], (uint64_t)sizeof(PackedSfenValue)*(uint64_t)buf.size());
+
+	std::cout << "..shuffle_on_memory done." << std::endl;
+}
+
+void convert_bin(const vector<string>& filenames , const string& output_file_name)
+{
+	std::fstream fs;
+	auto th = Threads.main();
+	auto &tpos = th->rootPos;
+	// plain形式の雑巾をやねうら王用のpackedsfenvalueに変換する
+	fs.open(output_file_name, ios::app | ios::binary);
+
+	for (auto filename : filenames) {
+		std::cout << "convert " << filename << " ... ";
+		std::string line;
+		ifstream ifs;
+		ifs.open(filename);
+		PackedSfenValue p;
+		p.gamePly = 1; // apery形式では含まれない。一応初期化するべし
+		while (std::getline(ifs, line)) {
+			std::stringstream ss(line);
+			std::string token;
+			std::string value;
+			ss >> token;
+			if (token == "sfen") {
+				StateInfo si;
+				tpos.set(line.substr(5), false, &si, Threads.main());
+				tpos.sfen_pack(p.sfen);
+			}
+			else if (token == "move") {
+				ss >> value;
+				p.move = UCI::to_move(tpos, value);
+			}
+			else if (token == "score") {
+				ss >> p.score;
+			}
+			else if (token == "ply") {
+				int temp;
+				ss >> temp;
+				p.gamePly = uint16_t(temp); // 此処のキャストいらない？
+			}
+			else if (token == "result") {
+				int temp;
+				ss >> temp;
+				p.game_result = int8_t(temp); // 此処のキャストいらない？
+			}
+			else if (token == "e") {
+				fs.write((char*)&p, sizeof(PackedSfenValue));
+				// debug
+				/*
+				std::cout<<tpos<<std::endl;
+				std::cout<<to_usi_string(Move(p.move))<<","<<p.score<<","<<int(p.gamePly)<<","<<int(p.game_result)<<std::endl;
+				*/
+			}
+		}
+		std::cout << "done" << std::endl;
+		ifs.close();
+	}
+	std::cout << "all done" << std::endl;
+	fs.close();
+}
+  
+//void convert_plain(const vector<string>& filenames , const string& output_file_name)
+//{
+//	Position tpos;
+//	std::ofstream ofs;
+//	ofs.open(output_file_name, ios::app);
+//	for (auto filename : filenames) {
+//		std::cout << "convert " << filename << " ... ";
+//
+//		// ひたすらpackedsfenvalueをテキストに変換する
+//		std::fstream fs;
+//		fs.open(filename, ios::in | ios::binary);
+//		PackedSfenValue p;
+//		while (true)
+//		{
+//			if (fs.read((char*)&p, sizeof(PackedSfenValue))) {
+//				// plain textとして書き込む
+//				ofs << "sfen " << tpos.sfen_unpack(p.sfen) << std::endl;
+//				ofs << "move " << to_usi_string(Move(p.move)) << std::endl;
+//				ofs << "score " << p.score << std::endl;
+//				ofs << "ply " << int(p.gamePly) << std::endl;
+//				ofs << "result " << int(p.game_result) << std::endl;
+//				ofs << "e" << std::endl;
+//			}
+//			else {
+//				break;
+//			}
+//		}
+//		fs.close();
+//		std::cout << "done" << std::endl;
+//	}
+//	ofs.close();
+//	std::cout << "all done" << std::endl;
+//}
+
+// 生成した棋譜からの学習
+void learn(Position&, istringstream& is)
+{
+	auto thread_num = (int)Options["Threads"];
+	SfenReader sr(thread_num);
+
+	LearnerThink learn_think(sr);
+	vector<string> filenames;
+
+	// mini_batch_size デフォルトで1M局面。これを大きくできる。
+	auto mini_batch_size = LEARN_MINI_BATCH_SIZE;
+
+	// ループ回数(この回数だけ棋譜ファイルを読み込む)
+	int loop = 1;
+
+	// 棋譜ファイル格納フォルダ(ここから相対pathで棋譜ファイルを取得)
+	string base_dir;
+
+	string target_dir;
+
+	// 0であれば、デフォルト値になる。
+	double eta1 = 0.0;
+	double eta2 = 0.0;
+	double eta3 = 0.0;
+	uint64_t eta1_epoch = 0; // defaultではeta2は適用されない
+	uint64_t eta2_epoch = 0; // defaultではeta3は適用されない
+
+#if defined(USE_GLOBAL_OPTIONS)
+	// あとで復元するために保存しておく。
+	auto oldGlobalOptions = GlobalOptions;
+	// eval hashにhitするとrmseなどの計算ができなくなるのでオフにしておく。
+	GlobalOptions.use_eval_hash = false;
+	// 置換表にhitするとそこで以前の評価値で枝刈りがされることがあるのでオフにしておく。
+	GlobalOptions.use_hash_probe = false;
+#endif
+
+	// --- 教師局面をシャッフルするだけの機能
+
+	// 通常シャッフル
+	bool shuffle_normal = false;
+	uint64_t buffer_size = 20000000;
+	// それぞれのファイルがシャッフルされていると仮定しての高速シャッフル
+	bool shuffle_quick = false;
+	// メモリにファイルを丸読みしてシャッフルする機能。(要、ファイルサイズのメモリ)
+	bool shuffle_on_memory = false;
+	// packed sfenの変換。plainではsfen(string), 評価値(整数), 指し手(例：7g7f, string)、結果(負け-1、勝ち1、引き分け0)からなる
+	bool use_convert_plain = false;
+	// plain形式の教師をやねうら王のbinに変換する
+	bool use_convert_bin = false;
+	// それらのときに書き出すファイル名(デフォルトでは"shuffled_sfen.bin")
+	string output_file_name = "shuffled_sfen.bin";
+
+	// 教師局面の深い探索での評価値の絶対値が、この値を超えていたらその局面は捨てる。
+	int eval_limit = 32000;
+
+	// 評価関数ファイルの保存は終了間際の1回に限定するかのフラグ。
+	bool save_only_once = false;
+
+	// 教師局面を先読みしている分に関してシャッフルする。(1000万局面単位ぐらいのシャッフル)
+	// 事前にシャッフルされているファイルを渡すならオンにすれば良い。
+	bool no_shuffle = false;
+
+#if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
+	// elmo lambda
+	ELMO_LAMBDA = 0.33;
+	ELMO_LAMBDA2 = 0.33;
+	ELMO_LAMBDA_LIMIT = 32000;
+#endif
+
+	// 割引率。これを0以外にすると、PV終端以外でも勾配を加算する。(そのとき、この割引率を適用する)
+	double discount_rate = 0;
+
+	// if (gamePly < rand(reduction_gameply)) continue;
+	// のようにして、序盤を学習対象から程よく除外するためのオプション
+	// 1にしてあるとrand(1)==0なので、何も除外されない。
+	int reduction_gameply = 1;
+
+	// KK/KKP/KPP/KPPPを学習させないオプション項目
+	array<bool,4> freeze = {};
+
+#if defined(EVAL_NNUE)
+	uint64_t nn_batch_size = 1000;
+	double newbob_decay = 1.0;
+	int newbob_num_trials = 2;
+	string nn_options;
+#endif
+
+	uint64_t eval_save_interval = LEARN_EVAL_SAVE_INTERVAL;
+	uint64_t loss_output_interval = 0;
+	uint64_t mirror_percentage = 0;
+
+	string validation_set_file_name;
+
+	// ファイル名が後ろにずらずらと書かれていると仮定している。
+	while (true)
+	{
+		string option;
+		is >> option;
+
+		if (option == "")
+			break;
+
+		// mini-batchの局面数を指定
+		if (option == "bat")
+		{
+			is >> mini_batch_size;
+			mini_batch_size *= 10000; // 単位は万
+		}
+
+		// 棋譜が格納されているフォルダを指定して、根こそぎ対象とする。
+		else if (option == "targetdir") is >> target_dir;
+
+		// ループ回数の指定
+		else if (option == "loop")      is >> loop;
+
+		// 棋譜ファイル格納フォルダ(ここから相対pathで棋譜ファイルを取得)
+		else if (option == "basedir")   is >> base_dir;
+
+		// ミニバッチのサイズ
+		else if (option == "batchsize") is >> mini_batch_size;
+
+		// 学習率
+		else if (option == "eta")        is >> eta1;
+		else if (option == "eta1")       is >> eta1; // alias
+		else if (option == "eta2")       is >> eta2;
+		else if (option == "eta3")       is >> eta3;
+		else if (option == "eta1_epoch") is >> eta1_epoch;
+		else if (option == "eta2_epoch") is >> eta2_epoch;
+
+		// 割引率
+		else if (option == "discount_rate") is >> discount_rate;
+
+		// KK/KKP/KPP/KPPPの学習なし。
+		else if (option == "freeze_kk")    is >> freeze[0];
+		else if (option == "freeze_kkp")   is >> freeze[1];
+		else if (option == "freeze_kpp")   is >> freeze[2];
+
+#if defined(EVAL_KPPT) || defined(EVAL_KPP_KKPT) || defined(EVAL_KPP_KKPT_FV_VAR) || defined(EVAL_NABLA)
+
+#elif defined(EVAL_KPPPT) || defined(EVAL_KPPP_KKPT) || defined(EVAL_HELICES)
+		else if (option == "freeze_kppp")  is >> freeze[3];
+#elif defined(EVAL_KKPP_KKPT) || defined(EVAL_KKPPT)
+		else if (option == "freeze_kkpp")  is >> freeze[3];
+#endif
+
+#if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
+		// LAMBDA
+		else if (option == "lambda")       is >> ELMO_LAMBDA;
+		else if (option == "lambda2")      is >> ELMO_LAMBDA2;
+		else if (option == "lambda_limit") is >> ELMO_LAMBDA_LIMIT;
+
+#endif
+		else if (option == "reduction_gameply") is >> reduction_gameply;
+
+		// シャッフル関連
+		else if (option == "shuffle")	shuffle_normal = true;
+		else if (option == "buffer_size") is >> buffer_size;
+		else if (option == "shuffleq")	shuffle_quick = true;
+		else if (option == "shufflem")	shuffle_on_memory = true;
+		else if (option == "output_file_name") is >> output_file_name;
+
+		else if (option == "eval_limit") is >> eval_limit;
+		else if (option == "save_only_once") save_only_once = true;
+		else if (option == "no_shuffle") no_shuffle = true;
+
+#if defined(EVAL_NNUE)
+		else if (option == "nn_batch_size") is >> nn_batch_size;
+		else if (option == "newbob_decay") is >> newbob_decay;
+		else if (option == "newbob_num_trials") is >> newbob_num_trials;
+		else if (option == "nn_options") is >> nn_options;
+#endif
+		else if (option == "eval_save_interval") is >> eval_save_interval;
+		else if (option == "loss_output_interval") is >> loss_output_interval;
+		else if (option == "mirror_percentage") is >> mirror_percentage;
+		else if (option == "validation_set_file_name") is >> validation_set_file_name;
+		
+		// 雑巾のconvert関連
+		else if (option == "convert_plain") use_convert_plain = true;
+		else if (option == "convert_bin") use_convert_bin = true;
+		// さもなくば、それはファイル名である。
+		else
+			filenames.push_back(option);
+	}
+	if (loss_output_interval == 0)
+		loss_output_interval = LEARN_RMSE_OUTPUT_INTERVAL * mini_batch_size;
+
+	cout << "learn command , ";
+
+	// OpenMP無効なら警告を出すように。
+#if !defined(_OPENMP)
+	cout << "Warning! OpenMP disabled." << endl;
+#endif
+
+	// 学習棋譜ファイルの表示
+	if (target_dir != "")
+	{
+		string kif_base_dir = Path::Combine(base_dir, target_dir);
+
+		// このフォルダを根こそぎ取る。base_dir相対にしておく。
+#if defined(_MSC_VER)
+		// std::tr2を使用するとwaring C4996が出るので抑制。
+		// ※　std::tr2は、std:c++14 の下では既定で非推奨の警告を出し、/std:c++17 では既定で削除された。
+		#pragma warning(push)
+		#pragma warning(disable:4996)
+
+		namespace sys = std::tr2::sys;
+		sys::path p(kif_base_dir); // 列挙の起点
+		std::for_each(sys::directory_iterator(p), sys::directory_iterator(),
+			[&](const sys::path& p) {
+			if (sys::is_regular_file(p))
+				filenames.push_back(Path::Combine(target_dir, p.filename().generic_string()));
+		});
+		#pragma warning(pop)
+
+#elif defined(__GNUC__)
+
+		auto ends_with = [](std::string const & value, std::string const & ending)
+		{
+			if (ending.size() > value.size()) return false;
+			return std::equal(ending.rbegin(), ending.rend(), value.rbegin());
+		};
+
+		// 仕方ないのでdirent.hを用いて読み込む。
+		DIR *dp;       // ディレクトリへのポインタ
+		dirent* entry; // readdir() で返されるエントリーポイント
+
+		dp = opendir(kif_base_dir.c_str());
+		if (dp != NULL)
+		{
+			do {
+				entry = readdir(dp);
+				// ".bin"で終わるファイルのみを列挙
+				// →　連番でファイル生成するときにこの制約ちょっと嫌だな…。
+				if (entry != NULL  && ends_with(entry->d_name, ".bin")  )
+				{
+					//cout << entry->d_name << endl;
+					filenames.push_back(Path::Combine(target_dir, entry->d_name));
+				}
+			} while (entry != NULL);
+			closedir(dp);
+		}
+#endif
+	}
+
+	cout << "learn from ";
+	for (auto s : filenames)
+		cout << s << " , ";
+	cout << endl;
+	if (!validation_set_file_name.empty())
+	{
+		cout << "validation set  : " << validation_set_file_name << endl;
+	}
+
+	cout << "base dir        : " << base_dir   << endl;
+	cout << "target dir      : " << target_dir << endl;
+
+	// シャッフルモード
+	if (shuffle_normal)
+	{
+		cout << "buffer_size     : " << buffer_size << endl;
+		cout << "shuffle mode.." << endl;
+		shuffle_files(filenames,output_file_name , buffer_size);
+		return;
+	}
+	if (shuffle_quick)
+	{
+		cout << "quick shuffle mode.." << endl;
+		shuffle_files_quick(filenames, output_file_name);
+		return;
+	}
+	if (shuffle_on_memory)
+	{
+		cout << "shuffle on memory.." << endl;
+		shuffle_files_on_memory(filenames,output_file_name);
+		return;
+	}
+	//if (use_convert_plain)
+	//{
+	//  	is_ready(true);
+	//	cout << "convert_plain.." << endl;
+	//	convert_plain(filenames,output_file_name);
+	//	return;
+	//	
+	//}
+	if (use_convert_bin)
+	{
+	  	is_ready(true);
+		cout << "convert_bin.." << endl;
+		convert_bin(filenames,output_file_name);
+		return;
+		
+	}
+
+	cout << "loop              : " << loop << endl;
+	cout << "eval_limit        : " << eval_limit << endl;
+	cout << "save_only_once    : " << (save_only_once ? "true" : "false") << endl;
+	cout << "no_shuffle        : " << (no_shuffle ? "true" : "false") << endl;
+
+	// ループ回数分だけファイル名を突っ込む。
+	for (int i = 0; i < loop; ++i)
+		// sfen reader、逆順で読むからここでreverseしておく。すまんな。
+		for (auto it = filenames.rbegin(); it != filenames.rend(); ++it)
+			sr.filenames.push_back(Path::Combine(base_dir, *it));
+
+#if !defined(EVAL_NNUE)
+	cout << "Gradient Method   : " << LEARN_UPDATE      << endl;
+#endif
+	cout << "Loss Function     : " << LOSS_FUNCTION     << endl;
+	cout << "mini-batch size   : " << mini_batch_size   << endl;
+#if defined(EVAL_NNUE)
+	cout << "nn_batch_size     : " << nn_batch_size     << endl;
+	cout << "nn_options        : " << nn_options        << endl;
+#endif
+	cout << "learning rate     : " << eta1 << " , " << eta2 << " , " << eta3 << endl;
+	cout << "eta_epoch         : " << eta1_epoch << " , " << eta2_epoch << endl;
+#if defined(EVAL_NNUE)
+	if (newbob_decay != 1.0) {
+		cout << "scheduling        : newbob with decay = " << newbob_decay
+		     << ", " << newbob_num_trials << " trials" << endl;
+	} else {
+		cout << "scheduling        : default" << endl;
+	}
+#endif
+	cout << "discount rate     : " << discount_rate     << endl;
+
+	// reduction_gameplyに0を設定されるとrand(0)が0除算になってしまうので1に補正。
+	reduction_gameply = max(reduction_gameply, 1);
+	cout << "reduction_gameply : " << reduction_gameply << endl;
+
+#if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
+	cout << "LAMBDA            : " << ELMO_LAMBDA       << endl;
+	cout << "LAMBDA2           : " << ELMO_LAMBDA2      << endl;
+	cout << "LAMBDA_LIMIT      : " << ELMO_LAMBDA_LIMIT << endl;
+#endif
+	cout << "mirror_percentage : " << mirror_percentage << endl;
+	cout << "eval_save_interval  : " << eval_save_interval << " sfens" << endl;
+	cout << "loss_output_interval: " << loss_output_interval << " sfens" << endl;
+
+#if defined(EVAL_KPPT) || defined(EVAL_KPP_KKPT) || defined(EVAL_KPP_KKPT_FV_VAR) || defined(EVAL_NABLA)
+	cout << "freeze_kk/kkp/kpp      : " << freeze[0] << " , " << freeze[1] << " , " << freeze[2] << endl;
+#elif defined(EVAL_KPPPT) || defined(EVAL_KPPP_KKPT) || defined(EVAL_HELICES)
+	cout << "freeze_kk/kkp/kpp/kppp : " << freeze[0] << " , " << freeze[1] << " , " << freeze[2] << " , " << freeze[3] << endl;
+#elif defined(EVAL_KKPP_KKPT) || defined(EVAL_KKPPT)
+	cout << "freeze_kk/kkp/kpp/kkpp : " << freeze[0] << " , " << freeze[1] << " , " << freeze[2] << " , " << freeze[3] << endl;
+#endif
+
+	// -----------------------------------
+	//            各種初期化
+	// -----------------------------------
+
+	cout << "init.." << endl;
+
+	// 評価関数パラメーターの読み込み
+	is_ready(true);
+
+#if !defined(EVAL_NNUE)
+	cout << "init_grad.." << endl;
+
+	// 評価関数パラメーターの勾配配列の初期化
+	Eval::init_grad(eta1,eta1_epoch,eta2,eta2_epoch,eta3);
+#else
+	cout << "init_training.." << endl;
+	Eval::NNUE::InitializeTraining(eta1,eta1_epoch,eta2,eta2_epoch,eta3);
+	Eval::NNUE::SetBatchSize(nn_batch_size);
+	Eval::NNUE::SetOptions(nn_options);
+	if (newbob_decay != 1.0 && !Options["SkipLoadingEval"]) {
+		learn_think.best_nn_directory = std::string(Options["EvalDir"]);
+	}
+#endif
+
+#if 0
+	// 平手の初期局面に対して1.0の勾配を与えてみるテスト。
+	pos.set_hirate();
+	cout << Eval::evaluate(pos) << endl;
+	//Eval::print_eval_stat(pos);
+	Eval::add_grad(pos, BLACK, 32.0 , false);
+	Eval::update_weights(1);
+	pos.state()->sum.p[2][0] = VALUE_NOT_EVALUATED;
+	cout << Eval::evaluate(pos) << endl;
+	//Eval::print_eval_stat(pos);
+#endif
+
+	cout << "init done." << endl;
+
+	// その他、オプション設定を反映させる。
+	learn_think.discount_rate = discount_rate;
+	learn_think.eval_limit = eval_limit;
+	learn_think.save_only_once = save_only_once;
+	learn_think.sr.no_shuffle = no_shuffle;
+	learn_think.freeze = freeze;
+	learn_think.reduction_gameply = reduction_gameply;
+#if defined(EVAL_NNUE)
+	learn_think.newbob_scale = 1.0;
+	learn_think.newbob_decay = newbob_decay;
+	learn_think.newbob_num_trials = newbob_num_trials;
+#endif
+	learn_think.eval_save_interval = eval_save_interval;
+	learn_think.loss_output_interval = loss_output_interval;
+	learn_think.mirror_percentage = mirror_percentage;
+
+	// 局面ファイルをバックグラウンドで読み込むスレッドを起動
+	// (これを開始しないとmseの計算が出来ない。)
+	learn_think.start_file_read_worker();
+
+	learn_think.mini_batch_size = mini_batch_size;
+
+	if (validation_set_file_name.empty()) {
+		// mse計算用にデータ1万件ほど取得しておく。
+		sr.read_for_mse();
+	} else {
+		sr.read_validation_set(validation_set_file_name, eval_limit);
+	}
+
+	// この時点で一度rmseを計算(0 sfenのタイミング)
+	// sr.calc_rmse();
+#if defined(EVAL_NNUE)
+	if (newbob_decay != 1.0) {
+		learn_think.calc_loss(0, -1);
+		learn_think.best_loss = learn_think.latest_loss_sum / learn_think.latest_loss_count;
+		learn_think.latest_loss_sum = 0.0;
+		learn_think.latest_loss_count = 0;
+		cout << "initial loss: " << learn_think.best_loss << endl;
+	}
+#endif
+
+	// -----------------------------------
+	//   評価関数パラメーターの学習の開始
+	// -----------------------------------
+
+	// 学習開始。
+	learn_think.go_think();
+
+	// 最後に一度保存。
+	learn_think.save(true);
+
+#if defined(USE_GLOBAL_OPTIONS)
+	// GlobalOptionsの復元。
+	GlobalOptions = oldGlobalOptions;
+#endif
+}
+
+
+} // namespace Learner
+
+#if defined(GENSFEN2019)
+#include "gensfen2019.cpp"
+#endif
+
+
+#endif // EVAL_LEARN
diff --git a/src/learn/learning_tools.cpp b/src/learn/learning_tools.cpp
new file mode 100644
index 00000000..d3a7858f
--- /dev/null
+++ b/src/learn/learning_tools.cpp
@@ -0,0 +1,256 @@
+﻿#include "learning_tools.h"
+
+#if defined (EVAL_LEARN)
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+#include "../misc.h"
+
+using namespace Eval;
+
+namespace EvalLearningTools
+{
+
+	// --- static variables
+
+	double Weight::eta;
+	double Weight::eta1;
+	double Weight::eta2;
+	double Weight::eta3;
+	uint64_t Weight::eta1_epoch;
+	uint64_t Weight::eta2_epoch;
+
+	std::vector<bool> min_index_flag;
+
+	// --- 個別のテーブルごとの初期化
+
+	void init_min_index_flag()
+	{
+		// mir_piece、inv_pieceの初期化が終わっていなければならない。
+		assert(mir_piece(Eval::f_pawn) == Eval::e_pawn);
+
+		// 次元下げ用フラグ配列の初期化
+		// KPPPに関しては関与しない。
+
+		KK g_kk;
+		g_kk.set(SQUARE_NB, Eval::fe_end, 0);
+		KKP g_kkp;
+		g_kkp.set(SQUARE_NB, Eval::fe_end, g_kk.max_index());
+		KPP g_kpp;
+		g_kpp.set(SQUARE_NB, Eval::fe_end, g_kkp.max_index());
+
+		uint64_t size = g_kpp.max_index();
+		min_index_flag.resize(size);
+
+#pragma omp parallel
+		{
+#if defined(_OPENMP)
+			// Windows環境下でCPUが２つあるときに、論理64コアまでしか使用されないのを防ぐために
+			// ここで明示的にCPUに割り当てる
+			int thread_index = omp_get_thread_num();    // 自分のthread numberを取得
+			WinProcGroup::bindThisThread(thread_index);
+#endif
+
+#pragma omp for schedule(dynamic,20000)
+
+			for (int64_t index_ = 0; index_ < (int64_t)size; ++index_)
+			{
+				// OpenMPの制約からループ変数は符号型でないといけないらしいのだが、
+				// さすがに使いにくい。
+				uint64_t index = (uint64_t)index_;
+
+				if (g_kk.is_ok(index))
+				{
+					// indexからの変換と逆変換によって元のindexに戻ることを確認しておく。
+					// 起動時に1回しか実行しない処理なのでassertで書いておく。
+					assert(g_kk.fromIndex(index).toIndex() == index);
+
+					KK a[KK_LOWER_COUNT];
+					g_kk.fromIndex(index).toLowerDimensions(a);
+
+					// 次元下げの1つ目の要素が元のindexと同一であることを確認しておく。
+					assert(a[0].toIndex() == index);
+
+					uint64_t min_index = UINT64_MAX;
+					for (auto& e : a)
+						min_index = std::min(min_index, e.toIndex());
+					min_index_flag[index] = (min_index == index);
+				}
+				else if (g_kkp.is_ok(index))
+				{
+					assert(g_kkp.fromIndex(index).toIndex() == index);
+
+					KKP x = g_kkp.fromIndex(index);
+					KKP a[KKP_LOWER_COUNT];
+					x.toLowerDimensions(a);
+
+					assert(a[0].toIndex() == index);
+
+					uint64_t min_index = UINT64_MAX;
+					for (auto& e : a)
+						min_index = std::min(min_index, e.toIndex());
+					min_index_flag[index] = (min_index == index);
+				}
+				else if (g_kpp.is_ok(index))
+				{
+					assert(g_kpp.fromIndex(index).toIndex() == index);
+
+					KPP x = g_kpp.fromIndex(index);
+					KPP a[KPP_LOWER_COUNT];
+					x.toLowerDimensions(a);
+
+					assert(a[0].toIndex() == index);
+
+					uint64_t min_index = UINT64_MAX;
+					for (auto& e : a)
+						min_index = std::min(min_index, e.toIndex());
+					min_index_flag[index] = (min_index == index);
+				}
+				else
+				{
+					assert(false);
+				}
+			}
+		}
+	}
+
+	void learning_tools_unit_test_kpp()
+	{
+
+		// KPPの三角配列化にバグがないかテストする
+		// k-p0-p1のすべての組み合わせがきちんとKPPの扱う対象になっていかと、そのときの次元下げが
+		// 正しいかを判定する。
+
+		KK g_kk;
+		g_kk.set(SQUARE_NB, Eval::fe_end, 0);
+		KKP g_kkp;
+		g_kkp.set(SQUARE_NB, Eval::fe_end, g_kk.max_index());
+		KPP g_kpp;
+		g_kpp.set(SQUARE_NB, Eval::fe_end, g_kkp.max_index());
+
+		std::vector<bool> f;
+		f.resize(g_kpp.max_index() - g_kpp.min_index());
+
+		for(auto k = SQUARE_ZERO ; k < SQUARE_NB ; ++k)
+			for(auto p0 = BonaPiece::BONA_PIECE_ZERO; p0 < fe_end ; ++p0)
+				for (auto p1 = BonaPiece::BONA_PIECE_ZERO; p1 < fe_end; ++p1)
+				{
+					KPP kpp_org = g_kpp.fromKPP(k,p0,p1);
+					KPP kpp0;
+					KPP kpp1 = g_kpp.fromKPP(Mir(k), mir_piece(p0), mir_piece(p1));
+					KPP kpp_array[2];
+
+					auto index = kpp_org.toIndex();
+					assert(g_kpp.is_ok(index));
+
+					kpp0 = g_kpp.fromIndex(index);
+
+					//if (kpp0 != kpp_org)
+					//	std::cout << "index = " << index << "," << kpp_org << "," << kpp0 << std::endl;
+
+					kpp0.toLowerDimensions(kpp_array);
+
+					assert(kpp_array[0] == kpp0);
+					assert(kpp0 == kpp_org);
+					assert(kpp_array[1] == kpp1);
+
+					auto index2 = kpp1.toIndex();
+					f[index - g_kpp.min_index()] = f[index2-g_kpp.min_index()] = true;
+				}
+
+		// 抜けてるindexがなかったかの確認。
+		for(size_t index = 0 ; index < f.size(); index++)
+			if (!f[index])
+			{
+				std::cout << index << g_kpp.fromIndex(index + g_kpp.min_index()) <<  std::endl;
+			}
+	}
+
+	void learning_tools_unit_test_kppp()
+	{
+		// KPPPの計算に抜けがないかをテストする
+
+		KPPP g_kppp;
+		g_kppp.set(15, Eval::fe_end,0);
+		uint64_t min_index = g_kppp.min_index();
+		uint64_t max_index = g_kppp.max_index();
+
+		// 最後の要素の確認。
+		//KPPP x = KPPP::fromIndex(max_index-1);
+		//std::cout << x << std::endl;
+
+		for (uint64_t index = min_index; index < max_index; ++index)
+		{
+			KPPP x = g_kppp.fromIndex(index);
+			//std::cout << x << std::endl;
+
+#if 0
+			if ((index % 10000000) == 0)
+				std::cout << "index = " << index << std::endl;
+
+			// index = 9360000000
+			//	done.
+
+			if (x.toIndex() != index)
+			{
+				std::cout << "assertion failed , index = " << index << std::endl;
+			}
+#endif
+
+			assert(x.toIndex() == index);
+
+//			ASSERT((&kppp_ksq_pcpcpc(x.king(), x.piece0(), x.piece1(), x.piece2()) - &kppp[0][0]) == (index - min_index));
+		}
+
+	}
+
+	void learning_tools_unit_test_kkpp()
+	{
+		KKPP g_kkpp;
+		g_kkpp.set(SQUARE_NB, 10000 , 0);
+		uint64_t n = 0;
+		for (int k = 0; k<SQUARE_NB; ++k)
+			for (int i = 0; i<10000; ++i) // 試しに、かなり大きなfe_endを想定して10000で回してみる。
+				for (int j = 0; j < i; ++j)
+				{
+					auto kkpp = g_kkpp.fromKKPP(k, (BonaPiece)i, (BonaPiece)j);
+					auto r = kkpp.toRawIndex();
+					assert(n++ == r);
+					auto kkpp2 = g_kkpp.fromIndex(r + g_kkpp.min_index());
+					assert(kkpp2.king() == k && kkpp2.piece0() == i && kkpp2.piece1() == j);
+				}
+	}
+
+	// このEvalLearningTools全体の初期化
+	void init()
+	{
+		// 初期化は、起動後1回限りで良いのでそのためのフラグ。
+		static bool first = true;
+
+		if (first)
+		{
+			std::cout << "EvalLearningTools init..";
+
+			// mir_piece()とinv_piece()を利用可能にする。
+			// このあとmin_index_flagの初期化を行なうが、そこが
+			// これに依存しているので、こちらを先に行なう必要がある。
+			init_mir_inv_tables();
+
+			//learning_tools_unit_test_kpp();
+			//learning_tools_unit_test_kppp();
+			//learning_tools_unit_test_kkpp();
+
+			// UnitTestを実行するの最後でも良いのだが、init_min_index_flag()にとても時間がかかるので
+			// デバッグ時はこのタイミングで行いたい。
+
+			init_min_index_flag();
+
+			std::cout << "done." << std::endl;
+
+			first = false;
+		}
+	}
+}
+
+#endif
diff --git a/src/learn/learning_tools.h b/src/learn/learning_tools.h
new file mode 100644
index 00000000..65f0887c
--- /dev/null
+++ b/src/learn/learning_tools.h
@@ -0,0 +1,1032 @@
+﻿#ifndef __LEARN_WEIGHT_H__
+#define __LEARN_WEIGHT_H__
+
+// 評価関数の機械学習のときに用いる重み配列などに関する機械学習用ツール類一式
+
+#include "learn.h"
+#if defined (EVAL_LEARN)
+#include "../eval/evaluate_mir_inv_tools.h"
+
+#if defined(SGD_UPDATE) || defined(USE_KPPP_MIRROR_WRITE)
+#include "../misc.h"  // PRNG , my_insertion_sort
+#endif
+
+#include <cmath>	// std::sqrt()
+
+namespace EvalLearningTools
+{
+	// -------------------------------------------------
+	//                     初期化
+	// -------------------------------------------------
+
+	// このEvalLearningTools名前空間にあるテーブル類を初期化する。
+	// 学習の開始までに必ず一度呼び出すこと。
+	// この関数のなかで、init_mir_inv_tables()も呼び出している。
+	// (この関数を呼ぶときは、init_mir_inv_tables()を呼び出す必要はない。)
+	void init();
+
+	// -------------------------------------------------
+	//                     flags
+	// -------------------------------------------------
+
+	// 次元下げしたときに、そのなかの一番小さなindexになることが
+	// わかっているindexに対してtrueとなっているフラグ配列。
+	// この配列もinit()によって初期化される。
+	// KPPPに関しては、関与しない。
+	// ゆえに、この配列の有効なindexの範囲は、KK::min_index()～KPP::max_index()まで。
+	extern std::vector<bool> min_index_flag;
+
+	// -------------------------------------------------
+	//       勾配等を格納している学習用の配列
+	// -------------------------------------------------
+
+#if defined(_MSC_VER)
+#pragma pack(push,2)
+#elif defined(__GNUC__)
+#pragma pack(2)
+#endif
+	struct Weight
+	{
+		// mini-batch 1回分の勾配の累積値
+		LearnFloatType g = LearnFloatType(0);
+
+		// ADA_GRAD_UPDATEのとき。LearnFloatType == floatとして、
+		// 合計 4*2 + 4*2 + 1*2 = 18 bytes
+		// 1GBの評価関数パラメーターに対してその4.5倍のサイズのWeight配列が確保できれば良い。
+		// ただし、構造体のアライメントが4バイト単位になっているとsizeof(Weight)==20なコードが生成されるので
+		// pragma pack(2)を指定しておく。
+
+		// SGD_UPDATE の場合、この構造体はさらに10バイト減って、8バイトで済む。
+
+		// AdaGradなどの学習率η(eta)。
+		// updateFV()が呼び出されるまでにeta1,2,3,eta1_epoch,eta2_epochは設定されているものとする。
+		// update_weights()のepochが、eta1_epochまでeta1から徐々にeta2に変化する。
+		// eta2_epoch以降は、eta2から徐々にeta3に変化する。
+		static double eta;
+		static double eta1;
+		static double eta2;
+		static double eta3;
+		static uint64_t eta1_epoch;
+		static uint64_t eta2_epoch;
+
+		// etaの一括初期化。0が渡された場合、デフォルト値が設定される。
+		static void init_eta(double eta1, double eta2, double eta3, uint64_t eta1_epoch, uint64_t eta2_epoch)
+		{
+			Weight::eta1 = (eta1 != 0) ? eta1 : 30.0;
+			Weight::eta2 = (eta2 != 0) ? eta2 : 30.0;
+			Weight::eta3 = (eta3 != 0) ? eta3 : 30.0;
+			Weight::eta1_epoch = (eta1_epoch != 0) ? eta1_epoch : 0;
+			Weight::eta2_epoch = (eta2_epoch != 0) ? eta2_epoch : 0;
+		}
+
+		// epochに応じたetaを設定してやる。
+		static void calc_eta(uint64_t epoch)
+		{
+			if (Weight::eta1_epoch == 0) // eta2適用除外
+				Weight::eta = Weight::eta1;
+			else if (epoch < Weight::eta1_epoch)
+				// 按分する
+				Weight::eta = Weight::eta1 + (Weight::eta2 - Weight::eta1) * epoch / Weight::eta1_epoch;
+			else if (Weight::eta2_epoch == 0) // eta3適用除外
+				Weight::eta = Weight::eta2;
+			else if (epoch < Weight::eta2_epoch)
+				Weight::eta = Weight::eta2 + (Weight::eta3 - Weight::eta2) * (epoch - Weight::eta1_epoch) / (Weight::eta2_epoch - Weight::eta1_epoch);
+			else
+				Weight::eta = Weight::eta3;
+		}
+
+		template <typename T> void updateFV(T& v) { updateFV(v, 1.0); }
+
+#if defined (ADA_GRAD_UPDATE)
+
+		// floatで正確に計算できる最大値はINT16_MAX*256-1なのでそれより
+		// 小さい値をマーカーにしておく。
+		const LearnFloatType V0_NOT_INIT = (INT16_MAX * 128);
+
+		// vを内部的に保持しているもの。以前の実装ではメモリの節約のために固定小数で小数部だけを保持していたが
+		// 精度的に怪しいし、見通しが悪くなるので廃止した。
+		LearnFloatType v0 = LearnFloatType(V0_NOT_INIT);
+
+		// AdaGradのg2
+		LearnFloatType g2 = LearnFloatType(0);
+
+		// AdaGradでupdateする
+		// この関数を実行しているときにgの値やメンバーが書き変わらないことは
+		// 呼び出し側で保証されている。atomic演算である必要はない。
+		// kはetaに掛かる係数。普通は1.0で良い。手番項に対してetaを下げたいときにここを1/8.0などとする。
+		template <typename T>
+		void updateFV(T& v,double k)
+		{
+			// AdaGradの更新式
+			//   勾配ベクトルをg、更新したいベクトルをv、η(eta)は定数として、
+			//     g2 = g2 + g^2
+			//     v = v - ηg/sqrt(g2)
+
+			constexpr double epsilon = 0.000001;
+
+			if (g == LearnFloatType(0))
+				return;
+
+			g2 += g * g;
+
+			// v0がV0_NOT_INITであるなら、値がKK/KKP/KPP配列の値で初期化されていないということだから、
+			// この場合、vの値を引数で渡されたものから読み込む。
+			double V = (v0 == V0_NOT_INIT) ? v : v0;
+
+			V -= k * eta * (double)g / sqrt((double)g2 + epsilon);
+
+			// Vの値を型の範囲に収まるように制限する。
+			// ちなみに、windows.hがmin,maxマクロを定義してしまうのでそれを回避するために、
+			// ここでは括弧で括ることで関数形式マクロとして扱われないようにしている。
+			V = (std::min)((double)(std::numeric_limits<T>::max)() , V);
+			V = (std::max)((double)(std::numeric_limits<T>::min)() , V);
+
+			v0 = (LearnFloatType)V;
+			v = (T)round(V);
+
+			// この要素に関するmini-batchの1回分の更新が終わったのでgをクリア
+			// g[i] = 0;
+			// →次元下げの問題があるので、これは呼び出し側で行なうことにする。
+		}
+
+#elif defined(SGD_UPDATE)
+
+		// 勾配の符号だけ見るSGDでupdateする
+		// この関数を実行しているときにgの値やメンバーが書き変わらないことは
+		// 呼び出し側で保証されている。atomic演算である必要はない。
+		template <typename T>
+		void updateFV(T & v , double k)
+		{
+			if (g == 0)
+				return;
+
+			// gの符号だけ見てupdateする。
+			// g < 0 なら vを少し足す。
+			// g > 0 なら vを少し引く。
+
+			// 整数しか足さないので小数部不要。
+
+			// 0～5ぐらいずつ動かすのがよさげ。
+			// ガウス分布っぽいほうが良いので5bitの乱数を発生させて(それぞれのbitは1/2の確率で1である)、
+			// それをpop_count()する。このとき、二項分布になっている。
+			//int16_t diff = (int16_t)POPCNT32((u32)prng.rand(31));
+			// →　これ80スレッドでやったら、このAsyncPRNG::rand()がlockするのでslow downした。この実装良くない。
+			int16_t diff = 1;
+
+			double V = v;
+			if (g > 0.0)
+				V-= diff;
+			else
+				V+= diff;
+
+			V = (std::min)((double)(std::numeric_limits<T>::max)(), V);
+			V = (std::max)((double)(std::numeric_limits<T>::min)(), V);
+
+			v = (T)V;
+		}
+
+#endif
+
+		// gradの設定
+		template <typename T> void set_grad(const T& g_) { g = g_; }
+
+		// gradの加算
+		template <typename T> void add_grad(const T& g_) { g += g_; }
+
+		LearnFloatType get_grad() const { return g; }
+	};
+#if defined(_MSC_VER)
+#pragma pack(pop)
+#elif defined(__GNUC__)
+#pragma pack(0)
+#endif
+
+	// 手番つきのweight配列
+	// 透過的に扱えるようにするために、Weightと同じメンバを持たせておいてやる。
+	struct Weight2
+	{
+		Weight w[2];
+
+		// 手番評価、etaを1/8に評価しておく。
+		template <typename T> void updateFV(std::array<T, 2>& v) { w[0].updateFV(v[0] , 1.0); w[1].updateFV(v[1],1.0/8.0); }
+
+		template <typename T> void set_grad(const std::array<T, 2>& g) { for (int i = 0; i<2; ++i) w[i].set_grad(g[i]); }
+		template <typename T> void add_grad(const std::array<T, 2>& g) { for (int i = 0; i<2; ++i) w[i].add_grad(g[i]); }
+
+		std::array<LearnFloatType, 2> get_grad() const { return std::array<LearnFloatType, 2>{w[0].get_grad(), w[1].get_grad()}; }
+	};
+
+	// -------------------------------------------------
+	// Weight配列を直列化したときのindexを計算したりするヘルパー。
+	// -------------------------------------------------
+
+	// KK,KKP,KPP,KKPPの基底クラス
+	// これらのクラスの使い方
+	// 
+	// 1. まずset()で初期化する。例) KK g_kk; g_kk.set(SQUARE_NB,fe_end,0);
+	// 2. 次にfromIndex(),fromKK()などでインスタンスを生成
+	// 3. king(),piece0(),piece1()などのプロパティを用いてアクセス。
+	// 
+	// この説明だけではわかりにくいかも知れないが、学習部のinit_grad(),add_grad(),update_weights()などを見れば
+	// 必要性を含めて理解できると思う。
+	//
+	// 注意 : この派生クラスでは次元下げのために上記のinv_piece/mir_pieceを間接的に参照することがあるので、
+	// 最初にEvalLearningTools::init()かinit_mir_inv_tables()を呼び出して初期化すること。
+	//
+	// 備考) 派生クラス側でoverrideすべきではない関数名には/*final*/と書いてある。
+	//       派生クラス側でoverrideすべき関数は "= 0"をつけて、純粋仮想関数にしてある。
+	//       派生クラス側でoverrideしてもしなくても良い関数はvirtualだけつけてある。
+	//
+	struct SerializerBase
+	{
+
+		// KK,KKP,KPP配列を直列化するときの通し番号の最小値、最大値+1。
+		/*final*/ uint64_t min_index() const { return min_index_; }
+		/*final*/ uint64_t max_index() const { return min_index() + max_raw_index_; }
+
+		// max_index() - min_index()の値。
+		// 派生クラス側でmax_king_sq_,fe_end_などから、値を計算して返すようにする。
+		virtual uint64_t size() const = 0;
+
+		// 与えられたindexが、min_index()以上、max_index()未満にあるかを判定する。
+		/*final*/ bool is_ok(uint64_t index) { return min_index() <= index && index < max_index(); }
+
+		// 必ずこのset()を呼び出して使う。さもなくば、派生クラス側のfromKK()/fromIndex()などでインスタンスを構築して使う。
+		virtual void set(int max_king_sq, uint64_t fe_end, uint64_t min_index)
+		{
+			max_king_sq_ = max_king_sq;
+			fe_end_ = fe_end;
+			min_index_ = min_index;
+			max_raw_index_ = size();
+		}
+
+		// 現在のメンバの値に基いて、直列化されたときのindexを取得する。
+		/*final*/ uint64_t toIndex() const {
+			return min_index() + toRawIndex();
+		}
+
+		// 直列化するときのindexを返す。(min_index()の値は加算する前のもの)
+		virtual uint64_t toRawIndex() const = 0;
+
+	protected:
+		// このクラスの返すmin_index()の値
+		uint64_t min_index_;
+
+		// このクラスの返すmax_index()の値 = min_index() + max_raw_index_
+		// この変数は派生クラスのsize()で計算されたもの。
+		uint64_t max_raw_index_;
+
+		// サポートする玉の升の数(通常SQUARE_NB)
+		int max_king_sq_;
+
+		// サポートするBonaPieceの最大値
+		uint64_t fe_end_;
+
+	};
+
+	struct KK : public SerializerBase
+	{
+	protected:
+		KK(Square king0, Square king1,bool inverse) : king0_(king0), king1_(king1) , inverse_sign(inverse) {}
+	public:
+		KK() {}
+
+		virtual uint64_t size() const { return max_king_sq_ * max_king_sq_; }
+
+		// index(通し番号)からKKのオブジェクトを生成するbuilder
+		KK fromIndex(uint64_t index) const { assert(index >= min_index()); return fromRawIndex(index - min_index()); }
+
+		// raw_index(通し番号ではなく0から始まる番号)からKKのオブジェクトを生成するbuilder
+		KK fromRawIndex(uint64_t raw_index) const
+		{
+			int king1 = (int)(raw_index % SQUARE_NB);
+			raw_index /= SQUARE_NB;
+			int king0 = (int)(raw_index  /* % SQUARE_NB */);
+			assert(king0 < SQUARE_NB);
+			return fromKK((Square)king0, (Square)king1 , false);
+		}
+		KK fromKK(Square king0, Square king1 , bool inverse) const
+		{
+			// kkという変数名はEval::kk配列などで使っているので別の名前にする必要がある。(以下、KKP,KPPクラスなどでも同様)
+			KK my_kk(king0, king1, inverse);
+			my_kk.set(max_king_sq_, fe_end_, min_index());
+			return my_kk;
+		}
+		KK fromKK(Square king0, Square king1) const { return fromKK(king0, king1, false); }
+
+		// fromIndex()を用いてこのオブジェクトを構築したときに、以下のアクセッサで情報が得られる。
+		Square king0() const { return king0_; }
+		Square king1() const { return king1_; }
+
+// 次元下げの数
+#if defined(USE_KK_INVERSE_WRITE)
+	#define KK_LOWER_COUNT 4
+#elif defined(USE_KK_MIRROR_WRITE)
+	#define KK_LOWER_COUNT 2
+#else 
+	#define KK_LOWER_COUNT 1
+#endif
+
+#if defined(USE_KK_INVERSE_WRITE) && !defined(USE_KK_MIRROR_WRITE) 
+		// USE_KK_INVERSE_WRITEわ使うならUSE_KK_MIRROR_WRITEも定義して欲しい。
+		static_assert(false, "define also USE_KK_MIRROR_WRITE!");
+#endif
+
+		// 低次元の配列のindexを得る。
+		// USE_KK_INVERSE_WRITEが有効なときは、それらをinverseしたものが[2],[3]に入る。
+		// この次元下げに関して、gradの符号は反転させないといけないので注意すること。
+		// is_inverse()で判定できるのでこれを利用すると良い。
+		void toLowerDimensions(/*out*/KK kk_[KK_LOWER_COUNT]) const {
+			kk_[0] = fromKK(king0_, king1_,false);
+#if defined(USE_KK_MIRROR_WRITE)
+			kk_[1] = fromKK(Mir(king0_),Mir(king1_),false);
+#if defined(USE_KK_INVERSE_WRITE)
+			kk_[2] = fromKK(Inv(king1_), Inv(king0_),true);
+			kk_[3] = fromKK(Inv(Mir(king1_)) , Inv(Mir(king0_)),true);
+#endif
+#endif
+		}
+
+		// このクラスのmin_index()の値を0として数えたときのindexを取得する。
+		virtual uint64_t toRawIndex() const {
+			return (uint64_t)king0_ * (uint64_t)max_king_sq_ + (uint64_t)king1_;
+		}
+
+		// toLowerDimensionsで次元下げしたものがinverseしたものであるかを返す。
+		bool is_inverse() const {
+			return inverse_sign;
+		}
+
+		// is_inverse() == trueのときに、gradの手番ではないほうの符号を反転させて返す。
+		template <typename T>
+		std::array<T, 2> apply_inverse_sign(const std::array<T, 2>& rhs)
+		{
+			return !is_inverse() ? rhs : std::array<T, 2>{-rhs[0], rhs[1]};
+		}
+
+		// 比較演算子
+		bool operator==(const KK& rhs) { return king0() == rhs.king0() && king1() == rhs.king1(); }
+		bool operator!=(const KK& rhs) { return !(*this == rhs); }
+
+	private:
+		Square king0_, king1_ ;
+		bool inverse_sign;
+	};
+
+	// デバッグ用出力。
+	static std::ostream& operator<<(std::ostream& os, KK rhs)
+	{
+		os << "KK(" << rhs.king0() << "," << rhs.king1() << ")";
+		return os;
+	}
+
+	// KKと同じく。KKP用。
+	struct KKP : public SerializerBase
+	{
+	protected:
+		KKP(Square king0, Square king1, Eval::BonaPiece p) : king0_(king0), king1_(king1), piece_(p), inverse_sign(false) {}
+		KKP(Square king0, Square king1, Eval::BonaPiece p, bool inverse) : king0_(king0), king1_(king1), piece_(p),inverse_sign(inverse) {}
+	public:
+		KKP() {}
+
+		virtual uint64_t size() const { return (uint64_t)max_king_sq_*(uint64_t)max_king_sq_*(uint64_t)fe_end_; }
+
+		// index(通し番号)からKKPのオブジェクトを生成するbuilder
+		KKP fromIndex(uint64_t index) const { assert(index >= min_index()); return fromRawIndex(index - min_index()); }
+
+		// raw_index(通し番号ではなく0から始まる番号)からKKPのオブジェクトを生成するbuilder
+		KKP fromRawIndex(uint64_t raw_index) const
+		{
+			int piece = (int)(raw_index % Eval::fe_end);
+			raw_index /= Eval::fe_end;
+			int king1 = (int)(raw_index % SQUARE_NB);
+			raw_index /= SQUARE_NB;
+			int king0 = (int)(raw_index  /* % SQUARE_NB */);
+			assert(king0 < SQUARE_NB);
+			return fromKKP((Square)king0, (Square)king1, (Eval::BonaPiece)piece,false);
+		}
+
+		KKP fromKKP(Square king0, Square king1, Eval::BonaPiece p, bool inverse) const
+		{
+			KKP my_kkp(king0, king1, p, inverse);
+			my_kkp.set(max_king_sq_,fe_end_,min_index());
+			return my_kkp;
+		}
+		KKP fromKKP(Square king0, Square king1, Eval::BonaPiece p) const { return fromKKP(king0, king1, p, false); }
+
+		// fromIndex()を用いてこのオブジェクトを構築したときに、以下のアクセッサで情報が得られる。
+		Square king0() const { return king0_; }
+		Square king1() const { return king1_; }
+		Eval::BonaPiece piece() const { return piece_; }
+
+		// KKPの次元下げの数
+#if defined(USE_KKP_INVERSE_WRITE)
+		#define KKP_LOWER_COUNT 4
+#elif defined(USE_KKP_MIRROR_WRITE)
+		#define KKP_LOWER_COUNT 2
+#else
+		#define KKP_LOWER_COUNT 1
+#endif
+
+#if defined(USE_KKP_INVERSE_WRITE) && !defined(USE_KKP_MIRROR_WRITE) 
+		// USE_KKP_INVERSE_WRITEわ使うならUSE_KKP_MIRROR_WRITEも定義して欲しい。
+		static_assert(false, "define also USE_KKP_MIRROR_WRITE!");
+#endif
+
+		// 低次元の配列のindexを得る。ミラーしたものがkkp_[1]に返る。
+		// USE_KKP_INVERSE_WRITEが有効なときは、それらをinverseしたものが[2],[3]に入る。
+		// この次元下げに関して、gradの符号は反転させないといけないので注意すること。
+		// is_inverse()で判定できるのでこれを利用すると良い。
+		void toLowerDimensions(/*out*/ KKP kkp_[KKP_LOWER_COUNT]) const {
+			kkp_[0] = fromKKP(king0_, king1_, piece_,false);
+#if defined(USE_KKP_MIRROR_WRITE)
+			kkp_[1] = fromKKP(Mir(king0_), Mir(king1_), mir_piece(piece_),false);
+#if defined(USE_KKP_INVERSE_WRITE)
+			kkp_[2] = fromKKP( Inv(king1_), Inv(king0_), inv_piece(piece_),true);
+			kkp_[3] = fromKKP( Inv(Mir(king1_)), Inv(Mir(king0_)) , inv_piece(mir_piece(piece_)),true);
+#endif
+#endif
+		}
+
+		// このクラスのmin_index()の値を0として数えたときのindexを取得する。
+		virtual uint64_t toRawIndex() const {
+			return  ((uint64_t)king0_ * (uint64_t)max_king_sq_ + (uint64_t)king1_) * (uint64_t)fe_end_ + (uint64_t)piece_;
+		}
+
+		// toLowerDimensionsで次元下げしたものがinverseしたものであるかを返す。
+		bool is_inverse() const {
+			return inverse_sign;
+		}
+
+		// is_inverse() == trueのときに、gradの手番ではないほうの符号を反転させて返す。
+		template <typename T>
+		std::array<T, 2> apply_inverse_sign(const std::array<T, 2>& rhs)
+		{
+			return !is_inverse() ? rhs : std::array<T, 2>{-rhs[0], rhs[1]};
+		}
+
+		// 比較演算子
+		bool operator==(const KKP& rhs) { return king0() == rhs.king0() && king1() == rhs.king1() && piece() == rhs.piece(); }
+		bool operator!=(const KKP& rhs) { return !(*this == rhs); }
+
+	private:
+		Square king0_, king1_;
+		Eval::BonaPiece piece_;
+		bool inverse_sign;
+	};
+
+	// デバッグ用出力。
+	static std::ostream& operator<<(std::ostream& os, KKP rhs)
+	{
+		os << "KKP(" << rhs.king0() << "," << rhs.king1() << "," << rhs.piece() << ")";
+		return os;
+	}
+
+
+	// KK,KKPと同様。KPP用
+	struct KPP : public SerializerBase
+	{
+	protected:
+		KPP(Square king, Eval::BonaPiece p0, Eval::BonaPiece p1) : king_(king), piece0_(p0), piece1_(p1) {}
+
+	public:
+		KPP() {}
+
+		// KK,KKP,KPP配列を直列化するときの通し番号の、KPPの最小値、最大値。
+#if !defined(USE_TRIANGLE_WEIGHT_ARRAY)
+		virtual uint64_t size() const { return (uint64_t)max_king_sq_*(uint64_t)fe_end_*(uint64_t)fe_end_; }
+#else
+		// kpp[SQUARE_NB][fe_end][fe_end]の[fe_end][fe_end]な正方配列の部分を三角配列化する。
+		// kpp[SQUARE_NB][triangle_fe_end]とすると、この三角配列の1行目は要素1個、2行目は2個、…。
+		// ゆえに、triangle_fe_end = 1 + 2 + .. + fe_end = fe_end * (fe_end + 1) / 2
+		virtual uint64_t size() const { return (uint64_t)max_king_sq_*(uint64_t)triangle_fe_end; }
+#endif
+
+		virtual void set(int max_king_sq, uint64_t fe_end, uint64_t min_index)
+		{
+			// この値、size()で用いていて、SerializerBase::set()でsize()を使うので先に計算する。
+			triangle_fe_end = (uint64_t)fe_end*((uint64_t)fe_end + 1) / 2;
+
+			SerializerBase::set(max_king_sq, fe_end, min_index);
+		}
+
+		// index(通し番号)からKPPのオブジェクトを生成するbuilder
+		KPP fromIndex(uint64_t index) const { assert(index >= min_index()); return fromRawIndex(index - min_index()); }
+
+		// raw_index(通し番号ではなく0から始まる番号)からKPPのオブジェクトを生成するbuilder
+		KPP fromRawIndex(uint64_t raw_index) const
+		{
+			const uint64_t triangle_fe_end = (uint64_t)fe_end_*((uint64_t)fe_end_ + 1) / 2;
+
+#if !defined(USE_TRIANGLE_WEIGHT_ARRAY)
+			int piece1 = (int)(raw_index % fe_end_);
+			raw_index /= fe_end_;
+			int piece0 = (int)(raw_index % fe_end_);
+			raw_index /= fe_end_;
+#else
+			uint64_t index2 = raw_index % triangle_fe_end;
+
+			// ここにindex2からpiece0,piece1を求める式を書く。
+			// これは index2 = i * (i+1) / 2 + j の逆関数となる。
+			// j = 0 の場合、i^2 + i - 2 * index2 == 0なので
+			// 2次方程式の解の公式から i = (sqrt(8*index2+1) - 1) / 2である。
+			// iを整数化したのちに、j = index2 - i * (i + 1) / 2としてjを求めれば良い。
+
+			// BonaPieceは32bit(16bitに収まらない可能性)を想定しているのでこの掛け算は64bitでないといけない。
+			int piece1 = int(sqrt(8 * index2 + 1) - 1) / 2;
+			int piece0 = int(index2 - (uint64_t)piece1*((uint64_t)piece1 + 1) / 2);
+
+			assert(piece1 < (int)fe_end_);
+			assert(piece0 < (int)fe_end_);
+			assert(piece0 > piece1);
+
+			raw_index /= triangle_fe_end;
+#endif
+			int king = (int)(raw_index  /* % SQUARE_NB */);
+			assert(king < max_king_sq_);
+			return fromKPP((Square)king, (Eval::BonaPiece)piece0, (Eval::BonaPiece)piece1);
+		}
+
+		KPP fromKPP(Square king, Eval::BonaPiece p0, Eval::BonaPiece p1) const
+		{
+			KPP my_kpp(king, p0, p1);
+			my_kpp.set(max_king_sq_,fe_end_,min_index());
+			return my_kpp;
+		}
+
+		// fromIndex()を用いてこのオブジェクトを構築したときに、以下のアクセッサで情報が得られる。
+		Square king() const { return king_; }
+		Eval::BonaPiece piece0() const { return piece0_; }
+		Eval::BonaPiece piece1() const { return piece1_; }
+
+
+		// 次元下げの数
+#if defined(USE_KPP_MIRROR_WRITE)
+	#if !defined(USE_TRIANGLE_WEIGHT_ARRAY)
+		#define KPP_LOWER_COUNT 4
+	#else
+		#define KPP_LOWER_COUNT 2
+	#endif
+#else
+	#if !defined(USE_TRIANGLE_WEIGHT_ARRAY)
+		#define KPP_LOWER_COUNT 2
+	#else
+		#define KPP_LOWER_COUNT 1
+	#endif
+#endif
+
+		// 低次元の配列のindexを得る。p1,p2を入れ替えたもの、ミラーしたものなどが返る。
+		void toLowerDimensions(/*out*/ KPP kpp_[KPP_LOWER_COUNT]) const {
+
+#if defined(USE_TRIANGLE_WEIGHT_ARRAY)
+			// 三角配列を用いる場合は、piece0とpiece1を入れ替えたものは返らないので注意。
+			kpp_[0] = fromKPP(king_, piece0_, piece1_);
+#if defined(USE_KPP_MIRROR_WRITE)
+			kpp_[1] = fromKPP(Mir(king_), mir_piece(piece0_), mir_piece(piece1_));
+#endif
+
+#else
+			// 三角配列を用いない場合
+			kpp_[0] = fromKPP(king_, piece0_, piece1_);
+			kpp_[1] = fromKPP(king_, piece1_, piece0_);
+#if defined(USE_KPP_MIRROR_WRITE)
+			kpp_[2] = fromKPP(Mir(king_), mir_piece(piece0_), mir_piece(piece1_));
+			kpp_[3] = fromKPP(Mir(king_), mir_piece(piece1_), mir_piece(piece0_));
+#endif
+#endif
+		}
+
+		// このクラスのmin_index()の値を0として数えたときのindexを取得する。
+		virtual uint64_t toRawIndex() const {
+
+#if !defined(USE_TRIANGLE_WEIGHT_ARRAY)
+
+			return ((uint64_t)king_ * (uint64_t)fe_end_ + (uint64_t)piece0_) * (uint64_t)fe_end_ + (uint64_t)piece1_;
+
+#else
+			// Bonanza6.0で使われているのに似せたマクロ
+			auto PcPcOnSq = [&](Square k, Eval::BonaPiece i, Eval::BonaPiece j)
+			{
+
+				// この三角配列の(i,j)は、i行目のj列目の要素。
+				// i行目0列目は、そこまでの要素の合計であるから、1 + 2 + ... + i = i * (i+1) / 2
+				// i行目j列目は、これにjを足したもの。i * (i + 1) /2 + j
+
+				// BonaPiece型は、32bitを想定しているので掛け算には気をつけないとオーバーフローする。
+				return (uint64_t)k * triangle_fe_end + (uint64_t)(uint64_t(i)*(uint64_t(i)+1) / 2 + uint64_t(j));
+			};
+
+			auto k = king_;
+			auto i = piece0_;
+			auto j = piece1_;
+
+			return (i >= j) ? PcPcOnSq(k, i, j) : PcPcOnSq(k, j, i);
+#endif
+		}
+
+		// toLowerDimensionsで次元下げしたものがinverseしたものであるかを返す。
+		// KK,KKPとinterfaceを合せるために用意してある。このKPPクラスでは、このメソッドは常にfalseを返す。
+		bool is_inverse() const {
+			return false;
+		}
+
+		// 比較演算子
+		bool operator==(const KPP& rhs) {
+			return king() == rhs.king() &&
+				((piece0() == rhs.piece0() && piece1() == rhs.piece1())
+#if defined(USE_TRIANGLE_WEIGHT_ARRAY)
+					// 三角配列を用いるときはpiece0とpiece1の入れ替わりを許容する。
+				|| (piece0() == rhs.piece1() && piece1() == rhs.piece0())
+#endif
+					); }
+		bool operator!=(const KPP& rhs) { return !(*this == rhs); }
+
+
+	private:
+		Square king_;
+		Eval::BonaPiece piece0_, piece1_;
+
+		uint64_t triangle_fe_end; // = (uint64_t)fe_end_*((uint64_t)fe_end_ + 1) / 2;
+	};
+
+	// デバッグ用出力。
+	static std::ostream& operator<<(std::ostream& os, KPP rhs)
+	{
+		os << "KPP(" << rhs.king() << "," << rhs.piece0() << "," << rhs.piece1() << ")";
+		return os;
+	}
+
+	// KPPPの4駒関係。ただし、手番ありでミラー等を考慮しないと学習に2TB以上のメモリが必要…。
+	// 三角配列を使っても学習のために50GB×12バイト = 600GB必要。
+	// ミラーしたもののみを格納するようにしてもの半分ぐらい必要。
+	// ここでは、三角配列は必ず用いて、かつミラーしたものを格納するものとする。
+	//
+	// また、このクラスのking()は、実際のkingのSquareとは限らず、単に、0～(king_sq-1)までの値が返る。
+	// これは、ミラーを利用した圧縮を行なう場合など、利用側で適切な玉の位置に変換してやる必要がある。
+	// 
+	// あと、このクラスの返すpiece0,1,2に関して、
+	//   piece0() > piece1() > piece2()
+	// であり、コンストラクタでpiece0,1,2を渡すときも、この制約を守る必要がある。
+	struct KPPP : public SerializerBase
+	{
+	protected:
+		KPPP(int king, Eval::BonaPiece p0, Eval::BonaPiece p1, Eval::BonaPiece p2) :
+			king_(king), piece0_(p0), piece1_(p1), piece2_(p2)
+		{
+			assert(piece0_ > piece1_ && piece1_ > piece2_);
+			/* sort_piece(); */
+		}
+
+	public:
+		KPPP() {}
+
+		virtual uint64_t size() const { return (uint64_t)max_king_sq_*triangle_fe_end; }
+
+		// fe_endとking_sqを設定する。
+		// fe_end : このKPPPクラスの想定するfe_end
+		// king_sq : KPPPのときに扱う玉の升の数。
+		//  3段×ミラーなら3段×5筋 = 15みたいな感じ。
+		//  2段×ミラーなしなら2×9筋 = 18みたいな感じ。
+		//  これをこのKPPPクラスを使う側でset()を用いて最初に設定する。
+		virtual void set(int max_king_sq, uint64_t fe_end,uint64_t min_index) {
+			// この値、size()で用いていて、SerializerBase::set()でsize()を使うので先に計算する。
+			triangle_fe_end = fe_end * (fe_end - 1) * (fe_end - 2) / 6;
+
+			SerializerBase::set(max_king_sq, fe_end, min_index);
+		}
+
+		// 次元下げの数
+		// とりあえず、ミラーの次元下げ非対応。ここでやることもないかと…。
+/*
+#if defined(USE_KPPP_MIRROR_WRITE)
+#define KPPP_LOWER_COUNT 2
+#else
+#define KPPP_LOWER_COUNT 1
+#endif
+*/
+#define KPPP_LOWER_COUNT 1
+
+		// 低次元の配列のindexを得る。
+		// p0,p1,p2を入れ替えたものは返らないので注意。
+		// またミラーしたものも、USE_KPPP_MIRROR_WRITEが有効なときしか返さない。
+		void toLowerDimensions(/*out*/ KPPP kppp_[KPPP_LOWER_COUNT]) const
+		{
+			kppp_[0] = fromKPPP(king_, piece0_, piece1_,piece2_);
+#if KPPP_LOWER_COUNT > 1
+			// mir_pieceするとsortされてない状態になる。sortするコードが必要。
+			Eval::BonaPiece p_list[3] = { mir_piece(piece2_), mir_piece(piece1_), mir_piece(piece0_) };
+			my_insertion_sort(p_list, 0, 3);
+			kppp_[1] = fromKPPP((int)Mir((Square)king_), p_list[2] , p_list[1], p_list[0]);
+#endif
+		}
+
+		// index(通し番号)からKPPPのオブジェクトを生成するbuilder
+		KPPP fromIndex(uint64_t index) const { assert(index >= min_index()); return fromRawIndex(index - min_index()); }
+
+		// raw_index(通し番号ではなく0から始まる番号)からKPPPのオブジェクトを生成するbuilder
+		KPPP fromRawIndex(uint64_t raw_index) const
+		{
+			uint64_t index2 = raw_index % triangle_fe_end;
+
+			// ここにindex2からpiece0,piece1,piece2を求める式を書く。
+			// これは index2 = i(i-1)(i-2)/6-1 + j(j+1)/2 + k の逆関数となる。
+			// j = k = 0 の場合、3次方程式の解の公式から実根は、 i = ...である。(以下式) 
+			// ただしindex2が0,1のときは実数解が複数ある。これを考慮しないといけない。計算精度が足りないことに対する対策必要。
+			// iが求まったあとはiを整数化したのちに、最初の式に入れてKPPのとき同様にjを求めれば良い。
+
+			// この処理、数値計算としてわりと難しい。色々工夫が必要。
+
+			int piece0;
+			if (index2 <= 1)
+			{
+				// index2 == 0,1のときだけ実数解が複数ある。
+				piece0 = (int)index2 + 2;
+
+			} else {
+
+				//double t = pow(sqrt((243 *index2 * index2 - 1) * 3) + 27 * index2, 1.0 / 3);
+				// →　これだとindex2が大きくなるとsqrt()の中身、オーバーフローする。
+
+				// sqrt()の中身がオーバーフローするので、sqrtのなかで3.0を掛けずにsqrtの外側でsqrt(3.0)を掛ける。
+				// sqrt()の中身がオーバーフローするので、index2が大きいときは近似式を用いる。
+
+				double t;
+				
+				if (index2 < 100000000)
+					t = pow(sqrt((243.0 *index2 * index2 - 1)) * sqrt(3.0) + 27 * index2, 1.0 / 3);
+				else
+					// index2が非常に大きいとき、sqrtの中身、近似的に √243 * index2とみなせるだろう。
+					t = pow( index2 * sqrt(243 * 3.0) + 27 * index2, 1.0 / 3);
+				
+				// 丸めのときに計算誤差でわずかに足りないのを防ぐためデルタを加算する。
+				// 大きすぎると1大きい数になってしまう時があるので調整が必要。
+				
+				const double delta = 0.000000001;
+
+				piece0 = int(t / pow(3.0, 2.0 / 3) + 1.0 / (pow(3.0, 1.0 / 3) * t) + delta) + 1;
+				// ううう。ほんまにこんなことせんとあかんのか？(´ω｀)
+			}
+
+			// piece2が求まったので、上式のi(i-1)(i-2)/6(=aとする)のiにpiece2を代入。また、k = 0を代入。
+			// j(j+1)/2 = index2 - a
+			// これは、2次方程式の解の公式より..
+
+			uint64_t a = (uint64_t)piece0*((uint64_t)piece0 - 1)*((uint64_t)piece0 - 2) / 6;
+			int piece1 = int((1 + sqrt(8.0 * (index2 - a ) + 1)) / 2);
+			uint64_t b = (uint64_t)piece1 * (piece1 - 1) / 2;
+			int piece2 = int(index2 - a - b);
+
+#if 0
+			if (!((piece0 > piece1 && piece1 > piece2)))
+			{
+				std::cout << index << " , " << index2 << "," << a << "," << sqrt(8.0 * (index2 - a) + 1);
+			}
+#endif
+
+			assert(piece0 > piece1 && piece1 > piece2);
+
+			assert(piece2 < (int)fe_end_);
+			assert(piece1 < (int)fe_end_);
+			assert(piece0 < (int)fe_end_);
+
+			raw_index /= triangle_fe_end;
+
+			int king = (int)(raw_index  /* % SQUARE_NB */);
+			assert(king < max_king_sq_);
+
+			// king_sqとfe_endに関しては伝播させる。
+			return fromKPPP((Square)king, (Eval::BonaPiece)piece0, (Eval::BonaPiece)piece1 , (Eval::BonaPiece)piece2);
+		}
+
+		// k,p0,p1,p2を指定してKPPPのインスタンスをbuildする。
+		// 内部的に保持しているset()で渡されたking_sqとfe_endは引き継ぐ。
+		KPPP fromKPPP(int king, Eval::BonaPiece p0, Eval::BonaPiece p1, Eval::BonaPiece p2) const
+		{
+			KPPP kppp(king, p0, p1, p2);
+			kppp.set(max_king_sq_, fe_end_,min_index());
+			return kppp;
+		}
+
+		// このクラスのmin_index()の値を0として数えたときのindexを取得する。
+		virtual uint64_t toRawIndex() const {
+
+			// Bonanza 6.0で使われているのに似せたマクロ
+			// 前提条件) i > j > k であること。
+			// i==j,j==kのケースはNG。
+			auto PcPcPcOnSq = [this](int king, Eval::BonaPiece i, Eval::BonaPiece j , Eval::BonaPiece k)
+			{
+				// この三角配列の(i,j,k)は、i行目のj列目の要素。
+				// i行目0列0番目は、そこまでの要素の合計であるから、0 + 0 + 1 + 3 + 6 + ... + (i)*(i-1)/2 = i*(i-1)*(i-2)/ 6
+				// i行目j列0番目は、そこにjを加味したもの。 + j*(j-1) / 2
+				// i行目j列k番目は、そこにkを足したもの。   + k
+				assert(i > j && j > k);
+
+				// BonaPiece型は、32bitを想定しているので掛け算には気をつけないとオーバーフローする。
+				return (uint64_t)king * triangle_fe_end + (uint64_t)(
+						  uint64_t(i)*(uint64_t(i) - 1) * (uint64_t(i) - 2) / 6
+						+ uint64_t(j)*(uint64_t(j) - 1) / 2
+						+ uint64_t(k)
+					);
+			};
+
+			return PcPcPcOnSq(king_, piece0_, piece1_, piece2_);
+		}
+
+		// fromIndex()を用いてこのオブジェクトを構築したときに、以下のアクセッサで情報が得られる。
+		int king() const { return king_; }
+		Eval::BonaPiece piece0() const { return piece0_; }
+		Eval::BonaPiece piece1() const { return piece1_; }
+		Eval::BonaPiece piece2() const { return piece2_; }
+		// toLowerDimensionsで次元下げしたものがinverseしたものであるかを返す。
+		// KK,KKPとinterfaceを合せるために用意してある。このKPPPクラスでは、このメソッドは常にfalseを返す。
+		bool is_inverse() const {
+			return false;
+		}
+
+		// 3角配列化したときの要素の数を返す。kppp配列が、以下のような2次元配列だと想定している。
+		//   kppp[king_sq][triangle_fe_end];
+		uint64_t get_triangle_fe_end() const { return triangle_fe_end; }
+
+		// 比較演算子
+		bool operator==(const KPPP& rhs) {
+			// piece0 > piece1 > piece2を前提とするので、入れ替わりの可能性はない。
+			return king() == rhs.king() && piece0() == rhs.piece0() && piece1() == rhs.piece1() && piece2() == rhs.piece2();
+		}
+		bool operator!=(const KPPP& rhs) { return !(*this == rhs); }
+
+	private:
+
+		int king_;
+		Eval::BonaPiece piece0_, piece1_,piece2_;
+
+		// kppp[king_sq][fe_end][fe_end][fe_end]の[fe_end][fe_end][fe_end]な正方配列の部分を三角配列化する。
+		// kppp[king_sq][triangle_fe_end]とすると、この三角配列の0行目から要素数は、0,0,1,3,…,n行目はn(n-1)/2個。
+		// ゆえに、
+		// triangle_fe_end = Σn(n-1)/2 , n=0..fe_end-1
+		//                 =  fe_end * (fe_end - 1) * (fe_end - 2) / 6
+		uint64_t triangle_fe_end; // ((uint64_t)Eval::fe_end)*((uint64_t)Eval::fe_end - 1)*((uint64_t)Eval::fe_end - 2) / 6;
+	};
+
+	// デバッグ用出力。
+	static std::ostream& operator<<(std::ostream& os, KPPP rhs)
+	{
+		os << "KPPP(" << rhs.king() << "," << rhs.piece0() << "," << rhs.piece1() << "," << rhs.piece2() << ")";
+		return os;
+	}
+
+	// KKPPによる4駒関係の学習用。
+	//
+	// KPPPクラスと同じ設計。KPPPクラスで、pが一枚少ないものとして扱う。
+	// ２つの玉の位置は0～king_sq-1までの値としてencodeされているものとする。
+	//
+	// あと、このクラスの返すpiece0,1に関して、
+	//   piece0() > piece1()
+	// であり、コンストラクタでpiece0,1を渡すときも、この制約を守る必要がある。
+	//
+	// この制約から、BonaPieceZeroをpiece0,piece1に同時に代入して渡すことは出来ない。
+	// 駒落ちの学習に対応させるならevaluate()で工夫が必要。
+	struct KKPP : SerializerBase
+	{
+	protected:
+		KKPP(int king, Eval::BonaPiece p0, Eval::BonaPiece p1) :
+			king_(king), piece0_(p0), piece1_(p1)
+		{
+			assert(piece0_ > piece1_);
+			/* sort_piece(); */
+		}
+
+	public:
+		KKPP() {}
+
+		virtual uint64_t size() const { return (uint64_t)max_king_sq_*triangle_fe_end; }
+
+		// fe_endとking_sqを設定する。
+		// fe_end : このKPPPクラスの想定するfe_end
+		// king_sq : KPPPのときに扱う玉の升の数。
+		//  9段×ミラーなら9段×5筋の2乗(先後の玉) = 45*45 = 2025 みたいな感じ。
+		//  これをこのKKPPクラスを使う側でset()を用いて最初に設定する。
+		void set(int max_king_sq, uint64_t fe_end , uint64_t min_index) {
+			// この値、size()で用いていて、SerializerBase::set()でsize()を使うので先に計算する。
+			triangle_fe_end = fe_end * (fe_end - 1) / 2;
+
+			SerializerBase::set(max_king_sq, fe_end, min_index);
+		}
+
+		// 次元下げの数
+		// とりあえず、ミラーの次元下げ非対応。ここでやることもないかと…。(学習用のメモリがもったいないので)
+#define KKPP_LOWER_COUNT 1
+
+		// 低次元の配列のindexを得る。
+		// p0,p1,p2を入れ替えたものは返らないので注意。
+		// またミラーしたものも、USE_KPPP_MIRROR_WRITEが有効なときしか返さない。
+		void toLowerDimensions(/*out*/ KKPP kkpp_[KPPP_LOWER_COUNT]) const
+		{
+			kkpp_[0] = fromKKPP(king_, piece0_, piece1_);
+
+			// ミラーする場合、mir_pieceするとsortされてない状態になる。sortするコードが必要。
+			// あとking_に対するミラーを定義する必要も。
+		}
+
+		// index(通し番号)からKKPPのオブジェクトを生成するbuilder
+		KKPP fromIndex(uint64_t index) const { assert(index >= min_index()); return fromRawIndex(index - min_index()); }
+
+		// raw_index(通し番号ではなく0から始まる番号)からKKPPのオブジェクトを生成するbuilder
+		KKPP fromRawIndex(uint64_t raw_index) const
+		{
+			uint64_t index2 = raw_index % triangle_fe_end;
+
+			// ここにindex2からpiece0,piece1,piece2を求める式を書く。
+			// これは index2 = i(i-1)/2 + j の逆関数となる。
+			// j=0として、二次方程式の解の公式を用いる。
+			// index2=0のときは重根だが小さいほうはi>jを満たさないので無視。
+
+			int piece0 = (int(sqrt(8 * index2 + 1)) + 1)/2;
+			int piece1 = int(index2 - piece0 * (piece0 - 1) /2 );
+
+			assert(piece0 > piece1);
+
+			assert(piece1 < (int)fe_end_);
+			assert(piece0 < (int)fe_end_);
+
+			raw_index /= triangle_fe_end;
+
+			int king = (int)(raw_index  /* % SQUARE_NB */);
+			assert(king < max_king_sq_);
+
+			// king_sqとfe_endに関しては伝播させる。
+			return fromKKPP(king, (Eval::BonaPiece)piece0, (Eval::BonaPiece)piece1);
+		}
+
+		// k,p0,p1を指定してKKPPのインスタンスをbuildする。
+		// 内部的に保持しているset()で渡されたking_sqとfe_endは引き継ぐ。
+		KKPP fromKKPP(int king, Eval::BonaPiece p0, Eval::BonaPiece p1) const
+		{
+			KKPP kkpp(king, p0, p1);
+			kkpp.set(max_king_sq_, fe_end_,min_index());
+			return kkpp;
+		}
+
+		// このクラスのmin_index()の値を0として数えたときのindexを取得する。
+		virtual uint64_t toRawIndex() const {
+
+			// Bonanza 6.0で使われているのに似せたマクロ
+			// 前提条件) i > jであること。
+			// i==j,j==kのケースはNG。
+			auto PcPcOnSq = [this](int king, Eval::BonaPiece i, Eval::BonaPiece j)
+			{
+				assert(i > j);
+
+				// BonaPiece型は、32bitを想定しているので掛け算には気をつけないとオーバーフローする。
+				return (uint64_t)king * triangle_fe_end + (uint64_t)(
+					+ uint64_t(i)*(uint64_t(i) - 1) / 2
+					+ uint64_t(j)
+					);
+			};
+
+			return PcPcOnSq(king_, piece0_, piece1_);
+		}
+
+		// fromIndex(),fromKKPP()を用いてこのオブジェクトを構築したときに、以下のアクセッサで情報が得られる。
+		int king() const { return king_; }
+		Eval::BonaPiece piece0() const { return piece0_; }
+		Eval::BonaPiece piece1() const { return piece1_; }
+
+		// toLowerDimensionsで次元下げしたものがinverseしたものであるかを返す。
+		// KK,KKPとinterfaceを合せるために用意してある。このKKPPクラスでは、このメソッドは常にfalseを返す。
+		bool is_inverse() const {
+			return false;
+		}
+
+		// 3角配列化したときの要素の数を返す。kkpp配列が、以下のような2次元配列だと想定している。
+		//   kkpp[king_sq][triangle_fe_end];
+		uint64_t get_triangle_fe_end() const { return triangle_fe_end; }
+
+		// 比較演算子
+		bool operator==(const KKPP& rhs) {
+			// piece0 > piece1を前提とするので、入れ替わりの可能性はない。
+			return king() == rhs.king() && piece0() == rhs.piece0() && piece1() == rhs.piece1();
+		}
+		bool operator!=(const KKPP& rhs) { return !(*this == rhs); }
+
+	private:
+
+		int king_;
+		Eval::BonaPiece piece0_, piece1_;
+
+		// kppp[king_sq][fe_end][fe_end]の[fe_end][fe_end]な正方配列の部分を三角配列化する。
+		uint64_t triangle_fe_end = 0;
+		
+	};
+
+	// デバッグ用出力。
+	static std::ostream& operator<<(std::ostream& os, KKPP rhs)
+	{
+		os << "KKPP(" << rhs.king() << "," << rhs.piece0() << "," << rhs.piece1() << ")";
+		return os;
+	}
+
+
+}
+
+#endif // defined (EVAL_LEARN)
+#endif
diff --git a/src/learn/multi_think.cpp b/src/learn/multi_think.cpp
new file mode 100644
index 00000000..2dcb5b46
--- /dev/null
+++ b/src/learn/multi_think.cpp
@@ -0,0 +1,123 @@
+﻿#include "../types.h"
+
+#if defined(EVAL_LEARN)
+
+#include "multi_think.h"
+#include "../tt.h"
+#include "../uci.h"
+
+#include <thread>
+
+void MultiThink::go_think()
+{
+	// あとでOptionsの設定を復元するためにコピーで保持しておく。
+	auto oldOptions = Options;
+
+	// 定跡を用いる場合、on the flyで行なうとすごく時間がかかる＆ファイルアクセスを行なう部分が
+	// thread safeではないので、メモリに丸読みされている状態であることをここで保証する。
+	Options["BookOnTheFly"] = std::string("false");
+
+	// 評価関数の読み込み等
+	// learnコマンドの場合、評価関数読み込み後に評価関数の値を補正している可能性があるので、
+	// メモリの破損チェックは省略する。
+	is_ready(true);
+
+	// 派生クラスのinit()を呼び出す。
+	init();
+
+	// ループ上限はset_loop_max()で設定されているものとする。
+	loop_count = 0;
+	done_count = 0;
+
+	// threadをOptions["Threads"]の数だけ生成して思考開始。
+	std::vector<std::thread> threads;
+	auto thread_num = (size_t)Options["Threads"];
+
+	// worker threadの終了フラグの確保
+	thread_finished.resize(thread_num);
+	
+	// worker threadの起動
+	for (size_t i = 0; i < thread_num; ++i)
+	{
+		thread_finished[i] = 0;
+		threads.push_back(std::thread([i, this]
+		{ 
+			// プロセッサの全スレッドを使い切る。
+			WinProcGroup::bindThisThread(i);
+
+			// オーバーライドされている処理を実行
+			this->thread_worker(i);
+
+			// スレッドが終了したので終了フラグを立てる
+			this->thread_finished[i] = 1;
+		}));
+	}
+
+	// すべてのthreadの終了待ちを
+	// for (auto& th : threads)
+	//  th.join();
+	// のように書くとスレッドがまだ仕事をしている状態でここに突入するので、
+	// その間、callback_func()が呼び出せず、セーブできなくなる。
+	// そこで終了フラグを自前でチェックする必要がある。
+
+	// すべてのスレッドが終了したかを判定する関数
+	auto threads_done = [&]()
+	{
+		// ひとつでも終了していなければfalseを返す
+		for (auto& f : thread_finished)
+			if (!f)
+				return false;
+		return true;
+	};
+
+	// コールバック関数が設定されているならコールバックする。
+	auto do_a_callback = [&]()
+	{
+		if (callback_func)
+			callback_func();
+	};
+
+
+	for (uint64_t i = 0 ; ; )
+	{
+		// 全スレッドが終了していたら、ループを抜ける。
+		if (threads_done())
+			break;
+
+		sleep(1000);
+
+		// callback_secondsごとにcallback_func()が呼び出される。
+		if (++i == callback_seconds)
+		{
+			do_a_callback();
+			// ↑から戻ってきてからカウンターをリセットしているので、
+			// do_a_callback()のなかでsave()などにどれだけ時間がかかろうと
+			// 次に呼び出すのは、そこから一定時間の経過を要する。
+			i = 0;
+		}
+	}
+
+	// 最後の保存。
+	std::cout << std::endl << "finalize..";
+
+	// do_a_callback();
+	// →　呼び出し元で保存するはずで、ここでは要らない気がする。
+
+	// 終了したフラグは立っているがスレッドの終了コードの実行中であるということはありうるので
+	// join()でその終了を待つ必要がある。
+	for (auto& th : threads)
+		th.join();
+
+	// 全スレッドが終了しただけでfileの書き出しスレッドなどはまだ動いていて
+	// 作業自体は完了していない可能性があるのでスレッドがすべて終了したことだけ出力する。
+	std::cout << "all threads are joined." << std::endl;
+
+	// Optionsを書き換えたので復元。
+	// 値を代入しないとハンドラが起動しないのでこうやって復元する。
+	for (auto& s : oldOptions)
+		Options[s.first] = std::string(s.second);
+
+}
+
+
+#endif // defined(EVAL_LEARN)
diff --git a/src/learn/multi_think.h b/src/learn/multi_think.h
new file mode 100644
index 00000000..4d4e0daf
--- /dev/null
+++ b/src/learn/multi_think.h
@@ -0,0 +1,151 @@
+﻿#ifndef _MULTI_THINK_
+#define _MULTI_THINK_
+
+#if defined(EVAL_LEARN)
+
+#include <functional>
+
+#include "../misc.h"
+#include "../learn/learn.h"
+#include "../thread_win32_osx.h"
+
+#include <atomic>
+
+// 棋譜からの学習や、自ら思考させて定跡を生成するときなど、
+// 複数スレッドが個別にSearch::think()を呼び出したいときに用いるヘルパクラス。
+// このクラスを派生させて用いる。
+struct MultiThink
+{
+	MultiThink() : prng(21120903)
+	{
+		loop_count = 0;
+	}
+
+	// マスタースレッドからこの関数を呼び出すと、スレッドがそれぞれ思考して、
+	// 思考終了条件を満たしたところで制御を返す。
+	// 他にやってくれること。
+	// ・各スレッドがLearner::search(),qsearch()を呼び出しても安全なように
+	// 　置換表をスレッドごとに分離してくれる。(終了後、元に戻してくれる。)
+	// ・bookはon the flyモードだとthread safeではないので、このモードを一時的に
+	// 　オフにしてくれる。
+	// [要件]
+	// 1) thread_worker()のオーバーライド
+	// 2) set_loop_max()でループ回数の設定
+	// 3) 定期的にcallbackされる関数を設定する(必要なら)
+	//   callback_funcとcallback_interval
+	void go_think();
+
+	// 派生クラス側で初期化したいものがあればこれをoverrideしておけば、
+	// go_think()で初期化が終わったタイミングで呼び出される。
+	// 定跡の読み込みなどはそのタイミングで行うと良い。
+	virtual void init() {}
+
+	// go_think()したときにスレッドを生成して呼び出されるthread worker
+	// これをoverrideして用いる。
+	virtual void thread_worker(size_t thread_id) = 0;
+
+	// go_think()したときにcallback_seconds[秒]ごとにcallbackされる。
+	std::function<void()> callback_func;
+	uint64_t callback_seconds = 600;
+
+	// workerが処理する(Search::think()を呼び出す)回数を設定する。
+	void set_loop_max(uint64_t loop_max_) { loop_max = loop_max_; }
+	
+	// set_loop_max()で設定した値を取得する。
+	uint64_t get_loop_max() const { return loop_max; }
+
+	// [ASYNC] ループカウンターの値を取り出して、取り出し後にループカウンターを加算する。
+	// もしループカウンターがloop_maxに達していたらUINT64_MAXを返す。
+	// 局面を生成する場合などは、局面を生成するタイミングでこの関数を呼び出すようにしないと、
+	// 生成した局面数と、カウンターの値が一致しなくなってしまうので注意すること。
+	uint64_t get_next_loop_count() {
+		std::unique_lock<Mutex> lk(loop_mutex);
+		if (loop_count >= loop_max)
+			return UINT64_MAX;
+		return loop_count++;
+	}
+
+	// [ASYNC] 処理した個数を返す用。呼び出されるごとにインクリメントされたカウンターが返る。
+	uint64_t get_done_count() {
+		std::unique_lock<Mutex> lk(loop_mutex);
+		return ++done_count;
+	}
+
+	// worker threadがI/Oにアクセスするときのmutex
+	Mutex io_mutex;
+
+protected:
+	// 乱数発生器本体
+	AsyncPRNG prng;
+
+private:
+	// workerが処理する(Search::think()を呼び出す)回数
+	std::atomic<uint64_t> loop_max;
+	// workerが処理した(Search::think()を呼び出した)回数
+	std::atomic<uint64_t> loop_count;
+	// 処理した回数を返す用。
+	std::atomic<uint64_t> done_count;
+
+	// ↑の変数を変更するときのmutex
+	Mutex loop_mutex;
+
+	// スレッドの終了フラグ。
+	// vector<bool>にすると複数スレッドから書き換えようとしたときに正しく反映されないことがある…はず。
+	typedef uint8_t Flag;
+	std::vector<Flag> thread_finished;
+
+};
+
+// idle時間にtaskを処理する仕組み。
+// masterは好きなときにpush_task_async()でtaskを渡す。
+// slaveは暇なときにon_idle()を実行すると、taskを一つ取り出してqueueがなくなるまで実行を続ける。
+// MultiThinkのthread workerをmaster-slave方式で書きたいときに用いると便利。
+struct TaskDispatcher
+{
+	typedef std::function<void(size_t /* thread_id */)> Task;
+
+	// slaveはidle中にこの関数を呼び出す。
+	void on_idle(size_t thread_id)
+	{
+		Task task;
+		while ((task = get_task_async()) != nullptr)
+			task(thread_id);
+
+		sleep(1);
+	}
+
+	// [ASYNC] taskを一つ積む。
+	void push_task_async(Task task)
+	{
+		std::unique_lock<Mutex> lk(task_mutex);
+		tasks.push_back(task);
+	}
+
+	// task用の配列の要素をsize分だけ事前に確保する。
+	void task_reserve(size_t size)
+	{
+		tasks.reserve(size);
+	}
+
+protected:
+	// taskの集合
+	std::vector<Task> tasks;
+
+	// [ASYNC] taskを一つ取り出す。on_idle()から呼び出される。
+	Task get_task_async()
+	{
+		std::unique_lock<Mutex> lk(task_mutex);
+		if (tasks.size() == 0)
+			return nullptr;
+		Task task = *tasks.rbegin();
+		tasks.pop_back();
+		return task;
+	}
+
+	// tasksにアクセスするとき用のmutex
+	Mutex task_mutex;
+};
+
+#endif // defined(EVAL_LEARN) && defined(YANEURAOU_2018_OTAFUKU_ENGINE)
+
+#endif
diff --git a/src/misc.cpp b/src/misc.cpp
index 69c6bacc..3bf4fddc 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -42,6 +42,7 @@ typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY);
 #endif
 
 #include <fstream>
+#include <functional>
 #include <iomanip>
 #include <iostream>
 #include <sstream>
@@ -316,6 +317,27 @@ void bindThisThread(size_t idx) {
 
 } // namespace WinProcGroup
 
+// 現在時刻を文字列化したもを返す。(評価関数の学習時などに用いる)
+std::string now_string()
+{
+  // std::ctime(), localtime()を使うと、MSVCでセキュアでないという警告が出る。
+  // C++標準的にはそんなことないはずなのだが…。
+
+#if defined(_MSC_VER)
+  // C4996 : 'ctime' : This function or variable may be unsafe.Consider using ctime_s instead.
+#pragma warning(disable : 4996)
+#endif
+
+  auto now = std::chrono::system_clock::now();
+  auto tp = std::chrono::system_clock::to_time_t(now);
+  auto result = string(std::ctime(&tp));
+
+  // 末尾に改行コードが含まれているならこれを除去する
+  while (*result.rbegin() == '\n' || (*result.rbegin() == '\r'))
+    result.pop_back();
+  return result;
+}
+
 void sleep(int ms)
 {
 	std::this_thread::sleep_for(std::chrono::milliseconds(ms));
@@ -331,3 +353,127 @@ void* aligned_malloc(size_t size, size_t align)
 	}
 	return p;
 }
+
+int read_file_to_memory(std::string filename, std::function<void* (uint64_t)> callback_func)
+{
+  fstream fs(filename, ios::in | ios::binary);
+  if (fs.fail())
+    return 1;
+
+  fs.seekg(0, fstream::end);
+  uint64_t eofPos = (uint64_t)fs.tellg();
+  fs.clear(); // これをしないと次のseekに失敗することがある。
+  fs.seekg(0, fstream::beg);
+  uint64_t begPos = (uint64_t)fs.tellg();
+  uint64_t file_size = eofPos - begPos;
+  //std::cout << "filename = " << filename << " , file_size = " << file_size << endl;
+
+  // ファイルサイズがわかったのでcallback_funcを呼び出してこの分のバッファを確保してもらい、
+  // そのポインターをもらう。
+  void* ptr = callback_func(file_size);
+
+  // バッファが確保できなかった場合や、想定していたファイルサイズと異なった場合は、
+  // nullptrを返すことになっている。このとき、読み込みを中断し、エラーリターンする。
+  if (ptr == nullptr)
+    return 2;
+
+  // 細切れに読み込む
+
+  const uint64_t block_size = 1024 * 1024 * 1024; // 1回のreadで読み込む要素の数(1GB)
+  for (uint64_t pos = 0; pos < file_size; pos += block_size)
+  {
+    // 今回読み込むサイズ
+    uint64_t read_size = (pos + block_size < file_size) ? block_size : (file_size - pos);
+    fs.read((char*)ptr + pos, read_size);
+
+    // ファイルの途中で読み込みエラーに至った。
+    if (fs.fail())
+      return 2;
+
+    //cout << ".";
+  }
+  fs.close();
+
+  return 0;
+}
+
+int write_memory_to_file(std::string filename, void* ptr, uint64_t size)
+{
+  fstream fs(filename, ios::out | ios::binary);
+  if (fs.fail())
+    return 1;
+
+  const uint64_t block_size = 1024 * 1024 * 1024; // 1回のwriteで書き出す要素の数(1GB)
+  for (uint64_t pos = 0; pos < size; pos += block_size)
+  {
+    // 今回書き出すメモリサイズ
+    uint64_t write_size = (pos + block_size < size) ? block_size : (size - pos);
+    fs.write((char*)ptr + pos, write_size);
+    //cout << ".";
+  }
+  fs.close();
+  return 0;
+}
+
+// ----------------------------
+//     mkdir wrapper
+// ----------------------------
+
+// カレントフォルダ相対で指定する。成功すれば0、失敗すれば非0が返る。
+// フォルダを作成する。日本語は使っていないものとする。
+// どうもmsys2環境下のgccだと_wmkdir()だとフォルダの作成に失敗する。原因不明。
+// 仕方ないので_mkdir()を用いる。
+
+#if defined(_WIN32)
+// Windows用
+
+#if defined(_MSC_VER)
+#include <codecvt>	// mkdirするのにwstringが欲しいのでこれが必要
+#include <locale>   // wstring_convertにこれが必要。
+
+namespace Dependency {
+  int mkdir(std::string dir_name)
+  {
+    std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> cv;
+    return _wmkdir(cv.from_bytes(dir_name).c_str());
+    //	::CreateDirectory(cv.from_bytes(dir_name).c_str(),NULL);
+  }
+}
+
+#elif defined(__GNUC__) 
+
+#include <direct.h>
+namespace Dependency {
+  int mkdir(std::string dir_name)
+  {
+    return _mkdir(dir_name.c_str());
+  }
+}
+
+#endif
+#elif defined(_LINUX)
+
+// linux環境において、この_LINUXというシンボルはmakefileにて定義されるものとする。
+
+// Linux用のmkdir実装。
+#include "sys/stat.h"
+
+namespace Dependency {
+  int mkdir(std::string dir_name)
+  {
+    return ::mkdir(dir_name.c_str(), 0777);
+  }
+}
+#else
+
+// Linux環境かどうかを判定するためにはmakefileを分けないといけなくなってくるな..
+// linuxでフォルダ掘る機能は、とりあえずナシでいいや..。評価関数ファイルの保存にしか使ってないし…。
+
+namespace Dependency {
+  int mkdir(std::string dir_name)
+  {
+    return 0;
+  }
+}
+
+#endif
diff --git a/src/misc.h b/src/misc.h
index 5b63ef1c..6ce75a4d 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -24,11 +24,13 @@
 #include <algorithm>
 #include <cassert>
 #include <chrono>
+#include <functional>
 #include <ostream>
 #include <string>
 #include <vector>
 
 #include "types.h"
+#include "thread_win32_osx.h"
 
 const std::string engine_info(bool to_uci = false);
 void prefetch(void* addr);
@@ -98,8 +100,20 @@ public:
   /// Output values only have 1/8th of their bits set on average.
   template<typename T> T sparse_rand()
   { return T(rand64() & rand64() & rand64()); }
+
+  // 0からn-1までの乱数を返す。(一様分布ではないが現実的にはこれで十分)
+  uint64_t rand(uint64_t n) { return rand<uint64_t>() % n; }
+
+  // 内部で使用している乱数seedを返す。
+  uint64_t get_seed() const { return s; }
 };
 
+// 乱数のseedを表示する。(デバッグ用)
+inline std::ostream& operator<<(std::ostream& os, PRNG& prng)
+{
+  os << "PRNG::seed = " << std::hex << prng.get_seed() << std::dec;
+  return os;
+}
 
 /// Under Windows it is not possible for a process to run on more than one
 /// logical processor group. This usually means to be limited to use max 64
@@ -114,6 +128,9 @@ namespace WinProcGroup {
 // 指定されたミリ秒だけsleepする。
 extern void sleep(int ms);
 
+// 現在時刻を文字列化したもを返す。(評価関数の学習時などにログ出力のために用いる)
+std::string now_string();
+
 // 途中での終了処理のためのwrapper
 static void my_exit()
 {
@@ -121,6 +138,54 @@ static void my_exit()
 	exit(EXIT_FAILURE);
 }
 
+// msys2、Windows Subsystem for Linuxなどのgcc/clangでコンパイルした場合、
+// C++のstd::ifstreamで::read()は、一発で2GB以上のファイルの読み書きが出来ないのでそのためのwrapperである。
+//
+// read_file_to_memory()の引数のcallback_funcは、ファイルがオープン出来た時点でそのファイルサイズを引数として
+// callbackされるので、バッファを確保して、その先頭ポインタを返す関数を渡すと、そこに読み込んでくれる。
+// これらの関数は、ファイルが見つからないときなどエラーの際には非0を返す。
+//
+// また、callbackされた関数のなかでバッファが確保できなかった場合や、想定していたファイルサイズと異なった場合は、
+// nullptrを返せば良い。このとき、read_file_to_memory()は、読み込みを中断し、エラーリターンする。
+
+int read_file_to_memory(std::string filename, std::function<void* (uint64_t)> callback_func);
+int write_memory_to_file(std::string filename, void* ptr, uint64_t size);
+
+// --------------------
+//    PRNGのasync版
+// --------------------
+
+// PRNGのasync版
+struct AsyncPRNG
+{
+  AsyncPRNG(uint64_t seed) : prng(seed) { assert(seed); }
+  // [ASYNC] 乱数を一つ取り出す。
+  template<typename T> T rand() {
+    std::unique_lock<Mutex> lk(mutex);
+    return prng.rand<T>();
+  }
+
+  // [ASYNC] 0からn-1までの乱数を返す。(一様分布ではないが現実的にはこれで十分)
+  uint64_t rand(uint64_t n) {
+    std::unique_lock<Mutex> lk(mutex);
+    return prng.rand(n);
+  }
+
+  // 内部で使用している乱数seedを返す。
+  uint64_t get_seed() const { return prng.get_seed(); }
+
+protected:
+  Mutex mutex;
+  PRNG prng;
+};
+
+// 乱数のseedを表示する。(デバッグ用)
+inline std::ostream& operator<<(std::ostream& os, AsyncPRNG& prng)
+{
+  os << "AsyncPRNG::seed = " << std::hex << prng.get_seed() << std::dec;
+  return os;
+}
+
 // --------------------
 //       Math
 // --------------------
@@ -176,4 +241,39 @@ struct Path
 extern void* aligned_malloc(size_t size, size_t align);
 static void aligned_free(void* ptr) { _mm_free(ptr); }
 
+// alignasを指定しているのにnewのときに無視される＆STLのコンテナがメモリ確保するときに無視するので、
+// そのために用いるカスタムアロケーター。
+template <typename T>
+class AlignedAllocator {
+public:
+  using value_type = T;
+
+  AlignedAllocator() {}
+  AlignedAllocator(const AlignedAllocator&) {}
+  AlignedAllocator(AlignedAllocator&&) {}
+
+  template <typename U> AlignedAllocator(const AlignedAllocator<U>&) {}
+
+  T* allocate(std::size_t n) { return (T*)aligned_malloc(n * sizeof(T), alignof(T)); }
+  void deallocate(T* p, std::size_t n) { aligned_free(p); }
+};
+
+// --------------------
+//  Dependency Wrapper
+// --------------------
+
+namespace Dependency
+{
+  // Linux環境ではgetline()したときにテキストファイルが'\r\n'だと
+  // '\r'が末尾に残るのでこの'\r'を除去するためにwrapperを書く。
+  // そのため、fstreamに対してgetline()を呼び出すときは、
+  // std::getline()ではなく単にgetline()と書いて、この関数を使うべき。
+  extern bool getline(std::ifstream& fs, std::string& s);
+
+  // フォルダを作成する。
+  // カレントフォルダ相対で指定する。dir_nameに日本語は使っていないものとする。
+  // 成功すれば0、失敗すれば非0が返る。
+  extern int mkdir(std::string dir_name);
+}
+
 #endif // #ifndef MISC_H_INCLUDED
diff --git a/src/movegen.h b/src/movegen.h
index aeba93ad..5dda654a 100644
--- a/src/movegen.h
+++ b/src/movegen.h
@@ -68,6 +68,9 @@ struct MoveList {
     return std::find(begin(), end(), move) != end();
   }
 
+  // i�Ԗڂ̗v�f��Ԃ�
+  const ExtMove at(size_t i) const { assert(0 <= i && i < size()); return begin()[i]; }
+
 private:
   ExtMove moveList[MAX_MOVES], *last;
 };
diff --git a/src/position.cpp b/src/position.cpp
index 23ce5168..a3f05a87 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -1480,3 +1480,12 @@ PieceNumber Position::piece_no_of(Square sq) const
   return n;
 }
 #endif  // defined(EVAL_NNUE)
+
+#if defined(EVAL_LEARN)
+// ���ǖʂŎw���肪�Ȃ������e�X�g����B�w���萶�����[�`����p����̂ő����Ȃ��B�T�����ɂ͎g��Ȃ����ƁB
+bool Position::is_mated() const
+{
+  // �s���ŋl�߂������ł���p�^�[���͂Ȃ��̂�LEGAL_ALL�ł���K�v�͂Ȃ��B
+  return MoveList<LEGAL>(*this).size() == 0;
+}
+#endif // EVAL_LEARN
diff --git a/src/position.h b/src/position.h
index c6e4f9c9..2387dd1c 100644
--- a/src/position.h
+++ b/src/position.h
@@ -80,6 +80,9 @@ typedef std::unique_ptr<std::deque<StateInfo>> StateListPtr;
 /// traversing the search tree.
 class Thread;
 
+// pack���ꂽsfen
+struct PackedSfen { uint8_t data[32]; };
+
 class Position {
 public:
   static void init();
@@ -187,6 +190,29 @@ public:
   const Eval::EvalList* eval_list() const { return &evalList; }
 #endif  // defined(EVAL_NNUE)
 
+#if defined(EVAL_LEARN)
+  // ���ǖʂŎw���肪�Ȃ������e�X�g����B�w���萶�����[�`����p����̂ő����Ȃ��B�T�����ɂ͎g��Ȃ����ƁB
+  bool is_mated() const;
+
+  // -- sfen���w���p
+
+  // pack���ꂽsfen�𓾂�B�����Ɏw�肵���o�b�t�@�ɕԂ��B
+  // gamePly��pack�Ɋ܂߂Ȃ��B
+  void sfen_pack(PackedSfen& sfen);
+
+  // ��sfen���o�R����ƒx���̂Œ���pack���ꂽsfen���Z�b�g����֐���������B
+  // pos.set(sfen_unpack(data),si,th); �Ɠ����B
+  // �n���ꂽ�ǖʂɖ�肪�����āA�G���[�̂Ƃ��͔�0��Ԃ��B
+  // PackedSfen��gamePly�͊܂܂Ȃ��̂ŕ����ł��Ȃ��B������ݒ肵�����̂ł���Έ����Ŏw�肷�邱�ƁB
+  int set_from_packed_sfen(const PackedSfen& sfen, StateInfo* si, Thread* th, bool mirror = false);
+
+  // �ՖʂƎ��A��Ԃ�^���āA����sfen��Ԃ��B
+  //static std::string sfen_from_rawdata(Piece board[81], Hand hands[2], Color turn, int gamePly);
+
+  // c���̋ʂ̈ʒu��Ԃ��B
+  Square king_square(Color c) const { return pieceList[make_piece(c, KING)][0]; }
+#endif // EVAL_LEARN
+
 private:
   // Initialization helpers (used while setting up a position)
   void set_castling_right(Color c, Square rfrom);
diff --git a/src/search.cpp b/src/search.cpp
index a3ce4c2d..279c6d8a 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1721,3 +1721,283 @@ void Tablebases::rank_root_moves(Position& pos, Search::RootMoves& rootMoves) {
             m.tbRank = 0;
     }
 }
+
+// --- �w�K���ɗp����Adepth�Œ�T���Ȃǂ̊֐����O���ɑ΂��Č��J
+
+#if defined (EVAL_LEARN)
+
+namespace Learner
+{
+  // �w�K�p�ɁA1�̃X���b�h����search,qsearch()���Ăяo����悤�ȃX�^�u��p�ӂ���B
+  // ���܂ɂ��Ďv���΁AApery�̂悤��Searcher�������ăX���b�h���Ƃɒu���\�Ȃǂ�p�ӂ���ق���
+  // �ǂ����������m��Ȃ��B
+
+  // �w�K�̂��߂̏������B
+  // Learner::search(),Learner::qsearch()����Ăяo�����B
+  void init_for_search(Position& pos, Stack* ss)
+  {
+
+    // RootNode��ss->ply == 0�����̏����B
+    // �[���N���A����̂ŁAss->ply == 0�ƂȂ�̂ő��v�c�B
+
+    memset(ss - 4, 0, 7 * sizeof(Stack));
+
+    // Search::Limits�Ɋւ���
+    // ���̃����o�[�ϐ���global�Ȃ̂ő��̃X���b�h�ɉe�����y�ڂ��̂ŋC�����邱�ƁB
+    {
+      auto& limits = Search::Limits;
+
+      // �T����"go infinite"�R�}���h�����ɂ���B(time management�����ƍ��邽��)
+      limits.infinite = true;
+
+      // PV��\�������Ǝז��Ȃ̂ŏ����Ă����B
+      //limits.silent = true;
+
+      // �����p����Ɗe�X���b�h��nodes��ώZ�������̂Ɣ�r����Ă��܂��B�䂦�Ɏg�p���Ȃ��B
+      limits.nodes = 0;
+
+      // depth���ALearner::search()�̈����Ƃ��ēn���ꂽ���̂ŏ�������B
+      limits.depth = 0;
+
+      // ���������t�߂̎萔�ň��������̒l���Ԃ�̂�h�����߂ɑ傫�Ȓl�ɂ��Ă����B
+      //limits.max_game_ply = 1 << 16;
+
+      // ���ʃ��[��������Ă����Ȃ��ƈ��������ɂȂ��Č������ɂ����B
+      //limits.enteringKingRule = EnteringKingRule::EKR_27_POINT;
+    }
+
+    // DrawValue�̐ݒ�
+    {
+      // �X���b�h���Ƃɗp�ӂ��ĂȂ��̂�
+      // ���̃X���b�h�ŏ㏑�����ꂩ�˂Ȃ��B�d�����Ȃ����B
+      // �ǂ��������Ȃ�Ȃ�A0�ɂ��ׂ����Ǝv���B
+      //drawValueTable[REPETITION_DRAW][BLACK] = VALUE_ZERO;
+      //drawValueTable[REPETITION_DRAW][WHITE] = VALUE_ZERO;
+    }
+
+    // this_thread�Ɋւ��āB
+    {
+      auto th = pos.this_thread();
+
+      th->completedDepth = DEPTH_ZERO;
+      th->selDepth = 0;
+      th->rootDepth = DEPTH_ZERO;
+
+      // �T���m�[�h���̃[��������
+      th->nodes = 0;
+
+      // history�ނ�S���N���A����B���̏������͏������Ԃ������邵�A�T���̐��x�͂ނ��뉺����̂őP���͂悭�킩��Ȃ��B
+      // th->clear();
+
+      for (int i = 4; i > 0; i--)
+        (ss - i)->continuationHistory = &th->continuationHistory[SQUARE_ZERO][NO_PIECE];
+
+      // rootMoves�̐ݒ�
+      auto& rootMoves = th->rootMoves;
+
+      rootMoves.clear();
+      for (auto m : MoveList<LEGAL>(pos))
+        rootMoves.push_back(Search::RootMove(m));
+
+      assert(!rootMoves.empty());
+
+      //#if defined(USE_GLOBAL_OPTIONS)
+      // �T���X���b�h���Ƃ̒u���\�̐�����Ǘ����Ă���͂��Ȃ̂ŁA
+      // �V�K�̒T���ł��邩��A���̃X���b�h�ɑ΂���u���\�̐���𑝂₷�B
+            //TT.new_search(th->thread_id());
+
+            // ��������new_search���Ăяo����1��O�̒T�����ʂ��g���Ȃ��đ��Ƃ������Ƃ͂���̂ł́c�B
+            // �����ł���͂�炸�ɁA�Ăяo������1�ǂ��Ƃ�TT.new_search(th->thread_id())�����ׂ��ł́c�B
+
+            // ���@����̏I�ǐ}�Ɏ���̂�����������̂ŁA���t�������ɂ͒u���\�͑S�X�����ʂŎg���悤�ɂ���B
+      //#endif
+    }
+  }
+
+  // �ǂ݋؂ƕ]���l�̃y�A�BLearner::search(),Learner::qsearch()���Ԃ��B
+  typedef std::pair<Value, std::vector<Move> > ValueAndPV;
+
+  // �Î~�T���B
+  //
+  // �O�����) pos.set_this_thread(Threads[thread_id])�ŒT���X���b�h���ݒ肳��Ă��邱�ƁB
+  // �@�܂��AThreads.stop������ƒT���𒆒f���Ă��܂��̂ŁA���̂Ƃ���PV�͐������Ȃ��B
+  // �@search()����߂������ƁAThreads.stop == true�Ȃ�A���̒T�����ʂ�p���Ă͂Ȃ�Ȃ��B
+  // �@���ƁA�Ăяo���O�́AThreads.stop == false�̏�ԂŌĂяo���Ȃ��ƁA�T���𒆒f���ĕԂ��Ă��܂��̂Œ��ӁB
+  //
+  // �l�܂���Ă���ꍇ�́APV�z���MOVE_RESIGN���Ԃ�B
+  //
+  // ������alpha,beta���w��ł���悤�ɂ��Ă������A���ꂪ���̑��ŒT�������Ƃ��̌��ʂ�
+  // �u���\�ɏ������ނ̂ŁA���̑��ɑ΂��Ď}���肪�o����悤�Ȓl���������܂�Ċw�K�̂Ƃ���
+  // �����e��������̂ŁA���͈̔͂��w��ł���悤�ɂ���̂���߂邱�Ƃɂ����B
+  ValueAndPV qsearch(Position& pos)
+  {
+    Stack stack[MAX_PLY + 7], * ss = stack + 4;
+    Move pv[MAX_PLY + 1];
+    std::vector<Move> pvs;
+
+    init_for_search(pos, ss);
+    ss->pv = pv; // �Ƃ肠�����_�~�[�łǂ����o�b�t�@���Ȃ��Ƃ����Ȃ��B
+
+    // �l�܂���Ă���̂�
+    if (pos.is_mated())
+    {
+      pvs.push_back(MOVE_NONE);
+      return ValueAndPV(mated_in(/*ss->ply*/ 0 + 1), pvs);
+    }
+
+    auto bestValue = ::qsearch<PV>(pos, ss, -VALUE_INFINITE, VALUE_INFINITE, DEPTH_ZERO);
+
+    // ����ꂽPV��Ԃ��B
+    for (Move* p = &ss->pv[0]; is_ok(*p); ++p)
+      pvs.push_back(*p);
+
+    return ValueAndPV(bestValue, pvs);
+  }
+
+  // �ʏ�T���B�[��depth(�����Ŏw��)�B
+  // 3��ǂݎ��̃X�R�A���~�����Ȃ�A
+  //   auto v = search(pos,3);
+  // �̂悤�ɂ��ׂ��B
+  // v.first�ɕ]���l�Av.second��PV��������B
+  // multi pv���L���̂Ƃ��́Apos.this_thread()->rootMoves[N].pv�ɂ���PV(�ǂ݋�)�̔z�񂪓�����B
+  // multi pv�̎w��͂��̊֐��̈���multiPV�ōs�Ȃ��B(Options["MultiPV"]�̒l�͖��������)
+  // 
+  // root�ł̐錾��������͂��Ȃ��̂�(�������ʓ|�Ȃ̂�)�A�����ł͍s��Ȃ��B
+  // �Ăяo�����ŏ������邱�ƁB
+  //
+  // �O�����) pos.set_this_thread(Threads[thread_id])�ŒT���X���b�h���ݒ肳��Ă��邱�ƁB
+  // �@�܂��AThreads.stop������ƒT���𒆒f���Ă��܂��̂ŁA���̂Ƃ���PV�͐������Ȃ��B
+  // �@search()����߂������ƁAThreads.stop == true�Ȃ�A���̒T�����ʂ�p���Ă͂Ȃ�Ȃ��B
+  // �@���ƁA�Ăяo���O�́AThreads.stop == false�̏�ԂŌĂяo���Ȃ��ƁA�T���𒆒f���ĕԂ��Ă��܂��̂Œ��ӁB
+
+  ValueAndPV search(Position& pos, int depth_, size_t multiPV /* = 1 */, uint64_t nodesLimit /* = 0 */)
+  {
+    std::vector<Move> pvs;
+
+    Depth depth = depth_ * ONE_PLY;
+    if (depth < DEPTH_ZERO)
+      return std::pair<Value, std::vector<Move>>(Eval::evaluate(pos), std::vector<Move>());
+
+    if (depth == DEPTH_ZERO)
+      return qsearch(pos);
+
+    Stack stack[MAX_PLY + 7], * ss = stack + 4;
+    Move pv[MAX_PLY + 1];
+
+    init_for_search(pos, ss);
+
+    ss->pv = pv; // �Ƃ肠�����_�~�[�łǂ����o�b�t�@���Ȃ��Ƃ����Ȃ��B
+
+    // this_thread�Ɋ֘A����ϐ��̏�����
+    auto th = pos.this_thread();
+    auto& rootDepth = th->rootDepth;
+    auto& pvIdx = th->pvIdx;
+    auto& rootMoves = th->rootMoves;
+    auto& completedDepth = th->completedDepth;
+    auto& selDepth = th->selDepth;
+
+    // bestmove�Ƃ��Ă����̋ǖʂ̏��N��T������@�\
+    //size_t multiPV = Options["MultiPV"];
+
+    // ���̋ǖʂł̎w����̐��������Ă͂����Ȃ�
+    multiPV = std::min(multiPV, rootMoves.size());
+
+    // �m�[�h������MultiPV�̒l���|���Ă����Ȃ��ƁAdepth�Œ�AMultiPV����ɂ����Ƃ���1�̌���ɓ���node�����v�l�������ƂɂȂ�Ȃ��B
+    nodesLimit *= multiPV;
+
+    Value alpha = -VALUE_INFINITE;
+    Value beta = VALUE_INFINITE;
+    Value delta = -VALUE_INFINITE;
+    Value bestValue = -VALUE_INFINITE;
+
+    while ((rootDepth += ONE_PLY) <= depth
+      // node�����𒴂����ꍇ�����̃��[�v�𔲂���
+      // �T���m�[�h���́A���̊֐��̈����œn����Ă���B
+      && !(nodesLimit /*node��������*/ && th->nodes.load(std::memory_order_relaxed) >= nodesLimit)
+      )
+    {
+      for (RootMove& rm : rootMoves)
+        rm.previousScore = rm.score;
+
+      // MultiPV
+      for (pvIdx = 0; pvIdx < multiPV && !Threads.stop; ++pvIdx)
+      {
+        // ���ꂼ���depth��PV line�ɑ΂���USI info�ŏo�͂���selDepth
+        selDepth = 0;
+
+        // depth 5�ȏ�ɂ����Ă�aspiration search�ɐ؂�ւ���B
+        if (rootDepth >= 5 * ONE_PLY)
+        {
+          delta = Value(20);
+
+          Value p = rootMoves[pvIdx].previousScore;
+
+          alpha = std::max(p - delta, -VALUE_INFINITE);
+          beta = std::min(p + delta, VALUE_INFINITE);
+        }
+
+        // aspiration search
+        int failedHighCnt = 0;
+        while (true)
+        {
+          Depth adjustedDepth = std::max(ONE_PLY, rootDepth - failedHighCnt * ONE_PLY);
+          bestValue = ::search<PV>(pos, ss, alpha, beta, adjustedDepth, false);
+
+          stable_sort(rootMoves.begin() + pvIdx, rootMoves.end());
+          //my_stable_sort(pos.this_thread()->thread_id(),&rootMoves[0] + pvIdx, rootMoves.size() - pvIdx);
+
+          // fail low/high�ɑ΂���aspiration window���L����B
+          // �������A�����Ŏw�肳��Ă����l�ɂȂ��Ă�����A����fail low/high�����Ƃ���break����B
+          if (bestValue <= alpha)
+          {
+            beta = (alpha + beta) / 2;
+            alpha = std::max(bestValue - delta, -VALUE_INFINITE);
+
+            failedHighCnt = 0;
+            //if (mainThread)
+            //    mainThread->stopOnPonderhit = false;
+
+          }
+          else if (bestValue >= beta)
+          {
+            beta = std::min(bestValue + delta, VALUE_INFINITE);
+            ++failedHighCnt;
+          }
+          else
+            break;
+
+          delta += delta / 4 + 5;
+          assert(-VALUE_INFINITE <= alpha && beta <= VALUE_INFINITE);
+
+          // �\���`�F�b�N
+          //assert(th->nodes.load(std::memory_order_relaxed) <= 1000000 );
+        }
+
+        stable_sort(rootMoves.begin(), rootMoves.begin() + pvIdx + 1);
+        //my_stable_sort(pos.this_thread()->thread_id() , &rootMoves[0] , pvIdx + 1);
+
+      } // multi PV
+
+      completedDepth = rootDepth;
+    }
+
+    // ����PV�A�r����NULL_MOVE�̉\�������邩���m��Ȃ��̂Ŕr�����邽�߂�is_ok()��ʂ��B
+    // ���@PV�Ȃ̂�NULL_MOVE�͂��Ȃ����ƂɂȂ��Ă���͂������A
+    //     MOVE_WIN���˂����܂�Ă��邱�Ƃ͂Ȃ��B(���܂̂Ƃ���)
+    for (Move move : rootMoves[0].pv)
+    {
+      if (!is_ok(move))
+        break;
+      pvs.push_back(move);
+    }
+
+    //sync_cout << rootDepth << sync_endl;
+
+    // multiPV�����l�����āArootMoves[0]��score��bestValue�Ƃ��ĕԂ��B
+    bestValue = rootMoves[0].score;
+
+    return ValueAndPV(bestValue, pvs);
+  }
+
+}
+#endif
diff --git a/src/types.h b/src/types.h
index 5270ccd6..9b06ac0d 100644
--- a/src/types.h
+++ b/src/types.h
@@ -235,8 +235,8 @@ enum Square : int {
   SQ_A8, SQ_B8, SQ_C8, SQ_D8, SQ_E8, SQ_F8, SQ_G8, SQ_H8,
   SQ_NONE,
 
-  SQUARE_NB = 64,
-  SQUARE_NB_PLUS1 = SQUARE_NB + 1, // �ʂ����Ȃ��ꍇ�ASQ_NB�Ɉړ��������̂Ƃ��Ĉ������߁A�z���SQ_NB+1�Ŋm�ۂ��Ȃ��Ƃ����Ȃ��Ƃ�������̂ł��̒萔��p����B
+  SQUARE_ZERO = 0, SQUARE_NB = 64,
+  SQUARE_NB_PLUS1 = SQUARE_NB + 1, // �ʂ����Ȃ��ꍇ�ASQUARE_NB�Ɉړ��������̂Ƃ��Ĉ������߁A�z���SQUARE_NB+1�Ŋm�ۂ��Ȃ��Ƃ����Ȃ��Ƃ�������̂ł��̒萔��p����B
 };
 
 enum Direction : int {
@@ -362,10 +362,6 @@ constexpr Square operator~(Square s) {
   return Square(s ^ SQ_A8); // Vertical flip SQ_A1 -> SQ_A8
 }
 
-constexpr Square inverse(Square s) {
-	return static_cast<Square>(static_cast<int>(SQUARE_NB) - s - 1);
-}
-
 constexpr File operator~(File f) {
   return File(f ^ FILE_H); // Horizontal flip FILE_A -> FILE_H
 }
@@ -464,6 +460,12 @@ constexpr bool is_ok(Move m) {
   return from_sq(m) != to_sq(m); // Catch MOVE_NULL and MOVE_NONE
 }
 
+// �Ֆʂ�180���񂵂��Ƃ��̏��ڂ�Ԃ�
+constexpr Square Inv(Square sq) { return (Square)((SQUARE_NB - 1) - sq); }
+
+// �Ֆʂ��~���[�����Ƃ��̏��ڂ�Ԃ�
+constexpr Square Mir(Square sq) { return make_square(File(7 - (int)file_of(sq)), rank_of(sq)); }
+
 #if defined(EVAL_NNUE)
 // --------------------
 //        �
diff --git a/src/uci.cpp b/src/uci.cpp
index b47398ad..9a2f90ec 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -37,12 +37,10 @@ using namespace std;
 
 extern vector<string> setup_bench(const Position&, istream&);
 
+// FEN string of the initial position, normal chess
+const char* StartFEN = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1";
+
 namespace {
-
-  // FEN string of the initial position, normal chess
-  const char* StartFEN = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1";
-
-
   // position() is called when engine receives the "position" UCI command.
   // The function sets up the position described in the given FEN string ("fen")
   // or the starting position ("startpos") and then makes the moves given in the
@@ -179,74 +177,74 @@ namespace {
 
   // check sum���v�Z�����Ƃ��A�����ۑ����Ă����Ă��ƂŎ���ȍ~�A�������̃`�F�b�N���s�Ȃ��B
   uint64_t eval_sum;
+} // namespace
 
-  // is_ready_cmd()���O������Ăяo����悤�ɂ��Ă����B(bench�R�}���h�Ȃǂ���Ăяo����������)
-  // �ǖʂ͏���������Ȃ��̂Œ��ӁB
-  void is_ready(Position& pos, istringstream& is, StateListPtr& states)
-  {
+// is_ready_cmd()���O������Ăяo����悤�ɂ��Ă����B(bench�R�}���h�Ȃǂ���Ăяo����������)
+// �ǖʂ͏���������Ȃ��̂Œ��ӁB
+void is_ready(bool skipCorruptCheck)
+{
 #if defined(EVAL_NNUE)
-    // "isready"���󂯎�������ƁA"readyok"��Ԃ��܂�5�b���Ƃɉ��s�𑗂�悤�ɏC������B(keep alive�I�ȏ���)
-    //	USI2.0�̎d�l���B
-    //  -"isready"�̂��Ƃ�time out���Ԃ́A30�b���x�Ƃ���B����𒴂��āA�]���֐��̏������Ahash�e�[�u���̊m�ۂ��������ꍇ�A
-    //  �v�l�G���W�����������I�ɉ��炩�̃��b�Z�[�W(���s��)�𑗂�ׂ��ł���B
-    //  -ShogiGUI�ł͂��łɂ����Ȃ��Ă���̂ŁAMyShogi������ɒǐ�����B
-    //  -�܂��A��˂��牤�̃G���W�����́A"isready"���󂯎�������ƁA"readyok"��Ԃ��܂�5�b���Ƃɉ��s�𑗂�悤�ɏC������B
+  // "isready"���󂯎�������ƁA"readyok"��Ԃ��܂�5�b���Ƃɉ��s�𑗂�悤�ɏC������B(keep alive�I�ȏ���)
+  //	USI2.0�̎d�l���B
+  //  -"isready"�̂��Ƃ�time out���Ԃ́A30�b���x�Ƃ���B����𒴂��āA�]���֐��̏������Ahash�e�[�u���̊m�ۂ��������ꍇ�A
+  //  �v�l�G���W�����������I�ɉ��炩�̃��b�Z�[�W(���s��)�𑗂�ׂ��ł���B
+  //  -ShogiGUI�ł͂��łɂ����Ȃ��Ă���̂ŁAMyShogi������ɒǐ�����B
+  //  -�܂��A��˂��牤�̃G���W�����́A"isready"���󂯎�������ƁA"readyok"��Ԃ��܂�5�b���Ƃɉ��s�𑗂�悤�ɏC������B
 
-    auto ended = false;
-    auto th = std::thread([&ended] {
-      int count = 0;
-      while (!ended)
+  auto ended = false;
+  auto th = std::thread([&ended] {
+    int count = 0;
+    while (!ended)
+    {
+      std::this_thread::sleep_for(std::chrono::milliseconds(100));
+      if (++count >= 50 /* 5�b */)
       {
-        std::this_thread::sleep_for(std::chrono::milliseconds(100));
-        if (++count >= 50 /* 5�b */)
-        {
-          count = 0;
-          sync_cout << sync_endl; // ���s�𑗐M����B
-        }
+        count = 0;
+        sync_cout << sync_endl; // ���s�𑗐M����B
       }
-      });
-
-    // �]���֐��̓ǂݍ��݂Ȃǎ��Ԃ̂�����ł��낤�����͂��̃^�C�~���O�ōs�Ȃ��B
-    // �N�����Ɏ��Ԃ̂����鏈�������Ă��܂��Ə��������^�C���A�E�g��������āA�v�l�G���W���Ƃ��Ă̔F�������^�C�A���Ă��܂��B
-    if (!UCI::load_eval_finished)
-    {
-      // �]���֐��̓ǂݍ���
-      Eval::load_eval();
-
-      // �`�F�b�N�T���̌v�Z�ƕۑ�(���̌�̃������j���̃`�F�b�N�̂���)
-      eval_sum = Eval::calc_check_sum();
-
-      // �\�t�g���̕\��
-      Eval::print_softname(eval_sum);
-
-      UCI::load_eval_finished = true;
-
-    }
-    else
-    {
-      // ���������j�󂳂�Ă��Ȃ����𒲂ׂ邽�߂Ƀ`�F�b�N�T���𖈉񒲂ׂ�B
-      // ���Ԃ��������������Ȃ��C�����邪.. 0.1�b���炢�̂��ƂȂ̂ŗǂ��Ƃ���B
-      if (eval_sum != Eval::calc_check_sum())
-        sync_cout << "Error! : EVAL memory is corrupted" << sync_endl;
     }
+    });
 
-    // isready�ɑ΂��Ă�readyok��Ԃ��܂Ŏ��̃R�}���h�����Ȃ����Ƃ͖񑩂���Ă���̂�
-    // ���̃^�C�~���O�Ŋe��ϐ��̏����������Ă����B
+  // �]���֐��̓ǂݍ��݂Ȃǎ��Ԃ̂�����ł��낤�����͂��̃^�C�~���O�ōs�Ȃ��B
+  // �N�����Ɏ��Ԃ̂����鏈�������Ă��܂��Ə��������^�C���A�E�g��������āA�v�l�G���W���Ƃ��Ă̔F�������^�C�A���Ă��܂��B
+  if (!UCI::load_eval_finished)
+  {
+    // �]���֐��̓ǂݍ���
+    Eval::load_eval();
 
-    TT.resize(Options["Hash"]);
-    Search::clear();
-    Time.availableNodes = 0;
+    // �`�F�b�N�T���̌v�Z�ƕۑ�(���̌�̃������j���̃`�F�b�N�̂���)
+    eval_sum = Eval::calc_check_sum();
 
-    Threads.stop = false;
+    // �\�t�g���̕\��
+    Eval::print_softname(eval_sum);
 
-    // keep alive�𑗐M���邽�߂ɐ��������X���b�h���I�������A�ҋ@����B
-    ended = true;
-    th.join();
+    UCI::load_eval_finished = true;
+
+  }
+  else
+  {
+    // ���������j�󂳂�Ă��Ȃ����𒲂ׂ邽�߂Ƀ`�F�b�N�T���𖈉񒲂ׂ�B
+    // ���Ԃ��������������Ȃ��C�����邪.. 0.1�b���炢�̂��ƂȂ̂ŗǂ��Ƃ���B
+    if (!skipCorruptCheck && eval_sum != Eval::calc_check_sum())
+      sync_cout << "Error! : EVAL memory is corrupted" << sync_endl;
+  }
+
+  // isready�ɑ΂��Ă�readyok��Ԃ��܂Ŏ��̃R�}���h�����Ȃ����Ƃ͖񑩂���Ă���̂�
+  // ���̃^�C�~���O�Ŋe��ϐ��̏����������Ă����B
+
+  TT.resize(Options["Hash"]);
+  Search::clear();
+  Time.availableNodes = 0;
+
+  Threads.stop = false;
+
+  // keep alive�𑗐M���邽�߂ɐ��������X���b�h���I�������A�ҋ@����B
+  ended = true;
+  th.join();
 #endif  // defined(EVAL_NNUE)
 
-    sync_cout << "readyok" << sync_endl;
-  }
-} // namespace
+  sync_cout << "readyok" << sync_endl;
+}
 
 
 /// UCI::loop() waits for a command from stdin, parses it and calls the appropriate
@@ -296,7 +294,7 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "go")         go(pos, is, states);
       else if (token == "position")   position(pos, is, states);
       else if (token == "ucinewgame") Search::clear();
-      else if (token == "isready")    is_ready(pos, is, states);
+      else if (token == "isready")    is_ready();
 
       // Additional custom non-UCI commands, mainly for debugging
       else if (token == "flip")  pos.flip();
diff --git a/src/uci.h b/src/uci.h
index 4a7771ca..dac881c1 100644
--- a/src/uci.h
+++ b/src/uci.h
@@ -81,4 +81,12 @@ extern bool load_eval_finished; // = false;
 
 extern UCI::OptionsMap Options;
 
+// USI��"isready"�R�}���h���Ăяo���ꂽ�Ƃ��̏����B���̂Ƃ��ɕ]���֐��̓ǂݍ��݂Ȃǂ��s�Ȃ��B
+// benchmark�R�}���h�̃n���h���Ȃǂ�"isready"�����Ă��Ȃ��Ƃ��ɕ]���֐���ǂݍ��܂������Ƃ��ɗp����B
+// skipCorruptCheck == true�̂Ƃ��͕]���֐���2�x�ڂ̓ǂݍ��݂̂Ƃ���check sum�ɂ�郁�����j���`�F�b�N���ȗ�����B
+// ���@���̊֐��́AStockfish�ɂ͂Ȃ����Ȃ��ƕs�ւȂ̂Œǉ����Ă����B
+void is_ready(bool skipCorruptCheck = false);
+
+extern const char* StartFEN;
+
 #endif // #ifndef UCI_H_INCLUDED

From f58d6161985bb4c9e054b00e0841d1b20e630d34 Mon Sep 17 00:00:00 2001
From: Hisayori Noda <nodchip@gmail.com>
Date: Tue, 18 Jun 2019 20:28:50 +0900
Subject: [PATCH 007/583] Fixed compile errors when EVAL_LEARN or EVAL_NNUE are
 defined.

---
 src/eval/evaluate_common.h          |  5 ++---
 src/eval/evaluate_mir_inv_tools.cpp |  6 +++++-
 src/eval/evaluate_mir_inv_tools.h   |  4 ++++
 src/evaluate.cpp                    | 13 ++++++++++++-
 src/evaluate.h                      |  4 ++--
 src/learn/learning_tools.h          |  2 ++
 src/position.h                      |  9 +++++----
 src/types.h                         |  4 ++--
 8 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/src/eval/evaluate_common.h b/src/eval/evaluate_common.h
index 889fda7a..5d5d05b1 100644
--- a/src/eval/evaluate_common.h
+++ b/src/eval/evaluate_common.h
@@ -3,7 +3,7 @@
 
 // いまどきの手番つき評価関数(EVAL_KPPTとEVAL_KPP_KKPT)の共用header的なもの。
 
-#if defined (EVAL_KPPT) || defined(EVAL_KPP_KKPT) || defined(EVAL_NNUE)
+#if defined(EVAL_NNUE) || defined(EVAL_LEARN)
 #include <functional>
 
 // KKファイル名
@@ -77,7 +77,6 @@ namespace Eval
 
 }
 
-
-#endif
+#endif  // defined(EVAL_NNUE) || defined(EVAL_LEARN)
 
 #endif // _EVALUATE_KPPT_COMMON_H_
\ No newline at end of file
diff --git a/src/eval/evaluate_mir_inv_tools.cpp b/src/eval/evaluate_mir_inv_tools.cpp
index a0cf7461..56a0a63e 100644
--- a/src/eval/evaluate_mir_inv_tools.cpp
+++ b/src/eval/evaluate_mir_inv_tools.cpp
@@ -1,4 +1,6 @@
-﻿#include "evaluate_mir_inv_tools.h"
+﻿#if defined(EVAL_NNUE) || defined(EVAL_LEARN)
+
+#include "evaluate_mir_inv_tools.h"
 
 namespace Eval
 {
@@ -184,3 +186,5 @@ namespace Eval
 	}
 
 }
+
+#endif  // defined(EVAL_NNUE) || defined(EVAL_LEARN)
diff --git a/src/eval/evaluate_mir_inv_tools.h b/src/eval/evaluate_mir_inv_tools.h
index 6e82ce58..8d6378ec 100644
--- a/src/eval/evaluate_mir_inv_tools.h
+++ b/src/eval/evaluate_mir_inv_tools.h
@@ -1,6 +1,8 @@
 ﻿#ifndef _EVALUATE_MIR_INV_TOOLS_
 #define _EVALUATE_MIR_INV_TOOLS_
 
+#if defined(EVAL_NNUE) || defined(EVAL_LEARN)
+
 // BonaPieceのmirror(左右反転)やinverse(盤上の180度回転)させた駒を得るためのツール類。
 
 #include "../types.h"
@@ -40,4 +42,6 @@ namespace Eval
 	extern void init_mir_inv_tables();
 }
 
+#endif  // defined(EVAL_NNUE) || defined(EVAL_LEARN)
+
 #endif
diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 65c7155a..dee72d64 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -911,7 +911,7 @@ std::string Eval::trace(const Position& pos) {
   return ss.str();
 }
 
-#if defined(EVAL_NNUE)
+#if defined(EVAL_NNUE) || defined(EVAL_LEARN)
 namespace Eval {
 ExtBonaPiece kpp_board_index[PIECE_NB] = {
     { BONA_PIECE_ZERO, BONA_PIECE_ZERO },
@@ -980,4 +980,15 @@ bool EvalList::is_valid(const Position& pos)
   return true;
 }
 }
+#endif  // defined(EVAL_NNUE) || defined(EVAL_LEARN)
+
+#if !defined(EVAL_NNUE)
+namespace Eval {
+void evaluate_with_no_return(const Position& pos) {}
+void update_weights(uint64_t epoch, const std::array<bool, 4> & freeze) {}
+void init_grad(double eta1, uint64_t eta_epoch, double eta2, uint64_t eta2_epoch, double eta3) {}
+void add_grad(Position& pos, Color rootColor, double delt_grad, const std::array<bool, 4> & freeze) {}
+void save_eval(std::string suffix) {}
+double get_eta() {}
+}
 #endif  // defined(EVAL_NNUE)
diff --git a/src/evaluate.h b/src/evaluate.h
index f31ea142..6115eeb4 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -37,7 +37,7 @@ Value evaluate(const Position& pos);
 
 void evaluate_with_no_return(const Position& pos);
 
-#if defined(EVAL_NNUE)
+#if defined(EVAL_NNUE) || defined(EVAL_LEARN)
 // �]���֐��t�@�C����ǂݍ��ށB
 // ����́A"is_ready"�R�}���h�̉�������1�x�����Ăяo�����B2�x�Ăяo�����Ƃ͑z�肵�Ă��Ȃ��B
 // (�������AEvalDir(�]���֐��t�H���_)���ύX�ɂȂ������ƁAisready���ēx�����Ă�����ǂ݂Ȃ����B)
@@ -216,7 +216,7 @@ struct DirtyPiece
 	int dirty_num;
 
 };
-#endif  // defined(EVAL_NNUE)
+#endif  // defined(EVAL_NNUE) || defined(EVAL_LEARN)
 }
 
 #endif // #ifndef EVALUATE_H_INCLUDED
diff --git a/src/learn/learning_tools.h b/src/learn/learning_tools.h
index 65f0887c..2bcd3f35 100644
--- a/src/learn/learning_tools.h
+++ b/src/learn/learning_tools.h
@@ -5,6 +5,8 @@
 
 #include "learn.h"
 #if defined (EVAL_LEARN)
+#include <array>
+
 #include "../eval/evaluate_mir_inv_tools.h"
 
 #if defined(SGD_UPDATE) || defined(USE_KPPP_MIRROR_WRITE)
diff --git a/src/position.h b/src/position.h
index 2387dd1c..547320ea 100644
--- a/src/position.h
+++ b/src/position.h
@@ -28,6 +28,7 @@
 #include <string>
 
 #include "bitboard.h"
+#include "evaluate.h"
 #include "misc.h"
 #include "types.h"
 
@@ -179,7 +180,7 @@ public:
   bool pos_is_ok() const;
   void flip();
 
-#if defined(EVAL_NNUE)
+#if defined(EVAL_NNUE) || defined(EVAL_LEARN)
   // --- StateInfo
 
   // ���݂̋ǖʂɑΉ�����StateInfo��Ԃ��B
@@ -188,7 +189,7 @@ public:
 
   // �]���֐��Ŏg�����߂́A�ǂ̋�ԍ��̋�ǂ��ɂ��邩�Ȃǂ̏��B
   const Eval::EvalList* eval_list() const { return &evalList; }
-#endif  // defined(EVAL_NNUE)
+#endif  // defined(EVAL_NNUE) || defined(EVAL_LEARN)
 
 #if defined(EVAL_LEARN)
   // ���ǖʂŎw���肪�Ȃ������e�X�g����B�w���萶�����[�`����p����̂ő����Ȃ��B�T�����ɂ͎g��Ȃ����ƁB
@@ -248,10 +249,10 @@ private:
   StateInfo* st;
   bool chess960;
 
-#if defined(EVAL_NNUE)
+#if defined(EVAL_NNUE) || defined(EVAL_LEARN)
   // �]���֐��ŗp�����̃��X�g
   Eval::EvalList evalList;
-#endif  // defined(EVAL_NNUE)
+#endif  // defined(EVAL_NNUE) || defined(EVAL_LEARN)
 };
 
 namespace PSQT {
diff --git a/src/types.h b/src/types.h
index 9b06ac0d..cf075bbe 100644
--- a/src/types.h
+++ b/src/types.h
@@ -466,7 +466,7 @@ constexpr Square Inv(Square sq) { return (Square)((SQUARE_NB - 1) - sq); }
 // �Ֆʂ��~���[�����Ƃ��̏��ڂ�Ԃ�
 constexpr Square Mir(Square sq) { return make_square(File(7 - (int)file_of(sq)), rank_of(sq)); }
 
-#if defined(EVAL_NNUE)
+#if defined(EVAL_NNUE) || defined(EVAL_LEARN)
 // --------------------
 //        �
 // --------------------
@@ -496,6 +496,6 @@ inline PieceNumber& operator--(PieceNumber& d) { return d = PieceNumber(int8_t(d
 
 // PieceNumber�̐������̌����Bassert�p�B
 constexpr bool is_ok(PieceNumber pn) { return pn < PIECE_NUMBER_NB; }
-#endif  // defined(EVAL_NNUE)
+#endif  // defined(EVAL_NNUE) || defined(EVAL_LEARN)
 
 #endif // #ifndef TYPES_H_INCLUDED

From 24576d77ab78a99f18a8cb6b797c93289f26b58b Mon Sep 17 00:00:00 2001
From: Hisayori Noda <nodchip@gmail.com>
Date: Tue, 18 Jun 2019 21:19:51 +0900
Subject: [PATCH 008/583] Merged uci parse logic.

---
 src/evaluate.cpp |  2 +-
 src/uci.cpp      | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index dee72d64..1d444819 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -989,6 +989,6 @@ void update_weights(uint64_t epoch, const std::array<bool, 4> & freeze) {}
 void init_grad(double eta1, uint64_t eta_epoch, double eta2, uint64_t eta2_epoch, double eta3) {}
 void add_grad(Position& pos, Color rootColor, double delt_grad, const std::array<bool, 4> & freeze) {}
 void save_eval(std::string suffix) {}
-double get_eta() {}
+double get_eta() { return 0.0; }
 }
 #endif  // defined(EVAL_NNUE)
diff --git a/src/uci.cpp b/src/uci.cpp
index 9a2f90ec..68bb850b 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -40,6 +40,30 @@ extern vector<string> setup_bench(const Position&, istream&);
 // FEN string of the initial position, normal chess
 const char* StartFEN = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1";
 
+// ������������������R�}���h
+#if defined (EVAL_LEARN)
+namespace Learner
+{
+  // ���t�ǖʂ̎�������
+  void gen_sfen(Position& pos, istringstream& is);
+
+  // ����������������̊w�K
+  void learn(Position& pos, istringstream& is);
+
+#if defined(GENSFEN2019)
+  // �J�����̋��t�ǖʂ̎��������R�}���h
+  void gen_sfen2019(Position& pos, istringstream& is);
+#endif
+
+  // �ǂ݋؂ƕ]���l�̃y�A�BLearner::search(),Learner::qsearch()���Ԃ��B
+  typedef std::pair<Value, std::vector<Move> > ValueAndPV;
+
+  ValueAndPV qsearch(Position& pos);
+  ValueAndPV search(Position& pos, int depth_, size_t multiPV = 1, uint64_t nodesLimit = 0);
+
+}
+#endif
+
 namespace {
   // position() is called when engine receives the "position" UCI command.
   // The function sets up the position described in the given FEN string ("fen")
@@ -301,6 +325,16 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "bench") bench(pos, is, states);
       else if (token == "d")     sync_cout << pos << sync_endl;
       else if (token == "eval")  sync_cout << Eval::trace(pos) << sync_endl;
+#if defined (EVAL_LEARN)
+      else if (token == "gensfen") Learner::gen_sfen(pos, is);
+      else if (token == "learn") Learner::learn(pos, is);
+
+#if defined (GENSFEN2019)
+      // �J�����̋��t�ǖʐ����R�}���h
+      else if (token == "gensfen2019") Learner::gen_sfen2019(pos, is);
+#endif
+
+#endif
       else
           sync_cout << "Unknown command: " << cmd << sync_endl;
 

From 90ef97dcbd204d621eaa79479aecd66a6392962d Mon Sep 17 00:00:00 2001
From: Hisayori Noda <nodchip@gmail.com>
Date: Thu, 20 Jun 2019 00:25:40 +0900
Subject: [PATCH 009/583] Fixed crash bugs.

---
 src/evaluate.cpp          | 11 ++++++++++
 src/evaluate.h            |  2 +-
 src/extra/sfen_packer.cpp | 37 ++++++++++++++++----------------
 src/learn/learner.cpp     | 10 +++++++++
 src/position.cpp          | 32 +++++++++++++++-------------
 src/search.cpp            | 45 ++++++++++++++++++++++++++++++++-------
 src/search.h              |  4 ++++
 src/ucioption.cpp         |  9 ++++++++
 8 files changed, 108 insertions(+), 42 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 1d444819..350f905b 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -22,6 +22,7 @@
 #include <cassert>
 #include <cstring>   // For std::memset
 #include <iomanip>
+#include <set>
 #include <sstream>
 
 #include "bitboard.h"
@@ -938,6 +939,16 @@ ExtBonaPiece kpp_board_index[PIECE_NB] = {
 // �� : �f�o�b�O�p�B�x���B
 bool EvalList::is_valid(const Position& pos)
 {
+  std::set<PieceNumber> piece_numbers;
+  for (Square sq = SQ_A1; sq != SQUARE_NB; ++sq) {
+    auto piece_number = piece_no_of_board(sq);
+    if (piece_number == PIECE_NUMBER_NB) {
+      continue;
+    }
+    assert(!piece_numbers.count(piece_number));
+    piece_numbers.insert(piece_number);
+  }
+
   for (int i = 0; i < length(); ++i)
   {
     BonaPiece fw = pieceListFw[i];
diff --git a/src/evaluate.h b/src/evaluate.h
index 6115eeb4..47dfbd34 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -187,7 +187,7 @@ public:
 	// VPGATHERDD���g���s���A4�̔{���łȂ���΂Ȃ�Ȃ��B
 	// �܂��AKPPT�^�]���֐��Ȃǂ́A39,40�Ԗڂ̗v�f���[���ł��邱�Ƃ�O��Ƃ���
 	// �A�N�Z�X�����Ă���ӏ�������̂Œ��ӂ��邱�ƁB
-	static const int MAX_LENGTH = 40;
+	static const int MAX_LENGTH = 32;
 
   // �Տ�̋�ɑ΂��āA���̋�ԍ�(PieceNumber)��ێ����Ă���z��
   // �ʂ�SQUARE_NB�Ɉړ����Ă���Ƃ��p��+1�܂ŕێ����Ă������A
diff --git a/src/extra/sfen_packer.cpp b/src/extra/sfen_packer.cpp
index df095ce1..f7c1d238 100644
--- a/src/extra/sfen_packer.cpp
+++ b/src/extra/sfen_packer.cpp
@@ -126,11 +126,12 @@ struct HuffmanedPiece
 
 HuffmanedPiece huffman_table[] =
 {
-  {0b000,1}, // NO_PIECE
-  {0b001,3}, // PAWN
-  {0b011,3}, // KNIGHT
-  {0b101,3}, // BISHOP
-  {0b111,3}, // ROOK
+  {0b0000,1}, // NO_PIECE
+  {0b0001,4}, // PAWN
+  {0b0011,4}, // KNIGHT
+  {0b0101,4}, // BISHOP
+  {0b0111,4}, // ROOK
+  {0b1001,4}, // QUEEN
 };
 
 // sfenを圧縮/解凍するためのクラス
@@ -269,7 +270,8 @@ int Position::set_from_packed_sfen(const PackedSfen& sfen , StateInfo * si, Thre
 
 	std::memset(this, 0, sizeof(Position));
 	std::memset(si, 0, sizeof(StateInfo));
-	st = si;
+  std::fill_n(&pieceList[0][0], sizeof(pieceList) / sizeof(Square), SQ_NONE);
+  st = si;
 
 	// Active color
 	sideToMove = (Color)stream.read_one_bit();
@@ -279,13 +281,7 @@ int Position::set_from_packed_sfen(const PackedSfen& sfen , StateInfo * si, Thre
 
 	// PieceListを更新する上で、どの駒がどこにあるかを設定しなければならないが、
 	// それぞれの駒をどこまで使ったかのカウンター
-	PieceNumber piece_no_count[KING] = {
-    PIECE_NUMBER_ZERO,
-    PIECE_NUMBER_PAWN,
-    PIECE_NUMBER_KNIGHT,
-		PIECE_NUMBER_BISHOP,
-    PIECE_NUMBER_ROOK,
-  };
+  PieceNumber next_piece_number = PIECE_NUMBER_ZERO;
 
   pieceList[W_KING][0] = SQUARE_NB;
   pieceList[B_KING][0] = SQUARE_NB;
@@ -294,12 +290,12 @@ int Position::set_from_packed_sfen(const PackedSfen& sfen , StateInfo * si, Thre
 	if (mirror)
 	{
 		for (auto c : Colors)
-			board[Mir((Square)stream.read_n_bit(7))] = make_piece(c, KING);
+			board[Mir((Square)stream.read_n_bit(6))] = make_piece(c, KING);
 	}
 	else
 	{
 		for (auto c : Colors)
-			board[stream.read_n_bit(7)] = make_piece(c, KING);
+			board[stream.read_n_bit(6)] = make_piece(c, KING);
 	}
 
   // Piece placement
@@ -335,7 +331,7 @@ int Position::set_from_packed_sfen(const PackedSfen& sfen , StateInfo * si, Thre
       PieceNumber piece_no =
         (pc == B_KING) ? PIECE_NUMBER_BKING : // 先手玉
         (pc == W_KING) ? PIECE_NUMBER_WKING : // 後手玉
-        piece_no_count[type_of(pc)]++; // それ以外
+        next_piece_number++; // それ以外
 
       evalList.put_piece(piece_no, sq, pc); // sqの升にpcの駒を配置する
 
@@ -363,12 +359,12 @@ int Position::set_from_packed_sfen(const PackedSfen& sfen , StateInfo * si, Thre
   }
   if (stream.read_one_bit()) {
     Square rsq;
-    for (rsq = relative_square(BLACK, SQ_H1); piece_on(rsq) != W_ROOK; --rsq) {}
+    for (rsq = relative_square(BLACK, SQ_H1); piece_on(rsq) != B_ROOK; --rsq) {}
     set_castling_right(BLACK, rsq);
   }
   if (stream.read_one_bit()) {
     Square rsq;
-    for (rsq = relative_square(BLACK, SQ_A1); piece_on(rsq) != W_ROOK; ++rsq) {}
+    for (rsq = relative_square(BLACK, SQ_A1); piece_on(rsq) != B_ROOK; ++rsq) {}
     set_castling_right(BLACK, rsq);
   }
 
@@ -381,6 +377,9 @@ int Position::set_from_packed_sfen(const PackedSfen& sfen , StateInfo * si, Thre
       || !(pieces(~sideToMove, PAWN) & (st->epSquare + pawn_push(~sideToMove))))
       st->epSquare = SQ_NONE;
   }
+  else {
+    st->epSquare = SQ_NONE;
+  }
 
   // Halfmove clock
   st->rule50 = static_cast<Square>(stream.read_n_bit(6));
@@ -397,6 +396,8 @@ int Position::set_from_packed_sfen(const PackedSfen& sfen , StateInfo * si, Thre
   thisThread = th;
 	set_state(st);
 
+  //std::cout << *this << std::endl;
+
   assert(pos_is_ok());
 #if defined(EVAL_NNUE)
   assert(evalList.is_valid(*this));
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 0e904650..8babbee2 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -392,6 +392,16 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 		auto& pos = th->rootPos;
     pos.set(StartFEN, false, &si, th);
 
+    // Test cod for Packed SFEN.
+    //{
+    //  PackedSfen packed_sfen;
+    //  pos.sfen_pack(packed_sfen);
+    //  std::cout << pos << std::endl;
+    //  pos.set_from_packed_sfen(packed_sfen, &si, th);
+    //  std::string actual = pos.fen();
+    //  assert(actual == StartFEN);
+    //}
+
 		// 探索部で定義されているBookMoveSelectorのメンバを参照する。
 		//auto& book = ::book;
 
diff --git a/src/position.cpp b/src/position.cpp
index a3f05a87..b7d5096f 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -249,14 +249,7 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th
 
   // PieceList���X�V�����ŁA�ǂ̋�ǂ��ɂ��邩��ݒ肵�Ȃ���΂Ȃ�Ȃ����A
   // ���ꂼ��̋���ǂ��܂Ŏg�������̃J�E���^�[
-  PieceNumber piece_no_count[KING] = {
-    PIECE_NUMBER_ZERO,
-    PIECE_NUMBER_PAWN,
-    PIECE_NUMBER_KNIGHT,
-    PIECE_NUMBER_BISHOP,
-    PIECE_NUMBER_ROOK,
-    PIECE_NUMBER_QUEEN
-  };
+  PieceNumber next_piece_number = PIECE_NUMBER_ZERO;
 #endif  // defined(EVAL_NNUE)
 
   ss >> std::noskipws;
@@ -279,7 +272,7 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th
           PieceNumber piece_no =
             (idx == W_KING) ? PIECE_NUMBER_WKING : // ����
             (idx == B_KING) ? PIECE_NUMBER_BKING : // ����
-            piece_no_count[type_of(Piece(idx))]++; // ����ȊO
+            next_piece_number++; // ����ȊO
           evalList.put_piece(piece_no, sq, pc); // sq�̏���pc�̋��z�u����
 #endif  // defined(EVAL_NNUE)
 
@@ -780,6 +773,9 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
   Piece pc = piece_on(from);
   Piece captured = type_of(m) == ENPASSANT ? make_piece(them, PAWN) : piece_on(to);
 
+  PieceNumber piece_no0 = PIECE_NUMBER_NB;
+  PieceNumber piece_no1 = PIECE_NUMBER_NB;
+
   assert(color_of(pc) == us);
   assert(captured == NO_PIECE || color_of(captured) == (type_of(m) != CASTLING ? them : us));
   assert(type_of(captured) != KING);
@@ -805,10 +801,6 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
   {
       Square capsq = to;
 
-#if defined(EVAL_NNUE)
-      PieceNumber piece_no1;
-#endif  // defined(EVAL_NNUE)
-
       // If the captured piece is a pawn, update pawn hash key, otherwise
       // update non-pawn material.
       if (type_of(captured) == PAWN)
@@ -828,6 +820,7 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
 #endif  // defined(EVAL_NNUE)
 
               board[capsq] = NO_PIECE; // Not done by remove_piece()
+              evalList.piece_no_list_board[capsq] = PIECE_NUMBER_NB;
           }
           else {
 #if defined(EVAL_NNUE)
@@ -893,7 +886,7 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
   // Move the piece. The tricky Chess960 castling is handled earlier
   if (type_of(m) != CASTLING) {
 #if defined(EVAL_NNUE)
-    PieceNumber piece_no0 = piece_no_of(from);
+    piece_no0 = piece_no_of(from);
 #endif  // defined(EVAL_NNUE)
 
     move_piece(pc, from, to);
@@ -901,6 +894,7 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
 #if defined(EVAL_NNUE)
     dp.pieceNo[0] = piece_no0;
     dp.changed_piece[0].old_piece = evalList.bona_piece(piece_no0);
+    evalList.piece_no_list_board[from] = PIECE_NUMBER_NB;
     evalList.put_piece(piece_no0, to, pc);
     dp.changed_piece[0].new_piece = evalList.bona_piece(piece_no0);
 #endif  // defined(EVAL_NNUE)
@@ -928,9 +922,10 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
           put_piece(promotion, to);
 
 #if defined(EVAL_NNUE)
-          PieceNumber piece_no0 = piece_no_of(to);
+          piece_no0 = piece_no_of(to);
           dp.pieceNo[0] = piece_no0;
           dp.changed_piece[0].old_piece = evalList.bona_piece(piece_no0);
+          assert(evalList.piece_no_list_board[from] == PIECE_NUMBER_NB);
           evalList.put_piece(piece_no0, to, promotion);
           dp.changed_piece[0].new_piece = evalList.bona_piece(piece_no0);
 #endif  // defined(EVAL_NNUE)
@@ -985,6 +980,8 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
       }
   }
 
+  //std::cout << *this << std::endl;
+
   assert(pos_is_ok());
 #if defined(EVAL_NNUE)
   assert(evalList.is_valid(*this));
@@ -1038,6 +1035,7 @@ void Position::undo_move(Move m) {
 #if defined(EVAL_NNUE)
       PieceNumber piece_no0 = st->dirtyPiece.pieceNo[0];
       evalList.put_piece(piece_no0, from, pc);
+      evalList.piece_no_list_board[to] = PIECE_NUMBER_NB;
 #endif  // defined(EVAL_NNUE)
 
       if (st->capturedPiece)
@@ -1118,16 +1116,20 @@ void Position::do_castling(Color us, Square from, Square& to, Square& rfrom, Squ
   if (Do) {
     dp.pieceNo[0] = piece_no0;
     dp.changed_piece[0].old_piece = evalList.bona_piece(piece_no0);
+    evalList.piece_no_list_board[from] = PIECE_NUMBER_NB;
     evalList.put_piece(piece_no0, to, make_piece(us, KING));
     dp.changed_piece[0].new_piece = evalList.bona_piece(piece_no0);
 
     dp.pieceNo[1] = piece_no1;
     dp.changed_piece[1].old_piece = evalList.bona_piece(piece_no1);
+    evalList.piece_no_list_board[rfrom] = PIECE_NUMBER_NB;
     evalList.put_piece(piece_no1, rto, make_piece(us, ROOK));
     dp.changed_piece[1].new_piece = evalList.bona_piece(piece_no1);
   }
   else {
+    evalList.piece_no_list_board[to] = PIECE_NUMBER_NB;
     evalList.put_piece(piece_no0, from, make_piece(us, KING));
+    evalList.piece_no_list_board[rto] = PIECE_NUMBER_NB;
     evalList.put_piece(piece_no1, rfrom, make_piece(us, ROOK));
   }
 #endif  // defined(EVAL_NNUE)
diff --git a/src/search.cpp b/src/search.cpp
index 279c6d8a..df88564a 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -871,7 +871,7 @@ moves_loop: // When in check, search starts from here
 
       ss->moveCount = ++moveCount;
 
-      if (rootNode && thisThread == Threads.main() && Time.elapsed() > 3000)
+      if (rootNode && thisThread == Threads.main() && Time.elapsed() > 3000 && !Limits.silent)
           sync_cout << "info depth " << depth / ONE_PLY
                     << " currmove " << UCI::move(move, pos.is_chess960())
                     << " currmovenumber " << moveCount + thisThread->pvIdx << sync_endl;
@@ -1382,7 +1382,9 @@ moves_loop: // When in check, search starts from here
       ss->continuationHistory = &thisThread->continuationHistory[pos.moved_piece(move)][to_sq(move)];
 
       // Make and search the move
+      //std::cout << pos << std::endl;
       pos.do_move(move, st, givesCheck);
+      //std::cout << pos << std::endl;
       value = -qsearch<NT>(pos, ss+1, -beta, -alpha, depth - ONE_PLY);
       pos.undo_move(move);
 
@@ -1740,7 +1742,7 @@ namespace Learner
     // RootNode��ss->ply == 0�����̏����B
     // �[���N���A����̂ŁAss->ply == 0�ƂȂ�̂ő��v�c�B
 
-    memset(ss - 4, 0, 7 * sizeof(Stack));
+    std::memset(ss - 7, 0, 10 * sizeof(Stack));
 
     // Search::Limits�Ɋւ���
     // ���̃����o�[�ϐ���global�Ȃ̂ő��̃X���b�h�ɉe�����y�ڂ��̂ŋC�����邱�ƁB
@@ -1751,7 +1753,7 @@ namespace Learner
       limits.infinite = true;
 
       // PV��\�������Ǝז��Ȃ̂ŏ����Ă����B
-      //limits.silent = true;
+      limits.silent = true;
 
       // �����p����Ɗe�X���b�h��nodes��ώZ�������̂Ɣ�r����Ă��܂��B�䂦�Ɏg�p���Ȃ��B
       limits.nodes = 0;
@@ -1789,8 +1791,23 @@ namespace Learner
       // history�ނ�S���N���A����B���̏������͏������Ԃ������邵�A�T���̐��x�͂ނ��뉺����̂őP���͂悭�킩��Ȃ��B
       // th->clear();
 
-      for (int i = 4; i > 0; i--)
-        (ss - i)->continuationHistory = &th->continuationHistory[SQUARE_ZERO][NO_PIECE];
+      int ct = int(Options["Contempt"]) * PawnValueEg / 100; // From centipawns
+      Color us = pos.side_to_move();
+
+      // In analysis mode, adjust contempt in accordance with user preference
+      if (Limits.infinite || Options["UCI_AnalyseMode"])
+        ct = Options["Analysis Contempt"] == "Off" ? 0
+        : Options["Analysis Contempt"] == "Both" ? ct
+        : Options["Analysis Contempt"] == "White" && us == BLACK ? -ct
+        : Options["Analysis Contempt"] == "Black" && us == WHITE ? -ct
+        : ct;
+
+      // Evaluation score is from the white point of view
+      th->contempt = (us == WHITE ? make_score(ct, ct / 2)
+        : -make_score(ct, ct / 2));
+
+      for (int i = 7; i > 0; i--)
+        (ss - i)->continuationHistory = &th->continuationHistory[NO_PIECE][0]; // Use as sentinel
 
       // rootMoves�̐ݒ�
       auto& rootMoves = th->rootMoves;
@@ -1831,7 +1848,7 @@ namespace Learner
   // �����e��������̂ŁA���͈̔͂��w��ł���悤�ɂ���̂���߂邱�Ƃɂ����B
   ValueAndPV qsearch(Position& pos)
   {
-    Stack stack[MAX_PLY + 7], * ss = stack + 4;
+    Stack stack[MAX_PLY + 10], * ss = stack + 7;
     Move pv[MAX_PLY + 1];
     std::vector<Move> pvs;
 
@@ -1881,7 +1898,7 @@ namespace Learner
     if (depth == DEPTH_ZERO)
       return qsearch(pos);
 
-    Stack stack[MAX_PLY + 7], * ss = stack + 4;
+    Stack stack[MAX_PLY + 10], * ss = stack + 7;
     Move pv[MAX_PLY + 1];
 
     init_for_search(pos, ss);
@@ -1892,6 +1909,7 @@ namespace Learner
     auto th = pos.this_thread();
     auto& rootDepth = th->rootDepth;
     auto& pvIdx = th->pvIdx;
+    auto& pvLast = th->pvLast;
     auto& rootMoves = th->rootMoves;
     auto& completedDepth = th->completedDepth;
     auto& selDepth = th->selDepth;
@@ -1919,9 +1937,20 @@ namespace Learner
       for (RootMove& rm : rootMoves)
         rm.previousScore = rm.score;
 
-      // MultiPV
+      size_t pvFirst = 0;
+      pvLast = 0;
+
+      // MultiPV loop. We perform a full root search for each PV line
       for (pvIdx = 0; pvIdx < multiPV && !Threads.stop; ++pvIdx)
       {
+        if (pvIdx == pvLast)
+        {
+          pvFirst = pvLast;
+          for (pvLast++; pvLast < rootMoves.size(); pvLast++)
+            if (rootMoves[pvLast].tbRank != rootMoves[pvFirst].tbRank)
+              break;
+        }
+
         // ���ꂼ���depth��PV line�ɑ΂���USI info�ŏo�͂���selDepth
         selDepth = 0;
 
diff --git a/src/search.h b/src/search.h
index 92e124fc..7c6dcff7 100644
--- a/src/search.h
+++ b/src/search.h
@@ -86,6 +86,7 @@ struct LimitsType {
     time[WHITE] = time[BLACK] = inc[WHITE] = inc[BLACK] = npmsec = movetime = TimePoint(0);
     movestogo = depth = mate = perft = infinite = 0;
     nodes = 0;
+    silent = false;
   }
 
   bool use_time_management() const {
@@ -96,6 +97,9 @@ struct LimitsType {
   TimePoint time[COLOR_NB], inc[COLOR_NB], npmsec, movetime, startTime;
   int movestogo, depth, mate, perft, infinite;
   int64_t nodes;
+  // ��ʂɏo�͂��Ȃ��T�C�����g���[�h(�v���Z�X���ł̘A�����ȑΐ�̂Ƃ��p)
+  // ���̂Ƃ�PV���o�͂��Ȃ��B
+  bool silent;
 };
 
 extern LimitsType Limits;
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index e549c6e0..87dbfa82 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -87,6 +87,15 @@ void init(OptionsMap& o) {
   // �����ł��̉B���I�v�V������isready���̕]���֐��̓ǂݍ��݂�}�����āA
   // test evalconvert�R�}���h��@���B
   o["SkipLoadingEval"]       << Option(false);
+  // ��Ղ̎w���������ڂ܂ŗp���邩
+  o["BookMoves"] << Option(16, 0, 10000);
+
+#if defined(EVAL_LEARN)
+  // �]���֐��̊w�K���s�Ȃ��Ƃ��́A�]���֐��̕ۑ���̃t�H���_��ύX�ł���B
+  // �f�t�H���g�ł�evalsave�B���̃t�H���_�͎��O�ɗp�ӂ���Ă�����̂Ƃ���B
+  // ���̃t�H���_�z���Ƀt�H���_��"0/","1/",�c�̂悤�Ɏ����I�Ɍ@��A�����ɕ]���֐��t�@�C����ۑ�����B
+  o["EvalSaveDir"] << Option("evalsave");
+#endif
 }
 
 
From 9dab4660ce20e905f0f99e1058e38fe8614c71c6 Mon Sep 17 00:00:00 2001
From: Hisayori Noda <nodchip@gmail.com>
Date: Sat, 22 Jun 2019 00:36:42 +0900
Subject: [PATCH 010/583] Added source files.

---
 src/Makefile | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 2d6042e2..2a8565ff 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -38,7 +38,20 @@ PGOBENCH = ./$(EXE) bench
 ### Object files
 OBJS = benchmark.o bitbase.o bitboard.o endgame.o evaluate.o main.o \
 	material.o misc.o movegen.o movepick.o pawns.o position.o psqt.o \
-	search.o thread.o timeman.o tt.o uci.o ucioption.o syzygy/tbprobe.o
+	search.o thread.o timeman.o tt.o uci.o ucioption.o syzygy/tbprobe.o \
+	eval/evaluate_mir_inv_tools.o \
+	eval/nnue/evaluate_nnue.o \
+	eval/nnue/evaluate_nnue_learner.o \
+	eval/nnue/features/half_kp.o \
+	eval/nnue/features/half_relative_kp.o \
+	eval/nnue/features/k.o \
+	eval/nnue/features/p.o \
+	eval/nnue/nnue_test_command.o \
+	extra/sfen_packer.o \
+	learn/gensfen2019.o \
+	learn/learner.o \
+	learn/learning_tools.o \
+	learn/multi_think.o
 
 ### Establish the operating system name
 KERNEL = $(shell uname -s)

From 998d8721bdad03f790ff3ca08e968a1da5578752 Mon Sep 17 00:00:00 2001
From: Hisayori Noda <nodchip@gmail.com>
Date: Sat, 22 Jun 2019 00:37:10 +0900
Subject: [PATCH 011/583] Fixed a bug that White and Black are reversed.

---
 src/eval/nnue/evaluate_nnue_learner.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/eval/nnue/evaluate_nnue_learner.cpp b/src/eval/nnue/evaluate_nnue_learner.cpp
index 0e558f39..b474f2e2 100644
--- a/src/eval/nnue/evaluate_nnue_learner.cpp
+++ b/src/eval/nnue/evaluate_nnue_learner.cpp
@@ -134,7 +134,7 @@ void AddExample(Position& pos, Color rootColor,
   for (const auto trigger : kRefreshTriggers) {
     RawFeatures::AppendActiveIndices(pos, trigger, active_indices);
   }
-  if (pos.side_to_move() != BLACK) {
+  if (pos.side_to_move() != WHITE) {
     active_indices[0].swap(active_indices[1]);
   }
   for (const auto color : Colors) {

From 84a96a3d9c8a2d9a88270238e45b64294ea4cc10 Mon Sep 17 00:00:00 2001
From: Hisayori Noda <nodchip@gmail.com>
Date: Sat, 22 Jun 2019 00:37:31 +0900
Subject: [PATCH 012/583] Fixed a compilation error.

---
 src/eval/nnue/trainer/features/factorizer_half_kp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/eval/nnue/trainer/features/factorizer_half_kp.h b/src/eval/nnue/trainer/features/factorizer_half_kp.h
index 36f53edc..28c11074 100644
--- a/src/eval/nnue/trainer/features/factorizer_half_kp.h
+++ b/src/eval/nnue/trainer/features/factorizer_half_kp.h
@@ -85,7 +85,7 @@ class Factorizer<HalfKP<AssociatedKing>> {
       index_offset += SkipFeatures(kProperties[kFeaturesHalfRelativeKP]);
     }
 
-    ASSERT_LV5(index_offset == GetDimensions());
+    assert(index_offset == GetDimensions());
   }
 };
 

From 07dc336b0f12f12816435c47077268f54f430776 Mon Sep 17 00:00:00 2001
From: Hisayori Noda <nodchip@gmail.com>
Date: Sat, 22 Jun 2019 00:37:59 +0900
Subject: [PATCH 013/583] Added validation logic.

---
 src/evaluate.cpp | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 350f905b..bbc92248 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -988,6 +988,49 @@ bool EvalList::is_valid(const Position& pos)
   Found:;
   }
 
+  // Validate piece_no_list_board
+  for (auto sq = SQUARE_ZERO; sq < SQUARE_NB; ++sq) {
+    Piece expected_piece = pos.piece_on(sq);
+    PieceNumber piece_number = piece_no_list_board[sq];
+    if (piece_number == PIECE_NUMBER_NB) {
+      assert(expected_piece == NO_PIECE);
+      if (expected_piece != NO_PIECE) {
+        return false;
+      }
+      continue;
+    }
+
+    BonaPiece bona_piece_white = pieceListFw[piece_number];
+    Piece actual_piece;
+    for (actual_piece = NO_PIECE; actual_piece < PIECE_NB; ++actual_piece) {
+      if (kpp_board_index[actual_piece].fw == BONA_PIECE_ZERO) {
+        continue;
+      }
+
+      if (kpp_board_index[actual_piece].fw <= bona_piece_white
+        && bona_piece_white < kpp_board_index[actual_piece].fw + SQUARE_NB) {
+        break;
+      }
+    }
+
+    assert(actual_piece != PIECE_NB);
+    if (actual_piece == PIECE_NB) {
+      return false;
+    }
+
+    assert(actual_piece == expected_piece);
+    if (actual_piece != expected_piece) {
+      return false;
+    }
+
+    Square actual_square = static_cast<Square>(
+      bona_piece_white - kpp_board_index[actual_piece].fw);
+    assert(sq == actual_square);
+    if (sq != actual_square) {
+      return false;
+    }
+  }
+
   return true;
 }
 }

From 57ead90f18313949da17705a4b2a76c099c2bb55 Mon Sep 17 00:00:00 2001
From: Hisayori Noda <nodchip@gmail.com>
Date: Sat, 22 Jun 2019 00:38:24 +0900
Subject: [PATCH 014/583] Fixed a bug that the game play is invalid.

---
 src/extra/sfen_packer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/extra/sfen_packer.cpp b/src/extra/sfen_packer.cpp
index f7c1d238..4d861a58 100644
--- a/src/extra/sfen_packer.cpp
+++ b/src/extra/sfen_packer.cpp
@@ -197,7 +197,7 @@ struct SfenPacker
 
     stream.write_n_bit(pos.state()->rule50, 6);
 
-    stream.write_n_bit(pos.game_ply(), 8);
+    stream.write_n_bit(1 + (pos.game_ply() - (pos.side_to_move() == BLACK)) / 2, 8);
 
     assert(stream.get_cursor() <= 256);
   }

From 641724e3a50a34aff036bad1090840c4e362123b Mon Sep 17 00:00:00 2001
From: Hisayori Noda <nodchip@gmail.com>
Date: Sat, 22 Jun 2019 00:39:21 +0900
Subject: [PATCH 015/583] Added debug code.

---
 src/learn/learner.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 8babbee2..34ea38c6 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -687,6 +687,13 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 					// 最終的な書き出しは、勝敗がついてから。
 					pos.sfen_pack(psv.sfen);
 
+          //{
+          //  std::string before_fen = pos.fen();
+          //  pos.set_from_packed_sfen(psv.sfen, &si, th);
+          //  std::string after_fen = pos.fen();
+          //  assert(before_fen == after_fen);
+          //}
+
 					// PV lineのleaf nodeでのroot colorから見たevaluate()の値を取得。
 					// search()の返し値をそのまま使うのとこうするのとの善悪は良くわからない。
 					psv.score = evaluate_leaf(pos, pv1);

From 5772509e8be630805f75b9bec64a81eff353901b Mon Sep 17 00:00:00 2001
From: Hisayori Noda <nodchip@gmail.com>
Date: Sat, 22 Jun 2019 00:40:25 +0900
Subject: [PATCH 016/583] Disabled TT when EVAL_LEARN is enabled.

---
 src/tt.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/tt.cpp b/src/tt.cpp
index b8fe7567..0b2bf9e9 100644
--- a/src/tt.cpp
+++ b/src/tt.cpp
@@ -118,6 +118,9 @@ void TranspositionTable::clear() {
 /// TTEntry t2 if its replace value is greater than that of t2.
 
 TTEntry* TranspositionTable::probe(const Key key, bool& found) const {
+#if defined(EVAL_LEARN)
+  return found = false, first_entry(0);
+#else
 
   TTEntry* const tte = first_entry(key);
   const uint16_t key16 = key >> 48;  // Use the high 16 bits as key inside the cluster
@@ -142,6 +145,7 @@ TTEntry* TranspositionTable::probe(const Key key, bool& found) const {
           replace = &tte[i];
 
   return found = false, replace;
+#endif
 }
 
 
From 9a73df73797e24a0243ddaf2126757684e0055df Mon Sep 17 00:00:00 2001
From: Hisayori Noda <nodchip@gmail.com>
Date: Sat, 22 Jun 2019 00:40:46 +0900
Subject: [PATCH 017/583] Added test commands.

---
 src/uci.cpp | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/src/uci.cpp b/src/uci.cpp
index 68bb850b..19af09a0 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -271,6 +271,44 @@ void is_ready(bool skipCorruptCheck)
 }
 
 
+// --------------------
+// �e�X�g�p��qsearch(),search()�𒼐ڌĂ�
+// --------------------
+
+#if defined(EVAL_LEARN)
+void qsearch_cmd(Position& pos)
+{
+  cout << "qsearch : ";
+  auto pv = Learner::qsearch(pos);
+  cout << "Value = " << pv.first << " , " << UCI::value(pv.first) << " , PV = ";
+  for (auto m : pv.second)
+    cout << UCI::move(m, false) << " ";
+  cout << endl;
+}
+
+void search_cmd(Position& pos, istringstream& is)
+{
+  string token;
+  int depth = 1;
+  int multi_pv = (int)Options["MultiPV"];
+  while (is >> token)
+  {
+    if (token == "depth")
+      is >> depth;
+    if (token == "multipv")
+      is >> multi_pv;
+  }
+
+  cout << "search depth = " << depth << " , multi_pv = " << multi_pv << " : ";
+  auto pv = Learner::search(pos, depth, multi_pv);
+  cout << "Value = " << pv.first << " , " << UCI::value(pv.first) << " , PV = ";
+  for (auto m : pv.second)
+    cout << UCI::move(m, false) << " ";
+  cout << endl;
+}
+
+#endif
+
 /// UCI::loop() waits for a command from stdin, parses it and calls the appropriate
 /// function. Also intercepts EOF from stdin to ensure gracefully exiting if the
 /// GUI dies unexpectedly. When called with some command line arguments, e.g. to
@@ -333,6 +371,9 @@ void UCI::loop(int argc, char* argv[]) {
       // �J�����̋��t�ǖʐ����R�}���h
       else if (token == "gensfen2019") Learner::gen_sfen2019(pos, is);
 #endif
+      // �e�X�g�p��qsearch(),search()�𒼐ڌĂԃR�}���h
+      else if (token == "qsearch") qsearch_cmd(pos);
+      else if (token == "search") search_cmd(pos, is);
 
 #endif
       else

From a413bf7aad25a67c515ac44db718514cd0dc679b Mon Sep 17 00:00:00 2001
From: Hisayori Noda <nodchip@gmail.com>
Date: Mon, 24 Jun 2019 23:17:46 +0900
Subject: [PATCH 018/583] Added hack to avoid crash during machine learning.

---
 src/learn/learner.cpp | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 34ea38c6..a0a3833a 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -2056,13 +2056,17 @@ void LearnerThink::thread_worker(size_t thread_id)
 		};
 
 		StateInfo state[MAX_PLY]; // qsearchのPVがそんなに長くなることはありえない。
+		bool illegal_move = false;
 		for (auto m : pv)
 		{
 			// 非合法手はやってこないはずなのだが。
+			// An illegal move sometimes comes here...
 			if (!pos.pseudo_legal(m) || !pos.legal(m))
 			{
-				cout << pos << m << endl;
-				assert(false);
+				//cout << pos << m << endl;
+				//assert(false);
+				illegal_move = true;
+				break;
 			}
 
 			// 各PV上のnodeでも勾配を加算する場合の処理。
@@ -2076,6 +2080,11 @@ void LearnerThink::thread_worker(size_t thread_id)
 			Eval::evaluate_with_no_return(pos);
 		}
 
+		if (illegal_move) {
+			sync_cout << "An illical move was detected... Excluded the position from the learning data..." << sync_endl;
+			continue;
+		}
+
 		// PVの終端局面に達したので、ここで勾配を加算する。
 		pos_add_grad();
 

From 26271586cb2dcac34416033ed63ea0611731be0a Mon Sep 17 00:00:00 2001
From: Hisayori Noda <nodchip@gmail.com>
Date: Mon, 24 Jun 2019 23:18:17 +0900
Subject: [PATCH 019/583] Added #if to fix compile errors.

---
 src/position.cpp |  7 ++++---
 src/search.cpp   | 10 +++++++---
 src/tt.cpp       |  2 +-
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/src/position.cpp b/src/position.cpp
index b7d5096f..91f3ab8e 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -773,8 +773,10 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
   Piece pc = piece_on(from);
   Piece captured = type_of(m) == ENPASSANT ? make_piece(them, PAWN) : piece_on(to);
 
+#if defined(EVAL_NNUE)
   PieceNumber piece_no0 = PIECE_NUMBER_NB;
   PieceNumber piece_no1 = PIECE_NUMBER_NB;
+#endif  // defined(EVAL_NNUE)
 
   assert(color_of(pc) == us);
   assert(captured == NO_PIECE || color_of(captured) == (type_of(m) != CASTLING ? them : us));
@@ -820,7 +822,9 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
 #endif  // defined(EVAL_NNUE)
 
               board[capsq] = NO_PIECE; // Not done by remove_piece()
+#if defined(EVAL_NNUE)
               evalList.piece_no_list_board[capsq] = PIECE_NUMBER_NB;
+#endif  // defined(EVAL_NNUE)
           }
           else {
 #if defined(EVAL_NNUE)
@@ -1473,9 +1477,6 @@ bool Position::pos_is_ok() const {
 #if defined(EVAL_NNUE)
 PieceNumber Position::piece_no_of(Square sq) const
 {
-  if (piece_on(sq) == NO_PIECE) {
-    sync_cout << *this << sync_endl;
-  }
   assert(piece_on(sq) != NO_PIECE);
   PieceNumber n = evalList.piece_no_of_board(sq);
   assert(is_ok(n));
diff --git a/src/search.cpp b/src/search.cpp
index df88564a..b176e0b9 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1372,7 +1372,13 @@ moves_loop: // When in check, search starts from here
       prefetch(TT.first_entry(pos.key_after(move)));
 
       // Check for legality just before making the move
-      if (!pos.legal(move))
+      if (
+#if defined(EVAL_LEARN)
+        // HACK: pos.piece_on(from_sq(m)) sometimes will be NO_PIECE during machine learning.
+        !pos.pseudo_legal(move) ||
+#endif // EVAL_LEARN
+        !pos.legal(move)
+        )
       {
           moveCount--;
           continue;
@@ -1382,9 +1388,7 @@ moves_loop: // When in check, search starts from here
       ss->continuationHistory = &thisThread->continuationHistory[pos.moved_piece(move)][to_sq(move)];
 
       // Make and search the move
-      //std::cout << pos << std::endl;
       pos.do_move(move, st, givesCheck);
-      //std::cout << pos << std::endl;
       value = -qsearch<NT>(pos, ss+1, -beta, -alpha, depth - ONE_PLY);
       pos.undo_move(move);
 
diff --git a/src/tt.cpp b/src/tt.cpp
index 0b2bf9e9..21309fd6 100644
--- a/src/tt.cpp
+++ b/src/tt.cpp
@@ -118,7 +118,7 @@ void TranspositionTable::clear() {
 /// TTEntry t2 if its replace value is greater than that of t2.
 
 TTEntry* TranspositionTable::probe(const Key key, bool& found) const {
-#if defined(EVAL_LEARN)
+#if defined(DISABLE_TT)
   return found = false, first_entry(0);
 #else
 

From 00f84ed99a08e1e27e35547a4cc32166719da694 Mon Sep 17 00:00:00 2001
From: Hisayori Noda <nodchip@gmail.com>
Date: Wed, 26 Jun 2019 08:48:48 +0900
Subject: [PATCH 020/583] Changed the constant value to calculate the winning
 percentage.

---
 src/learn/learner.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index a0a3833a..5c5d4036 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -1014,7 +1014,8 @@ double winning_percentage(double value)
 {
 	// この600.0という定数は、ponanza定数。(ponanzaがそうしているらしいという意味で)
 	// ゲームの進行度に合わせたものにしたほうがいいかも知れないけども、その効果のほどは不明。
-	return sigmoid(value / 600.0);
+  // Pawn Advantage, Win Percentage, and Elo - Chessprogramming wiki https://www.chessprogramming.org/Pawn_Advantage,_Win_Percentage,_and_Elo
+	return sigmoid(value * log(10.0) / 4.0 / PawnValueEg);
 }
 
 // 普通のシグモイド関数の導関数。

From 81262320c36cf47eb585ba50e8e421082c78d020 Mon Sep 17 00:00:00 2001
From: Hisayori Noda <nodchip@gmail.com>
Date: Sun, 30 Jun 2019 11:29:43 +0900
Subject: [PATCH 021/583] Revert "Changed the constant value to calculate the
 winning percentage."

This reverts commit 00f84ed99a08e1e27e35547a4cc32166719da694.
---
 src/learn/learner.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 5c5d4036..a0a3833a 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -1014,8 +1014,7 @@ double winning_percentage(double value)
 {
 	// この600.0という定数は、ponanza定数。(ponanzaがそうしているらしいという意味で)
 	// ゲームの進行度に合わせたものにしたほうがいいかも知れないけども、その効果のほどは不明。
-  // Pawn Advantage, Win Percentage, and Elo - Chessprogramming wiki https://www.chessprogramming.org/Pawn_Advantage,_Win_Percentage,_and_Elo
-	return sigmoid(value * log(10.0) / 4.0 / PawnValueEg);
+	return sigmoid(value / 600.0);
 }
 
 // 普通のシグモイド関数の導関数。

From 89e846c476bb94ca0f120a7a5aecc53c2863c713 Mon Sep 17 00:00:00 2001
From: Hisayori Noda <nodchip@gmail.com>
Date: Thu, 4 Jul 2019 23:44:58 +0900
Subject: [PATCH 022/583] Fixed a bug that Learner::qsearch() recognizes
 stalemate as checkmated.

---
 src/learn/learner.cpp | 19 +++++++++++--------
 src/position.cpp      |  9 ---------
 src/position.h        |  3 ---
 src/search.cpp        | 13 +++++++++----
 4 files changed, 20 insertions(+), 24 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index a0a3833a..c143e451 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -502,15 +502,17 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 				break;
 			}
 
+      if (pos.is_draw(ply)) {
+        // Do not write if draw.
+        break;
+      }
+
 			// 全駒されて詰んでいたりしないか？
-			if (pos.is_mated())
+			if (MoveList<LEGAL>(pos).size() == 0)
 			{
-        if (pos.checkers()) {
-          // (この局面の一つ前の局面までは書き出す)
-          // Write the packed fens if checkmate.
-          // Do not write if stalemate.
-          flush_psv(-1);
-        }
+        // (この局面の一つ前の局面までは書き出す)
+        // Write the positions other than this position if checkmated.
+        flush_psv(-1);
 				break;
 			}
 
@@ -1965,7 +1967,8 @@ void LearnerThink::thread_worker(size_t thread_id)
 		// 全駒されて詰んでいる可能性がある。
 		// また宣言勝ちの局面はPVの指し手でleafに行けないので学習から除外しておく。
 		// (そのような教師局面自体を書き出すべきではないのだが古い生成ルーチンで書き出しているかも知れないので)
-		if (pos.is_mated())
+    // Skip the position if there are no legal moves (=checkmated or stalemate).
+		if (MoveList<LEGAL>(pos).size() == 0)
 			goto RetryRead;
 
 		// 読み込めたので試しに表示してみる。
diff --git a/src/position.cpp b/src/position.cpp
index 91f3ab8e..fefeac92 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -1483,12 +1483,3 @@ PieceNumber Position::piece_no_of(Square sq) const
   return n;
 }
 #endif  // defined(EVAL_NNUE)
-
-#if defined(EVAL_LEARN)
-// ���ǖʂŎw���肪�Ȃ������e�X�g����B�w���萶�����[�`����p����̂ő����Ȃ��B�T�����ɂ͎g��Ȃ����ƁB
-bool Position::is_mated() const
-{
-  // �s���ŋl�߂������ł���p�^�[���͂Ȃ��̂�LEGAL_ALL�ł���K�v�͂Ȃ��B
-  return MoveList<LEGAL>(*this).size() == 0;
-}
-#endif // EVAL_LEARN
diff --git a/src/position.h b/src/position.h
index 547320ea..6efe37e7 100644
--- a/src/position.h
+++ b/src/position.h
@@ -192,9 +192,6 @@ public:
 #endif  // defined(EVAL_NNUE) || defined(EVAL_LEARN)
 
 #if defined(EVAL_LEARN)
-  // ���ǖʂŎw���肪�Ȃ������e�X�g����B�w���萶�����[�`����p����̂ő����Ȃ��B�T�����ɂ͎g��Ȃ����ƁB
-  bool is_mated() const;
-
   // -- sfen���w���p
 
   // pack���ꂽsfen�𓾂�B�����Ɏw�肵���o�b�t�@�ɕԂ��B
diff --git a/src/search.cpp b/src/search.cpp
index b176e0b9..fa423d6e 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1854,21 +1854,26 @@ namespace Learner
   {
     Stack stack[MAX_PLY + 10], * ss = stack + 7;
     Move pv[MAX_PLY + 1];
-    std::vector<Move> pvs;
 
     init_for_search(pos, ss);
     ss->pv = pv; // �Ƃ肠�����_�~�[�łǂ����o�b�t�@���Ȃ��Ƃ����Ȃ��B
 
+    if (pos.is_draw(0)) {
+      // Return draw value if draw.
+      return { VALUE_DRAW, {} };
+    }
+
     // �l�܂���Ă���̂�
-    if (pos.is_mated())
+    if (MoveList<LEGAL>(pos).size() == 0)
     {
-      pvs.push_back(MOVE_NONE);
-      return ValueAndPV(mated_in(/*ss->ply*/ 0 + 1), pvs);
+      // Return the mated value if checkmated.
+      return { mated_in(/*ss->ply*/ 0 + 1), {} };
     }
 
     auto bestValue = ::qsearch<PV>(pos, ss, -VALUE_INFINITE, VALUE_INFINITE, DEPTH_ZERO);
 
     // ����ꂽPV��Ԃ��B
+    std::vector<Move> pvs;
     for (Move* p = &ss->pv[0]; is_ok(*p); ++p)
       pvs.push_back(*p);
 

From 09e529edd37257d5d272bd924932d4aee767ba73 Mon Sep 17 00:00:00 2001
From: Hisayori Noda <nodchip@gmail.com>
Date: Fri, 5 Jul 2019 00:22:41 +0900
Subject: [PATCH 023/583] Added a hack to avoid crash with binaries compiled by
 g++ on MSYS2.

---
 src/eval/nnue/layers/affine_transform.h  | 10 ++++++-
 src/eval/nnue/layers/clipped_relu.h      | 38 ++++++++++++++++++++----
 src/eval/nnue/nnue_feature_transformer.h | 30 +++++++++++++++++--
 3 files changed, 69 insertions(+), 9 deletions(-)

diff --git a/src/eval/nnue/layers/affine_transform.h b/src/eval/nnue/layers/affine_transform.h
index 9b227270..d8101ba4 100644
--- a/src/eval/nnue/layers/affine_transform.h
+++ b/src/eval/nnue/layers/affine_transform.h
@@ -101,7 +101,15 @@ class AffineTransform {
       const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
       for (IndexType j = 0; j < kNumChunks; ++j) {
         __m256i product = _mm256_maddubs_epi16(
-            _mm256_load_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
+#if defined(__MINGW32__) || defined(__MINGW64__)
+          // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
+          //       compiled with g++ in MSYS2 crashes here because the output memory is not aligned
+          //       even though alignas is specified.
+          _mm256_loadu_si256
+#else
+          _mm256_load_si256
+#endif
+          (&input_vector[j]), _mm256_load_si256(&row[j]));
         product = _mm256_madd_epi16(product, kOnes);
         sum = _mm256_add_epi32(sum, product);
       }
diff --git a/src/eval/nnue/layers/clipped_relu.h b/src/eval/nnue/layers/clipped_relu.h
index f904de74..5877fc32 100644
--- a/src/eval/nnue/layers/clipped_relu.h
+++ b/src/eval/nnue/layers/clipped_relu.h
@@ -73,12 +73,40 @@ class ClippedReLU {
     const auto out = reinterpret_cast<__m256i*>(output);
     for (IndexType i = 0; i < kNumChunks; ++i) {
       const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
-          _mm256_load_si256(&in[i * 4 + 0]),
-          _mm256_load_si256(&in[i * 4 + 1])), kWeightScaleBits);
+#if defined(__MINGW32__) || defined(__MINGW64__)
+        // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
+        //       compiled with g++ in MSYS2 crashes here because the output memory is not aligned
+        //       even though alignas is specified.
+        _mm256_loadu_si256
+#else
+        _mm256_load_si256
+#endif
+        (&in[i * 4 + 0]),
+#if defined(__MINGW32__) || defined(__MINGW64__)
+        _mm256_loadu_si256
+#else
+        _mm256_load_si256
+#endif
+        (&in[i * 4 + 1])), kWeightScaleBits);
       const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
-          _mm256_load_si256(&in[i * 4 + 2]),
-          _mm256_load_si256(&in[i * 4 + 3])), kWeightScaleBits);
-      _mm256_store_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
+#if defined(__MINGW32__) || defined(__MINGW64__)
+        _mm256_loadu_si256
+#else
+        _mm256_load_si256
+#endif
+        (&in[i * 4 + 2]),
+#if defined(__MINGW32__) || defined(__MINGW64__)
+        _mm256_loadu_si256
+#else
+        _mm256_load_si256
+#endif
+        (&in[i * 4 + 3])), kWeightScaleBits);
+#if defined(__MINGW32__) || defined(__MINGW64__)
+      _mm256_storeu_si256
+#else
+      _mm256_store_si256
+#endif
+        (&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
           _mm256_packs_epi16(words0, words1), kZero), kOffsets));
     }
     constexpr IndexType kStart = kNumChunks * kSimdWidth;
diff --git a/src/eval/nnue/nnue_feature_transformer.h b/src/eval/nnue/nnue_feature_transformer.h
index f7c2080f..57d25310 100644
--- a/src/eval/nnue/nnue_feature_transformer.h
+++ b/src/eval/nnue/nnue_feature_transformer.h
@@ -100,9 +100,24 @@ class FeatureTransformer {
 #if defined(USE_AVX2)
       auto out = reinterpret_cast<__m256i*>(&output[offset]);
       for (IndexType j = 0; j < kNumChunks; ++j) {
-        __m256i sum0 = _mm256_load_si256(&reinterpret_cast<const __m256i*>(
+        __m256i sum0 =
+#if defined(__MINGW32__) || defined(__MINGW64__)
+          // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
+          //       compiled with g++ in MSYS2 crashes here because the output memory is not aligned
+          //       even though alignas is specified.
+          _mm256_loadu_si256
+#else
+          _mm256_load_si256
+#endif
+          (&reinterpret_cast<const __m256i*>(
             accumulation[perspectives[p]][0])[j * 2 + 0]);
-        __m256i sum1 = _mm256_load_si256(&reinterpret_cast<const __m256i*>(
+        __m256i sum1 =
+#if defined(__MINGW32__) || defined(__MINGW64__)
+          _mm256_loadu_si256
+#else
+          _mm256_load_si256
+#endif
+          (&reinterpret_cast<const __m256i*>(
             accumulation[perspectives[p]][0])[j * 2 + 1]);
         for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
           sum0 = _mm256_add_epi16(sum0, reinterpret_cast<const __m256i*>(
@@ -110,7 +125,12 @@ class FeatureTransformer {
           sum1 = _mm256_add_epi16(sum1, reinterpret_cast<const __m256i*>(
               accumulation[perspectives[p]][i])[j * 2 + 1]);
         }
-        _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
+#if defined(__MINGW32__) || defined(__MINGW64__)
+        _mm256_storeu_si256
+#else
+        _mm256_store_si256
+#endif
+        (&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
             _mm256_packs_epi16(sum0, sum1), kZero), kControl));
       }
 #elif defined(USE_SSE41)
@@ -177,7 +197,11 @@ class FeatureTransformer {
           auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
           constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
           for (IndexType j = 0; j < kNumChunks; ++j) {
+#if defined(__MINGW32__) || defined(__MINGW64__)
+            _mm256_storeu_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadu_si256(&accumulation[j]), column[j]));
+#else
             accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
+#endif
           }
 #elif defined(USE_SSE2)
           auto accumulation = reinterpret_cast<__m128i*>(

From 5c0037de7fe0bc7efece17bf0573412c079bf909 Mon Sep 17 00:00:00 2001
From: Hisayori Noda <nodchip@gmail.com>
Date: Sun, 7 Jul 2019 17:02:34 +0900
Subject: [PATCH 024/583] Added the castling right feature.  Added
 k-p-cr_256x2-32-32 architecture.

---
 .../nnue/architectures/k-p-cr_256x2-32-32.h   | 37 ++++++++++
 src/eval/nnue/features/castling_right.cpp     | 73 +++++++++++++++++++
 src/eval/nnue/features/castling_right.h       | 48 ++++++++++++
 src/eval/nnue/nnue_architecture.h             | 13 +---
 4 files changed, 161 insertions(+), 10 deletions(-)
 create mode 100644 src/eval/nnue/architectures/k-p-cr_256x2-32-32.h
 create mode 100644 src/eval/nnue/features/castling_right.cpp
 create mode 100644 src/eval/nnue/features/castling_right.h

diff --git a/src/eval/nnue/architectures/k-p-cr_256x2-32-32.h b/src/eval/nnue/architectures/k-p-cr_256x2-32-32.h
new file mode 100644
index 00000000..9ce7ecf1
--- /dev/null
+++ b/src/eval/nnue/architectures/k-p-cr_256x2-32-32.h
@@ -0,0 +1,37 @@
+// NNUE�]���֐��ŗp������͓����ʂƃl�b�g���[�N�\���̒�`
+
+#include "../features/feature_set.h"
+#include "../features/k.h"
+#include "../features/p.h"
+#include "../features/castling_right.h"
+
+#include "../layers/input_slice.h"
+#include "../layers/affine_transform.h"
+#include "../layers/clipped_relu.h"
+
+namespace Eval {
+
+  namespace NNUE {
+
+    // �]���֐��ŗp������͓�����
+    using RawFeatures = Features::FeatureSet<Features::K, Features::P,
+      Features::CastlingRight>;
+
+    // �ϊ���̓��͓����ʂ̎�����
+    constexpr IndexType kTransformedFeatureDimensions = 256;
+
+    namespace Layers {
+
+      // �l�b�g���[�N�\���̒�`
+      using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+      using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+      using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+      using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+
+    }  // namespace Layers
+
+    using Network = Layers::OutputLayer;
+
+  }  // namespace NNUE
+
+}  // namespace Eval
diff --git a/src/eval/nnue/features/castling_right.cpp b/src/eval/nnue/features/castling_right.cpp
new file mode 100644
index 00000000..30e46e23
--- /dev/null
+++ b/src/eval/nnue/features/castling_right.cpp
@@ -0,0 +1,73 @@
+// NNUE評価関数の入力特徴量Kの定義
+
+#if defined(EVAL_NNUE)
+
+#include "castling_right.h"
+#include "index_list.h"
+
+namespace Eval {
+
+  namespace NNUE {
+
+    namespace Features {
+
+      // 特徴量のうち、値が1であるインデックスのリストを取得する
+      void CastlingRight::AppendActiveIndices(
+        const Position& pos, Color perspective, IndexList* active) {
+        // コンパイラの警告を回避するため、配列サイズが小さい場合は何もしない
+        if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
+
+        int castling_rights = pos.state()->castlingRights;
+        int relative_castling_rights;
+        if (perspective == WHITE) {
+          relative_castling_rights = castling_rights;
+        }
+        else {
+          // Invert the perspective.
+          relative_castling_rights = ((castling_rights & 3) << 2)
+            & ((castling_rights >> 2) & 3);
+        }
+
+        for (int i = 0; i < kDimensions; ++i) {
+          if (relative_castling_rights & (i << 1)) {
+            active->push_back(i);
+          }
+        }
+      }
+
+      // 特徴量のうち、一手前から値が変化したインデックスのリストを取得する
+      void CastlingRight::AppendChangedIndices(
+        const Position& pos, Color perspective,
+        IndexList* removed, IndexList* added) {
+
+        int previous_castling_rights = pos.state()->previous->castlingRights;
+        int current_castling_rights = pos.state()->castlingRights;
+        int relative_previous_castling_rights;
+        int relative_current_castling_rights;
+        if (perspective == WHITE) {
+          relative_previous_castling_rights = previous_castling_rights;
+          relative_current_castling_rights = current_castling_rights;
+        }
+        else {
+          // Invert the perspective.
+          relative_previous_castling_rights = ((previous_castling_rights & 3) << 2)
+            & ((previous_castling_rights >> 2) & 3);
+          relative_current_castling_rights = ((current_castling_rights & 3) << 2)
+            & ((current_castling_rights >> 2) & 3);
+        }
+
+        for (int i = 0; i < kDimensions; ++i) {
+          if ((relative_previous_castling_rights & (i << 1)) &&
+            (relative_current_castling_rights & (i << 1)) == 0) {
+            removed->push_back(i);
+          }
+        }
+      }
+
+    }  // namespace Features
+
+  }  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
diff --git a/src/eval/nnue/features/castling_right.h b/src/eval/nnue/features/castling_right.h
new file mode 100644
index 00000000..1384865f
--- /dev/null
+++ b/src/eval/nnue/features/castling_right.h
@@ -0,0 +1,48 @@
+// NNUE�]���֐��̓��͓�����K�̒�`
+
+#ifndef _NNUE_FEATURES_CASTLING_RIGHT_H_
+#define _NNUE_FEATURES_CASTLING_RIGHT_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../../../evaluate.h"
+#include "features_common.h"
+
+namespace Eval {
+
+  namespace NNUE {
+
+    namespace Features {
+
+      // ������K�F�ʂ̈ʒu
+      class CastlingRight {
+      public:
+        // �����ʖ�
+        static constexpr const char* kName = "CastlingRight";
+        // �]���֐��t�@�C���ɖ��ߍ��ރn�b�V���l
+        static constexpr std::uint32_t kHashValue = 0x913968AAu;
+        // �����ʂ̎�����
+        static constexpr IndexType kDimensions = 4;
+        // �����ʂ̂����A�����ɒl��1�ƂȂ�C���f�b�N�X�̐��̍ő�l
+        static constexpr IndexType kMaxActiveDimensions = 4;
+        // �����v�Z�̑���ɑS�v�Z���s���^�C�~���O
+        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+
+        // �����ʂ̂����A�l��1�ł���C���f�b�N�X�̃��X�g���擾����
+        static void AppendActiveIndices(const Position& pos, Color perspective,
+          IndexList* active);
+
+        // �����ʂ̂����A���O����l���ω������C���f�b�N�X�̃��X�g���擾����
+        static void AppendChangedIndices(const Position& pos, Color perspective,
+          IndexList* removed, IndexList* added);
+      };
+
+    }  // namespace Features
+
+  }  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
diff --git a/src/eval/nnue/nnue_architecture.h b/src/eval/nnue/nnue_architecture.h
index 5f11a02b..3170cdab 100644
--- a/src/eval/nnue/nnue_architecture.h
+++ b/src/eval/nnue/nnue_architecture.h
@@ -6,16 +6,9 @@
 #if defined(EVAL_NNUE)
 
 // 入力特徴量とネットワーク構造が定義されたヘッダをincludeする
-
-// KP256型を使いたいときは、これを事前にdefineする。
-#define EVAL_NNUE_KP256
-#if defined(EVAL_NNUE_KP256)
-#include "architectures/k-p_256x2-32-32.h"
-#else // #if defined(EVAL_NNUE_HALFKP256)
-
-// NNUE評価関数のデフォルトは、halfKP256
-#include "architectures/halfkp_256x2-32-32.h"
-#endif
+//#include "architectures/k-p_256x2-32-32.h"
+#include "architectures/k-p-cr_256x2-32-32.h"
+//#include "architectures/halfkp_256x2-32-32.h"
 
 namespace Eval {
 

From 92052bc16b3f15a29fdfcc43f1a042fe1e6b2b6b Mon Sep 17 00:00:00 2001
From: Hisayori Noda <nodchip@gmail.com>
Date: Sun, 7 Jul 2019 17:22:07 +0900
Subject: [PATCH 025/583] Fixed build errors.

---
 src/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/Makefile b/src/Makefile
index 2a8565ff..27969174 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -46,6 +46,7 @@ OBJS = benchmark.o bitbase.o bitboard.o endgame.o evaluate.o main.o \
 	eval/nnue/features/half_relative_kp.o \
 	eval/nnue/features/k.o \
 	eval/nnue/features/p.o \
+	eval/nnue/features/castling_right.o \
 	eval/nnue/nnue_test_command.o \
 	extra/sfen_packer.o \
 	learn/gensfen2019.o \

From df827ea7ee477afabc03101163322727768b3505 Mon Sep 17 00:00:00 2001
From: Hisayori Noda <nodchip@gmail.com>
Date: Sun, 7 Jul 2019 19:24:46 +0900
Subject: [PATCH 026/583] Added enpassant feature.  Added k-p-cr-ep_256x2-32-32
 architecture.

---
 src/Makefile                                  |  1 +
 .../architectures/k-p-cr-ep_256x2-32-32.h     | 38 +++++++++++++++
 src/eval/nnue/features/enpassant.cpp          | 47 ++++++++++++++++++
 src/eval/nnue/features/enpassant.h            | 48 +++++++++++++++++++
 src/eval/nnue/nnue_architecture.h             |  3 +-
 5 files changed, 136 insertions(+), 1 deletion(-)
 create mode 100644 src/eval/nnue/architectures/k-p-cr-ep_256x2-32-32.h
 create mode 100644 src/eval/nnue/features/enpassant.cpp
 create mode 100644 src/eval/nnue/features/enpassant.h

diff --git a/src/Makefile b/src/Makefile
index 27969174..a48d5674 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -47,6 +47,7 @@ OBJS = benchmark.o bitbase.o bitboard.o endgame.o evaluate.o main.o \
 	eval/nnue/features/k.o \
 	eval/nnue/features/p.o \
 	eval/nnue/features/castling_right.o \
+	eval/nnue/features/enpassant.o \
 	eval/nnue/nnue_test_command.o \
 	extra/sfen_packer.o \
 	learn/gensfen2019.o \
diff --git a/src/eval/nnue/architectures/k-p-cr-ep_256x2-32-32.h b/src/eval/nnue/architectures/k-p-cr-ep_256x2-32-32.h
new file mode 100644
index 00000000..17871169
--- /dev/null
+++ b/src/eval/nnue/architectures/k-p-cr-ep_256x2-32-32.h
@@ -0,0 +1,38 @@
+// NNUE�]���֐��ŗp������͓����ʂƃl�b�g���[�N�\���̒�`
+
+#include "../features/feature_set.h"
+#include "../features/k.h"
+#include "../features/p.h"
+#include "../features/castling_right.h"
+#include "../features/enpassant.h"
+
+#include "../layers/input_slice.h"
+#include "../layers/affine_transform.h"
+#include "../layers/clipped_relu.h"
+
+namespace Eval {
+
+  namespace NNUE {
+
+    // �]���֐��ŗp������͓�����
+    using RawFeatures = Features::FeatureSet<Features::K, Features::P,
+      Features::CastlingRight, Features::EnPassant>;
+
+    // �ϊ���̓��͓����ʂ̎�����
+    constexpr IndexType kTransformedFeatureDimensions = 256;
+
+    namespace Layers {
+
+      // �l�b�g���[�N�\���̒�`
+      using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+      using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+      using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+      using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+
+    }  // namespace Layers
+
+    using Network = Layers::OutputLayer;
+
+  }  // namespace NNUE
+
+}  // namespace Eval
diff --git a/src/eval/nnue/features/enpassant.cpp b/src/eval/nnue/features/enpassant.cpp
new file mode 100644
index 00000000..523fd966
--- /dev/null
+++ b/src/eval/nnue/features/enpassant.cpp
@@ -0,0 +1,47 @@
+// NNUE�]���֐��̓��͓�����K�̒�`
+
+#if defined(EVAL_NNUE)
+
+#include "enpassant.h"
+#include "index_list.h"
+
+namespace Eval {
+
+  namespace NNUE {
+
+    namespace Features {
+
+      // �����ʂ̂����A�l��1�ł���C���f�b�N�X�̃��X�g���擾����
+      void EnPassant::AppendActiveIndices(
+        const Position& pos, Color perspective, IndexList* active) {
+        // �R���p�C���̌x����������邽�߁A�z��T�C�Y���������ꍇ�͉������Ȃ�
+        if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
+
+        auto epSquare = pos.state()->epSquare;
+        if (epSquare == SQ_NONE) {
+          return;
+        }
+
+        if (perspective == BLACK) {
+          epSquare = Inv(epSquare);
+        }
+
+        auto file = file_of(epSquare);
+        active->push_back(file);
+      }
+
+      // �����ʂ̂����A���O����l���ω������C���f�b�N�X�̃��X�g���擾����
+      void EnPassant::AppendChangedIndices(
+        const Position& pos, Color perspective,
+        IndexList* removed, IndexList* added) {
+        // Not implemented.
+        assert(false);
+      }
+
+    }  // namespace Features
+
+  }  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
diff --git a/src/eval/nnue/features/enpassant.h b/src/eval/nnue/features/enpassant.h
new file mode 100644
index 00000000..fe827584
--- /dev/null
+++ b/src/eval/nnue/features/enpassant.h
@@ -0,0 +1,48 @@
+// NNUE�]���֐��̓��͓�����K�̒�`
+
+#ifndef _NNUE_FEATURES_ENPASSANT_H_
+#define _NNUE_FEATURES_ENPASSANT_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../../../evaluate.h"
+#include "features_common.h"
+
+namespace Eval {
+
+  namespace NNUE {
+
+    namespace Features {
+
+      // ������K�F�ʂ̈ʒu
+      class EnPassant {
+      public:
+        // �����ʖ�
+        static constexpr const char* kName = "EnPassant";
+        // �]���֐��t�@�C���ɖ��ߍ��ރn�b�V���l
+        static constexpr std::uint32_t kHashValue = 0x02924F91u;
+        // �����ʂ̎�����
+        static constexpr IndexType kDimensions = 8;
+        // �����ʂ̂����A�����ɒl��1�ƂȂ�C���f�b�N�X�̐��̍ő�l
+        static constexpr IndexType kMaxActiveDimensions = 1;
+        // �����v�Z�̑���ɑS�v�Z���s���^�C�~���O
+        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kAnyPieceMoved;
+
+        // �����ʂ̂����A�l��1�ł���C���f�b�N�X�̃��X�g���擾����
+        static void AppendActiveIndices(const Position& pos, Color perspective,
+          IndexList* active);
+
+        // �����ʂ̂����A���O����l���ω������C���f�b�N�X�̃��X�g���擾����
+        static void AppendChangedIndices(const Position& pos, Color perspective,
+          IndexList* removed, IndexList* added);
+      };
+
+    }  // namespace Features
+
+  }  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
diff --git a/src/eval/nnue/nnue_architecture.h b/src/eval/nnue/nnue_architecture.h
index 3170cdab..9f21d289 100644
--- a/src/eval/nnue/nnue_architecture.h
+++ b/src/eval/nnue/nnue_architecture.h
@@ -7,7 +7,8 @@
 
 // 入力特徴量とネットワーク構造が定義されたヘッダをincludeする
 //#include "architectures/k-p_256x2-32-32.h"
-#include "architectures/k-p-cr_256x2-32-32.h"
+//#include "architectures/k-p-cr_256x2-32-32.h"
+#include "architectures/k-p-cr-ep_256x2-32-32.h"
 //#include "architectures/halfkp_256x2-32-32.h"
 
 namespace Eval {

From 747d98bf1b670f811b567bfd13fcf5980fb42f51 Mon Sep 17 00:00:00 2001
From: Hisayori Noda <nodchip@gmail.com>
Date: Sun, 7 Jul 2019 20:31:54 +0900
Subject: [PATCH 027/583] Added halfkp-cr-ep_256x2-32-32 architecture.

---
 .../architectures/halfkp-cr-ep_256x2-32-32.h  | 38 +++++++++++++++++++
 src/eval/nnue/nnue_architecture.h             |  3 +-
 2 files changed, 40 insertions(+), 1 deletion(-)
 create mode 100644 src/eval/nnue/architectures/halfkp-cr-ep_256x2-32-32.h

diff --git a/src/eval/nnue/architectures/halfkp-cr-ep_256x2-32-32.h b/src/eval/nnue/architectures/halfkp-cr-ep_256x2-32-32.h
new file mode 100644
index 00000000..7063f334
--- /dev/null
+++ b/src/eval/nnue/architectures/halfkp-cr-ep_256x2-32-32.h
@@ -0,0 +1,38 @@
+// NNUE�]���֐��ŗp������͓����ʂƃl�b�g���[�N�\���̒�`
+
+#include "../features/feature_set.h"
+#include "../features/half_kp.h"
+#include "../features/castling_right.h"
+#include "../features/enpassant.h"
+
+#include "../layers/input_slice.h"
+#include "../layers/affine_transform.h"
+#include "../layers/clipped_relu.h"
+
+namespace Eval {
+
+  namespace NNUE {
+
+    // �]���֐��ŗp������͓�����
+    using RawFeatures = Features::FeatureSet<
+      Features::HalfKP<Features::Side::kFriend>, Features::CastlingRight,
+      Features::EnPassant>;
+
+    // �ϊ���̓��͓����ʂ̎�����
+    constexpr IndexType kTransformedFeatureDimensions = 256;
+
+    namespace Layers {
+
+      // �l�b�g���[�N�\���̒�`
+      using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+      using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+      using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+      using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+
+    }  // namespace Layers
+
+    using Network = Layers::OutputLayer;
+
+  }  // namespace NNUE
+
+}  // namespace Eval
diff --git a/src/eval/nnue/nnue_architecture.h b/src/eval/nnue/nnue_architecture.h
index 9f21d289..02c35782 100644
--- a/src/eval/nnue/nnue_architecture.h
+++ b/src/eval/nnue/nnue_architecture.h
@@ -8,8 +8,9 @@
 // 入力特徴量とネットワーク構造が定義されたヘッダをincludeする
 //#include "architectures/k-p_256x2-32-32.h"
 //#include "architectures/k-p-cr_256x2-32-32.h"
-#include "architectures/k-p-cr-ep_256x2-32-32.h"
+//#include "architectures/k-p-cr-ep_256x2-32-32.h"
 //#include "architectures/halfkp_256x2-32-32.h"
+#include "architectures/halfkp-cr-ep_256x2-32-32.h"
 
 namespace Eval {
 

From b300a9d43eb39063c207b2100bbdcb642f7550ca Mon Sep 17 00:00:00 2001
From: Hisayori Noda <nodchip@gmail.com>
Date: Sun, 7 Jul 2019 21:44:02 +0900
Subject: [PATCH 028/583] Enabled halfkp_256x2-32-32.

---
 src/eval/nnue/nnue_architecture.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/eval/nnue/nnue_architecture.h b/src/eval/nnue/nnue_architecture.h
index 02c35782..7479ac0a 100644
--- a/src/eval/nnue/nnue_architecture.h
+++ b/src/eval/nnue/nnue_architecture.h
@@ -9,8 +9,8 @@
 //#include "architectures/k-p_256x2-32-32.h"
 //#include "architectures/k-p-cr_256x2-32-32.h"
 //#include "architectures/k-p-cr-ep_256x2-32-32.h"
-//#include "architectures/halfkp_256x2-32-32.h"
-#include "architectures/halfkp-cr-ep_256x2-32-32.h"
+#include "architectures/halfkp_256x2-32-32.h"
+//#include "architectures/halfkp-cr-ep_256x2-32-32.h"
 
 namespace Eval {
 

From 3dcd2bb69bf85ab2cf8f7fedb92bc2af0f92aaa3 Mon Sep 17 00:00:00 2001
From: tttak <tttak@users.noreply.github.com>
Date: Sun, 23 Jun 2019 23:25:12 +0900
Subject: [PATCH 029/583] =?UTF-8?q?Makefile=E3=81=AEobjclean:=E3=81=AB?=
 =?UTF-8?q?=E3=80=8C./eval/nnue/*.o=E3=80=8D=E7=AD=89=E3=82=92=E8=BF=BD?=
 =?UTF-8?q?=E5=8A=A0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index a48d5674..342debb4 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -456,7 +456,7 @@ clean: objclean profileclean
 
 # clean binaries and objects
 objclean:
-	@rm -f $(EXE) *.o ./syzygy/*.o
+	@rm -f $(EXE) *.o ./syzygy/*.o ./learn/*.o ./extra/*.o ./eval/*.o ./eval/nnue/*.o ./eval/nnue/features/*.o
 
 # clean auxiliary profiling files
 profileclean:

From fc5f64b3837240f5b8e5e5ad69dbbd0b8a72cc62 Mon Sep 17 00:00:00 2001
From: HiraokaTakuya <hiraoka64@gmail.com>
Date: Sun, 30 Jun 2019 16:38:44 +0900
Subject: [PATCH 030/583] Add targets nnue-learn, nnue-learn-use-blas

---
 src/Makefile | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/Makefile b/src/Makefile
index 342debb4..bd026c0b 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -546,6 +546,12 @@ icc-profile-use:
 	EXTRACXXFLAGS='-prof_use -prof_dir ./profdir' \
 	all
 
+nnue-learn: config-sanity
+	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DEVAL_NNUE -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2' build
+
+nnue-learn-use-blas: config-sanity
+	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DEVAL_NNUE -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2 -DUSE_BLAS' build
+
 .depend:
 	-@$(CXX) $(DEPENDFLAGS) -MM $(OBJS:.o=.cpp) > $@ 2> /dev/null
 

From 87184389432b110d3b43bab13a9f423ed80df580 Mon Sep 17 00:00:00 2001
From: HiraokaTakuya <hiraoka64@gmail.com>
Date: Sun, 30 Jun 2019 16:43:36 +0900
Subject: [PATCH 031/583] std::conditional_t can be used from C++14.

---
 src/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index bd026c0b..4aa32cf2 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -160,8 +160,8 @@ endif
 
 ### 3.1 Selecting compiler (default = gcc)
 
-CXXFLAGS += -Wall -Wcast-qual -fno-exceptions -std=c++11 $(EXTRACXXFLAGS)
-DEPENDFLAGS += -std=c++11
+CXXFLAGS += -Wall -Wcast-qual -fno-exceptions -std=c++14 $(EXTRACXXFLAGS)
+DEPENDFLAGS += -std=c++14
 LDFLAGS += $(EXTRALDFLAGS)
 
 ifeq ($(COMP),)

From 10aa774d082ae3d646d22069ac6bcc9b4ad3de2f Mon Sep 17 00:00:00 2001
From: HiraokaTakuya <hiraoka64@gmail.com>
Date: Sun, 30 Jun 2019 16:48:11 +0900
Subject: [PATCH 032/583] Fix a compile error.

---
 src/eval/nnue/evaluate_nnue.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/eval/nnue/evaluate_nnue.cpp b/src/eval/nnue/evaluate_nnue.cpp
index a19b2a0e..ce478783 100644
--- a/src/eval/nnue/evaluate_nnue.cpp
+++ b/src/eval/nnue/evaluate_nnue.cpp
@@ -286,7 +286,7 @@ Value evaluate(const Position& pos) {
 
 #if defined(USE_EVAL_HASH)
   // evaluate hash tableにはあるかも。
-  const Key key = pos.state()->key();
+  const Key key = pos.key();
   ScoreKeyValue entry = *g_evalTable[key];
   entry.decode();
   if (entry.key == key) {

From c643ee0b45d0d8093e706207f820c9b6cf220813 Mon Sep 17 00:00:00 2001
From: HiraokaTakuya <hiraoka64@gmail.com>
Date: Sun, 30 Jun 2019 17:01:25 +0900
Subject: [PATCH 033/583] Fix a compile error.

---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 4aa32cf2..794365f4 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -339,7 +339,7 @@ endif
 ifeq ($(pext),yes)
 	CXXFLAGS += -DUSE_PEXT
 	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
-		CXXFLAGS += -mbmi2
+		CXXFLAGS += -mbmi2 -mavx2
 	endif
 endif
 

From 0be41dbb67bf00fd6b16e151b369af660d6b4291 Mon Sep 17 00:00:00 2001
From: tttak <tttak@users.noreply.github.com>
Date: Fri, 5 Jul 2019 18:00:56 +0900
Subject: [PATCH 034/583] =?UTF-8?q?nullmove=E3=81=A8promotion=E3=81=AE?=
 =?UTF-8?q?=E5=A0=B4=E5=90=88=E3=81=AB=E8=A9=95=E4=BE=A1=E5=80=A4=E3=81=AE?=
 =?UTF-8?q?=E5=B7=AE=E5=88=86=E8=A8=88=E7=AE=97=E3=81=A8=E5=85=A8=E8=A8=88?=
 =?UTF-8?q?=E7=AE=97=E3=81=AE=E7=B5=90=E6=9E=9C=E3=81=8C=E7=95=B0=E3=81=AA?=
 =?UTF-8?q?=E3=81=A3=E3=81=A6=E3=81=84=E3=81=9F=E3=81=AE=E3=82=92=E4=BF=AE?=
 =?UTF-8?q?=E6=AD=A3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/position.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/position.cpp b/src/position.cpp
index fefeac92..5f65071f 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -927,8 +927,8 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
 
 #if defined(EVAL_NNUE)
           piece_no0 = piece_no_of(to);
-          dp.pieceNo[0] = piece_no0;
-          dp.changed_piece[0].old_piece = evalList.bona_piece(piece_no0);
+          //dp.pieceNo[0] = piece_no0;
+          //dp.changed_piece[0].old_piece = evalList.bona_piece(piece_no0);
           assert(evalList.piece_no_list_board[from] == PIECE_NUMBER_NB);
           evalList.put_piece(piece_no0, to, promotion);
           dp.changed_piece[0].new_piece = evalList.bona_piece(piece_no0);
@@ -1161,6 +1161,10 @@ void Position::do_null_move(StateInfo& newSt) {
   st->key ^= Zobrist::side;
   prefetch(TT.first_entry(st->key));
 
+#if defined(EVAL_NNUE)
+  st->accumulator.computed_score = false;
+#endif
+
   ++st->rule50;
   st->pliesFromNull = 0;
 

From 2d70487caac80b617066b9d8a08c91545d4137fa Mon Sep 17 00:00:00 2001
From: Hisayori Noda <nodchip@gmail.com>
Date: Mon, 8 Jul 2019 19:02:09 +0900
Subject: [PATCH 035/583] Enabled k-p_256x2-32-32.

---
 src/eval/nnue/nnue_architecture.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/eval/nnue/nnue_architecture.h b/src/eval/nnue/nnue_architecture.h
index 7479ac0a..bf2ad66d 100644
--- a/src/eval/nnue/nnue_architecture.h
+++ b/src/eval/nnue/nnue_architecture.h
@@ -6,10 +6,10 @@
 #if defined(EVAL_NNUE)
 
 // 入力特徴量とネットワーク構造が定義されたヘッダをincludeする
-//#include "architectures/k-p_256x2-32-32.h"
+#include "architectures/k-p_256x2-32-32.h"
 //#include "architectures/k-p-cr_256x2-32-32.h"
 //#include "architectures/k-p-cr-ep_256x2-32-32.h"
-#include "architectures/halfkp_256x2-32-32.h"
+//#include "architectures/halfkp_256x2-32-32.h"
 //#include "architectures/halfkp-cr-ep_256x2-32-32.h"
 
 namespace Eval {

From c4d30f3649fa835eb36e0e3ded045e3dd3d7305d Mon Sep 17 00:00:00 2001
From: tttak <tttak@users.noreply.github.com>
Date: Wed, 10 Jul 2019 01:26:39 +0900
Subject: [PATCH 036/583] =?UTF-8?q?set=5Ffrom=5Fpacked=5Fsfen()=E3=81=A7mi?=
 =?UTF-8?q?rror=E3=82=92epSquare=E3=81=AB=E3=82=82=E9=81=A9=E7=94=A8?=
 =?UTF-8?q?=E3=81=99=E3=82=8B=E3=82=88=E3=81=86=E3=81=AB=E4=BF=AE=E6=AD=A3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/extra/sfen_packer.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/extra/sfen_packer.cpp b/src/extra/sfen_packer.cpp
index 4d861a58..d56e808b 100644
--- a/src/extra/sfen_packer.cpp
+++ b/src/extra/sfen_packer.cpp
@@ -371,6 +371,9 @@ int Position::set_from_packed_sfen(const PackedSfen& sfen , StateInfo * si, Thre
   // En passant square. Ignore if no pawn capture is possible
   if (stream.read_one_bit()) {
     Square ep_square = static_cast<Square>(stream.read_n_bit(6));
+    if (mirror) {
+      ep_square = Mir(ep_square);
+    }
     st->epSquare = ep_square;
 
     if (!(attackers_to(st->epSquare) & pieces(sideToMove, PAWN))

From e2165155d11d5456e6e256d43e00a444a2057246 Mon Sep 17 00:00:00 2001
From: Hisayori Noda <nodchip@gmail.com>
Date: Thu, 11 Jul 2019 22:47:55 +0900
Subject: [PATCH 037/583] Enabled halfkp_256x2-32-32.h.

---
 src/eval/nnue/nnue_architecture.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/eval/nnue/nnue_architecture.h b/src/eval/nnue/nnue_architecture.h
index bf2ad66d..7479ac0a 100644
--- a/src/eval/nnue/nnue_architecture.h
+++ b/src/eval/nnue/nnue_architecture.h
@@ -6,10 +6,10 @@
 #if defined(EVAL_NNUE)
 
 // 入力特徴量とネットワーク構造が定義されたヘッダをincludeする
-#include "architectures/k-p_256x2-32-32.h"
+//#include "architectures/k-p_256x2-32-32.h"
 //#include "architectures/k-p-cr_256x2-32-32.h"
 //#include "architectures/k-p-cr-ep_256x2-32-32.h"
-//#include "architectures/halfkp_256x2-32-32.h"
+#include "architectures/halfkp_256x2-32-32.h"
 //#include "architectures/halfkp-cr-ep_256x2-32-32.h"
 
 namespace Eval {

From f18acf97eded99e20aa1c69baf7dece2f64039a8 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Thu, 28 May 2020 10:08:51 +0900
Subject: [PATCH 038/583] Added the "nnue" target. Fixed build errors on the
 "nnue-learn-use-blas" target.

---
 src/Makefile | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 794365f4..cd07e426 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -546,11 +546,14 @@ icc-profile-use:
 	EXTRACXXFLAGS='-prof_use -prof_dir ./profdir' \
 	all
 
+nnue: config-sanity
+	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_NNUE -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2' build
+
 nnue-learn: config-sanity
 	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DEVAL_NNUE -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2' build
 
 nnue-learn-use-blas: config-sanity
-	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DEVAL_NNUE -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2 -DUSE_BLAS' build
+	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DEVAL_NNUE -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2 -DUSE_BLAS -I/mingw64/include/OpenBLAS' LDFLAGS='-lopenblas' build
 
 .depend:
 	-@$(CXX) $(DEPENDFLAGS) -MM $(OBJS:.o=.cpp) > $@ 2> /dev/null

From dd9818c2c1e2d672f83332f9a70b9b380f9e3bf2 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Fri, 29 May 2020 09:36:24 +0900
Subject: [PATCH 039/583] Added "-static" compiler option.

---
 src/Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index cd07e426..b32f858a 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -212,6 +212,7 @@ ifeq ($(COMP),mingw)
 	CXXFLAGS += -Wextra -Wshadow
 	LDFLAGS += -static
 endif
+LDFLAGS += -static
 
 ifeq ($(COMP),icc)
 	comp=icc
@@ -553,7 +554,7 @@ nnue-learn: config-sanity
 	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DEVAL_NNUE -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2' build
 
 nnue-learn-use-blas: config-sanity
-	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DEVAL_NNUE -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2 -DUSE_BLAS -I/mingw64/include/OpenBLAS' LDFLAGS='-lopenblas' build
+	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DEVAL_NNUE -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2 -DUSE_BLAS -I/mingw64/include/OpenBLAS' LDFLAGS='$(LDFLAGS) -lopenblas' build
 
 .depend:
 	-@$(CXX) $(DEPENDFLAGS) -MM $(OBJS:.o=.cpp) > $@ 2> /dev/null

From 6703ec8ab081824bfa1699e58e134e31f03128ba Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sat, 30 May 2020 09:50:29 +0900
Subject: [PATCH 040/583] =?UTF-8?q?nnue-gen-sfen-from-original-eval?=
 =?UTF-8?q?=E3=82=BF=E3=83=BC=E3=82=B2=E3=83=83=E3=83=88=E3=82=92=E8=BF=BD?=
 =?UTF-8?q?=E5=8A=A0=E3=81=97=E3=81=9F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/Makefile | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/Makefile b/src/Makefile
index b32f858a..c2c27404 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -550,6 +550,9 @@ icc-profile-use:
 nnue: config-sanity
 	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_NNUE -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2' build
 
+nnue-gen-sfen-from-original-eval: config-sanity
+	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2' build
+
 nnue-learn: config-sanity
 	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DEVAL_NNUE -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2' build
 

From 78134b76418e83a3c335b56254b8ef124ebbd921 Mon Sep 17 00:00:00 2001
From: tttak <tttak@users.noreply.github.com>
Date: Mon, 1 Jun 2020 22:36:23 +0900
Subject: [PATCH 041/583] =?UTF-8?q?OpenMP=E3=81=AE=E6=9C=89=E5=8A=B9?=
 =?UTF-8?q?=E5=8C=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index c2c27404..cf9bf900 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -557,7 +557,7 @@ nnue-learn: config-sanity
 	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DEVAL_NNUE -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2' build
 
 nnue-learn-use-blas: config-sanity
-	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DEVAL_NNUE -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2 -DUSE_BLAS -I/mingw64/include/OpenBLAS' LDFLAGS='$(LDFLAGS) -lopenblas' build
+	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DEVAL_NNUE -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2 -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp' LDFLAGS='$(LDFLAGS) -lopenblas -fopenmp' build
 
 .depend:
 	-@$(CXX) $(DEPENDFLAGS) -MM $(OBJS:.o=.cpp) > $@ 2> /dev/null

From a85e3055f4afc20223caa6a065a119fb813d7221 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Tue, 2 Jun 2020 00:13:35 +0900
Subject: [PATCH 042/583] =?UTF-8?q?=E3=81=84=E3=81=8F=E3=81=A4=E3=81=8B?=
 =?UTF-8?q?=E3=81=AE=E3=82=BF=E3=83=BC=E3=82=B2=E3=83=83=E3=83=88=E3=81=A7?=
 =?UTF-8?q?OpenMP=E3=82=92=E6=9C=89=E5=8A=B9=E3=81=AB=E3=81=97=E3=81=9F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/Makefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index cf9bf900..c12a3eb6 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -548,13 +548,13 @@ icc-profile-use:
 	all
 
 nnue: config-sanity
-	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_NNUE -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2' build
+	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_NNUE -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2 -fopenmp' LDFLAGS='$(LDFLAGS) -fopenmp' build
 
 nnue-gen-sfen-from-original-eval: config-sanity
-	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2' build
+	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2 -fopenmp' LDFLAGS='$(LDFLAGS) -fopenmp' build
 
 nnue-learn: config-sanity
-	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DEVAL_NNUE -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2' build
+	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DEVAL_NNUE -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2 -fopenmp' LDFLAGS='$(LDFLAGS) -fopenmp' build
 
 nnue-learn-use-blas: config-sanity
 	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DEVAL_NNUE -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2 -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp' LDFLAGS='$(LDFLAGS) -lopenblas -fopenmp' build

From 2523f72ff963b160dac5779bb9c7215de129113f Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 3 Jun 2020 23:32:08 +0900
Subject: [PATCH 043/583] =?UTF-8?q?=E7=9B=A4=E4=B8=8A=E3=81=8B=E3=82=89?=
 =?UTF-8?q?=E5=8F=96=E3=82=8A=E9=99=A4=E3=81=8B=E3=82=8C=E3=81=9F=E9=A7=92?=
 =?UTF-8?q?=E3=81=AB=E9=96=A2=E3=81=99=E3=82=8B=E5=B7=AE=E5=88=86=E8=A8=88?=
 =?UTF-8?q?=E7=AE=97=E3=82=92=E7=9C=81=E3=81=8D=E3=80=81=E9=AB=98=E9=80=9F?=
 =?UTF-8?q?=E5=8C=96=E3=81=97=E3=81=9F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/eval/nnue/features/half_kp.cpp          | 12 +++++++++---
 src/eval/nnue/features/half_relative_kp.cpp | 12 +++++++++---
 src/eval/nnue/features/p.cpp                | 12 +++++++++---
 3 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/src/eval/nnue/features/half_kp.cpp b/src/eval/nnue/features/half_kp.cpp
index 1741f3ce..5cd95637 100644
--- a/src/eval/nnue/features/half_kp.cpp
+++ b/src/eval/nnue/features/half_kp.cpp
@@ -42,7 +42,9 @@ void HalfKP<AssociatedKing>::AppendActiveIndices(
   Square sq_target_k;
   GetPieces(pos, perspective, &pieces, &sq_target_k);
   for (PieceNumber i = PIECE_NUMBER_ZERO; i < PIECE_NUMBER_KING; ++i) {
-    active->push_back(MakeIndex(sq_target_k, pieces[i]));
+    if (pieces[i] != Eval::BONA_PIECE_ZERO) {
+      active->push_back(MakeIndex(sq_target_k, pieces[i]));
+    }
   }
 }
 
@@ -59,10 +61,14 @@ void HalfKP<AssociatedKing>::AppendChangedIndices(
     if (dp.pieceNo[i] >= PIECE_NUMBER_KING) continue;
     const auto old_p = static_cast<BonaPiece>(
         dp.changed_piece[i].old_piece.from[perspective]);
-    removed->push_back(MakeIndex(sq_target_k, old_p));
+    if (old_p != Eval::BONA_PIECE_ZERO) {
+      removed->push_back(MakeIndex(sq_target_k, old_p));
+    }
     const auto new_p = static_cast<BonaPiece>(
         dp.changed_piece[i].new_piece.from[perspective]);
-    added->push_back(MakeIndex(sq_target_k, new_p));
+    if (new_p != Eval::BONA_PIECE_ZERO) {
+      added->push_back(MakeIndex(sq_target_k, new_p));
+    }
   }
 }
 
diff --git a/src/eval/nnue/features/half_relative_kp.cpp b/src/eval/nnue/features/half_relative_kp.cpp
index d0810df6..d62beea0 100644
--- a/src/eval/nnue/features/half_relative_kp.cpp
+++ b/src/eval/nnue/features/half_relative_kp.cpp
@@ -50,7 +50,9 @@ void HalfRelativeKP<AssociatedKing>::AppendActiveIndices(
   GetPieces(pos, perspective, &pieces, &sq_target_k);
   for (PieceNumber i = PIECE_NUMBER_ZERO; i < PIECE_NUMBER_KING; ++i) {
     if (pieces[i] >= fe_hand_end) {
-      active->push_back(MakeIndex(sq_target_k, pieces[i]));
+      if (pieces[i] != Eval::BONA_PIECE_ZERO) {
+        active->push_back(MakeIndex(sq_target_k, pieces[i]));
+      }
     }
   }
 }
@@ -69,12 +71,16 @@ void HalfRelativeKP<AssociatedKing>::AppendChangedIndices(
     const auto old_p = static_cast<BonaPiece>(
         dp.changed_piece[i].old_piece.from[perspective]);
     if (old_p >= fe_hand_end) {
-      removed->push_back(MakeIndex(sq_target_k, old_p));
+      if (old_p != Eval::BONA_PIECE_ZERO) {
+        removed->push_back(MakeIndex(sq_target_k, old_p));
+      }
     }
     const auto new_p = static_cast<BonaPiece>(
         dp.changed_piece[i].new_piece.from[perspective]);
     if (new_p >= fe_hand_end) {
-      added->push_back(MakeIndex(sq_target_k, new_p));
+      if (new_p != Eval::BONA_PIECE_ZERO) {
+        added->push_back(MakeIndex(sq_target_k, new_p));
+      }
     }
   }
 }
diff --git a/src/eval/nnue/features/p.cpp b/src/eval/nnue/features/p.cpp
index da1481cb..56bca0a4 100644
--- a/src/eval/nnue/features/p.cpp
+++ b/src/eval/nnue/features/p.cpp
@@ -21,7 +21,9 @@ void P::AppendActiveIndices(
       pos.eval_list()->piece_list_fb() :
       pos.eval_list()->piece_list_fw();
   for (PieceNumber i = PIECE_NUMBER_ZERO; i < PIECE_NUMBER_KING; ++i) {
-    active->push_back(pieces[i]);
+    if (pieces[i] != Eval::BONA_PIECE_ZERO) {
+      active->push_back(pieces[i]);
+    }
   }
 }
 
@@ -32,8 +34,12 @@ void P::AppendChangedIndices(
   const auto& dp = pos.state()->dirtyPiece;
   for (int i = 0; i < dp.dirty_num; ++i) {
     if (dp.pieceNo[i] >= PIECE_NUMBER_KING) continue;
-    removed->push_back(dp.changed_piece[i].old_piece.from[perspective]);
-    added->push_back(dp.changed_piece[i].new_piece.from[perspective]);
+    if (dp.changed_piece[i].old_piece.from[perspective] != Eval::BONA_PIECE_ZERO) {
+      removed->push_back(dp.changed_piece[i].old_piece.from[perspective]);
+    }
+    if (dp.changed_piece[i].new_piece.from[perspective] != Eval::BONA_PIECE_ZERO) {
+      added->push_back(dp.changed_piece[i].new_piece.from[perspective]);
+    }
   }
 }
 

From d23f96d1567aef18f49239cb36da7d98a37921b6 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sat, 6 Jun 2020 18:50:15 +0900
Subject: [PATCH 044/583] No pruning at low plies.

This makes those very early depths a bit more reliable.

Thanks, joergoster!

https://github.com/joergoster/Stockfish-NNUE/commit/be7f37187b85b8093ae0741909cbfd7b2bc76871
---
 src/search.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/search.cpp b/src/search.cpp
index fa423d6e..d03a04dd 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -950,6 +950,7 @@ moves_loop: // When in check, search starts from here
 
       // Step 14. Pruning at shallow depth (~170 Elo)
       if (  !rootNode
+          && thisThread->rootDepth > 4 * ONE_PLY
           && pos.non_pawn_material(us)
           && bestValue > VALUE_MATED_IN_MAX_PLY)
       {

From 91a7557ab4cafd007197e148f8d84577cdf311ac Mon Sep 17 00:00:00 2001
From: tttak <tttak@users.noreply.github.com>
Date: Mon, 8 Jun 2020 13:57:55 +0900
Subject: [PATCH 045/583] =?UTF-8?q?test=20nnue=20test=5Ffeatures=E7=AD=89?=
 =?UTF-8?q?=E3=81=AE=E6=9C=89=E5=8A=B9=E5=8C=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/Makefile                        |  8 ++++----
 src/eval/nnue/nnue_test_command.cpp | 21 +++++++++++++--------
 src/eval/nnue/nnue_test_command.h   |  2 --
 src/uci.cpp                         | 23 +++++++++++++++++++++++
 4 files changed, 40 insertions(+), 14 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index c12a3eb6..c718ba6d 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -548,16 +548,16 @@ icc-profile-use:
 	all
 
 nnue: config-sanity
-	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_NNUE -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2 -fopenmp' LDFLAGS='$(LDFLAGS) -fopenmp' build
+	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_NNUE -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2 -DENABLE_TEST_CMD -fopenmp' LDFLAGS='$(LDFLAGS) -fopenmp' build
 
 nnue-gen-sfen-from-original-eval: config-sanity
-	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2 -fopenmp' LDFLAGS='$(LDFLAGS) -fopenmp' build
+	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2 -DENABLE_TEST_CMD -fopenmp' LDFLAGS='$(LDFLAGS) -fopenmp' build
 
 nnue-learn: config-sanity
-	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DEVAL_NNUE -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2 -fopenmp' LDFLAGS='$(LDFLAGS) -fopenmp' build
+	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DEVAL_NNUE -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2 -DENABLE_TEST_CMD -fopenmp' LDFLAGS='$(LDFLAGS) -fopenmp' build
 
 nnue-learn-use-blas: config-sanity
-	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DEVAL_NNUE -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2 -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp' LDFLAGS='$(LDFLAGS) -lopenblas -fopenmp' build
+	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DEVAL_NNUE -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2 -DENABLE_TEST_CMD -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp' LDFLAGS='$(LDFLAGS) -lopenblas -fopenmp' build
 
 .depend:
 	-@$(CXX) $(DEPENDFLAGS) -MM $(OBJS:.o=.cpp) > $@ 2> /dev/null
diff --git a/src/eval/nnue/nnue_test_command.cpp b/src/eval/nnue/nnue_test_command.cpp
index a2618b3b..28e44273 100644
--- a/src/eval/nnue/nnue_test_command.cpp
+++ b/src/eval/nnue/nnue_test_command.cpp
@@ -2,11 +2,16 @@
 
 #if defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
 
-#include "../../extra/all.h"
+#include "../../thread.h"
+#include "../../uci.h"
 #include "evaluate_nnue.h"
 #include "nnue_test_command.h"
 
 #include <set>
+#include <fstream>
+
+#define ASSERT(X) { if (!(X)) { std::cout << "\nError : ASSERT(" << #X << "), " << __FILE__ << "(" << __LINE__ << "): " << __func__ << std::endl; \
+ std::this_thread::sleep_for(std::chrono::microseconds(3000)); *(int*)1 =0;} }
 
 namespace Eval {
 
@@ -18,7 +23,7 @@ namespace {
 void TestFeatures(Position& pos) {
   const std::uint64_t num_games = 1000;
   StateInfo si;
-  pos.set_hirate(&si,Threads.main());
+  pos.set(StartFEN, false, &si, Threads.main());
   const int MAX_PLY = 256; // 256手までテスト
 
   StateInfo state[MAX_PLY]; // StateInfoを最大手数分だけ
@@ -38,7 +43,7 @@ void TestFeatures(Position& pos) {
       Features::IndexList active_indices[2];
       RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
                                        active_indices);
-      for (const auto perspective : COLOR) {
+      for (const auto perspective : Colors) {
         for (const auto index : active_indices[perspective]) {
           ASSERT(index < RawFeatures::kDimensions);
           ASSERT(index_sets[i][perspective].count(index) == 0);
@@ -56,7 +61,7 @@ void TestFeatures(Position& pos) {
       bool reset[2];
       RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
                                         removed_indices, added_indices, reset);
-      for (const auto perspective : COLOR) {
+      for (const auto perspective : Colors) {
         if (reset[perspective]) {
           (*index_sets)[i][perspective].clear();
           ++num_resets[i];
@@ -91,7 +96,7 @@ void TestFeatures(Position& pos) {
   for (std::uint64_t i = 0; i < num_games; ++i) {
     auto index_sets = make_index_sets(pos);
     for (ply = 0; ply < MAX_PLY; ++ply) {
-      MoveList<LEGAL_ALL> mg(pos); // 全合法手の生成
+      MoveList<LEGAL> mg(pos); // 全合法手の生成
 
       // 合法な指し手がなかった == 詰み
       if (mg.size() == 0)
@@ -106,7 +111,7 @@ void TestFeatures(Position& pos) {
       ASSERT(index_sets == make_index_sets(pos));
     }
 
-    pos.set_hirate(&si,Threads.main());
+    pos.set(StartFEN, false, &si, Threads.main());
 
     // 100回に1回ごとに'.'を出力(進んでいることがわかるように)
     if ((i % 100) == 0)
@@ -184,8 +189,8 @@ void TestCommand(Position& pos, std::istream& stream) {
     PrintInfo(stream);
   } else {
     std::cout << "usage:" << std::endl;
-    std::cout << " test nn test_features" << std::endl;
-    std::cout << " test nn info [path/to/" << kFileName << "...]" << std::endl;
+    std::cout << " test nnue test_features" << std::endl;
+    std::cout << " test nnue info [path/to/" << kFileName << "...]" << std::endl;
   }
 }
 
diff --git a/src/eval/nnue/nnue_test_command.h b/src/eval/nnue/nnue_test_command.h
index bf5894c9..10f57f6c 100644
--- a/src/eval/nnue/nnue_test_command.h
+++ b/src/eval/nnue/nnue_test_command.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TEST_COMMAND_H_
 #define _NNUE_TEST_COMMAND_H_
 
-#include "../../config.h"
-
 #if defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
 
 namespace Eval {
diff --git a/src/uci.cpp b/src/uci.cpp
index 19af09a0..d4178879 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -33,6 +33,10 @@
 #include "uci.h"
 #include "syzygy/tbprobe.h"
 
+#if defined(EVAL_NNUE) && defined(ENABLE_TEST_CMD)
+#include "eval/nnue/nnue_test_command.h"
+#endif
+
 using namespace std;
 
 extern vector<string> setup_bench(const Position&, istream&);
@@ -64,6 +68,19 @@ namespace Learner
 }
 #endif
 
+#if defined(EVAL_NNUE) && defined(ENABLE_TEST_CMD)
+void test_cmd(Position& pos, istringstream& is)
+{
+    // �T�������邩���m��Ȃ��̂ŏ��������Ă����B
+    is_ready();
+
+    std::string param;
+    is >> param;
+
+    if (param == "nnue") Eval::NNUE::TestCommand(pos, is);
+}
+#endif
+
 namespace {
   // position() is called when engine receives the "position" UCI command.
   // The function sets up the position described in the given FEN string ("fen")
@@ -376,6 +393,12 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "search") search_cmd(pos, is);
 
 #endif
+
+#if defined(EVAL_NNUE) && defined(ENABLE_TEST_CMD)
+      // �e�X�g�R�}���h
+      else if (token == "test") test_cmd(pos, is);
+#endif
+
       else
           sync_cout << "Unknown command: " << cmd << sync_endl;
 

From 33772a0418117f88cc04de7833bdfdb05640f8bc Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Mon, 8 Jun 2020 23:46:06 +0900
Subject: [PATCH 046/583] =?UTF-8?q?=E3=82=B3=E3=83=B3=E3=83=91=E3=82=A4?=
 =?UTF-8?q?=E3=83=AB=E3=82=A8=E3=83=A9=E3=83=BC=E3=82=92=E4=BF=AE=E6=AD=A3?=
 =?UTF-8?q?=E3=81=97=E3=81=9F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/eval/nnue/evaluate_nnue_learner.cpp |  6 +++---
 src/learn/learner.cpp                   | 17 +++++++++--------
 src/learn/multi_think.h                 | 14 +++++++-------
 src/search.cpp                          | 20 ++++++++++----------
 4 files changed, 29 insertions(+), 28 deletions(-)

diff --git a/src/eval/nnue/evaluate_nnue_learner.cpp b/src/eval/nnue/evaluate_nnue_learner.cpp
index b474f2e2..636f90e1 100644
--- a/src/eval/nnue/evaluate_nnue_learner.cpp
+++ b/src/eval/nnue/evaluate_nnue_learner.cpp
@@ -35,7 +35,7 @@ namespace {
 std::vector<Example> examples;
 
 // examplesの排他制御をするMutex
-Mutex examples_mutex;
+std::mutex examples_mutex;
 
 // ミニバッチのサンプル数
 uint64_t batch_size;
@@ -158,7 +158,7 @@ void AddExample(Position& pos, Color rootColor,
     }
   }
 
-  std::lock_guard<Mutex> lock(examples_mutex);
+  std::lock_guard<std::mutex> lock(examples_mutex);
   examples.push_back(std::move(example));
 }
 
@@ -170,7 +170,7 @@ void UpdateParameters(uint64_t epoch) {
   const auto learning_rate = static_cast<LearnFloatType>(
       get_eta() / batch_size);
 
-  std::lock_guard<Mutex> lock(examples_mutex);
+  std::lock_guard<std::mutex> lock(examples_mutex);
   std::shuffle(examples.begin(), examples.end(), rng);
   while (examples.size() >= batch_size) {
     std::vector<Example> batch(examples.end() - batch_size, examples.end());
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index c143e451..526c027c 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -15,6 +15,7 @@
 
 #if defined(EVAL_LEARN)
 
+#include <filesystem>
 #include <random>
 
 #include "learn.h"
@@ -170,7 +171,7 @@ struct SfenWriter
 			// sfen_buffers_poolに積んでおけばあとはworkerがよきに計らってくれる。
 
 			// sfen_buffers_poolの内容を変更するときはmutexのlockが必要。
-			std::unique_lock<Mutex> lk(mutex);
+			std::unique_lock<std::mutex> lk(mutex);
 			sfen_buffers_pool.push_back(buf);
 
 			buf = nullptr;
@@ -181,7 +182,7 @@ struct SfenWriter
 	// 自分のスレッド用のバッファに残っている分をファイルに書き出すためのバッファに移動させる。
 	void finalize(size_t thread_id)
 	{
-		std::unique_lock<Mutex> lk(mutex);
+		std::unique_lock<std::mutex> lk(mutex);
 
 		auto& buf = sfen_buffers[thread_id];
 
@@ -214,7 +215,7 @@ struct SfenWriter
 		{
 			vector<PSVector*> buffers;
 			{
-				std::unique_lock<Mutex> lk(mutex);
+				std::unique_lock<std::mutex> lk(mutex);
 
 				// まるごとコピー
 				buffers = sfen_buffers_pool;
@@ -299,7 +300,7 @@ private:
 	std::vector<PSVector*> sfen_buffers_pool;
 
 	// sfen_buffers_poolにアクセスするときに必要なmutex
-	Mutex mutex;
+	std::mutex mutex;
 
 	// 書きだした局面の数
 	uint64_t sfen_write_count = 0;
@@ -1293,7 +1294,7 @@ struct SfenReader
 		while (true)
 		{
 			{
-				std::unique_lock<Mutex> lk(mutex);
+				std::unique_lock<std::mutex> lk(mutex);
 				// ファイルバッファから充填できたなら、それで良し。
 				if (packed_sfens_pool.size() != 0)
 				{
@@ -1410,7 +1411,7 @@ struct SfenReader
 
 			// sfensの用意が出来たので、折を見てコピー
 			{
-				std::unique_lock<Mutex> lk(mutex);
+				std::unique_lock<std::mutex> lk(mutex);
 
 				// ポインタをコピーするだけなのでこの時間は無視できるはず…。
 				// packed_sfens_poolの内容を変更するのでmutexのlockが必要。
@@ -1479,7 +1480,7 @@ protected:
 	std::vector<PSVector*> packed_sfens;
 
 	// packed_sfens_poolにアクセスするときのmutex
-	Mutex mutex;
+	std::mutex mutex;
 
 	// sfenのpool。fileから読み込むworker threadはここに補充する。
 	// 各worker threadはここから自分のpacked_sfens[thread_id]に充填する。
@@ -2704,7 +2705,7 @@ void learn(Position&, istringstream& is)
 		#pragma warning(push)
 		#pragma warning(disable:4996)
 
-		namespace sys = std::tr2::sys;
+		namespace sys = std::filesystem;
 		sys::path p(kif_base_dir); // 列挙の起点
 		std::for_each(sys::directory_iterator(p), sys::directory_iterator(),
 			[&](const sys::path& p) {
diff --git a/src/learn/multi_think.h b/src/learn/multi_think.h
index 4d4e0daf..ad6baa5e 100644
--- a/src/learn/multi_think.h
+++ b/src/learn/multi_think.h
@@ -59,7 +59,7 @@ struct MultiThink
 	// 局面を生成する場合などは、局面を生成するタイミングでこの関数を呼び出すようにしないと、
 	// 生成した局面数と、カウンターの値が一致しなくなってしまうので注意すること。
 	uint64_t get_next_loop_count() {
-		std::unique_lock<Mutex> lk(loop_mutex);
+		std::unique_lock<std::mutex> lk(loop_mutex);
 		if (loop_count >= loop_max)
 			return UINT64_MAX;
 		return loop_count++;
@@ -67,12 +67,12 @@ struct MultiThink
 
 	// [ASYNC] 処理した個数を返す用。呼び出されるごとにインクリメントされたカウンターが返る。
 	uint64_t get_done_count() {
-		std::unique_lock<Mutex> lk(loop_mutex);
+		std::unique_lock<std::mutex> lk(loop_mutex);
 		return ++done_count;
 	}
 
 	// worker threadがI/Oにアクセスするときのmutex
-	Mutex io_mutex;
+	std::mutex io_mutex;
 
 protected:
 	// 乱数発生器本体
@@ -87,7 +87,7 @@ private:
 	std::atomic<uint64_t> done_count;
 
 	// ↑の変数を変更するときのmutex
-	Mutex loop_mutex;
+	std::mutex loop_mutex;
 
 	// スレッドの終了フラグ。
 	// vector<bool>にすると複数スレッドから書き換えようとしたときに正しく反映されないことがある…はず。
@@ -117,7 +117,7 @@ struct TaskDispatcher
 	// [ASYNC] taskを一つ積む。
 	void push_task_async(Task task)
 	{
-		std::unique_lock<Mutex> lk(task_mutex);
+		std::unique_lock<std::mutex> lk(task_mutex);
 		tasks.push_back(task);
 	}
 
@@ -134,7 +134,7 @@ protected:
 	// [ASYNC] taskを一つ取り出す。on_idle()から呼び出される。
 	Task get_task_async()
 	{
-		std::unique_lock<Mutex> lk(task_mutex);
+		std::unique_lock<std::mutex> lk(task_mutex);
 		if (tasks.size() == 0)
 			return nullptr;
 		Task task = *tasks.rbegin();
@@ -143,7 +143,7 @@ protected:
 	}
 
 	// tasksにアクセスするとき用のmutex
-	Mutex task_mutex;
+	std::mutex task_mutex;
 };
 
 #endif // defined(EVAL_LEARN) && defined(YANEURAOU_2018_OTAFUKU_ENGINE)
diff --git a/src/search.cpp b/src/search.cpp
index 15655329..43032d86 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -2024,9 +2024,9 @@ namespace Learner
     {
       auto th = pos.this_thread();
 
-      th->completedDepth = DEPTH_ZERO;
+      th->completedDepth = 0;
       th->selDepth = 0;
-      th->rootDepth = DEPTH_ZERO;
+      th->rootDepth = 0;
 
       // �T���m�[�h���̃[��������
       th->nodes = 0;
@@ -2050,7 +2050,7 @@ namespace Learner
         : -make_score(ct, ct / 2));
 
       for (int i = 7; i > 0; i--)
-        (ss - i)->continuationHistory = &th->continuationHistory[NO_PIECE][0]; // Use as sentinel
+          (ss - i)->continuationHistory = &th->continuationHistory[0][0][NO_PIECE][0]; // Use as a sentinel
 
       // rootMoves�̐ݒ�
       auto& rootMoves = th->rootMoves;
@@ -2109,7 +2109,7 @@ namespace Learner
       return { mated_in(/*ss->ply*/ 0 + 1), {} };
     }
 
-    auto bestValue = ::qsearch<PV>(pos, ss, -VALUE_INFINITE, VALUE_INFINITE, DEPTH_ZERO);
+    auto bestValue = ::qsearch<PV>(pos, ss, -VALUE_INFINITE, VALUE_INFINITE, 0);
 
     // ����ꂽPV��Ԃ��B
     std::vector<Move> pvs;
@@ -2139,11 +2139,11 @@ namespace Learner
   {
     std::vector<Move> pvs;
 
-    Depth depth = depth_ * ONE_PLY;
-    if (depth < DEPTH_ZERO)
+    Depth depth = depth_;
+    if (depth < 0)
       return std::pair<Value, std::vector<Move>>(Eval::evaluate(pos), std::vector<Move>());
 
-    if (depth == DEPTH_ZERO)
+    if (depth == 0)
       return qsearch(pos);
 
     Stack stack[MAX_PLY + 10], * ss = stack + 7;
@@ -2176,7 +2176,7 @@ namespace Learner
     Value delta = -VALUE_INFINITE;
     Value bestValue = -VALUE_INFINITE;
 
-    while ((rootDepth += ONE_PLY) <= depth
+    while ((rootDepth += 1) <= depth
       // node�����𒴂����ꍇ�����̃��[�v�𔲂���
       // �T���m�[�h���́A���̊֐��̈����œn����Ă���B
       && !(nodesLimit /*node��������*/ && th->nodes.load(std::memory_order_relaxed) >= nodesLimit)
@@ -2203,7 +2203,7 @@ namespace Learner
         selDepth = 0;
 
         // depth 5�ȏ�ɂ����Ă�aspiration search�ɐ؂�ւ���B
-        if (rootDepth >= 5 * ONE_PLY)
+        if (rootDepth >= 5 * 1)
         {
           delta = Value(20);
 
@@ -2217,7 +2217,7 @@ namespace Learner
         int failedHighCnt = 0;
         while (true)
         {
-          Depth adjustedDepth = std::max(ONE_PLY, rootDepth - failedHighCnt * ONE_PLY);
+          Depth adjustedDepth = std::max(1, rootDepth - failedHighCnt * 1);
           bestValue = ::search<PV>(pos, ss, alpha, beta, adjustedDepth, false);
 
           stable_sort(rootMoves.begin() + pvIdx, rootMoves.end());

From 2d5c50d85b37427f6ad76874e5af41dec3b78f26 Mon Sep 17 00:00:00 2001
From: tttak <tttak@users.noreply.github.com>
Date: Mon, 22 Jun 2020 00:35:09 +0900
Subject: [PATCH 047/583] =?UTF-8?q?eval=5Fnnue=E3=82=B3=E3=83=9E=E3=83=B3?=
 =?UTF-8?q?=E3=83=89=E8=BF=BD=E5=8A=A0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/evaluate.h | 2 ++
 src/uci.cpp    | 4 ++++
 2 files changed, 6 insertions(+)

diff --git a/src/evaluate.h b/src/evaluate.h
index 1941e0dd..a9e6a563 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -35,6 +35,8 @@ Value evaluate(const Position& pos);
 
 void evaluate_with_no_return(const Position& pos);
 
+Value compute_eval(const Position& pos);
+
 #if defined(EVAL_NNUE) || defined(EVAL_LEARN)
 // �]���֐��t�@�C����ǂݍ��ށB
 // ����́A"is_ready"�R�}���h�̉�������1�x�����Ăяo�����B2�x�Ăяo�����Ƃ͑z�肵�Ă��Ȃ��B
diff --git a/src/uci.cpp b/src/uci.cpp
index 4e1b8c45..b7ece34b 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -400,6 +400,10 @@ void UCI::loop(int argc, char* argv[]) {
 
 #endif
 
+#if defined(EVAL_NNUE)
+      else if (token == "eval_nnue") sync_cout << "eval_nnue = " << Eval::compute_eval(pos) << sync_endl;
+#endif
+
 #if defined(EVAL_NNUE) && defined(ENABLE_TEST_CMD)
       // �e�X�g�R�}���h
       else if (token == "test") test_cmd(pos, is);

From 6c7a5943625cc0ec1efe5ae244a3e21b2030ac54 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Mon, 22 Jun 2020 10:27:52 +0900
Subject: [PATCH 048/583] Added "-Wl,-s" option.

---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 1bd73bfa..6645f009 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -218,7 +218,7 @@ ifeq ($(COMP),mingw)
 	CXXFLAGS += -Wextra -Wshadow
 	LDFLAGS += -static
 endif
-LDFLAGS += -static
+LDFLAGS += -static -Wl,-s
 
 ifeq ($(COMP),icc)
 	comp=icc

From 76b0de40a1cae75b18bccb8b76908a087c842143 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Tue, 23 Jun 2020 23:47:59 +0900
Subject: [PATCH 049/583] =?UTF-8?q?=E3=82=B3=E3=83=B3=E3=83=91=E3=82=A4?=
 =?UTF-8?q?=E3=83=A9=E3=83=BC=E3=82=AA=E3=83=97=E3=82=B7=E3=83=A7=E3=83=B3?=
 =?UTF-8?q?=E3=82=92-std=3Dc++14=E3=81=8B=E3=82=89-std=3Dc++17=E3=81=AB?=
 =?UTF-8?q?=E5=A4=89=E6=9B=B4=E3=81=97=E3=81=9F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 6645f009..cb86b5c6 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -166,8 +166,8 @@ endif
 
 ### 3.1 Selecting compiler (default = gcc)
 
-CXXFLAGS += -Wall -Wcast-qual -fno-exceptions -std=c++14 $(EXTRACXXFLAGS)
-DEPENDFLAGS += -std=c++14
+CXXFLAGS += -Wall -Wcast-qual -fno-exceptions -std=c++17 $(EXTRACXXFLAGS)
+DEPENDFLAGS += -std=c++17
 LDFLAGS += $(EXTRALDFLAGS)
 
 ifeq ($(COMP),)

From 43e78187d7f486b4f573b067fdd3901cfa85f5f7 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 24 Jun 2020 00:27:45 +0900
Subject: [PATCH 050/583] =?UTF-8?q?ARCH=3Dx86-64-avx2=E3=82=92=E6=8C=87?=
 =?UTF-8?q?=E5=AE=9A=E3=81=A7=E3=81=8D=E3=82=8B=E3=82=88=E3=81=86=E3=81=AB?=
 =?UTF-8?q?=E3=81=97=E3=81=9F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/Makefile                | 36 ++++++++++++++++++++++++++++--------
 src/eval/nnue/nnue_common.h |  6 ++++++
 2 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index cb86b5c6..33fc2c00 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -96,6 +96,7 @@ bits = 32
 prefetch = no
 popcnt = no
 sse = no
+avx2 = no
 pext = no
 
 ### 2.2 Architecture specific
@@ -134,12 +135,22 @@ ifeq ($(ARCH),x86-64-modern)
 	sse = yes
 endif
 
+ifeq ($(ARCH),x86-64-avx2)
+	arch = x86_64
+	bits = 64
+	prefetch = yes
+	popcnt = yes
+	sse = yes
+	avx2 = yes
+endif
+
 ifeq ($(ARCH),x86-64-bmi2)
 	arch = x86_64
 	bits = 64
 	prefetch = yes
 	popcnt = yes
 	sse = yes
+	avx2 = yes
 	pext = yes
 endif
 
@@ -336,11 +347,18 @@ endif
 ### 3.6 popcnt
 ifeq ($(popcnt),yes)
 	ifeq ($(arch),ppc64)
-		CXXFLAGS += -DUSE_POPCNT
+		CXXFLAGS += -DUSE_POPCNT -DUSE_SSE2
 	else ifeq ($(comp),icc)
-		CXXFLAGS += -msse3 -DUSE_POPCNT
+		CXXFLAGS += -msse3 -DUSE_POPCNT -DUSE_SSE2
 	else
-		CXXFLAGS += -msse3 -mpopcnt -DUSE_POPCNT
+		CXXFLAGS += -msse3 -mpopcnt -DUSE_POPCNT -DUSE_SSE2
+	endif
+endif
+
+ifeq ($(avx2),yes)
+	CXXFLAGS += -DUSE_AVX2
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
+		CXXFLAGS += -mavx2
 	endif
 endif
 
@@ -348,7 +366,7 @@ endif
 ifeq ($(pext),yes)
 	CXXFLAGS += -DUSE_PEXT
 	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
-		CXXFLAGS += -mbmi2 -mavx2
+		CXXFLAGS += -mbmi2
 	endif
 endif
 
@@ -400,6 +418,7 @@ help:
 	@echo "Supported archs:"
 	@echo ""
 	@echo "x86-64-bmi2             > x86 64-bit with pext support (also enables SSE4)"
+	@echo "x86-64-avx2             > x86 64-bit with avx2 support (also enables SSE4)"
 	@echo "x86-64-modern           > x86 64-bit with popcnt support (also enables SSE3)"
 	@echo "x86-64                  > x86 64-bit generic"
 	@echo "x86-32                  > x86 32-bit (also enables SSE)"
@@ -495,6 +514,7 @@ config-sanity:
 	@echo "prefetch: '$(prefetch)'"
 	@echo "popcnt: '$(popcnt)'"
 	@echo "sse: '$(sse)'"
+	@echo "avx2: '$(avx2)'"
 	@echo "pext: '$(pext)'"
 	@echo ""
 	@echo "Flags:"
@@ -556,16 +576,16 @@ icc-profile-use:
 	all
 
 nnue: config-sanity
-	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_NNUE -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2 -DENABLE_TEST_CMD -fopenmp' LDFLAGS='$(LDFLAGS) -fopenmp' build
+	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_NNUE -DUSE_EVAL_HASH -DENABLE_TEST_CMD -fopenmp' LDFLAGS='$(LDFLAGS) -fopenmp' build
 
 nnue-gen-sfen-from-original-eval: config-sanity
-	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2 -DENABLE_TEST_CMD -fopenmp' LDFLAGS='$(LDFLAGS) -fopenmp' build
+	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DUSE_EVAL_HASH -DENABLE_TEST_CMD -fopenmp' LDFLAGS='$(LDFLAGS) -fopenmp' build
 
 nnue-learn: config-sanity
-	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DEVAL_NNUE -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2 -DENABLE_TEST_CMD -fopenmp' LDFLAGS='$(LDFLAGS) -fopenmp' build
+	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DEVAL_NNUE -DUSE_EVAL_HASH -DENABLE_TEST_CMD -fopenmp' LDFLAGS='$(LDFLAGS) -fopenmp' build
 
 nnue-learn-use-blas: config-sanity
-	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DEVAL_NNUE -DUSE_EVAL_HASH -DUSE_AVX2 -DUSE_SSE2 -DENABLE_TEST_CMD -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp' LDFLAGS='$(LDFLAGS) -lopenblas -fopenmp' build
+	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DEVAL_NNUE -DUSE_EVAL_HASH -DENABLE_TEST_CMD -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp' LDFLAGS='$(LDFLAGS) -lopenblas -fopenmp' build
 
 .depend:
 	-@$(CXX) $(DEPENDFLAGS) -MM $(OBJS:.o=.cpp) > $@ 2> /dev/null
diff --git a/src/eval/nnue/nnue_common.h b/src/eval/nnue/nnue_common.h
index ac114b75..8ef8fee4 100644
--- a/src/eval/nnue/nnue_common.h
+++ b/src/eval/nnue/nnue_common.h
@@ -5,6 +5,12 @@
 
 #if defined(EVAL_NNUE)
 
+#if defined(USE_AVX2)
+#include <immintrin.h>
+#elif defined(USE_SSE2)
+#include <emmintrin.h>
+#endif
+
 namespace Eval {
 
 namespace NNUE {

From 999f5ec446bb6874699887abf1cd224a1db22f72 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 24 Jun 2020 00:47:34 +0900
Subject: [PATCH 051/583] =?UTF-8?q?COMP=3Dmsys2=E3=82=92=E6=8C=87=E5=AE=9A?=
 =?UTF-8?q?=E3=81=A7=E3=81=8D=E3=82=8B=E3=82=88=E3=81=86=E3=81=AB=E3=81=97?=
 =?UTF-8?q?=E3=81=9F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/Makefile | 30 ++++++++++++++++++++++++++----
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 33fc2c00..8b656da2 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -229,7 +229,6 @@ ifeq ($(COMP),mingw)
 	CXXFLAGS += -Wextra -Wshadow
 	LDFLAGS += -static
 endif
-LDFLAGS += -static -Wl,-s
 
 ifeq ($(COMP),icc)
 	comp=icc
@@ -259,6 +258,28 @@ ifeq ($(COMP),clang)
 	endif
 endif
 
+ifeq ($(COMP),msys2)
+	comp=gcc
+	CXX=g++
+	CXXFLAGS += -pedantic -Wextra -Wshadow
+
+	ifeq ($(ARCH),armv7)
+		ifeq ($(OS),Android)
+			CXXFLAGS += -m$(bits)
+			LDFLAGS += -m$(bits)
+		endif
+	else
+		CXXFLAGS += -m$(bits)
+		LDFLAGS += -m$(bits)
+	endif
+
+	ifneq ($(KERNEL),Darwin)
+	   LDFLAGS += -Wl,--no-as-needed
+	endif
+
+	LDFLAGS += -static -Wl,-s
+endif
+
 ifeq ($(comp),icc)
 	profile_make = icc-profile-make
 	profile_use = icc-profile-use
@@ -357,7 +378,7 @@ endif
 
 ifeq ($(avx2),yes)
 	CXXFLAGS += -DUSE_AVX2
-	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw msys2))
 		CXXFLAGS += -mavx2
 	endif
 endif
@@ -365,7 +386,7 @@ endif
 ### 3.7 pext
 ifeq ($(pext),yes)
 	CXXFLAGS += -DUSE_PEXT
-	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw msys2))
 		CXXFLAGS += -mbmi2
 	endif
 endif
@@ -375,7 +396,7 @@ endif
 ### needs access to the optimization flags.
 ifeq ($(optimize),yes)
 ifeq ($(debug), no)
-	ifeq ($(comp),$(filter $(comp),gcc clang))
+	ifeq ($(comp),$(filter $(comp),gcc clang msys2))
 		CXXFLAGS += -flto
 		LDFLAGS += $(CXXFLAGS)
 	endif
@@ -435,6 +456,7 @@ help:
 	@echo "mingw                   > Gnu compiler with MinGW under Windows"
 	@echo "clang                   > LLVM Clang compiler"
 	@echo "icc                     > Intel compiler"
+	@echo "msys2                   > MSYS2"
 	@echo ""
 	@echo "Simple examples. If you don't know what to do, you likely want to run: "
 	@echo ""

From ccd2e602a0c4dd4d8e323075720d3f84ae1fc4a5 Mon Sep 17 00:00:00 2001
From: zz4032 <alg4032@arcor.de>
Date: Tue, 23 Jun 2020 19:55:54 +0200
Subject: [PATCH 052/583] Adding mm_malloc.h

Otherwise compiling with 'modern' or 'avx2' architecture on Linux aborts with errors.
---
 src/misc.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/misc.h b/src/misc.h
index 5f5aa7e5..e145c4f4 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -29,6 +29,7 @@
 #include <ostream>
 #include <string>
 #include <vector>
+#include <mm_malloc.h>
 
 #include "types.h"
 #include "thread_win32_osx.h"

From 3102896a0017ea8fc79ad12694a9228399aa0dc2 Mon Sep 17 00:00:00 2001
From: zz4032 <alg4032@arcor.de>
Date: Tue, 23 Jun 2020 20:53:32 +0200
Subject: [PATCH 053/583] Linux identifier corrected.

---
 src/misc.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/misc.cpp b/src/misc.cpp
index 36924d37..25435ac5 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -658,7 +658,7 @@ namespace Dependency {
 }
 
 #endif
-#elif defined(_LINUX)
+#elif defined(__linux__)
 
 // linux環境において、この_LINUXというシンボルはmakefileにて定義されるものとする。
 

From 5aa801e72188da98c86857ee0f97f85795195ee2 Mon Sep 17 00:00:00 2001
From: zz4032 <alg4032@arcor.de>
Date: Tue, 23 Jun 2020 20:54:50 +0200
Subject: [PATCH 054/583] Update misc.h

---
 src/misc.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/misc.h b/src/misc.h
index e145c4f4..5f5aa7e5 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -29,7 +29,6 @@
 #include <ostream>
 #include <string>
 #include <vector>
-#include <mm_malloc.h>
 
 #include "types.h"
 #include "thread_win32_osx.h"

From 5ae64e22446833a21e49fd936682dd47edaf27cb Mon Sep 17 00:00:00 2001
From: zz4032 <alg4032@arcor.de>
Date: Tue, 23 Jun 2020 19:55:54 +0200
Subject: [PATCH 055/583] Adding mm_malloc.h

Otherwise compiling with 'modern' or 'avx2' architecture on Linux aborts with errors.
---
 src/misc.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/misc.h b/src/misc.h
index 5f5aa7e5..e145c4f4 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -29,6 +29,7 @@
 #include <ostream>
 #include <string>
 #include <vector>
+#include <mm_malloc.h>
 
 #include "types.h"
 #include "thread_win32_osx.h"

From 0abd692543c657b1b8e69750bd6bfbb4c7fc903c Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 24 Jun 2020 09:33:46 +0900
Subject: [PATCH 056/583] Fixed a build error on Visual Studio.

---
 src/misc.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/misc.h b/src/misc.h
index e145c4f4..d3322dac 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -29,7 +29,9 @@
 #include <ostream>
 #include <string>
 #include <vector>
+#ifndef _MSC_VER
 #include <mm_malloc.h>
+#endif
 
 #include "types.h"
 #include "thread_win32_osx.h"

From 8ef6c837b7907fe6db326d6adf73fd4efeba68c9 Mon Sep 17 00:00:00 2001
From: joergoster <osterj165@googlemail.com>
Date: Wed, 24 Jun 2020 18:04:28 +0200
Subject: [PATCH 057/583] Fix. Bench: 4471740

---
 src/misc.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/misc.h b/src/misc.h
index bd866842..72f621a6 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -142,6 +142,7 @@ inline uint64_t mul_hi64(uint64_t a, uint64_t b) {
     uint64_t c3 = aL * bH + (uint32_t)c2;
     return aH * bH + (c2 >> 32) + (c3 >> 32);
 #endif
+}
 
 /// Under Windows it is not possible for a process to run on more than one
 /// logical processor group. This usually means to be limited to use max 64

From 0e932757e5421cfc9aaba33cc4b0f50e4d3378f1 Mon Sep 17 00:00:00 2001
From: joergoster <osterj165@googlemail.com>
Date: Wed, 24 Jun 2020 20:18:32 +0200
Subject: [PATCH 058/583] Re-enable increment operator for Piece. No functional
 change.

---
 src/evaluate.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 921cb808..c5430157 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -983,7 +983,7 @@ bool EvalList::is_valid(const Position& pos)
     for (Piece pc = NO_PIECE; pc < PIECE_NB; ++pc)
     {
       auto pt = type_of(pc);
-      if (pt == NO_PIECE || pt == 7) // ���݂��Ȃ���
+      if (pt == NO_PIECE_TYPE || pt == 7) // ���݂��Ȃ���
         continue;
 
       // ��pc��BonaPiece�̊J�n�ԍ�

From 5e119f5139fd13816ef6091285e785f44f19b202 Mon Sep 17 00:00:00 2001
From: joergoster <osterj165@googlemail.com>
Date: Wed, 24 Jun 2020 20:22:56 +0200
Subject: [PATCH 059/583] Finally.

---
 src/types.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/types.h b/src/types.h
index 2512fc29..ad2debca 100644
--- a/src/types.h
+++ b/src/types.h
@@ -309,6 +309,7 @@ ENABLE_FULL_OPERATORS_ON(Value)
 ENABLE_FULL_OPERATORS_ON(Direction)
 
 ENABLE_INCR_OPERATORS_ON(PieceType)
+ENABLE_INCR_OPERATORS_ON(Piece)
 ENABLE_INCR_OPERATORS_ON(Square)
 ENABLE_INCR_OPERATORS_ON(File)
 ENABLE_INCR_OPERATORS_ON(Rank)

From 08d8adbadedc6f822c5f0f35d378237ac06aed01 Mon Sep 17 00:00:00 2001
From: FireFather <firefather@telenet.be>
Date: Wed, 24 Jun 2020 22:41:00 +0200
Subject: [PATCH 060/583] added header guards

5 include files in \eval\nnue\architectures
---
 src/eval/nnue/architectures/halfkp-cr-ep_256x2-32-32.h | 4 ++++
 src/eval/nnue/architectures/halfkp_256x2-32-32.h       | 4 ++++
 src/eval/nnue/architectures/k-p-cr-ep_256x2-32-32.h    | 4 ++++
 src/eval/nnue/architectures/k-p-cr_256x2-32-32.h       | 4 ++++
 src/eval/nnue/architectures/k-p_256x2-32-32.h          | 3 +++
 5 files changed, 19 insertions(+)

diff --git a/src/eval/nnue/architectures/halfkp-cr-ep_256x2-32-32.h b/src/eval/nnue/architectures/halfkp-cr-ep_256x2-32-32.h
index 7063f334..9f1f97c0 100644
--- a/src/eval/nnue/architectures/halfkp-cr-ep_256x2-32-32.h
+++ b/src/eval/nnue/architectures/halfkp-cr-ep_256x2-32-32.h
@@ -1,5 +1,8 @@
 // NNUE�]���֐��ŗp������͓����ʂƃl�b�g���[�N�\���̒�`
 
+#ifndef HALFKP_CR_EP_256X2_32_32_H
+#define HALFKP_CR_EP_256X2_32_32_H
+
 #include "../features/feature_set.h"
 #include "../features/half_kp.h"
 #include "../features/castling_right.h"
@@ -36,3 +39,4 @@ namespace Eval {
   }  // namespace NNUE
 
 }  // namespace Eval
+#endif // HALFKP_CR_EP_256X2_32_32_H
\ No newline at end of file
diff --git a/src/eval/nnue/architectures/halfkp_256x2-32-32.h b/src/eval/nnue/architectures/halfkp_256x2-32-32.h
index 9b25ee54..c79747c3 100644
--- a/src/eval/nnue/architectures/halfkp_256x2-32-32.h
+++ b/src/eval/nnue/architectures/halfkp_256x2-32-32.h
@@ -1,5 +1,8 @@
 ﻿// NNUE評価関数で用いる入力特徴量とネットワーク構造の定義
 
+#ifndef HALFKP_256X2_32_32_H
+#define HALFKP_256X2_32_32_H
+
 #include "../features/feature_set.h"
 #include "../features/half_kp.h"
 
@@ -33,3 +36,4 @@ using Network = Layers::OutputLayer;
 }  // namespace NNUE
 
 }  // namespace Eval
+#endif // HALFKP_256X2_32_32_H
diff --git a/src/eval/nnue/architectures/k-p-cr-ep_256x2-32-32.h b/src/eval/nnue/architectures/k-p-cr-ep_256x2-32-32.h
index 17871169..dc761866 100644
--- a/src/eval/nnue/architectures/k-p-cr-ep_256x2-32-32.h
+++ b/src/eval/nnue/architectures/k-p-cr-ep_256x2-32-32.h
@@ -1,5 +1,8 @@
 // NNUE�]���֐��ŗp������͓����ʂƃl�b�g���[�N�\���̒�`
 
+#ifndef K_P_CR_EP_256X2_32_32_H
+#define K_P_CR_EP_256X2_32_32_H
+
 #include "../features/feature_set.h"
 #include "../features/k.h"
 #include "../features/p.h"
@@ -36,3 +39,4 @@ namespace Eval {
   }  // namespace NNUE
 
 }  // namespace Eval
+#endif // K_P_CR_EP_256X2_32_32_H
\ No newline at end of file
diff --git a/src/eval/nnue/architectures/k-p-cr_256x2-32-32.h b/src/eval/nnue/architectures/k-p-cr_256x2-32-32.h
index 9ce7ecf1..331cb4f2 100644
--- a/src/eval/nnue/architectures/k-p-cr_256x2-32-32.h
+++ b/src/eval/nnue/architectures/k-p-cr_256x2-32-32.h
@@ -1,5 +1,8 @@
 // NNUE�]���֐��ŗp������͓����ʂƃl�b�g���[�N�\���̒�`
 
+#ifndef K_P_CR_256X2_32_32_H
+#define K_P_CR_256X2_32_32_H
+
 #include "../features/feature_set.h"
 #include "../features/k.h"
 #include "../features/p.h"
@@ -35,3 +38,4 @@ namespace Eval {
   }  // namespace NNUE
 
 }  // namespace Eval
+#endif // K_P_CR_256X2_32_32_H
\ No newline at end of file
diff --git a/src/eval/nnue/architectures/k-p_256x2-32-32.h b/src/eval/nnue/architectures/k-p_256x2-32-32.h
index b77aeaa6..2576ddfa 100644
--- a/src/eval/nnue/architectures/k-p_256x2-32-32.h
+++ b/src/eval/nnue/architectures/k-p_256x2-32-32.h
@@ -1,4 +1,6 @@
 ﻿// NNUE評価関数で用いる入力特徴量とネットワーク構造の定義
+#ifndef K_P_256X2_32_32_H
+#define K_P_256X2_32_32_H
 
 #include "../features/feature_set.h"
 #include "../features/k.h"
@@ -33,3 +35,4 @@ using Network = Layers::OutputLayer;
 }  // namespace NNUE
 
 }  // namespace Eval
+#endif // K_P_256X2_32_32_H
\ No newline at end of file

From 7a3c3eacdfc6915199a0034aaa0045bec228058b Mon Sep 17 00:00:00 2001
From: FireFather <firefather@telenet.be>
Date: Wed, 24 Jun 2020 22:41:00 +0200
Subject: [PATCH 061/583] added header guards

5 include files in \eval\nnue\architectures
---
 src/eval/nnue/architectures/halfkp-cr-ep_256x2-32-32.h | 4 ++++
 src/eval/nnue/architectures/halfkp_256x2-32-32.h       | 4 ++++
 src/eval/nnue/architectures/k-p-cr-ep_256x2-32-32.h    | 4 ++++
 src/eval/nnue/architectures/k-p-cr_256x2-32-32.h       | 4 ++++
 src/eval/nnue/architectures/k-p_256x2-32-32.h          | 3 +++
 5 files changed, 19 insertions(+)

diff --git a/src/eval/nnue/architectures/halfkp-cr-ep_256x2-32-32.h b/src/eval/nnue/architectures/halfkp-cr-ep_256x2-32-32.h
index 7063f334..9f1f97c0 100644
--- a/src/eval/nnue/architectures/halfkp-cr-ep_256x2-32-32.h
+++ b/src/eval/nnue/architectures/halfkp-cr-ep_256x2-32-32.h
@@ -1,5 +1,8 @@
 // NNUE�]���֐��ŗp������͓����ʂƃl�b�g���[�N�\���̒�`
 
+#ifndef HALFKP_CR_EP_256X2_32_32_H
+#define HALFKP_CR_EP_256X2_32_32_H
+
 #include "../features/feature_set.h"
 #include "../features/half_kp.h"
 #include "../features/castling_right.h"
@@ -36,3 +39,4 @@ namespace Eval {
   }  // namespace NNUE
 
 }  // namespace Eval
+#endif // HALFKP_CR_EP_256X2_32_32_H
\ No newline at end of file
diff --git a/src/eval/nnue/architectures/halfkp_256x2-32-32.h b/src/eval/nnue/architectures/halfkp_256x2-32-32.h
index 9b25ee54..c79747c3 100644
--- a/src/eval/nnue/architectures/halfkp_256x2-32-32.h
+++ b/src/eval/nnue/architectures/halfkp_256x2-32-32.h
@@ -1,5 +1,8 @@
 ﻿// NNUE評価関数で用いる入力特徴量とネットワーク構造の定義
 
+#ifndef HALFKP_256X2_32_32_H
+#define HALFKP_256X2_32_32_H
+
 #include "../features/feature_set.h"
 #include "../features/half_kp.h"
 
@@ -33,3 +36,4 @@ using Network = Layers::OutputLayer;
 }  // namespace NNUE
 
 }  // namespace Eval
+#endif // HALFKP_256X2_32_32_H
diff --git a/src/eval/nnue/architectures/k-p-cr-ep_256x2-32-32.h b/src/eval/nnue/architectures/k-p-cr-ep_256x2-32-32.h
index 17871169..dc761866 100644
--- a/src/eval/nnue/architectures/k-p-cr-ep_256x2-32-32.h
+++ b/src/eval/nnue/architectures/k-p-cr-ep_256x2-32-32.h
@@ -1,5 +1,8 @@
 // NNUE�]���֐��ŗp������͓����ʂƃl�b�g���[�N�\���̒�`
 
+#ifndef K_P_CR_EP_256X2_32_32_H
+#define K_P_CR_EP_256X2_32_32_H
+
 #include "../features/feature_set.h"
 #include "../features/k.h"
 #include "../features/p.h"
@@ -36,3 +39,4 @@ namespace Eval {
   }  // namespace NNUE
 
 }  // namespace Eval
+#endif // K_P_CR_EP_256X2_32_32_H
\ No newline at end of file
diff --git a/src/eval/nnue/architectures/k-p-cr_256x2-32-32.h b/src/eval/nnue/architectures/k-p-cr_256x2-32-32.h
index 9ce7ecf1..331cb4f2 100644
--- a/src/eval/nnue/architectures/k-p-cr_256x2-32-32.h
+++ b/src/eval/nnue/architectures/k-p-cr_256x2-32-32.h
@@ -1,5 +1,8 @@
 // NNUE�]���֐��ŗp������͓����ʂƃl�b�g���[�N�\���̒�`
 
+#ifndef K_P_CR_256X2_32_32_H
+#define K_P_CR_256X2_32_32_H
+
 #include "../features/feature_set.h"
 #include "../features/k.h"
 #include "../features/p.h"
@@ -35,3 +38,4 @@ namespace Eval {
   }  // namespace NNUE
 
 }  // namespace Eval
+#endif // K_P_CR_256X2_32_32_H
\ No newline at end of file
diff --git a/src/eval/nnue/architectures/k-p_256x2-32-32.h b/src/eval/nnue/architectures/k-p_256x2-32-32.h
index b77aeaa6..2576ddfa 100644
--- a/src/eval/nnue/architectures/k-p_256x2-32-32.h
+++ b/src/eval/nnue/architectures/k-p_256x2-32-32.h
@@ -1,4 +1,6 @@
 ﻿// NNUE評価関数で用いる入力特徴量とネットワーク構造の定義
+#ifndef K_P_256X2_32_32_H
+#define K_P_256X2_32_32_H
 
 #include "../features/feature_set.h"
 #include "../features/k.h"
@@ -33,3 +35,4 @@ using Network = Layers::OutputLayer;
 }  // namespace NNUE
 
 }  // namespace Eval
+#endif // K_P_256X2_32_32_H
\ No newline at end of file

From 86e3fedf7e19482d5c9056d0415ac8284e99125f Mon Sep 17 00:00:00 2001
From: FireFather <firefather@telenet.be>
Date: Thu, 25 Jun 2020 04:38:39 +0200
Subject: [PATCH 062/583] Update evaluate_nnue.cpp

---
 src/eval/nnue/evaluate_nnue.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/eval/nnue/evaluate_nnue.cpp b/src/eval/nnue/evaluate_nnue.cpp
index ce478783..46b3b5f9 100644
--- a/src/eval/nnue/evaluate_nnue.cpp
+++ b/src/eval/nnue/evaluate_nnue.cpp
@@ -250,10 +250,14 @@ void load_eval() {
 	if (!result)
 	{
 		// 読み込みエラーのとき終了してくれないと困る。
-		std::cout << "Error! : failed to read " << NNUE::kFileName << std::endl;
-		my_exit();
+		std::cout << "Error! " << NNUE::kFileName << " not found or wrong format" << std::endl;
+		//my_exit();
 	}
+	else
+	  std::cout << "info string NNUE " << NNUE::kFileName << " found & loaded" << std::endl;
   }
+  else
+    std::cout << "info string NNUE " << NNUE::kFileName << " not loaded" << std::endl;
 }
 
 // 初期化

From 8c8a30233c3742dbd60d3998c9e91ca4c783a147 Mon Sep 17 00:00:00 2001
From: FireFather <firefather@telenet.be>
Date: Thu, 25 Jun 2020 04:38:39 +0200
Subject: [PATCH 063/583] Update evaluate_nnue.cpp

---
 src/eval/nnue/evaluate_nnue.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/eval/nnue/evaluate_nnue.cpp b/src/eval/nnue/evaluate_nnue.cpp
index ce478783..46b3b5f9 100644
--- a/src/eval/nnue/evaluate_nnue.cpp
+++ b/src/eval/nnue/evaluate_nnue.cpp
@@ -250,10 +250,14 @@ void load_eval() {
 	if (!result)
 	{
 		// 読み込みエラーのとき終了してくれないと困る。
-		std::cout << "Error! : failed to read " << NNUE::kFileName << std::endl;
-		my_exit();
+		std::cout << "Error! " << NNUE::kFileName << " not found or wrong format" << std::endl;
+		//my_exit();
 	}
+	else
+	  std::cout << "info string NNUE " << NNUE::kFileName << " found & loaded" << std::endl;
   }
+  else
+    std::cout << "info string NNUE " << NNUE::kFileName << " not loaded" << std::endl;
 }
 
 // 初期化

From a5fb69008cd30901f9ac3d2e74684f38c3b51014 Mon Sep 17 00:00:00 2001
From: joergoster <osterj165@googlemail.com>
Date: Thu, 25 Jun 2020 15:43:33 +0200
Subject: [PATCH 064/583] Bugfix. No legal move is either mate or stalemate.

---
 src/learn/learner.cpp | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 526c027c..221a561e 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -504,16 +504,24 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 			}
 
       if (pos.is_draw(ply)) {
-        // Do not write if draw.
-        break;
+#if defined (LEARN_GENSFEN_USE_DRAW_RESULT)
+          flush_psv(0);
+#endif
+          // Do not write if draw.
+          break;
       }
 
 			// 全駒されて詰んでいたりしないか？
-			if (MoveList<LEGAL>(pos).size() == 0)
+			if (MoveList<LEGAL>(pos).size() == 0) // Can be mate or stalemate
 			{
         // (この局面の一つ前の局面までは書き出す)
         // Write the positions other than this position if checkmated.
-        flush_psv(-1);
+                if (pos.checkers()) // Mate
+                    flush_psv(-1);
+#if defined (LEARN_GENSFEN_USE_DRAW_RESULT)
+                else                // Stalemate
+                    flush_psv(0);
+#endif
 				break;
 			}
 
@@ -578,7 +586,7 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
         if (pos.is_draw(0)) {
 #if defined	(LEARN_GENSFEN_USE_DRAW_RESULT)
           // 引き分けを書き出すとき
-          flush_psv(is_win);
+          flush_psv(0);
 #endif
           break;
         }

From 2af46deede06386ea483d500cece4deb4ed1a58f Mon Sep 17 00:00:00 2001
From: joergoster <osterj165@googlemail.com>
Date: Thu, 25 Jun 2020 15:52:19 +0200
Subject: [PATCH 065/583] Fix include.

---
 src/types.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/types.h b/src/types.h
index 6fe6d154..a4a9f315 100644
--- a/src/types.h
+++ b/src/types.h
@@ -40,6 +40,7 @@
 
 #include <cassert>
 #include <cctype>
+#include <climits>
 #include <cstdint>
 #include <cstdlib>
 #include <algorithm>

From 0761d9504e32b15cde4aa5c36b9d440fb84b8e89 Mon Sep 17 00:00:00 2001
From: rqs <qhapaq49@yahoo.co.jp>
Date: Sat, 27 Jun 2020 13:06:05 +0900
Subject: [PATCH 066/583] add convert_bin and option for draw positions

---
 src/learn/learn.h     |  6 ++--
 src/learn/learner.cpp | 73 +++++++++++++++++++++++++++++--------------
 2 files changed, 53 insertions(+), 26 deletions(-)

diff --git a/src/learn/learn.h b/src/learn/learn.h
index 58a017bd..8e3172d3 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -165,8 +165,8 @@ typedef float LearnFloatType;
 // 引き分けに至ったとき、それを教師局面として書き出す
 // これをするほうが良いかどうかは微妙。
 // #define LEARN_GENSFEN_USE_DRAW_RESULT
-
-
+extern bool use_draw_in_training;
+extern bool use_hash_in_training;
 // ======================
 //       configure
 // ======================
@@ -234,4 +234,4 @@ namespace Learner
 
 #endif
 
-#endif // ifndef _LEARN_H_
\ No newline at end of file
+#endif // ifndef _LEARN_H_
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 221a561e..09af98d3 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -84,6 +84,10 @@
 #include <shared_mutex>
 #endif
 
+bool use_draw_in_training=false;
+bool use_draw_in_validation=false;
+bool use_hash_in_training=true;
+
 using namespace std;
 
 //// これは探索部で定義されているものとする。
@@ -1248,11 +1252,8 @@ struct SfenReader
 			{
 				if (eval_limit < abs(p.score))
 					continue;
-#if !defined (LEARN_GENSFEN_USE_DRAW_RESULT)
-				if (p.game_result == 0)
+				if (!use_draw_in_validation && p.game_result == 0)
 					continue;
-#endif
-
 				sfen_for_mse.push_back(p);
 			} else {
 				break;
@@ -1934,10 +1935,10 @@ void LearnerThink::thread_worker(size_t thread_id)
 		if (eval_limit < abs(ps.score))
 			goto RetryRead;
 
-#if !defined (LEARN_GENSFEN_USE_DRAW_RESULT)
-		if (ps.game_result == 0)
+
+		if (!use_draw_in_training && ps.game_result == 0)
 			goto RetryRead;
-#endif
+
 
 		// 序盤局面に関する読み飛ばし
 		if (ps.gamePly < prng.rand(reduction_gameply))
@@ -1961,13 +1962,13 @@ void LearnerThink::thread_worker(size_t thread_id)
 		{
 			auto key = pos.key();
 			// rmseの計算用に使っている局面なら除外する。
-			if (sr.is_for_rmse(key))
+			if (sr.is_for_rmse(key) && use_hash_in_training)
 				goto RetryRead;
 
 			// 直近で用いた局面も除外する。
 			auto hash_index = size_t(key & (sr.READ_SFEN_HASH_SIZE - 1));
 			auto key2 = sr.hash[hash_index];
-			if (key == key2)
+			if (key == key2 && use_hash_in_training)
 				goto RetryRead;
 			sr.hash[hash_index] = key; // 今回のkeyに入れ替えておく。
 		}
@@ -2416,30 +2417,36 @@ void shuffle_files_on_memory(const vector<string>& filenames,const string output
 	std::cout << "..shuffle_on_memory done." << std::endl;
 }
 
-void convert_bin(const vector<string>& filenames , const string& output_file_name)
+void convert_bin(const vector<string>& filenames, const string& output_file_name, const int ply_minimum, const int ply_maximum, const int interpolate_eval)
 {
 	std::fstream fs;
+	uint64_t data_size=0;
+	uint64_t filtered_size = 0;
 	auto th = Threads.main();
 	auto &tpos = th->rootPos;
 	// plain形式の雑巾をやねうら王用のpackedsfenvalueに変換する
 	fs.open(output_file_name, ios::app | ios::binary);
-
+	StateListPtr states;
 	for (auto filename : filenames) {
 		std::cout << "convert " << filename << " ... ";
 		std::string line;
 		ifstream ifs;
 		ifs.open(filename);
 		PackedSfenValue p;
+		data_size = 0;
+		filtered_size = 0;
 		p.gamePly = 1; // apery形式では含まれない。一応初期化するべし
+		bool ignore_flag = false;
+
 		while (std::getline(ifs, line)) {
 			std::stringstream ss(line);
 			std::string token;
 			std::string value;
 			ss >> token;
-			if (token == "sfen") {
-				StateInfo si;
-				tpos.set(line.substr(5), false, &si, Threads.main());
-				tpos.sfen_pack(p.sfen);
+			if (token == "fen") {
+			  states = StateListPtr(new std::deque<StateInfo>(1)); // Drop old and create a new one
+			  tpos.set(line.substr(4), false, &states->back(), Threads.main());
+			  tpos.sfen_pack(p.sfen);
 			}
 			else if (token == "move") {
 				ss >> value;
@@ -2451,23 +2458,37 @@ void convert_bin(const vector<string>& filenames , const string& output_file_nam
 			else if (token == "ply") {
 				int temp;
 				ss >> temp;
+				if(temp < ply_minimum || temp > ply_maximum){
+				  ignore_flag = true;
+				}
 				p.gamePly = uint16_t(temp); // 此処のキャストいらない？
+				if (interpolate_eval != 0){
+				  p.score = min(3000, interpolate_eval * temp);
+				}
 			}
 			else if (token == "result") {
 				int temp;
 				ss >> temp;
 				p.game_result = int8_t(temp); // 此処のキャストいらない？
+				if (interpolate_eval){
+				  p.score = p.score * p.game_result;
+				}
 			}
 			else if (token == "e") {
+			  if(!ignore_flag){
 				fs.write((char*)&p, sizeof(PackedSfenValue));
+				data_size+=1;
 				// debug
-				/*
-				std::cout<<tpos<<std::endl;
-				std::cout<<to_usi_string(Move(p.move))<<","<<p.score<<","<<int(p.gamePly)<<","<<int(p.game_result)<<std::endl;
-				*/
+				// std::cout<<tpos<<std::endl;
+				// std::cout<<p.score<<","<<int(p.gamePly)<<","<<int(p.game_result)<<std::endl;
+			  }else{
+			    ignore_flag = false;
+			    filtered_size += 1;
+			  }
+				
 			}
 		}
-		std::cout << "done" << std::endl;
+		std::cout << "done" << data_size <<" parsed " << filtered_size<<" is filtered"<< std::endl;
 		ifs.close();
 	}
 	std::cout << "all done" << std::endl;
@@ -2557,6 +2578,9 @@ void learn(Position&, istringstream& is)
 	bool use_convert_plain = false;
 	// plain形式の教師をやねうら王のbinに変換する
 	bool use_convert_bin = false;
+	int ply_minimum = 0;
+	int ply_maximum = 114514;
+	bool interpolate_eval = 0;
 	// それらのときに書き出すファイル名(デフォルトでは"shuffled_sfen.bin")
 	string output_file_name = "shuffled_sfen.bin";
 
@@ -2636,7 +2660,9 @@ void learn(Position&, istringstream& is)
 		else if (option == "eta3")       is >> eta3;
 		else if (option == "eta1_epoch") is >> eta1_epoch;
 		else if (option == "eta2_epoch") is >> eta2_epoch;
-
+		else if (option == "use_draw_in_training") is >> use_draw_in_training;
+		else if (option == "use_draw_in_validation") is >> use_draw_in_validation;
+		else if (option == "use_hash_in_training") is >> use_hash_in_training;
 		// 割引率
 		else if (option == "discount_rate") is >> discount_rate;
 
@@ -2672,7 +2698,7 @@ void learn(Position&, istringstream& is)
 		else if (option == "eval_limit") is >> eval_limit;
 		else if (option == "save_only_once") save_only_once = true;
 		else if (option == "no_shuffle") no_shuffle = true;
-
+		
 #if defined(EVAL_NNUE)
 		else if (option == "nn_batch_size") is >> nn_batch_size;
 		else if (option == "newbob_decay") is >> newbob_decay;
@@ -2687,6 +2713,7 @@ void learn(Position&, istringstream& is)
 		// 雑巾のconvert関連
 		else if (option == "convert_plain") use_convert_plain = true;
 		else if (option == "convert_bin") use_convert_bin = true;
+		else if (option == "interpolate_eval") is >> interpolate_eval;
 		// さもなくば、それはファイル名である。
 		else
 			filenames.push_back(option);
@@ -2796,7 +2823,7 @@ void learn(Position&, istringstream& is)
 	{
 	  	is_ready(true);
 		cout << "convert_bin.." << endl;
-		convert_bin(filenames,output_file_name);
+		convert_bin(filenames,output_file_name, ply_minimum, ply_maximum, interpolate_eval);
 		return;
 		
 	}

From 4c926b8eb4ccdd129bdf83b354cb092d2206b4a0 Mon Sep 17 00:00:00 2001
From: rqs <qhapaq49@yahoo.co.jp>
Date: Sat, 27 Jun 2020 13:08:12 +0900
Subject: [PATCH 067/583] add pgn_to_plain

---
 script/README.md       | 52 ++++++++++++++++++++++++++++++++
 script/pgn_to_plain.py | 68 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 120 insertions(+)
 create mode 100644 script/README.md
 create mode 100644 script/pgn_to_plain.py

diff --git a/script/README.md b/script/README.md
new file mode 100644
index 00000000..feb57ca2
--- /dev/null
+++ b/script/README.md
@@ -0,0 +1,52 @@
+# `pgn_to_plain`
+This script converts pgn files into text file to apply `learn convert_bin` command. You need to import [python-chess](https://pypi.org/project/python-chess/) to use this script.
+
+
+    pip install python-chess
+	
+
+# Example of Qhapaq's finetune using `pgn_to_plain`
+
+## Download data
+You can download data from [here](http://rebel13.nl/index.html)
+
+## Convert pgn files
+
+**Important : convert text will be superheavy (approx 200 byte / position)** 
+
+    python pgn_to_plain.py --pgn "pgn/*.pgn" --start_ply 1 --output converted_pgn.txt
+
+
+`--pgn` option supports wildcard. When you use pgn files with elo >= 3300, You will get 1.7 GB text file.
+	
+	
+## Convert into training data
+
+
+### Example build command
+
+    make nnue-learn ARCH=x86-64
+
+See `src/Makefile` for detail.
+
+
+### Convert
+
+    ./stockfish
+    learn convert_bin converted_pgn.txt output_file_name pgn_bin.bin
+	learn shuffle pgn_bin.bin
+	
+You also need to prepare validation data for training like following.
+	
+	python pgn_to_plain.py --pgn "pgn/ccrl-40-15-3400.pgn" --start_ply 1 --output ccrl-40-15-3400.txt
+	./stockfish
+    learn convert_bin ccrl-40-15-3400.txt ccrl-40-15-3400_plain.bin
+	
+	
+### Learn
+
+    ./stockfish
+	setoption name Threads value 8
+    learn shuffled_sfen.bin newbob_decay 0.5  validation_set_file_name ccrl-40-15-3400_plain.bin  nn_batch_size 50000 batchsize 1000000 eval_save_interval 8000000 eta 0.05 lambda 0.0 eval_limit 3000 mirror_percentage 0 use_draw_in_training 1
+
+
diff --git a/script/pgn_to_plain.py b/script/pgn_to_plain.py
new file mode 100644
index 00000000..61aa9917
--- /dev/null
+++ b/script/pgn_to_plain.py
@@ -0,0 +1,68 @@
+import chess.pgn
+import argparse
+import glob
+from typing import List
+
+# todo close in c++ tools using pgn-extract
+# https://www.cs.kent.ac.uk/people/staff/djb/pgn-extract/help.html#-w
+
+def parse_result(result_str:str, board:chess.Board) -> int:
+    if result_str == "1/2-1/2":
+        return 0
+    if result_str == "0-1":
+        if board.turn == chess.WHITE:
+            return -1
+        else:
+            return 1
+    elif result_str == "1-0":
+        if board.turn == chess.WHITE:
+            return 1
+        else:
+            return 0
+    else:
+        print("illeagal result", result_str)
+        raise ValueError
+
+def game_sanity_check(game: chess.pgn.Game) -> bool:
+    if not game.headers["Result"] in ["1/2-1/2", "0-1", "1-0"]:
+        print("invalid result", game.headers["Result"])
+        return False
+    return True
+    
+def parse_game(game: chess.pgn.Game, writer, start_play: int=1)->None:
+    board: chess.Board = game.board()
+    if not game_sanity_check(game):
+        return
+    result: str = game.headers["Result"]
+    for ply, move in enumerate(game.mainline_moves()):
+        if ply >= start_play:
+            writer.write("fen " + board.fen() + "\n")
+            writer.write("move " + str(move) + "\n")
+            writer.write("score 0\n")
+            writer.write("ply " + str(ply)+"\n")
+            writer.write("result " + str(parse_result(result, board)) +"\n")
+            writer.write("e\n")
+
+        board.push(move)
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--pgn", type=str, required=True)
+    parser.add_argument("--start_ply", type=int, default=1)
+    parser.add_argument("--output", type=str, default="plain.txt")
+    args = parser.parse_args()
+
+    pgn_files: List[str] = glob.glob(args.pgn)
+    f = open(args.output, 'w')
+    for pgn_file in pgn_files:
+        print("parse", pgn_file)
+        pgn_loader = open(pgn_file)
+        while True:
+            game = chess.pgn.read_game(pgn_loader)
+            if game is None:
+                break
+            parse_game(game, f, args.start_ply)
+    f.close()
+    
+if __name__=="__main__":
+    main()

From aa2dc962f516d0ae8905c534eaf3c05427457bc0 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sat, 27 Jun 2020 14:00:12 +0900
Subject: [PATCH 068/583] Added use_draw_in_training_data_generation option to
 write out draw games to the training data.

---
 src/learn/learn.h     |  4 ++--
 src/learn/learner.cpp | 43 ++++++++++++++++++++++---------------------
 2 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/src/learn/learn.h b/src/learn/learn.h
index 8e3172d3..246e5cc9 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -165,8 +165,8 @@ typedef float LearnFloatType;
 // 引き分けに至ったとき、それを教師局面として書き出す
 // これをするほうが良いかどうかは微妙。
 // #define LEARN_GENSFEN_USE_DRAW_RESULT
-extern bool use_draw_in_training;
-extern bool use_hash_in_training;
+
+
 // ======================
 //       configure
 // ======================
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 09af98d3..98a310c4 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -84,10 +84,6 @@
 #include <shared_mutex>
 #endif
 
-bool use_draw_in_training=false;
-bool use_draw_in_validation=false;
-bool use_hash_in_training=true;
-
 using namespace std;
 
 //// これは探索部で定義されているものとする。
@@ -115,6 +111,11 @@ namespace Learner
 // 局面の配列 : PSVector は packed sfen vector の略。
 typedef std::vector<PackedSfenValue> PSVector;
 
+bool use_draw_in_training_data_generation = false;
+bool use_draw_in_training = false;
+bool use_draw_in_validation = false;
+bool use_hash_in_training = true;
+
 // -----------------------------------
 //    局面のファイルへの書き出し
 // -----------------------------------
@@ -499,19 +500,19 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 			// 長手数に達したのか
 			if (ply >= MAX_PLY2)
 			{
-#if defined (LEARN_GENSFEN_USE_DRAW_RESULT)
-				// 勝敗 = 引き分けとして書き出す。
-				// こうしたほうが自分が入玉したときに、相手の入玉を許しにくい(かも)
-				flush_psv(0);
-#endif
+				if (use_draw_in_training_data_generation) {
+					// 勝敗 = 引き分けとして書き出す。
+					// こうしたほうが自分が入玉したときに、相手の入玉を許しにくい(かも)
+					flush_psv(0);
+				}
 				break;
 			}
 
       if (pos.is_draw(ply)) {
-#if defined (LEARN_GENSFEN_USE_DRAW_RESULT)
-          flush_psv(0);
-#endif
-          // Do not write if draw.
+		  if (use_draw_in_training_data_generation) {
+			  // Write if draw.
+			  flush_psv(0);
+		  }
           break;
       }
 
@@ -522,10 +523,9 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
         // Write the positions other than this position if checkmated.
                 if (pos.checkers()) // Mate
                     flush_psv(-1);
-#if defined (LEARN_GENSFEN_USE_DRAW_RESULT)
-                else                // Stalemate
-                    flush_psv(0);
-#endif
+				else if (use_draw_in_training_data_generation) {
+					flush_psv(0); // Stalemate
+				}
 				break;
 			}
 
@@ -588,10 +588,10 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 				// 各千日手に応じた処理。
 
         if (pos.is_draw(0)) {
-#if defined	(LEARN_GENSFEN_USE_DRAW_RESULT)
-          // 引き分けを書き出すとき
-          flush_psv(0);
-#endif
+			if (use_draw_in_training_data_generation) {
+				// Write if draw.
+				flush_psv(0);
+			}
           break;
         }
 
@@ -2660,6 +2660,7 @@ void learn(Position&, istringstream& is)
 		else if (option == "eta3")       is >> eta3;
 		else if (option == "eta1_epoch") is >> eta1_epoch;
 		else if (option == "eta2_epoch") is >> eta2_epoch;
+		else if (option == "use_draw_in_training_data_generation") is >> use_draw_in_training_data_generation;
 		else if (option == "use_draw_in_training") is >> use_draw_in_training;
 		else if (option == "use_draw_in_validation") is >> use_draw_in_validation;
 		else if (option == "use_hash_in_training") is >> use_hash_in_training;

From e229015127df8264fc23cdc594e906aebb429096 Mon Sep 17 00:00:00 2001
From: tttak <tttak@users.noreply.github.com>
Date: Sat, 27 Jun 2020 15:24:20 +0900
Subject: [PATCH 069/583] =?UTF-8?q?learn=20convert=5Fbin=5Ffrom=5Fpgn-extr?=
 =?UTF-8?q?act=E3=82=B3=E3=83=9E=E3=83=B3=E3=83=89=E3=82=92=E8=BF=BD?=
 =?UTF-8?q?=E5=8A=A0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

http://rebel13.nl/download/data.html
Download Selected Lichess games
pgn-extract --fencomments -Wlalg --nochecks --nomovenumbers --noresults -w500000 -N -V -o comp-2019-06.txt comp-2019-06.pgn
stockfish.exe
setoption name SkipLoadingEval value true
isready
learn convert_bin_from_pgn-extract output_file_name fens_comp-2019-06.bin comp-2019-06.txt
---
 src/learn/learner.cpp | 202 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 201 insertions(+), 1 deletion(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 98a310c4..0567af76 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -17,6 +17,7 @@
 
 #include <filesystem>
 #include <random>
+#include <regex>
 
 #include "learn.h"
 #include "multi_think.h"
@@ -2494,7 +2495,196 @@ void convert_bin(const vector<string>& filenames, const string& output_file_name
 	std::cout << "all done" << std::endl;
 	fs.close();
 }
-  
+
+static inline void ltrim(std::string &s) {
+	s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](int ch) {
+		return !std::isspace(ch);
+	}));
+}
+
+static inline void rtrim(std::string &s) {
+	s.erase(std::find_if(s.rbegin(), s.rend(), [](int ch) {
+		return !std::isspace(ch);
+	}).base(), s.end());
+}
+
+static inline void trim(std::string &s) {
+	ltrim(s);
+	rtrim(s);
+}
+
+int parse_game_result_from_pgn_extract(std::string result) {
+	// White Win
+	if (result == "\"1-0\"") {
+		return 1;
+	}
+	// Black Win
+	else if (result == "\"0-1\"") {
+		return -1;
+	}
+	// Draw
+	else {
+		return 0;
+	}
+}
+
+// 0.25 -->  25
+// #-4  --> -mate_in(4)
+// #3   -->  mate_in(3)
+Value parse_score_from_pgn_extract(std::string eval) {
+	if (eval.substr(0, 1) == "#") {
+		if (eval.substr(1, 1) == "-") {
+			return -mate_in(stoi(eval.substr(2, eval.length() - 2)));
+		}
+		else {
+			return mate_in(stoi(eval.substr(1, eval.length() - 1)));
+		}
+	}
+	else {
+		return Value(stod(eval) * 100.0f);
+	}
+}
+
+// pgn-extract形式の教師をやねうら王用のPackedSfenValueに変換する
+void convert_bin_from_pgn_extract(const vector<string>& filenames, const string& output_file_name)
+{
+	auto th = Threads.main();
+	auto &pos = th->rootPos;
+
+	std::fstream ofs;
+	ofs.open(output_file_name, ios::out | ios::binary);
+
+	int game_count = 0;
+	int fen_count = 0;
+
+	for (auto filename : filenames) {
+		std::cout << now_string() << " convert " << filename << std::endl;
+		ifstream ifs;
+		ifs.open(filename);
+
+		int game_result = 0;
+
+		std::string line;
+		while (std::getline(ifs, line)) {
+
+			if (line.empty()) {
+				continue;
+			}
+
+			else if (line.substr(0, 1) == "[") {
+				std::regex pattern_result(R"(\[Result (.+?)\])");
+				std::smatch match;
+
+				// example: [Result "1-0"]
+				if (std::regex_search(line, match, pattern_result)) {
+					game_result = parse_game_result_from_pgn_extract(match.str(1));
+					//std::cout << "game_result=" << game_result << std::endl;
+
+					game_count++;
+					if (game_count % 10000 == 0) {
+						std::cout << now_string() << " game_count=" << game_count << ", fen_count=" << fen_count << std::endl;
+					}
+				}
+
+				continue;
+			}
+
+			else {
+				int gamePly = 0;
+
+				PackedSfenValue psv;
+				memset((char*)&psv, 0, sizeof(PackedSfenValue));
+
+				auto itr = line.cbegin();
+
+				while (true) {
+					gamePly++;
+
+					std::regex pattern_bracket(R"(\{(.+?)\})");
+					std::regex pattern_eval(R"(\[\%eval (.+?)\])");
+					std::regex pattern_move(R"((.+?)\{)");
+					std::smatch match;
+
+					// example: { [%eval 0.25] [%clk 0:10:00] }
+					if (!std::regex_search(itr, line.cend(), match, pattern_bracket)) {
+						break;
+					}
+
+					itr += match.position(0) + match.length(0);
+					std::string str_eval_clk = match.str(1);
+					trim(str_eval_clk);
+					//std::cout << "str_eval_clk="<< str_eval_clk << std::endl;
+
+					// example: [%eval 0.25]
+					// example: [%eval #-4]
+					// example: [%eval #3]
+					if (!std::regex_search(str_eval_clk, match, pattern_eval)) {
+						continue;
+					}
+					else {
+						std::string str_eval = match.str(1);
+						trim(str_eval);
+						psv.score = parse_score_from_pgn_extract(str_eval);
+						//std::cout << "psv.score=" << psv.score << std::endl;
+					}
+
+					// example: { rnbqkbnr/pppppppp/8/8/3P4/8/PPP1PPPP/RNBQKBNR b KQkq d3 0 1 }
+					if (!std::regex_search(itr, line.cend(), match, pattern_bracket)) {
+						break;
+					}
+
+					itr += match.position(0) + match.length(0);
+					std::string str_fen = match.str(1);
+					trim(str_fen);
+					//std::cout << "str_fen=" << str_fen << std::endl;
+
+					StateInfo si;
+					pos.set(str_fen, false, &si, th);
+					pos.sfen_pack(psv.sfen);
+
+					// example: d7d5 {
+					if (!std::regex_search(itr, line.cend(), match, pattern_move)) {
+						break;
+					}
+
+					itr += match.position(0) + match.length(0) - 1;
+					std::string str_move = match.str(1);
+					trim(str_move);
+					//std::cout << "str_move=" << str_move << std::endl;
+					psv.move = UCI::to_move(pos, str_move);
+
+					//
+					psv.gamePly = gamePly;
+					psv.game_result = game_result;
+
+					if (pos.side_to_move() == BLACK) {
+						psv.score *= -1;
+						psv.game_result *= -1;
+					}
+
+					//std::cout << "write: "
+					//		  << "score=" << psv.score
+					//		  << ", move=" << psv.move
+					//		  << ", gamePly=" << psv.gamePly
+					//		  << ", game_result=" << (int)psv.game_result
+					//		  << std::endl;
+
+					ofs.write((char*)&psv, sizeof(PackedSfenValue));
+					memset((char*)&psv, 0, sizeof(PackedSfenValue));
+
+					fen_count++;
+				}
+
+				game_result = 0;
+			}
+		}
+	}
+
+	std::cout << now_string() << " game_count=" << game_count << ", fen_count=" << fen_count << std::endl;
+	std::cout << now_string() << " all done" << std::endl;
+	ofs.close();
+}
+
 //void convert_plain(const vector<string>& filenames , const string& output_file_name)
 //{
 //	Position tpos;
@@ -2581,6 +2771,8 @@ void learn(Position&, istringstream& is)
 	int ply_minimum = 0;
 	int ply_maximum = 114514;
 	bool interpolate_eval = 0;
+	// pgn-extract形式の教師をやねうら王のbinに変換する
+	bool use_convert_bin_from_pgn_extract = false;
 	// それらのときに書き出すファイル名(デフォルトでは"shuffled_sfen.bin")
 	string output_file_name = "shuffled_sfen.bin";
 
@@ -2715,6 +2907,7 @@ void learn(Position&, istringstream& is)
 		else if (option == "convert_plain") use_convert_plain = true;
 		else if (option == "convert_bin") use_convert_bin = true;
 		else if (option == "interpolate_eval") is >> interpolate_eval;
+		else if (option == "convert_bin_from_pgn-extract") use_convert_bin_from_pgn_extract = true;
 		// さもなくば、それはファイル名である。
 		else
 			filenames.push_back(option);
@@ -2828,6 +3021,13 @@ void learn(Position&, istringstream& is)
 		return;
 		
 	}
+	if (use_convert_bin_from_pgn_extract)
+	{
+		is_ready(true);
+		cout << "convert_bin_from_pgn-extract.." << endl;
+		convert_bin_from_pgn_extract(filenames, output_file_name);
+		return;
+	}
 
 	cout << "loop              : " << loop << endl;
 	cout << "eval_limit        : " << eval_limit << endl;

From 13eb5400204fb2fea68ef0ca5875a668eabac339 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sat, 27 Jun 2020 22:19:22 +0900
Subject: [PATCH 070/583] Changed the formula to calculate winning ratio to
 1/(1+10^(-Eval/4)).

---
 src/learn/learner.cpp | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 98a310c4..f59f458f 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -1027,9 +1027,24 @@ double sigmoid(double x)
 // 評価値を勝率[0,1]に変換する関数
 double winning_percentage(double value)
 {
-	// この600.0という定数は、ponanza定数。(ponanzaがそうしているらしいという意味で)
-	// ゲームの進行度に合わせたものにしたほうがいいかも知れないけども、その効果のほどは不明。
-	return sigmoid(value / 600.0);
+	// In Maxima,
+	// load("C:/maxima-5.44.0/cform.lisp");
+	// PawnValueEg = 206;
+	// cform(1.0 / (1.0 + 10.0 ^ (-value / PawnValueEg / 4.0)));
+	constexpr double PawnValue = PawnValueEg;
+	return 1.0 * pow(pow(10.0, -0.25 * pow(PawnValue, -1) * value) + 1.0, -1);
+}
+
+double delta_winning_percentage(double value)
+{
+	// In Maxima,
+	// load("C:/maxima-5.44.0/cform.lisp");
+	// PawnValueEg = 206;
+	// cform(diff(1.0/(1.0+10.0^(-value/PawnValue/4.0)),value));
+	constexpr double PawnValue = PawnValueEg;
+	return
+		0.5756462732485115 * pow(PawnValue, -1) * pow(10.0, -0.25 * pow(PawnValue, -1) * value) *
+		pow(pow(10.0, -0.25 * pow(PawnValue, -1) * value) + 1.0, -2);
 }
 
 // 普通のシグモイド関数の導関数。
@@ -1127,8 +1142,9 @@ double calc_grad(Value deep, Value shallow , const PackedSfenValue& psv)
 	// elmo(WCSC27)方式
 	// 実際のゲームの勝敗で補正する。
 
-	const double eval_winrate = winning_percentage(shallow);
-	const double teacher_winrate = winning_percentage(deep);
+	const double q = winning_percentage(shallow);
+	const double p = winning_percentage(deep);
+	const double dq = delta_winning_percentage(shallow);
 
 	// 期待勝率を勝っていれば1、負けていれば 0、引き分けなら0.5として補正項として用いる。
 	// game_result = 1,0,-1なので1足して2で割る。
@@ -1139,7 +1155,9 @@ double calc_grad(Value deep, Value shallow , const PackedSfenValue& psv)
 
 	// 実際の勝率を補正項として使っている。
 	// これがelmo(WCSC27)のアイデアで、現代のオーパーツ。
-	const double grad = (1 - lambda) * (eval_winrate - t) + lambda * (eval_winrate - teacher_winrate);
+	const double pp = (q - p) * dq / q / (1.0 - q);
+	const double tt = (q - t) * dq / q / (1.0 - q);
+	const double grad = lambda * pp + (1.0 - lambda) * tt;
 
 	return grad;
 }

From 96f2541191ae2ed93f7cefeab9ccdead931cc2d4 Mon Sep 17 00:00:00 2001
From: joergoster <osterj165@googlemail.com>
Date: Sat, 27 Jun 2020 19:41:13 +0200
Subject: [PATCH 071/583] Fix compilation under Linux with -DUSE_SSE41.

---
 src/eval/nnue/nnue_common.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/eval/nnue/nnue_common.h b/src/eval/nnue/nnue_common.h
index 8ef8fee4..b82bc2c2 100644
--- a/src/eval/nnue/nnue_common.h
+++ b/src/eval/nnue/nnue_common.h
@@ -7,6 +7,8 @@
 
 #if defined(USE_AVX2)
 #include <immintrin.h>
+#elif defined(USE_SSE41)
+#include <smmintrin.h>
 #elif defined(USE_SSE2)
 #include <emmintrin.h>
 #endif

From aea08de018a72c2ecb86b568853e3815b32551e6 Mon Sep 17 00:00:00 2001
From: FireFather <firefather@telenet.be>
Date: Sun, 28 Jun 2020 03:12:55 +0200
Subject: [PATCH 072/583] Translation

Files in /eval, /extra, & /learn - comments translated from Japanese to English
---
 src/eval/evaluate_common.h                    |   72 +-
 src/eval/evaluate_mir_inv_tools.cpp           |   72 +-
 src/eval/evaluate_mir_inv_tools.h             |   30 +-
 .../architectures/halfkp-cr-ep_256x2-32-32.h  |    8 +-
 .../nnue/architectures/halfkp_256x2-32-32.h   |    8 +-
 .../architectures/k-p-cr-ep_256x2-32-32.h     |    8 +-
 .../nnue/architectures/k-p-cr_256x2-32-32.h   |    8 +-
 src/eval/nnue/architectures/k-p_256x2-32-32.h |    8 +-
 src/eval/nnue/evaluate_nnue.cpp               |  114 +-
 src/eval/nnue/evaluate_nnue.h                 |   24 +-
 src/eval/nnue/evaluate_nnue_learner.cpp       |   44 +-
 src/eval/nnue/evaluate_nnue_learner.h         |   20 +-
 src/eval/nnue/features/castling_right.cpp     |   10 +-
 src/eval/nnue/features/castling_right.h       |   20 +-
 src/eval/nnue/features/enpassant.cpp          |    8 +-
 src/eval/nnue/features/enpassant.h            |   20 +-
 src/eval/nnue/features/feature_set.h          |   62 +-
 src/eval/nnue/features/features_common.h      |   26 +-
 src/eval/nnue/features/half_kp.cpp            |   10 +-
 src/eval/nnue/features/half_kp.h              |   24 +-
 src/eval/nnue/features/half_relative_kp.cpp   |   12 +-
 src/eval/nnue/features/half_relative_kp.h     |   30 +-
 src/eval/nnue/features/index_list.h           |    8 +-
 src/eval/nnue/features/k.cpp                  |    8 +-
 src/eval/nnue/features/k.h                    |   20 +-
 src/eval/nnue/features/p.cpp                  |    8 +-
 src/eval/nnue/features/p.h                    |   20 +-
 src/eval/nnue/layers/affine_transform.h       |   32 +-
 src/eval/nnue/layers/clipped_relu.h           |   26 +-
 src/eval/nnue/layers/input_slice.h            |   24 +-
 src/eval/nnue/layers/sum.h                    |   58 +-
 src/eval/nnue/nnue_accumulator.h              |    8 +-
 src/eval/nnue/nnue_architecture.h             |    8 +-
 src/eval/nnue/nnue_common.h                   |   18 +-
 src/eval/nnue/nnue_feature_transformer.h      |   40 +-
 src/eval/nnue/nnue_test_command.cpp           |   22 +-
 src/eval/nnue/nnue_test_command.h             |    4 +-
 src/eval/nnue/trainer/features/factorizer.h   |   28 +-
 .../trainer/features/factorizer_feature_set.h |   24 +-
 .../trainer/features/factorizer_half_kp.h     |   18 +-
 src/eval/nnue/trainer/trainer.h               |   22 +-
 .../nnue/trainer/trainer_affine_transform.h   |   54 +-
 src/eval/nnue/trainer/trainer_clipped_relu.h  |   38 +-
 .../trainer/trainer_feature_transformer.h     |   54 +-
 src/eval/nnue/trainer/trainer_input_slice.h   |   62 +-
 src/eval/nnue/trainer/trainer_sum.h           |   66 +-
 src/extra/sfen_packer.cpp                     |  193 ++-
 src/learn/half_float.h                        |    8 +-
 src/learn/learn.h                             |  176 +--
 src/learn/learner.cpp                         | 1367 ++++++++---------
 src/learn/learning_tools.cpp                  |   58 +-
 src/learn/learning_tools.h                    |  528 +++----
 src/learn/multi_think.cpp                     |   74 +-
 src/learn/multi_think.h                       |   96 +-
 54 files changed, 1903 insertions(+), 1905 deletions(-)

diff --git a/src/eval/evaluate_common.h b/src/eval/evaluate_common.h
index 5d5d05b1..84a96bee 100644
--- a/src/eval/evaluate_common.h
+++ b/src/eval/evaluate_common.h
@@ -1,75 +1,75 @@
 ﻿#ifndef _EVALUATE_COMMON_H_
 #define _EVALUATE_COMMON_H_
 
-// いまどきの手番つき評価関数(EVAL_KPPTとEVAL_KPP_KKPT)の共用header的なもの。
+// A common header-like function for modern evaluation functions (EVAL_KPPT and EVAL_KPP_KKPT).
 
 #if defined(EVAL_NNUE) || defined(EVAL_LEARN)
 #include <functional>
 
-// KKファイル名
+// KK file name
 #define KK_BIN "KK_synthesized.bin"
 
-// KKPファイル名
+// KKP file name
 #define KKP_BIN "KKP_synthesized.bin"
 
-// KPPファイル名
+// KPP file name
 #define KPP_BIN "KPP_synthesized.bin"
 
 namespace Eval
 {
 
 #if defined(USE_EVAL_HASH)
-	// prefetchする関数
+	// prefetch function
 	void prefetch_evalhash(const Key key);
 #endif
 
-	// 評価関数のそれぞれのパラメーターに対して関数fを適用してくれるoperator。
-	// パラメーターの分析などに用いる。
-	// typeは調査対象を表す。
-	//   type = -1 : KK,KKP,KPPすべて
-	//   type = 0  : KK のみ 
-	//   type = 1  : KKPのみ 
-	//   type = 2  : KPPのみ 
+	// An operator that applies the function f to each parameter of the evaluation function.
+	// Used for parameter analysis etc.
+	// type indicates the survey target.
+	// type = -1 :KK,KKP,KPP all
+	// type = 0: KK only
+	// type = 1: KKP only
+	// type = 2: KPP only
 	void foreach_eval_param(std::function<void(int32_t, int32_t)>f, int type = -1);
 
 	// --------------------------
-	//        学習用
+	// for learning
 	// --------------------------
 
 #if defined(EVAL_LEARN)
-	// 学習のときの勾配配列の初期化
-	// 学習率を引数に渡しておく。0.0なら、defaultの値を採用する。
-	// update_weights()のepochが、eta_epochまでetaから徐々にeta2に変化する。
-	// eta2_epoch以降は、eta2から徐々にeta3に変化する。
+	// Initialize the gradient array during learning
+	// Pass the learning rate as an argument. If 0.0, the default value is used.
+	// The epoch of update_weights() gradually changes from eta to eta2 until eta_epoch.
+	// After eta2_epoch, gradually change from eta2 to eta3.
 	void init_grad(double eta1, uint64_t eta_epoch, double eta2, uint64_t eta2_epoch, double eta3);
 
-	// 現在の局面で出現している特徴すべてに対して、勾配の差分値を勾配配列に加算する。
-	// freeze[0]  : kkは学習させないフラグ
-	// freeze[1]  : kkpは学習させないフラグ
-	// freeze[2]  : kppは学習させないフラグ
-	// freeze[3]  : kpppは学習させないフラグ
+	// Add the gradient difference value to the gradient array for all features that appear in the current phase.
+	// freeze[0]: Flag that kk does not learn
+	// freeze[1]: Flag that kkp does not learn
+	// freeze[2]: Flag that kpp does not learn
+	// freeze[3]: Flag that kppp does not learn
 	void add_grad(Position& pos, Color rootColor, double delt_grad, const std::array<bool, 4>& freeze);
 
-	// 現在の勾配をもとにSGDかAdaGradか何かする。
-	// epoch      : 世代カウンター(0から始まる)
-	// freeze[0]  : kkは学習させないフラグ
-	// freeze[1]  : kkpは学習させないフラグ
-	// freeze[2]  : kppは学習させないフラグ
-	// freeze[3]  : kpppは学習させないフラグ
-	void update_weights(uint64_t epoch, const std::array<bool,4>& freeze);
+	// Do SGD or AdaGrad or something based on the current gradient.
+	// epoch: Generation counter (starting from 0)
+	// freeze[0]: Flag that kk does not learn
+	// freeze[1]: Flag that kkp does not learn
+	// freeze[2]: Flag that kpp does not learn
+	// freeze[3]: Flag that kppp does not learn
+	void update_weights(uint64_t epoch, const std::array<bool, 4>& freeze);
 
-	// 評価関数パラメーターをファイルに保存する。
-	// ファイルの末尾につける拡張子を指定できる。
+	// Save the evaluation function parameters to a file.
+	// You can specify the extension added to the end of the file.
 	void save_eval(std::string suffix);
 
-	// 現在のetaを取得する。
+	// Get the current eta.
 	double get_eta();
 
-	// -- 学習に関連したコマンド
+	// --learning related commands
 
-	// KKを正規化する関数。元の評価関数と完全に等価にはならないので注意。
-	// kkp,kppの値をなるべくゼロに近づけることで、学習中に出現しなかった特徴因子の値(ゼロになっている)が
-	// 妥当であることを保証しようという考え。
+	// A function that normalizes KK. Note that it is not completely equivalent to the original evaluation function.
+	// By making the values ​​of kkp and kpp as close to zero as possible, the value of the feature factor (which is zero) that did not appear during learning
+	// The idea of ​​ensuring it is valid.
 	void regularize_kk();
 
 #endif
diff --git a/src/eval/evaluate_mir_inv_tools.cpp b/src/eval/evaluate_mir_inv_tools.cpp
index 56a0a63e..3b5d3a36 100644
--- a/src/eval/evaluate_mir_inv_tools.cpp
+++ b/src/eval/evaluate_mir_inv_tools.cpp
@@ -7,35 +7,35 @@ namespace Eval
 
 	// --- tables
 
-	// あるBonaPieceを相手側から見たときの値
-	// BONA_PIECE_INITが-1なので符号型で持つ必要がある。
-	// KPPTを拡張しても当面、BonaPieceが2^15を超えることはないのでint16_tで良しとする。
+	// Value when a certain BonaPiece is seen from the other side
+	// BONA_PIECE_INIT is -1, so it must be a signed type.
+	// Even if KPPT is expanded, BonaPiece will not exceed 2^15 for the time being, so int16_t is good.
 	int16_t inv_piece_[Eval::fe_end];
 
-	// 盤面上のあるBonaPieceをミラーした位置にあるものを返す。
+	// Returns the one at the position where a BonaPiece on the board is mirrored.
 	int16_t mir_piece_[Eval::fe_end];
 
 
 	// --- methods
 
-	// あるBonaPieceを相手側から見たときの値を返す
+// Returns the value when a certain BonaPiece is seen from the other side
 	Eval::BonaPiece inv_piece(Eval::BonaPiece p) { return (Eval::BonaPiece)inv_piece_[p]; }
 
-	// 盤面上のあるBonaPieceをミラーした位置にあるものを返す。
+	// Returns the one at the position where a BonaPiece on the board is mirrored.
 	Eval::BonaPiece mir_piece(Eval::BonaPiece p) { return (Eval::BonaPiece)mir_piece_[p]; }
 
 	std::function<void()> mir_piece_init_function;
 
 	void init_mir_inv_tables()
 	{
-		// mirrorとinverseのテーブルの初期化。
+		// Initialize the mirror and inverse tables.
 
-		// 初期化は1回に限る。
+		// Initialization is limited to once.
 		static bool first = true;
 		if (!first) return;
 		first = false;
 
-		// fとeとの交換
+		// exchange f and e
 		int t[] = {
 			f_pawn             , e_pawn            ,
 			f_knight           , e_knight          ,
@@ -44,12 +44,12 @@ namespace Eval
 			f_queen            , e_queen           ,
 		};
 
-		// 未初期化の値を突っ込んでおく。
+		// Insert uninitialized value.
 		for (BonaPiece p = BONA_PIECE_ZERO; p < fe_end; ++p)
 		{
 			inv_piece_[p] = BONA_PIECE_NOT_INIT;
 
-			// mirrorは手駒に対しては機能しない。元の値を返すだけ。
+			// mirror does not work for hand pieces. Just return the original value.
 			mir_piece_[p] = (p < f_pawn) ? p : BONA_PIECE_NOT_INIT;
 		}
 
@@ -61,26 +61,26 @@ namespace Eval
 				{
 					Square sq = (Square)(p - t[i]);
 
-					// 見つかった!!
+					// found!!
 					BonaPiece q = (p < fe_hand_end) ? BonaPiece(sq + t[i + 1]) : (BonaPiece)(Inv(sq) + t[i + 1]);
 					inv_piece_[p] = q;
 					inv_piece_[q] = p;
 
 					/*
-					ちょっとトリッキーだが、pに関して盤上の駒は
-					p >= fe_hand_end
-					のとき。
+					It's a bit tricky, but regarding p
+										p >= fe_hand_end
+										When.
 
-					このpに対して、nを整数として(上のコードのiは偶数しかとらない)、
-					a)  t[2n + 0] <= p < t[2n + 1] のときは先手の駒
-					b)  t[2n + 1] <= p < t[2n + 2] のときは後手の駒
-					　である。
+					For this p, let n be an integer (i in the above code can only be an even number),
+					a) When t[2n + 0] <= p <t[2n + 1], the first piece
+					b) When t[2n + 1] <= p <t[2n + 2], the back piece
+					Is.
 
-					 ゆえに、a)の範囲にあるpをq = Inv(p-t[2n+0]) + t[2n+1] とすると180度回転させた升にある後手の駒となる。
-					 そこでpとqをswapさせてinv_piece[ ]を初期化してある。
-					 */
+					Therefore, if p in the range of a) is set to q = Inv(p-t[2n+0]) + t[2n+1], it becomes the back piece in the box rotated 180 degrees.
+					So inv_piece[] is initialized by swapping p and q.
+					*/
 
-					 // 手駒に関してはmirrorなど存在しない。
+					// There is no mirror for hand pieces.
 					if (p < fe_hand_end)
 						continue;
 
@@ -103,28 +103,28 @@ namespace Eval
 
 		for (BonaPiece p = BONA_PIECE_ZERO; p < fe_end; ++p)
 		{
-			// 未初期化のままになっている。上のテーブルの初期化コードがおかしい。
+			// It remains uninitialized. The initialization code in the table above is incorrect.
 			assert(mir_piece_[p] != BONA_PIECE_NOT_INIT && mir_piece_[p] < fe_end);
 			assert(inv_piece_[p] != BONA_PIECE_NOT_INIT && inv_piece_[p] < fe_end);
 
-			// mirとinvは、2回適用したら元の座標に戻る。
+			// mir and inv return to their original coordinates after being applied twice.
 			assert(mir_piece_[mir_piece_[p]] == p);
 			assert(inv_piece_[inv_piece_[p]] == p);
 
-			// mir->inv->mir->invは元の場所でなければならない。
+			// mir->inv->mir->inv must be the original location.
 			assert(p == inv_piece(mir_piece(inv_piece(mir_piece(p)))));
 
-			// inv->mir->inv->mirは元の場所でなければならない。
+			// inv->mir->inv->mir must be the original location.
 			assert(p == mir_piece(inv_piece(mir_piece(inv_piece(p)))));
 		}
 
 #if 0
-		// 評価関数のミラーをしても大丈夫であるかの事前検証
-		// 値を書き込んだときにassertionがあるので、ミラーしてダメである場合、
-		// そのassertに引っかかるはず。
+		// Pre-verification that it is okay to mirror the evaluation function
+		// When writing a value, there is an assertion, so if you can't mirror it,
+		// Should get caught in the assert.
 
-		// AperyのWCSC26の評価関数、kppのp1==0とかp1==20(後手の0枚目の歩)とかの
-		// ところにゴミが入っていて、これを回避しないとassertに引っかかる。
+		// Apery's WCSC26 evaluation function, kpp p1==0 or p1==20 (0th step on the back)
+		// There is dust in it, and if you don't avoid it, it will get caught in the assert.
 
 		std::unordered_set<BonaPiece> s;
 		vector<int> a = {
@@ -139,24 +139,24 @@ namespace Eval
 		for (auto b : a)
 			s.insert((BonaPiece)b);
 
-		// さらに出現しない升の盤上の歩、香、桂も除外(Aperyはここにもゴミが入っている)
+		// Excludes walks, incense, and katsura on the board that do not appear further (Apery also contains garbage here)
 		for (Rank r = RANK_1; r <= RANK_2; ++r)
 			for (File f = FILE_1; f <= FILE_9; ++f)
 			{
 				if (r == RANK_1)
 				{
-					// 1段目の歩
+					// first step
 					BonaPiece b1 = BonaPiece(f_pawn + (f | r));
 					s.insert(b1);
 					s.insert(inv_piece[b1]);
 
-					// 1段目の香
+					// 1st stage incense
 					BonaPiece b2 = BonaPiece(f_lance + (f | r));
 					s.insert(b2);
 					s.insert(inv_piece[b2]);
 				}
 
-				// 1,2段目の桂
+				// Katsura on the 1st and 2nd steps
 				BonaPiece b = BonaPiece(f_knight + (f | r));
 				s.insert(b);
 				s.insert(inv_piece[b]);
diff --git a/src/eval/evaluate_mir_inv_tools.h b/src/eval/evaluate_mir_inv_tools.h
index 8d6378ec..fa4e70ac 100644
--- a/src/eval/evaluate_mir_inv_tools.h
+++ b/src/eval/evaluate_mir_inv_tools.h
@@ -3,7 +3,7 @@
 
 #if defined(EVAL_NNUE) || defined(EVAL_LEARN)
 
-// BonaPieceのmirror(左右反転)やinverse(盤上の180度回転)させた駒を得るためのツール類。
+// BonaPiece's mirror (horizontal flip) and inverse (180° on the board) tools to get pieces.
 
 #include "../types.h"
 #include "../evaluate.h"
@@ -15,33 +15,33 @@ namespace Eval
 	//                  tables
 	// -------------------------------------------------
 
-	// 	--- BonaPieceに対してMirrorとInverseを提供する。
+	// --- Provide Mirror and Inverse to BonaPiece.
 
-	// これらの配列は、init()かinit_mir_inv_tables();を呼び出すと初期化される。
-	// このテーブルのみを評価関数のほうから使いたいときは、評価関数の初期化のときに
-	// init_mir_inv_tables()を呼び出すと良い。
-	// これらの配列は、以下のKK/KKP/KPPクラスから参照される。
+	// These arrays are initialized by calling init() or init_mir_inv_tables();.
+	// If you want to use only this table from the evaluation function,
+	// Call init_mir_inv_tables().
+	// These arrays are referenced from the KK/KKP/KPP classes below.
 
-	// あるBonaPieceを相手側から見たときの値を返す
+	// Returns the value when a certain BonaPiece is seen from the other side
 	extern Eval::BonaPiece inv_piece(Eval::BonaPiece p);
 
-	// 盤面上のあるBonaPieceをミラーした位置にあるものを返す。
+	// Returns the one at the position where a BonaPiece on the board is mirrored.
 	extern Eval::BonaPiece mir_piece(Eval::BonaPiece p);
 
 
-	// mir_piece/inv_pieceの初期化のときに呼び出されるcallback
-	// fe_endをユーザー側で拡張するときに用いる。
-	// この初期化のときに必要なのでinv_piece_とinv_piece_を公開している。
-	// mir_piece_init_functionが呼び出されたタイミングで、fe_old_endまでは
-	// これらのテーブルの初期化が完了していることが保証されている。
+	// callback called when initializing mir_piece/inv_piece
+	// Used when extending fe_end on the user side.
+	// Inv_piece_ and inv_piece_ are exposed because they are necessary for this initialization.
+	// At the timing when mir_piece_init_function is called, until fe_old_end
+	// It is guaranteed that these tables have been initialized.
 	extern std::function<void()> mir_piece_init_function;
 	extern int16_t mir_piece_[Eval::fe_end];
 	extern int16_t inv_piece_[Eval::fe_end];
 
-	// この関数を明示的に呼び出すか、init()を呼び出すかしたときに、上のテーブルが初期化される。
+	// The table above will be initialized when you call this function explicitly or call init().
 	extern void init_mir_inv_tables();
 }
 
 #endif  // defined(EVAL_NNUE) || defined(EVAL_LEARN)
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/eval/nnue/architectures/halfkp-cr-ep_256x2-32-32.h b/src/eval/nnue/architectures/halfkp-cr-ep_256x2-32-32.h
index 9f1f97c0..1bb9609e 100644
--- a/src/eval/nnue/architectures/halfkp-cr-ep_256x2-32-32.h
+++ b/src/eval/nnue/architectures/halfkp-cr-ep_256x2-32-32.h
@@ -1,4 +1,4 @@
-// NNUE�]���֐��ŗp������͓����ʂƃl�b�g���[�N�\���̒�`
+// Definition of input features and network structure used in NNUE evaluation function
 
 #ifndef HALFKP_CR_EP_256X2_32_32_H
 #define HALFKP_CR_EP_256X2_32_32_H
@@ -16,17 +16,17 @@ namespace Eval {
 
   namespace NNUE {
 
-    // �]���֐��ŗp������͓�����
+    // Input features used in evaluation function
     using RawFeatures = Features::FeatureSet<
       Features::HalfKP<Features::Side::kFriend>, Features::CastlingRight,
       Features::EnPassant>;
 
-    // �ϊ���̓��͓����ʂ̎�����
+    // Number of input feature dimensions after conversion
     constexpr IndexType kTransformedFeatureDimensions = 256;
 
     namespace Layers {
 
-      // �l�b�g���[�N�\���̒�`
+      // define network structure
       using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
       using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
       using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
diff --git a/src/eval/nnue/architectures/halfkp_256x2-32-32.h b/src/eval/nnue/architectures/halfkp_256x2-32-32.h
index c79747c3..467d0222 100644
--- a/src/eval/nnue/architectures/halfkp_256x2-32-32.h
+++ b/src/eval/nnue/architectures/halfkp_256x2-32-32.h
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数で用いる入力特徴量とネットワーク構造の定義
+﻿// Definition of input features and network structure used in NNUE evaluation function
 
 #ifndef HALFKP_256X2_32_32_H
 #define HALFKP_256X2_32_32_H
@@ -14,16 +14,16 @@ namespace Eval {
 
 namespace NNUE {
 
-// 評価関数で用いる入力特徴量
+// Input features used in evaluation function
 using RawFeatures = Features::FeatureSet<
     Features::HalfKP<Features::Side::kFriend>>;
 
-// 変換後の入力特徴量の次元数
+// Number of input feature dimensions after conversion
 constexpr IndexType kTransformedFeatureDimensions = 256;
 
 namespace Layers {
 
-// ネットワーク構造の定義
+// define network structure
 using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
 using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
 using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
diff --git a/src/eval/nnue/architectures/k-p-cr-ep_256x2-32-32.h b/src/eval/nnue/architectures/k-p-cr-ep_256x2-32-32.h
index dc761866..72531fd4 100644
--- a/src/eval/nnue/architectures/k-p-cr-ep_256x2-32-32.h
+++ b/src/eval/nnue/architectures/k-p-cr-ep_256x2-32-32.h
@@ -1,4 +1,4 @@
-// NNUE�]���֐��ŗp������͓����ʂƃl�b�g���[�N�\���̒�`
+// Definition of input features and network structure used in NNUE evaluation function
 
 #ifndef K_P_CR_EP_256X2_32_32_H
 #define K_P_CR_EP_256X2_32_32_H
@@ -17,16 +17,16 @@ namespace Eval {
 
   namespace NNUE {
 
-    // �]���֐��ŗp������͓�����
+    // Input features used in evaluation function
     using RawFeatures = Features::FeatureSet<Features::K, Features::P,
       Features::CastlingRight, Features::EnPassant>;
 
-    // �ϊ���̓��͓����ʂ̎�����
+    // Number of input feature dimensions after conversion
     constexpr IndexType kTransformedFeatureDimensions = 256;
 
     namespace Layers {
 
-      // �l�b�g���[�N�\���̒�`
+      // define network structure
       using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
       using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
       using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
diff --git a/src/eval/nnue/architectures/k-p-cr_256x2-32-32.h b/src/eval/nnue/architectures/k-p-cr_256x2-32-32.h
index 331cb4f2..b4161880 100644
--- a/src/eval/nnue/architectures/k-p-cr_256x2-32-32.h
+++ b/src/eval/nnue/architectures/k-p-cr_256x2-32-32.h
@@ -1,4 +1,4 @@
-// NNUE�]���֐��ŗp������͓����ʂƃl�b�g���[�N�\���̒�`
+// Definition of input features and network structure used in NNUE evaluation function
 
 #ifndef K_P_CR_256X2_32_32_H
 #define K_P_CR_256X2_32_32_H
@@ -16,16 +16,16 @@ namespace Eval {
 
   namespace NNUE {
 
-    // �]���֐��ŗp������͓�����
+    // Input features used in evaluation function
     using RawFeatures = Features::FeatureSet<Features::K, Features::P,
       Features::CastlingRight>;
 
-    // �ϊ���̓��͓����ʂ̎�����
+    // Number of input feature dimensions after conversion
     constexpr IndexType kTransformedFeatureDimensions = 256;
 
     namespace Layers {
 
-      // �l�b�g���[�N�\���̒�`
+      // define network structure
       using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
       using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
       using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
diff --git a/src/eval/nnue/architectures/k-p_256x2-32-32.h b/src/eval/nnue/architectures/k-p_256x2-32-32.h
index 2576ddfa..9fc9b2a1 100644
--- a/src/eval/nnue/architectures/k-p_256x2-32-32.h
+++ b/src/eval/nnue/architectures/k-p_256x2-32-32.h
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数で用いる入力特徴量とネットワーク構造の定義
+﻿// Definition of input features and network structure used in NNUE evaluation function
 #ifndef K_P_256X2_32_32_H
 #define K_P_256X2_32_32_H
 
@@ -14,15 +14,15 @@ namespace Eval {
 
 namespace NNUE {
 
-// 評価関数で用いる入力特徴量
+// Input features used in evaluation function
 using RawFeatures = Features::FeatureSet<Features::K, Features::P>;
 
-// 変換後の入力特徴量の次元数
+// Number of input feature dimensions after conversion
 constexpr IndexType kTransformedFeatureDimensions = 256;
 
 namespace Layers {
 
-// ネットワーク構造の定義
+// define network structure
 using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
 using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
 using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
diff --git a/src/eval/nnue/evaluate_nnue.cpp b/src/eval/nnue/evaluate_nnue.cpp
index 46b3b5f9..6b3f0b2f 100644
--- a/src/eval/nnue/evaluate_nnue.cpp
+++ b/src/eval/nnue/evaluate_nnue.cpp
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数の計算に関するコード
+﻿// Code for calculating NNUE evaluation function
 
 #if defined(EVAL_NNUE)
 
@@ -16,16 +16,16 @@ namespace Eval {
 
 namespace NNUE {
 
-// 入力特徴量変換器
+// Input feature converter
 AlignedPtr<FeatureTransformer> feature_transformer;
 
-// 評価関数
+// Evaluation function
 AlignedPtr<Network> network;
 
-// 評価関数ファイル名
+// Evaluation function file name
 const char* const kFileName = "nn.bin";
 
-// 評価関数の構造を表す文字列を取得する
+// Get a string that represents the structure of the evaluation function
 std::string GetArchitectureString() {
   return "Features=" + FeatureTransformer::GetStructureString() +
       ",Network=" + Network::GetStructureString();
@@ -35,14 +35,14 @@ namespace {
 
 namespace Detail {
 
-// 評価関数パラメータを初期化する
+// Initialize the evaluation function parameters
 template <typename T>
 void Initialize(AlignedPtr<T>& pointer) {
   pointer.reset(reinterpret_cast<T*>(aligned_malloc(sizeof(T), alignof(T))));
   std::memset(pointer.get(), 0, sizeof(T));
 }
 
-// 評価関数パラメータを読み込む
+// read evaluation function parameters
 template <typename T>
 bool ReadParameters(std::istream& stream, const AlignedPtr<T>& pointer) {
   std::uint32_t header;
@@ -51,7 +51,7 @@ bool ReadParameters(std::istream& stream, const AlignedPtr<T>& pointer) {
   return pointer->ReadParameters(stream);
 }
 
-// 評価関数パラメータを書き込む
+// write evaluation function parameters
 template <typename T>
 bool WriteParameters(std::ostream& stream, const AlignedPtr<T>& pointer) {
   constexpr std::uint32_t header = T::GetHashValue();
@@ -61,7 +61,7 @@ bool WriteParameters(std::ostream& stream, const AlignedPtr<T>& pointer) {
 
 }  // namespace Detail
 
-// 評価関数パラメータを初期化する
+// Initialize the evaluation function parameters
 void Initialize() {
   Detail::Initialize(feature_transformer);
   Detail::Initialize(network);
@@ -69,7 +69,7 @@ void Initialize() {
 
 }  // namespace
 
-// ヘッダを読み込む
+// read the header
 bool ReadHeader(std::istream& stream,
   std::uint32_t* hash_value, std::string* architecture) {
   std::uint32_t version, size;
@@ -82,7 +82,7 @@ bool ReadHeader(std::istream& stream,
   return !stream.fail();
 }
 
-// ヘッダを書き込む
+// write the header
 bool WriteHeader(std::ostream& stream,
   std::uint32_t hash_value, const std::string& architecture) {
   stream.write(reinterpret_cast<const char*>(&kVersion), sizeof(kVersion));
@@ -93,7 +93,7 @@ bool WriteHeader(std::ostream& stream,
   return !stream.fail();
 }
 
-// 評価関数パラメータを読み込む
+// read evaluation function parameters
 bool ReadParameters(std::istream& stream) {
   std::uint32_t hash_value;
   std::string architecture;
@@ -104,7 +104,7 @@ bool ReadParameters(std::istream& stream) {
   return stream && stream.peek() == std::ios::traits_type::eof();
 }
 
-// 評価関数パラメータを書き込む
+// write evaluation function parameters
 bool WriteParameters(std::ostream& stream) {
   if (!WriteHeader(stream, kHashValue, GetArchitectureString())) return false;
   if (!Detail::WriteParameters(stream, feature_transformer)) return false;
@@ -112,12 +112,12 @@ bool WriteParameters(std::ostream& stream) {
   return !stream.fail();
 }
 
-// 差分計算ができるなら進める
+// proceed if you can calculate the difference
 static void UpdateAccumulatorIfPossible(const Position& pos) {
   feature_transformer->UpdateAccumulatorIfPossible(pos);
 }
 
-// 評価値を計算する
+// Calculate the evaluation value
 static Value ComputeScore(const Position& pos, bool refresh = false) {
   auto& accumulator = pos.state()->accumulator;
   if (!refresh && accumulator.computed_score) {
@@ -130,22 +130,22 @@ static Value ComputeScore(const Position& pos, bool refresh = false) {
   alignas(kCacheLineSize) char buffer[Network::kBufferSize];
   const auto output = network->Propagate(transformed_features, buffer);
 
-  // VALUE_MAX_EVALより大きな値が返ってくるとaspiration searchがfail highして
-  // 探索が終わらなくなるのでVALUE_MAX_EVAL以下であることを保証すべき。
+  // When a value larger than VALUE_MAX_EVAL is returned, aspiration search fails high
+  // It should be guaranteed that it is less than VALUE_MAX_EVAL because the search will not end.
 
-  // この現象が起きても、対局時に秒固定などだとそこで探索が打ち切られるので、
-  // 1つ前のiterationのときの最善手がbestmoveとして指されるので見かけ上、
-  // 問題ない。このVALUE_MAX_EVALが返ってくるような状況は、ほぼ詰みの局面であり、
-  // そのような詰みの局面が出現するのは終盤で形勢に大差がついていることが多いので
-  // 勝敗にはあまり影響しない。
+  // Even if this phenomenon occurs, if the seconds are fixed when playing, the search will be aborted there, so
+  // The best move in the previous iteration is pointed to as bestmove, so apparently
+  // no problem. The situation in which this VALUE_MAX_EVAL is returned is almost at a dead end,
+  // Since such a jamming phase often appears at the end, there is a big difference in the situation
+  // Doesn't really affect the outcome.
 
-  // しかし、教師生成時などdepth固定で探索するときに探索から戻ってこなくなるので
-  // そのスレッドの計算時間を無駄にする。またdepth固定対局でtime-outするようになる。
+  // However, when searching with a fixed depth such as when creating a teacher, it will not return from the search
+  // Waste the computation time for that thread. Also, it will be timed out with fixed depth game.
 
   auto score = static_cast<Value>(output[0] / FV_SCALE);
 
-  // 1) ここ、下手にclipすると学習時には影響があるような気もするが…。
-  // 2) accumulator.scoreは、差分計算の時に用いないので書き換えて問題ない。
+  // 1) I feel that if I clip too poorly, it will have an effect on my learning...
+  // 2) Since accumulator.score is not used at the time of difference calculation, it can be rewritten without any problem.
   score = Math::clamp(score , -VALUE_MAX_EVAL , VALUE_MAX_EVAL);
 
   accumulator.score = score;
@@ -153,10 +153,10 @@ static Value ComputeScore(const Position& pos, bool refresh = false) {
   return accumulator.score;
 }
 
-}  // namespace NNUE
+} // namespace NNUE
 
 #if defined(USE_EVAL_HASH)
-// HashTableに評価値を保存するために利用するクラス
+// Class used to store evaluation values ​​in HashTable
 struct alignas(16) ScoreKeyValue {
 #if defined(USE_SSE2)
   ScoreKeyValue() = default;
@@ -171,15 +171,15 @@ struct alignas(16) ScoreKeyValue {
   }
 #endif
 
-  // evaluate hashでatomicに操作できる必要があるのでそのための操作子
+  // It is necessary to be able to operate atomically with evaluate hash, so the manipulator for that
   void encode() {
 #if defined(USE_SSE2)
-    // ScoreKeyValue は atomic にコピーされるので key が合っていればデータも合っている。
+    // ScoreKeyValue is copied to atomic, so if the key matches, the data matches.
 #else
     key ^= score;
 #endif
   }
-  // decode()はencode()の逆変換だが、xorなので逆変換も同じ変換。
+  // decode() is the reverse conversion of encode(), but since it is xor, the reverse conversion is the same.
   void decode() { encode(); }
 
   union {
@@ -193,45 +193,45 @@ struct alignas(16) ScoreKeyValue {
   };
 };
 
-// シンプルなHashTableの実装。
-// Sizeは2のべき乗。
+// Simple HashTable implementation.
+// Size is a power of 2.
 template <typename T, size_t Size>
 struct HashTable {
   HashTable() { clear(); }
   T* operator [] (const Key k) { return entries_ + (static_cast<size_t>(k) & (Size - 1)); }
   void clear() { memset(entries_, 0, sizeof(T)*Size); }
 
-  // Size が 2のべき乗であることのチェック
+  // Check that Size is a power of 2
   static_assert((Size & (Size - 1)) == 0, "");
 
  private:
   T entries_[Size];
 };
 
-// evaluateしたものを保存しておくHashTable(俗にいうehash)
+//HashTable to save the evaluated ones (following ehash)
 
 #if !defined(USE_LARGE_EVAL_HASH)
-// 134MB(魔女のAVX2以外の時の設定)
+// 134MB (setting other than witch's AVX2)
 struct EvaluateHashTable : HashTable<ScoreKeyValue, 0x800000> {};
 #else
-// prefetch有りなら大きいほうが良いのでは…。
-// →　あまり変わらないし、メモリもったいないのでデフォルトでは↑の設定で良いか…。
-// 1GB(魔女のAVX2の時の設定)
+// If you have prefetch, it's better to have a big one...
+// → It doesn't change much and the memory is wasteful, so is it okay to set ↑ by default?
+// 1GB (setting for witch's AVX2)
 struct EvaluateHashTable : HashTable<ScoreKeyValue, 0x4000000> {};
 #endif
 
 EvaluateHashTable g_evalTable;
 
-// prefetchする関数も用意しておく。
+// Prepare a function to prefetch.
 void prefetch_evalhash(const Key key) {
   constexpr auto mask = ~((uint64_t)0x1f);
   prefetch((void*)((uint64_t)g_evalTable[key] & mask));
 }
 #endif
 
-// 評価関数ファイルを読み込む
-// benchコマンドなどでOptionsを保存して復元するのでこのときEvalDirが変更されたことになって、
-// 評価関数の再読込の必要があるというフラグを立てるため、この関数は2度呼び出されることがある。
+// read the evaluation function file
+// Save and restore Options with bench command etc., so EvalDir is changed at this time,
+// This function may be called twice to flag that the evaluation function needs to be reloaded.
 void load_eval() {
   NNUE::Initialize();
 
@@ -249,7 +249,7 @@ void load_eval() {
 //    ASSERT(result);
 	if (!result)
 	{
-		// 読み込みエラーのとき終了してくれないと困る。
+		// It's a problem if it doesn't finish when there is a read error.
 		std::cout << "Error! " << NNUE::kFileName << " not found or wrong format" << std::endl;
 		//my_exit();
 	}
@@ -260,19 +260,19 @@ void load_eval() {
     std::cout << "info string NNUE " << NNUE::kFileName << " not loaded" << std::endl;
 }
 
-// 初期化
+// Initialization
 void init() {
 }
 
-// 評価関数。差分計算ではなく全計算する。
-// Position::set()で一度だけ呼び出される。(以降は差分計算)
-// 手番側から見た評価値を返すので注意。(他の評価関数とは設計がこの点において異なる)
-// なので、この関数の最適化は頑張らない。
+// Evaluation function. Perform full calculation instead of difference calculation.
+// Called only once with Position::set(). (The difference calculation after that)
+// Note that the evaluation value seen from the turn side is returned. (Design differs from other evaluation functions in this respect)
+// Since, we will not try to optimize this function.
 Value compute_eval(const Position& pos) {
   return NNUE::ComputeScore(pos, true);
 }
 
-// 評価関数
+// Evaluation function
 Value evaluate(const Position& pos) {
   const auto& accumulator = pos.state()->accumulator;
   if (accumulator.computed_score) {
@@ -280,8 +280,8 @@ Value evaluate(const Position& pos) {
   }
 
 #if defined(USE_GLOBAL_OPTIONS)
-  // GlobalOptionsでeval hashを用いない設定になっているなら
-  // eval hashへの照会をskipする。
+  // If Global Options is set not to use eval hash
+  // Skip the query to the eval hash.
   if (!GlobalOptions.use_eval_hash) {
     ASSERT_LV5(pos.state()->materialValue == Eval::material(pos));
     return NNUE::ComputeScore(pos);
@@ -289,19 +289,19 @@ Value evaluate(const Position& pos) {
 #endif
 
 #if defined(USE_EVAL_HASH)
-  // evaluate hash tableにはあるかも。
+  // May be in the evaluate hash table.
   const Key key = pos.key();
   ScoreKeyValue entry = *g_evalTable[key];
   entry.decode();
   if (entry.key == key) {
-    // あった！
+    // there were!
     return Value(entry.score);
   }
 #endif
 
   Value score = NNUE::ComputeScore(pos);
 #if defined(USE_EVAL_HASH)
-  // せっかく計算したのでevaluate hash tableに保存しておく。
+  // Since it was calculated carefully, save it in the evaluate hash table.
   entry.key = key;
   entry.score = score;
   entry.encode();
@@ -311,12 +311,12 @@ Value evaluate(const Position& pos) {
   return score;
 }
 
-// 差分計算ができるなら進める
+// proceed if you can calculate the difference
 void evaluate_with_no_return(const Position& pos) {
   NNUE::UpdateAccumulatorIfPossible(pos);
 }
 
-// 現在の局面の評価値の内訳を表示する
+// display the breakdown of the evaluation value of the current phase
 void print_eval_stat(Position& /*pos*/) {
   std::cout << "--- EVAL STAT: not implemented" << std::endl;
 }
diff --git a/src/eval/nnue/evaluate_nnue.h b/src/eval/nnue/evaluate_nnue.h
index a95f2bd9..7f8f700a 100644
--- a/src/eval/nnue/evaluate_nnue.h
+++ b/src/eval/nnue/evaluate_nnue.h
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数で用いるheader
+﻿// header used in NNUE evaluation function
 
 #ifndef _EVALUATE_NNUE_H_
 #define _EVALUATE_NNUE_H_
@@ -14,11 +14,11 @@ namespace Eval {
 
 namespace NNUE {
 
-// 評価関数の構造のハッシュ値
+// hash value of evaluation function structure
 constexpr std::uint32_t kHashValue =
     FeatureTransformer::GetHashValue() ^ Network::GetHashValue();
 
-// メモリ領域の解放を自動化するためのデリータ
+// Deleter for automating release of memory area
 template <typename T>
 struct AlignedDeleter {
   void operator()(T* ptr) const {
@@ -29,30 +29,30 @@ struct AlignedDeleter {
 template <typename T>
 using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>;
 
-// 入力特徴量変換器
+// Input feature converter
 extern AlignedPtr<FeatureTransformer> feature_transformer;
 
-// 評価関数
+// Evaluation function
 extern AlignedPtr<Network> network;
 
-// 評価関数ファイル名
+// Evaluation function file name
 extern const char* const kFileName;
 
-// 評価関数の構造を表す文字列を取得する
+// Get a string that represents the structure of the evaluation function
 std::string GetArchitectureString();
 
-// ヘッダを読み込む
+// read the header
 bool ReadHeader(std::istream& stream,
     std::uint32_t* hash_value, std::string* architecture);
 
-// ヘッダを書き込む
+// write the header
 bool WriteHeader(std::ostream& stream,
     std::uint32_t hash_value, const std::string& architecture);
 
-// 評価関数パラメータを読み込む
+// read evaluation function parameters
 bool ReadParameters(std::istream& stream);
 
-// 評価関数パラメータを書き込む
+// write evaluation function parameters
 bool WriteParameters(std::ostream& stream);
 
 }  // namespace NNUE
@@ -61,4 +61,4 @@ bool WriteParameters(std::ostream& stream);
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/eval/nnue/evaluate_nnue_learner.cpp b/src/eval/nnue/evaluate_nnue_learner.cpp
index 636f90e1..0a2077a7 100644
--- a/src/eval/nnue/evaluate_nnue_learner.cpp
+++ b/src/eval/nnue/evaluate_nnue_learner.cpp
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数の学習時用のコード
+﻿// Code for learning NNUE evaluation function
 
 #if defined(EVAL_LEARN) && defined(EVAL_NNUE)
 
@@ -31,30 +31,30 @@ namespace NNUE {
 
 namespace {
 
-// 学習データ
+// learning data
 std::vector<Example> examples;
 
-// examplesの排他制御をするMutex
+// Mutex for exclusive control of examples
 std::mutex examples_mutex;
 
-// ミニバッチのサンプル数
+// number of samples in mini-batch
 uint64_t batch_size;
 
-// 乱数生成器
+// random number generator
 std::mt19937 rng;
 
-// 学習器
+// learner
 std::shared_ptr<Trainer<Network>> trainer;
 
-// 学習率のスケール
+// Learning rate scale
 double global_learning_rate_scale;
 
-// 学習率のスケールを取得する
+// Get the learning rate scale
 double GetGlobalLearningRateScale() {
   return global_learning_rate_scale;
 }
 
-// ハイパーパラメータなどのオプションを学習器に伝える
+// Tell the learner options such as hyperparameters
 void SendMessages(std::vector<Message> messages) {
   for (auto& message : messages) {
     trainer->SendMessage(&message);
@@ -64,7 +64,7 @@ void SendMessages(std::vector<Message> messages) {
 
 }  // namespace
 
-// 学習の初期化を行う
+// Initialize learning
 void InitializeTraining(double eta1, uint64_t eta1_epoch,
                         double eta2, uint64_t eta2_epoch, double eta3) {
   std::cout << "Initializing NN training for "
@@ -82,18 +82,18 @@ void InitializeTraining(double eta1, uint64_t eta1_epoch,
   EvalLearningTools::Weight::init_eta(eta1, eta2, eta3, eta1_epoch, eta2_epoch);
 }
 
-// ミニバッチのサンプル数を設定する
+// set the number of samples in the mini-batch
 void SetBatchSize(uint64_t size) {
   assert(size > 0);
   batch_size = size;
 }
 
-// 学習率のスケールを設定する
+// set the learning rate scale
 void SetGlobalLearningRateScale(double scale) {
   global_learning_rate_scale = scale;
 }
 
-// ハイパーパラメータなどのオプションを設定する
+// Set options such as hyperparameters
 void SetOptions(const std::string& options) {
   std::vector<Message> messages;
   for (const auto& option : Split(options, ',')) {
@@ -108,7 +108,7 @@ void SetOptions(const std::string& options) {
   SendMessages(std::move(messages));
 }
 
-// 学習用評価関数パラメータをファイルから読み直す
+// Reread the evaluation function parameters for learning from the file
 void RestoreParameters(const std::string& dir_name) {
   const std::string file_name = Path::Combine(dir_name, NNUE::kFileName);
   std::ifstream stream(file_name, std::ios::binary);
@@ -118,7 +118,7 @@ void RestoreParameters(const std::string& dir_name) {
   SendMessages({{"reset"}});
 }
 
-// 学習データを1サンプル追加する
+// Add 1 sample of learning data
 void AddExample(Position& pos, Color rootColor,
                 const Learner::PackedSfenValue& psv, double weight) {
   Example example;
@@ -162,7 +162,7 @@ void AddExample(Position& pos, Color rootColor,
   examples.push_back(std::move(example));
 }
 
-// 評価関数パラメーターを更新する
+// update the evaluation function parameters
 void UpdateParameters(uint64_t epoch) {
   assert(batch_size > 0);
 
@@ -192,21 +192,21 @@ void UpdateParameters(uint64_t epoch) {
   SendMessages({{"quantize_parameters"}});
 }
 
-// 学習に問題が生じていないかチェックする
+// Check if there are any problems with learning
 void CheckHealth() {
   SendMessages({{"check_health"}});
 }
 
 }  // namespace NNUE
 
-// 評価関数パラメーターをファイルに保存する
+// save merit function parameters to a file
 void save_eval(std::string dir_name) {
   auto eval_dir = Path::Combine(Options["EvalSaveDir"], dir_name);
   std::cout << "save_eval() start. folder = " << eval_dir << std::endl;
 
-  // すでにこのフォルダがあるならmkdir()に失敗するが、
-  // 別にそれは構わない。なければ作って欲しいだけ。
-  // また、EvalSaveDirまでのフォルダは掘ってあるものとする。
+  // mkdir() will fail if this folder already exists, but
+  // Apart from that. If not, I just want you to make it.
+  // Also, assume that the folders up to EvalSaveDir have been dug.
   Dependency::mkdir(eval_dir);
 
   if (Options["SkipLoadingEval"] && NNUE::trainer) {
@@ -221,7 +221,7 @@ void save_eval(std::string dir_name) {
   std::cout << "save_eval() finished. folder = " << eval_dir << std::endl;
 }
 
-// 現在のetaを取得する
+// get the current eta
 double get_eta() {
   return NNUE::GetGlobalLearningRateScale() * EvalLearningTools::Weight::eta;
 }
diff --git a/src/eval/nnue/evaluate_nnue_learner.h b/src/eval/nnue/evaluate_nnue_learner.h
index e2e68738..932a5f8c 100644
--- a/src/eval/nnue/evaluate_nnue_learner.h
+++ b/src/eval/nnue/evaluate_nnue_learner.h
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数の学習で用いるインターフェイス
+﻿// Interface used for learning NNUE evaluation function
 
 #ifndef _EVALUATE_NNUE_LEARNER_H_
 #define _EVALUATE_NNUE_LEARNER_H_
@@ -11,30 +11,30 @@ namespace Eval {
 
 namespace NNUE {
 
-// 学習の初期化を行う
+// Initialize learning
 void InitializeTraining(double eta1, uint64_t eta1_epoch,
                         double eta2, uint64_t eta2_epoch, double eta3);
 
-// ミニバッチのサンプル数を設定する
+// set the number of samples in the mini-batch
 void SetBatchSize(uint64_t size);
 
-// 学習率のスケールを設定する
+// set the learning rate scale
 void SetGlobalLearningRateScale(double scale);
 
-// ハイパーパラメータなどのオプションを設定する
+// Set options such as hyperparameters
 void SetOptions(const std::string& options);
 
-// 学習用評価関数パラメータをファイルから読み直す
+// Reread the evaluation function parameters for learning from the file
 void RestoreParameters(const std::string& dir_name);
 
-// 学習データを1サンプル追加する
+// Add 1 sample of learning data
 void AddExample(Position& pos, Color rootColor,
                 const Learner::PackedSfenValue& psv, double weight);
 
-// 評価関数パラメータを更新する
+// update the evaluation function parameters
 void UpdateParameters(uint64_t epoch);
 
-// 学習に問題が生じていないかチェックする
+// Check if there are any problems with learning
 void CheckHealth();
 
 }  // namespace NNUE
@@ -43,4 +43,4 @@ void CheckHealth();
 
 #endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/eval/nnue/features/castling_right.cpp b/src/eval/nnue/features/castling_right.cpp
index 30e46e23..ee7b6576 100644
--- a/src/eval/nnue/features/castling_right.cpp
+++ b/src/eval/nnue/features/castling_right.cpp
@@ -1,4 +1,4 @@
-// NNUE評価関数の入力特徴量Kの定義
+//Definition of input feature quantity K of NNUE evaluation function
 
 #if defined(EVAL_NNUE)
 
@@ -11,10 +11,10 @@ namespace Eval {
 
     namespace Features {
 
-      // 特徴量のうち、値が1であるインデックスのリストを取得する
+      // Get a list of indices with a value of 1 among the features
       void CastlingRight::AppendActiveIndices(
         const Position& pos, Color perspective, IndexList* active) {
-        // コンパイラの警告を回避するため、配列サイズが小さい場合は何もしない
+        // do nothing if array size is small to avoid compiler warning
         if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
 
         int castling_rights = pos.state()->castlingRights;
@@ -28,14 +28,14 @@ namespace Eval {
             & ((castling_rights >> 2) & 3);
         }
 
-        for (int i = 0; i < kDimensions; ++i) {
+        for (int i = 0; i <kDimensions; ++i) {
           if (relative_castling_rights & (i << 1)) {
             active->push_back(i);
           }
         }
       }
 
-      // 特徴量のうち、一手前から値が変化したインデックスのリストを取得する
+      // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
       void CastlingRight::AppendChangedIndices(
         const Position& pos, Color perspective,
         IndexList* removed, IndexList* added) {
diff --git a/src/eval/nnue/features/castling_right.h b/src/eval/nnue/features/castling_right.h
index 1384865f..f585b1d7 100644
--- a/src/eval/nnue/features/castling_right.h
+++ b/src/eval/nnue/features/castling_right.h
@@ -1,4 +1,4 @@
-// NNUE�]���֐��̓��͓�����K�̒�`
+//Definition of input feature quantity K of NNUE evaluation function
 
 #ifndef _NNUE_FEATURES_CASTLING_RIGHT_H_
 #define _NNUE_FEATURES_CASTLING_RIGHT_H_
@@ -14,25 +14,25 @@ namespace Eval {
 
     namespace Features {
 
-      // ������K�F�ʂ̈ʒu
+      // Feature K: Ball position
       class CastlingRight {
       public:
-        // �����ʖ�
+        // feature quantity name
         static constexpr const char* kName = "CastlingRight";
-        // �]���֐��t�@�C���ɖ��ߍ��ރn�b�V���l
+        // Hash value embedded in the evaluation function file
         static constexpr std::uint32_t kHashValue = 0x913968AAu;
-        // �����ʂ̎�����
+        // number of feature dimensions
         static constexpr IndexType kDimensions = 4;
-        // �����ʂ̂����A�����ɒl��1�ƂȂ�C���f�b�N�X�̐��̍ő�l
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
         static constexpr IndexType kMaxActiveDimensions = 4;
-        // �����v�Z�̑���ɑS�v�Z���s���^�C�~���O
+        // Timing of full calculation instead of difference calculation
         static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
 
-        // �����ʂ̂����A�l��1�ł���C���f�b�N�X�̃��X�g���擾����
+        // Get a list of indices with a value of 1 among the features
         static void AppendActiveIndices(const Position& pos, Color perspective,
           IndexList* active);
 
-        // �����ʂ̂����A���O����l���ω������C���f�b�N�X�̃��X�g���擾����
+        // Get a list of indices whose values ??have changed from the previous one in the feature quantity
         static void AppendChangedIndices(const Position& pos, Color perspective,
           IndexList* removed, IndexList* added);
       };
@@ -45,4 +45,4 @@ namespace Eval {
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/eval/nnue/features/enpassant.cpp b/src/eval/nnue/features/enpassant.cpp
index 523fd966..82a4158e 100644
--- a/src/eval/nnue/features/enpassant.cpp
+++ b/src/eval/nnue/features/enpassant.cpp
@@ -1,4 +1,4 @@
-// NNUE�]���֐��̓��͓�����K�̒�`
+//Definition of input feature quantity K of NNUE evaluation function
 
 #if defined(EVAL_NNUE)
 
@@ -11,10 +11,10 @@ namespace Eval {
 
     namespace Features {
 
-      // �����ʂ̂����A�l��1�ł���C���f�b�N�X�̃��X�g���擾����
+      // Get a list of indices with a value of 1 among the features
       void EnPassant::AppendActiveIndices(
         const Position& pos, Color perspective, IndexList* active) {
-        // �R���p�C���̌x����������邽�߁A�z��T�C�Y���������ꍇ�͉������Ȃ�
+        // do nothing if array size is small to avoid compiler warning
         if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
 
         auto epSquare = pos.state()->epSquare;
@@ -30,7 +30,7 @@ namespace Eval {
         active->push_back(file);
       }
 
-      // �����ʂ̂����A���O����l���ω������C���f�b�N�X�̃��X�g���擾����
+      // Get a list of indices whose values ??have changed from the previous one in the feature quantity
       void EnPassant::AppendChangedIndices(
         const Position& pos, Color perspective,
         IndexList* removed, IndexList* added) {
diff --git a/src/eval/nnue/features/enpassant.h b/src/eval/nnue/features/enpassant.h
index fe827584..c0ac8234 100644
--- a/src/eval/nnue/features/enpassant.h
+++ b/src/eval/nnue/features/enpassant.h
@@ -1,4 +1,4 @@
-// NNUE�]���֐��̓��͓�����K�̒�`
+//Definition of input feature quantity K of NNUE evaluation function
 
 #ifndef _NNUE_FEATURES_ENPASSANT_H_
 #define _NNUE_FEATURES_ENPASSANT_H_
@@ -14,25 +14,25 @@ namespace Eval {
 
     namespace Features {
 
-      // ������K�F�ʂ̈ʒu
+      // Feature K: Ball position
       class EnPassant {
       public:
-        // �����ʖ�
+        // feature quantity name
         static constexpr const char* kName = "EnPassant";
-        // �]���֐��t�@�C���ɖ��ߍ��ރn�b�V���l
+        // Hash value embedded in the evaluation function file
         static constexpr std::uint32_t kHashValue = 0x02924F91u;
-        // �����ʂ̎�����
+        // number of feature dimensions
         static constexpr IndexType kDimensions = 8;
-        // �����ʂ̂����A�����ɒl��1�ƂȂ�C���f�b�N�X�̐��̍ő�l
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
         static constexpr IndexType kMaxActiveDimensions = 1;
-        // �����v�Z�̑���ɑS�v�Z���s���^�C�~���O
+        // Timing of full calculation instead of difference calculation
         static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kAnyPieceMoved;
 
-        // �����ʂ̂����A�l��1�ł���C���f�b�N�X�̃��X�g���擾����
+        // Get a list of indices with a value of 1 among the features
         static void AppendActiveIndices(const Position& pos, Color perspective,
           IndexList* active);
 
-        // �����ʂ̂����A���O����l���ω������C���f�b�N�X�̃��X�g���擾����
+        // Get a list of indices whose values ??have changed from the previous one in the feature quantity
         static void AppendChangedIndices(const Position& pos, Color perspective,
           IndexList* removed, IndexList* added);
       };
@@ -45,4 +45,4 @@ namespace Eval {
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/eval/nnue/features/feature_set.h b/src/eval/nnue/features/feature_set.h
index 919be65d..6190db04 100644
--- a/src/eval/nnue/features/feature_set.h
+++ b/src/eval/nnue/features/feature_set.h
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数の入力特徴量セットを表すクラステンプレート
+﻿// A class template that represents the input feature set of the NNUE evaluation function
 
 #ifndef _NNUE_FEATURE_SET_H_
 #define _NNUE_FEATURE_SET_H_
@@ -14,7 +14,7 @@ namespace NNUE {
 
 namespace Features {
 
-// 値のリストを表すクラステンプレート
+// A class template that represents a list of values
 template <typename T, T... Values>
 struct CompileTimeList;
 template <typename T, T First, T... Remaining>
@@ -36,7 +36,7 @@ struct CompileTimeList<T> {
   static constexpr std::array<T, 0> kValues = {{}};
 };
 
-// リストの先頭への追加を行うクラステンプレート
+// Class template that adds to the beginning of the list
 template <typename T, typename ListType, T Value>
 struct AppendToList;
 template <typename T, T... Values, T AnotherValue>
@@ -44,7 +44,7 @@ struct AppendToList<T, CompileTimeList<T, Values...>, AnotherValue> {
   using Result = CompileTimeList<T, AnotherValue, Values...>;
 };
 
-// ソートされた重複のないリストへの追加を行うクラステンプレート
+// Class template for adding to a sorted, unique list
 template <typename T, typename ListType, T Value>
 struct InsertToSet;
 template <typename T, T First, T... Remaining, T AnotherValue>
@@ -52,7 +52,7 @@ struct InsertToSet<T, CompileTimeList<T, First, Remaining...>, AnotherValue> {
   using Result = std::conditional_t<
       CompileTimeList<T, First, Remaining...>::Contains(AnotherValue),
       CompileTimeList<T, First, Remaining...>,
-      std::conditional_t<(AnotherValue < First),
+      std::conditional_t<(AnotherValue <First),
           CompileTimeList<T, AnotherValue, First, Remaining...>,
           typename AppendToList<T, typename InsertToSet<
               T, CompileTimeList<T, Remaining...>, AnotherValue>::Result,
@@ -63,21 +63,21 @@ struct InsertToSet<T, CompileTimeList<T>, Value> {
   using Result = CompileTimeList<T, Value>;
 };
 
-// 特徴量セットの基底クラス
+// Base class of feature set
 template <typename Derived>
 class FeatureSetBase {
  public:
-  // 特徴量のうち、値が1であるインデックスのリストを取得する
+  // Get a list of indices with a value of 1 among the features
   template <typename IndexListType>
   static void AppendActiveIndices(
       const Position& pos, TriggerEvent trigger, IndexListType active[2]) {
-    for (const auto perspective : Colors) {
+    for (const auto perspective :Colors) {
       Derived::CollectActiveIndices(
           pos, trigger, perspective, &active[perspective]);
     }
   }
 
-  // 特徴量のうち、一手前から値が変化したインデックスのリストを取得する
+  // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
   template <typename PositionType, typename IndexListType>
   static void AppendChangedIndices(
       const PositionType& pos, TriggerEvent trigger,
@@ -85,7 +85,7 @@ class FeatureSetBase {
     const auto& dp = pos.state()->dirtyPiece;
     if (dp.dirty_num == 0) return;
 
-    for (const auto perspective : Colors) {
+    for (const auto perspective :Colors) {
       reset[perspective] = false;
       switch (trigger) {
         case TriggerEvent::kNone:
@@ -120,8 +120,8 @@ class FeatureSetBase {
   }
 };
 
-// 特徴量セットを表すクラステンプレート
-// 実行時の計算量を線形にするために、内部の処理はテンプレート引数の逆順に行う
+// Class template that represents the feature set
+// do internal processing in reverse order of template arguments in order to linearize the amount of calculation at runtime
 template <typename FirstFeatureType, typename... RemainingFeatureTypes>
 class FeatureSet<FirstFeatureType, RemainingFeatureTypes...> :
     public FeatureSetBase<
@@ -131,27 +131,27 @@ class FeatureSet<FirstFeatureType, RemainingFeatureTypes...> :
   using Tail = FeatureSet<RemainingFeatureTypes...>;
 
  public:
-  // 評価関数ファイルに埋め込むハッシュ値
+  // Hash value embedded in the evaluation function file
   static constexpr std::uint32_t kHashValue =
       Head::kHashValue ^ (Tail::kHashValue << 1) ^ (Tail::kHashValue >> 31);
-  // 特徴量の次元数
+  // number of feature dimensions
   static constexpr IndexType kDimensions =
       Head::kDimensions + Tail::kDimensions;
-  // 特徴量のうち、同時に値が1となるインデックスの数の最大値
+  // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
   static constexpr IndexType kMaxActiveDimensions =
       Head::kMaxActiveDimensions + Tail::kMaxActiveDimensions;
-  // 差分計算の代わりに全計算を行うタイミングのリスト
+  // List of timings to perform all calculations instead of difference calculation
   using SortedTriggerSet = typename InsertToSet<TriggerEvent,
       typename Tail::SortedTriggerSet, Head::kRefreshTrigger>::Result;
   static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
 
-  // 特徴量名を取得する
+  // Get the feature quantity name
   static std::string GetName() {
     return std::string(Head::kName) + "+" + Tail::GetName();
   }
 
  private:
-  // 特徴量のうち、値が1であるインデックスのリストを取得する
+  // Get a list of indices with a value of 1 among the features
   template <typename IndexListType>
   static void CollectActiveIndices(
       const Position& pos, const TriggerEvent trigger, const Color perspective,
@@ -166,7 +166,7 @@ class FeatureSet<FirstFeatureType, RemainingFeatureTypes...> :
     }
   }
 
-  // 特徴量のうち、一手前から値が変化したインデックスのリストを取得する
+  // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
   template <typename IndexListType>
   static void CollectChangedIndices(
       const Position& pos, const TriggerEvent trigger, const Color perspective,
@@ -185,36 +185,36 @@ class FeatureSet<FirstFeatureType, RemainingFeatureTypes...> :
     }
   }
 
-  // 基底クラスと、自身を再帰的に利用するクラステンプレートをfriendにする
+  // Make the base class and the class template that recursively uses itself a friend
   friend class FeatureSetBase<FeatureSet>;
   template <typename... FeatureTypes>
   friend class FeatureSet;
 };
 
-// 特徴量セットを表すクラステンプレート
-// テンプレート引数が1つの場合の特殊化
+// Class template that represents the feature set
+// Specialization with one template argument
 template <typename FeatureType>
 class FeatureSet<FeatureType> : public FeatureSetBase<FeatureSet<FeatureType>> {
  public:
-  // 評価関数ファイルに埋め込むハッシュ値
+  // Hash value embedded in the evaluation function file
   static constexpr std::uint32_t kHashValue = FeatureType::kHashValue;
-  // 特徴量の次元数
+  // number of feature dimensions
   static constexpr IndexType kDimensions = FeatureType::kDimensions;
-  // 特徴量のうち、同時に値が1となるインデックスの数の最大値
+  // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
   static constexpr IndexType kMaxActiveDimensions =
       FeatureType::kMaxActiveDimensions;
-  // 差分計算の代わりに全計算を行うタイミングのリスト
+  // List of timings to perform all calculations instead of difference calculation
   using SortedTriggerSet =
       CompileTimeList<TriggerEvent, FeatureType::kRefreshTrigger>;
   static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
 
-  // 特徴量名を取得する
+  // Get the feature quantity name
   static std::string GetName() {
     return FeatureType::kName;
   }
 
  private:
-  // 特徴量のうち、値が1であるインデックスのリストを取得する
+  // Get a list of indices with a value of 1 among the features
   static void CollectActiveIndices(
       const Position& pos, const TriggerEvent trigger, const Color perspective,
       IndexList* const active) {
@@ -223,7 +223,7 @@ class FeatureSet<FeatureType> : public FeatureSetBase<FeatureSet<FeatureType>> {
     }
   }
 
-  // 特徴量のうち、一手前から値が変化したインデックスのリストを取得する
+  // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
   static void CollectChangedIndices(
       const Position& pos, const TriggerEvent trigger, const Color perspective,
       IndexList* const removed, IndexList* const added) {
@@ -232,7 +232,7 @@ class FeatureSet<FeatureType> : public FeatureSetBase<FeatureSet<FeatureType>> {
     }
   }
 
-  // 基底クラスと、自身を再帰的に利用するクラステンプレートをfriendにする
+  // Make the base class and the class template that recursively uses itself a friend
   friend class FeatureSetBase<FeatureSet>;
   template <typename... FeatureTypes>
   friend class FeatureSet;
@@ -246,4 +246,4 @@ class FeatureSet<FeatureType> : public FeatureSetBase<FeatureSet<FeatureType>> {
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/eval/nnue/features/features_common.h b/src/eval/nnue/features/features_common.h
index 15ccb8a7..0031d37b 100644
--- a/src/eval/nnue/features/features_common.h
+++ b/src/eval/nnue/features/features_common.h
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数の入力特徴量の共通ヘッダ
+﻿//Common header of input features of NNUE evaluation function
 
 #ifndef _NNUE_FEATURES_COMMON_H_
 #define _NNUE_FEATURES_COMMON_H_
@@ -14,26 +14,26 @@ namespace NNUE {
 
 namespace Features {
 
-// インデックスリストの型
+// Index list type
 class IndexList;
 
-// 特徴量セットを表すクラステンプレート
+// Class template that represents the feature set
 template <typename... FeatureTypes>
 class FeatureSet;
 
-// 差分計算の代わりに全計算を行うタイミングの種類
+// Type of timing to perform all calculations instead of difference calculation
 enum class TriggerEvent {
-  kNone,             // 可能な場合は常に差分計算する
-  kFriendKingMoved,  // 自玉が移動した場合に全計算する
-  kEnemyKingMoved,   // 敵玉が移動した場合に全計算する
-  kAnyKingMoved,     // どちらかの玉が移動した場合に全計算する
-  kAnyPieceMoved,    // 常に全計算する
+  kNone, // Calculate the difference whenever possible
+  kFriendKingMoved, // calculate all when own ball moves
+  kEnemyKingMoved, // do all calculations when enemy balls move
+  kAnyKingMoved, // do all calculations if either ball moves
+  kAnyPieceMoved, // always do all calculations
 };
 
-// 手番側or相手側
+// turn side or other side
 enum class Side {
-  kFriend,  // 手番側
-  kEnemy,   // 相手側
+  kFriend, // turn side
+  kEnemy, // opponent
 };
 
 }  // namespace Features
@@ -44,4 +44,4 @@ enum class Side {
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/eval/nnue/features/half_kp.cpp b/src/eval/nnue/features/half_kp.cpp
index 5cd95637..72156c82 100644
--- a/src/eval/nnue/features/half_kp.cpp
+++ b/src/eval/nnue/features/half_kp.cpp
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数の入力特徴量HalfKPの定義
+﻿//Definition of input features HalfKP of NNUE evaluation function
 
 #if defined(EVAL_NNUE)
 
@@ -11,13 +11,13 @@ namespace NNUE {
 
 namespace Features {
 
-// 玉の位置とBonaPieceから特徴量のインデックスを求める
+// Find the index of the feature quantity from the ball position and BonaPiece
 template <Side AssociatedKing>
 inline IndexType HalfKP<AssociatedKing>::MakeIndex(Square sq_k, BonaPiece p) {
   return static_cast<IndexType>(fe_end) * static_cast<IndexType>(sq_k) + p;
 }
 
-// 駒の情報を取得する
+// Get the piece information
 template <Side AssociatedKing>
 inline void HalfKP<AssociatedKing>::GetPieces(
     const Position& pos, Color perspective,
@@ -31,7 +31,7 @@ inline void HalfKP<AssociatedKing>::GetPieces(
   *sq_target_k = static_cast<Square>(((*pieces)[target] - f_king) % SQUARE_NB);
 }
 
-// 特徴量のうち、値が1であるインデックスのリストを取得する
+// Get a list of indices with a value of 1 among the features
 template <Side AssociatedKing>
 void HalfKP<AssociatedKing>::AppendActiveIndices(
     const Position& pos, Color perspective, IndexList* active) {
@@ -48,7 +48,7 @@ void HalfKP<AssociatedKing>::AppendActiveIndices(
   }
 }
 
-// 特徴量のうち、一手前から値が変化したインデックスのリストを取得する
+// Get a list of indices whose values ​​have changed from the previous one in the feature quantity
 template <Side AssociatedKing>
 void HalfKP<AssociatedKing>::AppendChangedIndices(
     const Position& pos, Color perspective,
diff --git a/src/eval/nnue/features/half_kp.h b/src/eval/nnue/features/half_kp.h
index 556127d3..65ea46f1 100644
--- a/src/eval/nnue/features/half_kp.h
+++ b/src/eval/nnue/features/half_kp.h
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数の入力特徴量HalfKPの定義
+﻿//Definition of input features HalfKP of NNUE evaluation function
 
 #ifndef _NNUE_FEATURES_HALF_KP_H_
 #define _NNUE_FEATURES_HALF_KP_H_
@@ -14,39 +14,39 @@ namespace NNUE {
 
 namespace Features {
 
-// 特徴量HalfKP：自玉または敵玉の位置と、玉以外の駒の位置の組み合わせ
+// Feature HalfKP: Combination of the position of own ball or enemy ball and the position of pieces other than balls
 template <Side AssociatedKing>
 class HalfKP {
  public:
-  // 特徴量名
+  // feature quantity name
   static constexpr const char* kName =
       (AssociatedKing == Side::kFriend) ? "HalfKP(Friend)" : "HalfKP(Enemy)";
-  // 評価関数ファイルに埋め込むハッシュ値
+  // Hash value embedded in the evaluation function file
   static constexpr std::uint32_t kHashValue =
       0x5D69D5B9u ^ (AssociatedKing == Side::kFriend);
-  // 特徴量の次元数
+  // number of feature dimensions
   static constexpr IndexType kDimensions =
       static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(fe_end);
-  // 特徴量のうち、同時に値が1となるインデックスの数の最大値
+  // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
   static constexpr IndexType kMaxActiveDimensions = PIECE_NUMBER_KING;
-  // 差分計算の代わりに全計算を行うタイミング
+  // Timing of full calculation instead of difference calculation
   static constexpr TriggerEvent kRefreshTrigger =
       (AssociatedKing == Side::kFriend) ?
       TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
 
-  // 特徴量のうち、値が1であるインデックスのリストを取得する
+  // Get a list of indices with a value of 1 among the features
   static void AppendActiveIndices(const Position& pos, Color perspective,
                                   IndexList* active);
 
-  // 特徴量のうち、一手前から値が変化したインデックスのリストを取得する
+  // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
   static void AppendChangedIndices(const Position& pos, Color perspective,
                                    IndexList* removed, IndexList* added);
 
-  // 玉の位置とBonaPieceから特徴量のインデックスを求める
+  // Find the index of the feature quantity from the ball position and BonaPiece
   static IndexType MakeIndex(Square sq_k, BonaPiece p);
 
  private:
-  // 駒の情報を取得する
+  // Get the piece information
   static void GetPieces(const Position& pos, Color perspective,
                         BonaPiece** pieces, Square* sq_target_k);
 };
@@ -59,4 +59,4 @@ class HalfKP {
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/eval/nnue/features/half_relative_kp.cpp b/src/eval/nnue/features/half_relative_kp.cpp
index d62beea0..623b839c 100644
--- a/src/eval/nnue/features/half_relative_kp.cpp
+++ b/src/eval/nnue/features/half_relative_kp.cpp
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数の入力特徴量HalfRelativeKPの定義
+﻿//Definition of input features HalfRelativeKP of NNUE evaluation function
 
 #if defined(EVAL_NNUE)
 
@@ -11,7 +11,7 @@ namespace NNUE {
 
 namespace Features {
 
-// 玉の位置とBonaPieceから特徴量のインデックスを求める
+// Find the index of the feature quantity from the ball position and BonaPiece
 template <Side AssociatedKing>
 inline IndexType HalfRelativeKP<AssociatedKing>::MakeIndex(
     Square sq_k, BonaPiece p) {
@@ -24,7 +24,7 @@ inline IndexType HalfRelativeKP<AssociatedKing>::MakeIndex(
   return H * W * piece_index + H * relative_file + relative_rank;
 }
 
-// 駒の情報を取得する
+// Get the piece information
 template <Side AssociatedKing>
 inline void HalfRelativeKP<AssociatedKing>::GetPieces(
     const Position& pos, Color perspective,
@@ -38,11 +38,11 @@ inline void HalfRelativeKP<AssociatedKing>::GetPieces(
   *sq_target_k = static_cast<Square>(((*pieces)[target] - f_king) % SQUARE_NB);
 }
 
-// 特徴量のうち、値が1であるインデックスのリストを取得する
+// Get a list of indices with a value of 1 among the features
 template <Side AssociatedKing>
 void HalfRelativeKP<AssociatedKing>::AppendActiveIndices(
     const Position& pos, Color perspective, IndexList* active) {
-  // コンパイラの警告を回避するため、配列サイズが小さい場合は何もしない
+  // do nothing if array size is small to avoid compiler warning
   if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
 
   BonaPiece* pieces;
@@ -57,7 +57,7 @@ void HalfRelativeKP<AssociatedKing>::AppendActiveIndices(
   }
 }
 
-// 特徴量のうち、一手前から値が変化したインデックスのリストを取得する
+// Get a list of indices whose values ​​have changed from the previous one in the feature quantity
 template <Side AssociatedKing>
 void HalfRelativeKP<AssociatedKing>::AppendChangedIndices(
     const Position& pos, Color perspective,
diff --git a/src/eval/nnue/features/half_relative_kp.h b/src/eval/nnue/features/half_relative_kp.h
index 99e10c57..f6ca5cc0 100644
--- a/src/eval/nnue/features/half_relative_kp.h
+++ b/src/eval/nnue/features/half_relative_kp.h
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数の入力特徴量HalfRelativeKPの定義
+﻿//Definition of input features HalfRelativeKP of NNUE evaluation function
 
 #ifndef _NNUE_FEATURES_HALF_RELATIVE_KP_H_
 #define _NNUE_FEATURES_HALF_RELATIVE_KP_H_
@@ -14,45 +14,45 @@ namespace NNUE {
 
 namespace Features {
 
-// 特徴量HalfRelativeKP：自玉または敵玉を基準とした、玉以外の各駒の相対位置
+// Feature HalfRelativeKP: Relative position of each piece other than the ball based on own ball or enemy ball
 template <Side AssociatedKing>
 class HalfRelativeKP {
  public:
-  // 特徴量名
+  // feature quantity name
   static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
       "HalfRelativeKP(Friend)" : "HalfRelativeKP(Enemy)";
-  // 評価関数ファイルに埋め込むハッシュ値
+  // Hash value embedded in the evaluation function file
   static constexpr std::uint32_t kHashValue =
       0xF9180919u ^ (AssociatedKing == Side::kFriend);
-  // 玉を除いた駒種
+  // Piece type excluding balls
   static constexpr IndexType kNumPieceKinds = (fe_end - fe_hand_end) / SQUARE_NB;
-  // 玉を中央に置いた仮想的な盤の幅
+  // width of the virtual board with the ball in the center
   static constexpr IndexType kBoardWidth = FILE_NB * 2 - 1;
-  // 玉を中央に置いた仮想的な盤の高さ
+  // height of a virtual board with balls in the center
   static constexpr IndexType kBoardHeight = RANK_NB * 2 - 1;
-  // 特徴量の次元数
+  // number of feature dimensions
   static constexpr IndexType kDimensions =
       kNumPieceKinds * kBoardHeight * kBoardWidth;
-  // 特徴量のうち、同時に値が1となるインデックスの数の最大値
+  // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
   static constexpr IndexType kMaxActiveDimensions = PIECE_NUMBER_KING;
-  // 差分計算の代わりに全計算を行うタイミング
+  // Timing of full calculation instead of difference calculation
   static constexpr TriggerEvent kRefreshTrigger =
       (AssociatedKing == Side::kFriend) ?
       TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
 
-  // 特徴量のうち、値が1であるインデックスのリストを取得する
+  // Get a list of indices with a value of 1 among the features
   static void AppendActiveIndices(const Position& pos, Color perspective,
                                   IndexList* active);
 
-  // 特徴量のうち、一手前から値が変化したインデックスのリストを取得する
+  // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
   static void AppendChangedIndices(const Position& pos, Color perspective,
                                    IndexList* removed, IndexList* added);
 
-  // 玉の位置とBonaPieceから特徴量のインデックスを求める
+  // Find the index of the feature quantity from the ball position and BonaPiece
   static IndexType MakeIndex(Square sq_k, BonaPiece p);
 
  private:
-  // 駒の情報を取得する
+  // Get the piece information
   static void GetPieces(const Position& pos, Color perspective,
                         BonaPiece** pieces, Square* sq_target_k);
 };
@@ -65,4 +65,4 @@ class HalfRelativeKP {
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/eval/nnue/features/index_list.h b/src/eval/nnue/features/index_list.h
index a5a71011..90317b4c 100644
--- a/src/eval/nnue/features/index_list.h
+++ b/src/eval/nnue/features/index_list.h
@@ -1,4 +1,4 @@
-﻿// 入力特徴量のインデックスリストの定義
+﻿// Definition of index list of input features
 
 #ifndef _NNUE_FEATURES_INDEX_LIST_H_
 #define _NNUE_FEATURES_INDEX_LIST_H_
@@ -14,7 +14,7 @@ namespace NNUE {
 
 namespace Features {
 
-// 特徴量のインデックスリストに使うクラステンプレート
+// Class template used for feature index list
 template <typename T, std::size_t MaxSize>
 class ValueList {
  public:
@@ -39,7 +39,7 @@ class ValueList {
   std::size_t size_ = 0;
 };
 
-// 特徴量のインデックスリストの型
+//Type of feature index list
 class IndexList
     : public ValueList<IndexType, RawFeatures::kMaxActiveDimensions> {
 };
@@ -52,4 +52,4 @@ class IndexList
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/eval/nnue/features/k.cpp b/src/eval/nnue/features/k.cpp
index 03f66ff5..dc01eb92 100644
--- a/src/eval/nnue/features/k.cpp
+++ b/src/eval/nnue/features/k.cpp
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数の入力特徴量Kの定義
+﻿//Definition of input feature quantity K of NNUE evaluation function
 
 #if defined(EVAL_NNUE)
 
@@ -11,10 +11,10 @@ namespace NNUE {
 
 namespace Features {
 
-// 特徴量のうち、値が1であるインデックスのリストを取得する
+// Get a list of indices with a value of 1 among the features
 void K::AppendActiveIndices(
     const Position& pos, Color perspective, IndexList* active) {
-  // コンパイラの警告を回避するため、配列サイズが小さい場合は何もしない
+  // do nothing if array size is small to avoid compiler warning
   if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
 
   const BonaPiece* pieces = (perspective == BLACK) ?
@@ -27,7 +27,7 @@ void K::AppendActiveIndices(
   }
 }
 
-// 特徴量のうち、一手前から値が変化したインデックスのリストを取得する
+// Get a list of indices whose values ​​have changed from the previous one in the feature quantity
 void K::AppendChangedIndices(
     const Position& pos, Color perspective,
     IndexList* removed, IndexList* added) {
diff --git a/src/eval/nnue/features/k.h b/src/eval/nnue/features/k.h
index 1a01c471..0930c160 100644
--- a/src/eval/nnue/features/k.h
+++ b/src/eval/nnue/features/k.h
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数の入力特徴量Kの定義
+﻿//Definition of input feature quantity K of NNUE evaluation function
 
 #ifndef _NNUE_FEATURES_K_H_
 #define _NNUE_FEATURES_K_H_
@@ -14,25 +14,25 @@ namespace NNUE {
 
 namespace Features {
 
-// 特徴量K：玉の位置
+// Feature K: Ball position
 class K {
  public:
-  // 特徴量名
+  // feature quantity name
   static constexpr const char* kName = "K";
-  // 評価関数ファイルに埋め込むハッシュ値
+  // Hash value embedded in the evaluation function file
   static constexpr std::uint32_t kHashValue = 0xD3CEE169u;
-  // 特徴量の次元数
+  // number of feature dimensions
   static constexpr IndexType kDimensions = SQUARE_NB * 2;
-  // 特徴量のうち、同時に値が1となるインデックスの数の最大値
+  // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
   static constexpr IndexType kMaxActiveDimensions = 2;
-  // 差分計算の代わりに全計算を行うタイミング
+  // Timing of full calculation instead of difference calculation
   static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
 
-  // 特徴量のうち、値が1であるインデックスのリストを取得する
+  // Get a list of indices with a value of 1 among the features
   static void AppendActiveIndices(const Position& pos, Color perspective,
                                   IndexList* active);
 
-  // 特徴量のうち、一手前から値が変化したインデックスのリストを取得する
+  // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
   static void AppendChangedIndices(const Position& pos, Color perspective,
                                    IndexList* removed, IndexList* added);
 };
@@ -45,4 +45,4 @@ class K {
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/eval/nnue/features/p.cpp b/src/eval/nnue/features/p.cpp
index 56bca0a4..68527119 100644
--- a/src/eval/nnue/features/p.cpp
+++ b/src/eval/nnue/features/p.cpp
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数の入力特徴量Pの定義
+﻿//Definition of input feature P of NNUE evaluation function
 
 #if defined(EVAL_NNUE)
 
@@ -11,10 +11,10 @@ namespace NNUE {
 
 namespace Features {
 
-// 特徴量のうち、値が1であるインデックスのリストを取得する
+// Get a list of indices with a value of 1 among the features
 void P::AppendActiveIndices(
     const Position& pos, Color perspective, IndexList* active) {
-  // コンパイラの警告を回避するため、配列サイズが小さい場合は何もしない
+  // do nothing if array size is small to avoid compiler warning
   if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
 
   const BonaPiece* pieces = (perspective == BLACK) ?
@@ -27,7 +27,7 @@ void P::AppendActiveIndices(
   }
 }
 
-// 特徴量のうち、一手前から値が変化したインデックスのリストを取得する
+// Get a list of indices whose values ​​have changed from the previous one in the feature quantity
 void P::AppendChangedIndices(
     const Position& pos, Color perspective,
     IndexList* removed, IndexList* added) {
diff --git a/src/eval/nnue/features/p.h b/src/eval/nnue/features/p.h
index 77ea882d..ded678a5 100644
--- a/src/eval/nnue/features/p.h
+++ b/src/eval/nnue/features/p.h
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数の入力特徴量Pの定義
+﻿//Definition of input feature P of NNUE evaluation function
 
 #ifndef _NNUE_FEATURES_P_H_
 #define _NNUE_FEATURES_P_H_
@@ -14,25 +14,25 @@ namespace NNUE {
 
 namespace Features {
 
-// 特徴量P：玉以外の駒のBonaPiece
+// Feature P: BonaPiece of pieces other than balls
 class P {
  public:
-  // 特徴量名
+  // feature quantity name
   static constexpr const char* kName = "P";
-  // 評価関数ファイルに埋め込むハッシュ値
+  // Hash value embedded in the evaluation function file
   static constexpr std::uint32_t kHashValue = 0x764CFB4Bu;
-  // 特徴量の次元数
+  // number of feature dimensions
   static constexpr IndexType kDimensions = fe_end;
-  // 特徴量のうち、同時に値が1となるインデックスの数の最大値
+  // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
   static constexpr IndexType kMaxActiveDimensions = PIECE_NUMBER_KING;
-  // 差分計算の代わりに全計算を行うタイミング
+  // Timing of full calculation instead of difference calculation
   static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
 
-  // 特徴量のうち、値が1であるインデックスのリストを取得する
+  // Get a list of indices with a value of 1 among the features
   static void AppendActiveIndices(const Position& pos, Color perspective,
                                   IndexList* active);
 
-  // 特徴量のうち、一手前から値が変化したインデックスのリストを取得する
+  // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
   static void AppendChangedIndices(const Position& pos, Color perspective,
                                    IndexList* removed, IndexList* added);
 };
@@ -45,4 +45,4 @@ class P {
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/eval/nnue/layers/affine_transform.h b/src/eval/nnue/layers/affine_transform.h
index d8101ba4..99dae0fe 100644
--- a/src/eval/nnue/layers/affine_transform.h
+++ b/src/eval/nnue/layers/affine_transform.h
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数の層AffineTransformの定義
+﻿// Definition of layer AffineTransform of NNUE evaluation function
 
 #ifndef _NNUE_LAYERS_AFFINE_TRANSFORM_H_
 #define _NNUE_LAYERS_AFFINE_TRANSFORM_H_
@@ -13,31 +13,31 @@ namespace NNUE {
 
 namespace Layers {
 
-// アフィン変換層
+// affine transformation layer
 template <typename PreviousLayer, IndexType OutputDimensions>
 class AffineTransform {
  public:
-  // 入出力の型
+  // Input/output type
   using InputType = typename PreviousLayer::OutputType;
   using OutputType = std::int32_t;
   static_assert(std::is_same<InputType, std::uint8_t>::value, "");
 
-  // 入出力の次元数
+  // number of input/output dimensions
   static constexpr IndexType kInputDimensions =
       PreviousLayer::kOutputDimensions;
   static constexpr IndexType kOutputDimensions = OutputDimensions;
   static constexpr IndexType kPaddedInputDimensions =
       CeilToMultiple<IndexType>(kInputDimensions, kMaxSimdWidth);
 
-  // この層で使用する順伝播用バッファのサイズ
+  // Size of forward propagation buffer used in this layer
   static constexpr std::size_t kSelfBufferSize =
       CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
 
-  // 入力層からこの層までで使用する順伝播用バッファのサイズ
+  // Size of the forward propagation buffer used from the input layer to this layer
   static constexpr std::size_t kBufferSize =
       PreviousLayer::kBufferSize + kSelfBufferSize;
 
-  // 評価関数ファイルに埋め込むハッシュ値
+  // Hash value embedded in the evaluation function file
   static constexpr std::uint32_t GetHashValue() {
     std::uint32_t hash_value = 0xCC03DAE4u;
     hash_value += kOutputDimensions;
@@ -46,7 +46,7 @@ class AffineTransform {
     return hash_value;
   }
 
-  // 入力層からこの層までの構造を表す文字列
+  // A string that represents the structure from the input layer to this layer
   static std::string GetStructureString() {
     return "AffineTransform[" +
         std::to_string(kOutputDimensions) + "<-" +
@@ -54,7 +54,7 @@ class AffineTransform {
         PreviousLayer::GetStructureString() + ")";
   }
 
-  // パラメータを読み込む
+  // read parameters
   bool ReadParameters(std::istream& stream) {
     if (!previous_layer_.ReadParameters(stream)) return false;
     stream.read(reinterpret_cast<char*>(biases_),
@@ -65,7 +65,7 @@ class AffineTransform {
     return !stream.fail();
   }
 
-  // パラメータを書き込む
+  // write parameters
   bool WriteParameters(std::ostream& stream) const {
     if (!previous_layer_.WriteParameters(stream)) return false;
     stream.write(reinterpret_cast<const char*>(biases_),
@@ -76,7 +76,7 @@ class AffineTransform {
     return !stream.fail();
   }
 
-  // 順伝播
+  // forward propagation
   const OutputType* Propagate(
       const TransformedFeatureType* transformed_features, char* buffer) const {
     const auto input = previous_layer_.Propagate(
@@ -151,17 +151,17 @@ class AffineTransform {
   }
 
  private:
-  // パラメータの型
+  // parameter type
   using BiasType = OutputType;
   using WeightType = std::int8_t;
 
-  // 学習用クラスをfriendにする
+  // Make the learning class a friend
   friend class Trainer<AffineTransform>;
 
-  // この層の直前の層
+  // the layer immediately before this layer
   PreviousLayer previous_layer_;
 
-  // パラメータ
+  // parameter
   alignas(kCacheLineSize) BiasType biases_[kOutputDimensions];
   alignas(kCacheLineSize)
       WeightType weights_[kOutputDimensions * kPaddedInputDimensions];
@@ -175,4 +175,4 @@ class AffineTransform {
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/eval/nnue/layers/clipped_relu.h b/src/eval/nnue/layers/clipped_relu.h
index 5877fc32..1b7e8fc1 100644
--- a/src/eval/nnue/layers/clipped_relu.h
+++ b/src/eval/nnue/layers/clipped_relu.h
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数の層ClippedReLUの定義
+﻿// Definition of layer ClippedReLU of NNUE evaluation function
 
 #ifndef _NNUE_LAYERS_CLIPPED_RELU_H_
 #define _NNUE_LAYERS_CLIPPED_RELU_H_
@@ -17,49 +17,49 @@ namespace Layers {
 template <typename PreviousLayer>
 class ClippedReLU {
  public:
-  // 入出力の型
+  // Input/output type
   using InputType = typename PreviousLayer::OutputType;
   using OutputType = std::uint8_t;
   static_assert(std::is_same<InputType, std::int32_t>::value, "");
 
-  // 入出力の次元数
+  // number of input/output dimensions
   static constexpr IndexType kInputDimensions =
       PreviousLayer::kOutputDimensions;
   static constexpr IndexType kOutputDimensions = kInputDimensions;
 
-  // この層で使用する順伝播用バッファのサイズ
+  // Size of forward propagation buffer used in this layer
   static constexpr std::size_t kSelfBufferSize =
       CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
 
-  // 入力層からこの層までで使用する順伝播用バッファのサイズ
+  // Size of the forward propagation buffer used from the input layer to this layer
   static constexpr std::size_t kBufferSize =
       PreviousLayer::kBufferSize + kSelfBufferSize;
 
-  // 評価関数ファイルに埋め込むハッシュ値
+  // Hash value embedded in the evaluation function file
   static constexpr std::uint32_t GetHashValue() {
     std::uint32_t hash_value = 0x538D24C7u;
     hash_value += PreviousLayer::GetHashValue();
     return hash_value;
   }
 
-  // 入力層からこの層までの構造を表す文字列
+  // A string that represents the structure from the input layer to this layer
   static std::string GetStructureString() {
     return "ClippedReLU[" +
         std::to_string(kOutputDimensions) + "](" +
         PreviousLayer::GetStructureString() + ")";
   }
 
-  // パラメータを読み込む
+  // read parameters
   bool ReadParameters(std::istream& stream) {
     return previous_layer_.ReadParameters(stream);
   }
 
-  // パラメータを書き込む
+  // write parameters
   bool WriteParameters(std::ostream& stream) const {
     return previous_layer_.WriteParameters(stream);
   }
 
-  // 順伝播
+  // forward propagation
   const OutputType* Propagate(
       const TransformedFeatureType* transformed_features, char* buffer) const {
     const auto input = previous_layer_.Propagate(
@@ -150,10 +150,10 @@ class ClippedReLU {
   }
 
  private:
-  // 学習用クラスをfriendにする
+  // Make the learning class a friend
   friend class Trainer<ClippedReLU>;
 
-  // この層の直前の層
+  // the layer immediately before this layer
   PreviousLayer previous_layer_;
 };
 
@@ -165,4 +165,4 @@ class ClippedReLU {
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/eval/nnue/layers/input_slice.h b/src/eval/nnue/layers/input_slice.h
index c9c6a7c9..0497e769 100644
--- a/src/eval/nnue/layers/input_slice.h
+++ b/src/eval/nnue/layers/input_slice.h
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数の層InputSliceの定義
+﻿// NNUE evaluation function layer InputSlice definition
 
 #ifndef _NNUE_LAYERS_INPUT_SLICE_H_
 #define _NNUE_LAYERS_INPUT_SLICE_H_
@@ -13,47 +13,47 @@ namespace NNUE {
 
 namespace Layers {
 
-// 入力層
+// input layer
 template <IndexType OutputDimensions, IndexType Offset = 0>
 class InputSlice {
  public:
-  // アライメントを維持する必要がある
+  // need to maintain alignment
   static_assert(Offset % kMaxSimdWidth == 0, "");
 
-  // 出力の型
+  // output type
   using OutputType = TransformedFeatureType;
 
-  // 出力の次元数
+  // output dimensionality
   static constexpr IndexType kOutputDimensions = OutputDimensions;
 
-  // 入力層からこの層までで使用する順伝播用バッファのサイズ
+  // Size of the forward propagation buffer used from the input layer to this layer
   static constexpr std::size_t kBufferSize = 0;
 
-  // 評価関数ファイルに埋め込むハッシュ値
+  // Hash value embedded in the evaluation function file
   static constexpr std::uint32_t GetHashValue() {
     std::uint32_t hash_value = 0xEC42E90Du;
     hash_value ^= kOutputDimensions ^ (Offset << 10);
     return hash_value;
   }
 
-  // 入力層からこの層までの構造を表す文字列
+  // A string that represents the structure from the input layer to this layer
   static std::string GetStructureString() {
     return "InputSlice[" + std::to_string(kOutputDimensions) + "(" +
         std::to_string(Offset) + ":" +
         std::to_string(Offset + kOutputDimensions) + ")]";
   }
 
-  // パラメータを読み込む
+  // read parameters
   bool ReadParameters(std::istream& /*stream*/) {
     return true;
   }
 
-  // パラメータを書き込む
+  // write parameters
   bool WriteParameters(std::ostream& /*stream*/) const {
     return true;
   }
 
-  // 順伝播
+  // forward propagation
   const OutputType* Propagate(
       const TransformedFeatureType* transformed_features,
       char* /*buffer*/) const {
@@ -71,4 +71,4 @@ class InputSlice {
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/eval/nnue/layers/sum.h b/src/eval/nnue/layers/sum.h
index 3fe000cf..c64852a1 100644
--- a/src/eval/nnue/layers/sum.h
+++ b/src/eval/nnue/layers/sum.h
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数の層Sumの定義
+﻿// Definition of layer Sum of NNUE evaluation function
 
 #ifndef _NNUE_LAYERS_SUM_H_
 #define _NNUE_LAYERS_SUM_H_
@@ -13,7 +13,7 @@ namespace NNUE {
 
 namespace Layers {
 
-// 複数の層の出力の和を取る層
+// Layer that sums the output of multiple layers
 template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
 class Sum : public Sum<RemainingPreviousLayers...> {
  private:
@@ -21,25 +21,25 @@ class Sum : public Sum<RemainingPreviousLayers...> {
   using Tail = Sum<RemainingPreviousLayers...>;
 
  public:
-  // 入出力の型
+  // Input/output type
   using InputType = typename Head::OutputType;
   using OutputType = InputType;
   static_assert(std::is_same<InputType, typename Tail::InputType>::value, "");
 
-  // 入出力の次元数
+  // number of input/output dimensions
   static constexpr IndexType kInputDimensions = Head::kOutputDimensions;
   static constexpr IndexType kOutputDimensions = kInputDimensions;
-  static_assert(kInputDimensions == Tail::kInputDimensions , "");
+  static_assert(kInputDimensions == Tail::kInputDimensions ,"");
 
-  // この層で使用する順伝播用バッファのサイズ
+  // Size of forward propagation buffer used in this layer
   static constexpr std::size_t kSelfBufferSize =
       CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
 
-  // 入力層からこの層までで使用する順伝播用バッファのサイズ
+  // Size of the forward propagation buffer used from the input layer to this layer
   static constexpr std::size_t kBufferSize =
       std::max(Head::kBufferSize + kSelfBufferSize, Tail::kBufferSize);
 
-  // 評価関数ファイルに埋め込むハッシュ値
+  // Hash value embedded in the evaluation function file
   static constexpr std::uint32_t GetHashValue() {
     std::uint32_t hash_value = 0xBCE400B4u;
     hash_value ^= Head::GetHashValue() >> 1;
@@ -49,67 +49,67 @@ class Sum : public Sum<RemainingPreviousLayers...> {
     return hash_value;
   }
 
-  // 入力層からこの層までの構造を表す文字列
+  // A string that represents the structure from the input layer to this layer
   static std::string GetStructureString() {
     return "Sum[" +
         std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
   }
 
-  // パラメータを読み込む
+  // read parameters
   bool ReadParameters(std::istream& stream) {
     if (!Tail::ReadParameters(stream)) return false;
     return previous_layer_.ReadParameters(stream);
   }
 
-  // パラメータを書き込む
+  // write parameters
   bool WriteParameters(std::ostream& stream) const {
     if (!Tail::WriteParameters(stream)) return false;
     return previous_layer_.WriteParameters(stream);
   }
 
-  // 順伝播
+  // forward propagation
   const OutputType* Propagate(
       const TransformedFeatureType* transformed_features, char* buffer) const {
     Tail::Propagate(transformed_features, buffer);
     const auto head_output = previous_layer_.Propagate(
         transformed_features, buffer + kSelfBufferSize);
     const auto output = reinterpret_cast<OutputType*>(buffer);
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
+    for (IndexType i = 0; i <kOutputDimensions; ++i) {
       output[i] += head_output[i];
     }
     return output;
   }
 
  protected:
-  // 和を取る対象となる層のリストを表す文字列
+  // A string that represents the list of layers to be summed
   static std::string GetSummandsString() {
     return Head::GetStructureString() + "," + Tail::GetSummandsString();
   }
 
-  // 学習用クラスをfriendにする
+  // Make the learning class a friend
   friend class Trainer<Sum>;
 
-  // この層の直前の層
+  // the layer immediately before this layer
   FirstPreviousLayer previous_layer_;
 };
 
-// 複数の層の出力の和を取る層（テンプレート引数が1つの場合）
+// Layer that sums the output of multiple layers (when there is one template argument)
 template <typename PreviousLayer>
 class Sum<PreviousLayer> {
  public:
-  // 入出力の型
+  // Input/output type
   using InputType = typename PreviousLayer::OutputType;
   using OutputType = InputType;
 
-  // 入出力の次元数
+  // number of input/output dimensions
   static constexpr IndexType kInputDimensions =
       PreviousLayer::kOutputDimensions;
   static constexpr IndexType kOutputDimensions = kInputDimensions;
 
-  // 入力層からこの層までで使用する順伝播用バッファのサイズ
+  // Size of the forward propagation buffer used from the input layer to this layer
   static constexpr std::size_t kBufferSize = PreviousLayer::kBufferSize;
 
-  // 評価関数ファイルに埋め込むハッシュ値
+  // Hash value embedded in the evaluation function file
   static constexpr std::uint32_t GetHashValue() {
     std::uint32_t hash_value = 0xBCE400B4u;
     hash_value ^= PreviousLayer::GetHashValue() >> 1;
@@ -117,38 +117,38 @@ class Sum<PreviousLayer> {
     return hash_value;
   }
 
-  // 入力層からこの層までの構造を表す文字列
+  // A string that represents the structure from the input layer to this layer
   static std::string GetStructureString() {
     return "Sum[" +
         std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
   }
 
-  // パラメータを読み込む
+  // read parameters
   bool ReadParameters(std::istream& stream) {
     return previous_layer_.ReadParameters(stream);
   }
 
-  // パラメータを書き込む
+  // write parameters
   bool WriteParameters(std::ostream& stream) const {
     return previous_layer_.WriteParameters(stream);
   }
 
-  // 順伝播
+  // forward propagation
   const OutputType* Propagate(
       const TransformedFeatureType* transformed_features, char* buffer) const {
     return previous_layer_.Propagate(transformed_features, buffer);
   }
 
  protected:
-  // 和を取る対象となる層のリストを表す文字列
+  // A string that represents the list of layers to be summed
   static std::string GetSummandsString() {
     return PreviousLayer::GetStructureString();
   }
 
-  // 学習用クラスをfriendにする
+  // Make the learning class a friend
   friend class Trainer<Sum>;
 
-  // この層の直前の層
+  // the layer immediately before this layer
   PreviousLayer previous_layer_;
 };
 
@@ -160,4 +160,4 @@ class Sum<PreviousLayer> {
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/eval/nnue/nnue_accumulator.h b/src/eval/nnue/nnue_accumulator.h
index 4241edb3..e480526b 100644
--- a/src/eval/nnue/nnue_accumulator.h
+++ b/src/eval/nnue/nnue_accumulator.h
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数の差分計算用のクラス
+﻿// Class for difference calculation of NNUE evaluation function
 
 #ifndef _NNUE_ACCUMULATOR_H_
 #define _NNUE_ACCUMULATOR_H_
@@ -11,8 +11,8 @@ namespace Eval {
 
 namespace NNUE {
 
-// 入力特徴量をアフィン変換した結果を保持するクラス
-// 最終的な出力である評価値も一緒に持たせておく
+// Class that holds the result of affine transformation of input features
+// Keep the evaluation value that is the final output together
 struct alignas(32) Accumulator {
   std::int16_t
       accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];
@@ -27,4 +27,4 @@ struct alignas(32) Accumulator {
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/eval/nnue/nnue_architecture.h b/src/eval/nnue/nnue_architecture.h
index 7479ac0a..aa4e8c7f 100644
--- a/src/eval/nnue/nnue_architecture.h
+++ b/src/eval/nnue/nnue_architecture.h
@@ -1,11 +1,11 @@
-﻿// NNUE評価関数で用いる入力特徴量とネットワーク構造
+﻿// Input features and network structure used in NNUE evaluation function
 
 #ifndef _NNUE_ARCHITECTURE_H_
 #define _NNUE_ARCHITECTURE_H_
 
 #if defined(EVAL_NNUE)
 
-// 入力特徴量とネットワーク構造が定義されたヘッダをincludeする
+// include a header that defines the input features and network structure
 //#include "architectures/k-p_256x2-32-32.h"
 //#include "architectures/k-p-cr_256x2-32-32.h"
 //#include "architectures/k-p-cr-ep_256x2-32-32.h"
@@ -20,7 +20,7 @@ static_assert(kTransformedFeatureDimensions % kMaxSimdWidth == 0, "");
 static_assert(Network::kOutputDimensions == 1, "");
 static_assert(std::is_same<Network::OutputType, std::int32_t>::value, "");
 
-// 差分計算の代わりに全計算を行うタイミングのリスト
+// List of timings to perform all calculations instead of difference calculation
 constexpr auto kRefreshTriggers = RawFeatures::kRefreshTriggers;
 
 }  // namespace NNUE
@@ -29,4 +29,4 @@ constexpr auto kRefreshTriggers = RawFeatures::kRefreshTriggers;
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/eval/nnue/nnue_common.h b/src/eval/nnue/nnue_common.h
index 8ef8fee4..502fbc05 100644
--- a/src/eval/nnue/nnue_common.h
+++ b/src/eval/nnue/nnue_common.h
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数で用いる定数など
+﻿// Constants used in NNUE evaluation function
 
 #ifndef _NNUE_COMMON_H_
 #define _NNUE_COMMON_H_
@@ -15,17 +15,17 @@ namespace Eval {
 
 namespace NNUE {
 
-// 評価関数ファイルのバージョンを表す定数
+// A constant that represents the version of the evaluation function file
 constexpr std::uint32_t kVersion = 0x7AF32F16u;
 
-// 評価値の計算で利用する定数
+// Constant used in evaluation value calculation
 constexpr int FV_SCALE = 16;
 constexpr int kWeightScaleBits = 6;
 
-// キャッシュラインのサイズ（バイト単位）
+// Size of cache line (in bytes)
 constexpr std::size_t kCacheLineSize = 64;
 
-// SIMD幅（バイト単位）
+// SIMD width (in bytes)
 #if defined(USE_AVX2)
 constexpr std::size_t kSimdWidth = 32;
 #elif defined(USE_SSE2)
@@ -35,17 +35,17 @@ constexpr std::size_t kSimdWidth = 16;
 #endif
 constexpr std::size_t kMaxSimdWidth = 32;
 
-// 変換後の入力特徴量の型
+// Type of input feature after conversion
 using TransformedFeatureType = std::uint8_t;
 
-// インデックスの型
+// index type
 using IndexType = std::uint32_t;
 
-// 学習用クラステンプレートの前方宣言
+// Forward declaration of learning class template
 template <typename Layer>
 class Trainer;
 
-// n以上で最小のbaseの倍数を求める
+// find the smallest multiple of n and above
 template <typename IntType>
 constexpr IntType CeilToMultiple(IntType n, IntType base) {
   return (n + base - 1) / base * base;
diff --git a/src/eval/nnue/nnue_feature_transformer.h b/src/eval/nnue/nnue_feature_transformer.h
index 57d25310..039a0b98 100644
--- a/src/eval/nnue/nnue_feature_transformer.h
+++ b/src/eval/nnue/nnue_feature_transformer.h
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数の入力特徴量の変換を行うクラス
+﻿// A class that converts the input features of the NNUE evaluation function
 
 #ifndef _NNUE_FEATURE_TRANSFORMER_H_
 #define _NNUE_FEATURE_TRANSFORMER_H_
@@ -15,37 +15,37 @@ namespace Eval {
 
 namespace NNUE {
 
-// 入力特徴量変換器
+// Input feature converter
 class FeatureTransformer {
  private:
-  // 片側分の出力の次元数
+  // number of output dimensions for one side
   static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;
 
  public:
-  // 出力の型
+  // output type
   using OutputType = TransformedFeatureType;
 
-  // 入出力の次元数
+  // number of input/output dimensions
   static constexpr IndexType kInputDimensions = RawFeatures::kDimensions;
   static constexpr IndexType kOutputDimensions = kHalfDimensions * 2;
 
-  // 順伝播用バッファのサイズ
+  // size of forward propagation buffer
   static constexpr std::size_t kBufferSize =
       kOutputDimensions * sizeof(OutputType);
 
-  // 評価関数ファイルに埋め込むハッシュ値
+  // Hash value embedded in the evaluation function file
   static constexpr std::uint32_t GetHashValue() {
     return RawFeatures::kHashValue ^ kOutputDimensions;
   }
 
-  // 構造を表す文字列
+  // a string representing the structure
   static std::string GetStructureString() {
     return RawFeatures::GetName() + "[" +
         std::to_string(kInputDimensions) + "->" +
         std::to_string(kHalfDimensions) + "x2]";
   }
 
-  // パラメータを読み込む
+  // read parameters
   bool ReadParameters(std::istream& stream) {
     stream.read(reinterpret_cast<char*>(biases_),
                 kHalfDimensions * sizeof(BiasType));
@@ -54,7 +54,7 @@ class FeatureTransformer {
     return !stream.fail();
   }
 
-  // パラメータを書き込む
+  // write parameters
   bool WriteParameters(std::ostream& stream) const {
     stream.write(reinterpret_cast<const char*>(biases_),
                  kHalfDimensions * sizeof(BiasType));
@@ -63,7 +63,7 @@ class FeatureTransformer {
     return !stream.fail();
   }
 
-  // 可能なら差分計算を進める
+  // proceed with the difference calculation if possible
   bool UpdateAccumulatorIfPossible(const Position& pos) const {
     const auto now = pos.state();
     if (now->accumulator.computed_accumulation) {
@@ -77,7 +77,7 @@ class FeatureTransformer {
     return false;
   }
 
-  // 入力特徴量を変換する
+  // convert input features
   void Transform(const Position& pos, OutputType* output, bool refresh) const {
     if (refresh || !UpdateAccumulatorIfPossible(pos)) {
       RefreshAccumulator(pos);
@@ -174,7 +174,7 @@ class FeatureTransformer {
   }
 
  private:
-  // 差分計算を用いずに累積値を計算する
+  // Calculate cumulative value without using difference calculation
   void RefreshAccumulator(const Position& pos) const {
     auto& accumulator = pos.state()->accumulator;
     for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
@@ -232,7 +232,7 @@ class FeatureTransformer {
     accumulator.computed_score = false;
   }
 
-  // 差分計算を用いて累積値を計算する
+  // Calculate cumulative value using difference calculation
   void UpdateAccumulator(const Position& pos) const {
     const auto prev_accumulator = pos.state()->previous->accumulator;
     auto& accumulator = pos.state()->accumulator;
@@ -263,7 +263,7 @@ class FeatureTransformer {
             std::memset(accumulator.accumulation[perspective][i], 0,
                         kHalfDimensions * sizeof(BiasType));
           }
-        } else {  // 1から0に変化した特徴量に関する差分計算
+        } else {// Difference calculation for the feature amount changed from 1 to 0
           std::memcpy(accumulator.accumulation[perspective][i],
                       prev_accumulator.accumulation[perspective][i],
                       kHalfDimensions * sizeof(BiasType));
@@ -292,7 +292,7 @@ class FeatureTransformer {
 #endif
           }
         }
-        {  // 0から1に変化した特徴量に関する差分計算
+        {// Difference calculation for features that changed from 0 to 1
           for (const auto index : added_indices[perspective]) {
             const IndexType offset = kHalfDimensions * index;
 #if defined(USE_AVX2)
@@ -325,14 +325,14 @@ class FeatureTransformer {
     accumulator.computed_score = false;
   }
 
-  // パラメータの型
+  // parameter type
   using BiasType = std::int16_t;
   using WeightType = std::int16_t;
 
-  // 学習用クラスをfriendにする
+  // Make the learning class a friend
   friend class Trainer<FeatureTransformer>;
 
-  // パラメータ
+  // parameter
   alignas(kCacheLineSize) BiasType biases_[kHalfDimensions];
   alignas(kCacheLineSize)
       WeightType weights_[kHalfDimensions * kInputDimensions];
@@ -344,4 +344,4 @@ class FeatureTransformer {
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/eval/nnue/nnue_test_command.cpp b/src/eval/nnue/nnue_test_command.cpp
index 28e44273..46bc97de 100644
--- a/src/eval/nnue/nnue_test_command.cpp
+++ b/src/eval/nnue/nnue_test_command.cpp
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数に関するUSI拡張コマンド
+﻿// USI extended command for NNUE evaluation function
 
 #if defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
 
@@ -19,15 +19,15 @@ namespace NNUE {
 
 namespace {
 
-// 主に差分計算に関するRawFeaturesのテスト
+// Testing RawFeatures mainly for difference calculation
 void TestFeatures(Position& pos) {
   const std::uint64_t num_games = 1000;
   StateInfo si;
   pos.set(StartFEN, false, &si, Threads.main());
-  const int MAX_PLY = 256; // 256手までテスト
+  const int MAX_PLY = 256; // test up to 256 hands
 
-  StateInfo state[MAX_PLY]; // StateInfoを最大手数分だけ
-  int ply; // 初期局面からの手数
+  StateInfo state[MAX_PLY]; // StateInfo only for the maximum number of steps
+  int ply; // Trouble from the initial phase
 
   PRNG prng(20171128);
 
@@ -96,13 +96,13 @@ void TestFeatures(Position& pos) {
   for (std::uint64_t i = 0; i < num_games; ++i) {
     auto index_sets = make_index_sets(pos);
     for (ply = 0; ply < MAX_PLY; ++ply) {
-      MoveList<LEGAL> mg(pos); // 全合法手の生成
+      MoveList<LEGAL> mg(pos); // Generate all legal hands
 
-      // 合法な指し手がなかった == 詰み
+      // There was no legal move == Clog
       if (mg.size() == 0)
         break;
 
-      // 生成された指し手のなかからランダムに選び、その指し手で局面を進める。
+      // Randomly choose from the generated moves and advance the phase with the moves.
       Move m = mg.begin()[prng.rand(mg.size())];
       pos.do_move(m, state[ply]);
 
@@ -113,7 +113,7 @@ void TestFeatures(Position& pos) {
 
     pos.set(StartFEN, false, &si, Threads.main());
 
-    // 100回に1回ごとに'.'を出力(進んでいることがわかるように)
+    // Output'.' every 100 times (so you can see that it's progressing)
     if ((i % 100) == 0)
       std::cout << "." << std::flush;
   }
@@ -141,7 +141,7 @@ void TestFeatures(Position& pos) {
             << ") features" << std::endl;
 }
 
-// 評価関数の構造を表す文字列を出力する
+// Output a string that represents the structure of the evaluation function
 void PrintInfo(std::istream& stream) {
   std::cout << "network architecture: " << GetArchitectureString() << std::endl;
 
@@ -178,7 +178,7 @@ void PrintInfo(std::istream& stream) {
 
 }  // namespace
 
-// NNUE評価関数に関するUSI拡張コマンド
+// USI extended command for NNUE evaluation function
 void TestCommand(Position& pos, std::istream& stream) {
   std::string sub_command;
   stream >> sub_command;
diff --git a/src/eval/nnue/nnue_test_command.h b/src/eval/nnue/nnue_test_command.h
index 10f57f6c..570ef01b 100644
--- a/src/eval/nnue/nnue_test_command.h
+++ b/src/eval/nnue/nnue_test_command.h
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数に関するUSI拡張コマンドのインターフェイス
+﻿// USI extended command interface for NNUE evaluation function
 
 #ifndef _NNUE_TEST_COMMAND_H_
 #define _NNUE_TEST_COMMAND_H_
@@ -9,7 +9,7 @@ namespace Eval {
 
 namespace NNUE {
 
-// NNUE評価関数に関するUSI拡張コマンド
+// USI extended command for NNUE evaluation function
 void TestCommand(Position& pos, std::istream& stream);
 
 }  // namespace NNUE
diff --git a/src/eval/nnue/trainer/features/factorizer.h b/src/eval/nnue/trainer/features/factorizer.h
index 3bc59260..dea95370 100644
--- a/src/eval/nnue/trainer/features/factorizer.h
+++ b/src/eval/nnue/trainer/features/factorizer.h
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数の特徴量変換クラステンプレート
+﻿// NNUE evaluation function feature conversion class template
 
 #ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_H_
 #define _NNUE_TRAINER_FEATURES_FACTORIZER_H_
@@ -14,31 +14,31 @@ namespace NNUE {
 
 namespace Features {
 
-// 入力特徴量を学習用特徴量に変換するクラステンプレート
-// デフォルトでは学習用特徴量は元の入力特徴量と同じとし、必要に応じて特殊化する
+// Class template that converts input features into learning features
+// By default, the learning feature is the same as the original input feature, and specialized as necessary
 template <typename FeatureType>
 class Factorizer {
  public:
-  // 学習用特徴量の次元数を取得する
+  // Get the dimensionality of the learning feature
   static constexpr IndexType GetDimensions() {
     return FeatureType::kDimensions;
   }
 
-  // 学習用特徴量のインデックスと学習率のスケールを取得する
+  // Get index of learning feature and scale of learning rate
   static void AppendTrainingFeatures(
       IndexType base_index, std::vector<TrainingFeature>* training_features) {
-    assert(base_index < FeatureType::kDimensions);
+    assert(base_index <FeatureType::kDimensions);
     training_features->emplace_back(base_index);
   }
 };
 
-// 学習用特徴量の情報
+// Learning feature information
 struct FeatureProperties {
   bool active;
   IndexType dimensions;
 };
 
-// 元の入力特徴量を学習用特徴量に追加する
+// Add the original input features to the learning features
 template <typename FeatureType>
 IndexType AppendBaseFeature(
     FeatureProperties properties, IndexType base_index,
@@ -49,7 +49,7 @@ IndexType AppendBaseFeature(
   return properties.dimensions;
 }
 
-// 学習率のスケールが0でなければ他の種類の学習用特徴量を引き継ぐ
+// If the learning rate scale is not 0, inherit other types of learning features
 template <typename FeatureType>
 IndexType InheritFeaturesIfRequired(
     IndexType index_offset, FeatureProperties properties, IndexType base_index,
@@ -70,8 +70,8 @@ IndexType InheritFeaturesIfRequired(
   return properties.dimensions;
 }
 
-// 学習用特徴量を追加せず、必要に応じてインデックスの差分を返す
-// 対応する特徴量がない場合にInheritFeaturesIfRequired()の代わりに呼ぶ
+// Return the index difference as needed, without adding learning features
+// Call instead of InheritFeaturesIfRequired() if there are no corresponding features
 IndexType SkipFeatures(FeatureProperties properties) {
   if (!properties.active) {
     return 0;
@@ -79,7 +79,7 @@ IndexType SkipFeatures(FeatureProperties properties) {
   return properties.dimensions;
 }
 
-// 学習用特徴量の次元数を取得する
+// Get the dimensionality of the learning feature
 template <std::size_t N>
 constexpr IndexType GetActiveDimensions(
     const FeatureProperties (&properties)[N]) {
@@ -93,7 +93,7 @@ constexpr IndexType GetActiveDimensions(
   return dimensions;
 }
 
-// 配列の要素数を取得する
+// get the number of elements in the array
 template <typename T, std::size_t N>
 constexpr std::size_t GetArrayLength(const T (&/*array*/)[N]) {
   return N;
@@ -107,4 +107,4 @@ constexpr std::size_t GetArrayLength(const T (&/*array*/)[N]) {
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/eval/nnue/trainer/features/factorizer_feature_set.h b/src/eval/nnue/trainer/features/factorizer_feature_set.h
index 111678e4..0afe7a48 100644
--- a/src/eval/nnue/trainer/features/factorizer_feature_set.h
+++ b/src/eval/nnue/trainer/features/factorizer_feature_set.h
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数の特徴量変換クラステンプレートのFeatureSet用特殊化
+﻿// Specialization for feature set of feature conversion class template of NNUE evaluation function
 
 #ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
 #define _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
@@ -14,8 +14,8 @@ namespace NNUE {
 
 namespace Features {
 
-// 入力特徴量を学習用特徴量に変換するクラステンプレート
-// FeatureSet用特殊化
+// Class template that converts input features into learning features
+// Specialization for FeatureSet
 template <typename FirstFeatureType, typename... RemainingFeatureTypes>
 class Factorizer<FeatureSet<FirstFeatureType, RemainingFeatureTypes...>> {
  private:
@@ -23,16 +23,16 @@ class Factorizer<FeatureSet<FirstFeatureType, RemainingFeatureTypes...>> {
   using Tail = Factorizer<FeatureSet<RemainingFeatureTypes...>>;
 
  public:
-  // 元の入力特徴量の次元数
+  // number of dimensions of original input features
   static constexpr IndexType kBaseDimensions =
       FeatureSet<FirstFeatureType, RemainingFeatureTypes...>::kDimensions;
 
-  // 学習用特徴量の次元数を取得する
+  // Get the dimensionality of the learning feature
   static constexpr IndexType GetDimensions() {
     return Head::GetDimensions() + Tail::GetDimensions();
   }
 
-  // 学習用特徴量のインデックスと学習率のスケールを取得する
+  // Get index of learning feature and scale of learning rate
   static void AppendTrainingFeatures(
       IndexType base_index, std::vector<TrainingFeature>* training_features,
       IndexType base_dimensions = kBaseDimensions) {
@@ -62,20 +62,20 @@ class Factorizer<FeatureSet<FirstFeatureType, RemainingFeatureTypes...>> {
   }
 };
 
-// 入力特徴量を学習用特徴量に変換するクラステンプレート
-// FeatureSetのテンプレート引数が1つの場合の特殊化
+// Class template that converts input features into learning features
+// Specialization when FeatureSet has one template argument
 template <typename FeatureType>
 class Factorizer<FeatureSet<FeatureType>> {
 public:
-  // 元の入力特徴量の次元数
+  // number of dimensions of original input features
   static constexpr IndexType kBaseDimensions = FeatureType::kDimensions;
 
-  // 学習用特徴量の次元数を取得する
+  // Get the dimensionality of the learning feature
   static constexpr IndexType GetDimensions() {
     return Factorizer<FeatureType>::GetDimensions();
   }
 
-  // 学習用特徴量のインデックスと学習率のスケールを取得する
+  // Get index of learning feature and scale of learning rate
   static void AppendTrainingFeatures(
       IndexType base_index, std::vector<TrainingFeature>* training_features,
       IndexType base_dimensions = kBaseDimensions) {
@@ -101,4 +101,4 @@ public:
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/eval/nnue/trainer/features/factorizer_half_kp.h b/src/eval/nnue/trainer/features/factorizer_half_kp.h
index 28c11074..6ce5854a 100644
--- a/src/eval/nnue/trainer/features/factorizer_half_kp.h
+++ b/src/eval/nnue/trainer/features/factorizer_half_kp.h
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数の特徴量変換クラステンプレートのHalfKP用特殊化
+﻿// Specialization of NNUE evaluation function feature conversion class template for HalfKP
 
 #ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
 #define _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
@@ -16,18 +16,18 @@ namespace NNUE {
 
 namespace Features {
 
-// 入力特徴量を学習用特徴量に変換するクラステンプレート
-// HalfKP用特殊化
+// Class template that converts input features into learning features
+// Specialization for HalfKP
 template <Side AssociatedKing>
 class Factorizer<HalfKP<AssociatedKing>> {
  private:
   using FeatureType = HalfKP<AssociatedKing>;
 
-  // 特徴量のうち、同時に値が1となるインデックスの数の最大値
+  // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
   static constexpr IndexType kMaxActiveDimensions =
       FeatureType::kMaxActiveDimensions;
 
-  // 学習用特徴量の種類
+  // Type of learning feature
   enum TrainingFeatureType {
     kFeaturesHalfKP,
     kFeaturesHalfK,
@@ -36,7 +36,7 @@ class Factorizer<HalfKP<AssociatedKing>> {
     kNumTrainingFeatureTypes,
   };
 
-  // 学習用特徴量の情報
+  // Learning feature information
   static constexpr FeatureProperties kProperties[] = {
     // kFeaturesHalfKP
     {true, FeatureType::kDimensions},
@@ -50,12 +50,12 @@ class Factorizer<HalfKP<AssociatedKing>> {
   static_assert(GetArrayLength(kProperties) == kNumTrainingFeatureTypes, "");
 
  public:
-  // 学習用特徴量の次元数を取得する
+  // Get the dimensionality of the learning feature
   static constexpr IndexType GetDimensions() {
     return GetActiveDimensions(kProperties);
   }
 
-  // 学習用特徴量のインデックスと学習率のスケールを取得する
+  // Get index of learning feature and scale of learning rate
   static void AppendTrainingFeatures(
       IndexType base_index, std::vector<TrainingFeature>* training_features) {
     // kFeaturesHalfKP
@@ -100,4 +100,4 @@ constexpr FeatureProperties Factorizer<HalfKP<AssociatedKing>>::kProperties[];
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/eval/nnue/trainer/trainer.h b/src/eval/nnue/trainer/trainer.h
index 630f1a3d..f50ce092 100644
--- a/src/eval/nnue/trainer/trainer.h
+++ b/src/eval/nnue/trainer/trainer.h
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数の学習用クラステンプレートの共通ヘッダ
+﻿// Common header of class template for learning NNUE evaluation function
 
 #ifndef _NNUE_TRAINER_H_
 #define _NNUE_TRAINER_H_
@@ -18,10 +18,10 @@ namespace Eval {
 
 namespace NNUE {
 
-// 評価値と勝率の関係式で用いるPonanza定数
+// Ponanza constant used in the relation between evaluation value and winning percentage
 constexpr double kPonanzaConstant = 600.0;
 
-// 学習用特徴量のインデックス1つを表すクラス
+// Class that represents one index of learning feature
 class TrainingFeature {
   using StorageType = std::uint32_t;
   static_assert(std::is_unsigned<StorageType>::value, "");
@@ -60,7 +60,7 @@ class TrainingFeature {
   StorageType index_and_count_;
 };
 
-// 学習データ1サンプルを表す構造体
+// Structure that represents one sample of training data
 struct Example {
   std::vector<TrainingFeature> training_features[2];
   Learner::PackedSfenValue psv;
@@ -68,9 +68,9 @@ struct Example {
   double weight;
 };
 
-// ハイパーパラメータの設定などに使用するメッセージ
+// Message used for setting hyperparameters
 struct Message {
-  Message(const std::string& name, const std::string& value = "") :
+  Message(const std::string& name, const std::string& value = ""):
       name(name), value(value), num_peekers(0), num_receivers(0) {}
   const std::string name;
   const std::string value;
@@ -78,7 +78,7 @@ struct Message {
   std::uint32_t num_receivers;
 };
 
-// メッセージを受理するかどうかを判定する
+// determine whether to accept the message
 bool ReceiveMessage(const std::string& name, Message* message) {
   const auto subscript = "[" + std::to_string(message->num_peekers) + "]";
   if (message->name.substr(0, name.size() + 1) == name + "[") {
@@ -91,7 +91,7 @@ bool ReceiveMessage(const std::string& name, Message* message) {
   return false;
 }
 
-// 文字列を分割する
+// split the string
 std::vector<std::string> Split(const std::string& input, char delimiter) {
   std::istringstream stream(input);
   std::string field;
@@ -102,13 +102,13 @@ std::vector<std::string> Split(const std::string& input, char delimiter) {
   return fields;
 }
 
-// 浮動小数点数を整数に丸める
+// round a floating point number to an integer
 template <typename IntType>
 IntType Round(double value) {
   return static_cast<IntType>(std::floor(value + 0.5));
 }
 
-// アライメント付きmake_shared
+// make_shared with alignment
 template <typename T, typename... ArgumentTypes>
 std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments) {
   const auto ptr = new(aligned_malloc(sizeof(T), alignof(T)))
@@ -122,4 +122,4 @@ std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments) {
 
 #endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/eval/nnue/trainer/trainer_affine_transform.h b/src/eval/nnue/trainer/trainer_affine_transform.h
index 34c4816b..f5b208a3 100644
--- a/src/eval/nnue/trainer/trainer_affine_transform.h
+++ b/src/eval/nnue/trainer/trainer_affine_transform.h
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数の学習クラステンプレートのAffineTransform用特殊化
+﻿// Specialization of NNUE evaluation function learning class template for AffineTransform
 
 #ifndef _NNUE_TRAINER_AFFINE_TRANSFORM_H_
 #define _NNUE_TRAINER_AFFINE_TRANSFORM_H_
@@ -15,22 +15,22 @@ namespace Eval {
 
 namespace NNUE {
 
-// 学習：アフィン変換層
+// Learning: Affine transformation layer
 template <typename PreviousLayer, IndexType OutputDimensions>
 class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
  private:
-  // 学習対象の層の型
+  // Type of layer to learn
   using LayerType = Layers::AffineTransform<PreviousLayer, OutputDimensions>;
 
  public:
-  // ファクトリ関数
+  // factory function
   static std::shared_ptr<Trainer> Create(
       LayerType* target_layer, FeatureTransformer* feature_transformer) {
     return std::shared_ptr<Trainer>(
         new Trainer(target_layer, feature_transformer));
   }
 
-  // ハイパーパラメータなどのオプションを設定する
+  // Set options such as hyperparameters
   void SendMessage(Message* message) {
     previous_layer_trainer_->SendMessage(message);
     if (ReceiveMessage("momentum", message)) {
@@ -48,19 +48,19 @@ class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
     }
   }
 
-  // パラメータを乱数で初期化する
+  // Initialize the parameters with random numbers
   template <typename RNG>
   void Initialize(RNG& rng) {
     previous_layer_trainer_->Initialize(rng);
     if (kIsOutputLayer) {
-      // 出力層は0で初期化する
+      // Initialize output layer with 0
       std::fill(std::begin(biases_), std::end(biases_),
                 static_cast<LearnFloatType>(0.0));
       std::fill(std::begin(weights_), std::end(weights_),
                 static_cast<LearnFloatType>(0.0));
     } else {
-      // 入力の分布が各ユニット平均0.5、等分散であることを仮定し、
-      // 出力の分布が各ユニット平均0.5、入力と同じ等分散になるように初期化する
+      // Assuming that the input distribution is unit-mean 0.5, equal variance,
+      // Initialize the output distribution so that each unit has a mean of 0.5 and the same variance as the input
       const double kSigma = 1.0 / std::sqrt(kInputDimensions);
       auto distribution = std::normal_distribution<double>(0.0, kSigma);
       for (IndexType i = 0; i < kOutputDimensions; ++i) {
@@ -76,7 +76,7 @@ class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
     QuantizeParameters();
   }
 
-  // 順伝播
+  // forward propagation
   const LearnFloatType* Propagate(const std::vector<Example>& batch) {
     if (output_.size() < kOutputDimensions * batch.size()) {
       output_.resize(kOutputDimensions * batch.size());
@@ -111,7 +111,7 @@ class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
     return output_.data();
   }
 
-  // 逆伝播
+  // backpropagation
   void Backpropagate(const LearnFloatType* gradients,
                      LearnFloatType learning_rate) {
     const LearnFloatType local_learning_rate =
@@ -185,7 +185,7 @@ class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
   }
 
  private:
-  // コンストラクタ
+  // constructor
   Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer) :
       batch_size_(0),
       batch_input_(nullptr),
@@ -201,7 +201,7 @@ class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
     DequantizeParameters();
   }
 
-  // 重みの飽和とパラメータの整数化
+  // Weight saturation and parameterization
   void QuantizeParameters() {
     for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
       weights_[i] = std::max(-kMaxWeightMagnitude,
@@ -222,7 +222,7 @@ class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
     }
   }
 
-  // 整数化されたパラメータの読み込み
+  // read parameterized integer
   void DequantizeParameters() {
     for (IndexType i = 0; i < kOutputDimensions; ++i) {
       biases_[i] = static_cast<LearnFloatType>(
@@ -242,14 +242,14 @@ class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
               static_cast<LearnFloatType>(0.0));
   }
 
-  // 入出力の次元数
+  // number of input/output dimensions
   static constexpr IndexType kInputDimensions = LayerType::kInputDimensions;
   static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
 
-  // 出力の次元数が1なら出力層
+  // If the output dimensionality is 1, the output layer
   static constexpr bool kIsOutputLayer = kOutputDimensions == 1;
 
-  // パラメータの整数化で用いる係数
+  // Coefficient used for parameterization
   static constexpr LearnFloatType kActivationScale =
       std::numeric_limits<std::int8_t>::max();
   static constexpr LearnFloatType kBiasScale = kIsOutputLayer ?
@@ -257,37 +257,37 @@ class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
       ((1 << kWeightScaleBits) * kActivationScale);
   static constexpr LearnFloatType kWeightScale = kBiasScale / kActivationScale;
 
-  // パラメータの整数化でオーバーフローさせないために用いる重みの絶対値の上限
+  // Upper limit of absolute value of weight used to prevent overflow when parameterizing integers
   static constexpr LearnFloatType kMaxWeightMagnitude =
       std::numeric_limits<typename LayerType::WeightType>::max() / kWeightScale;
 
-  // ミニバッチのサンプル数
+  // number of samples in mini-batch
   IndexType batch_size_;
 
-  // ミニバッチの入力
+  // Input mini batch
   const LearnFloatType* batch_input_;
 
-  // 直前の層のTrainer
+  // Trainer of the previous layer
   const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
 
-  // 学習対象の層
+  // layer to learn
   LayerType* const target_layer_;
 
-  // パラメータ
+  // parameter
   LearnFloatType biases_[kOutputDimensions];
   LearnFloatType weights_[kOutputDimensions * kInputDimensions];
 
-  // パラメータの更新で用いるバッファ
+  // Buffer used for updating parameters
   LearnFloatType biases_diff_[kOutputDimensions];
   LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions];
 
-  // 順伝播用バッファ
+  // Forward propagation buffer
   std::vector<LearnFloatType> output_;
 
-  // 逆伝播用バッファ
+  // buffer for back propagation
   std::vector<LearnFloatType> gradients_;
 
-  // ハイパーパラメータ
+  // hyper parameter
   LearnFloatType momentum_;
   LearnFloatType learning_rate_scale_;
 };
diff --git a/src/eval/nnue/trainer/trainer_clipped_relu.h b/src/eval/nnue/trainer/trainer_clipped_relu.h
index bd894769..7fe1913d 100644
--- a/src/eval/nnue/trainer/trainer_clipped_relu.h
+++ b/src/eval/nnue/trainer/trainer_clipped_relu.h
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数の学習クラステンプレートのClippedReLU用特殊化
+﻿// Specialization of NNUE evaluation function learning class template for ClippedReLU
 
 #ifndef _NNUE_TRAINER_CLIPPED_RELU_H_
 #define _NNUE_TRAINER_CLIPPED_RELU_H_
@@ -13,22 +13,22 @@ namespace Eval {
 
 namespace NNUE {
 
-// 学習：アフィン変換層
+// Learning: Affine transformation layer
 template <typename PreviousLayer>
 class Trainer<Layers::ClippedReLU<PreviousLayer>> {
  private:
-  // 学習対象の層の型
+  // Type of layer to learn
   using LayerType = Layers::ClippedReLU<PreviousLayer>;
 
  public:
-  // ファクトリ関数
+  // factory function
   static std::shared_ptr<Trainer> Create(
       LayerType* target_layer, FeatureTransformer* feature_transformer) {
     return std::shared_ptr<Trainer>(
         new Trainer(target_layer, feature_transformer));
   }
 
-  // ハイパーパラメータなどのオプションを設定する
+  // Set options such as hyperparameters
   void SendMessage(Message* message) {
     previous_layer_trainer_->SendMessage(message);
     if (ReceiveMessage("check_health", message)) {
@@ -36,13 +36,13 @@ class Trainer<Layers::ClippedReLU<PreviousLayer>> {
     }
   }
 
-  // パラメータを乱数で初期化する
+  // Initialize the parameters with random numbers
   template <typename RNG>
   void Initialize(RNG& rng) {
     previous_layer_trainer_->Initialize(rng);
   }
 
-  // 順伝播
+  // forward propagation
   const LearnFloatType* Propagate(const std::vector<Example>& batch) {
     if (output_.size() < kOutputDimensions * batch.size()) {
       output_.resize(kOutputDimensions * batch.size());
@@ -62,7 +62,7 @@ class Trainer<Layers::ClippedReLU<PreviousLayer>> {
     return output_.data();
   }
 
-  // 逆伝播
+  // backpropagation
   void Backpropagate(const LearnFloatType* gradients,
                      LearnFloatType learning_rate) {
     for (IndexType b = 0; b < batch_size_; ++b) {
@@ -77,7 +77,7 @@ class Trainer<Layers::ClippedReLU<PreviousLayer>> {
   }
 
  private:
-  // コンストラクタ
+  // constructor
   Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer) :
       batch_size_(0),
       previous_layer_trainer_(Trainer<PreviousLayer>::Create(
@@ -89,7 +89,7 @@ class Trainer<Layers::ClippedReLU<PreviousLayer>> {
               std::numeric_limits<LearnFloatType>::lowest());
   }
 
-  // 学習に問題が生じていないかチェックする
+  // Check if there are any problems with learning
   void CheckHealth() {
     const auto largest_min_activation = *std::max_element(
         std::begin(min_activations_), std::end(min_activations_));
@@ -105,30 +105,30 @@ class Trainer<Layers::ClippedReLU<PreviousLayer>> {
               std::numeric_limits<LearnFloatType>::lowest());
   }
 
-  // 入出力の次元数
+  // number of input/output dimensions
   static constexpr IndexType kInputDimensions = LayerType::kOutputDimensions;
   static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
 
-  // LearnFloatTypeの定数
+  // LearnFloatType constant
   static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
   static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
 
-  // ミニバッチのサンプル数
+  // number of samples in mini-batch
   IndexType batch_size_;
 
-  // 直前の層のTrainer
+  // Trainer of the previous layer
   const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
 
-  // 学習対象の層
+  // layer to learn
   LayerType* const target_layer_;
 
-  // 順伝播用バッファ
+  // Forward propagation buffer
   std::vector<LearnFloatType> output_;
 
-  // 逆伝播用バッファ
+  // buffer for back propagation
   std::vector<LearnFloatType> gradients_;
 
-  // ヘルスチェック用統計値
+  // Health check statistics
   LearnFloatType min_activations_[kOutputDimensions];
   LearnFloatType max_activations_[kOutputDimensions];
 };
@@ -139,4 +139,4 @@ class Trainer<Layers::ClippedReLU<PreviousLayer>> {
 
 #endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/eval/nnue/trainer/trainer_feature_transformer.h b/src/eval/nnue/trainer/trainer_feature_transformer.h
index 742da440..eb14d98b 100644
--- a/src/eval/nnue/trainer/trainer_feature_transformer.h
+++ b/src/eval/nnue/trainer/trainer_feature_transformer.h
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数の学習クラステンプレートのFeatureTransformer用特殊化
+﻿// Specialization for feature transformer of learning class template of NNUE evaluation function
 
 #ifndef _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
 #define _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
@@ -24,11 +24,11 @@ namespace Eval {
 
 namespace NNUE {
 
-// 学習：入力特徴量変換器
+// Learning: Input feature converter
 template <>
 class Trainer<FeatureTransformer> {
  private:
-  // 学習対象の層の型
+  // Type of layer to learn
   using LayerType = FeatureTransformer;
 
  public:
@@ -37,12 +37,12 @@ class Trainer<FeatureTransformer> {
   template <typename T, typename... ArgumentTypes>
   friend std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments);
 
-  // ファクトリ関数
+  // factory function
   static std::shared_ptr<Trainer> Create(LayerType* target_layer) {
     return MakeAlignedSharedPtr<Trainer>(target_layer);
   }
 
-  // ハイパーパラメータなどのオプションを設定する
+  // Set options such as hyperparameters
   void SendMessage(Message* message) {
     if (ReceiveMessage("momentum", message)) {
       momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
@@ -65,7 +65,7 @@ class Trainer<FeatureTransformer> {
     }
   }
 
-  // パラメータを乱数で初期化する
+  // Initialize the parameters with random numbers
   template <typename RNG>
   void Initialize(RNG& rng) {
     std::fill(std::begin(weights_), std::end(weights_), +kZero);
@@ -81,7 +81,7 @@ class Trainer<FeatureTransformer> {
     QuantizeParameters();
   }
 
-  // 順伝播
+  // forward propagation
   const LearnFloatType* Propagate(const std::vector<Example>& batch) {
     if (output_.size() < kOutputDimensions * batch.size()) {
       output_.resize(kOutputDimensions * batch.size());
@@ -131,7 +131,7 @@ class Trainer<FeatureTransformer> {
     return output_.data();
   }
 
-  // 逆伝播
+  // backpropagation
   void Backpropagate(const LearnFloatType* gradients,
                      LearnFloatType learning_rate) {
     const LearnFloatType local_learning_rate =
@@ -144,8 +144,8 @@ class Trainer<FeatureTransformer> {
             ((output_[index] > kZero) * (output_[index] < kOne));
       }
     }
-    // 重み行列は入力に出現した特徴量に対応する列のみを更新するため、
-    // momentumを使用せず、学習率を補正してスケールを合わせる
+    // Since the weight matrix updates only the columns corresponding to the features that appeared in the input,
+    // Correct the learning rate and adjust the scale without using momentum
     const LearnFloatType effective_learning_rate =
         static_cast<LearnFloatType>(local_learning_rate / (1.0 - momentum_));
 #if defined(USE_BLAS)
@@ -227,7 +227,7 @@ class Trainer<FeatureTransformer> {
   }
 
  private:
-  // コンストラクタ
+  // constructor
   Trainer(LayerType* target_layer) :
       batch_(nullptr),
       target_layer_(target_layer),
@@ -245,7 +245,7 @@ class Trainer<FeatureTransformer> {
     DequantizeParameters();
   }
 
-  // 重みの飽和とパラメータの整数化
+  // Weight saturation and parameterization
   void QuantizeParameters() {
     for (IndexType i = 0; i < kHalfDimensions; ++i) {
       target_layer_->biases_[i] =
@@ -268,7 +268,7 @@ class Trainer<FeatureTransformer> {
     }
   }
 
-  // 整数化されたパラメータの読み込み
+  // read parameterized integer
   void DequantizeParameters() {
     for (IndexType i = 0; i < kHalfDimensions; ++i) {
       biases_[i] = static_cast<LearnFloatType>(
@@ -282,7 +282,7 @@ class Trainer<FeatureTransformer> {
     std::fill(std::begin(biases_diff_), std::end(biases_diff_), +kZero);
   }
 
-  // 学習データに出現していない特徴量に対応する重みを0にする
+  // Set the weight corresponding to the feature that does not appear in the learning data to 0
   void ClearUnobservedFeatureWeights() {
     for (IndexType i = 0; i < kInputDimensions; ++i) {
       if (!observed_features.test(i)) {
@@ -293,7 +293,7 @@ class Trainer<FeatureTransformer> {
     QuantizeParameters();
   }
 
-  // 学習に問題が生じていないかチェックする
+  // Check if there are any problems with learning
   void CheckHealth() {
     std::cout << "INFO: observed " << observed_features.count()
               << " (out of " << kInputDimensions << ") features" << std::endl;
@@ -320,48 +320,48 @@ class Trainer<FeatureTransformer> {
               std::numeric_limits<LearnFloatType>::lowest());
   }
 
-  // 入出力の次元数
+  // number of input/output dimensions
   static constexpr IndexType kInputDimensions =
       Features::Factorizer<RawFeatures>::GetDimensions();
   static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
   static constexpr IndexType kHalfDimensions = LayerType::kHalfDimensions;
 
-  // パラメータの整数化で用いる係数
+  // Coefficient used for parameterization
   static constexpr LearnFloatType kActivationScale =
       std::numeric_limits<std::int8_t>::max();
   static constexpr LearnFloatType kBiasScale = kActivationScale;
   static constexpr LearnFloatType kWeightScale = kActivationScale;
 
-  // LearnFloatTypeの定数
+  // LearnFloatType constant
   static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
   static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
 
-  // ミニバッチ
+  // mini batch
   const std::vector<Example>* batch_;
 
-  // 学習対象の層
+  // layer to learn
   LayerType* const target_layer_;
 
-  // パラメータ
+  // parameter
   alignas(kCacheLineSize) LearnFloatType biases_[kHalfDimensions];
   alignas(kCacheLineSize)
       LearnFloatType weights_[kHalfDimensions * kInputDimensions];
 
-  // パラメータの更新で用いるバッファ
+  // Buffer used for updating parameters
   LearnFloatType biases_diff_[kHalfDimensions];
   std::vector<LearnFloatType> gradients_;
 
-  // 順伝播用バッファ
+  // Forward propagation buffer
   std::vector<LearnFloatType> output_;
 
-  // 学習データに出現した特徴量
+  // Features that appeared in the training data
   std::bitset<kInputDimensions> observed_features;
 
-  // ハイパーパラメータ
+  // hyper parameter
   LearnFloatType momentum_;
   LearnFloatType learning_rate_scale_;
 
-  // ヘルスチェック用統計値
+  // Health check statistics
   LearnFloatType min_pre_activation_;
   LearnFloatType max_pre_activation_;
   LearnFloatType min_activations_[kHalfDimensions];
@@ -374,4 +374,4 @@ class Trainer<FeatureTransformer> {
 
 #endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/eval/nnue/trainer/trainer_input_slice.h b/src/eval/nnue/trainer/trainer_input_slice.h
index 0660e987..f5b263d3 100644
--- a/src/eval/nnue/trainer/trainer_input_slice.h
+++ b/src/eval/nnue/trainer/trainer_input_slice.h
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数の学習クラステンプレートのInputSlice用特殊化
+﻿// Specialization of NNUE evaluation function learning class template for InputSlice
 
 #ifndef _NNUE_TRAINER_INPUT_SLICE_H_
 #define _NNUE_TRAINER_INPUT_SLICE_H_
@@ -13,10 +13,10 @@ namespace Eval {
 
 namespace NNUE {
 
-// 学習：入力層
+// Learning: Input layer
 class SharedInputTrainer {
  public:
-  // ファクトリ関数
+  // factory function
   static std::shared_ptr<SharedInputTrainer> Create(
       FeatureTransformer* feature_transformer) {
     static std::shared_ptr<SharedInputTrainer> instance;
@@ -27,7 +27,7 @@ class SharedInputTrainer {
     return instance;
   }
 
-  // ハイパーパラメータなどのオプションを設定する
+  // Set options such as hyperparameters
   void SendMessage(Message* message) {
     if (num_calls_ == 0) {
       current_operation_ = Operation::kSendMessage;
@@ -40,7 +40,7 @@ class SharedInputTrainer {
     }
   }
 
-  // パラメータを乱数で初期化する
+  // Initialize the parameters with random numbers
   template <typename RNG>
   void Initialize(RNG& rng) {
     if (num_calls_ == 0) {
@@ -54,7 +54,7 @@ class SharedInputTrainer {
     }
   }
 
-  // 順伝播
+  // forward propagation
   const LearnFloatType* Propagate(const std::vector<Example>& batch) {
     if (gradients_.size() < kInputDimensions * batch.size()) {
       gradients_.resize(kInputDimensions * batch.size());
@@ -72,7 +72,7 @@ class SharedInputTrainer {
     return output_;
   }
 
-  // 逆伝播
+  // backpropagation
   void Backpropagate(const LearnFloatType* gradients,
                      LearnFloatType learning_rate) {
     if (num_referrers_ == 1) {
@@ -104,7 +104,7 @@ class SharedInputTrainer {
   }
 
  private:
-  // コンストラクタ
+  // constructor
   SharedInputTrainer(FeatureTransformer* feature_transformer) :
       batch_size_(0),
       num_referrers_(0),
@@ -115,11 +115,11 @@ class SharedInputTrainer {
       output_(nullptr) {
   }
 
-  // 入出力の次元数
+  // number of input/output dimensions
   static constexpr IndexType kInputDimensions =
       FeatureTransformer::kOutputDimensions;
 
-  // 処理の種類
+  // type of processing
   enum class Operation {
     kNone,
     kSendMessage,
@@ -128,55 +128,55 @@ class SharedInputTrainer {
     kBackPropagate,
   };
 
-  // ミニバッチのサンプル数
+  // number of samples in mini-batch
   IndexType batch_size_;
 
-  // この層を入力として共有する層の数
+  // number of layers sharing this layer as input
   std::uint32_t num_referrers_;
 
-  // 現在の処理が呼び出された回数
+  // Number of times the current process has been called
   std::uint32_t num_calls_;
 
-  // 現在の処理の種類
+  // current processing type
   Operation current_operation_;
 
-  // 入力特徴量変換器のTrainer
+  // Trainer of input feature converter
   const std::shared_ptr<Trainer<FeatureTransformer>>
       feature_transformer_trainer_;
 
-  // 順伝播用に共有する出力のポインタ
+  // pointer to output shared for forward propagation
   const LearnFloatType* output_;
 
-  // 逆伝播用バッファ
+  // buffer for back propagation
   std::vector<LearnFloatType> gradients_;
 };
 
-// 学習：入力層
+// Learning: Input layer
 template <IndexType OutputDimensions, IndexType Offset>
 class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
  private:
-  // 学習対象の層の型
+  // Type of layer to learn
   using LayerType = Layers::InputSlice<OutputDimensions, Offset>;
 
  public:
-  // ファクトリ関数
+  // factory function
   static std::shared_ptr<Trainer> Create(
       LayerType* /*target_layer*/, FeatureTransformer* feature_transformer) {
     return std::shared_ptr<Trainer>(new Trainer(feature_transformer));
   }
 
-  // ハイパーパラメータなどのオプションを設定する
+  // Set options such as hyperparameters
   void SendMessage(Message* message) {
     shared_input_trainer_->SendMessage(message);
   }
 
-  // パラメータを乱数で初期化する
+  // Initialize the parameters with random numbers
   template <typename RNG>
   void Initialize(RNG& rng) {
     shared_input_trainer_->Initialize(rng);
   }
 
-  // 順伝播
+  // forward propagation
   const LearnFloatType* Propagate(const std::vector<Example>& batch) {
     if (output_.size() < kOutputDimensions * batch.size()) {
       output_.resize(kOutputDimensions * batch.size());
@@ -199,7 +199,7 @@ class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
     return output_.data();
   }
 
-  // 逆伝播
+  // backpropagation
   void Backpropagate(const LearnFloatType* gradients,
                      LearnFloatType learning_rate) {
     for (IndexType b = 0; b < batch_size_; ++b) {
@@ -217,28 +217,28 @@ class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
   }
 
  private:
-  // コンストラクタ
-  Trainer(FeatureTransformer* feature_transformer) :
+  // constructor
+  Trainer(FeatureTransformer* feature_transformer):
       batch_size_(0),
       shared_input_trainer_(SharedInputTrainer::Create(feature_transformer)) {
   }
 
-  // 入出力の次元数
+  // number of input/output dimensions
   static constexpr IndexType kInputDimensions =
       FeatureTransformer::kOutputDimensions;
   static constexpr IndexType kOutputDimensions = OutputDimensions;
   static_assert(Offset + kOutputDimensions <= kInputDimensions, "");
 
-  // ミニバッチのサンプル数
+  // number of samples in mini-batch
   IndexType batch_size_;
 
-  // 共有入力層のTrainer
+  // Trainer of shared input layer
   const std::shared_ptr<SharedInputTrainer> shared_input_trainer_;
 
-  // 順伝播用バッファ
+  // Forward propagation buffer
   std::vector<LearnFloatType> output_;
 
-  // 逆伝播用バッファ
+  // buffer for back propagation
   std::vector<LearnFloatType> gradients_;
 };
 
diff --git a/src/eval/nnue/trainer/trainer_sum.h b/src/eval/nnue/trainer/trainer_sum.h
index 76f6073f..bae3edd5 100644
--- a/src/eval/nnue/trainer/trainer_sum.h
+++ b/src/eval/nnue/trainer/trainer_sum.h
@@ -1,4 +1,4 @@
-﻿// NNUE評価関数の学習クラステンプレートのSum用特殊化
+﻿// Specialization of NNUE evaluation function learning class template for Sum
 
 #ifndef _NNUE_TRAINER_SUM_H_
 #define _NNUE_TRAINER_SUM_H_
@@ -13,40 +13,40 @@ namespace Eval {
 
 namespace NNUE {
 
-// 学習：複数の層の出力の和を取る層
+// Learning: A layer that sums the outputs of multiple layers
 template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
 class Trainer<Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>> :
       Trainer<Layers::Sum<RemainingPreviousLayers...>> {
  private:
-  // 学習対象の層の型
+  // Type of layer to learn
   using LayerType = Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>;
   using Tail = Trainer<Layers::Sum<RemainingPreviousLayers...>>;
 
  public:
-  // ファクトリ関数
+  // factory function
   static std::shared_ptr<Trainer> Create(
       LayerType* target_layer, FeatureTransformer* feature_transformer) {
     return std::shared_ptr<Trainer>(
         new Trainer(target_layer, feature_transformer));
   }
 
-  // ハイパーパラメータなどのオプションを設定する
+  // Set options such as hyperparameters
   void SendMessage(Message* message) {
-    // 他のメンバ関数の結果は処理の順番に依存しないため、
-    // 実装をシンプルにすることを目的としてTailを先に処理するが、
-    // SendMessageは添字の対応を分かりやすくするためにHeadを先に処理する
+    // The results of other member functions do not depend on the processing order, so
+    // Tail is processed first for the purpose of simplifying the implementation, but
+    // SendMessage processes Head first to make it easier to understand subscript correspondence
     previous_layer_trainer_->SendMessage(message);
     Tail::SendMessage(message);
   }
 
-  // パラメータを乱数で初期化する
+  // Initialize the parameters with random numbers
   template <typename RNG>
   void Initialize(RNG& rng) {
     Tail::Initialize(rng);
     previous_layer_trainer_->Initialize(rng);
   }
 
-  // 順伝播
+  // forward propagation
   /*const*/ LearnFloatType* Propagate(const std::vector<Example>& batch) {
     batch_size_ = static_cast<IndexType>(batch.size());
     auto output = Tail::Propagate(batch);
@@ -65,7 +65,7 @@ class Trainer<Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>> :
     return output;
   }
 
-  // 逆伝播
+  // backpropagation
   void Backpropagate(const LearnFloatType* gradients,
                      LearnFloatType learning_rate) {
     Tail::Backpropagate(gradients, learning_rate);
@@ -73,8 +73,8 @@ class Trainer<Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>> :
   }
 
  private:
-  // コンストラクタ
-  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer) :
+  // constructor
+  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer):
       Tail(target_layer, feature_transformer),
       batch_size_(0),
       previous_layer_trainer_(Trainer<FirstPreviousLayer>::Create(
@@ -82,51 +82,51 @@ class Trainer<Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>> :
       target_layer_(target_layer) {
   }
 
-  // 入出力の次元数
+  // number of input/output dimensions
   static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
 
-  // サブクラスをfriendにする
+  // make subclass friend
   template <typename SumLayer>
   friend class Trainer;
 
-  // ミニバッチのサンプル数
+  // number of samples in mini-batch
   IndexType batch_size_;
 
-  // 直前の層のTrainer
+  // Trainer of the previous layer
   const std::shared_ptr<Trainer<FirstPreviousLayer>> previous_layer_trainer_;
 
-  // 学習対象の層
+  // layer to learn
   LayerType* const target_layer_;
 };
 
 
-// 学習：複数の層の出力の和を取る層（テンプレート引数が1つの場合）
+// Learning: Layer that takes the sum of the outputs of multiple layers (when there is one template argument)
 template <typename PreviousLayer>
 class Trainer<Layers::Sum<PreviousLayer>> {
  private:
-  // 学習対象の層の型
+  // Type of layer to learn
   using LayerType = Layers::Sum<PreviousLayer>;
 
  public:
-  // ファクトリ関数
+  // factory function
   static std::shared_ptr<Trainer> Create(
       LayerType* target_layer, FeatureTransformer* feature_transformer) {
     return std::shared_ptr<Trainer>(
         new Trainer(target_layer, feature_transformer));
   }
 
-  // ハイパーパラメータなどのオプションを設定する
+  // Set options such as hyperparameters
   void SendMessage(Message* message) {
     previous_layer_trainer_->SendMessage(message);
   }
 
-  // パラメータを乱数で初期化する
+  // Initialize the parameters with random numbers
   template <typename RNG>
   void Initialize(RNG& rng) {
     previous_layer_trainer_->Initialize(rng);
   }
 
-  // 順伝播
+  // forward propagation
   /*const*/ LearnFloatType* Propagate(const std::vector<Example>& batch) {
     if (output_.size() < kOutputDimensions * batch.size()) {
       output_.resize(kOutputDimensions * batch.size());
@@ -146,14 +146,14 @@ class Trainer<Layers::Sum<PreviousLayer>> {
     return output_.data();
   }
 
-  // 逆伝播
+  // backpropagation
   void Backpropagate(const LearnFloatType* gradients,
                      LearnFloatType learning_rate) {
     previous_layer_trainer_->Backpropagate(gradients, learning_rate);
   }
 
  private:
-  // コンストラクタ
+  // constructor
   Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer) :
       batch_size_(0),
       previous_layer_trainer_(Trainer<PreviousLayer>::Create(
@@ -161,23 +161,23 @@ class Trainer<Layers::Sum<PreviousLayer>> {
       target_layer_(target_layer) {
   }
 
-  // 入出力の次元数
+  // number of input/output dimensions
   static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
 
-  // サブクラスをfriendにする
+  // make subclass friend
   template <typename SumLayer>
   friend class Trainer;
 
-  // ミニバッチのサンプル数
+  // number of samples in mini-batch
   IndexType batch_size_;
 
-  // 直前の層のTrainer
+  // Trainer of the previous layer
   const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
 
-  // 学習対象の層
+  // layer to learn
   LayerType* const target_layer_;
 
-  // 順伝播用バッファ
+  // Forward propagation buffer
   std::vector<LearnFloatType> output_;
 };
 
@@ -187,4 +187,4 @@ class Trainer<Layers::Sum<PreviousLayer>> {
 
 #endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/extra/sfen_packer.cpp b/src/extra/sfen_packer.cpp
index d56e808b..b3404542 100644
--- a/src/extra/sfen_packer.cpp
+++ b/src/extra/sfen_packer.cpp
@@ -5,33 +5,33 @@
 
 #include <sstream>
 #include <fstream>
-#include <cstring>	// std::memset()
+#include <cstring> // std::memset()
 
 using namespace std;
 
 // -----------------------------------
-//        局面の圧縮・解凍
+// stage compression/decompression
 // -----------------------------------
 
-// ビットストリームを扱うクラス
-// 局面の符号化を行なうときに、これがあると便利
+// Class that handles bitstream
+// useful when doing aspect encoding
 struct BitStream
 {
-  // データを格納するメモリを事前にセットする。
-  // そのメモリは0クリアされているものとする。
+  // Set the memory to store the data in advance.
+  // Assume that memory is cleared to 0.
   void  set_data(uint8_t* data_) { data = data_; reset(); }
 
-  // set_data()で渡されたポインタの取得。
+  // Get the pointer passed in set_data().
   uint8_t* get_data() const { return data; }
 
-  // カーソルの取得。
+  // Get the cursor.
   int get_cursor() const { return bit_cursor; }
 
-  // カーソルのリセット
+  // reset the cursor
   void reset() { bit_cursor = 0; }
 
-  // ストリームに1bit書き出す。
-  // bは非0なら1を書き出す。0なら0を書き出す。
+  // Write 1bit to the stream.
+  // If b is non-zero, write out 1. If 0, write 0.
   void write_one_bit(int b)
   {
     if (b)
@@ -40,7 +40,7 @@ struct BitStream
     ++bit_cursor;
   }
 
-  // ストリームから1ビット取り出す。
+  // Get 1 bit from the stream.
   int read_one_bit()
   {
     int b = (data[bit_cursor / 8] >> (bit_cursor & 7)) & 1;
@@ -49,16 +49,16 @@ struct BitStream
     return b;
   }
 
-  // nビットのデータを書き出す
-  // データはdの下位から順に書き出されるものとする。
+  // write n bits of data
+  // Data shall be written out from the lower order of d.
   void write_n_bit(int d, int n)
   {
-    for (int i = 0; i < n; ++i)
+    for (int i = 0; i <n; ++i)
       write_one_bit(d & (1 << i));
   }
 
-  // nビットのデータを読み込む
-  // write_n_bit()の逆変換。
+  // read n bits of data
+  // Reverse conversion of write_n_bit().
   int read_n_bit(int n)
   {
     int result = 0;
@@ -69,46 +69,46 @@ struct BitStream
   }
 
 private:
-  // 次に読み書きすべきbit位置。
+  // Next bit position to read/write.
   int bit_cursor;
 
-  // データの実体
+  // data entity
   uint8_t* data;
 };
 
 
-//  ハフマン符号化
-//   ※　 なのはminiの符号化から、変換が楽になるように単純化。
+// Huffman coding
+// * is simplified from mini encoding to make conversion easier.
 //
-//   盤上の1升(NO_PIECE以外) = 2～6bit ( + 成りフラグ1bit+ 先後1bit )
-//   手駒の1枚               = 1～5bit ( + 成りフラグ1bit+ 先後1bit )
+// 1 box on the board (other than NO_PIECE) = 2 to 6 bits (+ 1-bit flag + 1-bit forward and backward)
+// 1 piece of hand piece = 1-5bit (+ 1-bit flag + 1bit ahead and behind)
 //
-//    空     xxxxx0 + 0    (none)
-//    歩     xxxx01 + 2    xxxx0 + 2
-//    香     xx0011 + 2    xx001 + 2
-//    桂     xx1011 + 2    xx101 + 2
-//    銀     xx0111 + 2    xx011 + 2
-//    金     x01111 + 1    x0111 + 1 // 金は成りフラグはない。
-//    角     011111 + 2    01111 + 2
-//    飛     111111 + 2    11111 + 2
+// empty xxxxx0 + 0 (none)
+// step xxxx01 + 2 xxxx0 + 2
+// incense xx0011 + 2 xx001 + 2
+// Katsura xx1011 + 2 xx101 + 2
+// silver xx0111 + 2 xx011 + 2
+// Gold x01111 + 1 x0111 + 1 // Gold is valid and has no flags.
+// corner 011111 + 2 01111 + 2
+// Fly 111111 + 2 11111 + 2
 //
-// すべての駒が盤上にあるとして、
-//     空 81 - 40駒 = 41升 = 41bit
-//     歩      4bit*18駒   = 72bit
-//     香      6bit* 4駒   = 24bit
-//     桂      6bit* 4駒   = 24bit
-//     銀      6bit* 4駒   = 24bit            
-//     金      6bit* 4駒   = 24bit
-//     角      8bit* 2駒   = 16bit
-//     飛      8bit* 2駒   = 16bit
-//                          -------
-//                          241bit + 1bit(手番) + 7bit×2(王の位置先後) = 256bit
+// Assuming all pieces are on the board,
+// Sky 81-40 pieces = 41 boxes = 41bit
+// Walk 4bit*18 pieces = 72bit
+// Incense 6bit*4 pieces = 24bit
+// Katsura 6bit*4 pieces = 24bit
+// Silver 6bit*4 pieces = 24bit
+// Gold 6bit* 4 pieces = 24bit
+// corner 8bit* 2 pieces = 16bit
+// Fly 8bit* 2 pieces = 16bit
+// -------
+// 241bit + 1bit (turn) + 7bit × 2 (King's position after) = 256bit
 //
-// 盤上の駒が手駒に移動すると盤上の駒が空になるので盤上のその升は1bitで表現でき、
-// 手駒は、盤上の駒より1bit少なく表現できるので結局、全体のbit数に変化はない。
-// ゆえに、この表現において、どんな局面でもこのbit数で表現できる。
-// 手駒に成りフラグは不要だが、これも含めておくと盤上の駒のbit数-1になるので
-// 全体のbit数が固定化できるのでこれも含めておくことにする。
+// When the piece on the board moves to the hand piece, the piece on the board becomes empty, so the box on the board can be expressed with 1 bit,
+// Since the hand piece can be expressed by 1 bit less than the piece on the board, the total number of bits does not change in the end.
+// Therefore, in this expression, any aspect can be expressed by this bit number.
+// It is a hand piece and no flag is required, but if you include this, the bit number of the piece on the board will be -1
+// Since the total number of bits can be fixed, we will include this as well.
 
 // Huffman Encoding
 //
@@ -120,8 +120,8 @@ private:
 
 struct HuffmanedPiece
 {
-  int code; // どうコード化されるか
-  int bits; // 何bit専有するのか
+  int code; // how it will be coded
+  int bits; // How many bits do you have
 };
 
 HuffmanedPiece huffman_table[] =
@@ -134,11 +134,11 @@ HuffmanedPiece huffman_table[] =
   {0b1001,4}, // QUEEN
 };
 
-// sfenを圧縮/解凍するためのクラス
-// sfenはハフマン符号化をすることで256bit(32bytes)にpackできる。
-// このことはなのはminiにより証明された。上のハフマン符号化である。
+// Class for compressing/decompressing sfen
+// sfen can be packed to 256bit (32bytes) by Huffman coding.
+// This is proven by mini. The above is Huffman coding.
 //
-// 内部フォーマット = 手番1bit+王の位置7bit*2 + 盤上の駒(ハフマン符号化) + 手駒(ハフマン符号化)
+// Internal format = 1-bit turn + 7-bit king position *2 + piece on board (Huffman coding) + hand piece (Huffman coding)
 // Side to move (White = 0, Black = 1) (1bit)
 // White King Position (6 bits)
 // Black King Position (6 bits)
@@ -152,21 +152,21 @@ HuffmanedPiece huffman_table[] =
 //
 struct SfenPacker
 {
-  // sfenをpackしてdata[32]に格納する。
+  // Pack sfen and store in data[32].
   void pack(const Position& pos)
   {
-//    cout << pos;
+// cout << pos;
 
     memset(data, 0, 32 /* 256bit */);
     stream.set_data(data);
 
-    // 手番
+    // turn
     // Side to move.
     stream.write_one_bit((int)(pos.side_to_move()));
 
-    // 先手玉、後手玉の位置、それぞれ7bit
+    // 7-bit positions for leading and trailing balls
     // White king and black king, 6 bits for each.
-    for(auto c : Colors)
+    for(auto c: Colors)
       stream.write_n_bit(pos.king_square(c), 6);
 
     // Write the pieces on the board other than the kings.
@@ -197,24 +197,24 @@ struct SfenPacker
 
     stream.write_n_bit(pos.state()->rule50, 6);
 
-    stream.write_n_bit(1 + (pos.game_ply() - (pos.side_to_move() == BLACK)) / 2, 8);
+    stream.write_n_bit(1 + (pos.game_ply()-(pos.side_to_move() == BLACK)) / 2, 8);
 
     assert(stream.get_cursor() <= 256);
   }
 
-  // pack()でpackされたsfen(256bit = 32bytes)
-  // もしくはunpack()でdecodeするsfen
+  // sfen packed by pack() (256bit = 32bytes)
+  // Or sfen to decode with unpack()
   uint8_t *data; // uint8_t[32];
 
 //private:
-  // Position::set_from_packed_sfen(uint8_t data[32])でこれらの関数を使いたいので筋は悪いがpublicにしておく。
+  // Position::set_from_packed_sfen(uint8_t data[32]) I want to use these functions, so the line is bad, but I want to keep it public.
 
   BitStream stream;
 
-  // 盤面の駒をstreamに出力する。
+  // Output the board pieces to stream.
   void write_board_piece_to_stream(Piece pc)
   {
-    // 駒種
+    // piece type
     PieceType pr = type_of(pc);
     auto c = huffman_table[pr];
     stream.write_n_bit(c.code, c.bits);
@@ -222,11 +222,11 @@ struct SfenPacker
     if (pc == NO_PIECE)
       return;
 
-    // 先後フラグ
+    // first and second flag
     stream.write_one_bit(color_of(pc));
   }
 
-  // 盤面の駒を1枚streamから読み込む
+  // Read one board piece from stream
   Piece read_board_piece_from_stream()
   {
     PieceType pr = NO_PIECE_TYPE;
@@ -238,7 +238,7 @@ struct SfenPacker
 
       assert(bits <= 6);
 
-      for (pr = NO_PIECE_TYPE; pr < KING; ++pr)
+      for (pr = NO_PIECE_TYPE; pr <KING; ++pr)
         if (huffman_table[pr].code == code
           && huffman_table[pr].bits == bits)
           goto Found;
@@ -247,7 +247,7 @@ struct SfenPacker
     if (pr == NO_PIECE_TYPE)
       return NO_PIECE;
 
-    // 先後フラグ
+    // first and second flag
     Color c = (Color)stream.read_one_bit();
     
     return make_piece(c, pr);
@@ -256,12 +256,12 @@ struct SfenPacker
 
 
 // -----------------------------------
-//        Positionクラスに追加
+// Add to Position class
 // -----------------------------------
 
-// 高速化のために直接unpackする関数を追加。かなりしんどい。
-// packer::unpack()とPosition::set()とを合体させて書く。
-// 渡された局面に問題があって、エラーのときは非0を返す。
+// Add a function that directly unpacks for speed. It's pretty tough.
+// Write it by combining packer::unpack() and Position::set().
+// If there is a problem with the passed phase and there is an error, non-zero is returned.
 int Position::set_from_packed_sfen(const PackedSfen& sfen , StateInfo * si, Thread* th, bool mirror)
 {
 	SfenPacker packer;
@@ -276,17 +276,17 @@ int Position::set_from_packed_sfen(const PackedSfen& sfen , StateInfo * si, Thre
 	// Active color
 	sideToMove = (Color)stream.read_one_bit();
 
-	// evalListのclear。上でmemsetでゼロクリアしたときにクリアされているが…。
+	// clear evalList. It is cleared when memset is cleared to zero above...
 	evalList.clear();
 
-	// PieceListを更新する上で、どの駒がどこにあるかを設定しなければならないが、
-	// それぞれの駒をどこまで使ったかのカウンター
+	// In updating the PieceList, we have to set which piece is where,
+	// A counter of how much each piece has been used
   PieceNumber next_piece_number = PIECE_NUMBER_ZERO;
 
   pieceList[W_KING][0] = SQUARE_NB;
   pieceList[B_KING][0] = SQUARE_NB;
 
-	// まず玉の位置
+	// First the position of the ball
 	if (mirror)
 	{
 		for (auto c : Colors)
@@ -308,7 +308,7 @@ int Position::set_from_packed_sfen(const PackedSfen& sfen , StateInfo * si, Thre
         sq = Mir(sq);
       }
 
-      // すでに玉がいるようだ
+      // it seems there are already balls
       Piece pc;
       if (type_of(board[sq]) != KING)
       {
@@ -318,26 +318,26 @@ int Position::set_from_packed_sfen(const PackedSfen& sfen , StateInfo * si, Thre
       else
       {
         pc = board[sq];
-        board[sq] = NO_PIECE; // いっかい取り除いておかないとput_piece()でASSERTに引っかかる。
+        board[sq] = NO_PIECE; // put_piece() will catch ASSERT unless you remove it all.
       }
 
-      // 駒がない場合もあるのでその場合はスキップする。
+      // There may be no pieces, so skip in that case.
       if (pc == NO_PIECE)
         continue;
 
       put_piece(Piece(pc), sq);
 
-      // evalListの更新
+      // update evalList
       PieceNumber piece_no =
-        (pc == B_KING) ? PIECE_NUMBER_BKING : // 先手玉
-        (pc == W_KING) ? PIECE_NUMBER_WKING : // 後手玉
-        next_piece_number++; // それ以外
+        (pc == B_KING) ?PIECE_NUMBER_BKING :// Move ball
+        (pc == W_KING) ?PIECE_NUMBER_WKING :// Backing ball
+        next_piece_number++; // otherwise
 
-      evalList.put_piece(piece_no, sq, pc); // sqの升にpcの駒を配置する
+      evalList.put_piece(piece_no, sq, pc); // Place the pc piece in the sq box
 
       //cout << sq << ' ' << board[sq] << ' ' << stream.get_cursor() << endl;
 
-      if (stream.get_cursor() > 256)
+      if (stream.get_cursor()> 256)
         return 1;
       //assert(stream.get_cursor() <= 256);
 
@@ -397,7 +397,7 @@ int Position::set_from_packed_sfen(const PackedSfen& sfen , StateInfo * si, Thre
 
   chess960 = false;
   thisThread = th;
-	set_state(st);
+set_state(st);
 
   //std::cout << *this << std::endl;
 
@@ -409,11 +409,11 @@ int Position::set_from_packed_sfen(const PackedSfen& sfen , StateInfo * si, Thre
 	return 0;
 }
 
-// 盤面と手駒、手番を与えて、そのsfenを返す。
+// Give the board, hand piece, and turn, and return the sfen.
 //std::string Position::sfen_from_rawdata(Piece board[81], Hand hands[2], Color turn, int gamePly_)
 //{
-//  // 内部的な構造体にコピーして、sfen()を呼べば、変換過程がそこにしか依存していないならば
-//  // これで正常に変換されるのでは…。
+// // Copy it to an internal structure and call sfen() if the conversion process depends only on it
+// // Maybe it will be converted normally...
 //  Position pos;
 //
 //  memcpy(pos.board, board, sizeof(Piece) * 81);
@@ -423,11 +423,11 @@ int Position::set_from_packed_sfen(const PackedSfen& sfen , StateInfo * si, Thre
 //
 //  return pos.sfen();
 //
-//  // ↑の実装、美しいが、いかんせん遅い。
-//  // 棋譜を大量に読み込ませて学習させるときにここがボトルネックになるので直接unpackする関数を書く。
+// // Implementation of ↑ is beautiful, but slow.
+// // This is a bottleneck when learning a large amount of game records, so write a function to unpack directly.
 //}
 
-// packされたsfenを得る。引数に指定したバッファに返す。
+// Get the packed sfen. Returns to the buffer specified in the argument.
 void Position::sfen_pack(PackedSfen& sfen)
 {
   SfenPacker sp;
@@ -435,14 +435,13 @@ void Position::sfen_pack(PackedSfen& sfen)
   sp.pack(*this);
 }
 
-//// packされたsfenを解凍する。sfen文字列が返る。
+//// Unpack the packed sfen. Returns an sfen string.
 //std::string Position::sfen_unpack(const PackedSfen& sfen)
 //{
-//  SfenPacker sp;
-//  sp.data = (uint8_t*)&sfen;
-//  return sp.unpack();
+// SfenPacker sp;
+// sp.data = (uint8_t*)&sfen;
+// return sp.unpack();
 //}
 
 
 #endif // USE_SFEN_PACKER
-
diff --git a/src/learn/half_float.h b/src/learn/half_float.h
index 31dc5a29..d5c2f83c 100644
--- a/src/learn/half_float.h
+++ b/src/learn/half_float.h
@@ -4,8 +4,8 @@
 // Half Float Library by yaneurao
 // (16-bit float)
 
-// 16bit型による浮動小数点演算
-// コンパイラの生成するfloat型のコードがIEEE 754の形式であると仮定して、それを利用する。
+// Floating point operation by 16bit type
+// Assume that the float type code generated by the compiler is in IEEE 754 format and use it.
 
 #include "../types.h"
 
@@ -99,7 +99,7 @@ namespace HalfFloat
 			return c.f;
 		}
 
-		// unit testになってないが、一応計算が出来ることは確かめた。コードはあとでなおす(かも)。
+		// It is not a unit test, but I confirmed that it can be calculated. I'll fix the code later (maybe).
 		static void unit_test()
 		{
 			float16 a, b, c, d;
@@ -130,4 +130,4 @@ namespace HalfFloat
 
 }
 
-#endif // __HALF_FLOAT_H__
+#endif // __HALF_FLOAT_H__
\ No newline at end of file
diff --git a/src/learn/learn.h b/src/learn/learn.h
index 246e5cc9..ab53e046 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -6,173 +6,173 @@
 #include <vector>
 
 // =====================
-//  学習時の設定
+// Settings for learning
 // =====================
 
-// 以下のいずれかを選択すれば、そのあとの細々したものは自動的に選択される。
-// いずれも選択しない場合は、そのあとの細々したものをひとつひとつ設定する必要がある。
+// If you select one of the following, the details after that will be automatically selected.
+// If you don't select any of them, you need to set the subsequent details one by one.
 
-// elmo方式での学習設定。これをデフォルト設定とする。
-// 標準の雑巾絞りにするためにはlearnコマンドで "lambda 1"を指定してやれば良い。
+// Learning setting by elmo method. This is the default setting.
+// To make a standard squeeze diaphragm, specify "lambda 1" with the learn command.
 #define LEARN_ELMO_METHOD
 
 
 // ----------------------
-//        更新式
+// update formula
 // ----------------------
 
-// AdaGrad。これが安定しているのでお勧め。
+// Ada Grad. Recommended because it is stable.
 // #define ADA_GRAD_UPDATE
 
-// 勾配の符号だけ見るSGD。省メモリで済むが精度は…。
+// SGD looking only at the sign of the gradient. It requires less memory, but the accuracy is...
 // #define SGD_UPDATE
 
 // ----------------------
-//    学習時の設定
+// Settings for learning
 // ----------------------
 
-// mini-batchサイズ。
-// この数だけの局面をまとめて勾配を計算する。
-// 小さくするとupdate_weights()の回数が増えるので収束が速くなる。勾配が不正確になる。
-// 大きくするとupdate_weights()の回数が減るので収束が遅くなる。勾配は正確に出るようになる。
-// 多くの場合において、この値を変更する必要はないと思う。
+// mini-batch size.
+// Calculate the gradient by combining this number of phases.
+// If you make it smaller, the number of update_weights() will increase and the convergence will be faster. The gradient is incorrect.
+// If you increase it, the number of update_weights() decreases, so the convergence will be slow. The slope will come out accurately.
+// I don't think you need to change this value in most cases.
 
 #define LEARN_MINI_BATCH_SIZE (1000 * 1000 * 1)
 
-// ファイルから1回に読み込む局面数。これだけ読み込んだあとshuffleする。
-// ある程度大きいほうが良いが、この数×40byte×3倍ぐらいのメモリを消費する。10M局面なら400MB*3程度消費する。
-// THREAD_BUFFER_SIZE(=10000)の倍数にすること。
+// The number of phases to read from the file at one time. After reading this much, shuffle.
+// It is better to have a certain size, but this number x 40 bytes x 3 times as much memory is consumed. 400MB*3 is consumed in the 10M phase.
+// Must be a multiple of THREAD_BUFFER_SIZE(=10000).
 
 #define LEARN_SFEN_READ_SIZE (1000 * 1000 * 10)
 
-// 学習時の評価関数の保存間隔。この局面数だけ学習させるごとに保存。
-// 当然ながら、保存間隔を長くしたほうが学習時間は短くなる。
-// フォルダ名は 0/ , 1/ , 2/ ...のように保存ごとにインクリメントされていく。
-// デフォルトでは10億局面に1回。
+// Saving interval of evaluation function at learning. Save each time you learn this number of phases.
+// Needless to say, the longer the saving interval, the shorter the learning time.
+// Folder name is incremented for each save like 0/, 1/, 2/...
+// By default, once every 1 billion phases.
 #define LEARN_EVAL_SAVE_INTERVAL (1000000000ULL)
 
 
 // ----------------------
-//    目的関数の選択
+// Select the objective function
 // ----------------------
 
-// 目的関数が勝率の差の二乗和
-// 詳しい説明は、learner.cppを見ること。
+// The objective function is the sum of squares of the difference in winning percentage
+// See learner.cpp for more information.
 
 //#define LOSS_FUNCTION_IS_WINNING_PERCENTAGE
 
-// 目的関数が交差エントロピー
-// 詳しい説明は、learner.cppを見ること。
-// いわゆる、普通の「雑巾絞り」
+// Objective function is cross entropy
+// See learner.cpp for more information.
+// So-called ordinary "rag cloth squeezer"
 //#define LOSS_FUNCTION_IS_CROSS_ENTOROPY
 
-// 目的関数が交差エントロピーだが、勝率の関数を通さない版
+// A version in which the objective function is cross entropy, but the win rate function is not passed
 // #define LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE
 
-// elmo(WCSC27)の方式
+// elmo (WCSC27) method
 // #define LOSS_FUNCTION_IS_ELMO_METHOD
 
-// ※　他、色々追加するかも。
+// ※ Other things may be added.
 
 
 // ----------------------
-// 学習に関するデバッグ設定
+// debug settings for learning
 // ----------------------
 
-// 学習時のrmseの出力をこの回数に1回に減らす。
-// rmseの計算は1スレッドで行なうためそこそこ時間をとられるので出力を減らすと効果がある。
+// Reduce the output of rmse during learning to 1 for this number of times.
+// rmse calculation is done in one thread, so it takes some time, so reducing the output is effective.
 #define LEARN_RMSE_OUTPUT_INTERVAL 1
 
 
 // ----------------------
-// ゼロベクトルからの学習
+// learning from zero vector
 // ----------------------
 
-// 評価関数パラメーターをゼロベクトルから学習を開始する。
-// ゼロ初期化して棋譜生成してゼロベクトルから学習させて、
-// 棋譜生成→学習を繰り返すとプロの棋譜に依らないパラメーターが得られる。(かも)
-// (すごく時間かかる)
+// Start learning the evaluation function parameters from the zero vector.
+// Initialize to zero, generate a game, learn from zero vector,
+// Game generation → If you repeat learning, you will get parameters that do not depend on the professional game. (maybe)
+// (very time consuming)
 
 //#define RESET_TO_ZERO_VECTOR
 
 
 // ----------------------
-//  学習のときの浮動小数
+// Floating point for learning
 // ----------------------
 
-// これをdoubleにしたほうが計算精度は上がるが、重み配列絡みのメモリが倍必要になる。
-// 現状、ここをfloatにした場合、評価関数ファイルに対して、重み配列はその4.5倍のサイズ。(KPPTで4.5GB程度)
-// double型にしても収束の仕方にほとんど差異がなかったのでfloatに固定する。
+// If this is set to double, the calculation accuracy will be higher, but the weight array entangled memory will be doubled.
+// Currently, if this is float, the weight array is 4.5 times the size of the evaluation function file. (About 4.5GB with KPPT)
+// Even if it is a double type, there is almost no difference in the way of convergence, so fix it to float.
 
-// floatを使う場合
+// when using float
 typedef float LearnFloatType;
 
-// doubleを使う場合
+// when using double
 //typedef double LearnFloatType;
 
-// float16を使う場合
+// when using float16
 //#include "half_float.h"
 //typedef HalfFloat::float16 LearnFloatType;
 
 // ----------------------
-//  省メモリ化
+// save memory
 // ----------------------
 
-// Weight配列(のうちのKPP)に三角配列を用いて省メモリ化する。
-// これを用いると、学習用の重み配列は評価関数ファイルの3倍程度で済むようになる。
+// Use a triangular array for the Weight array (of which is KPP) to save memory.
+// If this is used, the weight array for learning will be about 3 times as large as the evaluation function file.
 
 #define USE_TRIANGLE_WEIGHT_ARRAY
 
 // ----------------------
-//  次元下げ
+// dimension down
 // ----------------------
 
-// ミラー(左右対称性)、インバース(先後対称性)に関して次元下げを行なう。
-// デフォルトではすべてオン。
+// Dimension reduction for mirrors (left/right symmetry) and inverse (forward/backward symmetry).
+// All on by default.
 
-// KKに対してミラー、インバースを利用した次元下げを行なう。(効果のほどは不明)
-// USE_KK_INVERSE_WRITEをオンにするときはUSE_KK_MIRROR_WRITEもオンでなければならない。
+// Dimension reduction using mirror and inverse for KK. (Unclear effect)
+// USE_KK_MIRROR_WRITE must be on when USE_KK_INVERSE_WRITE is on.
 #define USE_KK_MIRROR_WRITE
 #define USE_KK_INVERSE_WRITE
 
-// KKPに対してミラー、インバースを利用した次元下げを行なう。(インバースのほうは効果のほどは不明)
-// USE_KKP_INVERSE_WRITEをオンにするときは、USE_KKP_MIRROR_WRITEもオンになっていなければならない。
+// Dimension reduction using Mirror and Inverse for KKP. (Inverse is not so effective)
+// When USE_KKP_INVERSE_WRITE is turned on, USE_KKP_MIRROR_WRITE must also be turned on.
 #define USE_KKP_MIRROR_WRITE
 #define USE_KKP_INVERSE_WRITE
 
-// KPPに対してミラーを利用した次元下げを行なう。(これをオフにすると教師局面が倍ぐらい必要になる)
-// KPPにはインバースはない。(先手側のKしかないので)
+// Perform dimension reduction using a mirror for KPP. (Turning this off requires double the teacher position)
+// KPP has no inverse. (Because there is only K on the front side)
 #define USE_KPP_MIRROR_WRITE
 
-// KPPPに対してミラーを利用した次元下げを行なう。(これをオフにすると教師局面が倍ぐらい必要になる)
-// KPPPにもインバースはない。(先手側のKしかないので)
+// Perform a dimension reduction using a mirror for KPPP. (Turning this off requires double the teacher position)
+// KPPP has no inverse. (Because there is only K on the front side)
 #define USE_KPPP_MIRROR_WRITE
 
-// KKPP成分に対して学習時にKPPによる次元下げを行なう。
-// 学習、めっちゃ遅くなる。
-// 未デバッグなので使わないこと。
+// Reduce the dimension by KPP for learning the KKPP component.
+// Learning is very slow.
+// Do not use as it is not debugged.
 //#define USE_KKPP_LOWER_DIM
 
 
 // ======================
-//  教師局面生成時の設定
+// Settings for creating teacher phases
 // ======================
 
 // ----------------------
-//  引き分けを書き出す
+// write out the draw
 // ----------------------
 
-// 引き分けに至ったとき、それを教師局面として書き出す
-// これをするほうが良いかどうかは微妙。
+// When you reach a draw, write it out as a teacher position
+// It's subtle whether it's better to do this.
 // #define LEARN_GENSFEN_USE_DRAW_RESULT
 
 
 // ======================
-//       configure
+// configure
 // ======================
 
 // ----------------------
-//  elmo(WCSC27)の方法での学習
+// Learning with the method of elmo (WCSC27)
 // ----------------------
 
 #if defined( LEARN_ELMO_METHOD )
@@ -182,49 +182,49 @@ typedef float LearnFloatType;
 
 
 // ----------------------
-// Learnerで用いるstructの定義
+// Definition of struct used in Learner
 // ----------------------
 #include "../position.h"
 
 namespace Learner
 {
-	// PackedSfenと評価値が一体化した構造体
-	// オプションごとに書き出す内容が異なると教師棋譜を再利用するときに困るので
-	// とりあえず、以下のメンバーはオプションによらずすべて書き出しておく。
+	//Structure in which PackedSfen and evaluation value are integrated
+	// If you write different contents for each option, it will be a problem when reusing the teacher game
+	// For the time being, write all the following members regardless of the options.
 	struct PackedSfenValue
 	{
-		// 局面
+		// phase
 		PackedSfen sfen;
 
-		// Learner::search()から返ってきた評価値
+		// Evaluation value returned from Learner::search()
 		int16_t score;
 
-		// PVの初手
-		// 教師との指し手一致率を求めるときなどに用いる
+		// PV first move
+		// Used when finding the match rate with the teacher
 		uint16_t move;
 
-		// 初期局面からの局面の手数。
+		// Trouble of the phase from the initial phase.
 		uint16_t gamePly;
 
-		// この局面の手番側が、ゲームを最終的に勝っているなら1。負けているなら-1。
-		// 引き分けに至った場合は、0。
-		// 引き分けは、教師局面生成コマンドgensfenにおいて、
-		// LEARN_GENSFEN_DRAW_RESULTが有効なときにだけ書き出す。
+		// 1 if the player on this side ultimately wins the game. -1 if you are losing.
+		// 0 if a draw is reached.
+		// The draw is in the teacher position generation command gensfen,
+		// Only write if LEARN_GENSFEN_DRAW_RESULT is enabled.
 		int8_t game_result;
 
-		// 教師局面を書き出したファイルを他の人とやりとりするときに
-		// この構造体サイズが不定だと困るため、paddingしてどの環境でも必ず40bytesになるようにしておく。
+		// When exchanging the file that wrote the teacher aspect with other people
+		//Because this structure size is not fixed, pad it so that it is 40 bytes in any environment.
 		uint8_t padding;
 
 		// 32 + 2 + 2 + 2 + 1 + 1 = 40bytes
 	};
 
-	// 読み筋とそのときの評価値を返す型
-	// Learner::search() , Learner::qsearch()で用いる。
+	// Type that returns the reading line and the evaluation value at that time
+	// Used in Learner::search(), Learner::qsearch().
 	typedef std::pair<Value, std::vector<Move> > ValueAndPV;
 
-	// いまのところ、やねうら王2018 Otafukuしか、このスタブを持っていないが
-	// EVAL_LEARNをdefineするなら、このスタブが必須。
+	// So far, only Yaneura King 2018 Otafuku has this stub
+	// This stub is required if EVAL_LEARN is defined.
 	extern Learner::ValueAndPV  search(Position& pos, int depth , size_t multiPV = 1 , uint64_t NodesLimit = 0);
 	extern Learner::ValueAndPV qsearch(Position& pos);
 
@@ -234,4 +234,4 @@ namespace Learner
 
 #endif
 
-#endif // ifndef _LEARN_H_
+#endif // ifndef _LEARN_H_
\ No newline at end of file
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index f105296f..fb0d9959 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -1,16 +1,16 @@
-﻿// 学習関係のルーチン
+﻿// learning routines
 //
-// 1) 棋譜の自動生成
-//   → "gensfen"コマンド
-// 2) 生成した棋譜からの評価関数パラメーターの学習
-//   → "learn"コマンド
-//   → 教師局面のshuffleもこのコマンドの拡張として行なう。
-//   例) "learn shuffle"
-// 3) 定跡の自動生成
-//   → "makebook think"コマンド
-//   → extra/book/book.cppで実装
-// 4) 局後自動検討モード
-//   →　GUIが補佐すべき問題なのでエンジンでは関与しないことにする。
+// 1) Automatic generation of game records
+// → "gensfen" command
+// 2) Learning evaluation function parameters from the generated game record
+// → "learn" command
+// → Shuffle in the teacher phase is also an extension of this command.
+// Example) "learn shuffle"
+// 3) Automatic generation of fixed traces
+// → "makebook think" command
+// → implemented in extra/book/book.cpp
+// 4) Post-station automatic review mode
+// → I will not be involved in the engine because it is a problem that the GUI should assist.
 // etc..
 
 #if defined(EVAL_LEARN)
@@ -23,15 +23,15 @@
 #include "multi_think.h"
 #include "../uci.h"
 
-// 学習用のevaluate絡みのheader
+// evaluate header for learning
 #include "../eval/evaluate_common.h"
 
 // ----------------------
-// 設定内容に基づく定数文字列
+// constant string based on the settings
 // ----------------------
 
-// 更新式に応じた文字列。(デバッグ用に出力する。)
-// 色々更新式を実装したがAdaGradが速度面、メモリ面においてベストという結論になった。
+// Character string according to update formula. (Output for debugging.)
+// Implemented various update expressions, but concluded that AdaGrad is the best in terms of speed and memory.
 #if defined(ADA_GRAD_UPDATE)
 #define LEARN_UPDATE "AdaGrad"
 #elif defined(SGD_UPDATE)
@@ -49,7 +49,7 @@
 #endif
 
 // -----------------------------------
-//    以下、実装部。
+// Below, the implementation section.
 // -----------------------------------
 
 #include <sstream>
@@ -65,9 +65,9 @@
 #endif
 
 #if defined(_MSC_VER)
-// C++のfilesystemは、C++17以降か、MSVCでないと使えないようだ。
-// windows.hを使うようにしたが、msys2のg++だとうまくフォルダ内のファイルが取得できない。
-// 仕方ないのでdirent.hを用いる。
+// The C++ filesystem cannot be used unless it is C++17 or later or MSVC.
+// I tried to use windows.h, but with g++ of msys2 I can not get the files in the folder well.
+// Use dirent.h because there is no help for it.
 #include <filesystem>
 #elif defined(__GNUC__)
 #include <dirent.h>
@@ -87,17 +87,17 @@
 
 using namespace std;
 
-//// これは探索部で定義されているものとする。
+//// This is defined in the search section.
 //extern Book::BookMoveSelector book;
 
-// atomic<T>に対する足し算、引き算の定義
-// Apery/learner.hppにあるatomicAdd()に合わせてある。
+// Addition and subtraction definition for atomic<T>
+// Aligned with atomicAdd() in Apery/learner.hpp.
 template <typename T>
 T operator += (std::atomic<T>& x, const T rhs)
 {
 	T old = x.load(std::memory_order_consume);
-	// このタイミングで他スレッドから値が書き換えられることは許容する。
-	// 値が破壊されなければ良しという考え。
+	// It is allowed that the value is rewritten from other thread at this timing.
+	// The idea that the value is not destroyed is good.
 	T desired = old + rhs;
 	while (!x.compare_exchange_weak(old, desired, std::memory_order_release, std::memory_order_consume))
 		desired = old + rhs;
@@ -109,7 +109,7 @@ T operator -= (std::atomic<T>& x, const T rhs) { return x += -rhs; }
 namespace Learner
 {
 
-// 局面の配列 : PSVector は packed sfen vector の略。
+// Phase array: PSVector stands for packed sfen vector.
 typedef std::vector<PackedSfenValue> PSVector;
 
 bool use_draw_in_training_data_generation = false;
@@ -118,20 +118,20 @@ bool use_draw_in_validation = false;
 bool use_hash_in_training = true;
 
 // -----------------------------------
-//    局面のファイルへの書き出し
+// write phase file
 // -----------------------------------
 
-// Sfenを書き出して行くためのヘルパクラス
+// Helper class for exporting Sfen
 struct SfenWriter
 {
-	// 書き出すファイル名と生成するスレッドの数
+		// File name to write and number of threads to create
 	SfenWriter(string filename, int thread_num)
 	{
 		sfen_buffers_pool.reserve((size_t)thread_num * 10);
 		sfen_buffers.resize(thread_num);
 
-		// 追加学習するとき、評価関数の学習後も生成される教師の質はあまり変わらず、教師局面数を稼ぎたいので
-		// 古い教師も使うのが好ましいのでこういう仕様にしてある。
+		// When performing additional learning, the quality of the teacher generated after learning the evaluation function does not change much and I want to earn more teacher positions.
+		// Since it is preferable that old teachers also use it, it has such a specification.
 		fs.open(filename, ios::out | ios::binary | ios::app);
 		filename_ = filename;
 
@@ -144,76 +144,76 @@ struct SfenWriter
 		file_worker_thread.join();
 		fs.close();
 
-		// file_worker_threadがすべて書き出したあとなのでbufferはすべて空のはずなのだが..
+		// all buffers should be empty since file_worker_thread has written all..
 		for (auto p : sfen_buffers) { assert(p == nullptr); }
 		assert(sfen_buffers_pool.empty());
 	}
 
-	// 各スレッドについて、この局面数ごとにファイルにflushする。
+	// For each thread, flush the file by this number of phases.
 	const size_t SFEN_WRITE_SIZE = 5000;
 
-	// 局面と評価値をペアにして1つ書き出す(packされたsfen形式で)
+	// write one by pairing the phase and evaluation value (in packed sfen format)
 	void write(size_t thread_id, const PackedSfenValue& psv)
 	{
-		// スレッドごとにbufferを持っていて、そこに追加する。
-		// bufferが溢れたら、ファイルに書き出す。
+		// We have a buffer for each thread and add it there.
+		// If the buffer overflows, write it to a file.
 
-		// このバッファはスレッドごとに用意されている。
+		// This buffer is prepared for each thread.
 		auto& buf = sfen_buffers[thread_id];
 
-		// 初回とスレッドバッファを書き出した直後はbufがないので確保する。
+		// Secure since there is no buf at the first time and immediately after writing the thread buffer.
 		if (!buf)
 		{
 			buf = new PSVector();
 			buf->reserve(SFEN_WRITE_SIZE);
 		}
 
-		// スレッドごとに用意されており、一つのスレッドが同時にこのwrite()関数を呼び出さないので
-		// この時点では排他する必要はない。
+		// It is prepared for each thread, so one thread does not call this write() function at the same time.
+		// There is no need to exclude at this point.
 		buf->push_back(psv);
 
 		if (buf->size() >= SFEN_WRITE_SIZE)
 		{
-			// sfen_buffers_poolに積んでおけばあとはworkerがよきに計らってくれる。
+			// If you load it in sfen_buffers_pool, the worker will do the rest.
 
-			// sfen_buffers_poolの内容を変更するときはmutexのlockが必要。
+			// Mutex lock is required when changing the contents of sfen_buffers_pool.
 			std::unique_lock<std::mutex> lk(mutex);
 			sfen_buffers_pool.push_back(buf);
 
 			buf = nullptr;
-			// buf == nullptrにしておけば次回にこの関数が呼び出されたときにバッファは確保される。
+			// If you set buf == nullptr, the buffer will be allocated the next time this function is called.
 		}
 	}
 
-	// 自分のスレッド用のバッファに残っている分をファイルに書き出すためのバッファに移動させる。
+	// Move what remains in the buffer for your thread to a buffer for writing to a file.
 	void finalize(size_t thread_id)
 	{
 		std::unique_lock<std::mutex> lk(mutex);
 
 		auto& buf = sfen_buffers[thread_id];
 
-		// buf==nullptrであるケースもあるのでそのチェックが必要。
+		// There is a case that buf==nullptr, so that check is necessary.
 		if (buf && buf->size() != 0)
 			sfen_buffers_pool.push_back(buf);
 
 		buf = nullptr;
 	}
 
-	// write_workerスレッドを開始する。
+	// Start the write_worker thread.
 	void start_file_write_worker()
 	{
 		file_worker_thread = std::thread([&] { this->file_write_worker(); });
 	}
 
-	// ファイルに書き出すの専用スレッド
+	// Dedicated thread to write to file
 	void file_write_worker()
 	{
 		auto output_status = [&]()
 		{
-			// 現在時刻も出力
+			// also output the current time
 			sync_cout << endl << sfen_write_count << " sfens , at " << now_string() << sync_endl;
 
-			// flush()はこのタイミングで十分。
+			// This is enough for flush().
 			fs.flush();
 		};
 
@@ -223,12 +223,12 @@ struct SfenWriter
 			{
 				std::unique_lock<std::mutex> lk(mutex);
 
-				// まるごとコピー
+				// copy the whole
 				buffers = sfen_buffers_pool;
 				sfen_buffers_pool.clear();
 			}
 
-			// 何も取得しなかったならsleep()
+			// sleep() if you didn't get anything
 			if (!buffers.size())
 				sleep(100);
 			else
@@ -240,83 +240,83 @@ struct SfenWriter
 					sfen_write_count += ptr->size();
 
 #if 1
-					// 処理した件数をここに加算していき、save_everyを超えたら、ファイル名を変更し、このカウンターをリセットする。
+					// Add the processed number here, and if it exceeds save_every, change the file name and reset this counter.
 					save_every_counter += ptr->size();
 					if (save_every_counter >= save_every)
 					{
 						save_every_counter = 0;
-						// ファイル名を変更。
+						// Change the file name.
 
 						fs.close();
 
-						// ファイルにつける連番
+						// Sequential number attached to the file
 						int n = (int)(sfen_write_count / save_every);
-						// ファイル名を変更して再度openする。上書き考慮してios::appをつけておく。(運用によっては、ないほうがいいかも..)
+						// Rename the file and open it again. Add ios::app in consideration of overwriting. (Depending on the operation, it may not be necessary.)
 						string filename = filename_ + "_" + std::to_string(n);
 						fs.open(filename, ios::out | ios::binary | ios::app);
 						cout << endl << "output sfen file = " << filename << endl;
 					}
 #endif
 
-					// 棋譜を書き出すごとに'.'を出力。
+					// Output'.' every time when writing a game record.
 					std::cout << ".";
 
-					// 40回ごとに処理した局面数を出力
-					// 最後、各スレッドの教師局面の余りを書き出すので中途半端な数が表示されるが、まあいいか…。
-					// スレッドを論理コアの最大数まで酷使するとコンソールが詰まるのでもう少し間隔甘くてもいいと思う。
+					// Output the number of phases processed every 40 times
+					// Finally, the remainder of the teacher phase of each thread is written out, so halfway numbers are displayed, but is it okay?
+					// If you overuse the threads to the maximum number of logical cores, the console will be clogged, so it may be a little more loose.
 					if ((++time_stamp_count % 40) == 0)
 						output_status();
 
-					// このメモリは不要なのでこのタイミングで開放しておく。
+					// Since this memory is unnecessary, release it at this timing.
 					delete ptr;
 				}
 			}
 		}
 
-		// 終了前にもう一度、タイムスタンプを出力。
+		// Output the time stamp again before the end.
 		output_status();
 	}
 
-	// この単位でファイル名を変更する。
+	// Change the file name in this unit.
 	uint64_t save_every = UINT64_MAX;
 
 private:
 
 	fstream fs;
 
-	// コンストラクタで渡されたファイル名
+	// File name passed in the constructor
 	std::string filename_;
 
-	// 処理した件数をここに加算していき、save_everyを超えたら、ファイル名を変更し、このカウンターをリセットする。
+	// Add the processed number here, and if it exceeds save_every, change the file name and reset this counter.
 	uint64_t save_every_counter = 0;
 
-	// ファイルに書き込む用のthread
+	// thread to write to the file
 	std::thread file_worker_thread;
-	// すべてのスレッドが終了したかのフラグ
+	// Flag that all threads have finished
 	atomic<bool> finished;
 
-	// タイムスタンプの出力用のカウンター
+	// Counter for time stamp output
 	uint64_t time_stamp_count = 0;
 
-	// ファイルに書き出す前のバッファ
-	// sfen_buffersは各スレッドに対するバッファ
-	// sfen_buffers_poolは書き出しのためのバッファ。
-	// 前者のバッファに局面をSFEN_WRITE_SIZEだけ積んだら、後者に積み替える。
+	// buffer before writing to file
+	// sfen_buffers is the buffer for each thread
+	// sfen_buffers_pool is a buffer for writing.
+	// After loading the phase in the former buffer by SFEN_WRITE_SIZE, transfer it to the latter.
 	std::vector<PSVector*> sfen_buffers;
 	std::vector<PSVector*> sfen_buffers_pool;
 
-	// sfen_buffers_poolにアクセスするときに必要なmutex
+	// Mutex required to access sfen_buffers_pool
 	std::mutex mutex;
 
-	// 書きだした局面の数
+	// number of written phases
 	uint64_t sfen_write_count = 0;
 };
 
 // -----------------------------------
-//  棋譜を生成するworker(スレッドごと)
+// worker that creates the game record (for each thread)
 // -----------------------------------
 
-// 複数スレッドでsfenを生成するためのクラス
+// Class to generate sfen with multiple threads
 struct MultiThinkGenSfen : public MultiThink
 {
 	MultiThinkGenSfen(int search_depth_, int search_depth2_, SfenWriter& sw_)
@@ -324,49 +324,49 @@ struct MultiThinkGenSfen : public MultiThink
 	{
 		hash.resize(GENSFEN_HASH_SIZE);
 
-		// PCを並列化してgensfenするときに同じ乱数seedを引いていないか確認用の出力。
+		// Output for confirmation if the same random seed is not drawn when parallelizing and gensfening the PC.
 		std::cout << prng << std::endl;
 	}
 
 	virtual void thread_worker(size_t thread_id);
 	void start_file_write_worker() { sw.start_file_write_worker(); }
 
-	//  search_depth = 通常探索の探索深さ
+	// search_depth = search depth for normal search
 	int search_depth;
 	int search_depth2;
 
-	// 生成する局面の評価値の上限
+	// Upper limit of evaluation value of generated situation
 	int eval_limit;
 
-	// ランダムムーブを行なう最小ply
+	// minimum ply with random move
 	int random_move_minply;
-	// ランダムムーブを行なう最大ply
+	// maximum ply with random move
 	int random_move_maxply;
-	// 1局のなかでランダムムーブを行なう回数
+	// Number of random moves in one station
 	int random_move_count;
-	// Aperyのようにランダムムーブのときに1/Nの確率で玉を動かす。
-	// また玉を動かしたときは1/Nの確率で相手番で1回ランダムムーブする。
-	// AperyはN=2。ここ0を指定するとこの機能を無効化する。
+	// Move balls with a probability of 1/N when randomly moving like Apery.
+	// When you move the ball again, there is a 1/N chance that it will randomly move once in the opponent's number.
+	// Apery has N=2. Specifying 0 here disables this function.
 	int random_move_like_apery;
 
-	// ランダムムーブの代わりにmulti pvを使うとき用。
-	// random_multi_pvは、MultiPVのときの候補手の数。
-	// 候補手の指し手を採択するとき、1位の指し手の評価値とN位の指し手の評価値との差が
-	// random_multi_pv_diffの範囲でなければならない。
-	// random_multi_pv_depthはMultiPVのときの探索深さ。
+	// For when using multi pv instead of random move.
+	// random_multi_pv is the number of candidates for MultiPV.
+	// When adopting the move of the candidate move, the difference between the evaluation value of the move of the 1st place and the evaluation value of the move of the Nth place is
+	// Must be in the range random_multi_pv_diff.
+	// random_multi_pv_depth is the search depth for MultiPV.
 	int random_multi_pv;
 	int random_multi_pv_diff;
 	int random_multi_pv_depth;
 
-	// 書き出す局面のply(初期局面からの手数)の最小、最大。
+	// The minimum and maximum ply (number of steps from the initial phase) of the phase to write out.
 	int write_minply;
 	int write_maxply;
 
-	// sfenの書き出し器
+	// sfen exporter
 	SfenWriter& sw;
 
-	// 同一局面の書き出しを制限するためのhash
-	// hash_indexを求めるためのmaskに使うので、2**Nでなければならない。
+	// hash to limit the export of the same phase
+	// It must be 2**N because it will be used as the mask to calculate hash_index.
 	static const uint64_t GENSFEN_HASH_SIZE = 64 * 1024 * 1024;
 
 	vector<Key> hash; // 64MB*sizeof(HASH_KEY) = 512MB
@@ -375,25 +375,25 @@ struct MultiThinkGenSfen : public MultiThink
 //  thread_id    = 0..Threads.size()-1
 void MultiThinkGenSfen::thread_worker(size_t thread_id)
 {
-	// とりあえず、書き出す手数の最大のところで引き分け扱いになるものとする。
+	// For the time being, it will be treated as a draw at the maximum number of steps to write.
 	const int MAX_PLY2 = write_maxply;
 
-	// StateInfoを最大手数分 + SearchのPVでleafにまで進めるbuffer
+	//Maximum StateInfo + Search PV to advance to leaf buffer
 	std::vector<StateInfo,AlignedAllocator<StateInfo>> states(MAX_PLY2 + MAX_PLY /* == search_depth + α */);
 	StateInfo si;
 
-	// 今回の指し手。この指し手で局面を進める。
+	// This move. Use this move to advance the stage.
 	Move m = MOVE_NONE;
 
-	// 終了フラグ
+	// end flag
 	bool quit = false;
 
-	// 規定回数回になるまで繰り返し
+	// repeat until the specified number of times
 	while (!quit)
 	{
-		// Positionに対して従属スレッドの設定が必要。
-		// 並列化するときは、Threads (これが実体が vector<Thread*>なので、
-		// Threads[0]...Threads[thread_num-1]までに対して同じようにすれば良い。
+		// It is necessary to set a dependent thread for Position.
+		// When parallelizing, Threads (since this is a vector<Thread*>,
+		// Do the same for up to Threads[0]...Threads[thread_num-1].
 		auto th = Threads[thread_id];
 
 		auto& pos = th->rootPos;
@@ -409,42 +409,42 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
     //  assert(actual == StartFEN);
     //}
 
-		// 探索部で定義されているBookMoveSelectorのメンバを参照する。
+		// Refer to the members of BookMoveSelector defined in the search section.
 		//auto& book = ::book;
 
-		// 1局分の局面を保存しておき、終局のときに勝敗を含めて書き出す。
-		// 書き出す関数は、この下にあるflush_psv()である。
+		// Save the situation for one station, and write it out including the winning and losing at the end.
+		// The function to write is flush_psv() below this.
 		PSVector a_psv;
 		a_psv.reserve(MAX_PLY2 + MAX_PLY);
 
-		// a_psvに積まれている局面をファイルに書き出す。
-		// lastTurnIsWin : a_psvに積まれている最終局面の次の局面での勝敗
-		// 勝ちのときは1。負けのときは-1。引き分けのときは0を渡す。
-		// 返し値 : もう規定局面数に達したので終了する場合にtrue。
+		// Write out the phases loaded in a_psv to a file.
+		// lastTurnIsWin: win/loss in the next phase after the final phase in a_psv
+		// 1 when winning. -1 when losing. Pass 0 for a draw.
+		// Return value: true if the specified number of phases has already been reached and the process ends.
 		auto flush_psv = [&](int8_t lastTurnIsWin)
 		{
 			int8_t isWin = lastTurnIsWin;
 
-			// 終局の局面(の一つ前)から初手に向けて、各局面に関して、対局の勝敗の情報を付与しておく。
-			// a_psvに保存されている局面は(手番的に)連続しているものとする。
+			// From the final stage (one step before) to the first stage, give information on the outcome of the game for each stage.
+			// The phases stored in a_psv are assumed to be continuous (in order).
 			for (auto it = a_psv.rbegin(); it != a_psv.rend(); ++it)
 			{
-				// isWin == 0(引き分け)なら -1を掛けても 0(引き分け)のまま
+				// If isWin == 0 (draw), multiply by -1 and it will remain 0 (draw)
 				isWin = - isWin;
 				it->game_result = isWin;
 
-				// 局面を書き出そうと思ったら規定回数に達していた。
-				// get_next_loop_count()内でカウンターを加算するので
-				// 局面を出力したときにこれを呼び出さないとカウンターが狂う。
+				// When I tried to write out the phase, it reached the specified number of times.
+				// Because the counter is added in get_next_loop_count()
+				// If you don't call this when the phase is output, the counter goes crazy.
 				auto loop_count = get_next_loop_count();
 				if (loop_count == UINT64_MAX)
 				{
-					// 終了フラグを立てておく。
+					// Set the end flag.
 					quit = true;
 					return;
 				}
 
-				// 局面を一つ書き出す。
+				// Write out one aspect.
 				sw.write(thread_id, *it);
 
 #if 0
@@ -454,30 +454,30 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 			}
 		};
 
-		// ply手目でランダムムーブをするかどうかのフラグ
+		// ply flag for whether or not to randomly move by eyes
 		vector<bool> random_move_flag;
 		{
-			// ランダムムーブを入れるならrandom_move_maxply手目までに絶対にrandom_move_count回入れる。
-			// そこそこばらけて欲しい。
-			// どれくらいがベストなのかはよくわからない。色々条件を変えて実験中。
-			
-			// a[0] = 0 , a[1] = 1, ... みたいな配列を作って、これを
-			// Fisher-Yates shuffleして先頭のN個を取り出せば良い。
-			// 実際には、N個欲しいだけなので先頭N個分だけFisher-Yatesでshuffleすれば良い。
+			// If you want to add a random move, random_move_maxply be sure to enter random_move_count times before the first move.
+			// I want you to disperse so much.
+			// I'm not sure how best it is. Experimenting under various conditions.
+
+			// Make an array like a[0] = 0 ,a[1] = 1, ...
+			// Fisher-Yates shuffle and take out the first N items.
+			// Actually, I only want N pieces, so I only need to shuffle the first N pieces with Fisher-Yates.
 
 			vector<int> a;
 			a.reserve((size_t)random_move_maxply);
 
-			// random_move_minply , random_move_maxplyは1 originで指定されるが、
-			// ここでは0 originで扱っているので注意。
+			// random_move_minply ,random_move_maxply is specified by 1 origin,
+			// Note that we are handling 0 origin here.
 			for (int i = std::max(random_move_minply - 1 , 0) ; i < random_move_maxply; ++i)
 				a.push_back(i);
 
-			// Apery方式のランダムムーブの場合、insert()がrandom_move_count回呼び出される可能性があるので
-			// それを考慮したサイズだけ確保しておく。
+			// In case of Apery random move, insert() may be called random_move_count times.
+			// Reserve only the size considering it.
 			random_move_flag.resize((size_t)random_move_maxply + random_move_count);
 
-			// a[]のsize()を超える回数のランダムムーブは適用できないので制限する。
+			// A random move that exceeds the size() of a[] cannot be applied, so limit it.
 			for (int i = 0 ; i < std::min(random_move_count, (int)a.size()) ; ++i)
 			{
 				swap(a[i], a[prng.rand((uint64_t)a.size() - i) + i]);
@@ -485,26 +485,26 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 			}
 		}
 
-		// random moveを行なった回数をカウントしておくカウンター
-		// random_move_minply == -1のときに、連続してランダムムーブを行なうので、このときに用いる。
+		// A counter that keeps track of the number of random moves
+		// When random_move_minply == -1, random moves are performed continuously, so use it at this time.
 		int random_move_c = 0;
 
-		// ply : 初期局面からの手数
+		// ply: steps from the initial stage
 		for (int ply = 0; ; ++ply)
 		{
 			//cout << pos << endl;
 
-			// 今回の探索depth
-			// gotoで飛ぶので先に宣言しておく。
+			// Current search depth
+			// Goto will fly, so declare it first.
 			int depth = search_depth + (int)prng.rand(search_depth2 - search_depth + 1);
 
-			// 長手数に達したのか
+			// has it reached the length
 			if (ply >= MAX_PLY2)
 			{
 				if (use_draw_in_training_data_generation) {
-					// 勝敗 = 引き分けとして書き出す。
-					// こうしたほうが自分が入玉したときに、相手の入玉を許しにくい(かも)
-					flush_psv(0);
+				// Write out as win/loss = draw.
+				// This way it is harder to allow the opponent to enter the ball when I enter (may)
+				flush_psv(0);
 				}
 				break;
 			}
@@ -514,13 +514,13 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 			  // Write if draw.
 			  flush_psv(0);
 		  }
-          break;
+        break;
       }
 
-			// 全駒されて詰んでいたりしないか？
-			if (MoveList<LEGAL>(pos).size() == 0) // Can be mate or stalemate
+			// Isn't all pieces stuck and stuck?
+			if (MoveList<LEGAL>(pos).size() == 0)
 			{
-        // (この局面の一つ前の局面までは書き出す)
+        // (write up to the previous phase of this phase)
         // Write the positions other than this position if checkmated.
                 if (pos.checkers()) // Mate
                     flush_psv(-1);
@@ -530,63 +530,63 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 				break;
 			}
 
-			//// 定跡
+			//// constant track
 			//if ((m = book.probe(pos)) != MOVE_NONE)
 			//{
-			//	// 定跡にhitした。
-			//	// その指し手はmに格納された。
+			//  // Hit the constant track.
+			//  // The move was stored in m.
 
-			//	// 定跡の局面は学習には用いない。
-			//	a_psv.clear();
+			//  // Do not use the fixed phase for learning.
+			//  a_psv.clear();
 
-			//	if (random_move_minply != -1)
-			//		// 定跡の局面であっても、一定確率でランダムムーブは行なう。
-			//		goto RANDOM_MOVE;
-			//	else
-			//		// random_move_minplyとして-1が指定されているときは定跡を抜けるところまでは定跡に従って指す。
-			//		// 巨大定跡を用いて、ConsiderBookMoveCount trueとして定跡を抜けた局面を無数に用意しておき、
-			//		// そこから5回ランダムムーブを行なう、などの用途に用いる。
-			//		goto DO_MOVE;
+			//  if (random_move_minply != -1)
+			// 		// Random move is performed with a certain probability even in the constant phase.
+			// 		goto RANDOM_MOVE;
+			//  else
+			// 		// When -1 is specified as random_move_minply, it points according to the standard until it goes out of the standard.
+			// 		// Prepare an innumerable number of situations that have left the constant as ConsiderationBookMoveCount true using a huge constant
+			// 		// Used for purposes such as performing a random move 5 times from there.
+			// 		goto DO_MOVE;
 			//}
 
 			{
-				// search_depth～search_depth2 手読みの評価値とPV(最善応手列)
-				// 探索窓を狭めておいても問題ないはず。
+				// search_depth～search_depth2 Evaluation value of hand reading and PV (best responder row)
+				// There should be no problem if you narrow the search window.
 
 				auto pv_value1 = search(pos, depth);
 
 				auto value1 = pv_value1.first;
 				auto& pv1 = pv_value1.second;
 
-				// 評価値の絶対値がこの値以上の局面については
-				// その局面を学習に使うのはあまり意味がないのでこの試合を終了する。
-				// これをもって勝敗がついたという扱いをする。
+				// For situations where the absolute evaluation value is greater than or equal to this value
+				// It doesn't make much sense to use that aspect for learning, so this game ends.
+				// Treat this as having won or lost.
 
-				// 1手詰め、宣言勝ちならば、ここでmate_in(2)が返るのでeval_limitの上限値と同じ値になり、
-				// このif式は必ず真になる。resignについても同様。
+				// If you win one move, declarative win, mate_in(2) will be returned here, so it will be the same value as the upper limit of eval_limit,
+				// This if expression is always true. The same applies to resign.
 
 				if (abs(value1) >= eval_limit)
 				{
-//					sync_cout << pos << "eval limit = " << eval_limit << " over , move = " << pv1[0] << sync_endl;
+					// sync_cout << pos << "eval limit = "<< eval_limit << "over ,move = "<< pv1[0] << sync_endl;
 
-					// この局面でvalue1 >= eval_limitならば、(この局面の手番側の)勝ちである。
+					// If value1 >= eval_limit in this aspect, you win (the turn side of this aspect).
 					flush_psv((value1 >= eval_limit) ? 1 : -1);
 					break;
 				}
 
-				// おかしな指し手の検証
+				// Verification of a strange move
 				if (pv1.size() > 0
 					&& (pv1[0] == MOVE_NONE || pv1[0] == MOVE_NULL)
 					)
 				{
-					// MOVE_WINは、この手前で宣言勝ちの局面であるかチェックしているので
-					// ここで宣言勝ちの指し手が返ってくることはないはず。
-					// また、MOVE_RESIGNのときvalue1は1手詰めのスコアであり、eval_limitの最小値(-31998)のはずなのだが…。
+					// MOVE_WIN is checking if it is the declaration victory stage before this
+					// The declarative winning move should never come back here.
+					// Also, when MOVE_RESIGN, value1 is a one-stop score, which should be the minimum value of eval_limit (-31998)...
 					cout << "Error! : " << pos.fen() << m << value1 << endl;
 					break;
 				}
 
-				// 各千日手に応じた処理。
+				// Processing according to each thousand-day hand.
 
         if (pos.is_draw(0)) {
 			if (use_draw_in_training_data_generation) {
@@ -596,7 +596,7 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
           break;
         }
 
-				// PVの指し手でleaf nodeまで進めて、そのleaf nodeでevaluate()を呼び出した値を用いる。
+				// Use PV's move to the leaf node and use the value that evaluated() is called on that leaf node.
 				auto evaluate_leaf = [&](Position& pos , vector<Move>& pv)
 				{
 					auto rootColor = pos.side_to_move();
@@ -604,14 +604,14 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 					int ply2 = ply;
 					for (auto m : pv)
 					{
-						// デバッグ用の検証として、途中に非合法手が存在しないことを確認する。
-						// NULL_MOVEはこないものとする。
+						// As a verification for debugging, make sure there are no illegal players in the middle.
+						// NULL_MOVE does not come.
 
-						// 十分にテストしたのでコメントアウトで良い。
+						// I tested it out enough so I can comment it out.
 #if 1
-						// 非合法手はやってこないはずなのだが。
-						// 宣言勝ちとmated()でないことは上でテストしているので
-						// 読み筋としてMOVE_WINとMOVE_RESIGNが来ないことは保証されている。(はずだが…)
+						// I shouldn't be an illegal player.
+						// declarative win and not mated() are tested above so
+						// It is guaranteed that MOVE_WIN and MOVE_RESIGN do not come as a reader. (Should...)
 						if (!pos.pseudo_legal(m) || !pos.legal(m))
 						{
 							cout << "Error! : " << pos.fen() << m << endl;
@@ -619,25 +619,25 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 #endif
 						pos.do_move(m, states[ply2++]);
 						
-						// 毎ノードevaluate()を呼び出さないと、evaluate()の差分計算が出来ないので注意！
-						// depthが8以上だとこの差分計算はしないほうが速いと思われる。
+						//Because the difference calculation of evaluate() cannot be performed unless each node evaluate() is called!
+						// If the depth is 8 or more, it seems faster not to calculate this difference.
 #if defined(EVAL_NNUE)
             if (depth < 8)
               Eval::evaluate_with_no_return(pos);
 #endif  // defined(EVAL_NNUE)
 					}
 
-					// leafに到達
-					//      cout << pos;
+					// reach leaf
+					// cout << pos;
 
 					auto v = Eval::evaluate(pos);
-					// evaluate()は手番側の評価値を返すので、
-					// root_colorと違う手番なら、vを反転させて返さないといけない。
+					// evaluate() returns the evaluation value on the turn side, so
+					// If it's a turn different from root_color, you must invert v and return it.
 					if (rootColor != pos.side_to_move())
 						v = -v;
 
-					// 巻き戻す。
-					// C++x14にもなって、いまだreverseで回すforeachすらないのか…。
+					// Rewind.
+					// Is it C++x14, and isn't there even foreach to turn in reverse?
 					//  for (auto it : boost::adaptors::reverse(pv))
 
 					for (auto it = pv.rbegin(); it != pv.rend(); ++it)
@@ -655,52 +655,52 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 				// gensfen depth 6 eval_limit 3000
 				// Total 53879 Hits 43713 hit rate (%) 81.132
 
-				// 置換表の指し手で枝刈りされるなどの問題。
-				// これ、教師としては少し気持ち悪いが…。
+				// Problems such as pruning with moves in the substitution table.
+				// This is a little uncomfortable as a teacher...
 #endif
 
-				// depth 0の場合、pvが得られていないのでdepth 2で探索しなおす。
+				//If depth 0, pv is not obtained, so search again at depth 2.
 				if (search_depth <= 0)
 				{
 					pv_value1 = search(pos, 2);
 					pv1 = pv_value1.second;
 				}
 
-				// 初期局面周辺はは類似局面ばかりなので
-				// 学習に用いると過学習になりかねないから書き出さない。
-				// →　比較実験すべき
+				// The surroundings of the initial stage are all similar
+				// Do not write it out because it can lead to overlearning when used for learning.
+				// → comparative experiment should be done
 				if (ply < write_minply - 1)
 				{
 					a_psv.clear();
 					goto SKIP_SAVE;
 				}
 
-				// 同一局面を書き出したところか？
-				// これ、複数のPCで並列して生成していると同じ局面が含まれることがあるので
-				// 読み込みのときにも同様の処理をしたほうが良い。
+				// Did you just write the same phase?
+				// This may include the same aspect as it is generated in parallel on multiple PCs, so
+				// It is better to do the same process when reading.
 				{
 					auto key = pos.key();
 					auto hash_index = (size_t)(key & (GENSFEN_HASH_SIZE - 1));
 					auto key2 = hash[hash_index];
 					if (key == key2)
 					{
-						// スキップするときはこれ以前に関する
-						// 勝敗の情報がおかしくなるので保存している局面をクリアする。
-						// どのみち、hashが合致した時点でそこ以前の局面も合致している可能性が高いから
-						// 書き出す価値がない。
+						// when skipping regarding earlier
+						// Clear the saved situation because the win/loss information will be incorrect.
+						// anyway, when the hash matches, it's likely that the previous phases also match
+						// Not worth writing out.
 						a_psv.clear();
 						goto SKIP_SAVE;
 					}
-					hash[hash_index] = key; // 今回のkeyに入れ替えておく。
+					hash[hash_index] = key; // Replace with the current key.
 				}
 
-				// 局面の一時保存。
+				// Temporary saving of the situation.
 				{
 					a_psv.emplace_back(PackedSfenValue());
 					auto &psv = a_psv.back();
-					
-					// packを要求されているならpackされたsfenとそのときの評価値を書き出す。
-					// 最終的な書き出しは、勝敗がついてから。
+
+					// If pack is requested, write the packed sfen and the evaluation value at that time.
+					// The final writing is after winning or losing.
 					pos.sfen_pack(psv.sfen);
 
           //{
@@ -710,12 +710,12 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
           //  assert(before_fen == after_fen);
           //}
 
-					// PV lineのleaf nodeでのroot colorから見たevaluate()の値を取得。
-					// search()の返し値をそのまま使うのとこうするのとの善悪は良くわからない。
+					// Get the value of evaluate() as seen from the root color on the leaf node of the PV line.
+					//I don't know the goodness and badness of using the return value of search() as it is.
 					psv.score = evaluate_leaf(pos, pv1);
 					psv.gamePly = ply;
 
-					// PVの初手を取り出す。これはdepth 0でない限りは存在するはず。
+					// Take out the first PV hand. This should be present unless depth 0.
 					assert(pv_value1.second.size() >= 1);
 					Move pv_move1 = pv_value1.second[0];
 					psv.move = pv_move1;
@@ -723,44 +723,44 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 
 			SKIP_SAVE:;
 
-				// 何故かPVが得られなかった(置換表などにhitして詰んでいた？)ので次の対局に行く。
-				// かなりのレアケースなので無視して良いと思う。
+				// For some reason, I could not get PV (hit the substitution table etc. and got stuck?) so go to the next game.
+				// It's a rare case, so you can ignore it.
 				if (pv1.size() == 0)
 					break;
-				
-				// search_depth手読みの指し手で局面を進める。
+
+				// search_depth Advance the phase by hand reading.
 				m = pv1[0];
 			}
 
 		RANDOM_MOVE:;
 
-			// 合法手のなかからランダムに1手選ぶフェーズ
+			// Phase to randomly choose one from legal hands
 			if (
-				// 1. random_move_minplyからrandom_move_maxplyの間でrandom_move_count回のランダムムーブを行なうモード
-				(random_move_minply != -1 && ply < (int)random_move_flag.size() && random_move_flag[ply]) ||
-				// 2. 定跡を抜けたあとにまとめてrandom_move_count回のランダムムーブを行なうモード
-				(random_move_minply == -1 && random_move_c < random_move_count))
+				// 1. Random move of random_move_count times from random_move_minply to random_move_maxply
+				(random_move_minply != -1 && ply <(int)random_move_flag.size() && random_move_flag[ply]) ||
+				// 2. A mode to perform random move of random_move_count times after leaving the track
+				(random_move_minply == -1 && random_move_c <random_move_count))
 			{
 				++random_move_c;
 
-				// mateではないので合法手が1手はあるはず…。
+				// It's not a mate, so there should be one legal hand...
 				if (random_multi_pv == 0)
 				{
-					// 普通のランダムムーブ
+					// normal random move
 
 					MoveList<LEGAL> list(pos);
 
-					// ここをApery方式にするのとの善悪はよくわからない。
+					// I don't really know the goodness and badness of making this the Apery method.
 					if (random_move_like_apery == 0
 						|| prng.rand(random_move_like_apery) != 0
 					)
 					{
-						// 普通に合法手から1手選択
+						// Normally one move from legal move
 						m = list.at((size_t)prng.rand((uint64_t)list.size()));
 					}
 					else {
-						// 玉が動かせるなら玉を動かす
-						Move moves[8]; // 8近傍
+						// if you can move the ball, move the ball
+						Move moves[8]; // Near 8
 						Move* p = &moves[0];
 						for (auto& m : list)
 							if (type_of(pos.moved_piece(m)) == KING)
@@ -768,36 +768,36 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 						size_t n = p - &moves[0];
 						if (n != 0)
 						{
-							// 玉を動かす指し手
+							// move to move the ball
 							m = moves[prng.rand(n)];
 
-							// Apery方式ではこのとき1/2の確率で相手もランダムムーブ
+							// In Apery method, at this time there is a 1/2 chance that the opponent will also move randomly
 							if (prng.rand(2) == 0)
 							{
-								// random_move_flag[ply]の次のところに"1"を追加するのがシンプルなhackか。
+								// Is it a simple hack to add a "1" next to random_move_flag[ply]?
 								random_move_flag.insert(random_move_flag.begin() + ply + 1, 1, true);
 							}
 						}
 						else
-							// 普通に合法手から1手選択
+							// Normally one move from legal move
 							m = list.at((size_t)prng.rand((uint64_t)list.size()));
 					}
 
-					// 玉の2手指しのコードを入れていたが、合法手から1手選べばそれに相当するはずで
-					// コードが複雑化するだけだから不要だと判断した。
+					// I put in the code of two handed balls, but if you choose one from legal hands, it should be equivalent to that
+					// I decided it's unnecessary because it just makes the code more complicated.
 				}
 				else {
-					// ロジックが複雑になるので、すまんがここで再度MultiPVで探索する。
+					// Since the logic becomes complicated, I'm sorry, I will search again with MultiPV here.
 					Learner::search(pos, random_multi_pv_depth, random_multi_pv);
-					// rootMovesの上位N手のなかから一つ選択
+					// Select one from the top N hands of root Moves
 
 					auto& rm = pos.this_thread()->rootMoves;
 
 					uint64_t s = min((uint64_t)rm.size(), (uint64_t)random_multi_pv);
 					for (uint64_t i = 1; i < s; ++i)
 					{
-						// rm[0]の評価値との差がrandom_multi_pv_diffの範囲でなければならない。
-						// rm[x].scoreは、降順に並んでいると仮定できる。 
+						// The difference from the evaluation value of rm[0] must be within the range of random_multi_pv_diff.
+						// It can be assumed that rm[x].score is arranged in descending order.
 						if (rm[0].score > rm[i].score + random_multi_pv_diff)
 						{
 							s = i;
@@ -807,81 +807,81 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 
 					m = rm[prng.rand(s)].pv[0];
 
-					// まだ1局面も書き出していないのに終局してたので書き出し処理は端折って次の対局に。
+					// I haven't written one phase yet, but it ended, so the writing process ends and the next game starts.
 					if (!is_ok(m))
 						break;
 				}
 
-				// ゲームの勝敗から指し手を評価しようとするとき、
-				// 今回のrandom moveがあるので、ここ以前には及ばないようにする。
-				a_psv.clear(); // 保存していた局面のクリア
+				// When trying to evaluate the move from the outcome of the game,
+				// There is a random move this time, so try not to fall below this.
+				a_psv.clear(); // clear saved aspect
 			}
 
 		DO_MOVE:;
 			pos.do_move(m, states[ply]);
 
-			// 差分計算を行なうために毎node evaluate()を呼び出しておく。
+			// Call node evaluate() for each difference calculation.
 			Eval::evaluate_with_no_return(pos);
 
 		} // for (int ply = 0; ; ++ply)
-	
+
 	} // while(!quit)
-	
+
 	sw.finalize(thread_id);
 }
 
 // -----------------------------------
-//    棋譜を生成するコマンド(master thread)
+// Command to generate a game record (master thread)
 // -----------------------------------
 
-// 棋譜を生成するコマンド
+// Command to generate a game record
 void gen_sfen(Position&, istringstream& is)
 {
-	// スレッド数(これは、USIのsetoptionで与えられる)
+	// number of threads (given by USI setoption)
 	uint32_t thread_num = (uint32_t)Options["Threads"];
 
-	// 生成棋譜の個数 default = 80億局面(Ponanza仕様)
+	// Number of generated game records default = 8 billion phases (Ponanza specification)
 	uint64_t loop_max = 8000000000UL;
 
-	// 評価値がこの値になったら生成を打ち切る。
+	// Stop the generation when the evaluation value reaches this value.
 	int eval_limit = 3000;
 
-	// 探索深さ
+	// search depth
 	int search_depth = 3;
 	int search_depth2 = INT_MIN;
 
-	// ランダムムーブを行なう最小plyと最大plyと回数
+	// minimum ply, maximum ply and number of random moves
 	int random_move_minply = 1;
 	int random_move_maxply = 24;
 	int random_move_count = 5;
-	// ランダムムーブをAperyのように玉を主に動かす機能
-	// これを例えば3にすると1/3の確率で玉を動かす。
+	// A function to move the random move mainly like Apery
+	// If this is set to 3, the ball will move with a probability of 1/3.
 	int random_move_like_apery = 0;
-	// ランダムムーブの代わりにmultipvで探索してそのなかからランダムに選ぶときはrandom_multi_pv = 1以上の数にする。
+	// If you search with multipv instead of random move and choose from among them randomly, set random_multi_pv = 1 or more.
 	int random_multi_pv = 0;
 	int random_multi_pv_diff = 32000;
 	int random_multi_pv_depth = INT_MIN;
 
-	// 書き出す局面のply(初期局面からの手数)の最小、最大。
+	// The minimum and maximum ply (number of steps from the initial phase) of the phase to write out.
 	int write_minply = 16;
 	int write_maxply = 400;
 
-	// 書き出すファイル名
+	// File name to write
 	string output_file_name = "generated_kifu.bin";
 
 	string token;
 
-	// eval hashにhitすると初期局面付近の評価値として、hash衝突して大きな値を書き込まれてしまうと
-	// eval_limitが小さく設定されているときに初期局面で毎回eval_limitを超えてしまい局面の生成が進まなくなる。
-	// そのため、eval hashは無効化する必要がある。
-	// あとeval hashのhash衝突したときに、変な値の評価値が使われ、それを教師に使うのが気分が悪いというのもある。
+	// When hit to eval hash, as a evaluation value near the initial stage, if a hash collision occurs and a large value is written
+	// When eval_limit is set small, eval_limit will be exceeded every time in the initial phase, and phase generation will not proceed.
+	// Therefore, eval hash needs to be disabled.
+	// After that, when the hash of the eval hash collides, the evaluation value of a strange value is used, and it may be unpleasant to use it for the teacher.
 	bool use_eval_hash = false;
 
-	// この単位でファイルに保存する。
-	// ファイル名は file_1.bin , file_2.binのように連番がつく。
+	// Save to file in this unit.
+	// File names are serialized like file_1.bin, file_2.bin.
 	uint64_t save_every = UINT64_MAX;
 
-	// ファイル名の末尾にランダムな数値を付与する。
+	// Add a random number to the end of the file name.
 	bool random_file_name = false;
 
 	while (true)
@@ -902,7 +902,7 @@ void gen_sfen(Position&, istringstream& is)
 		else if (token == "eval_limit")
 		{
 			is >> eval_limit;
-			// 最大値を1手詰みのスコアに制限する。(そうしないとループを終了しない可能性があるので)
+			// Limit the maximum to a one-stop score. (Otherwise you might not end the loop)
 			eval_limit = std::min(eval_limit, (int)mate_in(2));
 		}
 		else if (token == "random_move_minply")
@@ -934,12 +934,12 @@ void gen_sfen(Position&, istringstream& is)
 	}
 
 #if defined(USE_GLOBAL_OPTIONS)
-	// あとで復元するために保存しておく。
+	// Save it for later restore.
 	auto oldGlobalOptions = GlobalOptions;
 	GlobalOptions.use_eval_hash = use_eval_hash;
 #endif
 
-	// search depth2が設定されていないなら、search depthと同じにしておく。
+	// If search depth2 is not set, leave it the same as search depth.
 	if (search_depth2 == INT_MIN)
 		search_depth2 = search_depth;
 	if (random_multi_pv_depth == INT_MIN)
@@ -947,10 +947,10 @@ void gen_sfen(Position&, istringstream& is)
 
 	if (random_file_name)
 	{
-		// output_file_nameにこの時点でランダムな数値を付与してしまう。
+		// Give a random number to output_file_name at this point.
     std::random_device seed_gen;
     PRNG r(seed_gen());
-		// 念のため乱数振り直しておく。
+		// Just in case, reassign the random numbers.
 		for(int i=0;i<10;++i)
 			r.rand(1);
 		auto to_hex = [](uint64_t u){
@@ -958,7 +958,7 @@ void gen_sfen(Position&, istringstream& is)
 			ss << std::hex << u;
 			return ss.str();
 		};
-		// 64bitの数値で偶然かぶると嫌なので念のため64bitの数値２つくっつけておく。
+		// I don't want to wear 64bit numbers by accident, so I'm going to make a 64bit number 2 just in case.
 		output_file_name = output_file_name + "_" + to_hex(r.rand<uint64_t>()) + to_hex(r.rand<uint64_t>());
 	}
 
@@ -982,7 +982,7 @@ void gen_sfen(Position&, istringstream& is)
 		<< "  save_every             = " << save_every << endl
 		<< "  random_file_name       = " << random_file_name << endl;
 
-	// Options["Threads"]の数だけスレッドを作って実行。
+	// Create and execute threads as many as Options["Threads"].
 	{
 		SfenWriter sw(output_file_name, thread_num);
 		sw.save_every = save_every;
@@ -1002,30 +1002,30 @@ void gen_sfen(Position&, istringstream& is)
 		multi_think.start_file_write_worker();
 		multi_think.go_think();
 
-		// SfenWriterのデストラクタでjoinするので、joinが終わってから終了したというメッセージを
-		// 表示させるべきなのでここをブロックで囲む。
+		// Since we are joining with the destructor of SfenWriter, please give a message that it has finished after the join
+		// Enclose this in a block because it should be displayed.
 	}
 
 	std::cout << "gensfen finished." << endl;
 
 #if defined(USE_GLOBAL_OPTIONS)
-	// GlobalOptionsの復元。
+	// Restore Global Options.
 	GlobalOptions = oldGlobalOptions;
 #endif
 
 }
 
 // -----------------------------------
-// 生成した棋譜から学習させるコマンド(learn)
+// command to learn from the generated game (learn)
 // -----------------------------------
 
-// 普通のシグモイド関数
+// ordinary sigmoid function
 double sigmoid(double x)
 {
 	return 1.0 / (1.0 + std::exp(-x));
 }
 
-// 評価値を勝率[0,1]に変換する関数
+// A function that converts the evaluation value to the winning rate [0,1]
 double winning_percentage(double value)
 {
 	// In Maxima,
@@ -1047,43 +1047,41 @@ double delta_winning_percentage(double value)
 		0.5756462732485115 * pow(PawnValue, -1) * pow(10.0, -0.25 * pow(PawnValue, -1) * value) *
 		pow(pow(10.0, -0.25 * pow(PawnValue, -1) * value) + 1.0, -2);
 }
-
-// 普通のシグモイド関数の導関数。
 double dsigmoid(double x)
 {
-	// シグモイド関数
-	//    f(x) = 1/(1+exp(-x))
-	// に対して1階微分は、
-	//    f'(x) = df/dx = f(x)・{ 1 - f(x) }
-	// となる。
+	// Sigmoid function
+	// f(x) = 1/(1+exp(-x))
+	// the first derivative is
+	// f'(x) = df/dx = f(x)・{ 1-f(x)}
+	// becomes
 
 	return sigmoid(x) * (1.0 - sigmoid(x));
 }
 
-// 目的関数が勝率の差の二乗和のとき
+// When the objective function is the sum of squares of the difference in winning percentage
 #if defined (LOSS_FUNCTION_IS_WINNING_PERCENTAGE)
-// 勾配を計算する関数
+// function to calculate the gradient
 double calc_grad(Value deep, Value shallow, PackedSfenValue& psv)
 {
-	// 勝率の差の2乗が目的関数それを最小化する。
-	// 目的関数 J = 1/2m Σ ( win_rate(shallow) - win_rate(deep) ) ^2
-	// ただし、σはシグモイド関数で、評価値を勝率の差に変換するもの。
-	// mはサンプルの件数。shallowは浅い探索(qsearch())のときの評価値。deepは深い探索のときの評価値。
-	// また、Wを特徴ベクトル(評価関数のパラメーター)、Xi,Yiを教師とすると
-	// shallow = W*Xi   // *はアダマール積で、Wの転置・X の意味
+	// The square of the win rate difference minimizes it in the objective function.
+	// Objective function J = 1/2m Σ (win_rate(shallow)-win_rate(deep) )^2
+	// However, σ is a sigmoid function that converts the evaluation value into the difference in the winning percentage.
+	// m is the number of samples. shallow is the evaluation value for a shallow search (qsearch()). deep is the evaluation value for deep search.
+	// If W is the feature vector (parameter of the evaluation function) and Xi and Yi are teachers
+	// shallow = W*Xi // * is the Hadamard product, transposing W and meaning X
 	// f(Xi) = win_rate(W*Xi)
-	// σ(i番目のdeep) = Yi とおくと、
-	// J = m/2 Σ ( f(Xi) - Yi )^2
-	// とよくある式になる。
-	// Wはベクトルで、j番目の要素をWjと書くとすると、連鎖律から
-	// ∂J/∂Wj =            ∂J/∂f     ・  ∂f/∂W   ・ ∂W/∂Wj
-	//          =  1/m Σ ( f(Xi) - y )  ・  f'(Xi)    ・    1
+	// If σ(i th deep) = Yi,
+	// J = m/2 Σ (f(Xi)-Yi )^2
+	// becomes a common expression.
+	// W is a vector, and if we write the jth element as Wj, from the chain rule
+	// ∂J/∂Wj = ∂J/∂f ・∂f/∂W ・∂W/∂Wj
+	// = 1/m Σ (f(Xi)-y) ・f'(Xi) ・ 1
 
-	// 1/mはあとで掛けるとして、勾配の値としてはΣの中身を配列に保持しておけば良い。
+	// 1/m will be multiplied later, but the contents of Σ can be retained in the array as the value of the gradient.
 	// f'(Xi) = win_rate'(shallow) = sigmoid'(shallow/600) = dsigmoid(shallow / 600) / 600
-	// この末尾の /600 は学習率で調整するから書かなくていいか..
-	// また1/mという係数も、Adam , AdaGradのような勾配の自動調整機能を持つ更新式を用いるなら不要。
-	// ゆえにメモリ上に保存しておく必要はない。
+	// This /600 at the end is adjusted by the learning rate, so do not write it..
+	// Also, the coefficient of 1/m is unnecessary if you use the update formula that has the automatic gradient adjustment function like Adam and AdaGrad.
+	// Therefore, it is not necessary to save it in memory.
 
 	double p = winning_percentage(deep);
 	double q = winning_percentage(shallow);
@@ -1094,22 +1092,22 @@ double calc_grad(Value deep, Value shallow, PackedSfenValue& psv)
 #if defined (LOSS_FUNCTION_IS_CROSS_ENTOROPY)
 double calc_grad(Value deep, Value shallow, const PackedSfenValue& psv)
 {
-	// 交差エントロピーを用いた目的関数
+	// Objective function with cross entropy
 
-	// 交差エントロピーの概念と性質については、
+	// For the concept and nature of cross entropy,
 	// http://nnadl-ja.github.io/nnadl_site_ja/chap3.html#the_cross-entropy_cost_function
 	// http://postd.cc/visual-information-theory-3/
-	// などを参考に。
+	// Refer to etc.
 
-	// 目的関数の設計)
-	// pの分布をqの分布に近づけたい → pとqの確率分布間の交差エントロピーの最小化問題と考える。
-	// J = H(p,q) = - Σ p(x) log(q(x)) = -p log q - (1-p) log(1-q)
-	//                 x
+	// Objective function design)
+	// We want to make the distribution of p closer to the distribution of q → Think of it as the problem of minimizing the cross entropy between the probability distributions of p and q.
+	// J = H(p,q) =-Σ p(x) log(q(x)) = -p log q-(1-p) log(1-q)
+	// x
 
-	// pは定数、qはWiの関数(q = σ(W・Xi) )としてWiに対する偏微分を求める。
-	// ∂J/∂Wi = -p・q'/q - (1-p)(1-q)'/(1-q)
-	//          = ...
-	//          = q - p.
+	// p is a constant and q is a Wi function (q = σ(W・Xi) ).
+	// ∂J/∂Wi = -p・q'/q-(1-p)(1-q)'/(1-q)
+	// = ...
+	// = q-p.
 
 	double p = winning_percentage(deep);
 	double q = winning_percentage(shallow);
@@ -1121,41 +1119,41 @@ double calc_grad(Value deep, Value shallow, const PackedSfenValue& psv)
 #if defined ( LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE )
 double calc_grad(Value deep, Value shallow, const PackedSfenValue& psv)
 {
-	// 勝率の関数を通さない版
-	// これ、EVAL_LIMITを低くしておかないと、終盤の形に対して評価値を一致させようとして
-	// evalがevalの範囲を超えかねない。
+	// Version that does not pass the winning percentage function
+	// This, unless EVAL_LIMIT is set low, trying to match the evaluation value with the shape of the end stage
+	// eval may exceed the range of eval.
 	return shallow - deep;
 }
 #endif
 
 #if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
 
-// elmo(WCSC27)で使われている定数。要調整。
-// elmoのほうは式を内分していないので値が違う。
-// learnコマンドでこの値を設定できる。
-// 0.33は、elmo(WCSC27)で使われていた定数(0.5)相当
+// A constant used in elmo (WCSC27). Adjustment required.
+// Since elmo does not internally divide the expression, the value is different.
+// You can set this value with the learn command.
+// 0.33 is equivalent to the constant (0.5) used in elmo (WCSC27)
 double ELMO_LAMBDA = 0.33;
 double ELMO_LAMBDA2 = 0.33;
 double ELMO_LAMBDA_LIMIT = 32000;
 
 double calc_grad(Value deep, Value shallow , const PackedSfenValue& psv)
 {
-	// elmo(WCSC27)方式
-	// 実際のゲームの勝敗で補正する。
+	// elmo (WCSC27) method
+	// Correct with the actual game wins and losses.
 
 	const double q = winning_percentage(shallow);
 	const double p = winning_percentage(deep);
 	const double dq = delta_winning_percentage(shallow);
 
-	// 期待勝率を勝っていれば1、負けていれば 0、引き分けなら0.5として補正項として用いる。
-	// game_result = 1,0,-1なので1足して2で割る。
+	// Use 1 as the correction term if the expected win rate is 1, 0 if you lose, and 0.5 if you draw.
+	// game_result = 1,0,-1 so add 1 and divide by 2.
 	const double t = double(psv.game_result + 1) / 2;
 
-	// 深い探索での評価値がELMO_LAMBDA_LIMITを超えているならELMO_LAMBDAではなくELMO_LAMBDA2を適用する。
+	// If the evaluation value in deep search exceeds ELMO_LAMBDA_LIMIT, apply ELMO_LAMBDA2 instead of ELMO_LAMBDA.
 	const double lambda = (abs(deep) >= ELMO_LAMBDA_LIMIT) ? ELMO_LAMBDA2 : ELMO_LAMBDA;
 
-	// 実際の勝率を補正項として使っている。
-	// これがelmo(WCSC27)のアイデアで、現代のオーパーツ。
+	// Use the actual win rate as a correction term.
+	// This is the idea of ​​elmo (WCSC27), modern O-parts.
 	const double pp = (q - p) * dq / q / (1.0 - q);
 	const double tt = (q - t) * dq / q / (1.0 - q);
 	const double grad = lambda * pp + (1.0 - lambda) * tt;
@@ -1163,8 +1161,8 @@ double calc_grad(Value deep, Value shallow , const PackedSfenValue& psv)
 	return grad;
 }
 
-// 学習時の交差エントロピーの計算
-// elmo式の勝敗項と勝率項との個別の交差エントロピーが引数であるcross_entropy_evalとcross_entropy_winに返る。
+// Calculate cross entropy during learning
+// The individual cross entropy of the win/loss term and win rate term of the elmo expression is returned to the arguments cross_entropy_eval and cross_entropy_win.
 void calc_cross_entropy(Value deep, Value shallow, const PackedSfenValue& psv,
 	double& cross_entropy_eval, double& cross_entropy_win, double& cross_entropy,
 	double& entropy_eval, double& entropy_win, double& entropy)
@@ -1175,7 +1173,7 @@ void calc_cross_entropy(Value deep, Value shallow, const PackedSfenValue& psv,
 
 	constexpr double epsilon = 0.000001;
 
-	// 深い探索での評価値がELMO_LAMBDA_LIMITを超えているならELMO_LAMBDAではなくELMO_LAMBDA2を適用する。
+	// If the evaluation value in deep search exceeds ELMO_LAMBDA_LIMIT, apply ELMO_LAMBDA2 instead of ELMO_LAMBDA.
 	const double lambda = (abs(deep) >= ELMO_LAMBDA_LIMIT) ? ELMO_LAMBDA2 : ELMO_LAMBDA;
 
 	const double m = (1.0 - lambda) * t + lambda * p;
@@ -1198,13 +1196,14 @@ void calc_cross_entropy(Value deep, Value shallow, const PackedSfenValue& psv,
 #endif
 
 
-// 目的関数として他のバリエーションも色々用意するかも..
+// Other variations may be prepared as the objective function..
+
 
 double calc_grad(Value shallow, const PackedSfenValue& psv) {
 	return calc_grad((Value)psv.score, shallow, psv);
 }
 
-// Sfenの読み込み機
+// Sfen reader
 struct SfenReader
 {
 	SfenReader(int thread_num) : prng((std::random_device())())
@@ -1233,12 +1232,12 @@ struct SfenReader
 			delete p;
 	}
 
-	// mseなどの計算用に用いる局面数
-	// mini-batch size = 1Mが標準的なので、その0.2%程度なら時間的には無視できるはず。
-	// 指し手一致率の計算でdepth = 1でsearch()をするので、単純比較はできないが…。
+	// number of phases used for calculation such as mse
+	// mini-batch size = 1M is standard, so 0.2% of that should be negligible in terms of time.
+	//Since search() is performed with depth = 1 in calculation of move match rate, simple comparison is not possible...
 	const uint64_t sfen_for_mse_size = 2000;
 
-	// mseなどの計算用に局面を読み込んでおく。
+	// Load the phase for calculation such as mse.
 	void read_for_mse()
 	{
 		auto th = Threads.main();
@@ -1253,7 +1252,7 @@ struct SfenReader
 			}
 			sfen_for_mse.push_back(ps);
 
-			// hash keyを求める。
+			// Get the hash key.
 			StateInfo si;
 			pos.set_from_packed_sfen(ps.sfen,&si,th);
 			sfen_for_mse_hash.insert(pos.key());
@@ -1280,35 +1279,36 @@ struct SfenReader
 		}
 	}
 
-	// 各スレッドがバッファリングしている局面数 0.1M局面。40HTで4M局面
+	// Number of phases buffered by each thread 0.1M phases. 4M phase at 40HT
 	const size_t THREAD_BUFFER_SIZE = 10 * 1000;
 
-	// ファイル読み込み用のバッファ(これ大きくしたほうが局面がshuffleが大きくなるので局面がバラけていいと思うが
-	// あまり大きいとメモリ消費量も上がる。
-	// SFEN_READ_SIZEはTHREAD_BUFFER_SIZEの倍数であるものとする。
+	// Buffer for reading files (If this is made larger, the shuffle becomes larger and the phases may vary.
+	// If it is too large, the memory consumption will increase.
+	// SFEN_READ_SIZE is a multiple of THREAD_BUFFER_SIZE.
 	const size_t SFEN_READ_SIZE = LEARN_SFEN_READ_SIZE;
 
-	// [ASYNC] スレッドが局面を一つ返す。なければfalseが返る。
+	// [ASYNC] Thread returns one aspect. Otherwise returns false.
 	bool read_to_thread_buffer(size_t thread_id, PackedSfenValue& ps)
 	{
-		// スレッドバッファに局面が残っているなら、それを1つ取り出して返す。
+		// If there are any positions left in the thread buffer, retrieve one and return it.
 		auto& thread_ps = packed_sfens[thread_id];
 
-		// バッファに残りがなかったらread bufferから充填するが、それすらなかったらもう終了。
-		if ((thread_ps == nullptr || thread_ps->size() == 0) // バッファが空なら充填する。
+		// Fill the read buffer if there is no remaining buffer, but if it doesn't even exist, finish.
+		if ((thread_ps == nullptr || thread_ps->size() == 0) // If the buffer is empty, fill it.
 			&& !read_to_thread_buffer_impl(thread_id))
 			return false;
 
-		// read_to_thread_buffer_impl()がtrueを返したというこは、
-		// スレッドバッファへの局面の充填が無事完了したということなので
-		// thread_ps->rbegin()は健在。
+		// read_to_thread_buffer_impl() returned true,
+		// Since the filling of the thread buffer with the phase has been completed successfully
+		// thread_ps->rbegin() is alive.
 
 		ps = *(thread_ps->rbegin());
 		thread_ps->pop_back();
-		
-		// バッファを使いきったのであれば自らdeleteを呼び出してこのバッファを開放する。
+
+		// If you've run out of buffers, call delete yourself to free this buffer.
 		if (thread_ps->size() == 0)
 		{
+
 			delete thread_ps;
 			thread_ps = nullptr;
 		}
@@ -1316,17 +1316,17 @@ struct SfenReader
 		return true;
 	}
 
-	// [ASYNC] スレッドバッファに局面をある程度読み込む。
+	// [ASYNC] Read some aspects into thread buffer.
 	bool read_to_thread_buffer_impl(size_t thread_id)
 	{
 		while (true)
 		{
 			{
 				std::unique_lock<std::mutex> lk(mutex);
-				// ファイルバッファから充填できたなら、それで良し。
+				// If you can fill from the file buffer, that's fine.
 				if (packed_sfens_pool.size() != 0)
 				{
-					// 充填可能なようなので充填して終了。
+					// It seems that filling is possible, so fill and finish.
 
 					packed_sfens[thread_id] = packed_sfens_pool.front();
 					packed_sfens_pool.pop_front();
@@ -1337,24 +1337,24 @@ struct SfenReader
 				}
 			}
 
-			// もうすでに読み込むファイルは無くなっている。もうダメぽ。
+			// The file to read is already gone. No more use.
 			if (end_of_files)
 				return false;
 
-			// file workerがpacked_sfens_poolに充填してくれるのを待っている。
-			// mutexはlockしていないのでいずれ充填してくれるはずだ。
+			// Waiting for file worker to fill packed_sfens_pool.
+			// The mutex isn't locked, so it should fill up soon.
 			sleep(1);
 		}
 
 	}
-	
-	// 局面ファイルをバックグラウンドで読み込むスレッドを起動する。
+
+	// Start a thread that loads the phase file in the background.
 	void start_file_read_worker()
 	{
 		file_worker_thread = std::thread([&] { this->file_read_worker(); });
 	}
 
-	// ファイルの読み込み専用スレッド用
+	// for file read-only threads
 	void file_read_worker()
 	{
 		auto open_next_file = [&]()
@@ -1362,11 +1362,11 @@ struct SfenReader
 			if (fs.is_open())
 				fs.close();
 
-			// もう無い
+			// no more
 			if (filenames.size() == 0)
 				return false;
 
-			// 次のファイル名ひとつ取得。
+			// Get the next file name.
 			string filename = *filenames.rbegin();
 			filenames.pop_back();
 
@@ -1379,8 +1379,8 @@ struct SfenReader
 
 		while (true)
 		{
-			// バッファが減ってくるのを待つ。
-			// このsize()の読み取りはread onlyなのでlockしなくていいだろう。
+			// Wait for the buffer to run out.
+			// This size() is read only, so you don't need to lock it.
 			while (!stop_flag && packed_sfens_pool.size() >= SFEN_READ_SIZE / THREAD_BUFFER_SIZE)
 				sleep(100);
 			if (stop_flag)
@@ -1389,7 +1389,7 @@ struct SfenReader
 			PSVector sfens;
 			sfens.reserve(SFEN_READ_SIZE);
 
-			// ファイルバッファにファイルから読み込む。
+			// Read from the file into the file buffer.
 			while (sfens.size() < SFEN_READ_SIZE)
 			{
 				PackedSfenValue p;
@@ -1398,10 +1398,10 @@ struct SfenReader
 					sfens.push_back(p);
 				} else
 				{
-					// 読み込み失敗
+					// read failure
 					if (!open_next_file())
 					{
-						// 次のファイルもなかった。あぼーん。
+						// There was no next file. Abon.
 						cout << "..end of files." << endl;
 						end_of_files = true;
 						return;
@@ -1409,7 +1409,7 @@ struct SfenReader
 				}
 			}
 
-			// この読み込んだ局面データをshuffleする。
+			// Shuffle the read phase data.
 			// random shuffle by Fisher-Yates algorithm
 
 			if (!no_shuffle)
@@ -1419,8 +1419,8 @@ struct SfenReader
 					swap(sfens[i], sfens[(size_t)(prng.rand((uint64_t)size - i) + i)]);
 			}
 
-			// これをTHREAD_BUFFER_SIZEごとの細切れにする。それがsize個あるはず。
-			// SFEN_READ_SIZEはTHREAD_BUFFER_SIZEの倍数であるものとする。
+			// Divide this by THREAD_BUFFER_SIZE. There should be size pieces.
+			// SFEN_READ_SIZE shall be a multiple of THREAD_BUFFER_SIZE.
 			assert((SFEN_READ_SIZE % THREAD_BUFFER_SIZE)==0);
 
 			auto size = size_t(SFEN_READ_SIZE / THREAD_BUFFER_SIZE);
@@ -1429,7 +1429,7 @@ struct SfenReader
 
 			for (size_t i = 0; i < size; ++i)
 			{
-				// このポインターのdeleteは、受け側で行なう。
+				// Delete this pointer on the receiving side.
 				PSVector* ptr = new PSVector();
 				ptr->resize(THREAD_BUFFER_SIZE);
 				memcpy(&((*ptr)[0]), &sfens[i * THREAD_BUFFER_SIZE], sizeof(PackedSfenValue) * THREAD_BUFFER_SIZE);
@@ -1437,12 +1437,12 @@ struct SfenReader
 				ptrs.push_back(ptr);
 			}
 
-			// sfensの用意が出来たので、折を見てコピー
+			// Since sfens is ready, look at the occasion and copy
 			{
 				std::unique_lock<std::mutex> lk(mutex);
 
-				// ポインタをコピーするだけなのでこの時間は無視できるはず…。
-				// packed_sfens_poolの内容を変更するのでmutexのlockが必要。
+				// You can ignore this time because you just copy the pointer...
+				// The mutex lock is required because the contents of packed_sfens_pool are changed.
 
 				for (size_t i = 0; i < size; ++i)
 					packed_sfens_pool.push_back(ptrs[i]);
@@ -1450,76 +1450,76 @@ struct SfenReader
 		}
 	}
 
-	// sfenファイル群
+	// sfen files
 	vector<string> filenames;
 
-	// 読み込んだ局面数(ファイルからメモリ上のバッファへ)
+	// number of phases read (file to memory buffer)
 	atomic<uint64_t> total_read;
 
-	// 処理した局面数
+	// number of processed phases
 	atomic<uint64_t> total_done;
 
-	// 前回までに処理した件数
+	// number of cases processed so far
 	uint64_t last_done;
 
-	// total_readがこの値を超えたらupdate_weights()してmseの計算をする。
+	// If total_read exceeds this value, update_weights() and calculate mse.
 	uint64_t next_update_weights;
 
 	uint64_t save_count;
 
-	// 局面読み込み時のシャッフルを行わない。
+	// Do not shuffle when reading the phase.
 	bool no_shuffle;
 
 	bool stop_flag;
 
-	// rmseの計算用の局面であるかどうかを判定する。
-	// (rmseの計算用の局面は学習のために使うべきではない。)
+	// Determine if it is a phase for calculating rmse.
+	// (The computational aspects of rmse should not be used for learning.)
 	bool is_for_rmse(Key key) const
 	{
-		return sfen_for_mse_hash.count(key) != 0;
+			return sfen_for_mse_hash.count(key) != 0;
 	}
 
-	// 同一局面の読み出しを制限するためのhash
-	// 6400万局面って多すぎるか？そうでもないか..
-	// hash_indexを求めるためのmaskに使うので、2**Nでなければならない。
+	// hash to limit the reading of the same situation
+	// Is there too many 64 million phases? Or Not really..
+	// It must be 2**N because it will be used as the mask to calculate hash_index.
 	static const uint64_t READ_SFEN_HASH_SIZE = 64 * 1024 * 1024;
 	vector<Key> hash; // 64MB*8 = 512MB
 
-	// mse計算用のtest局面
+	// test phase for mse calculation
 	PSVector sfen_for_mse;
 
 protected:
 
-	// fileをバックグラウンドで読み込みしているworker thread
+	// worker thread reading file in background
 	std::thread file_worker_thread;
 
-	// 局面の読み込み時にshuffleするための乱数
+	// Random number to shuffle when reading the phase
 	PRNG prng;
 
-	// ファイル群を読み込んでいき、最後まで到達したか。
+	// Did you read the files and reached the end?
 	atomic<bool> end_of_files;
 
 
-	// sfenファイルのハンドル
+	// handle of sfen file
 	std::fstream fs;
 
-	// 各スレッド用のsfen
-	// (使いきったときにスレッドが自らdeleteを呼び出して開放すべし。)
+	// sfen for each thread
+	// (When the thread is used up, the thread should call delete to release it.)
 	std::vector<PSVector*> packed_sfens;
 
-	// packed_sfens_poolにアクセスするときのmutex
+	// Mutex when accessing packed_sfens_pool
 	std::mutex mutex;
 
-	// sfenのpool。fileから読み込むworker threadはここに補充する。
-	// 各worker threadはここから自分のpacked_sfens[thread_id]に充填する。
-	// ※　mutexをlockしてアクセスすること。
+	// pool of sfen. The worker thread read from the file is added here.
+	// Each worker thread fills its own packed_sfens[thread_id] from here.
+	// * Lock and access the mutex.
 	std::list<PSVector*> packed_sfens_pool;
 
-	// mse計算用の局面を学習に用いないためにhash keyを保持しておく。
+	// Hold the hash key so that the mse calculation phase is not used for learning.
 	std::unordered_set<Key> sfen_for_mse_hash;
 };
 
-// 複数スレッドでsfenを生成するためのクラス
+// Class to generate sfen with multiple threads
 struct LearnerThink: public MultiThink
 {
 	LearnerThink(SfenReader& sr_):sr(sr_),stop_flag(false), save_only_once(false)
@@ -1544,43 +1544,43 @@ struct LearnerThink: public MultiThink
 
 	virtual void thread_worker(size_t thread_id);
 
-	// 局面ファイルをバックグラウンドで読み込むスレッドを起動する。
+	// Start a thread that loads the phase file in the background.
 	void start_file_read_worker() { sr.start_file_read_worker(); }
 
-	// 評価関数パラメーターをファイルに保存
+	// save merit function parameters to a file
 	bool save(bool is_final=false);
 
-	// sfenの読み出し器
+	// sfen reader
 	SfenReader& sr;
 
-	// 学習の反復回数のカウンター
+	// Learning iteration counter
 	uint64_t epoch = 0;
 
-	// ミニバッチサイズのサイズ。必ずこのclassを使う側で設定すること。
+	// Mini batch size size. Be sure to set it on the side that uses this class.
 	uint64_t mini_batch_size = 1000*1000;
 
 	bool stop_flag;
 
-	// 割引率
+	// Discount rate
 	double discount_rate;
 
-	// 序盤を学習対象から外すオプション
+	// Option to exclude early stage from learning
 	int reduction_gameply;
 
-	// kk/kkp/kpp/kpppを学習させないオプション
+	// Option not to learn kk/kkp/kpp/kppp
 	std::array<bool,4> freeze;
 
-	// 教師局面の深い探索の評価値の絶対値がこの値を超えていたらその教師局面を捨てる。
+	// If the absolute value of the evaluation value of the deep search of the teacher phase exceeds this value, discard the teacher phase.
 	int eval_limit;
 
-	// 評価関数の保存するときに都度フォルダを掘るかのフラグ。
-	// trueだとフォルダを掘らない。
+	// Flag whether to dig a folder each time the evaluation function is saved.
+	// If true, do not dig the folder.
 	bool save_only_once;
 
-	// --- lossの計算
+	// --- loss calculation
 
-#if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
-	// 学習用データのロスの計算用
+#if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
+	// For calculation of learning data loss
 	atomic<double> learn_sum_cross_entropy_eval;
 	atomic<double> learn_sum_cross_entropy_win;
 	atomic<double> learn_sum_cross_entropy;
@@ -1604,20 +1604,21 @@ struct LearnerThink: public MultiThink
 	uint64_t loss_output_interval;
 	uint64_t mirror_percentage;
 
-	// ロスの計算。
-	// done : 今回対象とした局面数
+	// Loss calculation.
+	// done: Number of phases targeted this time
 	void calc_loss(size_t thread_id , uint64_t done);
 
-	// ↑のlossの計算をタスクとして定義してやり、それを実行する
+	// Define the loss calculation in ↑ as a task and execute it
 	TaskDispatcher task_dispatcher;
 };
 
 void LearnerThink::calc_loss(size_t thread_id, uint64_t done)
 {
-	// 置換表にhitされてもかなわんので、このタイミングで置換表の世代を新しくする。
-	// 置換表を無効にしているなら関係ないのだが。
+	// There is no point in hitting the replacement table, so at this timing the generation of the replacement table is updated.
+	// It doesn't matter if you have disabled the substitution table.
 	TT.new_search();
 
+
 #if defined(EVAL_NNUE)
 	std::cout << "PROGRESS: " << now_string() << ", ";
 	std::cout << sr.total_done << " sfens";
@@ -1631,8 +1632,8 @@ void LearnerThink::calc_loss(size_t thread_id, uint64_t done)
 	double sum_error3 = 0;
 #endif
 
-#if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
-	// 検証用データのロスの計算用
+#if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
+	// For calculation of verification data loss
 	atomic<double> test_sum_cross_entropy_eval,test_sum_cross_entropy_win,test_sum_cross_entropy;
 	atomic<double> test_sum_entropy_eval,test_sum_entropy_win,test_sum_entropy;
 	test_sum_cross_entropy_eval = 0;
@@ -1642,16 +1643,16 @@ void LearnerThink::calc_loss(size_t thread_id, uint64_t done)
 	test_sum_entropy_win = 0;
 	test_sum_entropy = 0;
 
-	// 学習時のnorm
+	// norm for learning
 	atomic<double> sum_norm;
 	sum_norm = 0;
 #endif
 
-	// 深い探索のpvの初手と、search(1)のpvの初手の指し手が一致した回数。
+	// The number of times the pv first move of deep search matches the pv first move of search(1).
 	atomic<int> move_accord_count;
 	move_accord_count = 0;
 
-	// 平手の初期局面のeval()の値を表示させて、揺れを見る。
+	// Display the value of eval() in the initial stage of Hirate and see the shaking.
 	auto th = Threads[thread_id];
 	auto& pos = th->rootPos;
 	StateInfo si;
@@ -1660,36 +1661,36 @@ void LearnerThink::calc_loss(size_t thread_id, uint64_t done)
 
 	//Eval::print_eval_stat(pos);
 
-	// ここ、並列化したほうが良いのだがslaveの前の探索が終わってなかったりしてちょっと面倒。
-	// taskを呼び出すための仕組みを作ったのでそれを用いる。
+	// It's better to parallelize here, but it's a bit troublesome because the search before slave has not finished.
+	// I created a mechanism to call task, so I will use it.
 
-	// こなすべきtaskの数。
+	// The number of tasks to do.
 	atomic<int> task_count;
 	task_count = (int)sr.sfen_for_mse.size();
 	task_dispatcher.task_reserve(task_count);
 
-	// 局面の探索をするtaskを生成して各スレッドに振ってやる。
+	// Create a task to search for the situation and give it to each thread.
 	for (const auto& ps : sr.sfen_for_mse)
 	{
-		// TaskDispatcherを用いて各スレッドに作業を振る。
-		// そのためのタスクの定義。
-		// ↑で使っているposをcaptureされるとたまらんのでcaptureしたい変数は一つずつ指定しておく。
+		// Assign work to each thread using TaskDispatcher.
+		// A task definition for that.
+		// It is not possible to capture pos used in ↑, so specify the variables you want to capture one by one.
 		auto task = [&ps,&test_sum_cross_entropy_eval,&test_sum_cross_entropy_win,&test_sum_cross_entropy,&test_sum_entropy_eval,&test_sum_entropy_win,&test_sum_entropy, &sum_norm,&task_count ,&move_accord_count](size_t thread_id)
 		{
-			// これ、C++ではループごとに新たなpsのインスタンスをちゃんとcaptureするのだろうか.. →　するようだ。
+			// Does C++ properly capture a new ps instance for each loop?.
 			auto th = Threads[thread_id];
 			auto& pos = th->rootPos;
 			StateInfo si;
 			if (pos.set_from_packed_sfen(ps.sfen ,&si, th) != 0)
 			{
-				// 運悪くrmse計算用のsfenとして、不正なsfenを引いてしまっていた。
+				// Unfortunately, as an sfen for rmse calculation, an invalid sfen was drawn.
 				cout << "Error! : illegal packed sfen " << pos.fen() << endl;
 			}
 
-			// 浅い探索の評価値
-			// evaluate()の値を用いても良いのだが、ロスを計算するときにlearn_cross_entropyと
-			// 値が比較しにくくて困るのでqsearch()を用いる。
-			// EvalHashは事前に無効化してある。(そうしないと毎回同じ値が返ってしまう)
+			// Evaluation value for shallow search
+			// The value of evaluate() may be used, but when calculating loss, learn_cross_entropy and
+			// Use qsearch() because it is difficult to compare the values.
+			// EvalHash has been disabled in advance. (If not, the same value will be returned every time)
 			auto r = qsearch(pos);
 
 			auto shallow_value = r.first;
@@ -1707,34 +1708,34 @@ void LearnerThink::calc_loss(size_t thread_id, uint64_t done)
 					pos.undo_move(*it);
 			}
 
-			// 深い探索の評価値
+			// Evaluation value of deep search
 			auto deep_value = (Value)ps.score;
 
-			// 注) このコードは、learnコマンドでeval_limitを指定しているときのことを考慮してない。
+			// Note) This code does not consider when eval_limit is specified in the learn command.
 
-			// --- 誤差の計算
+			// --- error calculation
 
 #if !defined(LOSS_FUNCTION_IS_ELMO_METHOD)
 			auto grad = calc_grad(deep_value, shallow_value, ps);
 
-			// rmse的なもの
+			// something like rmse
 			sum_error += grad*grad;
-			// 勾配の絶対値を足したもの
+			// Add the absolute value of the gradient
 			sum_error2 += abs(grad);
-			// 評価値の差の絶対値を足したもの
+			// Add the absolute value of the difference between the evaluation values
 			sum_error3 += abs(shallow_value - deep_value);
 #endif
 
-			// --- 交差エントロピーの計算
+			// --- calculation of cross entropy
 
-			// とりあえずelmo methodの時だけ勝率項と勝敗項に関して
-			// 交差エントロピーを計算して表示させる。
+			// For the time being, regarding the win rate and loss terms only in the elmo method
+			// Calculate and display the cross entropy.
 
-#if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
+#if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
 			double test_cross_entropy_eval, test_cross_entropy_win, test_cross_entropy;
 			double test_entropy_eval, test_entropy_win, test_entropy;
 			calc_cross_entropy(deep_value, shallow_value, ps, test_cross_entropy_eval, test_cross_entropy_win, test_cross_entropy, test_entropy_eval, test_entropy_win, test_entropy);
-			// 交差エントロピーの合計は定義的にabs()をとる必要がない。
+			// The total cross entropy need not be abs() by definition.
 			test_sum_cross_entropy_eval += test_cross_entropy_eval;
 			test_sum_cross_entropy_win += test_cross_entropy_win;
 			test_sum_cross_entropy += test_cross_entropy;
@@ -1744,31 +1745,31 @@ void LearnerThink::calc_loss(size_t thread_id, uint64_t done)
 			sum_norm += (double)abs(shallow_value);
 #endif
 
-			// 教師の指し手と浅い探索のスコアが一致するかの判定
+			// Determine if the teacher's move and the score of the shallow search match
 			{
 				auto r = search(pos,1);
 				if ((uint16_t)r.second[0] == ps.move)
 					move_accord_count.fetch_add(1, std::memory_order_relaxed);
 			}
 
-			// こなしたのでタスク一つ減る
+			// Reduced one task because I did it
 			--task_count;
 		};
 
-		// 定義したタスクをslaveに投げる。
+		// Throw the defined task to slave.
 		task_dispatcher.push_task_async(task);
 	}
 
-	// 自分自身もslaveとして参加する
+	// join yourself as a slave
 	task_dispatcher.on_idle(thread_id);
 
-	// すべてのtaskの完了を待つ
+	// wait for all tasks to complete
 	while (task_count)
 		sleep(1);
 
 #if !defined(LOSS_FUNCTION_IS_ELMO_METHOD)
-	// rmse = root mean square error : 平均二乗誤差
-	// mae  = mean absolute error    : 平均絶対誤差
+	// rmse = root mean square error: mean square error
+	// mae = mean absolute error: mean absolute error
 	auto dsig_rmse = std::sqrt(sum_error / (sfen_for_mse.size() + epsilon));
 	auto dsig_mae = sum_error2 / (sfen_for_mse.size() + epsilon);
 	auto eval_mae = sum_error3 / (sfen_for_mse.size() + epsilon);
@@ -1782,8 +1783,8 @@ void LearnerThink::calc_loss(size_t thread_id, uint64_t done)
 	latest_loss_count += sr.sfen_for_mse.size();
 #endif
 
-	// learn_cross_entropyは、機械学習の世界ではtrain cross entropyと呼ぶべきかも知れないが、
-	// 頭文字を略するときに、lceと書いて、test cross entropy(tce)と区別出来たほうが嬉しいのでこうしてある。
+// learn_cross_entropy may be called train cross entropy in the world of machine learning,
+// When omitting the acronym, it is nice to be able to distinguish it from test cross entropy(tce) by writing it as lce.
 
 	if (sr.sfen_for_mse.size() && done)
 	{
@@ -1812,7 +1813,7 @@ void LearnerThink::calc_loss(size_t thread_id, uint64_t done)
 		cout << "Error! : sr.sfen_for_mse.size() = " << sr.sfen_for_mse.size() << " ,  done = " << done << endl;
 	}
 
-	// 次回のために0クリアしておく。
+	// Clear 0 for next time.
 	learn_sum_cross_entropy_eval = 0.0;
 	learn_sum_cross_entropy_win = 0.0;
 	learn_sum_cross_entropy = 0.0;
@@ -1836,11 +1837,11 @@ void LearnerThink::thread_worker(size_t thread_id)
 
 	while (true)
 	{
-		// mseの表示(これはthread 0のみときどき行う)
-		// ファイルから読み込んだ直後とかでいいような…。
+	// display mse (this is sometimes done only for thread 0)
+	// Immediately after being read from the file...
 
 #if defined(EVAL_NNUE)
-		// 更新中に評価関数を使わないようにロックする。
+		// Lock the evaluation function so that it is not used during updating.
 		shared_lock<shared_timed_mutex> read_lock(nn_mutex, defer_lock);
 		if (sr.next_update_weights <= sr.total_done ||
 		    (thread_id != 0 && !read_lock.try_lock()))
@@ -1850,20 +1851,20 @@ void LearnerThink::thread_worker(size_t thread_id)
 		{
 			if (thread_id != 0)
 			{
-				// thread_id == 0以外は、待機。
+				// Wait except thread_id == 0.
 
 				if (stop_flag)
 					break;
 
-				// rmseの計算などを並列化したいのでtask()が積まれていればそれを処理する。
+				// I want to parallelize rmse calculation etc., so if task() is loaded, process it.
 				task_dispatcher.on_idle(thread_id);
 				continue;
 			}
 			else
 			{
-				// thread_id == 0だけが以下の更新処理を行なう。
+				// Only thread_id == 0 performs the following update process.
 
-				// 初回はweight配列の更新は行わない。
+				// The weight array is not updated for the first time.
 				if (sr.next_update_weights == 0)
 				{
 					sr.next_update_weights += mini_batch_size;
@@ -1871,33 +1872,33 @@ void LearnerThink::thread_worker(size_t thread_id)
 				}
 
 #if !defined(EVAL_NNUE)
-				// 現在時刻を出力。毎回出力する。
+				// Output the current time. Output every time.
 				std::cout << sr.total_done << " sfens , at " << now_string() << std::endl;
 
-				// このタイミングで勾配をweight配列に反映。勾配の計算も1M局面ごとでmini-batch的にはちょうどいいのでは。
+				// Reflect the gradient in the weight array at this timing. The calculation of the gradient is just right for each 1M phase in terms of mini-batch.
 				Eval::update_weights(epoch , freeze);
 
-				// デバッグ用にepochと現在のetaを表示してやる。
+				// Display epoch and current eta for debugging.
 				std::cout << "epoch = " << epoch << " , eta = " << Eval::get_eta() << std::endl;
 #else
 				{
-					// パラメータの更新
+					// update parameters
 
-					// 更新中に評価関数を使わないようにロックする。
+					// Lock the evaluation function so that it is not used during updating.
 					lock_guard<shared_timed_mutex> write_lock(nn_mutex);
 					Eval::NNUE::UpdateParameters(epoch);
 				}
 #endif
 				++epoch;
 
-				// 10億局面ごとに1回保存、ぐらいの感じで。
+				// Save once every 1 billion phases.
 
-				// ただし、update_weights(),calc_rmse()している間の時間経過は無視するものとする。
+				// However, the elapsed time during update_weights() and calc_rmse() is ignored.
 				if (++sr.save_count * mini_batch_size >= eval_save_interval)
 				{
 					sr.save_count = 0;
 
-					// この間、gradientの計算が進むと値が大きくなりすぎて困る気がするので他のスレッドを停止させる。
+					// During this time, as the gradient calculation proceeds, the value becomes too large and I feel annoyed, so stop other threads.
 					const bool converged = save();
 					if (converged)
 					{
@@ -1907,33 +1908,33 @@ void LearnerThink::thread_worker(size_t thread_id)
 					}
 				}
 
-				// rmseを計算する。1万局面のサンプルに対して行う。
-				// 40コアでやると100万局面ごとにupdate_weightsするとして、特定のスレッドが
-				// つきっきりになってしまうのあまりよくないような気も…。
+				// Calculate rmse. This is done for samples of 10,000 phases.
+				// If you do with 40 cores, update_weights every 1 million phases
+				// I don't think it's so good to be tiring.
 				static uint64_t loss_output_count = 0;
 				if (++loss_output_count * mini_batch_size >= loss_output_interval)
 				{
 					loss_output_count = 0;
 
-					// 今回処理した件数
+					// Number of cases processed this time
 					uint64_t done = sr.total_done - sr.last_done;
 
-					// lossの計算
+					// loss calculation
 					calc_loss(thread_id , done);
 
 #if defined(EVAL_NNUE)
 					Eval::NNUE::CheckHealth();
 #endif
 
-					// どこまで集計したかを記録しておく。
+					// Make a note of how far you have totaled.
 					sr.last_done = sr.total_done;
 				}
 
-				// 次回、この一連の処理は、次回、mini_batch_sizeだけ処理したときに再度やって欲しい。
+				// Next time, I want you to do this series of processing again when you process only mini_batch_size.
 				sr.next_update_weights += mini_batch_size;
 
-				// main thread以外は、このsr.next_update_weightsの更新を待っていたので
-				// この値が更新されると再度動き始める。				
+				// Since I was waiting for the update of this sr.next_update_weights except the main thread,
+				// Once this value is updated, it will start moving again.
 			}
 		}
 
@@ -1941,17 +1942,17 @@ void LearnerThink::thread_worker(size_t thread_id)
 	RetryRead:;
 		if (!sr.read_to_thread_buffer(thread_id, ps))
 		{
-			// 自分のスレッド用の局面poolを使い尽くした。
-			// 局面がもうほとんど残っていないということだから、
-			// 他のスレッドもすべて終了させる。
+			// ran out of thread pool for my thread.
+			// Because there are almost no phases left,
+			// Terminate all other threads.
 
 			stop_flag = true;
 			break;
 		}
 
-		// 評価値が学習対象の値を超えている。
-		// この局面情報を無視する。
-		if (eval_limit < abs(ps.score))
+		// The evaluation value exceeds the learning target value.
+		// Ignore this aspect information.
+		if (eval_limit <abs(ps.score))
 			goto RetryRead;
 
 
@@ -1959,7 +1960,7 @@ void LearnerThink::thread_worker(size_t thread_id)
 			goto RetryRead;
 
 
-		// 序盤局面に関する読み飛ばし
+		// Skip over the opening phase
 		if (ps.gamePly < prng.rand(reduction_gameply))
 			goto RetryRead;
 
@@ -1967,70 +1968,70 @@ void LearnerThink::thread_worker(size_t thread_id)
 		auto sfen = pos.sfen_unpack(ps.data);
 		pos.set(sfen);
 #endif
-		// ↑sfenを経由すると遅いので専用の関数を作った。
+		// ↑ Since it is slow when passing through sfen, I made a dedicated function.
 		StateInfo si;
 		const bool mirror = prng.rand(100) < mirror_percentage;
 		if (pos.set_from_packed_sfen(ps.sfen,&si,th,mirror) != 0)
 		{
-			// 変なsfenを掴かまされた。デバッグすべき！
-			// 不正なsfenなのでpos.sfen()で表示できるとは限らないが、しないよりマシ。
+			// I got a strange sfen. Should be debugged!
+			// Since it is an illegal sfen, it may not be displayed with pos.sfen(), but it is better than not.
 			cout << "Error! : illigal packed sfen = " << pos.fen() << endl;
 			goto RetryRead;
 		}
 #if !defined(EVAL_NNUE)
 		{
 			auto key = pos.key();
-			// rmseの計算用に使っている局面なら除外する。
+			// Exclude the phase used for rmse calculation.
 			if (sr.is_for_rmse(key) && use_hash_in_training)
 				goto RetryRead;
 
-			// 直近で用いた局面も除外する。
+			// Exclude the most recently used aspect.
 			auto hash_index = size_t(key & (sr.READ_SFEN_HASH_SIZE - 1));
 			auto key2 = sr.hash[hash_index];
 			if (key == key2 && use_hash_in_training)
 				goto RetryRead;
-			sr.hash[hash_index] = key; // 今回のkeyに入れ替えておく。
+			sr.hash[hash_index] = key; // Replace with the current key.
 		}
 #endif
 
-		// 全駒されて詰んでいる可能性がある。
-		// また宣言勝ちの局面はPVの指し手でleafに行けないので学習から除外しておく。
-		// (そのような教師局面自体を書き出すべきではないのだが古い生成ルーチンで書き出しているかも知れないので)
-    // Skip the position if there are no legal moves (=checkmated or stalemate).
+		// There is a possibility that all the pieces are blocked and stuck.
+		// Also, the declaration win phase is excluded from learning because you cannot go to leaf with PV moves.
+		// (shouldn't write out such teacher aspect itself, but may have written it out with an old generation routine)
+	// Skip the position if there are no legal moves (=checkmated or stalemate).
 		if (MoveList<LEGAL>(pos).size() == 0)
 			goto RetryRead;
 
-		// 読み込めたので試しに表示してみる。
+		// I can read it, so try displaying it.
 		//		cout << pos << value << endl;
 
-		// 浅い探索(qsearch)の評価値
+		// Evaluation value of shallow search (qsearch)
 		auto r = qsearch(pos);
 		auto pv = r.second;
 
-		// 深い探索の評価値
+		// Evaluation value of deep search
 		auto deep_value = (Value)ps.score;
 
-		// mini batchのほうが勾配が出ていいような気がする。
-		// このままleaf nodeに行って、勾配配列にだけ足しておき、あとでrmseの集計のときにAdaGradしてみる。
+		// I feel that the mini batch has a better gradient.
+		// Go to the leaf node as it is, add only to the gradient array, and later try AdaGrad at the time of rmse aggregation.
 
 		auto rootColor = pos.side_to_move();
 
-		// PVの初手が異なる場合は学習に用いないほうが良いのでは…。
-		// 全然違うところを探索した結果だとそれがノイズに成りかねない。
-		// 評価値の差が大きすぎるところも学習対象としないほうがいいかも…。
+		// If the initial PV is different, it is better not to use it for learning.
+		// If it is the result of searching a completely different place, it may become noise.
+		// It may be better not to study where the difference in evaluation values ​​is too large.
 
 #if 0
-		// これやると13%程度の局面が学習対象から外れてしまう。善悪は微妙。
+		// If you do this, about 13% of the phases will be excluded from the learning target. Good and bad are subtle.
 		if (pv.size() >= 1 && (uint16_t)pv[0] != ps.move)
 		{
-//			dbg_hit_on(false);
+			// dbg_hit_on(false);
 			continue;
 		}
 #endif
 
 #if 0
-		// 評価値の差が大きすぎるところも学習対象としないほうがいいかも…。
-		// →　勝率の関数を通すのでまあいいか…。30%ぐらいの局面が学習対象から外れてしまうしな…。
+		// It may be better not to study where the difference in evaluation values ​​is too large.
+		// → It's okay because it passes the win rate function... About 30% of the phases are out of the scope of learning...
 		if (abs((int16_t)r.first - ps.score) >= Eval::PawnValue * 4)
 		{
 //			dbg_hit_on(false);
@@ -2041,18 +2042,18 @@ void LearnerThink::thread_worker(size_t thread_id)
 
 		int ply = 0;
 
-		// 現在の局面に対して勾配を加算するヘルパー関数。
+		// A helper function that adds the gradient to the current phase.
 		auto pos_add_grad = [&]() {
-			// shallow_valueとして、leafでのevaluateの値を用いる。
-			// qsearch()の戻り値をshallow_valueとして用いると、
-			// PVが途中で途切れている場合、勾配を計算するのにevaluate()を呼び出した局面と、
-			// その勾配を与える局面とが異なることになるので、これはあまり好ましい性質ではないと思う。
-			// 置換表をオフにはしているのだが、1手詰みなどはpv配列を更新していないので…。
+			// Use the value of evaluate in leaf as shallow_value.
+			// Using the return value of qsearch() as shallow_value,
+			// If PV is interrupted in the middle, the phase where evaluate() is called to calculate the gradient, and
+			// I don't think this is a very desirable property, as the aspect that gives that gradient will be different.
+			// I have turned off the substitution table, but since the pv array has not been updated due to one stumbling block etc...
 
 			Value shallow_value = (rootColor == pos.side_to_move()) ? Eval::evaluate(pos) : -Eval::evaluate(pos);
 
-#if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
-			// 学習データに対するロスの計算
+#if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
+			// Calculate loss for training data
 			double learn_cross_entropy_eval, learn_cross_entropy_win, learn_cross_entropy;
 			double learn_entropy_eval, learn_entropy_win, learn_entropy;
 			calc_cross_entropy(deep_value, shallow_value, ps, learn_cross_entropy_eval, learn_cross_entropy_win, learn_cross_entropy, learn_entropy_eval, learn_entropy_win, learn_entropy);
@@ -2065,17 +2066,17 @@ void LearnerThink::thread_worker(size_t thread_id)
 #endif
 
 #if !defined(EVAL_NNUE)
-			// 勾配
+			// Slope
 			double dj_dw = calc_grad(deep_value, shallow_value, ps);
 
-			// 現在、leaf nodeで出現している特徴ベクトルに対する勾配(∂J/∂Wj)として、jd_dwを加算する。
+			// Add jd_dw as the gradient (∂J/∂Wj) for the feature vector currently appearing in the leaf node.
 
-			// PV終端でなければ割引率みたいなものを適用。
+			// If it is not PV termination, apply a discount rate.
 			if (discount_rate != 0 && ply != (int)pv.size())
 				dj_dw *= discount_rate;
 
-			// leafに到達したのでこの局面に出現している特徴に勾配を加算しておく。
-			// 勾配に基づくupdateはのちほど行なう。
+			// Since we have reached leaf, add the gradient to the features that appear in this phase.
+			// Update based on gradient later.
 			Eval::add_grad(pos, rootColor, dj_dw, freeze);
 #else
 			const double example_weight =
@@ -2083,15 +2084,15 @@ void LearnerThink::thread_worker(size_t thread_id)
 			Eval::NNUE::AddExample(pos, rootColor, ps, example_weight);
 #endif
 
-			// 処理が終了したので処理した件数のカウンターをインクリメント
+			// Since the processing is completed, the counter of the processed number is incremented
 			sr.total_done++;
 		};
 
-		StateInfo state[MAX_PLY]; // qsearchのPVがそんなに長くなることはありえない。
+		StateInfo state[MAX_PLY]; // PV of qsearch cannot be so long.
 		bool illegal_move = false;
 		for (auto m : pv)
 		{
-			// 非合法手はやってこないはずなのだが。
+			// I shouldn't be an illegal player.
 			// An illegal move sometimes comes here...
 			if (!pos.pseudo_legal(m) || !pos.legal(m))
 			{
@@ -2101,14 +2102,14 @@ void LearnerThink::thread_worker(size_t thread_id)
 				break;
 			}
 
-			// 各PV上のnodeでも勾配を加算する場合の処理。
-			// discount_rateが0のときはこの処理は行わない。
+			// Processing when adding the gradient to the node on each PV.
+			//If discount_rate is 0, this process is not performed.
 			if (discount_rate != 0)
 				pos_add_grad();
 
 			pos.do_move(m, state[ply++]);
-			
-			// leafでのevaluateの値を用いるので差分更新していく。
+
+			// Since the value of evaluate in leaf is used, the difference is updated.
 			Eval::evaluate_with_no_return(pos);
 		}
 
@@ -2117,15 +2118,15 @@ void LearnerThink::thread_worker(size_t thread_id)
 			continue;
 		}
 
-		// PVの終端局面に達したので、ここで勾配を加算する。
+		// Since we have reached the end phase of PV, add the slope here.
 		pos_add_grad();
 
-		// 局面を巻き戻す
+		// rewind the phase
 		for (auto it = pv.rbegin(); it != pv.rend(); ++it)
 			pos.undo_move(*it);
 
 #if 0
-		// rootの局面にも勾配を加算する場合
+		// When adding the gradient to the root phase
 		shallow_value = (rootColor == pos.side_to_move()) ? Eval::evaluate(pos) : -Eval::evaluate(pos);
 		dj_dw = calc_grad(deep_value, shallow_value, ps);
 		Eval::add_grad(pos, rootColor, dj_dw , without_kpp);
@@ -2135,19 +2136,19 @@ void LearnerThink::thread_worker(size_t thread_id)
 
 }
 
-// 評価関数ファイルの書き出し。
+// Write evaluation function file.
 bool LearnerThink::save(bool is_final)
 {
-	// 保存前にcheck sumを計算して出力しておく。(次に読み込んだときに合致するか調べるため)
-	std::cout << "Check Sum = " << std::hex << Eval::calc_check_sum() << std::dec << std::endl;
+	// Calculate and output check sum before saving. (To check if it matches the next time)
+	std::cout << "Check Sum = "<< std::hex << Eval::calc_check_sum() << std::dec << std::endl;
 
-	// 保存ごとにファイル名の拡張子部分を"0","1","2",..のように変えていく。
-	// (あとでそれぞれの評価関数パラメーターにおいて勝率を比較したいため)
+	// Each time you save, change the extension part of the file name like "0","1","2",..
+	// (Because I want to compare the winning rate for each evaluation function parameter later)
 
 	if (save_only_once)
 	{
-		// EVAL_SAVE_ONLY_ONCEが定義されているときは、
-		// 1度だけの保存としたいのでサブフォルダを掘らない。
+		// When EVAL_SAVE_ONLY_ONCE is defined,
+		// Do not dig a subfolder because I want to save it only once.
 		Eval::save_eval("");
 	}
 	else if (is_final) {
@@ -2196,26 +2197,26 @@ bool LearnerThink::save(bool is_final)
 	return false;
 }
 
-// shuffle_files() , shuffle_files_quick()の下請けで、書き出し部分。
-// output_file_name : 書き出すファイル名
-// prng : 乱数
-// afs  : それぞれの教師局面ファイルのfstream
-// a_count : それぞれのファイルに内在する教師局面の数。
+// Shuffle_files(), shuffle_files_quick() subcontracting, writing part.
+// output_file_name: Name of the file to write
+// prng: random number
+// afs: fstream of each teacher phase file
+// a_count: The number of teacher positions inherent in each file.
 void shuffle_write(const string& output_file_name , PRNG& prng , vector<fstream>& afs , vector<uint64_t>& a_count)
 {
 	uint64_t total_sfen_count = 0;
 	for (auto c : a_count)
 		total_sfen_count += c;
 
-	// 書き出した局面数
+	// number of exported phases
 	uint64_t write_sfen_count = 0;
 
-	// 進捗をこの局面数ごとに画面に出力する。
+	// Output the progress on the screen for each phase.
 	const uint64_t buffer_size = 10000000;
 
 	auto print_status = [&]()
 	{
-		// 10M局面ごと、もしくは、すべての書き出しが終わったときに進捗を出力する
+		// Output progress every 10M phase or when all writing is completed
 		if (((write_sfen_count % buffer_size) == 0) ||
 			(write_sfen_count == total_sfen_count))
 			cout << write_sfen_count << " / " << total_sfen_count << endl;
@@ -2226,7 +2227,7 @@ void shuffle_write(const string& output_file_name , PRNG& prng , vector<fstream>
 
 	fstream fs(output_file_name, ios::out | ios::binary);
 
-	// 教師局面の合計
+	// total teacher positions
 	uint64_t sum = 0;
 	for (auto c : a_count)
 		sum += c;
@@ -2235,22 +2236,22 @@ void shuffle_write(const string& output_file_name , PRNG& prng , vector<fstream>
 	{
 		auto r = prng.rand(sum);
 
-		// fs[0]のファイルに格納されている局面 ... fs[1]のファイルに格納されている局面 ...
-		// のようにひと続きになっているものと考えて、rがどのファイルに格納されている局面を指しているかを確定させる。
-		// ファイルの中身はシャッフルされているので、そのファイルから次の要素を1つ取ってくれば良い。
-		// それぞれのファイルにはa_count[x]ずつ局面が残っているので、この処理は以下のように書ける。
+		// Aspects stored in fs[0] file ... Aspects stored in fs[1] file ...
+		//Think of it as a series like, and determine in which file r is pointing.
+		// The contents of the file are shuffled, so you can take the next element from that file.
+		// Each file has a_count[x] phases, so this process can be written as follows.
 
 		uint64_t n = 0;
 		while (a_count[n] <= r)
 			r -= a_count[n++];
 
-		// これでnが確定した。忘れないうちに残り件数を減らしておく。
+		// This confirms n. Before you forget it, reduce the remaining number.
 
 		--a_count[n];
 		--sum;
 
 		PackedSfenValue psv;
-		// これ、パフォーマンスあんまりよくないまでまとめて読み書きしたほうが良いのだが…。
+		// It's better to read and write all at once until the performance is not so good...
 		if (afs[n].read((char*)&psv, sizeof(PackedSfenValue)))
 		{
 			fs.write((char*)&psv, sizeof(PackedSfenValue));
@@ -2263,47 +2264,47 @@ void shuffle_write(const string& output_file_name , PRNG& prng , vector<fstream>
 	cout << "done!" << endl;
 }
 
-// 教師局面のシャッフル "learn shuffle"コマンドの下請け。
-// output_file_name : シャッフルされた教師局面が書き出される出力ファイル名
+// Subcontracting the teacher shuffle "learn shuffle" command.
+// output_file_name: name of the output file where the shuffled teacher positions will be written
 void shuffle_files(const vector<string>& filenames , const string& output_file_name , uint64_t buffer_size )
 {
-	// 出力先のフォルダは
-	// tmp/               一時書き出し用
+	// The destination folder is
+	// tmp/ for temporary writing
 
-	// テンポラリファイルはbuffer_size局面ずつtmp/フォルダにいったん書き出す。
-	// 例えば、buffer_size = 20Mならば 20M*40bytes = 800MBのバッファが必要。
-	// メモリが少ないPCでは、ここを減らすと良いと思う。
-	// ただし、あまりファイル数が増えるとOSの制限などから同時にopen出来なくなる。
-	// Windowsだと1プロセス512という制約があったはずなので、ここでopen出来るのが500として、
-	// 現在の設定で500ファイル×20M = 10G = 100億局面が限度。
+	// Temporary file is written to tmp/ folder for each buffer_size phase.
+	// For example, if buffer_size = 20M, you need a buffer of 20M*40bytes = 800MB.
+	// In a PC with a small memory, it would be better to reduce this.
+	// However, if the number of files increases too much, it will not be possible to open at the same time due to OS restrictions.
+	// There should have been a limit of 512 per process on Windows, so you can open here as 500,
+	// The current setting is 500 files x 20M = 10G = 10 billion phases.
 
 	PSVector buf;
 	buf.resize(buffer_size);
-	// ↑のバッファ、どこまで使ったかを示すマーカー
+	// ↑ buffer, a marker that indicates how much you have used
 	uint64_t buf_write_marker = 0;
 
-	// 書き出すファイル名(連番なのでインクリメンタルカウンター)
+	// File name to write (incremental counter because it is a serial number)
 	uint64_t write_file_count = 0;
 
-	// シャッフルするための乱数
+	// random number to shuffle
 	PRNG prng((std::random_device())());
 
-	// テンポラリファイルの名前を生成する
+	// generate the name of the temporary file
 	auto make_filename = [](uint64_t i)
 	{
 		return "tmp/" + to_string(i) + ".bin";
 	};
 
-	// 書き出したtmp/フォルダのファイル、それぞれに格納されている教師局面の数
+	// Exported files in tmp/ folder, number of teacher positions stored in each
 	vector<uint64_t> a_count;
 
 	auto write_buffer = [&](uint64_t size)
 	{
-		// buf[0]～buf[size-1]までをshuffle
+		// shuffle from buf[0] to buf[size-1]
 		for (uint64_t i = 0; i < size; ++i)
 			swap(buf[i], buf[(uint64_t)(prng.rand(size - i) + i)]);
 
-		// ファイルに書き出す
+		// write to a file
 		fstream fs;
 		fs.open(make_filename(write_file_count++), ios::out | ios::binary);
 		fs.write((char*)&buf[0], size * sizeof(PackedSfenValue));
@@ -2316,7 +2317,7 @@ void shuffle_files(const vector<string>& filenames , const string& output_file_n
 
 	Dependency::mkdir("tmp");
 
-	// 10M局面の細切れファイルとしてシャッフルして書き出す。
+	// Shuffle and export as a 10M phase shredded file.
 	for (auto filename : filenames)
 	{
 		fstream fs(filename, ios::in | ios::binary);
@@ -2325,57 +2326,57 @@ void shuffle_files(const vector<string>& filenames , const string& output_file_n
 			if (++buf_write_marker == buffer_size)
 				write_buffer(buffer_size);
 
-		// sizeof(PackedSfenValue)単位で読み込んでいき、
-		// 最後に残っている端数は無視する。(fs.readで失敗するのでwhileを抜ける)
-		// (最後に残っている端数は、教師生成時に途中で停止させたために出来た中途半端なデータだと思われる。)
+		// Read in units of sizeof(PackedSfenValue),
+		// Ignore the last remaining fraction. (Fails in fs.read, so exit while)
+		// (The remaining fraction seems to be half-finished data that was created because it was stopped halfway during teacher generation.)
 
 	}
 
 	if (buf_write_marker != 0)
 		write_buffer(buf_write_marker);
 
-	// シャッフルされたファイルがwrite_file_count個だけ書き出された。
-	// 2pass目として、これをすべて同時にオープンし、ランダムに1つずつ選択して1局面ずつ読み込めば
-	// これにてシャッフルされたことになる。
+	// Only shuffled files have been written write_file_count.
+	// As a second pass, if you open all of them at the same time, select one at random and load one phase at a time
+	// Now you have shuffled.
 
-	// シャツフルする元ファイル+tmpファイル+書き出すファイルで元ファイルの3倍のストレージ容量が必要になる。
-	// 100億局面400GBなのでシャッフルするために1TBのSSDでは足りない。
-	// tmpに書き出しが終わったこのタイミングで元ファイルを消す(あるいは手で削除してしまう)なら、
-	// 元ファイルの2倍程度のストレージ容量で済む。
-	// だから、元ファイルを消すためのオプションを用意すべきかも知れない。
+	// Original file for shirt full + tmp file + file to write requires 3 times the storage capacity of the original file.
+	// 1 billion SSD is not enough for shuffling because it is 400GB for 10 billion phases.
+	// If you want to delete (or delete by hand) the original file at this point after writing to tmp,
+	// The storage capacity is about twice that of the original file.
+	// So, maybe we should have an option to delete the original file.
 
-	// ファイルの同時openをしている。これがFOPEN_MAXなどを超える可能性は高い。
-	// その場合、buffer_sizeを調整して、ファイルの数を減らすよりない。
+	// Files are opened at the same time. It is highly possible that this will exceed FOPEN_MAX.
+	// In that case, rather than adjusting buffer_size to reduce the number of files.
 
 	vector<fstream> afs;
 	for (uint64_t i = 0; i < write_file_count; ++i)
 		afs.emplace_back(fstream(make_filename(i),ios::in | ios::binary));
 
-	// 下請け関数に丸投げして終わり。
+	// Throw to the subcontract function and end.
 	shuffle_write(output_file_name, prng, afs, a_count);
 }
 
-// 教師局面のシャッフル "learn shuffleq"コマンドの下請け。
-// こちらは1passで書き出す。
-// output_file_name : シャッフルされた教師局面が書き出される出力ファイル名
+// Subcontracting the teacher shuffle "learn shuffleq" command.
+// This is written in 1 pass.
+// output_file_name: name of the output file where the shuffled teacher positions will be written
 void shuffle_files_quick(const vector<string>& filenames, const string& output_file_name)
 {
-	// 読み込んだ局面数
+	// number of phases read
 	uint64_t read_sfen_count = 0;
 
-	// シャッフルするための乱数
+	// random number to shuffle
 	PRNG prng((std::random_device())());
 
-	// ファイルの数
+	// number of files
 	size_t file_count = filenames.size();
 
-	// filenamesのファイルそれぞれに格納されている教師局面の数
+	// Number of teacher positions stored in each file in filenames
 	vector<uint64_t> a_count(file_count);
 
-	// それぞれのファイルの教師局面の数をカウントする。
+	// Count the number of teacher aspects in each file.
 	vector<fstream> afs(file_count);
 
-	for (size_t i = 0; i < file_count ; ++i)
+	for (size_t i = 0; i <file_count ;++i)
 	{
 		auto filename = filenames[i];
 		auto& fs = afs[i];
@@ -2383,28 +2384,28 @@ void shuffle_files_quick(const vector<string>& filenames, const string& output_f
 		fs.open(filename, ios::in | ios::binary);
 		fs.seekg(0, fstream::end);
 		uint64_t eofPos = (uint64_t)fs.tellg();
-		fs.clear(); // これをしないと次のseekに失敗することがある。
+		fs.clear(); // Otherwise, the next seek may fail.
 		fs.seekg(0, fstream::beg);
 		uint64_t begPos = (uint64_t)fs.tellg();
 		uint64_t file_size = eofPos - begPos;
 		uint64_t sfen_count = file_size / sizeof(PackedSfenValue);
 		a_count[i] = sfen_count;
 
-		// 各ファイルに格納されていたsfenの数を出力する。
+		// Output the number of sfen stored in each file.
 		cout << filename << " = " << sfen_count << " sfens." << endl;
 	}
 
-	// それぞれのファイルのファイルサイズがわかったので、
-	// これらをすべて同時にオープンし(すでにオープンされている)、
-	// ランダムに1つずつ選択して1局面ずつ読み込めば
-	// これにてシャッフルされたことになる。
+	// Since we know the file size of each file,
+	// open them all at once (already open),
+	// Select one at a time and load one phase at a time
+	// Now you have shuffled.
 
-	// 下請け関数に丸投げして終わり。
+	// Throw to the subcontract function and end.
 	shuffle_write(output_file_name, prng, afs, a_count);
 }
 
-// 教師局面のシャッフル "learn shufflem"コマンドの下請け。
-// メモリに丸読みして指定ファイル名で書き出す。
+// Subcontracting the teacher shuffle "learn shufflem" command.
+// Read the whole memory and write it out with the specified file name.
 void shuffle_files_on_memory(const vector<string>& filenames,const string output_file_name)
 {
 	PSVector buf;
@@ -2414,14 +2415,14 @@ void shuffle_files_on_memory(const vector<string>& filenames,const string output
 		std::cout << "read : " << filename << std::endl;
 		read_file_to_memory(filename, [&buf](uint64_t size) {
 			assert((size % sizeof(PackedSfenValue)) == 0);
-			// バッファを拡充して、前回の末尾以降に読み込む。
+			// Expand the buffer and read after the last end.
 			uint64_t last = buf.size();
 			buf.resize(last + size / sizeof(PackedSfenValue));
 			return (void*)&buf[last];
 		});
 	}
 
-	// buf[0]～buf[size-1]までをshuffle
+	// shuffle from buf[0] to buf[size-1]
 	PRNG prng((std::random_device())());
 	uint64_t size = (uint64_t)buf.size();
 	std::cout << "shuffle buf.size() = " << size << std::endl;
@@ -2430,7 +2431,7 @@ void shuffle_files_on_memory(const vector<string>& filenames,const string output
 
 	std::cout << "write : " << output_file_name << endl;
 
-	// 書き出すファイルが2GBを超えるとfstream::write一発では書き出せないのでwrapperを用いる。
+	// If the file to be written exceeds 2GB, it cannot be written in one shot with fstream::write, so use wrapper.
 	write_memory_to_file(output_file_name, (void*)&buf[0], (uint64_t)sizeof(PackedSfenValue)*(uint64_t)buf.size());
 
 	std::cout << "..shuffle_on_memory done." << std::endl;
@@ -2443,7 +2444,7 @@ void convert_bin(const vector<string>& filenames, const string& output_file_name
 	uint64_t filtered_size = 0;
 	auto th = Threads.main();
 	auto &tpos = th->rootPos;
-	// plain形式の雑巾をやねうら王用のpackedsfenvalueに変換する
+	// convert plain rag to packed sfenvalue for Yaneura king
 	fs.open(output_file_name, ios::app | ios::binary);
 	StateListPtr states;
 	for (auto filename : filenames) {
@@ -2454,9 +2455,8 @@ void convert_bin(const vector<string>& filenames, const string& output_file_name
 		PackedSfenValue p;
 		data_size = 0;
 		filtered_size = 0;
-		p.gamePly = 1; // apery形式では含まれない。一応初期化するべし
+		p.gamePly = 1; // Not included in apery format. Should be initialized
 		bool ignore_flag = false;
-
 		while (std::getline(ifs, line)) {
 			std::stringstream ss(line);
 			std::string token;
@@ -2480,7 +2480,7 @@ void convert_bin(const vector<string>& filenames, const string& output_file_name
 				if(temp < ply_minimum || temp > ply_maximum){
 				  ignore_flag = true;
 				}
-				p.gamePly = uint16_t(temp); // 此処のキャストいらない？
+				p.gamePly = uint16_t(temp); // No cast here?
 				if (interpolate_eval != 0){
 				  p.score = min(3000, interpolate_eval * temp);
 				}
@@ -2488,7 +2488,7 @@ void convert_bin(const vector<string>& filenames, const string& output_file_name
 			else if (token == "result") {
 				int temp;
 				ss >> temp;
-				p.game_result = int8_t(temp); // 此処のキャストいらない？
+				p.game_result = int8_t(temp); // Do you need a cast here?
 				if (interpolate_eval){
 				  p.score = p.score * p.game_result;
 				}
@@ -2702,7 +2702,6 @@ void convert_bin_from_pgn_extract(const vector<string>& filenames, const string&
 	std::cout << now_string() << " all done" << std::endl;
 	ofs.close();
 }
-
 //void convert_plain(const vector<string>& filenames , const string& output_file_name)
 //{
 //	Position tpos;
@@ -2711,14 +2710,14 @@ void convert_bin_from_pgn_extract(const vector<string>& filenames, const string&
 //	for (auto filename : filenames) {
 //		std::cout << "convert " << filename << " ... ";
 //
-//		// ひたすらpackedsfenvalueをテキストに変換する
+// 		// Just convert packedsfenvalue to text
 //		std::fstream fs;
 //		fs.open(filename, ios::in | ios::binary);
 //		PackedSfenValue p;
 //		while (true)
 //		{
 //			if (fs.read((char*)&p, sizeof(PackedSfenValue))) {
-//				// plain textとして書き込む
+// 				// write as plain text
 //				ofs << "sfen " << tpos.sfen_unpack(p.sfen) << std::endl;
 //				ofs << "move " << to_usi_string(Move(p.move)) << std::endl;
 //				ofs << "score " << p.score << std::endl;
@@ -2737,7 +2736,7 @@ void convert_bin_from_pgn_extract(const vector<string>& filenames, const string&
 //	std::cout << "all done" << std::endl;
 //}
 
-// 生成した棋譜からの学習
+// Learning from the generated game record
 void learn(Position&, istringstream& is)
 {
 	auto thread_num = (int)Options["Threads"];
@@ -2746,62 +2745,62 @@ void learn(Position&, istringstream& is)
 	LearnerThink learn_think(sr);
 	vector<string> filenames;
 
-	// mini_batch_size デフォルトで1M局面。これを大きくできる。
+	// mini_batch_size 1M aspect by default. This can be increased.
 	auto mini_batch_size = LEARN_MINI_BATCH_SIZE;
 
-	// ループ回数(この回数だけ棋譜ファイルを読み込む)
+	// Number of loops (read the game record file this number of times)
 	int loop = 1;
 
-	// 棋譜ファイル格納フォルダ(ここから相対pathで棋譜ファイルを取得)
+	// Game file storage folder (get game file with relative path from here)
 	string base_dir;
 
 	string target_dir;
 
-	// 0であれば、デフォルト値になる。
+	// If 0, it will be the default value.
 	double eta1 = 0.0;
 	double eta2 = 0.0;
 	double eta3 = 0.0;
-	uint64_t eta1_epoch = 0; // defaultではeta2は適用されない
-	uint64_t eta2_epoch = 0; // defaultではeta3は適用されない
+	uint64_t eta1_epoch = 0; // eta2 is not applied by default
+	uint64_t eta2_epoch = 0; // eta3 is not applied by default
 
 #if defined(USE_GLOBAL_OPTIONS)
-	// あとで復元するために保存しておく。
+	// Save it for later restore.
 	auto oldGlobalOptions = GlobalOptions;
-	// eval hashにhitするとrmseなどの計算ができなくなるのでオフにしておく。
+	// If you hit the eval hash, you can not calculate rmse etc. so turn it off.
 	GlobalOptions.use_eval_hash = false;
-	// 置換表にhitするとそこで以前の評価値で枝刈りがされることがあるのでオフにしておく。
+	// If you hit the replacement table, pruning may occur at the previous evaluation value, so turn it off.
 	GlobalOptions.use_hash_probe = false;
 #endif
 
-	// --- 教師局面をシャッフルするだけの機能
+	// --- Function that only shuffles the teacher aspect
 
-	// 通常シャッフル
+	// normal shuffle
 	bool shuffle_normal = false;
 	uint64_t buffer_size = 20000000;
-	// それぞれのファイルがシャッフルされていると仮定しての高速シャッフル
+	// fast shuffling assuming each file is shuffled
 	bool shuffle_quick = false;
-	// メモリにファイルを丸読みしてシャッフルする機能。(要、ファイルサイズのメモリ)
+	// A function to read the entire file in memory and shuffle it. (Requires file size memory)
 	bool shuffle_on_memory = false;
-	// packed sfenの変換。plainではsfen(string), 評価値(整数), 指し手(例：7g7f, string)、結果(負け-1、勝ち1、引き分け0)からなる
+	// Conversion of packed sfen. In plain, it consists of sfen(string), evaluation value (integer), move (eg 7g7f, string), result (loss-1, win 1, draw 0)
 	bool use_convert_plain = false;
-	// plain形式の教師をやねうら王のbinに変換する
+	// convert plain format teacher to Yaneura King's bin
 	bool use_convert_bin = false;
 	int ply_minimum = 0;
 	int ply_maximum = 114514;
 	bool interpolate_eval = 0;
-	// pgn-extract形式の教師をやねうら王のbinに変換する
+	// convert teacher in pgn-extract format to Yaneura King's bin
 	bool use_convert_bin_from_pgn_extract = false;
-	// それらのときに書き出すファイル名(デフォルトでは"shuffled_sfen.bin")
+	// File name to write in those cases (default is "shuffled_sfen.bin")
 	string output_file_name = "shuffled_sfen.bin";
 
-	// 教師局面の深い探索での評価値の絶対値が、この値を超えていたらその局面は捨てる。
+	// If the absolute value of the evaluation value in the deep search of the teacher phase exceeds this value, that phase is discarded.
 	int eval_limit = 32000;
 
-	// 評価関数ファイルの保存は終了間際の1回に限定するかのフラグ。
+	// Flag to save the evaluation function file only once near the end.
 	bool save_only_once = false;
 
-	// 教師局面を先読みしている分に関してシャッフルする。(1000万局面単位ぐらいのシャッフル)
-	// 事前にシャッフルされているファイルを渡すならオンにすれば良い。
+	// Shuffle about what you are pre-reading on the teacher aspect. (Shuffle of about 10 million phases)
+	// Turn on if you want to pass a pre-shuffled file.
 	bool no_shuffle = false;
 
 #if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
@@ -2811,15 +2810,15 @@ void learn(Position&, istringstream& is)
 	ELMO_LAMBDA_LIMIT = 32000;
 #endif
 
-	// 割引率。これを0以外にすると、PV終端以外でも勾配を加算する。(そのとき、この割引率を適用する)
+	// Discount rate. If this is set to a value other than 0, the slope will be added even at other than the PV termination. (At that time, apply this discount rate)
 	double discount_rate = 0;
 
-	// if (gamePly < rand(reduction_gameply)) continue;
-	// のようにして、序盤を学習対象から程よく除外するためのオプション
-	// 1にしてあるとrand(1)==0なので、何も除外されない。
+	// if (gamePly <rand(reduction_gameply)) continue;
+	// An option to exclude the early stage from the learning target moderately like
+	// If set to 1, rand(1)==0, so nothing is excluded.
 	int reduction_gameply = 1;
 
-	// KK/KKP/KPP/KPPPを学習させないオプション項目
+	// Optional item that does not let you learn KK/KKP/KPP/KPPP
 	array<bool,4> freeze = {};
 
 #if defined(EVAL_NNUE)
@@ -2835,7 +2834,7 @@ void learn(Position&, istringstream& is)
 
 	string validation_set_file_name;
 
-	// ファイル名が後ろにずらずらと書かれていると仮定している。
+	// Assume the filenames are staggered.
 	while (true)
 	{
 		string option;
@@ -2844,26 +2843,26 @@ void learn(Position&, istringstream& is)
 		if (option == "")
 			break;
 
-		// mini-batchの局面数を指定
+		// specify the number of phases of mini-batch
 		if (option == "bat")
 		{
 			is >> mini_batch_size;
-			mini_batch_size *= 10000; // 単位は万
+			mini_batch_size *= 10000; // Unit is ten thousand
 		}
 
-		// 棋譜が格納されているフォルダを指定して、根こそぎ対象とする。
+		// Specify the folder in which the game record is stored and make it the rooting target.
 		else if (option == "targetdir") is >> target_dir;
 
-		// ループ回数の指定
+		// Specify the number of loops
 		else if (option == "loop")      is >> loop;
 
-		// 棋譜ファイル格納フォルダ(ここから相対pathで棋譜ファイルを取得)
+		// Game file storage folder (get game file with relative path from here)
 		else if (option == "basedir")   is >> base_dir;
 
-		// ミニバッチのサイズ
+		// Mini batch size
 		else if (option == "batchsize") is >> mini_batch_size;
 
-		// 学習率
+		// learning rate
 		else if (option == "eta")        is >> eta1;
 		else if (option == "eta1")       is >> eta1; // alias
 		else if (option == "eta2")       is >> eta2;
@@ -2874,10 +2873,10 @@ void learn(Position&, istringstream& is)
 		else if (option == "use_draw_in_training") is >> use_draw_in_training;
 		else if (option == "use_draw_in_validation") is >> use_draw_in_validation;
 		else if (option == "use_hash_in_training") is >> use_hash_in_training;
-		// 割引率
+		// Discount rate
 		else if (option == "discount_rate") is >> discount_rate;
 
-		// KK/KKP/KPP/KPPPの学習なし。
+		// No learning of KK/KKP/KPP/KPPP.
 		else if (option == "freeze_kk")    is >> freeze[0];
 		else if (option == "freeze_kkp")   is >> freeze[1];
 		else if (option == "freeze_kpp")   is >> freeze[2];
@@ -2899,7 +2898,7 @@ void learn(Position&, istringstream& is)
 #endif
 		else if (option == "reduction_gameply") is >> reduction_gameply;
 
-		// シャッフル関連
+		// shuffle related
 		else if (option == "shuffle")	shuffle_normal = true;
 		else if (option == "buffer_size") is >> buffer_size;
 		else if (option == "shuffleq")	shuffle_quick = true;
@@ -2909,7 +2908,7 @@ void learn(Position&, istringstream& is)
 		else if (option == "eval_limit") is >> eval_limit;
 		else if (option == "save_only_once") save_only_once = true;
 		else if (option == "no_shuffle") no_shuffle = true;
-		
+
 #if defined(EVAL_NNUE)
 		else if (option == "nn_batch_size") is >> nn_batch_size;
 		else if (option == "newbob_decay") is >> newbob_decay;
@@ -2920,13 +2919,13 @@ void learn(Position&, istringstream& is)
 		else if (option == "loss_output_interval") is >> loss_output_interval;
 		else if (option == "mirror_percentage") is >> mirror_percentage;
 		else if (option == "validation_set_file_name") is >> validation_set_file_name;
-		
-		// 雑巾のconvert関連
+
+		// Rabbit convert related
 		else if (option == "convert_plain") use_convert_plain = true;
 		else if (option == "convert_bin") use_convert_bin = true;
 		else if (option == "interpolate_eval") is >> interpolate_eval;
-		else if (option == "convert_bin_from_pgn-extract") use_convert_bin_from_pgn_extract = true;
-		// さもなくば、それはファイル名である。
+		else if (option == "convert_bin_from_pgn-extract") use_convert_bin_from_pgn_extract = true;		
+		// Otherwise, it's a filename.
 		else
 			filenames.push_back(option);
 	}
@@ -2935,25 +2934,25 @@ void learn(Position&, istringstream& is)
 
 	cout << "learn command , ";
 
-	// OpenMP無効なら警告を出すように。
+	// Issue a warning if OpenMP is disabled.
 #if !defined(_OPENMP)
 	cout << "Warning! OpenMP disabled." << endl;
 #endif
 
-	// 学習棋譜ファイルの表示
+	// Display learning game file
 	if (target_dir != "")
 	{
 		string kif_base_dir = Path::Combine(base_dir, target_dir);
 
-		// このフォルダを根こそぎ取る。base_dir相対にしておく。
+		// Remove this folder. Keep it relative to base_dir.
 #if defined(_MSC_VER)
-		// std::tr2を使用するとwaring C4996が出るので抑制。
-		// ※　std::tr2は、std:c++14 の下では既定で非推奨の警告を出し、/std:c++17 では既定で削除された。
+		// If you use std::tr2, warning C4996 will appear, so suppress it.
+		// * std::tr2 issued a deprecation warning by default under std:c++14, and was deleted by default in /std:c++17.
 		#pragma warning(push)
 		#pragma warning(disable:4996)
 
 		namespace sys = std::filesystem;
-		sys::path p(kif_base_dir); // 列挙の起点
+		sys::path p(kif_base_dir); // Origin of enumeration
 		std::for_each(sys::directory_iterator(p), sys::directory_iterator(),
 			[&](const sys::path& p) {
 			if (sys::is_regular_file(p))
@@ -2969,17 +2968,17 @@ void learn(Position&, istringstream& is)
 			return std::equal(ending.rbegin(), ending.rend(), value.rbegin());
 		};
 
-		// 仕方ないのでdirent.hを用いて読み込む。
-		DIR *dp;       // ディレクトリへのポインタ
-		dirent* entry; // readdir() で返されるエントリーポイント
+		// It can't be helped, so read it using dirent.h.
+		DIR *dp; // pointer to directory
+		dirent* entry; // entry point returned by readdir()
 
 		dp = opendir(kif_base_dir.c_str());
 		if (dp != NULL)
 		{
 			do {
 				entry = readdir(dp);
-				// ".bin"で終わるファイルのみを列挙
-				// →　連番でファイル生成するときにこの制約ちょっと嫌だな…。
+				// Only list files ending with ".bin"
+				// →I hate this restriction when generating files with serial numbers...
 				if (entry != NULL  && ends_with(entry->d_name, ".bin")  )
 				{
 					//cout << entry->d_name << endl;
@@ -3003,7 +3002,7 @@ void learn(Position&, istringstream& is)
 	cout << "base dir        : " << base_dir   << endl;
 	cout << "target dir      : " << target_dir << endl;
 
-	// シャッフルモード
+	// shuffle mode
 	if (shuffle_normal)
 	{
 		cout << "buffer_size     : " << buffer_size << endl;
@@ -3025,11 +3024,11 @@ void learn(Position&, istringstream& is)
 	}
 	//if (use_convert_plain)
 	//{
-	//  	is_ready(true);
-	//	cout << "convert_plain.." << endl;
-	//	convert_plain(filenames,output_file_name);
-	//	return;
-	//	
+	// 		is_ready(true);
+	//  cout << "convert_plain.." << endl;
+	//  convert_plain(filenames,output_file_name);
+	//  return;
+	//
 	//}
 	if (use_convert_bin)
 	{
@@ -3052,9 +3051,9 @@ void learn(Position&, istringstream& is)
 	cout << "save_only_once    : " << (save_only_once ? "true" : "false") << endl;
 	cout << "no_shuffle        : " << (no_shuffle ? "true" : "false") << endl;
 
-	// ループ回数分だけファイル名を突っ込む。
+	// Insert the file name for the number of loops.
 	for (int i = 0; i < loop; ++i)
-		// sfen reader、逆順で読むからここでreverseしておく。すまんな。
+		// sfen reader, I'll read it in reverse order so I'll reverse it here. I'm sorry.
 		for (auto it = filenames.rbegin(); it != filenames.rend(); ++it)
 			sr.filenames.push_back(Path::Combine(base_dir, *it));
 
@@ -3079,7 +3078,7 @@ void learn(Position&, istringstream& is)
 #endif
 	cout << "discount rate     : " << discount_rate     << endl;
 
-	// reduction_gameplyに0を設定されるとrand(0)が0除算になってしまうので1に補正。
+	// If reduction_gameply is set to 0, rand(0) will be divided by 0, so correct it to 1.
 	reduction_gameply = max(reduction_gameply, 1);
 	cout << "reduction_gameply : " << reduction_gameply << endl;
 
@@ -3101,18 +3100,18 @@ void learn(Position&, istringstream& is)
 #endif
 
 	// -----------------------------------
-	//            各種初期化
+	// various initialization
 	// -----------------------------------
 
 	cout << "init.." << endl;
 
-	// 評価関数パラメーターの読み込み
+	// Read evaluation function parameters
 	is_ready(true);
 
 #if !defined(EVAL_NNUE)
 	cout << "init_grad.." << endl;
 
-	// 評価関数パラメーターの勾配配列の初期化
+	// Initialize gradient array of merit function parameters
 	Eval::init_grad(eta1,eta1_epoch,eta2,eta2_epoch,eta3);
 #else
 	cout << "init_training.." << endl;
@@ -3125,7 +3124,7 @@ void learn(Position&, istringstream& is)
 #endif
 
 #if 0
-	// 平手の初期局面に対して1.0の勾配を与えてみるテスト。
+	// A test to give a gradient of 1.0 to the initial stage of Hirate.
 	pos.set_hirate();
 	cout << Eval::evaluate(pos) << endl;
 	//Eval::print_eval_stat(pos);
@@ -3138,7 +3137,7 @@ void learn(Position&, istringstream& is)
 
 	cout << "init done." << endl;
 
-	// その他、オプション設定を反映させる。
+	// Reflect other option settings.
 	learn_think.discount_rate = discount_rate;
 	learn_think.eval_limit = eval_limit;
 	learn_think.save_only_once = save_only_once;
@@ -3154,20 +3153,20 @@ void learn(Position&, istringstream& is)
 	learn_think.loss_output_interval = loss_output_interval;
 	learn_think.mirror_percentage = mirror_percentage;
 
-	// 局面ファイルをバックグラウンドで読み込むスレッドを起動
-	// (これを開始しないとmseの計算が出来ない。)
+	// Start a thread that loads the phase file in the background
+	// (If this is not started, mse cannot be calculated.)
 	learn_think.start_file_read_worker();
 
 	learn_think.mini_batch_size = mini_batch_size;
 
 	if (validation_set_file_name.empty()) {
-		// mse計算用にデータ1万件ほど取得しておく。
+	// Get about 10,000 data for mse calculation.
 		sr.read_for_mse();
 	} else {
 		sr.read_validation_set(validation_set_file_name, eval_limit);
 	}
 
-	// この時点で一度rmseを計算(0 sfenのタイミング)
+	// Calculate rmse once at this point (timing of 0 sfen)
 	// sr.calc_rmse();
 #if defined(EVAL_NNUE)
 	if (newbob_decay != 1.0) {
@@ -3180,17 +3179,17 @@ void learn(Position&, istringstream& is)
 #endif
 
 	// -----------------------------------
-	//   評価関数パラメーターの学習の開始
+	// start learning evaluation function parameters
 	// -----------------------------------
 
-	// 学習開始。
+	// Start learning.
 	learn_think.go_think();
 
-	// 最後に一度保存。
+	// Save once at the end.
 	learn_think.save(true);
 
 #if defined(USE_GLOBAL_OPTIONS)
-	// GlobalOptionsの復元。
+	// Restore Global Options.
 	GlobalOptions = oldGlobalOptions;
 #endif
 }
@@ -3203,4 +3202,4 @@ void learn(Position&, istringstream& is)
 #endif
 
 
-#endif // EVAL_LEARN
+#endif // EVAL_LEARN
\ No newline at end of file
diff --git a/src/learn/learning_tools.cpp b/src/learn/learning_tools.cpp
index d3a7858f..e3bd6f68 100644
--- a/src/learn/learning_tools.cpp
+++ b/src/learn/learning_tools.cpp
@@ -23,15 +23,15 @@ namespace EvalLearningTools
 
 	std::vector<bool> min_index_flag;
 
-	// --- 個別のテーブルごとの初期化
+	// --- initialization for each individual table
 
 	void init_min_index_flag()
 	{
-		// mir_piece、inv_pieceの初期化が終わっていなければならない。
+		// Initialization of mir_piece and inv_piece must be completed.
 		assert(mir_piece(Eval::f_pawn) == Eval::e_pawn);
 
-		// 次元下げ用フラグ配列の初期化
-		// KPPPに関しては関与しない。
+		// Initialize the flag array for dimension reduction
+		// Not involved in KPPP.
 
 		KK g_kk;
 		g_kk.set(SQUARE_NB, Eval::fe_end, 0);
@@ -46,9 +46,9 @@ namespace EvalLearningTools
 #pragma omp parallel
 		{
 #if defined(_OPENMP)
-			// Windows環境下でCPUが２つあるときに、論理64コアまでしか使用されないのを防ぐために
-			// ここで明示的にCPUに割り当てる
-			int thread_index = omp_get_thread_num();    // 自分のthread numberを取得
+			// To prevent the logical 64 cores from being used when there are two CPUs under Windows
+			// explicitly assign to CPU here
+			int thread_index = omp_get_thread_num(); // get your thread number
 			WinProcGroup::bindThisThread(thread_index);
 #endif
 
@@ -56,20 +56,20 @@ namespace EvalLearningTools
 
 			for (int64_t index_ = 0; index_ < (int64_t)size; ++index_)
 			{
-				// OpenMPの制約からループ変数は符号型でないといけないらしいのだが、
-				// さすがに使いにくい。
+				// It seems that the loop variable must be a sign type due to OpenMP restrictions, but
+				// It's really difficult to use.
 				uint64_t index = (uint64_t)index_;
 
 				if (g_kk.is_ok(index))
 				{
-					// indexからの変換と逆変換によって元のindexに戻ることを確認しておく。
-					// 起動時に1回しか実行しない処理なのでassertで書いておく。
+					// Make sure that the original index will be restored by conversion from index and reverse conversion.
+					// It is a process that is executed only once at startup, so write it in assert.
 					assert(g_kk.fromIndex(index).toIndex() == index);
 
 					KK a[KK_LOWER_COUNT];
 					g_kk.fromIndex(index).toLowerDimensions(a);
 
-					// 次元下げの1つ目の要素が元のindexと同一であることを確認しておく。
+					// Make sure that the first element of dimension reduction is the same as the original index.
 					assert(a[0].toIndex() == index);
 
 					uint64_t min_index = UINT64_MAX;
@@ -118,9 +118,9 @@ namespace EvalLearningTools
 	void learning_tools_unit_test_kpp()
 	{
 
-		// KPPの三角配列化にバグがないかテストする
-		// k-p0-p1のすべての組み合わせがきちんとKPPの扱う対象になっていかと、そのときの次元下げが
-		// 正しいかを判定する。
+		// test KPP triangulation for bugs
+		// All combinations of k-p0-p1 are properly handled by KPP, and the dimension reduction at that time is
+		// Determine if it is correct.
 
 		KK g_kk;
 		g_kk.set(SQUARE_NB, Eval::fe_end, 0);
@@ -159,24 +159,24 @@ namespace EvalLearningTools
 					f[index - g_kpp.min_index()] = f[index2-g_kpp.min_index()] = true;
 				}
 
-		// 抜けてるindexがなかったかの確認。
+		// Check if there is no missing index.
 		for(size_t index = 0 ; index < f.size(); index++)
 			if (!f[index])
 			{
-				std::cout << index << g_kpp.fromIndex(index + g_kpp.min_index()) <<  std::endl;
+				std::cout << index << g_kpp.fromIndex(index + g_kpp.min_index()) << std::endl;
 			}
 	}
 
 	void learning_tools_unit_test_kppp()
 	{
-		// KPPPの計算に抜けがないかをテストする
+		// Test for missing KPPP calculations
 
 		KPPP g_kppp;
 		g_kppp.set(15, Eval::fe_end,0);
 		uint64_t min_index = g_kppp.min_index();
 		uint64_t max_index = g_kppp.max_index();
 
-		// 最後の要素の確認。
+		// Confirm last element.
 		//KPPP x = KPPP::fromIndex(max_index-1);
 		//std::cout << x << std::endl;
 
@@ -208,10 +208,10 @@ namespace EvalLearningTools
 	void learning_tools_unit_test_kkpp()
 	{
 		KKPP g_kkpp;
-		g_kkpp.set(SQUARE_NB, 10000 , 0);
+		g_kkpp.set(SQUARE_NB, 10000, 0);
 		uint64_t n = 0;
 		for (int k = 0; k<SQUARE_NB; ++k)
-			for (int i = 0; i<10000; ++i) // 試しに、かなり大きなfe_endを想定して10000で回してみる。
+			for (int i = 0; i<10000; ++i) // As a test, assuming a large fe_end, try turning at 10000.
 				for (int j = 0; j < i; ++j)
 				{
 					auto kkpp = g_kkpp.fromKKPP(k, (BonaPiece)i, (BonaPiece)j);
@@ -222,27 +222,27 @@ namespace EvalLearningTools
 				}
 	}
 
-	// このEvalLearningTools全体の初期化
+	// Initialize this entire EvalLearningTools
 	void init()
 	{
-		// 初期化は、起動後1回限りで良いのでそのためのフラグ。
+		// Initialization is required only once after startup, so a flag for that.
 		static bool first = true;
 
 		if (first)
 		{
 			std::cout << "EvalLearningTools init..";
 
-			// mir_piece()とinv_piece()を利用可能にする。
-			// このあとmin_index_flagの初期化を行なうが、そこが
-			// これに依存しているので、こちらを先に行なう必要がある。
+			// Make mir_piece() and inv_piece() available.
+			// After this, the min_index_flag is initialized, but
+			// It depends on this, so you need to do this first.
 			init_mir_inv_tables();
 
 			//learning_tools_unit_test_kpp();
 			//learning_tools_unit_test_kppp();
 			//learning_tools_unit_test_kkpp();
 
-			// UnitTestを実行するの最後でも良いのだが、init_min_index_flag()にとても時間がかかるので
-			// デバッグ時はこのタイミングで行いたい。
+			// It may be the last time to execute UnitTest, but since init_min_index_flag() takes a long time,
+			// I want to do this at the time of debugging.
 
 			init_min_index_flag();
 
@@ -253,4 +253,4 @@ namespace EvalLearningTools
 	}
 }
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/learn/learning_tools.h b/src/learn/learning_tools.h
index 2bcd3f35..f95ea2d9 100644
--- a/src/learn/learning_tools.h
+++ b/src/learn/learning_tools.h
@@ -1,7 +1,7 @@
 ﻿#ifndef __LEARN_WEIGHT_H__
 #define __LEARN_WEIGHT_H__
 
-// 評価関数の機械学習のときに用いる重み配列などに関する機械学習用ツール類一式
+// A set of machine learning tools related to the weight array used for machine learning of evaluation functions
 
 #include "learn.h"
 #if defined (EVAL_LEARN)
@@ -18,28 +18,28 @@
 namespace EvalLearningTools
 {
 	// -------------------------------------------------
-	//                     初期化
+	//                  Initialization
 	// -------------------------------------------------
 
-	// このEvalLearningTools名前空間にあるテーブル類を初期化する。
-	// 学習の開始までに必ず一度呼び出すこと。
-	// この関数のなかで、init_mir_inv_tables()も呼び出している。
-	// (この関数を呼ぶときは、init_mir_inv_tables()を呼び出す必要はない。)
+	// Initialize the tables in this EvalLearningTools namespace.
+	// Be sure to call once before learning starts.
+	// In this function, we also call init_mir_inv_tables().
+	// (It is not necessary to call init_mir_inv_tables() when calling this function.)
 	void init();
 
 	// -------------------------------------------------
 	//                     flags
 	// -------------------------------------------------
 
-	// 次元下げしたときに、そのなかの一番小さなindexになることが
-	// わかっているindexに対してtrueとなっているフラグ配列。
-	// この配列もinit()によって初期化される。
-	// KPPPに関しては、関与しない。
-	// ゆえに、この配列の有効なindexの範囲は、KK::min_index()～KPP::max_index()まで。
+	// When the dimension is lowered, it may become the smallest index among them
+	// A flag array that is true for the known index.
+	// This array is also initialized by init().
+	// KPPP is not involved.
+	// Therefore, the valid index range of this array is from KK::min_index() to KPP::max_index().
 	extern std::vector<bool> min_index_flag;
 
 	// -------------------------------------------------
-	//       勾配等を格納している学習用の配列
+	//   Array for learning that stores gradients etc.
 	// -------------------------------------------------
 
 #if defined(_MSC_VER)
@@ -49,21 +49,21 @@ namespace EvalLearningTools
 #endif
 	struct Weight
 	{
-		// mini-batch 1回分の勾配の累積値
+		// cumulative value of one mini-batch gradient
 		LearnFloatType g = LearnFloatType(0);
 
-		// ADA_GRAD_UPDATEのとき。LearnFloatType == floatとして、
-		// 合計 4*2 + 4*2 + 1*2 = 18 bytes
-		// 1GBの評価関数パラメーターに対してその4.5倍のサイズのWeight配列が確保できれば良い。
-		// ただし、構造体のアライメントが4バイト単位になっているとsizeof(Weight)==20なコードが生成されるので
-		// pragma pack(2)を指定しておく。
+		// When ADA_GRAD_UPDATE. LearnFloatType == float,
+		// total 4*2 + 4*2 + 1*2 = 18 bytes
+		// It suffices to secure a Weight array that is 4.5 times the size of the evaluation function parameter of 1GB.
+		// However, sizeof(Weight)==20 code is generated if the structure alignment is in 4-byte units, so
+		// Specify pragma pack(2).
 
-		// SGD_UPDATE の場合、この構造体はさらに10バイト減って、8バイトで済む。
+		// For SGD_UPDATE, this structure is reduced by 10 bytes to 8 bytes.
 
-		// AdaGradなどの学習率η(eta)。
-		// updateFV()が呼び出されるまでにeta1,2,3,eta1_epoch,eta2_epochは設定されているものとする。
-		// update_weights()のepochが、eta1_epochまでeta1から徐々にeta2に変化する。
-		// eta2_epoch以降は、eta2から徐々にeta3に変化する。
+		// Learning rate η(eta) such as AdaGrad.
+		// It is assumed that eta1,2,3,eta1_epoch,eta2_epoch have been set by the time updateFV() is called.
+		// The epoch of update_weights() gradually changes from eta1 to eta2 until eta1_epoch.
+		// After eta2_epoch, gradually change from eta2 to eta3.
 		static double eta;
 		static double eta1;
 		static double eta2;
@@ -71,7 +71,7 @@ namespace EvalLearningTools
 		static uint64_t eta1_epoch;
 		static uint64_t eta2_epoch;
 
-		// etaの一括初期化。0が渡された場合、デフォルト値が設定される。
+		// Batch initialization of eta. If 0 is passed, the default value will be set.
 		static void init_eta(double eta1, double eta2, double eta3, uint64_t eta1_epoch, uint64_t eta2_epoch)
 		{
 			Weight::eta1 = (eta1 != 0) ? eta1 : 30.0;
@@ -81,15 +81,15 @@ namespace EvalLearningTools
 			Weight::eta2_epoch = (eta2_epoch != 0) ? eta2_epoch : 0;
 		}
 
-		// epochに応じたetaを設定してやる。
+		// Set eta according to epoch.
 		static void calc_eta(uint64_t epoch)
 		{
-			if (Weight::eta1_epoch == 0) // eta2適用除外
+			if (Weight::eta1_epoch == 0) // Exclude eta2
 				Weight::eta = Weight::eta1;
 			else if (epoch < Weight::eta1_epoch)
-				// 按分する
+				// apportion
 				Weight::eta = Weight::eta1 + (Weight::eta2 - Weight::eta1) * epoch / Weight::eta1_epoch;
-			else if (Weight::eta2_epoch == 0) // eta3適用除外
+			else if (Weight::eta2_epoch == 0) // Exclude eta3
 				Weight::eta = Weight::eta2;
 			else if (epoch < Weight::eta2_epoch)
 				Weight::eta = Weight::eta2 + (Weight::eta3 - Weight::eta2) * (epoch - Weight::eta1_epoch) / (Weight::eta2_epoch - Weight::eta1_epoch);
@@ -101,26 +101,26 @@ namespace EvalLearningTools
 
 #if defined (ADA_GRAD_UPDATE)
 
-		// floatで正確に計算できる最大値はINT16_MAX*256-1なのでそれより
-		// 小さい値をマーカーにしておく。
+		// Since the maximum value that can be accurately calculated with float is INT16_MAX*256-1
+		// Keep the small value as a marker.
 		const LearnFloatType V0_NOT_INIT = (INT16_MAX * 128);
 
-		// vを内部的に保持しているもの。以前の実装ではメモリの節約のために固定小数で小数部だけを保持していたが
-		// 精度的に怪しいし、見通しが悪くなるので廃止した。
+		// What holds v internally. The previous implementation kept a fixed decimal with only a fractional part to save memory,
+		// Since it is doubtful in accuracy and the visibility is bad, it was abolished.
 		LearnFloatType v0 = LearnFloatType(V0_NOT_INIT);
 
-		// AdaGradのg2
+		// AdaGrad g2
 		LearnFloatType g2 = LearnFloatType(0);
 
-		// AdaGradでupdateする
-		// この関数を実行しているときにgの値やメンバーが書き変わらないことは
-		// 呼び出し側で保証されている。atomic演算である必要はない。
-		// kはetaに掛かる係数。普通は1.0で良い。手番項に対してetaを下げたいときにここを1/8.0などとする。
+		// update with AdaGrad
+		// When executing this function, the value of g and the member do not change
+		// Guaranteed by the caller. It does not have to be an atomic operation.
+		// k is a coefficient for eta. 1.0 is usually sufficient. If you want to lower eta for your turn item, set this to 1/8.0 etc.
 		template <typename T>
 		void updateFV(T& v,double k)
 		{
-			// AdaGradの更新式
-			//   勾配ベクトルをg、更新したいベクトルをv、η(eta)は定数として、
+			// AdaGrad update formula
+			// Gradient vector is g, vector to be updated is v, η(eta) is a constant,
 			//     g2 = g2 + g^2
 			//     v = v - ηg/sqrt(g2)
 
@@ -131,48 +131,48 @@ namespace EvalLearningTools
 
 			g2 += g * g;
 
-			// v0がV0_NOT_INITであるなら、値がKK/KKP/KPP配列の値で初期化されていないということだから、
-			// この場合、vの値を引数で渡されたものから読み込む。
+			// If v0 is V0_NOT_INIT, it means that the value is not initialized with the value of KK/KKP/KPP array,
+			// In this case, read the value of v from the one passed in the argument.
 			double V = (v0 == V0_NOT_INIT) ? v : v0;
 
 			V -= k * eta * (double)g / sqrt((double)g2 + epsilon);
 
-			// Vの値を型の範囲に収まるように制限する。
-			// ちなみに、windows.hがmin,maxマクロを定義してしまうのでそれを回避するために、
-			// ここでは括弧で括ることで関数形式マクロとして扱われないようにしている。
+			// Limit the value of V to be within the range of types.
+			// By the way, windows.h defines the min and max macros, so to avoid it,
+			// Here, it is enclosed in parentheses so that it is not treated as a function-like macro.
 			V = (std::min)((double)(std::numeric_limits<T>::max)() , V);
 			V = (std::max)((double)(std::numeric_limits<T>::min)() , V);
 
 			v0 = (LearnFloatType)V;
 			v = (T)round(V);
 
-			// この要素に関するmini-batchの1回分の更新が終わったのでgをクリア
+			// Clear g because one update of mini-batch for this element is over
 			// g[i] = 0;
-			// →次元下げの問題があるので、これは呼び出し側で行なうことにする。
+			// → There is a problem of dimension reduction, so this will be done by the caller.
 		}
 
 #elif defined(SGD_UPDATE)
 
-		// 勾配の符号だけ見るSGDでupdateする
-		// この関数を実行しているときにgの値やメンバーが書き変わらないことは
-		// 呼び出し側で保証されている。atomic演算である必要はない。
+		// See only the sign of the gradient Update with SGD
+		// When executing this function, the value of g and the member do not change
+		// Guaranteed by the caller. It does not have to be an atomic operation.
 		template <typename T>
 		void updateFV(T & v , double k)
 		{
 			if (g == 0)
 				return;
 
-			// gの符号だけ見てupdateする。
-			// g < 0 なら vを少し足す。
-			// g > 0 なら vを少し引く。
+			// See only the sign of g and update.
+			// If g <0, add v a little.
+			// If g> 0, subtract v slightly.
 
-			// 整数しか足さないので小数部不要。
+			// Since we only add integers, no decimal part is required.
 
-			// 0～5ぐらいずつ動かすのがよさげ。
-			// ガウス分布っぽいほうが良いので5bitの乱数を発生させて(それぞれのbitは1/2の確率で1である)、
-			// それをpop_count()する。このとき、二項分布になっている。
+			// It's a good idea to move around 0-5.
+			// It is better to have a Gaussian distribution, so generate a 5-bit random number (each bit has a 1/2 probability of 1),
+			// Pop_count() it. At this time, it has a binomial distribution.
 			//int16_t diff = (int16_t)POPCNT32((u32)prng.rand(31));
-			// →　これ80スレッドでやったら、このAsyncPRNG::rand()がlockするのでslow downした。この実装良くない。
+			// → If I do this with 80 threads, this AsyncPRNG::rand() locks, so I slowed down. This implementation is not good.
 			int16_t diff = 1;
 
 			double V = v;
@@ -189,10 +189,10 @@ namespace EvalLearningTools
 
 #endif
 
-		// gradの設定
+		// grad setting
 		template <typename T> void set_grad(const T& g_) { g = g_; }
 
-		// gradの加算
+		// Add grad
 		template <typename T> void add_grad(const T& g_) { g += g_; }
 
 		LearnFloatType get_grad() const { return g; }
@@ -203,13 +203,13 @@ namespace EvalLearningTools
 #pragma pack(0)
 #endif
 
-	// 手番つきのweight配列
-	// 透過的に扱えるようにするために、Weightと同じメンバを持たせておいてやる。
+	// Turned weight array
+	// In order to be able to handle it transparently, let's have the same member as Weight.
 	struct Weight2
 	{
 		Weight w[2];
 
-		// 手番評価、etaを1/8に評価しておく。
+		//Evaluate your turn, eta 1/8.
 		template <typename T> void updateFV(std::array<T, 2>& v) { w[0].updateFV(v[0] , 1.0); w[1].updateFV(v[1],1.0/8.0); }
 
 		template <typename T> void set_grad(const std::array<T, 2>& g) { for (int i = 0; i<2; ++i) w[i].set_grad(g[i]); }
@@ -218,42 +218,42 @@ namespace EvalLearningTools
 		std::array<LearnFloatType, 2> get_grad() const { return std::array<LearnFloatType, 2>{w[0].get_grad(), w[1].get_grad()}; }
 	};
 
-	// -------------------------------------------------
-	// Weight配列を直列化したときのindexを計算したりするヘルパー。
-	// -------------------------------------------------
+	// ------------------------------------------------ -
+	// A helper that calculates the index when the Weight array is serialized.
+	// ------------------------------------------------ -
 
-	// KK,KKP,KPP,KKPPの基底クラス
-	// これらのクラスの使い方
-	// 
-	// 1. まずset()で初期化する。例) KK g_kk; g_kk.set(SQUARE_NB,fe_end,0);
-	// 2. 次にfromIndex(),fromKK()などでインスタンスを生成
-	// 3. king(),piece0(),piece1()などのプロパティを用いてアクセス。
-	// 
-	// この説明だけではわかりにくいかも知れないが、学習部のinit_grad(),add_grad(),update_weights()などを見れば
-	// 必要性を含めて理解できると思う。
+	// Base class for KK,KKP,KPP,KKPP
+	// How to use these classes
 	//
-	// 注意 : この派生クラスでは次元下げのために上記のinv_piece/mir_pieceを間接的に参照することがあるので、
-	// 最初にEvalLearningTools::init()かinit_mir_inv_tables()を呼び出して初期化すること。
+	// 1. Initialize with set() first. Example) KK g_kk; g_kk.set(SQUARE_NB,fe_end,0);
+	// 2. Next create an instance with fromIndex(), fromKK(), etc.
+	// 3. Access using properties such as king(), piece0(), piece1().
 	//
-	// 備考) 派生クラス側でoverrideすべきではない関数名には/*final*/と書いてある。
-	//       派生クラス側でoverrideすべき関数は "= 0"をつけて、純粋仮想関数にしてある。
-	//       派生クラス側でoverrideしてもしなくても良い関数はvirtualだけつけてある。
+	// It may be difficult to understand just by this explanation, but if you look at init_grad(), add_grad(), update_weights() etc. in the learning part
+	// I think you can understand it including the necessity.
+	//
+	// Note: this derived class may indirectly reference the above inv_piece/mir_piece for dimension reduction, so
+	// Initialize by calling EvalLearningTools::init() or init_mir_inv_tables() first.
+	//
+	// Remarks) /*final*/ is written for the function name that should not be overridden on the derived class side.
+	// The function that should be overridden on the derived class side is a pure virtual function with "= 0".
+	// Only virtual functions are added to the derived class that may or may not be overridden.
 	//
 	struct SerializerBase
 	{
 
-		// KK,KKP,KPP配列を直列化するときの通し番号の最小値、最大値+1。
+		// Minimum value and maximum value of serial number +1 when serializing KK, KKP, KPP arrays.
 		/*final*/ uint64_t min_index() const { return min_index_; }
 		/*final*/ uint64_t max_index() const { return min_index() + max_raw_index_; }
 
-		// max_index() - min_index()の値。
-		// 派生クラス側でmax_king_sq_,fe_end_などから、値を計算して返すようにする。
+		// max_index() - min_index() the value of.
+		// Calculate the value from max_king_sq_,fe_end_ etc. on the derived class side and return it.
 		virtual uint64_t size() const = 0;
 
-		// 与えられたindexが、min_index()以上、max_index()未満にあるかを判定する。
+		// Determine if the given index is more than min_index() and less than max_index().
 		/*final*/ bool is_ok(uint64_t index) { return min_index() <= index && index < max_index(); }
 
-		// 必ずこのset()を呼び出して使う。さもなくば、派生クラス側のfromKK()/fromIndex()などでインスタンスを構築して使う。
+		// Make sure to call this set(). Otherwise, construct an instance using fromKK()/fromIndex() etc. on the derived class side.
 		virtual void set(int max_king_sq, uint64_t fe_end, uint64_t min_index)
 		{
 			max_king_sq_ = max_king_sq;
@@ -262,26 +262,26 @@ namespace EvalLearningTools
 			max_raw_index_ = size();
 		}
 
-		// 現在のメンバの値に基いて、直列化されたときのindexを取得する。
+		// Get the index when serialized, based on the value of the current member.
 		/*final*/ uint64_t toIndex() const {
 			return min_index() + toRawIndex();
 		}
 
-		// 直列化するときのindexを返す。(min_index()の値は加算する前のもの)
+		// Returns the index when serializing. (The value of min_index() is before addition)
 		virtual uint64_t toRawIndex() const = 0;
 
 	protected:
-		// このクラスの返すmin_index()の値
+		// The value of min_index() returned by this class
 		uint64_t min_index_;
 
-		// このクラスの返すmax_index()の値 = min_index() + max_raw_index_
-		// この変数は派生クラスのsize()で計算されたもの。
+		// The value of max_index() returned by this class = min_index() + max_raw_index_
+		// This variable is calculated by size() of the derived class.
 		uint64_t max_raw_index_;
 
-		// サポートする玉の升の数(通常SQUARE_NB)
+		// The number of balls to support (normally SQUARE_NB)
 		int max_king_sq_;
 
-		// サポートするBonaPieceの最大値
+		// Maximum BonaPiece value supported
 		uint64_t fe_end_;
 
 	};
@@ -295,10 +295,10 @@ namespace EvalLearningTools
 
 		virtual uint64_t size() const { return max_king_sq_ * max_king_sq_; }
 
-		// index(通し番号)からKKのオブジェクトを生成するbuilder
+		// builder that creates KK object from index (serial number)
 		KK fromIndex(uint64_t index) const { assert(index >= min_index()); return fromRawIndex(index - min_index()); }
 
-		// raw_index(通し番号ではなく0から始まる番号)からKKのオブジェクトを生成するbuilder
+		// builder that creates KK object from raw_index (number starting from 0, not serial number)
 		KK fromRawIndex(uint64_t raw_index) const
 		{
 			int king1 = (int)(raw_index % SQUARE_NB);
@@ -309,18 +309,18 @@ namespace EvalLearningTools
 		}
 		KK fromKK(Square king0, Square king1 , bool inverse) const
 		{
-			// kkという変数名はEval::kk配列などで使っているので別の名前にする必要がある。(以下、KKP,KPPクラスなどでも同様)
+			// The variable name kk is used in the Eval::kk array etc., so it needs to be different. (The same applies to KKP, KPP classes, etc.)
 			KK my_kk(king0, king1, inverse);
 			my_kk.set(max_king_sq_, fe_end_, min_index());
 			return my_kk;
 		}
 		KK fromKK(Square king0, Square king1) const { return fromKK(king0, king1, false); }
 
-		// fromIndex()を用いてこのオブジェクトを構築したときに、以下のアクセッサで情報が得られる。
+		// When you construct this object using fromIndex(), you can get information with the following accessors.
 		Square king0() const { return king0_; }
 		Square king1() const { return king1_; }
 
-// 次元下げの数
+// number of dimension reductions
 #if defined(USE_KK_INVERSE_WRITE)
 	#define KK_LOWER_COUNT 4
 #elif defined(USE_KK_MIRROR_WRITE)
@@ -330,14 +330,14 @@ namespace EvalLearningTools
 #endif
 
 #if defined(USE_KK_INVERSE_WRITE) && !defined(USE_KK_MIRROR_WRITE) 
-		// USE_KK_INVERSE_WRITEわ使うならUSE_KK_MIRROR_WRITEも定義して欲しい。
+		// USE_KK_INVERSE_WRITE If you use it, please also define USE_KK_MIRROR_WRITE.
 		static_assert(false, "define also USE_KK_MIRROR_WRITE!");
 #endif
 
-		// 低次元の配列のindexを得る。
-		// USE_KK_INVERSE_WRITEが有効なときは、それらをinverseしたものが[2],[3]に入る。
-		// この次元下げに関して、gradの符号は反転させないといけないので注意すること。
-		// is_inverse()で判定できるのでこれを利用すると良い。
+		// Get the index of the low-dimensional array.
+		// When USE_KK_INVERSE_WRITE is enabled, the inverse of them will be in [2] and [3].
+		// Note that the sign of grad must be reversed for this dimension reduction.
+		// You can use is_inverse() because it can be determined.
 		void toLowerDimensions(/*out*/KK kk_[KK_LOWER_COUNT]) const {
 			kk_[0] = fromKK(king0_, king1_,false);
 #if defined(USE_KK_MIRROR_WRITE)
@@ -349,24 +349,24 @@ namespace EvalLearningTools
 #endif
 		}
 
-		// このクラスのmin_index()の値を0として数えたときのindexを取得する。
+		// Get the index when counting the value of min_index() of this class as 0.
 		virtual uint64_t toRawIndex() const {
 			return (uint64_t)king0_ * (uint64_t)max_king_sq_ + (uint64_t)king1_;
 		}
 
-		// toLowerDimensionsで次元下げしたものがinverseしたものであるかを返す。
+		// Returns whether or not the dimension lowered with toLowerDimensions is inverse.
 		bool is_inverse() const {
 			return inverse_sign;
 		}
 
-		// is_inverse() == trueのときに、gradの手番ではないほうの符号を反転させて返す。
+		// When is_inverse() == true, reverse the sign that is not grad's turn and return it.
 		template <typename T>
 		std::array<T, 2> apply_inverse_sign(const std::array<T, 2>& rhs)
 		{
 			return !is_inverse() ? rhs : std::array<T, 2>{-rhs[0], rhs[1]};
 		}
 
-		// 比較演算子
+		// comparison operator
 		bool operator==(const KK& rhs) { return king0() == rhs.king0() && king1() == rhs.king1(); }
 		bool operator!=(const KK& rhs) { return !(*this == rhs); }
 
@@ -375,14 +375,14 @@ namespace EvalLearningTools
 		bool inverse_sign;
 	};
 
-	// デバッグ用出力。
+	// Output for debugging.
 	static std::ostream& operator<<(std::ostream& os, KK rhs)
 	{
 		os << "KK(" << rhs.king0() << "," << rhs.king1() << ")";
 		return os;
 	}
 
-	// KKと同じく。KKP用。
+		// Same as KK. For KKP.
 	struct KKP : public SerializerBase
 	{
 	protected:
@@ -393,10 +393,10 @@ namespace EvalLearningTools
 
 		virtual uint64_t size() const { return (uint64_t)max_king_sq_*(uint64_t)max_king_sq_*(uint64_t)fe_end_; }
 
-		// index(通し番号)からKKPのオブジェクトを生成するbuilder
+		// builder that creates KKP object from index (serial number)
 		KKP fromIndex(uint64_t index) const { assert(index >= min_index()); return fromRawIndex(index - min_index()); }
 
-		// raw_index(通し番号ではなく0から始まる番号)からKKPのオブジェクトを生成するbuilder
+		// A builder that creates a KKP object from raw_index (a number that starts from 0, not a serial number)
 		KKP fromRawIndex(uint64_t raw_index) const
 		{
 			int piece = (int)(raw_index % Eval::fe_end);
@@ -416,12 +416,12 @@ namespace EvalLearningTools
 		}
 		KKP fromKKP(Square king0, Square king1, Eval::BonaPiece p) const { return fromKKP(king0, king1, p, false); }
 
-		// fromIndex()を用いてこのオブジェクトを構築したときに、以下のアクセッサで情報が得られる。
+		// When you construct this object using fromIndex(), you can get information with the following accessors.
 		Square king0() const { return king0_; }
 		Square king1() const { return king1_; }
 		Eval::BonaPiece piece() const { return piece_; }
 
-		// KKPの次元下げの数
+		// Number of KKP dimension reductions
 #if defined(USE_KKP_INVERSE_WRITE)
 		#define KKP_LOWER_COUNT 4
 #elif defined(USE_KKP_MIRROR_WRITE)
@@ -431,14 +431,14 @@ namespace EvalLearningTools
 #endif
 
 #if defined(USE_KKP_INVERSE_WRITE) && !defined(USE_KKP_MIRROR_WRITE) 
-		// USE_KKP_INVERSE_WRITEわ使うならUSE_KKP_MIRROR_WRITEも定義して欲しい。
+		// USE_KKP_INVERSE_WRITE If you use it, please also define USE_KKP_MIRROR_WRITE.
 		static_assert(false, "define also USE_KKP_MIRROR_WRITE!");
 #endif
 
-		// 低次元の配列のindexを得る。ミラーしたものがkkp_[1]に返る。
-		// USE_KKP_INVERSE_WRITEが有効なときは、それらをinverseしたものが[2],[3]に入る。
-		// この次元下げに関して、gradの符号は反転させないといけないので注意すること。
-		// is_inverse()で判定できるのでこれを利用すると良い。
+		// Get the index of the low-dimensional array. The mirrored one is returned to kkp_[1].
+		// When USE_KKP_INVERSE_WRITE is enabled, the inverse of them will be in [2] and [3].
+		// Note that the sign of grad must be reversed for this dimension reduction.
+		// You can use is_inverse() because it can be determined.
 		void toLowerDimensions(/*out*/ KKP kkp_[KKP_LOWER_COUNT]) const {
 			kkp_[0] = fromKKP(king0_, king1_, piece_,false);
 #if defined(USE_KKP_MIRROR_WRITE)
@@ -450,24 +450,24 @@ namespace EvalLearningTools
 #endif
 		}
 
-		// このクラスのmin_index()の値を0として数えたときのindexを取得する。
+		// Get the index when counting the value of min_index() of this class as 0.
 		virtual uint64_t toRawIndex() const {
 			return  ((uint64_t)king0_ * (uint64_t)max_king_sq_ + (uint64_t)king1_) * (uint64_t)fe_end_ + (uint64_t)piece_;
 		}
 
-		// toLowerDimensionsで次元下げしたものがinverseしたものであるかを返す。
+		// Returns whether or not the dimension lowered with toLowerDimensions is inverse.
 		bool is_inverse() const {
 			return inverse_sign;
 		}
 
-		// is_inverse() == trueのときに、gradの手番ではないほうの符号を反転させて返す。
+		// When is_inverse() == true, reverse the sign that is not grad's turn and return it.
 		template <typename T>
 		std::array<T, 2> apply_inverse_sign(const std::array<T, 2>& rhs)
 		{
 			return !is_inverse() ? rhs : std::array<T, 2>{-rhs[0], rhs[1]};
 		}
 
-		// 比較演算子
+		// comparison operator
 		bool operator==(const KKP& rhs) { return king0() == rhs.king0() && king1() == rhs.king1() && piece() == rhs.piece(); }
 		bool operator!=(const KKP& rhs) { return !(*this == rhs); }
 
@@ -477,7 +477,7 @@ namespace EvalLearningTools
 		bool inverse_sign;
 	};
 
-	// デバッグ用出力。
+	// Output for debugging.
 	static std::ostream& operator<<(std::ostream& os, KKP rhs)
 	{
 		os << "KKP(" << rhs.king0() << "," << rhs.king1() << "," << rhs.piece() << ")";
@@ -485,7 +485,7 @@ namespace EvalLearningTools
 	}
 
 
-	// KK,KKPと同様。KPP用
+	// Same as KK and KKP. For KPP
 	struct KPP : public SerializerBase
 	{
 	protected:
@@ -494,28 +494,28 @@ namespace EvalLearningTools
 	public:
 		KPP() {}
 
-		// KK,KKP,KPP配列を直列化するときの通し番号の、KPPの最小値、最大値。
+		// The minimum and maximum KPP values ​​of serial numbers when serializing KK, KKP, KPP arrays.
 #if !defined(USE_TRIANGLE_WEIGHT_ARRAY)
 		virtual uint64_t size() const { return (uint64_t)max_king_sq_*(uint64_t)fe_end_*(uint64_t)fe_end_; }
 #else
-		// kpp[SQUARE_NB][fe_end][fe_end]の[fe_end][fe_end]な正方配列の部分を三角配列化する。
-		// kpp[SQUARE_NB][triangle_fe_end]とすると、この三角配列の1行目は要素1個、2行目は2個、…。
-		// ゆえに、triangle_fe_end = 1 + 2 + .. + fe_end = fe_end * (fe_end + 1) / 2
+		// Triangularize the square array part of [fe_end][fe_end] of kpp[SQUARE_NB][fe_end][fe_end].
+		// If kpp[SQUARE_NB][triangle_fe_end], the first row of this triangular array has one element, the second row has two elements, and so on.
+		// hence triangle_fe_end = 1 + 2 + .. + fe_end = fe_end * (fe_end + 1) / 2
 		virtual uint64_t size() const { return (uint64_t)max_king_sq_*(uint64_t)triangle_fe_end; }
 #endif
 
 		virtual void set(int max_king_sq, uint64_t fe_end, uint64_t min_index)
 		{
-			// この値、size()で用いていて、SerializerBase::set()でsize()を使うので先に計算する。
+		// This value is used in size(), and size() is used in SerializerBase::set(), so calculate first.
 			triangle_fe_end = (uint64_t)fe_end*((uint64_t)fe_end + 1) / 2;
 
 			SerializerBase::set(max_king_sq, fe_end, min_index);
 		}
 
-		// index(通し番号)からKPPのオブジェクトを生成するbuilder
+		// builder that creates KPP object from index (serial number)
 		KPP fromIndex(uint64_t index) const { assert(index >= min_index()); return fromRawIndex(index - min_index()); }
 
-		// raw_index(通し番号ではなく0から始まる番号)からKPPのオブジェクトを生成するbuilder
+		// A builder that creates KPP objects from raw_index (a number that starts from 0, not a serial number)
 		KPP fromRawIndex(uint64_t raw_index) const
 		{
 			const uint64_t triangle_fe_end = (uint64_t)fe_end_*((uint64_t)fe_end_ + 1) / 2;
@@ -528,13 +528,13 @@ namespace EvalLearningTools
 #else
 			uint64_t index2 = raw_index % triangle_fe_end;
 
-			// ここにindex2からpiece0,piece1を求める式を書く。
-			// これは index2 = i * (i+1) / 2 + j の逆関数となる。
-			// j = 0 の場合、i^2 + i - 2 * index2 == 0なので
-			// 2次方程式の解の公式から i = (sqrt(8*index2+1) - 1) / 2である。
-			// iを整数化したのちに、j = index2 - i * (i + 1) / 2としてjを求めれば良い。
+			// Write the expression to find piece0, piece1 from index2 here.
+			// This is the inverse function of index2 = i * (i+1) / 2 + j.
+			// If j = 0, i^2 + i-2 * index2 == 0
+			// From the solution formula of the quadratic equation i = (sqrt(8*index2+1)-1) / 2.
+			// After i is converted into an integer, j can be calculated as j = index2-i * (i + 1) / 2.
 
-			// BonaPieceは32bit(16bitに収まらない可能性)を想定しているのでこの掛け算は64bitでないといけない。
+			// BonaPiece assumes 32bit (may not fit in 16bit), so this multiplication must be 64bit.
 			int piece1 = int(sqrt(8 * index2 + 1) - 1) / 2;
 			int piece0 = int(index2 - (uint64_t)piece1*((uint64_t)piece1 + 1) / 2);
 
@@ -556,13 +556,13 @@ namespace EvalLearningTools
 			return my_kpp;
 		}
 
-		// fromIndex()を用いてこのオブジェクトを構築したときに、以下のアクセッサで情報が得られる。
+		// When you construct this object using fromIndex(), you can get information with the following accessors.
 		Square king() const { return king_; }
 		Eval::BonaPiece piece0() const { return piece0_; }
 		Eval::BonaPiece piece1() const { return piece1_; }
 
 
-		// 次元下げの数
+// number of dimension reductions
 #if defined(USE_KPP_MIRROR_WRITE)
 	#if !defined(USE_TRIANGLE_WEIGHT_ARRAY)
 		#define KPP_LOWER_COUNT 4
@@ -577,18 +577,18 @@ namespace EvalLearningTools
 	#endif
 #endif
 
-		// 低次元の配列のindexを得る。p1,p2を入れ替えたもの、ミラーしたものなどが返る。
+		// Get the index of the low-dimensional array. The ones with p1 and p2 swapped, the ones mirrored, etc. are returned.
 		void toLowerDimensions(/*out*/ KPP kpp_[KPP_LOWER_COUNT]) const {
 
 #if defined(USE_TRIANGLE_WEIGHT_ARRAY)
-			// 三角配列を用いる場合は、piece0とpiece1を入れ替えたものは返らないので注意。
+			// Note that if you use a triangular array, the swapped piece0 and piece1 will not be returned.
 			kpp_[0] = fromKPP(king_, piece0_, piece1_);
 #if defined(USE_KPP_MIRROR_WRITE)
 			kpp_[1] = fromKPP(Mir(king_), mir_piece(piece0_), mir_piece(piece1_));
 #endif
 
 #else
-			// 三角配列を用いない場合
+			// When not using triangular array
 			kpp_[0] = fromKPP(king_, piece0_, piece1_);
 			kpp_[1] = fromKPP(king_, piece1_, piece0_);
 #if defined(USE_KPP_MIRROR_WRITE)
@@ -598,7 +598,7 @@ namespace EvalLearningTools
 #endif
 		}
 
-		// このクラスのmin_index()の値を0として数えたときのindexを取得する。
+		// Get the index when counting the value of min_index() of this class as 0.
 		virtual uint64_t toRawIndex() const {
 
 #if !defined(USE_TRIANGLE_WEIGHT_ARRAY)
@@ -606,15 +606,15 @@ namespace EvalLearningTools
 			return ((uint64_t)king_ * (uint64_t)fe_end_ + (uint64_t)piece0_) * (uint64_t)fe_end_ + (uint64_t)piece1_;
 
 #else
-			// Bonanza6.0で使われているのに似せたマクロ
+			// Macro similar to that used in Bonanza 6.0
 			auto PcPcOnSq = [&](Square k, Eval::BonaPiece i, Eval::BonaPiece j)
 			{
 
-				// この三角配列の(i,j)は、i行目のj列目の要素。
-				// i行目0列目は、そこまでの要素の合計であるから、1 + 2 + ... + i = i * (i+1) / 2
-				// i行目j列目は、これにjを足したもの。i * (i + 1) /2 + j
+				// (i,j) in this triangular array is the element in the i-th row and the j-th column.
+				// 1st row + 2 + ... + i = i * (i+1) / 2 because the i-th row and 0th column is the total of the elements up to that point
+				// The i-th row and the j-th column is j plus this. i*(i+1)/2+j
 
-				// BonaPiece型は、32bitを想定しているので掛け算には気をつけないとオーバーフローする。
+				// BonaPiece type is assumed to be 32 bits, so if you do not pay attention to multiplication, it will overflow.
 				return (uint64_t)k * triangle_fe_end + (uint64_t)(uint64_t(i)*(uint64_t(i)+1) / 2 + uint64_t(j));
 			};
 
@@ -626,18 +626,18 @@ namespace EvalLearningTools
 #endif
 		}
 
-		// toLowerDimensionsで次元下げしたものがinverseしたものであるかを返す。
-		// KK,KKPとinterfaceを合せるために用意してある。このKPPクラスでは、このメソッドは常にfalseを返す。
+		// Returns whether or not the dimension lowered with toLowerDimensions is inverse.
+		// Prepared to match KK, KKP and interface. This method always returns false for this KPP class.
 		bool is_inverse() const {
 			return false;
 		}
 
-		// 比較演算子
+		// comparison operator
 		bool operator==(const KPP& rhs) {
 			return king() == rhs.king() &&
 				((piece0() == rhs.piece0() && piece1() == rhs.piece1())
 #if defined(USE_TRIANGLE_WEIGHT_ARRAY)
-					// 三角配列を用いるときはpiece0とpiece1の入れ替わりを許容する。
+					// When using a triangular array, allow swapping of piece0 and piece1.
 				|| (piece0() == rhs.piece1() && piece1() == rhs.piece0())
 #endif
 					); }
@@ -651,24 +651,24 @@ namespace EvalLearningTools
 		uint64_t triangle_fe_end; // = (uint64_t)fe_end_*((uint64_t)fe_end_ + 1) / 2;
 	};
 
-	// デバッグ用出力。
+	// Output for debugging.
 	static std::ostream& operator<<(std::ostream& os, KPP rhs)
 	{
 		os << "KPP(" << rhs.king() << "," << rhs.piece0() << "," << rhs.piece1() << ")";
 		return os;
 	}
 
-	// KPPPの4駒関係。ただし、手番ありでミラー等を考慮しないと学習に2TB以上のメモリが必要…。
-	// 三角配列を使っても学習のために50GB×12バイト = 600GB必要。
-	// ミラーしたもののみを格納するようにしてもの半分ぐらい必要。
-	// ここでは、三角配列は必ず用いて、かつミラーしたものを格納するものとする。
+	// 4 pieces related to KPPP. However, if there is a turn and you do not consider mirrors etc., memory of 2 TB or more is required for learning.
+	// Even if you use a triangular array, you need 50GB x 12 bytes = 600GB for learning.
+	// It takes about half as much as storing only the mirrored one.
+	// Here, the triangular array is always used and the mirrored one is stored.
 	//
-	// また、このクラスのking()は、実際のkingのSquareとは限らず、単に、0～(king_sq-1)までの値が返る。
-	// これは、ミラーを利用した圧縮を行なう場合など、利用側で適切な玉の位置に変換してやる必要がある。
-	// 
-	// あと、このクラスの返すpiece0,1,2に関して、
-	//   piece0() > piece1() > piece2()
-	// であり、コンストラクタでpiece0,1,2を渡すときも、この制約を守る必要がある。
+	// Also, king() of this class is not limited to Square of the actual king, but a value from 0 to (king_sq-1) is simply returned.
+	// This needs to be converted to an appropriate ball position on the user side when performing compression using a mirror.
+	//
+	// Later, regarding the pieces0,1,2 returned by this class,
+	// piece0() >piece1() >piece2()
+	// It is, and it is necessary to keep this constraint when passing piece0,1,2 in the constructor.
 	struct KPPP : public SerializerBase
 	{
 	protected:
@@ -684,21 +684,21 @@ namespace EvalLearningTools
 
 		virtual uint64_t size() const { return (uint64_t)max_king_sq_*triangle_fe_end; }
 
-		// fe_endとking_sqを設定する。
-		// fe_end : このKPPPクラスの想定するfe_end
-		// king_sq : KPPPのときに扱う玉の升の数。
-		//  3段×ミラーなら3段×5筋 = 15みたいな感じ。
-		//  2段×ミラーなしなら2×9筋 = 18みたいな感じ。
-		//  これをこのKPPPクラスを使う側でset()を用いて最初に設定する。
+		// Set fe_end and king_sq.
+		// fe_end: fe_end assumed by this KPPP class
+		// king_sq: Number of balls to handle in KPPP.
+		// 3 layers x 3 mirrors = 3 layers x 5 lines = 15
+		// 2 steps x 2 mirrors without mirror = 18
+		// Set this first using set() on the side that uses this KPPP class.
 		virtual void set(int max_king_sq, uint64_t fe_end,uint64_t min_index) {
-			// この値、size()で用いていて、SerializerBase::set()でsize()を使うので先に計算する。
+			// This value is used in size(), and size() is used in SerializerBase::set(), so calculate first.
 			triangle_fe_end = fe_end * (fe_end - 1) * (fe_end - 2) / 6;
 
 			SerializerBase::set(max_king_sq, fe_end, min_index);
 		}
 
-		// 次元下げの数
-		// とりあえず、ミラーの次元下げ非対応。ここでやることもないかと…。
+		// number of dimension reductions
+		// For the time being, the dimension reduction of the mirror is not supported. I wonder if I'll do it here...
 /*
 #if defined(USE_KPPP_MIRROR_WRITE)
 #define KPPP_LOWER_COUNT 2
@@ -708,70 +708,70 @@ namespace EvalLearningTools
 */
 #define KPPP_LOWER_COUNT 1
 
-		// 低次元の配列のindexを得る。
-		// p0,p1,p2を入れ替えたものは返らないので注意。
-		// またミラーしたものも、USE_KPPP_MIRROR_WRITEが有効なときしか返さない。
+		// Get the index of the low-dimensional array.
+		// Note that the one with p0,p1,p2 swapped will not be returned.
+		// Also, the mirrored one is returned only when USE_KPPP_MIRROR_WRITE is enabled.
 		void toLowerDimensions(/*out*/ KPPP kppp_[KPPP_LOWER_COUNT]) const
 		{
 			kppp_[0] = fromKPPP(king_, piece0_, piece1_,piece2_);
 #if KPPP_LOWER_COUNT > 1
-			// mir_pieceするとsortされてない状態になる。sortするコードが必要。
+			// If mir_piece is done, it will be in a state not sorted. Need code to sort.
 			Eval::BonaPiece p_list[3] = { mir_piece(piece2_), mir_piece(piece1_), mir_piece(piece0_) };
 			my_insertion_sort(p_list, 0, 3);
 			kppp_[1] = fromKPPP((int)Mir((Square)king_), p_list[2] , p_list[1], p_list[0]);
 #endif
 		}
 
-		// index(通し番号)からKPPPのオブジェクトを生成するbuilder
+		// builder that creates KPPP object from index (serial number)
 		KPPP fromIndex(uint64_t index) const { assert(index >= min_index()); return fromRawIndex(index - min_index()); }
 
-		// raw_index(通し番号ではなく0から始まる番号)からKPPPのオブジェクトを生成するbuilder
+		// A builder that creates KPPP objects from raw_index (a number that starts from 0, not a serial number)
 		KPPP fromRawIndex(uint64_t raw_index) const
 		{
 			uint64_t index2 = raw_index % triangle_fe_end;
 
-			// ここにindex2からpiece0,piece1,piece2を求める式を書く。
-			// これは index2 = i(i-1)(i-2)/6-1 + j(j+1)/2 + k の逆関数となる。
-			// j = k = 0 の場合、3次方程式の解の公式から実根は、 i = ...である。(以下式) 
-			// ただしindex2が0,1のときは実数解が複数ある。これを考慮しないといけない。計算精度が足りないことに対する対策必要。
-			// iが求まったあとはiを整数化したのちに、最初の式に入れてKPPのとき同様にjを求めれば良い。
+			// Write the expression to find piece0, piece1, piece2 from index2 here.
+			// This is the inverse function of index2 = i(i-1)(i-2)/6-1 + j(j+1)/2 + k.
+			// For j = k = 0, the real root is i = ... from the solution formula of the cubic equation. (The following formula)
+			// However, if index2 is 0 or 1, there are multiple real solutions. You have to consider this. It is necessary to take measures against insufficient calculation accuracy.
+			// After i is calculated, i can be converted into an integer, then put in the first expression and then j can be calculated in the same way as in KPP.
 
-			// この処理、数値計算としてわりと難しい。色々工夫が必要。
+			// This process is a relatively difficult numerical calculation. Various ideas are needed.
 
 			int piece0;
 			if (index2 <= 1)
 			{
-				// index2 == 0,1のときだけ実数解が複数ある。
+				// There are multiple real solutions only when index2 == 0,1.
 				piece0 = (int)index2 + 2;
 
 			} else {
 
-				//double t = pow(sqrt((243 *index2 * index2 - 1) * 3) + 27 * index2, 1.0 / 3);
-				// →　これだとindex2が大きくなるとsqrt()の中身、オーバーフローする。
+				//double t = pow(sqrt((243 *index2 * index2-1) * 3) + 27 * index2, 1.0 / 3);
+				// → In this case, the content of sqrt() will overflow if index2 becomes large.
 
-				// sqrt()の中身がオーバーフローするので、sqrtのなかで3.0を掛けずにsqrtの外側でsqrt(3.0)を掛ける。
-				// sqrt()の中身がオーバーフローするので、index2が大きいときは近似式を用いる。
+				// Since the contents of sqrt() overflow, do not multiply 3.0 in sqrt, but multiply sqrt(3.0) outside sqrt.
+				// Since the contents of sqrt() will overflow, use an approximate expression when index2 is large.
 
 				double t;
 				
 				if (index2 < 100000000)
 					t = pow(sqrt((243.0 *index2 * index2 - 1)) * sqrt(3.0) + 27 * index2, 1.0 / 3);
 				else
-					// index2が非常に大きいとき、sqrtの中身、近似的に √243 * index2とみなせるだろう。
+					// If index2 is very large, we can think of the contents of sqrt as approximately √243 * index2.
 					t = pow( index2 * sqrt(243 * 3.0) + 27 * index2, 1.0 / 3);
-				
-				// 丸めのときに計算誤差でわずかに足りないのを防ぐためデルタを加算する。
-				// 大きすぎると1大きい数になってしまう時があるので調整が必要。
-				
+
+				// Add deltas to avoid a slight calculation error when rounding.
+				// If it is too large, it may increase by 1 so adjustment is necessary.
+
 				const double delta = 0.000000001;
 
 				piece0 = int(t / pow(3.0, 2.0 / 3) + 1.0 / (pow(3.0, 1.0 / 3) * t) + delta) + 1;
-				// ううう。ほんまにこんなことせんとあかんのか？(´ω｀)
+				// Uuu. Is it really like this? ('Ω`)
 			}
 
-			// piece2が求まったので、上式のi(i-1)(i-2)/6(=aとする)のiにpiece2を代入。また、k = 0を代入。
-			// j(j+1)/2 = index2 - a
-			// これは、2次方程式の解の公式より..
+			//Since piece2 is obtained, substitute piece2 for i of i(i-1)(i-2)/6 (=a) in the above formula. Also substitute k = 0.
+			// j(j+1)/2 = index2-a
+			// This is from the solution formula of the quadratic equation..
 
 			uint64_t a = (uint64_t)piece0*((uint64_t)piece0 - 1)*((uint64_t)piece0 - 2) / 6;
 			int piece1 = int((1 + sqrt(8.0 * (index2 - a ) + 1)) / 2);
@@ -796,12 +796,12 @@ namespace EvalLearningTools
 			int king = (int)(raw_index  /* % SQUARE_NB */);
 			assert(king < max_king_sq_);
 
-			// king_sqとfe_endに関しては伝播させる。
+			// Propagate king_sq and fe_end.
 			return fromKPPP((Square)king, (Eval::BonaPiece)piece0, (Eval::BonaPiece)piece1 , (Eval::BonaPiece)piece2);
 		}
 
-		// k,p0,p1,p2を指定してKPPPのインスタンスをbuildする。
-		// 内部的に保持しているset()で渡されたking_sqとfe_endは引き継ぐ。
+		// Specify k,p0,p1,p2 to build KPPP instance.
+		// The king_sq and fe_end passed by set() which is internally retained are inherited.
 		KPPP fromKPPP(int king, Eval::BonaPiece p0, Eval::BonaPiece p1, Eval::BonaPiece p2) const
 		{
 			KPPP kppp(king, p0, p1, p2);
@@ -809,21 +809,21 @@ namespace EvalLearningTools
 			return kppp;
 		}
 
-		// このクラスのmin_index()の値を0として数えたときのindexを取得する。
+		// Get the index when counting the value of min_index() of this class as 0.
 		virtual uint64_t toRawIndex() const {
 
-			// Bonanza 6.0で使われているのに似せたマクロ
-			// 前提条件) i > j > k であること。
-			// i==j,j==kのケースはNG。
+			// Macro similar to the one used in Bonanza 6.0
+			// Precondition) i> j> k.
+			// NG in case of i==j,j==k.
 			auto PcPcPcOnSq = [this](int king, Eval::BonaPiece i, Eval::BonaPiece j , Eval::BonaPiece k)
 			{
-				// この三角配列の(i,j,k)は、i行目のj列目の要素。
-				// i行目0列0番目は、そこまでの要素の合計であるから、0 + 0 + 1 + 3 + 6 + ... + (i)*(i-1)/2 = i*(i-1)*(i-2)/ 6
-				// i行目j列0番目は、そこにjを加味したもの。 + j*(j-1) / 2
-				// i行目j列k番目は、そこにkを足したもの。   + k
+				// (i,j,k) in this triangular array is the element in the i-th row and the j-th column.
+				// 0th row 0th column 0th is the sum of the elements up to that point, so 0 + 0 + 1 + 3 + 6 + ... + (i)*(i-1)/2 = i*( i-1)*(i-2)/6
+				// i-th row, j-th column, 0-th is j with j added. + j*(j-1) / 2
+				// i-th row, j-th column and k-th row is k plus it. + k
 				assert(i > j && j > k);
 
-				// BonaPiece型は、32bitを想定しているので掛け算には気をつけないとオーバーフローする。
+				// BonaPiece type is assumed to be 32 bits, so if you do not pay attention to multiplication, it will overflow.
 				return (uint64_t)king * triangle_fe_end + (uint64_t)(
 						  uint64_t(i)*(uint64_t(i) - 1) * (uint64_t(i) - 2) / 6
 						+ uint64_t(j)*(uint64_t(j) - 1) / 2
@@ -834,24 +834,24 @@ namespace EvalLearningTools
 			return PcPcPcOnSq(king_, piece0_, piece1_, piece2_);
 		}
 
-		// fromIndex()を用いてこのオブジェクトを構築したときに、以下のアクセッサで情報が得られる。
+		// When you construct this object using fromIndex(), you can get information with the following accessors.
 		int king() const { return king_; }
 		Eval::BonaPiece piece0() const { return piece0_; }
 		Eval::BonaPiece piece1() const { return piece1_; }
 		Eval::BonaPiece piece2() const { return piece2_; }
-		// toLowerDimensionsで次元下げしたものがinverseしたものであるかを返す。
-		// KK,KKPとinterfaceを合せるために用意してある。このKPPPクラスでは、このメソッドは常にfalseを返す。
+		// Returns whether or not the dimension lowered with toLowerDimensions is inverse.
+		// Prepared to match KK, KKP and interface. This method always returns false for this KPPP class.
 		bool is_inverse() const {
 			return false;
 		}
 
-		// 3角配列化したときの要素の数を返す。kppp配列が、以下のような2次元配列だと想定している。
-		//   kppp[king_sq][triangle_fe_end];
+		// Returns the number of elements in a triangular array. It is assumed that the kppp array is the following two-dimensional array.
+		// kppp[king_sq][triangle_fe_end];
 		uint64_t get_triangle_fe_end() const { return triangle_fe_end; }
 
-		// 比較演算子
+		// comparison operator
 		bool operator==(const KPPP& rhs) {
-			// piece0 > piece1 > piece2を前提とするので、入れ替わりの可能性はない。
+			// piece0> piece1> piece2 is assumed, so there is no possibility of replacement.
 			return king() == rhs.king() && piece0() == rhs.piece0() && piece1() == rhs.piece1() && piece2() == rhs.piece2();
 		}
 		bool operator!=(const KPPP& rhs) { return !(*this == rhs); }
@@ -861,33 +861,33 @@ namespace EvalLearningTools
 		int king_;
 		Eval::BonaPiece piece0_, piece1_,piece2_;
 
-		// kppp[king_sq][fe_end][fe_end][fe_end]の[fe_end][fe_end][fe_end]な正方配列の部分を三角配列化する。
-		// kppp[king_sq][triangle_fe_end]とすると、この三角配列の0行目から要素数は、0,0,1,3,…,n行目はn(n-1)/2個。
-		// ゆえに、
+		// The part of the square array of [fe_end][fe_end][fe_end] of kppp[king_sq][fe_end][fe_end][fe_end] is made into a triangular array.
+		// If kppp[king_sq][triangle_fe_end], the number of elements from the 0th row of this triangular array is 0,0,1,3,..., The nth row is n(n-1)/2.
+		// therefore,
 		// triangle_fe_end = Σn(n-1)/2 , n=0..fe_end-1
 		//                 =  fe_end * (fe_end - 1) * (fe_end - 2) / 6
 		uint64_t triangle_fe_end; // ((uint64_t)Eval::fe_end)*((uint64_t)Eval::fe_end - 1)*((uint64_t)Eval::fe_end - 2) / 6;
 	};
 
-	// デバッグ用出力。
+	// Output for debugging.
 	static std::ostream& operator<<(std::ostream& os, KPPP rhs)
 	{
 		os << "KPPP(" << rhs.king() << "," << rhs.piece0() << "," << rhs.piece1() << "," << rhs.piece2() << ")";
 		return os;
 	}
 
-	// KKPPによる4駒関係の学習用。
+	// For learning about 4 pieces by KKPP.
 	//
-	// KPPPクラスと同じ設計。KPPPクラスで、pが一枚少ないものとして扱う。
-	// ２つの玉の位置は0～king_sq-1までの値としてencodeされているものとする。
+	// Same design as KPPP class. In KPPP class, treat as one with less p.
+	// The positions of the two balls are encoded as values ​​from 0 to king_sq-1.
 	//
-	// あと、このクラスの返すpiece0,1に関して、
-	//   piece0() > piece1()
-	// であり、コンストラクタでpiece0,1を渡すときも、この制約を守る必要がある。
+	// Later, regarding the pieces0 and 1 returned by this class,
+	// piece0() >piece1()
+	// It is, and it is necessary to keep this constraint even when passing piece0,1 in the constructor.
 	//
-	// この制約から、BonaPieceZeroをpiece0,piece1に同時に代入して渡すことは出来ない。
-	// 駒落ちの学習に対応させるならevaluate()で工夫が必要。
-	struct KKPP : SerializerBase
+	// Due to this constraint, BonaPieceZero cannot be assigned to piece0 and piece1 at the same time and passed.
+	// If you want to support learning of dropped frames, you need to devise with evaluate().
+	struct KKPP: SerializerBase
 	{
 	protected:
 		KKPP(int king, Eval::BonaPiece p0, Eval::BonaPiece p1) :
@@ -902,45 +902,45 @@ namespace EvalLearningTools
 
 		virtual uint64_t size() const { return (uint64_t)max_king_sq_*triangle_fe_end; }
 
-		// fe_endとking_sqを設定する。
-		// fe_end : このKPPPクラスの想定するfe_end
-		// king_sq : KPPPのときに扱う玉の升の数。
-		//  9段×ミラーなら9段×5筋の2乗(先後の玉) = 45*45 = 2025 みたいな感じ。
-		//  これをこのKKPPクラスを使う側でset()を用いて最初に設定する。
+		// Set fe_end and king_sq.
+		// fe_end: fe_end assumed by this KPPP class
+		// king_sq: Number of balls to handle in KPPP.
+		// 9 steps x mirrors 9 steps x 5 squared squares (balls before and after) = 45*45 = 2025.
+		// Set this first using set() on the side that uses this KKPP class.
 		void set(int max_king_sq, uint64_t fe_end , uint64_t min_index) {
-			// この値、size()で用いていて、SerializerBase::set()でsize()を使うので先に計算する。
+			// This value is used in size(), and size() is used in SerializerBase::set(), so calculate first.
 			triangle_fe_end = fe_end * (fe_end - 1) / 2;
 
 			SerializerBase::set(max_king_sq, fe_end, min_index);
 		}
 
-		// 次元下げの数
-		// とりあえず、ミラーの次元下げ非対応。ここでやることもないかと…。(学習用のメモリがもったいないので)
+		// number of dimension reductions
+		// For the time being, the dimension reduction of the mirror is not supported. I wonder if I'll do it here... (Because the memory for learning is a waste)
 #define KKPP_LOWER_COUNT 1
 
-		// 低次元の配列のindexを得る。
-		// p0,p1,p2を入れ替えたものは返らないので注意。
-		// またミラーしたものも、USE_KPPP_MIRROR_WRITEが有効なときしか返さない。
+		// Get the index of the low-dimensional array.
+		//Note that the one with p0,p1,p2 swapped will not be returned.
+		// Also, the mirrored one is returned only when USE_KPPP_MIRROR_WRITE is enabled.
 		void toLowerDimensions(/*out*/ KKPP kkpp_[KPPP_LOWER_COUNT]) const
 		{
 			kkpp_[0] = fromKKPP(king_, piece0_, piece1_);
 
-			// ミラーする場合、mir_pieceするとsortされてない状態になる。sortするコードが必要。
-			// あとking_に対するミラーを定義する必要も。
+			// When mirroring, mir_piece will not be sorted. Need code to sort.
+			// We also need to define a mirror for king_.
 		}
 
-		// index(通し番号)からKKPPのオブジェクトを生成するbuilder
+		// builder that creates KKPP object from index (serial number)
 		KKPP fromIndex(uint64_t index) const { assert(index >= min_index()); return fromRawIndex(index - min_index()); }
 
-		// raw_index(通し番号ではなく0から始まる番号)からKKPPのオブジェクトを生成するbuilder
+		// builder that creates KKPP object from raw_index (number starting from 0, not serial number)
 		KKPP fromRawIndex(uint64_t raw_index) const
 		{
 			uint64_t index2 = raw_index % triangle_fe_end;
 
-			// ここにindex2からpiece0,piece1,piece2を求める式を書く。
-			// これは index2 = i(i-1)/2 + j の逆関数となる。
-			// j=0として、二次方程式の解の公式を用いる。
-			// index2=0のときは重根だが小さいほうはi>jを満たさないので無視。
+			// Write the expression to find piece0, piece1, piece2 from index2 here.
+			// This is the inverse function of index2 = i(i-1)/2 + j.
+			// Use the formula of the solution of the quadratic equation with j=0.
+			// When index2=0, it is a double root, but the smaller one does not satisfy i>j and is ignored.
 
 			int piece0 = (int(sqrt(8 * index2 + 1)) + 1)/2;
 			int piece1 = int(index2 - piece0 * (piece0 - 1) /2 );
@@ -955,12 +955,12 @@ namespace EvalLearningTools
 			int king = (int)(raw_index  /* % SQUARE_NB */);
 			assert(king < max_king_sq_);
 
-			// king_sqとfe_endに関しては伝播させる。
+			// Propagate king_sq and fe_end.
 			return fromKKPP(king, (Eval::BonaPiece)piece0, (Eval::BonaPiece)piece1);
 		}
 
-		// k,p0,p1を指定してKKPPのインスタンスをbuildする。
-		// 内部的に保持しているset()で渡されたking_sqとfe_endは引き継ぐ。
+		// Specify k,p0,p1 to build KKPP instance.
+		// The king_sq and fe_end passed by set() which is internally retained are inherited.
 		KKPP fromKKPP(int king, Eval::BonaPiece p0, Eval::BonaPiece p1) const
 		{
 			KKPP kkpp(king, p0, p1);
@@ -968,17 +968,17 @@ namespace EvalLearningTools
 			return kkpp;
 		}
 
-		// このクラスのmin_index()の値を0として数えたときのindexを取得する。
+		// Get the index when counting the value of min_index() of this class as 0.
 		virtual uint64_t toRawIndex() const {
 
-			// Bonanza 6.0で使われているのに似せたマクロ
-			// 前提条件) i > jであること。
-			// i==j,j==kのケースはNG。
+			// Macro similar to the one used in Bonanza 6.0
+			// Precondition) i> j.
+			// NG in case of i==j,j==k.
 			auto PcPcOnSq = [this](int king, Eval::BonaPiece i, Eval::BonaPiece j)
 			{
 				assert(i > j);
 
-				// BonaPiece型は、32bitを想定しているので掛け算には気をつけないとオーバーフローする。
+				// BonaPiece type is assumed to be 32 bits, so if you do not pay attention to multiplication, it will overflow.
 				return (uint64_t)king * triangle_fe_end + (uint64_t)(
 					+ uint64_t(i)*(uint64_t(i) - 1) / 2
 					+ uint64_t(j)
@@ -988,24 +988,24 @@ namespace EvalLearningTools
 			return PcPcOnSq(king_, piece0_, piece1_);
 		}
 
-		// fromIndex(),fromKKPP()を用いてこのオブジェクトを構築したときに、以下のアクセッサで情報が得られる。
+		// When you construct this object using fromIndex(), fromKKPP(), you can get information with the following accessors.
 		int king() const { return king_; }
 		Eval::BonaPiece piece0() const { return piece0_; }
 		Eval::BonaPiece piece1() const { return piece1_; }
 
-		// toLowerDimensionsで次元下げしたものがinverseしたものであるかを返す。
-		// KK,KKPとinterfaceを合せるために用意してある。このKKPPクラスでは、このメソッドは常にfalseを返す。
+		// Returns whether or not the dimension lowered with toLowerDimensions is inverse.
+		// Prepared to match KK, KKP and interface. In this KKPP class, this method always returns false.
 		bool is_inverse() const {
 			return false;
 		}
 
-		// 3角配列化したときの要素の数を返す。kkpp配列が、以下のような2次元配列だと想定している。
+		//Returns the number of elements in a triangular array. It is assumed that the kkpp array is the following two-dimensional array.
 		//   kkpp[king_sq][triangle_fe_end];
 		uint64_t get_triangle_fe_end() const { return triangle_fe_end; }
 
-		// 比較演算子
+		// comparison operator
 		bool operator==(const KKPP& rhs) {
-			// piece0 > piece1を前提とするので、入れ替わりの可能性はない。
+			// Since piece0> piece1 is assumed, there is no possibility of replacement.
 			return king() == rhs.king() && piece0() == rhs.piece0() && piece1() == rhs.piece1();
 		}
 		bool operator!=(const KKPP& rhs) { return !(*this == rhs); }
@@ -1015,12 +1015,12 @@ namespace EvalLearningTools
 		int king_;
 		Eval::BonaPiece piece0_, piece1_;
 
-		// kppp[king_sq][fe_end][fe_end]の[fe_end][fe_end]な正方配列の部分を三角配列化する。
+		// Triangularize the square array part of [fe_end][fe_end] of kppp[king_sq][fe_end][fe_end].
 		uint64_t triangle_fe_end = 0;
 		
 	};
 
-	// デバッグ用出力。
+	// Output for debugging.
 	static std::ostream& operator<<(std::ostream& os, KKPP rhs)
 	{
 		os << "KKPP(" << rhs.king() << "," << rhs.piece0() << "," << rhs.piece1() << ")";
@@ -1031,4 +1031,4 @@ namespace EvalLearningTools
 }
 
 #endif // defined (EVAL_LEARN)
-#endif
+#endif
\ No newline at end of file
diff --git a/src/learn/multi_think.cpp b/src/learn/multi_think.cpp
index 2dcb5b46..34f5373d 100644
--- a/src/learn/multi_think.cpp
+++ b/src/learn/multi_think.cpp
@@ -10,67 +10,67 @@
 
 void MultiThink::go_think()
 {
-	// あとでOptionsの設定を復元するためにコピーで保持しておく。
+	// Keep a copy to restore the Options settings later.
 	auto oldOptions = Options;
 
-	// 定跡を用いる場合、on the flyで行なうとすごく時間がかかる＆ファイルアクセスを行なう部分が
-	// thread safeではないので、メモリに丸読みされている状態であることをここで保証する。
+	// When using the constant track, it takes a lot of time to perform on the fly & the part to access the file is
+	// Since it is not thread safe, it is guaranteed here that it is being completely read in memory.
 	Options["BookOnTheFly"] = std::string("false");
 
-	// 評価関数の読み込み等
-	// learnコマンドの場合、評価関数読み込み後に評価関数の値を補正している可能性があるので、
-	// メモリの破損チェックは省略する。
+	// Read evaluation function, etc.
+	// In the case of the learn command, the value of the evaluation function may be corrected after reading the evaluation function, so
+	// Skip memory corruption check.
 	is_ready(true);
 
-	// 派生クラスのinit()を呼び出す。
+	// Call the derived class's init().
 	init();
 
-	// ループ上限はset_loop_max()で設定されているものとする。
+	// The loop upper limit is set with set_loop_max().
 	loop_count = 0;
 	done_count = 0;
 
-	// threadをOptions["Threads"]の数だけ生成して思考開始。
+	// Create threads as many as Options["Threads"] and start thinking.
 	std::vector<std::thread> threads;
 	auto thread_num = (size_t)Options["Threads"];
 
-	// worker threadの終了フラグの確保
+	// Secure end flag of worker thread
 	thread_finished.resize(thread_num);
 	
-	// worker threadの起動
+	// start worker thread
 	for (size_t i = 0; i < thread_num; ++i)
 	{
 		thread_finished[i] = 0;
 		threads.push_back(std::thread([i, this]
 		{ 
-			// プロセッサの全スレッドを使い切る。
+			// exhaust all processor threads.
 			WinProcGroup::bindThisThread(i);
 
-			// オーバーライドされている処理を実行
+			// execute the overridden process
 			this->thread_worker(i);
 
-			// スレッドが終了したので終了フラグを立てる
+			// Set the end flag because the thread has ended
 			this->thread_finished[i] = 1;
 		}));
 	}
 
-	// すべてのthreadの終了待ちを
-	// for (auto& th : threads)
-	//  th.join();
-	// のように書くとスレッドがまだ仕事をしている状態でここに突入するので、
-	// その間、callback_func()が呼び出せず、セーブできなくなる。
-	// そこで終了フラグを自前でチェックする必要がある。
+	// wait for all threads to finish
+	// for (auto& th :threads)
+	// th.join();
+	// If you write like, the thread will rush here while it is still working,
+	// During that time, callback_func() cannot be called and you cannot save.
+	// Therefore, you need to check the end flag yourself.
 
-	// すべてのスレッドが終了したかを判定する関数
+	// function to determine if all threads have finished
 	auto threads_done = [&]()
 	{
-		// ひとつでも終了していなければfalseを返す
+		// returns false if no one is finished
 		for (auto& f : thread_finished)
 			if (!f)
 				return false;
 		return true;
 	};
 
-	// コールバック関数が設定されているならコールバックする。
+	// Call back if the callback function is set.
 	auto do_a_callback = [&]()
 	{
 		if (callback_func)
@@ -80,44 +80,44 @@ void MultiThink::go_think()
 
 	for (uint64_t i = 0 ; ; )
 	{
-		// 全スレッドが終了していたら、ループを抜ける。
+		// If all threads have finished, exit the loop.
 		if (threads_done())
 			break;
 
 		sleep(1000);
 
-		// callback_secondsごとにcallback_func()が呼び出される。
+		// callback_func() is called every callback_seconds.
 		if (++i == callback_seconds)
 		{
 			do_a_callback();
-			// ↑から戻ってきてからカウンターをリセットしているので、
-			// do_a_callback()のなかでsave()などにどれだけ時間がかかろうと
-			// 次に呼び出すのは、そこから一定時間の経過を要する。
+			// Since I am returning from ↑, I reset the counter, so
+			// no matter how long it takes to save() etc. in do_a_callback()
+			// The next call will take a certain amount of time.
 			i = 0;
 		}
 	}
 
-	// 最後の保存。
+	// Last save.
 	std::cout << std::endl << "finalize..";
 
 	// do_a_callback();
-	// →　呼び出し元で保存するはずで、ここでは要らない気がする。
+	// → It should be saved by the caller, so I feel that it is not necessary here.
 
-	// 終了したフラグは立っているがスレッドの終了コードの実行中であるということはありうるので
-	// join()でその終了を待つ必要がある。
+	// It is possible that the exit code of the thread is running but the exit code of the thread is running, so
+	// We need to wait for the end with join().
 	for (auto& th : threads)
 		th.join();
 
-	// 全スレッドが終了しただけでfileの書き出しスレッドなどはまだ動いていて
-	// 作業自体は完了していない可能性があるのでスレッドがすべて終了したことだけ出力する。
+	// The file writing thread etc. are still running only when all threads are finished
+	// Since the work itself may not have completed, output only that all threads have finished.
 	std::cout << "all threads are joined." << std::endl;
 
-	// Optionsを書き換えたので復元。
-	// 値を代入しないとハンドラが起動しないのでこうやって復元する。
+	// Restored because Options were rewritten.
+	// Restore the handler because the handler will not start unless you assign a value.
 	for (auto& s : oldOptions)
 		Options[s.first] = std::string(s.second);
 
 }
 
 
-#endif // defined(EVAL_LEARN)
+#endif // defined(EVAL_LEARN)
\ No newline at end of file
diff --git a/src/learn/multi_think.h b/src/learn/multi_think.h
index ad6baa5e..a2ef8cde 100644
--- a/src/learn/multi_think.h
+++ b/src/learn/multi_think.h
@@ -11,9 +11,9 @@
 
 #include <atomic>
 
-// 棋譜からの学習や、自ら思考させて定跡を生成するときなど、
-// 複数スレッドが個別にSearch::think()を呼び出したいときに用いるヘルパクラス。
-// このクラスを派生させて用いる。
+// Learning from a game record, when making yourself think and generating a fixed track, etc.
+// Helper class used when multiple threads want to call Search::think() individually.
+// Derive and use this class.
 struct MultiThink
 {
 	MultiThink() : prng(21120903)
@@ -21,43 +21,43 @@ struct MultiThink
 		loop_count = 0;
 	}
 
-	// マスタースレッドからこの関数を呼び出すと、スレッドがそれぞれ思考して、
-	// 思考終了条件を満たしたところで制御を返す。
-	// 他にやってくれること。
-	// ・各スレッドがLearner::search(),qsearch()を呼び出しても安全なように
-	// 　置換表をスレッドごとに分離してくれる。(終了後、元に戻してくれる。)
-	// ・bookはon the flyモードだとthread safeではないので、このモードを一時的に
-	// 　オフにしてくれる。
-	// [要件]
-	// 1) thread_worker()のオーバーライド
-	// 2) set_loop_max()でループ回数の設定
-	// 3) 定期的にcallbackされる関数を設定する(必要なら)
-	//   callback_funcとcallback_interval
+	// Call this function from the master thread, each thread will think,
+	// Return control when the thought ending condition is satisfied.
+	// Do something else.
+	// ・It is safe for each thread to call Learner::search(),qsearch()
+	// Separates the substitution table for each thread. (It will be restored after the end.)
+	// ・Book is not thread safe when in on the fly mode, so temporarily change this mode.
+	// Turn it off.
+	// [Requirements]
+	// 1) Override thread_worker()
+	// 2) Set the loop count with set_loop_max()
+	// 3) set a function to be called back periodically (if necessary)
+	// callback_func and callback_interval
 	void go_think();
 
-	// 派生クラス側で初期化したいものがあればこれをoverrideしておけば、
-	// go_think()で初期化が終わったタイミングで呼び出される。
-	// 定跡の読み込みなどはそのタイミングで行うと良い。
+	// If there is something you want to initialize on the derived class side, override this,
+	// Called when initialization is completed with go_think().
+	// It is better to read the fixed trace at that timing.
 	virtual void init() {}
 
-	// go_think()したときにスレッドを生成して呼び出されるthread worker
-	// これをoverrideして用いる。
+	// A thread worker that is called by creating a thread when you go_think()
+	// Override and use this.
 	virtual void thread_worker(size_t thread_id) = 0;
 
-	// go_think()したときにcallback_seconds[秒]ごとにcallbackされる。
+	// Called back every callback_seconds [seconds] when go_think().
 	std::function<void()> callback_func;
 	uint64_t callback_seconds = 600;
 
-	// workerが処理する(Search::think()を呼び出す)回数を設定する。
+	// Set the number of times worker processes (calls Search::think()).
 	void set_loop_max(uint64_t loop_max_) { loop_max = loop_max_; }
-	
-	// set_loop_max()で設定した値を取得する。
+
+	// Get the value set by set_loop_max().
 	uint64_t get_loop_max() const { return loop_max; }
 
-	// [ASYNC] ループカウンターの値を取り出して、取り出し後にループカウンターを加算する。
-	// もしループカウンターがloop_maxに達していたらUINT64_MAXを返す。
-	// 局面を生成する場合などは、局面を生成するタイミングでこの関数を呼び出すようにしないと、
-	// 生成した局面数と、カウンターの値が一致しなくなってしまうので注意すること。
+	// [ASYNC] Take the value of the loop counter and add the loop counter after taking it out.
+	// If the loop counter has reached loop_max, return UINT64_MAX.
+	// If you want to generate a phase, you must call this function at the time of generating the phase,
+	// Please note that the number of generated phases and the value of the counter will not match.
 	uint64_t get_next_loop_count() {
 		std::unique_lock<std::mutex> lk(loop_mutex);
 		if (loop_count >= loop_max)
@@ -65,46 +65,46 @@ struct MultiThink
 		return loop_count++;
 	}
 
-	// [ASYNC] 処理した個数を返す用。呼び出されるごとにインクリメントされたカウンターが返る。
+	// [ASYNC] For returning the processed number. Each time it is called, it returns a counter that is incremented.
 	uint64_t get_done_count() {
 		std::unique_lock<std::mutex> lk(loop_mutex);
 		return ++done_count;
 	}
 
-	// worker threadがI/Oにアクセスするときのmutex
+	// Mutex when worker thread accesses I/O
 	std::mutex io_mutex;
 
 protected:
-	// 乱数発生器本体
+	// Random number generator body
 	AsyncPRNG prng;
 
 private:
-	// workerが処理する(Search::think()を呼び出す)回数
+	// number of times worker processes (calls Search::think())
 	std::atomic<uint64_t> loop_max;
-	// workerが処理した(Search::think()を呼び出した)回数
+	// number of times the worker has processed (calls Search::think())
 	std::atomic<uint64_t> loop_count;
-	// 処理した回数を返す用。
+	// To return the number of times it has been processed.
 	std::atomic<uint64_t> done_count;
 
-	// ↑の変数を変更するときのmutex
+	// Mutex when changing the variables in ↑
 	std::mutex loop_mutex;
 
-	// スレッドの終了フラグ。
-	// vector<bool>にすると複数スレッドから書き換えようとしたときに正しく反映されないことがある…はず。
+	// Thread end flag.
+	// vector<bool> may not be reflected properly when trying to rewrite from multiple threads...
 	typedef uint8_t Flag;
 	std::vector<Flag> thread_finished;
 
 };
 
-// idle時間にtaskを処理する仕組み。
-// masterは好きなときにpush_task_async()でtaskを渡す。
-// slaveは暇なときにon_idle()を実行すると、taskを一つ取り出してqueueがなくなるまで実行を続ける。
-// MultiThinkのthread workerをmaster-slave方式で書きたいときに用いると便利。
+// Mechanism to process task during idle time.
+// master passes the task with push_task_async() whenever you like.
+// When slave executes on_idle() in its spare time, it retrieves one task and continues execution until there is no queue.
+// Convenient to use when you want to write MultiThink thread worker in master-slave method.
 struct TaskDispatcher
 {
 	typedef std::function<void(size_t /* thread_id */)> Task;
 
-	// slaveはidle中にこの関数を呼び出す。
+	// slave calls this function during idle.
 	void on_idle(size_t thread_id)
 	{
 		Task task;
@@ -114,24 +114,24 @@ struct TaskDispatcher
 		sleep(1);
 	}
 
-	// [ASYNC] taskを一つ積む。
+	// Stack [ASYNC] task.
 	void push_task_async(Task task)
 	{
 		std::unique_lock<std::mutex> lk(task_mutex);
 		tasks.push_back(task);
 	}
 
-	// task用の配列の要素をsize分だけ事前に確保する。
+	// Allocate size array elements for task in advance.
 	void task_reserve(size_t size)
 	{
 		tasks.reserve(size);
 	}
 
 protected:
-	// taskの集合
+	// set of tasks
 	std::vector<Task> tasks;
 
-	// [ASYNC] taskを一つ取り出す。on_idle()から呼び出される。
+	// Take out one [ASYNC] task. Called from on_idle().
 	Task get_task_async()
 	{
 		std::unique_lock<std::mutex> lk(task_mutex);
@@ -142,10 +142,10 @@ protected:
 		return task;
 	}
 
-	// tasksにアクセスするとき用のmutex
+	// a mutex for accessing tasks
 	std::mutex task_mutex;
 };
 
 #endif // defined(EVAL_LEARN) && defined(YANEURAOU_2018_OTAFUKU_ENGINE)
 
-#endif
+#endif
\ No newline at end of file

From 87c8b324f815d8eaaf09088b54fe2188d3879cef Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sun, 28 Jun 2020 11:37:15 +0900
Subject: [PATCH 073/583] Simplified source code to estimate the winning ratio
 from an eval value.

We need to adjust the eta again after this commit is pushed.
---
 src/learn/learner.cpp | 27 +++++----------------------
 1 file changed, 5 insertions(+), 22 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index f105296f..7189b5a3 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -1028,24 +1028,10 @@ double sigmoid(double x)
 // 評価値を勝率[0,1]に変換する関数
 double winning_percentage(double value)
 {
-	// In Maxima,
-	// load("C:/maxima-5.44.0/cform.lisp");
-	// PawnValueEg = 206;
-	// cform(1.0 / (1.0 + 10.0 ^ (-value / PawnValueEg / 4.0)));
-	constexpr double PawnValue = PawnValueEg;
-	return 1.0 * pow(pow(10.0, -0.25 * pow(PawnValue, -1) * value) + 1.0, -1);
-}
-
-double delta_winning_percentage(double value)
-{
-	// In Maxima,
-	// load("C:/maxima-5.44.0/cform.lisp");
-	// PawnValueEg = 206;
-	// cform(diff(1.0/(1.0+10.0^(-value/PawnValue/4.0)),value));
-	constexpr double PawnValue = PawnValueEg;
-	return
-		0.5756462732485115 * pow(PawnValue, -1) * pow(10.0, -0.25 * pow(PawnValue, -1) * value) *
-		pow(pow(10.0, -0.25 * pow(PawnValue, -1) * value) + 1.0, -2);
+	// 1/(1+10^(-Eval/4))
+	// = 1/(1+e^(-Eval/4*ln(10))
+	// = sigmoid(Eval/4*ln(10))
+	return sigmoid(value / PawnValueEg / 4.0 * log(10.0));
 }
 
 // 普通のシグモイド関数の導関数。
@@ -1145,7 +1131,6 @@ double calc_grad(Value deep, Value shallow , const PackedSfenValue& psv)
 
 	const double q = winning_percentage(shallow);
 	const double p = winning_percentage(deep);
-	const double dq = delta_winning_percentage(shallow);
 
 	// 期待勝率を勝っていれば1、負けていれば 0、引き分けなら0.5として補正項として用いる。
 	// game_result = 1,0,-1なので1足して2で割る。
@@ -1156,9 +1141,7 @@ double calc_grad(Value deep, Value shallow , const PackedSfenValue& psv)
 
 	// 実際の勝率を補正項として使っている。
 	// これがelmo(WCSC27)のアイデアで、現代のオーパーツ。
-	const double pp = (q - p) * dq / q / (1.0 - q);
-	const double tt = (q - t) * dq / q / (1.0 - q);
-	const double grad = lambda * pp + (1.0 - lambda) * tt;
+	const double grad = lambda * (q - p) + (1.0 - lambda) * (q - t);
 
 	return grad;
 }

From f5cc77bc7c62df8efd905f46e5564741f793c5d3 Mon Sep 17 00:00:00 2001
From: FireFather <firefather@telenet.be>
Date: Sun, 28 Jun 2020 23:35:48 +0200
Subject: [PATCH 074/583] EOL

add eol at eof
---
 src/eval/evaluate_common.h                              | 2 +-
 src/eval/evaluate_mir_inv_tools.h                       | 2 +-
 src/eval/nnue/architectures/halfkp-cr-ep_256x2-32-32.h  | 2 +-
 src/eval/nnue/architectures/k-p-cr-ep_256x2-32-32.h     | 2 +-
 src/eval/nnue/architectures/k-p-cr_256x2-32-32.h        | 2 +-
 src/eval/nnue/architectures/k-p_256x2-32-32.h           | 2 +-
 src/eval/nnue/evaluate_nnue.h                           | 2 +-
 src/eval/nnue/evaluate_nnue_learner.h                   | 2 +-
 src/eval/nnue/features/castling_right.h                 | 2 +-
 src/eval/nnue/features/enpassant.h                      | 2 +-
 src/eval/nnue/features/feature_set.h                    | 2 +-
 src/eval/nnue/features/features_common.h                | 2 +-
 src/eval/nnue/features/half_kp.h                        | 2 +-
 src/eval/nnue/features/half_relative_kp.h               | 2 +-
 src/eval/nnue/features/index_list.h                     | 2 +-
 src/eval/nnue/features/k.h                              | 2 +-
 src/eval/nnue/features/p.h                              | 2 +-
 src/eval/nnue/layers/affine_transform.h                 | 2 +-
 src/eval/nnue/layers/clipped_relu.h                     | 2 +-
 src/eval/nnue/layers/input_slice.h                      | 2 +-
 src/eval/nnue/layers/sum.h                              | 2 +-
 src/eval/nnue/nnue_accumulator.h                        | 2 +-
 src/eval/nnue/nnue_architecture.h                       | 2 +-
 src/eval/nnue/nnue_feature_transformer.h                | 2 +-
 src/eval/nnue/trainer/features/factorizer.h             | 2 +-
 src/eval/nnue/trainer/features/factorizer_feature_set.h | 2 +-
 src/eval/nnue/trainer/features/factorizer_half_kp.h     | 2 +-
 src/eval/nnue/trainer/trainer.h                         | 2 +-
 src/eval/nnue/trainer/trainer_clipped_relu.h            | 2 +-
 src/eval/nnue/trainer/trainer_feature_transformer.h     | 2 +-
 src/eval/nnue/trainer/trainer_sum.h                     | 2 +-
 src/learn/half_float.h                                  | 2 +-
 src/learn/learn.h                                       | 2 +-
 src/learn/learner.cpp                                   | 2 +-
 src/learn/learning_tools.cpp                            | 2 +-
 src/learn/learning_tools.h                              | 2 +-
 src/learn/multi_think.cpp                               | 2 +-
 src/learn/multi_think.h                                 | 2 +-
 38 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/src/eval/evaluate_common.h b/src/eval/evaluate_common.h
index 84a96bee..b043f2e1 100644
--- a/src/eval/evaluate_common.h
+++ b/src/eval/evaluate_common.h
@@ -79,4 +79,4 @@ namespace Eval
 
 #endif  // defined(EVAL_NNUE) || defined(EVAL_LEARN)
 
-#endif // _EVALUATE_KPPT_COMMON_H_
\ No newline at end of file
+#endif // _EVALUATE_KPPT_COMMON_H_
diff --git a/src/eval/evaluate_mir_inv_tools.h b/src/eval/evaluate_mir_inv_tools.h
index fa4e70ac..826164bf 100644
--- a/src/eval/evaluate_mir_inv_tools.h
+++ b/src/eval/evaluate_mir_inv_tools.h
@@ -44,4 +44,4 @@ namespace Eval
 
 #endif  // defined(EVAL_NNUE) || defined(EVAL_LEARN)
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/eval/nnue/architectures/halfkp-cr-ep_256x2-32-32.h b/src/eval/nnue/architectures/halfkp-cr-ep_256x2-32-32.h
index 1bb9609e..37b155d5 100644
--- a/src/eval/nnue/architectures/halfkp-cr-ep_256x2-32-32.h
+++ b/src/eval/nnue/architectures/halfkp-cr-ep_256x2-32-32.h
@@ -39,4 +39,4 @@ namespace Eval {
   }  // namespace NNUE
 
 }  // namespace Eval
-#endif // HALFKP_CR_EP_256X2_32_32_H
\ No newline at end of file
+#endif // HALFKP_CR_EP_256X2_32_32_H
diff --git a/src/eval/nnue/architectures/k-p-cr-ep_256x2-32-32.h b/src/eval/nnue/architectures/k-p-cr-ep_256x2-32-32.h
index 72531fd4..e178b57b 100644
--- a/src/eval/nnue/architectures/k-p-cr-ep_256x2-32-32.h
+++ b/src/eval/nnue/architectures/k-p-cr-ep_256x2-32-32.h
@@ -39,4 +39,4 @@ namespace Eval {
   }  // namespace NNUE
 
 }  // namespace Eval
-#endif // K_P_CR_EP_256X2_32_32_H
\ No newline at end of file
+#endif // K_P_CR_EP_256X2_32_32_H
diff --git a/src/eval/nnue/architectures/k-p-cr_256x2-32-32.h b/src/eval/nnue/architectures/k-p-cr_256x2-32-32.h
index b4161880..d3c187c0 100644
--- a/src/eval/nnue/architectures/k-p-cr_256x2-32-32.h
+++ b/src/eval/nnue/architectures/k-p-cr_256x2-32-32.h
@@ -38,4 +38,4 @@ namespace Eval {
   }  // namespace NNUE
 
 }  // namespace Eval
-#endif // K_P_CR_256X2_32_32_H
\ No newline at end of file
+#endif // K_P_CR_256X2_32_32_H
diff --git a/src/eval/nnue/architectures/k-p_256x2-32-32.h b/src/eval/nnue/architectures/k-p_256x2-32-32.h
index 9fc9b2a1..00b14d47 100644
--- a/src/eval/nnue/architectures/k-p_256x2-32-32.h
+++ b/src/eval/nnue/architectures/k-p_256x2-32-32.h
@@ -35,4 +35,4 @@ using Network = Layers::OutputLayer;
 }  // namespace NNUE
 
 }  // namespace Eval
-#endif // K_P_256X2_32_32_H
\ No newline at end of file
+#endif // K_P_256X2_32_32_H
diff --git a/src/eval/nnue/evaluate_nnue.h b/src/eval/nnue/evaluate_nnue.h
index 7f8f700a..ee498f51 100644
--- a/src/eval/nnue/evaluate_nnue.h
+++ b/src/eval/nnue/evaluate_nnue.h
@@ -61,4 +61,4 @@ bool WriteParameters(std::ostream& stream);
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/eval/nnue/evaluate_nnue_learner.h b/src/eval/nnue/evaluate_nnue_learner.h
index 932a5f8c..ace66524 100644
--- a/src/eval/nnue/evaluate_nnue_learner.h
+++ b/src/eval/nnue/evaluate_nnue_learner.h
@@ -43,4 +43,4 @@ void CheckHealth();
 
 #endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/eval/nnue/features/castling_right.h b/src/eval/nnue/features/castling_right.h
index f585b1d7..709d4688 100644
--- a/src/eval/nnue/features/castling_right.h
+++ b/src/eval/nnue/features/castling_right.h
@@ -45,4 +45,4 @@ namespace Eval {
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/eval/nnue/features/enpassant.h b/src/eval/nnue/features/enpassant.h
index c0ac8234..51880bb4 100644
--- a/src/eval/nnue/features/enpassant.h
+++ b/src/eval/nnue/features/enpassant.h
@@ -45,4 +45,4 @@ namespace Eval {
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/eval/nnue/features/feature_set.h b/src/eval/nnue/features/feature_set.h
index 6190db04..0430ebfe 100644
--- a/src/eval/nnue/features/feature_set.h
+++ b/src/eval/nnue/features/feature_set.h
@@ -246,4 +246,4 @@ class FeatureSet<FeatureType> : public FeatureSetBase<FeatureSet<FeatureType>> {
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/eval/nnue/features/features_common.h b/src/eval/nnue/features/features_common.h
index 0031d37b..8d2ca4a2 100644
--- a/src/eval/nnue/features/features_common.h
+++ b/src/eval/nnue/features/features_common.h
@@ -44,4 +44,4 @@ enum class Side {
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/eval/nnue/features/half_kp.h b/src/eval/nnue/features/half_kp.h
index 65ea46f1..cc9cd660 100644
--- a/src/eval/nnue/features/half_kp.h
+++ b/src/eval/nnue/features/half_kp.h
@@ -59,4 +59,4 @@ class HalfKP {
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/eval/nnue/features/half_relative_kp.h b/src/eval/nnue/features/half_relative_kp.h
index f6ca5cc0..2f967745 100644
--- a/src/eval/nnue/features/half_relative_kp.h
+++ b/src/eval/nnue/features/half_relative_kp.h
@@ -65,4 +65,4 @@ class HalfRelativeKP {
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/eval/nnue/features/index_list.h b/src/eval/nnue/features/index_list.h
index 90317b4c..39e66a09 100644
--- a/src/eval/nnue/features/index_list.h
+++ b/src/eval/nnue/features/index_list.h
@@ -52,4 +52,4 @@ class IndexList
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/eval/nnue/features/k.h b/src/eval/nnue/features/k.h
index 0930c160..d7a6f4aa 100644
--- a/src/eval/nnue/features/k.h
+++ b/src/eval/nnue/features/k.h
@@ -45,4 +45,4 @@ class K {
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/eval/nnue/features/p.h b/src/eval/nnue/features/p.h
index ded678a5..27a944fa 100644
--- a/src/eval/nnue/features/p.h
+++ b/src/eval/nnue/features/p.h
@@ -45,4 +45,4 @@ class P {
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/eval/nnue/layers/affine_transform.h b/src/eval/nnue/layers/affine_transform.h
index 99dae0fe..c06af1a0 100644
--- a/src/eval/nnue/layers/affine_transform.h
+++ b/src/eval/nnue/layers/affine_transform.h
@@ -175,4 +175,4 @@ class AffineTransform {
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/eval/nnue/layers/clipped_relu.h b/src/eval/nnue/layers/clipped_relu.h
index 1b7e8fc1..7c5c1f75 100644
--- a/src/eval/nnue/layers/clipped_relu.h
+++ b/src/eval/nnue/layers/clipped_relu.h
@@ -165,4 +165,4 @@ class ClippedReLU {
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/eval/nnue/layers/input_slice.h b/src/eval/nnue/layers/input_slice.h
index 0497e769..ec7627d2 100644
--- a/src/eval/nnue/layers/input_slice.h
+++ b/src/eval/nnue/layers/input_slice.h
@@ -71,4 +71,4 @@ class InputSlice {
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/eval/nnue/layers/sum.h b/src/eval/nnue/layers/sum.h
index c64852a1..d8c7bf93 100644
--- a/src/eval/nnue/layers/sum.h
+++ b/src/eval/nnue/layers/sum.h
@@ -160,4 +160,4 @@ class Sum<PreviousLayer> {
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/eval/nnue/nnue_accumulator.h b/src/eval/nnue/nnue_accumulator.h
index e480526b..07f4f183 100644
--- a/src/eval/nnue/nnue_accumulator.h
+++ b/src/eval/nnue/nnue_accumulator.h
@@ -27,4 +27,4 @@ struct alignas(32) Accumulator {
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/eval/nnue/nnue_architecture.h b/src/eval/nnue/nnue_architecture.h
index aa4e8c7f..977c67fc 100644
--- a/src/eval/nnue/nnue_architecture.h
+++ b/src/eval/nnue/nnue_architecture.h
@@ -29,4 +29,4 @@ constexpr auto kRefreshTriggers = RawFeatures::kRefreshTriggers;
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/eval/nnue/nnue_feature_transformer.h b/src/eval/nnue/nnue_feature_transformer.h
index 039a0b98..27bbb562 100644
--- a/src/eval/nnue/nnue_feature_transformer.h
+++ b/src/eval/nnue/nnue_feature_transformer.h
@@ -344,4 +344,4 @@ class FeatureTransformer {
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/eval/nnue/trainer/features/factorizer.h b/src/eval/nnue/trainer/features/factorizer.h
index dea95370..148ee8ec 100644
--- a/src/eval/nnue/trainer/features/factorizer.h
+++ b/src/eval/nnue/trainer/features/factorizer.h
@@ -107,4 +107,4 @@ constexpr std::size_t GetArrayLength(const T (&/*array*/)[N]) {
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/eval/nnue/trainer/features/factorizer_feature_set.h b/src/eval/nnue/trainer/features/factorizer_feature_set.h
index 0afe7a48..af524719 100644
--- a/src/eval/nnue/trainer/features/factorizer_feature_set.h
+++ b/src/eval/nnue/trainer/features/factorizer_feature_set.h
@@ -101,4 +101,4 @@ public:
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/eval/nnue/trainer/features/factorizer_half_kp.h b/src/eval/nnue/trainer/features/factorizer_half_kp.h
index 6ce5854a..a5363771 100644
--- a/src/eval/nnue/trainer/features/factorizer_half_kp.h
+++ b/src/eval/nnue/trainer/features/factorizer_half_kp.h
@@ -100,4 +100,4 @@ constexpr FeatureProperties Factorizer<HalfKP<AssociatedKing>>::kProperties[];
 
 #endif  // defined(EVAL_NNUE)
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/eval/nnue/trainer/trainer.h b/src/eval/nnue/trainer/trainer.h
index f50ce092..49400bbe 100644
--- a/src/eval/nnue/trainer/trainer.h
+++ b/src/eval/nnue/trainer/trainer.h
@@ -122,4 +122,4 @@ std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments) {
 
 #endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/eval/nnue/trainer/trainer_clipped_relu.h b/src/eval/nnue/trainer/trainer_clipped_relu.h
index 7fe1913d..566ed777 100644
--- a/src/eval/nnue/trainer/trainer_clipped_relu.h
+++ b/src/eval/nnue/trainer/trainer_clipped_relu.h
@@ -139,4 +139,4 @@ class Trainer<Layers::ClippedReLU<PreviousLayer>> {
 
 #endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/eval/nnue/trainer/trainer_feature_transformer.h b/src/eval/nnue/trainer/trainer_feature_transformer.h
index eb14d98b..0139d534 100644
--- a/src/eval/nnue/trainer/trainer_feature_transformer.h
+++ b/src/eval/nnue/trainer/trainer_feature_transformer.h
@@ -374,4 +374,4 @@ class Trainer<FeatureTransformer> {
 
 #endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/eval/nnue/trainer/trainer_sum.h b/src/eval/nnue/trainer/trainer_sum.h
index bae3edd5..2efdff67 100644
--- a/src/eval/nnue/trainer/trainer_sum.h
+++ b/src/eval/nnue/trainer/trainer_sum.h
@@ -187,4 +187,4 @@ class Trainer<Layers::Sum<PreviousLayer>> {
 
 #endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/learn/half_float.h b/src/learn/half_float.h
index d5c2f83c..30b3e482 100644
--- a/src/learn/half_float.h
+++ b/src/learn/half_float.h
@@ -130,4 +130,4 @@ namespace HalfFloat
 
 }
 
-#endif // __HALF_FLOAT_H__
\ No newline at end of file
+#endif // __HALF_FLOAT_H__
diff --git a/src/learn/learn.h b/src/learn/learn.h
index ab53e046..eda2bb32 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -234,4 +234,4 @@ namespace Learner
 
 #endif
 
-#endif // ifndef _LEARN_H_
\ No newline at end of file
+#endif // ifndef _LEARN_H_
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index ea7c4d7e..9229770a 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -3185,4 +3185,4 @@ void learn(Position&, istringstream& is)
 #endif
 
 
-#endif // EVAL_LEARN
\ No newline at end of file
+#endif // EVAL_LEARN
diff --git a/src/learn/learning_tools.cpp b/src/learn/learning_tools.cpp
index e3bd6f68..4bcecab8 100644
--- a/src/learn/learning_tools.cpp
+++ b/src/learn/learning_tools.cpp
@@ -253,4 +253,4 @@ namespace EvalLearningTools
 	}
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/learn/learning_tools.h b/src/learn/learning_tools.h
index f95ea2d9..a1de03dd 100644
--- a/src/learn/learning_tools.h
+++ b/src/learn/learning_tools.h
@@ -1031,4 +1031,4 @@ namespace EvalLearningTools
 }
 
 #endif // defined (EVAL_LEARN)
-#endif
\ No newline at end of file
+#endif
diff --git a/src/learn/multi_think.cpp b/src/learn/multi_think.cpp
index 34f5373d..d511c277 100644
--- a/src/learn/multi_think.cpp
+++ b/src/learn/multi_think.cpp
@@ -120,4 +120,4 @@ void MultiThink::go_think()
 }
 
 
-#endif // defined(EVAL_LEARN)
\ No newline at end of file
+#endif // defined(EVAL_LEARN)
diff --git a/src/learn/multi_think.h b/src/learn/multi_think.h
index a2ef8cde..55edb049 100644
--- a/src/learn/multi_think.h
+++ b/src/learn/multi_think.h
@@ -148,4 +148,4 @@ protected:
 
 #endif // defined(EVAL_LEARN) && defined(YANEURAOU_2018_OTAFUKU_ENGINE)
 
-#endif
\ No newline at end of file
+#endif

From 8f31d74cf64a3daa24eac1c3ae659a572f04667d Mon Sep 17 00:00:00 2001
From: FireFather <firefather@telenet.be>
Date: Mon, 29 Jun 2020 17:31:35 +0200
Subject: [PATCH 075/583] More comment translation

including 11 files in /src
---
 src/evaluate.cpp  |  22 +++----
 src/evaluate.h    | 112 +++++++++++++++++-----------------
 src/misc.cpp      |  52 ++++++++--------
 src/misc.h        |  86 +++++++++++++-------------
 src/position.cpp  |  20 +++---
 src/position.h    |  32 +++++-----
 src/search.cpp    | 151 +++++++++++++++++++++++-----------------------
 src/types.h       |  16 ++---
 src/uci.cpp       |  62 +++++++++----------
 src/uci.h         |  11 ++--
 src/ucioption.cpp |  22 +++----
 11 files changed, 293 insertions(+), 293 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 83dfaadd..07bed614 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -945,7 +945,7 @@ ExtBonaPiece kpp_board_index[PIECE_NB] = {
     { f_king, e_king },
     { BONA_PIECE_ZERO, BONA_PIECE_ZERO },
 
-    // ��肩�猩���ꍇ�Bf��e������ւ��B
+    // When viewed from behind. f and e are exchanged.
     { BONA_PIECE_ZERO, BONA_PIECE_ZERO },
     { e_pawn, f_pawn },
     { e_knight, f_knight },
@@ -953,11 +953,11 @@ ExtBonaPiece kpp_board_index[PIECE_NB] = {
     { e_rook, f_rook },
     { e_queen, f_queen },
     { e_king, f_king },
-    { BONA_PIECE_ZERO, BONA_PIECE_ZERO }, // ���̐���͂Ȃ�
+    { BONA_PIECE_ZERO, BONA_PIECE_ZERO }, // no money
 };
 
-// �����ŕێ����Ă���pieceListFw[]��������BonaPiece�ł��邩����������B
-// �� : �f�o�b�O�p�B�x���B
+// Check whether the pieceListFw[] held internally is a correct BonaPiece.
+// Note: For debugging. slow.
 bool EvalList::is_valid(const Position& pos)
 {
   std::set<PieceNumber> piece_numbers;
@@ -973,28 +973,28 @@ bool EvalList::is_valid(const Position& pos)
   for (int i = 0; i < length(); ++i)
   {
     BonaPiece fw = pieceListFw[i];
-    // ����fw���{���ɑ��݂��邩��Position�N���X�̂ق��ɒ��ׂɍs���B
+    // Go to the Position class to see if this fw really exists.
 
     if (fw == Eval::BONA_PIECE_ZERO) {
       continue;
     }
 
-    // �͈͊O
+    // Out of range
     if (!(0 <= fw && fw < fe_end))
       return false;
 
-    // �Տ�̋�Ȃ̂ł��̋�{���ɑ��݂��邩���ׂɂ����B
+    // Since it is a piece on the board, I will check if this piece really exists.
     for (Piece pc = NO_PIECE; pc < PIECE_NB; ++pc)
     {
       auto pt = type_of(pc);
-      if (pt == NO_PIECE_TYPE || pt == 7) // ���݂��Ȃ���
+      if (pt == NO_PIECE_TYPE || pt == 7) // non-existing piece
         continue;
 
-      // ��pc��BonaPiece�̊J�n�ԍ�
+      // BonaPiece start number of piece pc
       auto s = BonaPiece(kpp_board_index[pc].fw);
       if (s <= fw && fw < s + SQUARE_NB)
       {
-        // ���������̂ł��̋sq�̒n�_�ɂ��邩�𒲂ׂ�B
+        // Since it was found, check if this piece is at sq.
         Square sq = (Square)(fw - s);
         Piece pc2 = pos.piece_on(sq);
 
@@ -1004,7 +1004,7 @@ bool EvalList::is_valid(const Position& pos)
         goto Found;
       }
     }
-    // ���̂����݂��Ȃ���ł�����..
+    // It was a piece that did not exist for some reason..
     return false;
   Found:;
   }
diff --git a/src/evaluate.h b/src/evaluate.h
index a9e6a563..0301f455 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -38,39 +38,39 @@ void evaluate_with_no_return(const Position& pos);
 Value compute_eval(const Position& pos);
 
 #if defined(EVAL_NNUE) || defined(EVAL_LEARN)
-// �]���֐��t�@�C����ǂݍ��ށB
-// ����́A"is_ready"�R�}���h�̉�������1�x�����Ăяo�����B2�x�Ăяo�����Ƃ͑z�肵�Ă��Ȃ��B
-// (�������AEvalDir(�]���֐��t�H���_)���ύX�ɂȂ������ƁAisready���ēx�����Ă�����ǂ݂Ȃ����B)
+// Read the evaluation function file.
+// This is only called once in response to the "is_ready" command. It is not supposed to be called twice.
+// (However, if isready is sent again after EvalDir (evaluation function folder) has been changed, read it again.)
 void load_eval();
 
-static uint64_t calc_check_sum() { return 0; }
+static uint64_t calc_check_sum() {return 0;}
 
 static void print_softname(uint64_t check_sum) {}
 
-// --- �]���֐��Ŏg���萔 KPP(�ʂƔC��2��)��P�ɑ�������enum
+// --- enum corresponding to P of constant KPP (ball and arbitrary 2 pieces) used in evaluation function
 
-// (�]���֐��̎����̂Ƃ��ɂ́ABonaPiece�͎��R�ɒ�`�������̂ł����ł͒�`���Ȃ��B)
+// (BonaPiece wants to define freely in experiment of evaluation function, so I don't define it here.)
 
 
-// Bonanza��KKP/KPP�ƌ����Ƃ���P(Piece)��\������^�B
-// �� KPP�����߂�Ƃ��ɁA39�̒n�_�̕��̂悤�ɁA���~���ɑ΂��Ĉ�ӂȔԍ����K�v�ƂȂ�B
+// A type that represents P(Piece) when calling KKP/KPP in Bonanza.
+// When you ask for �� KPP, you need a unique number for each box �~ piece type, like the step at 39 points.
 enum BonaPiece : int32_t
 {
-	// f = friend(�����)�̈Ӗ��Be = enemy(�����)�̈Ӗ�
+	// Meaning of f = friend (��first move). Meaning of e = enemy (��rear)
 
-	// ���������̎��̒l
+	// Value when uninitialized
 	BONA_PIECE_NOT_INIT = -1,
 
-	// �����ȋ�B����̂Ƃ��Ȃǂ́A�s�v�ȋ�������Ɉړ�������B
+	// Invalid piece. When you drop a piece, move unnecessary pieces here.
 	BONA_PIECE_ZERO = 0,
 
 	fe_hand_end = BONA_PIECE_ZERO + 1,
 
-    // Bonanza�̂悤�ɔՏ�̂��肦�Ȃ����̕��⍁�̔ԍ����l�߂Ȃ��B
-	// ���R1) �w�K�̂Ƃ��ɑ���PP��1�i�ڂɍ�������Ƃ��������āA������t�ϊ��ɂ����Đ������\������̂�����B
-	// ���R2) �c�^Bitboard����Square����̕ϊ��ɍ���B
+	// Don't pack the numbers of unrealistic walks and incense on the board like Bonanza.
+	// Reason 1) When learning, there are times when the incense is on the first stage in relative PP, and it is difficult to display it correctly in the inverse transformation.
+	// Reason 2) It is difficult to convert from Square with vertical Bitboard.
 
-	// --- �Տ�̋�
+	// --- Pieces on the board
 	f_pawn = fe_hand_end,
 	e_pawn = f_pawn + SQUARE_NB,
 	f_knight = e_pawn + SQUARE_NB,
@@ -84,7 +84,7 @@ enum BonaPiece : int32_t
 	fe_end = e_queen + SQUARE_NB,
 	f_king = fe_end,
 	e_king = f_king + SQUARE_NB,
-	fe_end2 = e_king + SQUARE_NB, // �ʂ��܂߂������̔ԍ��B
+	fe_end2 = e_king + SQUARE_NB, // Last number including balls.
 };
 
 #define ENABLE_INCR_OPERATORS_ON(T)                                \
@@ -95,8 +95,8 @@ ENABLE_INCR_OPERATORS_ON(BonaPiece)
 
 #undef ENABLE_INCR_OPERATORS_ON
 
-// BonaPiece����肩�猩���Ƃ�(����39�̕�����肩�猩��ƌ���71�̕�)�̔ԍ��Ƃ�
-// �y�A�ɂ������̂�ExtBonaPiece�^�ƌĂԂ��Ƃɂ���B
+// The number when you look at BonaPiece from the back (the number of steps from the previous 39 to the number 71 from the back)
+// Let's call the paired one the ExtBonaPiece type.
 union ExtBonaPiece
 {
 	struct {
@@ -109,28 +109,28 @@ union ExtBonaPiece
 	ExtBonaPiece(BonaPiece fw_, BonaPiece fb_) : fw(fw_), fb(fb_) {}
 };
 
-// �����̎w����ɂ���Ăǂ�����ǂ��Ɉړ������̂��̏��B
-// ���ExtBonaPiece�\���ł���Ƃ���B
+// Information about where the piece has moved from where to by this move.
+// Assume the piece is an ExtBonaPiece expression.
 struct ChangedBonaPiece
 {
 	ExtBonaPiece old_piece;
 	ExtBonaPiece new_piece;
 };
 
-// KPP�e�[�u���̔Տ�̋�pc�ɑΉ�����BonaPiece�����߂邽�߂̔z��B
-// ��)
-// BonaPiece fb = kpp_board_index[pc].fb + sq; // ��肩�猩��sq�ɂ���pc�ɑΉ�����BonaPiece
-// BonaPiece fw = kpp_board_index[pc].fw + sq; // ��肩�猩��sq�ɂ���pc�ɑΉ�����BonaPiece
+// An array for finding the BonaPiece corresponding to the piece pc on the board of the KPP table.
+// example)
+// BonaPiece fb = kpp_board_index[pc].fb + sq; // BonaPiece corresponding to pc in sq seen from the front
+// BonaPiece fw = kpp_board_index[pc].fw + sq; // BonaPiece corresponding to pc in sq seen from behind
 extern ExtBonaPiece kpp_board_index[PIECE_NB];
 
-// �]���֐��ŗp�����X�g�B�ǂ̋�(PieceNumber)���ǂ��ɂ���̂�(BonaPiece)��ێ����Ă���\����
+// List of pieces used in the evaluation function. A structure holding which piece (PieceNumber) is where (BonaPiece)
 struct EvalList
 {
-	// �]���֐�(FV38�^)�ŗp�����ԍ��̃��X�g
+	// List of frame numbers used in evaluation function (FV38 type)
 	BonaPiece* piece_list_fw() const { return const_cast<BonaPiece*>(pieceListFw); }
 	BonaPiece* piece_list_fb() const { return const_cast<BonaPiece*>(pieceListFb); }
 
-	// �w�肳�ꂽpiece_no�̋��ExtBonaPiece�^�ɕϊ����ĕԂ��B
+	// Convert the specified piece_no piece to ExtBonaPiece type and return it.
 	ExtBonaPiece bona_piece(PieceNumber piece_no) const
 	{
 		ExtBonaPiece bp;
@@ -139,36 +139,36 @@ struct EvalList
 		return bp;
 	}
 
-	// �Տ��sq�̏���piece_no��pc�̋��z�u����
+	// Place the piece_no pc piece in the sq box on the board
 	void put_piece(PieceNumber piece_no, Square sq, Piece pc) {
 		set_piece_on_board(piece_no, BonaPiece(kpp_board_index[pc].fw + sq), BonaPiece(kpp_board_index[pc].fb + Inv(sq)), sq);
 	}
 
-	// �Տ�̂��鏡sq�ɑΉ�����PieceNumber��Ԃ��B
+	// Returns the PieceNumber corresponding to a box on the board.
 	PieceNumber piece_no_of_board(Square sq) const { return piece_no_list_board[sq]; }
 
-	// pieceList������������B
-	// ����ɑΉ������鎞�̂��߂ɁA���g�p�̋�̒l��BONA_PIECE_ZERO�ɂ��Ă����B
-	// �ʏ�̕]���֐�������̕]���֐��Ƃ��ė��p�ł���B
-	// piece_no_list�̂ق��̓f�o�b�O������悤��PIECE_NUMBER_NB�ŏ������B
+	// Initialize the pieceList.
+	// Set the value of unused pieces to BONA_PIECE_ZERO in case you want to deal with dropped pieces.
+	// A normal evaluation function can be used as an evaluation function for missing frames.
+	// piece_no_list is initialized with PIECE_NUMBER_NB to facilitate debugging.
 	void clear()
 	{
 
-		for (auto& p : pieceListFw)
+		for (auto& p: pieceListFw)
 			p = BONA_PIECE_ZERO;
 
-		for (auto& p : pieceListFb)
+		for (auto& p: pieceListFb)
 			p = BONA_PIECE_ZERO;
 
-		for (auto& v : piece_no_list_board)
+		for (auto& v :piece_no_list_board)
 			v = PIECE_NUMBER_NB;
 	}
 
-	// �����ŕێ����Ă���pieceListFw[]��������BonaPiece�ł��邩����������B
-	// �� : �f�o�b�O�p�B�x���B
+	// Check whether the pieceListFw[] held internally is a correct BonaPiece.
+	// Note: For debugging. slow.
 	bool is_valid(const Position& pos);
 
-	// �Տ�sq�ɂ���piece_no�̋��BonaPiece��fb,fw�ł��邱�Ƃ�ݒ肷��B
+	// Set that the BonaPiece of the piece_no piece on the board sq is fb,fw.
 	inline void set_piece_on_board(PieceNumber piece_no, BonaPiece fw, BonaPiece fb, Square sq)
 	{
 		assert(is_ok(piece_no));
@@ -177,21 +177,21 @@ struct EvalList
 		piece_no_list_board[sq] = piece_no;
 	}
 
-	// ��X�g�B��ԍ�(PieceNumber)�����̋�ǂ��ɂ���̂�(BonaPiece)�������BFV38�Ȃǂŗp����B
+	// Piece list. Piece Number Shows how many pieces are in place (Bona Piece). Used in FV38 etc.
 
-	// ��X�g�̒���
-  // 38�Œ�
+	// Length of piece list
+  // 38 fixed
 public:
 	int length() const { return PIECE_NUMBER_KING; }
 
-	// VPGATHERDD���g���s���A4�̔{���łȂ���΂Ȃ�Ȃ��B
-	// �܂��AKPPT�^�]���֐��Ȃǂ́A39,40�Ԗڂ̗v�f���[���ł��邱�Ƃ�O��Ƃ���
-	// �A�N�Z�X�����Ă���ӏ�������̂Œ��ӂ��邱�ƁB
+	// Must be a multiple of 4 to use VPGATHERDD.
+	// In addition, the KPPT type evaluation function, etc. is based on the assumption that the 39th and 40th elements are zero.
+	// Please note that there is a part that is accessed.
 	static const int MAX_LENGTH = 32;
 
-  // �Տ�̋�ɑ΂��āA���̋�ԍ�(PieceNumber)��ێ����Ă���z��
-  // �ʂ�SQUARE_NB�Ɉړ����Ă���Ƃ��p��+1�܂ŕێ����Ă������A
-  // SQUARE_NB�̋ʂ��ړ������Ȃ��̂ŁA���̒l���g�����Ƃ͂Ȃ��͂��B
+  // An array that holds the piece number (PieceNumber) for the pieces on the board
+  // Hold up to +1 for when the ball is moving to SQUARE_NB,
+  // SQUARE_NB balls are not moved, so this value should never be used.
   PieceNumber piece_no_list_board[SQUARE_NB_PLUS1];
 private:
 
@@ -199,20 +199,20 @@ private:
 	BonaPiece pieceListFb[MAX_LENGTH];
 };
 
-// �]���l�̍����v�Z�̊Ǘ��p
-// �O�̋ǖʂ���ړ�������ԍ����Ǘ����邽�߂̍\����
-// ������́A�ő��2�B
+// For management of evaluation value difference calculation
+// A structure for managing the number of pieces that have moved from the previous stage
+// Up to 2 moving pieces.
 struct DirtyPiece
 {
-	// ���̋�ԍ��̋�����牽�ɕς�����̂�
+	// What changed from the piece with that piece number
 	Eval::ChangedBonaPiece changed_piece[2];
 
-	// dirty�ɂȂ�����ԍ�
+	// The number of dirty pieces
 	PieceNumber pieceNo[2];
 
-	// dirty�ɂȂ������B
-	// null move����0�Ƃ������Ƃ����肤��B
-	// ������Ǝ�����Ƃōő��2�B
+	// The number of dirty files.
+	// It can be 0 for null move.
+	// Up to 2 moving pieces and taken pieces.
 	int dirty_num;
 
 };
diff --git a/src/misc.cpp b/src/misc.cpp
index 9d14cc1f..1d6bbb4f 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -527,11 +527,11 @@ void bindThisThread(size_t idx) {
 
 } // namespace WinProcGroup
 
-// 現在時刻を文字列化したもを返す。(評価関数の学習時などに用いる)
+// Returns a string that represents the current time. (Used when learning evaluation functions)
 std::string now_string()
 {
-  // std::ctime(), localtime()を使うと、MSVCでセキュアでないという警告が出る。
-  // C++標準的にはそんなことないはずなのだが…。
+  // Using std::ctime(), localtime() gives a warning that MSVC is not secure.
+  // This shouldn't happen in the C++ standard, but...
 
 #if defined(_MSC_VER)
   // C4996 : 'ctime' : This function or variable may be unsafe.Consider using ctime_s instead.
@@ -542,7 +542,7 @@ std::string now_string()
   auto tp = std::chrono::system_clock::to_time_t(now);
   auto result = string(std::ctime(&tp));
 
-  // 末尾に改行コードが含まれているならこれを除去する
+  // remove line endings if they are included at the end
   while (*result.rbegin() == '\n' || (*result.rbegin() == '\r'))
     result.pop_back();
   return result;
@@ -572,31 +572,31 @@ int read_file_to_memory(std::string filename, std::function<void* (uint64_t)> ca
 
   fs.seekg(0, fstream::end);
   uint64_t eofPos = (uint64_t)fs.tellg();
-  fs.clear(); // これをしないと次のseekに失敗することがある。
+  fs.clear(); // Otherwise the next seek may fail.
   fs.seekg(0, fstream::beg);
   uint64_t begPos = (uint64_t)fs.tellg();
   uint64_t file_size = eofPos - begPos;
   //std::cout << "filename = " << filename << " , file_size = " << file_size << endl;
 
-  // ファイルサイズがわかったのでcallback_funcを呼び出してこの分のバッファを確保してもらい、
-  // そのポインターをもらう。
+  // I know the file size, so call callback_func to get a buffer for this,
+  // Get the pointer.
   void* ptr = callback_func(file_size);
 
-  // バッファが確保できなかった場合や、想定していたファイルサイズと異なった場合は、
-  // nullptrを返すことになっている。このとき、読み込みを中断し、エラーリターンする。
+  // If the buffer could not be secured, or if the file size is different from the expected file size,
+  // It is supposed to return nullptr. At this time, reading is interrupted and an error is returned.
   if (ptr == nullptr)
     return 2;
 
-  // 細切れに読み込む
+  // read in pieces
 
-  const uint64_t block_size = 1024 * 1024 * 1024; // 1回のreadで読み込む要素の数(1GB)
+  const uint64_t block_size = 1024 * 1024 * 1024; // number of elements to read in one read (1GB)
   for (uint64_t pos = 0; pos < file_size; pos += block_size)
   {
-    // 今回読み込むサイズ
+    // size to read this time
     uint64_t read_size = (pos + block_size < file_size) ? block_size : (file_size - pos);
     fs.read((char*)ptr + pos, read_size);
 
-    // ファイルの途中で読み込みエラーに至った。
+    // Read error occurred in the middle of the file.
     if (fs.fail())
       return 2;
 
@@ -613,10 +613,10 @@ int write_memory_to_file(std::string filename, void* ptr, uint64_t size)
   if (fs.fail())
     return 1;
 
-  const uint64_t block_size = 1024 * 1024 * 1024; // 1回のwriteで書き出す要素の数(1GB)
+  const uint64_t block_size = 1024 * 1024 * 1024; // number of elements to write in one write (1GB)
   for (uint64_t pos = 0; pos < size; pos += block_size)
   {
-    // 今回書き出すメモリサイズ
+    // Memory size to write this time
     uint64_t write_size = (pos + block_size < size) ? block_size : (size - pos);
     fs.write((char*)ptr + pos, write_size);
     //cout << ".";
@@ -629,17 +629,17 @@ int write_memory_to_file(std::string filename, void* ptr, uint64_t size)
 //     mkdir wrapper
 // ----------------------------
 
-// カレントフォルダ相対で指定する。成功すれば0、失敗すれば非0が返る。
-// フォルダを作成する。日本語は使っていないものとする。
-// どうもmsys2環境下のgccだと_wmkdir()だとフォルダの作成に失敗する。原因不明。
-// 仕方ないので_mkdir()を用いる。
+// Specify relative to the current folder. Returns 0 on success, non-zero on failure.
+// Create a folder. Japanese is not used.
+// In case of gcc under msys2 environment, folder creation fails with _wmkdir(). Cause unknown.
+// Use _mkdir() because there is no help for it.
 
 #if defined(_WIN32)
-// Windows用
+// for Windows
 
 #if defined(_MSC_VER)
-#include <codecvt>	// mkdirするのにwstringが欲しいのでこれが必要
-#include <locale>   // wstring_convertにこれが必要。
+#include <codecvt> // I need this because I want wstring to mkdir
+#include <locale> // This is required for wstring_convert.
 
 namespace Dependency {
   int mkdir(std::string dir_name)
@@ -663,9 +663,9 @@ namespace Dependency {
 #endif
 #elif defined(__linux__)
 
-// linux環境において、この_LINUXというシンボルはmakefileにて定義されるものとする。
+// In the linux environment, this symbol _LINUX is defined in the makefile.
 
-// Linux用のmkdir実装。
+// mkdir implementation for Linux.
 #include "sys/stat.h"
 
 namespace Dependency {
@@ -676,8 +676,8 @@ namespace Dependency {
 }
 #else
 
-// Linux環境かどうかを判定するためにはmakefileを分けないといけなくなってくるな..
-// linuxでフォルダ掘る機能は、とりあえずナシでいいや..。評価関数ファイルの保存にしか使ってないし…。
+// In order to judge whether it is a Linux environment, we have to divide the makefile..
+// The function to dig a folder on linux is good for the time being... Only used to save the evaluation function file...
 
 namespace Dependency {
   int mkdir(std::string dir_name)
diff --git a/src/misc.h b/src/misc.h
index 72f621a6..0e2e8403 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -115,15 +115,14 @@ public:
   /// Output values only have 1/8th of their bits set on average.
   template<typename T> T sparse_rand()
   { return T(rand64() & rand64() & rand64()); }
-
-  // 0からn-1までの乱数を返す。(一様分布ではないが現実的にはこれで十分)
+  // Returns a random number from 0 to n-1. (Not uniform distribution, but this is enough in reality)
   uint64_t rand(uint64_t n) { return rand<uint64_t>() % n; }
 
-  // 内部で使用している乱数seedを返す。
+  // Return the random seed used internally.
   uint64_t get_seed() const { return s; }
 };
 
-// 乱数のseedを表示する。(デバッグ用)
+// Display a random seed. (For debugging)
 inline std::ostream& operator<<(std::ostream& os, PRNG& prng)
 {
   os << "PRNG::seed = " << std::hex << prng.get_seed() << std::dec;
@@ -153,54 +152,53 @@ inline uint64_t mul_hi64(uint64_t a, uint64_t b) {
 namespace WinProcGroup {
   void bindThisThread(size_t idx);
 }
-
-// 指定されたミリ秒だけsleepする。
+// sleep for the specified number of milliseconds.
 extern void sleep(int ms);
 
-// 現在時刻を文字列化したもを返す。(評価関数の学習時などにログ出力のために用いる)
+// Returns a string that represents the current time. (Used for log output when learning evaluation function)
 std::string now_string();
 
-// 途中での終了処理のためのwrapper
+// wrapper for end processing on the way
 static void my_exit()
 {
-	sleep(3000); // エラーメッセージが出力される前に終了するのはまずいのでwaitを入れておく。
+	sleep(3000); // It is bad to finish before the error message is output, so put wait.
 	exit(EXIT_FAILURE);
 }
 
-// msys2、Windows Subsystem for Linuxなどのgcc/clangでコンパイルした場合、
-// C++のstd::ifstreamで::read()は、一発で2GB以上のファイルの読み書きが出来ないのでそのためのwrapperである。
+// When compiled with gcc/clang such as msys2, Windows Subsystem for Linux,
+// In C++ std::ifstream, ::read() is a wrapper for that because it is not possible to read and write files larger than 2GB in one shot.
 //
-// read_file_to_memory()の引数のcallback_funcは、ファイルがオープン出来た時点でそのファイルサイズを引数として
-// callbackされるので、バッファを確保して、その先頭ポインタを返す関数を渡すと、そこに読み込んでくれる。
-// これらの関数は、ファイルが見つからないときなどエラーの際には非0を返す。
+// callback_func of the argument of read_file_to_memory() uses the file size as an argument when the file can be opened
+// It will be called back, so if you allocate a buffer and pass a function that returns the first pointer, it will be read there.
+// These functions return non-zero on error, such as when the file cannot be found.
 //
-// また、callbackされた関数のなかでバッファが確保できなかった場合や、想定していたファイルサイズと異なった場合は、
-// nullptrを返せば良い。このとき、read_file_to_memory()は、読み込みを中断し、エラーリターンする。
+// Also, if the buffer cannot be allocated in the callback function or if the file size is different from the expected file size,
+// Return nullptr. At this time, read_file_to_memory() interrupts reading and returns with an error.
 
 int read_file_to_memory(std::string filename, std::function<void* (uint64_t)> callback_func);
 int write_memory_to_file(std::string filename, void* ptr, uint64_t size);
 
 // --------------------
-//    PRNGのasync版
+// async version of PRNG
 // --------------------
 
-// PRNGのasync版
+// async version of PRNG
 struct AsyncPRNG
 {
   AsyncPRNG(uint64_t seed) : prng(seed) { assert(seed); }
-  // [ASYNC] 乱数を一つ取り出す。
+  // [ASYNC] Extract one random number.
   template<typename T> T rand() {
     std::unique_lock<std::mutex> lk(mutex);
     return prng.rand<T>();
   }
 
-  // [ASYNC] 0からn-1までの乱数を返す。(一様分布ではないが現実的にはこれで十分)
+  // [ASYNC] Returns a random number from 0 to n-1. (Not uniform distribution, but this is enough in reality)
   uint64_t rand(uint64_t n) {
     std::unique_lock<std::mutex> lk(mutex);
     return prng.rand(n);
   }
 
-  // 内部で使用している乱数seedを返す。
+  // Return the random seed used internally.
   uint64_t get_seed() const { return prng.get_seed(); }
 
 protected:
@@ -208,7 +206,7 @@ protected:
   PRNG prng;
 };
 
-// 乱数のseedを表示する。(デバッグ用)
+// Display a random seed. (For debugging)
 inline std::ostream& operator<<(std::ostream& os, AsyncPRNG& prng)
 {
   os << "AsyncPRNG::seed = " << std::hex << prng.get_seed() << std::dec;
@@ -219,18 +217,18 @@ inline std::ostream& operator<<(std::ostream& os, AsyncPRNG& prng)
 //       Math
 // --------------------
 
-// 進行度の計算や学習で用いる数学的な関数
+// Mathematical function used for progress calculation and learning
 namespace Math {
-	// シグモイド関数
-	//  = 1.0 / (1.0 + std::exp(-x))
+	// Sigmoid function
+	// = 1.0 / (1.0 + std::exp(-x))
 	double sigmoid(double x);
 
-	// シグモイド関数の微分
-	//  = sigmoid(x) * (1.0 - sigmoid(x))
+	// Differentiation of sigmoid function
+	// = sigmoid(x) * (1.0-sigmoid(x))
 	double dsigmoid(double x);
 
-	// vを[lo,hi]の間に収まるようにクリップする。
-	// ※　Stockfishではこの関数、bitboard.hに書いてある。
+	// Clip v so that it fits between [lo,hi].
+	// * In Stockfish, this function is written in bitboard.h.
 	template<class T> constexpr const T& clamp(const T& v, const T& lo, const T& hi) {
 		return v < lo ? lo : v > hi ? hi : v;
 	}
@@ -241,12 +239,12 @@ namespace Math {
 //       Path
 // --------------------
 
-// C#にあるPathクラス的なもの。ファイル名の操作。
-// C#のメソッド名に合わせておく。
+// Something like Path class in C#. File name manipulation.
+// Match with the C# method name.
 struct Path
 {
-	// path名とファイル名を結合して、それを返す。
-	// folder名のほうは空文字列でないときに、末尾に'/'か'\\'がなければそれを付与する。
+	// Combine the path name and file name and return it.
+	// If the folder name is not an empty string, append it if there is no'/' or'\\' at the end.
 	static std::string Combine(const std::string& folder, const std::string& filename)
 	{
 		if (folder.length() >= 1 && *folder.rbegin() != '/' && *folder.rbegin() != '\\')
@@ -255,10 +253,10 @@ struct Path
 		return folder + filename;
 	}
 
-	// full path表現から、(フォルダ名を除いた)ファイル名の部分を取得する。
+	// Get the file name part (excluding the folder name) from the full path expression.
 	static std::string GetFileName(const std::string& path)
 	{
-		// "\"か"/"か、どちらを使ってあるかはわからない。
+		// I don't know which "\" or "/" is used.
 		auto path_index1 = path.find_last_of("\\") + 1;
 		auto path_index2 = path.find_last_of("/") + 1;
 		auto path_index = std::max(path_index1, path_index2);
@@ -270,8 +268,8 @@ struct Path
 extern void* aligned_malloc(size_t size, size_t align);
 static void aligned_free(void* ptr) { _mm_free(ptr); }
 
-// alignasを指定しているのにnewのときに無視される＆STLのコンテナがメモリ確保するときに無視するので、
-// そのために用いるカスタムアロケーター。
+// It is ignored when new even though alignas is specified & because it is ignored when the STL container allocates memory,
+// A custom allocator used for that.
 template <typename T>
 class AlignedAllocator {
 public:
@@ -293,15 +291,15 @@ public:
 
 namespace Dependency
 {
-  // Linux環境ではgetline()したときにテキストファイルが'\r\n'だと
-  // '\r'が末尾に残るのでこの'\r'を除去するためにwrapperを書く。
-  // そのため、fstreamに対してgetline()を呼び出すときは、
-  // std::getline()ではなく単にgetline()と書いて、この関数を使うべき。
+  // In the Linux environment, if you getline() the text file is'\r\n'
+  // Since'\r' remains at the end, write a wrapper to remove this'\r'.
+  // So when calling getline() on fstream,
+  // just write getline() instead of std::getline() and use this function.
   extern bool getline(std::ifstream& fs, std::string& s);
 
-  // フォルダを作成する。
-  // カレントフォルダ相対で指定する。dir_nameに日本語は使っていないものとする。
-  // 成功すれば0、失敗すれば非0が返る。
+  // Create a folder.
+  // Specify relative to the current folder. Japanese is not used for dir_name.
+  // Returns 0 on success, non-zero on failure.
   extern int mkdir(std::string dir_name);
 }
 
diff --git a/src/position.cpp b/src/position.cpp
index a316304b..917d1646 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -209,11 +209,11 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th
   st = si;
 
 #if defined(EVAL_NNUE)
-  // evalList��clear�B���memset�Ń[���N���A�����Ƃ��ɃN���A����Ă��邪�c�B
+  // clear evalList. It is cleared when memset is cleared to zero above...
   evalList.clear();
 
-  // PieceList���X�V�����ŁA�ǂ̋�ǂ��ɂ��邩��ݒ肵�Ȃ���΂Ȃ�Ȃ����A
-  // ���ꂼ��̋���ǂ��܂Ŏg�������̃J�E���^�[
+  // In updating the PieceList, we have to set which piece is where,
+  // A counter of how much each piece has been used
   PieceNumber next_piece_number = PIECE_NUMBER_ZERO;
 #endif  // defined(EVAL_NNUE)
 
@@ -235,10 +235,10 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th
 
 #if defined(EVAL_NNUE)
           PieceNumber piece_no =
-            (idx == W_KING) ? PIECE_NUMBER_WKING : // ����
-            (idx == B_KING) ? PIECE_NUMBER_BKING : // ����
-            next_piece_number++; // ����ȊO
-          evalList.put_piece(piece_no, sq, pc); // sq�̏���pc�̋��z�u����
+            (idx == W_KING) ?PIECE_NUMBER_WKING : //
+            (idx == B_KING) ?PIECE_NUMBER_BKING : // back ball
+            next_piece_number++; // otherwise
+          evalList.put_piece(piece_no, sq, pc); // Place the pc piece in the sq box
 #endif  // defined(EVAL_NNUE)
 
           ++sq;
@@ -823,7 +823,7 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
       st->rule50 = 0;
 
 #if defined(EVAL_NNUE)
-      dp.dirty_num = 2; // ���������2��
+      dp.dirty_num = 2; // 2 pieces moved
 
       dp.pieceNo[1] = piece_no1;
       dp.changed_piece[1].old_piece = evalList.bona_piece(piece_no1);
@@ -1054,8 +1054,8 @@ template<bool Do>
 void Position::do_castling(Color us, Square from, Square& to, Square& rfrom, Square& rto) {
 #if defined(EVAL_NNUE)
   auto& dp = st->dirtyPiece;
-  // �����v�Z�̂��߂Ɉړ��������StateInfo�ɋL�^���Ă����B
-  dp.dirty_num = 2; // ���������2��
+   // Record the moved pieces in StateInfo for difference calculation.
+   dp.dirty_num = 2; // 2 pieces moved
 
   PieceNumber piece_no0;
   PieceNumber piece_no1;
diff --git a/src/position.h b/src/position.h
index ec9d3be5..725be527 100644
--- a/src/position.h
+++ b/src/position.h
@@ -63,7 +63,7 @@ struct StateInfo {
 #if defined(EVAL_NNUE)
   Eval::NNUE::Accumulator accumulator;
 
-  // �]���l�̍����v�Z�̊Ǘ��p
+   // For management of evaluation value difference calculation
   Eval::DirtyPiece dirtyPiece;
 #endif  // defined(EVAL_NNUE)
 };
@@ -82,7 +82,7 @@ typedef std::unique_ptr<std::deque<StateInfo>> StateListPtr;
 /// traversing the search tree.
 class Thread;
 
-// pack���ꂽsfen
+// packed sfen
 struct PackedSfen { uint8_t data[32]; };
 
 class Position {
@@ -181,31 +181,31 @@ public:
 #if defined(EVAL_NNUE) || defined(EVAL_LEARN)
   // --- StateInfo
 
-  // ���݂̋ǖʂɑΉ�����StateInfo��Ԃ��B
-  // ���Ƃ��΁Astate()->capturedPiece�ł���΁A�O�ǖʂŕߊl���ꂽ��i�[����Ă���B
+  // Returns the StateInfo corresponding to the current situation.
+  // For example, if state()->capturedPiece, the pieces captured in the previous phase are stored.
   StateInfo* state() const { return st; }
 
-  // �]���֐��Ŏg�����߂́A�ǂ̋�ԍ��̋�ǂ��ɂ��邩�Ȃǂ̏��B
+  // Information such as where and which piece number is used for the evaluation function.
   const Eval::EvalList* eval_list() const { return &evalList; }
 #endif  // defined(EVAL_NNUE) || defined(EVAL_LEARN)
 
 #if defined(EVAL_LEARN)
-  // -- sfen���w���p
+  // --sfenization helper
 
-  // pack���ꂽsfen�𓾂�B�����Ɏw�肵���o�b�t�@�ɕԂ��B
-  // gamePly��pack�Ɋ܂߂Ȃ��B
+  // Get the packed sfen. Returns to the buffer specified in the argument.
+  // Do not include gamePly in pack.
   void sfen_pack(PackedSfen& sfen);
 
-  // ��sfen���o�R����ƒx���̂Œ���pack���ꂽsfen���Z�b�g����֐���������B
-  // pos.set(sfen_unpack(data),si,th); �Ɠ����B
-  // �n���ꂽ�ǖʂɖ�肪�����āA�G���[�̂Ƃ��͔�0��Ԃ��B
-  // PackedSfen��gamePly�͊܂܂Ȃ��̂ŕ����ł��Ȃ��B������ݒ肵�����̂ł���Έ����Ŏw�肷�邱�ƁB
+  // �� It is slow to go through sfen, so I made a function to set packed sfen directly.
+  // Equivalent to pos.set(sfen_unpack(data),si,th);.
+  // If there is a problem with the passed phase and there is an error, non-zero is returned.
+  // PackedSfen does not include gamePly so it cannot be restored. If you want to set it, specify it with an argument.
   int set_from_packed_sfen(const PackedSfen& sfen, StateInfo* si, Thread* th, bool mirror = false);
 
-  // �ՖʂƎ��A��Ԃ�^���āA����sfen��Ԃ��B
+  // Give the board, hand piece, and turn, and return the sfen.
   //static std::string sfen_from_rawdata(Piece board[81], Hand hands[2], Color turn, int gamePly);
 
-  // c���̋ʂ̈ʒu��Ԃ��B
+  // Returns the position of the ball on the c side.
   Square king_square(Color c) const { return pieceList[make_piece(c, KING)][0]; }
 #endif // EVAL_LEARN
 
@@ -223,7 +223,7 @@ private:
   void do_castling(Color us, Square from, Square& to, Square& rfrom, Square& rto);
 
 #if defined(EVAL_NNUE)
-  // �Տ��sq�̏��ɂ�����PieceNumber��Ԃ��B
+  // Returns the PieceNumber of the piece in the sq box on the board.
   PieceNumber piece_no_of(Square sq) const;
 #endif  // defined(EVAL_NNUE)
 
@@ -245,7 +245,7 @@ private:
   bool chess960;
 
 #if defined(EVAL_NNUE) || defined(EVAL_LEARN)
-  // �]���֐��ŗp�����̃��X�g
+  // List of pieces used in the evaluation function
   Eval::EvalList evalList;
 #endif  // defined(EVAL_NNUE) || defined(EVAL_LEARN)
 };
diff --git a/src/search.cpp b/src/search.cpp
index bc3cc745..68f97fca 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1938,60 +1938,61 @@ void Tablebases::rank_root_moves(Position& pos, Search::RootMoves& rootMoves) {
     }
 }
 
-// --- �w�K���ɗp����Adepth�Œ�T���Ȃǂ̊֐����O���ɑ΂��Č��J
+// --- expose the functions such as fixed depth search used for learning to the outside
 
 #if defined (EVAL_LEARN)
 
 namespace Learner
 {
-  // �w�K�p�ɁA1�̃X���b�h����search,qsearch()���Ăяo����悤�ȃX�^�u��p�ӂ���B
-  // ���܂ɂ��Ďv���΁AApery�̂悤��Searcher�������ăX���b�h���Ƃɒu���\�Ȃǂ�p�ӂ���ق���
-  // �ǂ����������m��Ȃ��B
+  // For learning, prepare a stub that can call search,qsearch() from one thread.
+  // From now on, it is better to have a Searcher and prepare a substitution table for each thread like Apery.
+  // It might have been good.
 
-  // �w�K�̂��߂̏������B
-  // Learner::search(),Learner::qsearch()����Ăяo�����B
+  // Initialization for learning.
+  // Called from Learner::search(),Learner::qsearch().
   void init_for_search(Position& pos, Stack* ss)
   {
 
-    // RootNode��ss->ply == 0�����̏����B
-    // �[���N���A����̂ŁAss->ply == 0�ƂȂ�̂ő��v�c�B
+    // RootNode requires ss->ply == 0.
+    // Because it clears to zero, ss->ply == 0, so it's okay...
 
     std::memset(ss - 7, 0, 10 * sizeof(Stack));
 
-    // Search::Limits�Ɋւ���
-    // ���̃����o�[�ϐ���global�Ȃ̂ő��̃X���b�h�ɉe�����y�ڂ��̂ŋC�����邱�ƁB
+    // About Search::Limits
+    // Be careful because this member variable is global and affects other threads.
     {
       auto& limits = Search::Limits;
 
-      // �T����"go infinite"�R�}���h�����ɂ���B(time management�����ƍ��邽��)
+      // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done)
       limits.infinite = true;
 
-      // PV��\�������Ǝז��Ȃ̂ŏ����Ă����B
+      // Since PV is an obstacle when displayed, erase it.
       limits.silent = true;
 
-      // �����p����Ɗe�X���b�h��nodes��ώZ�������̂Ɣ�r����Ă��܂��B�䂦�Ɏg�p���Ȃ��B
+      // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
       limits.nodes = 0;
 
-      // depth���ALearner::search()�̈����Ƃ��ēn���ꂽ���̂ŏ�������B
+      // depth is also processed by the one passed as an argument of Learner::search().
       limits.depth = 0;
 
-      // ���������t�߂̎萔�ň��������̒l���Ԃ�̂�h�����߂ɑ傫�Ȓl�ɂ��Ă����B
+      // Set a large value to prevent the draw value from being returned due to the number of moves near the draw.
       //limits.max_game_ply = 1 << 16;
 
-      // ���ʃ��[��������Ă����Ȃ��ƈ��������ɂȂ��Č������ɂ����B
+      // If you do not include the ball entry rule, it will be a draw and it will be difficult to settle.
       //limits.enteringKingRule = EnteringKingRule::EKR_27_POINT;
     }
 
-    // DrawValue�̐ݒ�
+    // Set DrawValue
     {
-      // �X���b�h���Ƃɗp�ӂ��ĂȂ��̂�
-      // ���̃X���b�h�ŏ㏑�����ꂩ�˂Ȃ��B�d�����Ȃ����B
-      // �ǂ��������Ȃ�Ȃ�A0�ɂ��ׂ����Ǝv���B
+      // Because it is not prepared for each thread
+      // May be overwritten by another thread. There is no help for it.
+      // If that happens, I think it should be 0.
       //drawValueTable[REPETITION_DRAW][BLACK] = VALUE_ZERO;
       //drawValueTable[REPETITION_DRAW][WHITE] = VALUE_ZERO;
     }
 
-    // this_thread�Ɋւ��āB
+    // Regarding this_thread.
+
     {
       auto th = pos.this_thread();
 
@@ -1999,10 +2000,10 @@ namespace Learner
       th->selDepth = 0;
       th->rootDepth = 0;
 
-      // �T���m�[�h���̃[��������
+	  // Zero initialization of the number of search nodes
       th->nodes = 0;
 
-      // history�ނ�S���N���A����B���̏������͏������Ԃ������邵�A�T���̐��x�͂ނ��뉺����̂őP���͂悭�킩��Ȃ��B
+      // Clear all history types. This initialization takes a little time, and the accuracy of the search is rather low, so the good and bad are not well understood.
       // th->clear();
 
       int ct = int(Options["Contempt"]) * PawnValueEg / 100; // From centipawns
@@ -2023,57 +2024,57 @@ namespace Learner
       for (int i = 7; i > 0; i--)
           (ss - i)->continuationHistory = &th->continuationHistory[0][0][NO_PIECE][0]; // Use as a sentinel
 
-      // rootMoves�̐ݒ�
+ // set rootMoves
       auto& rootMoves = th->rootMoves;
 
       rootMoves.clear();
-      for (auto m : MoveList<LEGAL>(pos))
+      for (auto m: MoveList<LEGAL>(pos))
         rootMoves.push_back(Search::RootMove(m));
 
       assert(!rootMoves.empty());
 
       //#if defined(USE_GLOBAL_OPTIONS)
-      // �T���X���b�h���Ƃ̒u���\�̐�����Ǘ����Ă���͂��Ȃ̂ŁA
-      // �V�K�̒T���ł��邩��A���̃X���b�h�ɑ΂���u���\�̐���𑝂₷�B
+      // Since the generation of the substitution table for each search thread should be managed,
+      // Increase the generation of the substitution table for this thread because it is a new search.
             //TT.new_search(th->thread_id());
 
-            // ��������new_search���Ăяo����1��O�̒T�����ʂ��g���Ȃ��đ��Ƃ������Ƃ͂���̂ł́c�B
-            // �����ł���͂�炸�ɁA�Ăяo������1�ǂ��Ƃ�TT.new_search(th->thread_id())�����ׂ��ł́c�B
+            // ↑ If you call new_search here, it may be a loss because you can't use the previous search result.
+            // Do not do this here, but caller should do TT.new_search(th->thread_id()) for each station ...
 
-            // ���@����̏I�ǐ}�Ɏ���̂�����������̂ŁA���t�������ɂ͒u���\�͑S�X�����ʂŎg���悤�ɂ���B
+            // →Because we want to avoid reaching the same final diagram, use the substitution table commonly for all threads when generating teachers.
       //#endif
     }
   }
 
-  // �ǂ݋؂ƕ]���l�̃y�A�BLearner::search(),Learner::qsearch()���Ԃ��B
+  // A pair of reader and evaluation value. Returned by Learner::search(),Learner::qsearch().
   typedef std::pair<Value, std::vector<Move> > ValueAndPV;
 
-  // �Î~�T���B
+  // Stationary search.
   //
-  // �O�����) pos.set_this_thread(Threads[thread_id])�ŒT���X���b�h���ݒ肳��Ă��邱�ƁB
-  // �@�܂��AThreads.stop������ƒT���𒆒f���Ă��܂��̂ŁA���̂Ƃ���PV�͐������Ȃ��B
-  // �@search()����߂������ƁAThreads.stop == true�Ȃ�A���̒T�����ʂ�p���Ă͂Ȃ�Ȃ��B
-  // �@���ƁA�Ăяo���O�́AThreads.stop == false�̏�ԂŌĂяo���Ȃ��ƁA�T���𒆒f���ĕԂ��Ă��܂��̂Œ��ӁB
+  // Precondition) Search thread is set by pos.set_this_thread(Threads[thread_id]).
+  // Also, when Threads.stop arrives, the search is interrupted, so the PV at that time is not correct.
+  // After returning from search(), if Threads.stop == true, do not use the search result.
+  // Also, note that before calling, if you do not call it with Threads.stop == false, the search will be interrupted and it will return.
   //
-  // �l�܂���Ă���ꍇ�́APV�z���MOVE_RESIGN���Ԃ�B
+  // If it is clogged, MOVE_RESIGN is returned in the PV array.
   //
-  // ������alpha,beta���w��ł���悤�ɂ��Ă������A���ꂪ���̑��ŒT�������Ƃ��̌��ʂ�
-  // �u���\�ɏ������ނ̂ŁA���̑��ɑ΂��Ď}���肪�o����悤�Ȓl���������܂�Ċw�K�̂Ƃ���
-  // �����e��������̂ŁA���͈̔͂��w��ł���悤�ɂ���̂���߂邱�Ƃɂ����B
+  //Although it was possible to specify alpha and beta with arguments, this will show the result when searching in that window
+  // Because it writes to the substitution table, the value that can be pruned is written to that window when learning
+  // As it has a bad effect, I decided to stop allowing the window range to be specified.
   ValueAndPV qsearch(Position& pos)
   {
     Stack stack[MAX_PLY + 10], * ss = stack + 7;
     Move pv[MAX_PLY + 1];
 
     init_for_search(pos, ss);
-    ss->pv = pv; // �Ƃ肠�����_�~�[�łǂ����o�b�t�@���Ȃ��Ƃ����Ȃ��B
+    ss->pv = pv; // For the time being, it must be a dummy and somewhere with a buffer.
 
     if (pos.is_draw(0)) {
       // Return draw value if draw.
       return { VALUE_DRAW, {} };
     }
 
-    // �l�܂���Ă���̂�
+    // Is it stuck?
     if (MoveList<LEGAL>(pos).size() == 0)
     {
       // Return the mated value if checkmated.
@@ -2082,7 +2083,7 @@ namespace Learner
 
     auto bestValue = ::qsearch<PV>(pos, ss, -VALUE_INFINITE, VALUE_INFINITE, 0);
 
-    // ����ꂽPV��Ԃ��B
+  // Returns the PV obtained.
     std::vector<Move> pvs;
     for (Move* p = &ss->pv[0]; is_ok(*p); ++p)
       pvs.push_back(*p);
@@ -2090,21 +2091,21 @@ namespace Learner
     return ValueAndPV(bestValue, pvs);
   }
 
-  // �ʏ�T���B�[��depth(�����Ŏw��)�B
-  // 3��ǂݎ��̃X�R�A���~�����Ȃ�A
-  //   auto v = search(pos,3);
-  // �̂悤�ɂ��ׂ��B
-  // v.first�ɕ]���l�Av.second��PV��������B
-  // multi pv���L���̂Ƃ��́Apos.this_thread()->rootMoves[N].pv�ɂ���PV(�ǂ݋�)�̔z�񂪓�����B
-  // multi pv�̎w��͂��̊֐��̈���multiPV�ōs�Ȃ��B(Options["MultiPV"]�̒l�͖��������)
-  // 
-  // root�ł̐錾��������͂��Ȃ��̂�(�������ʓ|�Ȃ̂�)�A�����ł͍s��Ȃ��B
-  // �Ăяo�����ŏ������邱�ƁB
+  // Normal search. Depth depth (specified as an integer).
+  // 3 If you want a score for hand reading,
+  // auto v = search(pos,3);
+  // Do something like
+  // Evaluation value is obtained in v.first and PV is obtained in v.second.
+  // When multi pv is enabled, you can get the PV (reading line) array in pos.this_thread()->rootMoves[N].pv.
+  // Specify multi pv with the argument multiPV of this function. (The value of Options["MultiPV"] is ignored)
   //
-  // �O�����) pos.set_this_thread(Threads[thread_id])�ŒT���X���b�h���ݒ肳��Ă��邱�ƁB
-  // �@�܂��AThreads.stop������ƒT���𒆒f���Ă��܂��̂ŁA���̂Ƃ���PV�͐������Ȃ��B
-  // �@search()����߂������ƁAThreads.stop == true�Ȃ�A���̒T�����ʂ�p���Ă͂Ȃ�Ȃ��B
-  // �@���ƁA�Ăяo���O�́AThreads.stop == false�̏�ԂŌĂяo���Ȃ��ƁA�T���𒆒f���ĕԂ��Ă��܂��̂Œ��ӁB
+  // Declaration win judgment is not done as root (because it is troublesome to handle), so it is not done here.
+  // Handle it by the caller.
+  //
+  // Precondition) Search thread is set by pos.set_this_thread(Threads[thread_id]).
+  // Also, when Threads.stop arrives, the search is interrupted, so the PV at that time is not correct.
+  // After returning from search(), if Threads.stop == true, do not use the search result.
+  // Also, note that before calling, if you do not call it with Threads.stop == false, the search will be interrupted and it will return.
 
   ValueAndPV search(Position& pos, int depth_, size_t multiPV /* = 1 */, uint64_t nodesLimit /* = 0 */)
   {
@@ -2122,9 +2123,9 @@ namespace Learner
 
     init_for_search(pos, ss);
 
-    ss->pv = pv; // �Ƃ肠�����_�~�[�łǂ����o�b�t�@���Ȃ��Ƃ����Ȃ��B
+	ss->pv = pv; // For the time being, it must be a dummy and somewhere with a buffer.
 
-    // this_thread�Ɋ֘A����ϐ��̏�����
+    // Initialize the variables related to this_thread
     auto th = pos.this_thread();
     auto& rootDepth = th->rootDepth;
     auto& pvIdx = th->pvIdx;
@@ -2133,13 +2134,13 @@ namespace Learner
     auto& completedDepth = th->completedDepth;
     auto& selDepth = th->selDepth;
 
-    // bestmove�Ƃ��Ă����̋ǖʂ̏��N��T������@�\
-    //size_t multiPV = Options["MultiPV"];
+     // A function to search the top N of this stage as best move
+     //size_t multiPV = Options["MultiPV"];
 
-    // ���̋ǖʂł̎w����̐��������Ă͂����Ȃ�
+     // Do not exceed the number of moves in this situation
     multiPV = std::min(multiPV, rootMoves.size());
 
-    // �m�[�h������MultiPV�̒l���|���Ă����Ȃ��ƁAdepth�Œ�AMultiPV����ɂ����Ƃ���1�̌���ɓ���node�����v�l�������ƂɂȂ�Ȃ��B
+     // If you do not multiply the node limit by the value of MultiPV, you will not be thinking about the same node for one candidate hand when you fix the depth and have MultiPV.
     nodesLimit *= multiPV;
 
     Value alpha = -VALUE_INFINITE;
@@ -2148,9 +2149,9 @@ namespace Learner
     Value bestValue = -VALUE_INFINITE;
 
     while ((rootDepth += 1) <= depth
-      // node�����𒴂����ꍇ�����̃��[�v�𔲂���
-      // �T���m�[�h���́A���̊֐��̈����œn����Ă���B
-      && !(nodesLimit /*node��������*/ && th->nodes.load(std::memory_order_relaxed) >= nodesLimit)
+	  // exit this loop even if the node limit is exceeded
+      // The number of search nodes is passed in the argument of this function.
+      && !(nodesLimit /* limited nodes */ && th->nodes.load(std::memory_order_relaxed) >= nodesLimit)
       )
     {
       for (RootMove& rm : rootMoves)
@@ -2170,10 +2171,10 @@ namespace Learner
               break;
         }
 
-        // ���ꂼ���depth��PV line�ɑ΂���USI info�ŏo�͂���selDepth
+	    // selDepth output with USI info for each depth and PV line
         selDepth = 0;
 
-        // depth 5�ȏ�ɂ����Ă�aspiration search�ɐ؂�ւ���B
+        // Switch to aspiration search for depth 5 and above.
         if (rootDepth >= 5 * 1)
         {
           delta = Value(20);
@@ -2194,8 +2195,8 @@ namespace Learner
           stable_sort(rootMoves.begin() + pvIdx, rootMoves.end());
           //my_stable_sort(pos.this_thread()->thread_id(),&rootMoves[0] + pvIdx, rootMoves.size() - pvIdx);
 
-          // fail low/high�ɑ΂���aspiration window���L����B
-          // �������A�����Ŏw�肳��Ă����l�ɂȂ��Ă�����A����fail low/high�����Ƃ���break����B
+		  // Expand aspiration window for fail low/high.
+          // However, if it is the value specified by the argument, it will be treated as fail low/high and break.
           if (bestValue <= alpha)
           {
             beta = (alpha + beta) / 2;
@@ -2217,7 +2218,7 @@ namespace Learner
           delta += delta / 4 + 5;
           assert(-VALUE_INFINITE <= alpha && beta <= VALUE_INFINITE);
 
-          // �\���`�F�b�N
+          // runaway check
           //assert(th->nodes.load(std::memory_order_relaxed) <= 1000000 );
         }
 
@@ -2229,9 +2230,9 @@ namespace Learner
       completedDepth = rootDepth;
     }
 
-    // ����PV�A�r����NULL_MOVE�̉\�������邩���m��Ȃ��̂Ŕr�����邽�߂�is_ok()��ʂ��B
-    // ���@PV�Ȃ̂�NULL_MOVE�͂��Ȃ����ƂɂȂ��Ă���͂������A
-    //     MOVE_WIN���˂����܂�Ă��邱�Ƃ͂Ȃ��B(���܂̂Ƃ���)
+    // Pass PV_is(ok) to eliminate this PV, there may be NULL_MOVE in the middle.
+    // → PV should not be NULL_MOVE because it is PV
+    // MOVE_WIN has never been thrust. (For now)
     for (Move move : rootMoves[0].pv)
     {
       if (!is_ok(move))
@@ -2241,7 +2242,7 @@ namespace Learner
 
     //sync_cout << rootDepth << sync_endl;
 
-    // multiPV�����l�����āArootMoves[0]��score��bestValue�Ƃ��ĕԂ��B
+    // Considering multiPV, the score of rootMoves[0] is returned as bestValue.
     bestValue = rootMoves[0].score;
 
     return ValueAndPV(bestValue, pvs);
diff --git a/src/types.h b/src/types.h
index a4a9f315..33a35483 100644
--- a/src/types.h
+++ b/src/types.h
@@ -192,7 +192,7 @@ enum Value : int {
 
   MidgameLimit  = 15258, EndgameLimit  = 3915,
 
-  // �]���֐��̕Ԃ��l�̍ő�l(2**14���炢�Ɏ��܂��Ă��ė~�����Ƃ��낾��..)
+// Maximum value returned by the evaluation function (I want it to be around 2**14..)
   VALUE_MAX_EVAL = 27000,
 };
 
@@ -239,7 +239,7 @@ enum Square : int {
   SQ_NONE,
 
   SQUARE_ZERO = 0, SQUARE_NB = 64,
-  SQUARE_NB_PLUS1 = SQUARE_NB + 1, // �ʂ����Ȃ��ꍇ�ASQUARE_NB�Ɉړ��������̂Ƃ��Ĉ������߁A�z���SQUARE_NB+1�Ŋm�ۂ��Ȃ��Ƃ����Ȃ��Ƃ�������̂ł��̒萔��p����B
+  SQUARE_NB_PLUS1 = SQUARE_NB + 1, // If there are no balls, it is treated as having moved to SQUARE_NB, so it may be necessary to secure the array with SQUARE_NB+1, so this constant is used.
 };
 
 enum Direction : int {
@@ -463,18 +463,18 @@ constexpr bool is_ok(Move m) {
   return from_sq(m) != to_sq(m); // Catch MOVE_NULL and MOVE_NONE
 }
 
-// �Ֆʂ�180���񂵂��Ƃ��̏��ڂ�Ԃ�
+// Return squares when turning the board 180��
 constexpr Square Inv(Square sq) { return (Square)((SQUARE_NB - 1) - sq); }
 
-// �Ֆʂ��~���[�����Ƃ��̏��ڂ�Ԃ�
+// Return squares when mirroring the board
 constexpr Square Mir(Square sq) { return make_square(File(7 - (int)file_of(sq)), rank_of(sq)); }
 
 #if defined(EVAL_NNUE) || defined(EVAL_LEARN)
 // --------------------
-//        �
+// 		piece box
 // --------------------
 
-// Position�N���X�ŗp����A��X�g(�ǂ̋�ǂ��ɂ���̂�)���Ǘ�����Ƃ��̔ԍ��B
+// A number used to manage the piece list (which piece is where) used in the Position class.
 enum PieceNumber : uint8_t
 {
 	PIECE_NUMBER_PAWN = 0,
@@ -484,7 +484,7 @@ enum PieceNumber : uint8_t
 	PIECE_NUMBER_QUEEN = 28,
 	PIECE_NUMBER_KING = 30,
 	PIECE_NUMBER_WKING = 30,
-	PIECE_NUMBER_BKING = 31, // ���A���̋ʂ̔ԍ����K�v�ȏꍇ�͂�������p����
+	PIECE_NUMBER_BKING = 31, // Use this if you need the numbers of the first and second balls
 	PIECE_NUMBER_ZERO = 0,
 	PIECE_NUMBER_NB = 32,
 };
@@ -497,7 +497,7 @@ inline PieceNumber operator++(PieceNumber& d, int) {
 }
 inline PieceNumber& operator--(PieceNumber& d) { return d = PieceNumber(int8_t(d) - 1); }
 
-// PieceNumber�̐������̌����Bassert�p�B
+// Piece Number integrity check. for assert.
 constexpr bool is_ok(PieceNumber pn) { return pn < PIECE_NUMBER_NB; }
 #endif  // defined(EVAL_NNUE) || defined(EVAL_LEARN)
 
diff --git a/src/uci.cpp b/src/uci.cpp
index b7ece34b..13888d1a 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -44,22 +44,22 @@ extern vector<string> setup_bench(const Position&, istream&);
 // FEN string of the initial position, normal chess
 const char* StartFEN = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1";
 
-// ������������������R�}���h
+// Command to automatically generate a game record
 #if defined (EVAL_LEARN)
 namespace Learner
 {
-  // ���t�ǖʂ̎�������
+  // Automatic generation of teacher position
   void gen_sfen(Position& pos, istringstream& is);
 
-  // ����������������̊w�K
+  // Learning from the generated game record
   void learn(Position& pos, istringstream& is);
 
 #if defined(GENSFEN2019)
-  // �J�����̋��t�ǖʂ̎��������R�}���h
+  // Automatic generation command of teacher phase under development
   void gen_sfen2019(Position& pos, istringstream& is);
 #endif
 
-  // �ǂ݋؂ƕ]���l�̃y�A�BLearner::search(),Learner::qsearch()���Ԃ��B
+  // A pair of reader and evaluation value. Returned by Learner::search(),Learner::qsearch().
   typedef std::pair<Value, std::vector<Move> > ValueAndPV;
 
   ValueAndPV qsearch(Position& pos);
@@ -71,7 +71,7 @@ namespace Learner
 #if defined(EVAL_NNUE) && defined(ENABLE_TEST_CMD)
 void test_cmd(Position& pos, istringstream& is)
 {
-    // �T�������邩���m��Ȃ��̂ŏ��������Ă����B
+    // Initialize as it may be searched.
     is_ready();
 
     std::string param;
@@ -221,21 +221,21 @@ namespace {
          << "\nNodes/second    : " << 1000 * nodes / elapsed << endl;
   }
 
-  // check sum���v�Z�����Ƃ��A�����ۑ����Ă����Ă��ƂŎ���ȍ~�A�������̃`�F�b�N���s�Ȃ��B
+// When you calculate check sum, save it and check the consistency later.
   uint64_t eval_sum;
 } // namespace
 
-// is_ready_cmd()���O������Ăяo����悤�ɂ��Ă����B(bench�R�}���h�Ȃǂ���Ăяo����������)
-// �ǖʂ͏���������Ȃ��̂Œ��ӁB
+// Make is_ready_cmd() callable from outside. (Because I want to call it from the bench command etc.)
+// Note that the phase is not initialized.
 void is_ready(bool skipCorruptCheck)
 {
 #if defined(EVAL_NNUE)
-  // "isready"���󂯎�������ƁA"readyok"��Ԃ��܂�5�b���Ƃɉ��s�𑗂�悤�ɏC������B(keep alive�I�ȏ���)
-  //	USI2.0�̎d�l���B
-  //  -"isready"�̂��Ƃ�time out���Ԃ́A30�b���x�Ƃ���B����𒴂��āA�]���֐��̏������Ahash�e�[�u���̊m�ۂ��������ꍇ�A
-  //  �v�l�G���W�����������I�ɉ��炩�̃��b�Z�[�W(���s��)�𑗂�ׂ��ł���B
-  //  -ShogiGUI�ł͂��łɂ����Ȃ��Ă���̂ŁAMyShogi������ɒǐ�����B
-  //  -�܂��A��˂��牤�̃G���W�����́A"isready"���󂯎�������ƁA"readyok"��Ԃ��܂�5�b���Ƃɉ��s�𑗂�悤�ɏC������B
+  // After receiving "isready", modify so that a line feed is sent every 5 seconds until "readyok" is returned. (keep alive processing)
+  // From USI 2.0 specifications.
+  // -The time out time after "is ready" is about 30 seconds. Beyond this, if you want to initialize the evaluation function and secure the hash table,
+  // You should send some kind of message (breakable) from the thinking engine side.
+  // -Shogi GUI already does so, so MyShogi will follow along.
+  //-Also, the engine side of Yaneura King modifies it so that after "isready" is received, a line feed is sent every 5 seconds until "readyok" is returned.
 
   auto ended = false;
   auto th = std::thread([&ended] {
@@ -243,25 +243,25 @@ void is_ready(bool skipCorruptCheck)
     while (!ended)
     {
       std::this_thread::sleep_for(std::chrono::milliseconds(100));
-      if (++count >= 50 /* 5�b */)
+      if (++count >= 50 /* 5 seconds */)
       {
         count = 0;
-        sync_cout << sync_endl; // ���s�𑗐M����B
+        sync_cout << sync_endl; // Send a line break.
       }
     }
     });
 
-  // �]���֐��̓ǂݍ��݂Ȃǎ��Ԃ̂�����ł��낤�����͂��̃^�C�~���O�ōs�Ȃ��B
-  // �N�����Ɏ��Ԃ̂����鏈�������Ă��܂��Ə��������^�C���A�E�g��������āA�v�l�G���W���Ƃ��Ă̔F�������^�C�A���Ă��܂��B
+  // Perform processing that may take time, such as reading the evaluation function, at this timing.
+  // If you do a time-consuming process at startup, Shogi place will make a timeout judgment and retire the recognition as a thinking engine.
   if (!UCI::load_eval_finished)
   {
-    // �]���֐��̓ǂݍ���
+    // Read evaluation function
     Eval::load_eval();
 
-    // �`�F�b�N�T���̌v�Z�ƕۑ�(���̌�̃������j���̃`�F�b�N�̂���)
+    // Calculate and save checksum (to check for subsequent memory corruption)
     eval_sum = Eval::calc_check_sum();
 
-    // �\�t�g���̕\��
+    // display soft name
     Eval::print_softname(eval_sum);
 
     UCI::load_eval_finished = true;
@@ -269,14 +269,14 @@ void is_ready(bool skipCorruptCheck)
   }
   else
   {
-    // ���������j�󂳂�Ă��Ȃ����𒲂ׂ邽�߂Ƀ`�F�b�N�T���𖈉񒲂ׂ�B
-    // ���Ԃ��������������Ȃ��C�����邪.. 0.1�b���炢�̂��ƂȂ̂ŗǂ��Ƃ���B
+    // Check the checksum every time to see if the memory has been corrupted.
+    // It seems that the time is a little wasteful, but it is good because it is about 0.1 seconds.
     if (!skipCorruptCheck && eval_sum != Eval::calc_check_sum())
       sync_cout << "Error! : EVAL memory is corrupted" << sync_endl;
   }
 
-  // isready�ɑ΂��Ă�readyok��Ԃ��܂Ŏ��̃R�}���h�����Ȃ����Ƃ͖񑩂���Ă���̂�
-  // ���̃^�C�~���O�Ŋe��ϐ��̏����������Ă����B
+  // For isready, it is promised that the next command will not come until it returns readyok.
+  // Initialize various variables at this timing.
 
   TT.resize(Options["Hash"]);
   Search::clear();
@@ -284,7 +284,7 @@ void is_ready(bool skipCorruptCheck)
 
   Threads.stop = false;
 
-  // keep alive�𑗐M���邽�߂ɐ��������X���b�h���I�������A�ҋ@����B
+  // Terminate the thread created to send keep alive and wait.
   ended = true;
   th.join();
 #endif  // defined(EVAL_NNUE)
@@ -294,7 +294,7 @@ void is_ready(bool skipCorruptCheck)
 
 
 // --------------------
-// �e�X�g�p��qsearch(),search()�𒼐ڌĂ�
+// Call qsearch(),search() directly for testing
 // --------------------
 
 #if defined(EVAL_LEARN)
@@ -391,10 +391,10 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "learn") Learner::learn(pos, is);
 
 #if defined (GENSFEN2019)
-      // �J�����̋��t�ǖʐ����R�}���h
+	  // Command to generate teacher phase under development
       else if (token == "gensfen2019") Learner::gen_sfen2019(pos, is);
 #endif
-      // �e�X�g�p��qsearch(),search()�𒼐ڌĂԃR�}���h
+      // Command to call qsearch(),search() directly for testing
       else if (token == "qsearch") qsearch_cmd(pos);
       else if (token == "search") search_cmd(pos, is);
 
@@ -405,7 +405,7 @@ void UCI::loop(int argc, char* argv[]) {
 #endif
 
 #if defined(EVAL_NNUE) && defined(ENABLE_TEST_CMD)
-      // �e�X�g�R�}���h
+      // test command
       else if (token == "test") test_cmd(pos, is);
 #endif
       else
diff --git a/src/uci.h b/src/uci.h
index 71e07787..d255db76 100644
--- a/src/uci.h
+++ b/src/uci.h
@@ -75,16 +75,17 @@ std::string move(Move m, bool chess960);
 std::string pv(const Position& pos, Depth depth, Value alpha, Value beta);
 Move to_move(const Position& pos, std::string& str);
 
-// �]���֐���ǂݍ��񂾂��̃t���O�B�����evaldir�̕ύX�ɂƂ��Ȃ���false�ɂ���B
+// Flag that read the evaluation function. This is set to false when evaldir is changed.
 extern bool load_eval_finished; // = false;
 } // namespace UCI
 
 extern UCI::OptionsMap Options;
 
-// USI��"isready"�R�}���h���Ăяo���ꂽ�Ƃ��̏����B���̂Ƃ��ɕ]���֐��̓ǂݍ��݂Ȃǂ��s�Ȃ��B
-// benchmark�R�}���h�̃n���h���Ȃǂ�"isready"�����Ă��Ȃ��Ƃ��ɕ]���֐���ǂݍ��܂������Ƃ��ɗp����B
-// skipCorruptCheck == true�̂Ƃ��͕]���֐���2�x�ڂ̓ǂݍ��݂̂Ƃ���check sum�ɂ�郁�����j���`�F�b�N���ȗ�����B
-// ���@���̊֐��́AStockfish�ɂ͂Ȃ����Ȃ��ƕs�ւȂ̂Œǉ����Ă����B
+// Processing when USI "isready" command is called. At this time, the evaluation function is read.
+// Used when you want to load the evaluation function when "isready" does not come in handler of benchmark command etc.
+// If skipCorruptCheck == true, skip memory corruption check by check sum when reading the evaluation function a second time.
+// * This function is inconvenient if it is not available in Stockfish, so add it.
+
 void is_ready(bool skipCorruptCheck = false);
 
 extern const char* StartFEN;
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 999941ed..c24884ce 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -79,21 +79,21 @@ void init(OptionsMap& o) {
   o["SyzygyProbeDepth"]      << Option(1, 1, 100);
   o["Syzygy50MoveRule"]      << Option(true);
   o["SyzygyProbeLimit"]      << Option(7, 0, 7);
-  // �]���֐��t�H���_�B�����ύX�����Ƃ��A�]���֐�������isready�^�C�~���O�œǂݒ����K�v������B
+  // Evaluation function folder. When this is changed, it is necessary to reread the evaluation function at the next isready timing.
   o["EvalDir"]               << Option("eval", on_eval_dir);
-  // isready�^�C�~���O�ŕ]���֐���ǂݍ��܂��ƁA�V�����]���֐��̕ϊ��̂��߂�
-  // test evalconvert�R�}���h��@�������̂ɁA���̐V�����]���֐����Ȃ������߂�
-  // ���̃R�}���h�̎��s�O�Ɉُ�I�����Ă��܂��B
-  // �����ł��̉B���I�v�V������isready���̕]���֐��̓ǂݍ��݂�}�����āA
-  // test evalconvert�R�}���h��@���B
+  // When the evaluation function is loaded at the isready timing, it is necessary to convert the new evaluation function.
+  // I want to hit the test eval convert command, but there is no new evaluation function
+  // It ends abnormally before executing this command.
+  // Therefore, with this hidden option, you can suppress the loading of the evaluation function when isready,
+  // Hit the test eval convert command.
   o["SkipLoadingEval"]       << Option(false);
-  // ��Ղ̎w���������ڂ܂ŗp���邩
+  // how many moves to use a fixed move
   o["BookMoves"] << Option(16, 0, 10000);
 
 #if defined(EVAL_LEARN)
-  // �]���֐��̊w�K���s�Ȃ��Ƃ��́A�]���֐��̕ۑ���̃t�H���_��ύX�ł���B
-  // �f�t�H���g�ł�evalsave�B���̃t�H���_�͎��O�ɗp�ӂ���Ă�����̂Ƃ���B
-  // ���̃t�H���_�z���Ƀt�H���_��"0/","1/",�c�̂悤�Ɏ����I�Ɍ@��A�����ɕ]���֐��t�@�C����ۑ�����B
+  // When learning the evaluation function, you can change the folder to save the evaluation function.
+  // Evalsave by default. This folder shall be prepared in advance.
+  // Automatically dig a folder under this folder like "0/", "1/", ... and save the evaluation function file there.
   o["EvalSaveDir"] << Option("evalsave");
 #endif
 }
@@ -204,6 +204,6 @@ Option& Option::operator=(const string& v) {
   return *this;
 }
 
-// �]���֐���ǂݍ��񂾂��̃t���O�B�����evaldir�̕ύX�ɂƂ��Ȃ���false�ɂ���B
+// Flag that read the evaluation function. This is set to false when evaldir is changed.
 bool load_eval_finished = false;
 } // namespace UCI

From fda3945c07aa28971def8917a7bc50be9f63b3d9 Mon Sep 17 00:00:00 2001
From: tttak <tttak@users.noreply.github.com>
Date: Tue, 30 Jun 2020 00:35:36 +0900
Subject: [PATCH 076/583] =?UTF-8?q?learn=20convert=5Fbin=5Ffrom=5Fpgn-extr?=
 =?UTF-8?q?act=E3=82=B3=E3=83=9E=E3=83=B3=E3=83=89=E3=82=92=E8=BF=BD?=
 =?UTF-8?q?=E5=8A=A0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

http://rebel13.nl/download/data.html
Download Selected Lichess games
pgn-extract --fencomments -Wlalg --nochecks --nomovenumbers --noresults -w500000 -N -V -o comp-2019-06.txt comp-2019-06.pgn
stockfish.exe
learn convert_bin_from_pgn-extract pgn_eval_side_to_move 0 output_file_name fens_comp-2019-06.bin comp-2019-06.txt

https://github.com/glinscott/fishtest/wiki/PGN-files-of-games-played-on-fishtest
pgn-extract --fencomments -Wlalg --nochecks --nomovenumbers --noresults -w500000 -N -V -o fishtest.txt fishtest.pgn
stockfish.exe
learn convert_bin_from_pgn-extract pgn_eval_side_to_move 1 output_file_name fens_fishtest.bin fishtest.txt
---
 src/learn/learner.cpp | 103 +++++++++++++++++++++++++++++++++---------
 1 file changed, 82 insertions(+), 21 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 9229770a..c13b69f7 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -2529,10 +2529,14 @@ int parse_game_result_from_pgn_extract(std::string result) {
 	}
 }
 
-// 0.25 -->  25
+// 0.25 -->  0.25 * PawnValueEg
 // #-4  --> -mate_in(4)
 // #3   -->  mate_in(3)
-Value parse_score_from_pgn_extract(std::string eval) {
+// -M4  --> -mate_in(4)
+// +M3  -->  mate_in(3)
+Value parse_score_from_pgn_extract(std::string eval, bool& success) {
+	success = true;
+
 	if (eval.substr(0, 1) == "#") {
 		if (eval.substr(1, 1) == "-") {
 			return -mate_in(stoi(eval.substr(2, eval.length() - 2)));
@@ -2541,14 +2545,32 @@ Value parse_score_from_pgn_extract(std::string eval) {
 			return mate_in(stoi(eval.substr(1, eval.length() - 1)));
 		}
 	}
+	else if (eval.substr(0, 2) == "-M") {
+		//std::cout << "eval=" << eval << std::endl;
+		return -mate_in(stoi(eval.substr(2, eval.length() - 2)));
+	}
+	else if (eval.substr(0, 2) == "+M") {
+		//std::cout << "eval=" << eval << std::endl;
+		return mate_in(stoi(eval.substr(2, eval.length() - 2)));
+	}
 	else {
-		return Value(stod(eval) * 100.0f);
+		char *endptr;
+		double value = strtod(eval.c_str(), &endptr);
+
+		if (*endptr != '\0') {
+			success = false;
+			return VALUE_ZERO;
+		}
+		else {
+			return Value(value * PawnValueEg);
+		}
 	}
 }
 
-// pgn-extract形式の教師をやねうら王用のPackedSfenValueに変換する
-void convert_bin_from_pgn_extract(const vector<string>& filenames, const string& output_file_name)
+void convert_bin_from_pgn_extract(const vector<string>& filenames, const string& output_file_name, const bool pgn_eval_side_to_move)
 {
+	std::cout << "pgn_eval_side_to_move=" << pgn_eval_side_to_move << std::endl;
+
 	auto th = Threads.main();
 	auto &pos = th->rootPos;
 
@@ -2602,11 +2624,20 @@ void convert_bin_from_pgn_extract(const vector<string>& filenames, const string&
 					gamePly++;
 
 					std::regex pattern_bracket(R"(\{(.+?)\})");
-					std::regex pattern_eval(R"(\[\%eval (.+?)\])");
+
+					std::regex pattern_eval1(R"(\[\%eval (.+?)\])");
+					std::regex pattern_eval2(R"((.+?)\/)");
+
+					// very slow
+					//std::regex pattern_eval1(R"(\[\%eval (#?[+-]?(?:\d+\.?\d*|\.\d+))\])");
+					//std::regex pattern_eval2(R"((#?[+-]?(?:\d+\.?\d*|\.\d+)\/))");
+
 					std::regex pattern_move(R"((.+?)\{)");
 					std::smatch match;
 
 					// example: { [%eval 0.25] [%clk 0:10:00] }
+					// example: { +0.71/22 1.2s }
+					// example: { book }
 					if (!std::regex_search(itr, line.cend(), match, pattern_bracket)) {
 						break;
 					}
@@ -2616,17 +2647,39 @@ void convert_bin_from_pgn_extract(const vector<string>& filenames, const string&
 					trim(str_eval_clk);
 					//std::cout << "str_eval_clk="<< str_eval_clk << std::endl;
 
+					if (str_eval_clk == "book") {
+						//std::cout << "book" << std::endl;
+
+						// example: { rnbqkbnr/pppppppp/8/8/8/4P3/PPPP1PPP/RNBQKBNR b KQkq - 0 1 }
+						if (!std::regex_search(itr, line.cend(), match, pattern_bracket)) {
+							break;
+						}
+						itr += match.position(0) + match.length(0);
+						continue;
+					}
+
 					// example: [%eval 0.25]
 					// example: [%eval #-4]
 					// example: [%eval #3]
-					if (!std::regex_search(str_eval_clk, match, pattern_eval)) {
-						continue;
-					}
-					else {
+					// example: +0.71/
+					if (std::regex_search(str_eval_clk, match, pattern_eval1) ||
+						std::regex_search(str_eval_clk, match, pattern_eval2)) {
 						std::string str_eval = match.str(1);
 						trim(str_eval);
-						psv.score = parse_score_from_pgn_extract(str_eval);
-						//std::cout << "psv.score=" << psv.score << std::endl;
+						//std::cout << "str_eval=" << str_eval << std::endl;
+
+						bool success = false;
+						psv.score = Math::clamp(parse_score_from_pgn_extract(str_eval, success), -VALUE_MATE , VALUE_MATE);
+						//std::cout << "success=" << success << ", psv.score=" << psv.score << std::endl;
+
+						if (!success) {
+							//std::cout << "str_eval=" << str_eval << std::endl;
+							//std::cout << "success=" << success << ", psv.score=" << psv.score << std::endl;
+							break;
+						}
+					}
+					else {
+						break;
 					}
 
 					// example: { rnbqkbnr/pppppppp/8/8/3P4/8/PPP1PPPP/RNBQKBNR b KQkq d3 0 1 }
@@ -2659,16 +2712,20 @@ void convert_bin_from_pgn_extract(const vector<string>& filenames, const string&
 					psv.game_result = game_result;
 
 					if (pos.side_to_move() == BLACK) {
-						psv.score *= -1;
+						if (!pgn_eval_side_to_move) {
+							psv.score *= -1;
+						}
 						psv.game_result *= -1;
 					}
 
-					//std::cout << "write: "
-					//		  << "score=" << psv.score
-					//		  << ", move=" << psv.move
-					//		  << ", gamePly=" << psv.gamePly
-					//		  << ", game_result=" << (int)psv.game_result
-					//		  << std::endl;
+#if 0
+					std::cout << "write: "
+							  << "score=" << psv.score
+							  << ", move=" << psv.move
+							  << ", gamePly=" << psv.gamePly
+							  << ", game_result=" << (int)psv.game_result
+							  << std::endl;
+#endif
 
 					ofs.write((char*)&psv, sizeof(PackedSfenValue));
 					memset((char*)&psv, 0, sizeof(PackedSfenValue));
@@ -2685,6 +2742,7 @@ void convert_bin_from_pgn_extract(const vector<string>& filenames, const string&
 	std::cout << now_string() << " all done" << std::endl;
 	ofs.close();
 }
+
 //void convert_plain(const vector<string>& filenames , const string& output_file_name)
 //{
 //	Position tpos;
@@ -2773,6 +2831,7 @@ void learn(Position&, istringstream& is)
 	bool interpolate_eval = 0;
 	// convert teacher in pgn-extract format to Yaneura King's bin
 	bool use_convert_bin_from_pgn_extract = false;
+	bool pgn_eval_side_to_move = false;
 	// File name to write in those cases (default is "shuffled_sfen.bin")
 	string output_file_name = "shuffled_sfen.bin";
 
@@ -2907,7 +2966,9 @@ void learn(Position&, istringstream& is)
 		else if (option == "convert_plain") use_convert_plain = true;
 		else if (option == "convert_bin") use_convert_bin = true;
 		else if (option == "interpolate_eval") is >> interpolate_eval;
-		else if (option == "convert_bin_from_pgn-extract") use_convert_bin_from_pgn_extract = true;		
+		else if (option == "convert_bin_from_pgn-extract") use_convert_bin_from_pgn_extract = true;
+		else if (option == "pgn_eval_side_to_move") is >> pgn_eval_side_to_move;
+
 		// Otherwise, it's a filename.
 		else
 			filenames.push_back(option);
@@ -3025,7 +3086,7 @@ void learn(Position&, istringstream& is)
 	{
 		is_ready(true);
 		cout << "convert_bin_from_pgn-extract.." << endl;
-		convert_bin_from_pgn_extract(filenames, output_file_name);
+		convert_bin_from_pgn_extract(filenames, output_file_name, pgn_eval_side_to_move);
 		return;
 	}
 

From 486f72af5418a7915779c496f92c7fc2722e3e18 Mon Sep 17 00:00:00 2001
From: tttak <tttak@users.noreply.github.com>
Date: Tue, 30 Jun 2020 13:30:18 +0900
Subject: [PATCH 077/583] =?UTF-8?q?=E3=83=90=E3=82=B0=E4=BF=AE=E6=AD=A3?=
 =?UTF-8?q?=EF=BC=88learn=20convert=5Fbin=5Ffrom=5Fpgn-extract=EF=BC=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/learn/learner.cpp | 55 +++++++++++++++++++++++--------------------
 1 file changed, 30 insertions(+), 25 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index c13b69f7..d2507fba 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -2614,6 +2614,7 @@ void convert_bin_from_pgn_extract(const vector<string>& filenames, const string&
 
 			else {
 				int gamePly = 0;
+				bool first = true;
 
 				PackedSfenValue psv;
 				memset((char*)&psv, 0, sizeof(PackedSfenValue));
@@ -2682,6 +2683,35 @@ void convert_bin_from_pgn_extract(const vector<string>& filenames, const string&
 						break;
 					}
 
+					if (first) {
+						first = false;
+					}
+					else {
+						psv.gamePly = gamePly;
+						psv.game_result = game_result;
+
+						if (pos.side_to_move() == BLACK) {
+							if (!pgn_eval_side_to_move) {
+								psv.score *= -1;
+							}
+							psv.game_result *= -1;
+						}
+
+#if 0
+						std::cout << "write: "
+								  << "score=" << psv.score
+								  << ", move=" << psv.move
+								  << ", gamePly=" << psv.gamePly
+								  << ", game_result=" << (int)psv.game_result
+								  << std::endl;
+#endif
+
+						ofs.write((char*)&psv, sizeof(PackedSfenValue));
+						memset((char*)&psv, 0, sizeof(PackedSfenValue));
+
+						fen_count++;
+					}
+
 					// example: { rnbqkbnr/pppppppp/8/8/3P4/8/PPP1PPPP/RNBQKBNR b KQkq d3 0 1 }
 					if (!std::regex_search(itr, line.cend(), match, pattern_bracket)) {
 						break;
@@ -2706,31 +2736,6 @@ void convert_bin_from_pgn_extract(const vector<string>& filenames, const string&
 					trim(str_move);
 					//std::cout << "str_move=" << str_move << std::endl;
 					psv.move = UCI::to_move(pos, str_move);
-
-					//
-					psv.gamePly = gamePly;
-					psv.game_result = game_result;
-
-					if (pos.side_to_move() == BLACK) {
-						if (!pgn_eval_side_to_move) {
-							psv.score *= -1;
-						}
-						psv.game_result *= -1;
-					}
-
-#if 0
-					std::cout << "write: "
-							  << "score=" << psv.score
-							  << ", move=" << psv.move
-							  << ", gamePly=" << psv.gamePly
-							  << ", game_result=" << (int)psv.game_result
-							  << std::endl;
-#endif
-
-					ofs.write((char*)&psv, sizeof(PackedSfenValue));
-					memset((char*)&psv, 0, sizeof(PackedSfenValue));
-
-					fen_count++;
 				}
 
 				game_result = 0;

From c8262f8aeca69101900e2dc1c531eb0d31ff9165 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Tue, 30 Jun 2020 15:58:51 +0900
Subject: [PATCH 078/583] Fixed a compile error.

---
 src/learn/learner.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index d2507fba..1d724266 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -2562,7 +2562,7 @@ Value parse_score_from_pgn_extract(std::string eval, bool& success) {
 			return VALUE_ZERO;
 		}
 		else {
-			return Value(value * PawnValueEg);
+			return Value(value * static_cast<double>(PawnValueEg));
 		}
 	}
 }

From 145e4c2a1006a629327dbd83ffbc7c112f8b8dfc Mon Sep 17 00:00:00 2001
From: joergoster <osterj165@googlemail.com>
Date: Wed, 1 Jul 2020 14:14:27 +0200
Subject: [PATCH 079/583] Add SSE41 switch. This allows building modern
 compiles with SSE41 enabled, which gives a nice speedup on my Bulldozer CPU.

For example:
make nnue ARCH=x86-64-modern sse41=yes -j
---
 src/Makefile | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/Makefile b/src/Makefile
index 24a62826..5c8a7a90 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -386,6 +386,13 @@ ifeq ($(avx2),yes)
 	endif
 endif
 
+ifeq ($(sse41),yes)
+	CXXFLAGS += -DUSE_SSE41
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw msys2))
+		CXXFLAGS += -msse4
+	endif
+endif
+
 ### 3.7 pext
 ifeq ($(pext),yes)
 	CXXFLAGS += -DUSE_PEXT

From 13824d8b9653a3c60703e0f778594aac17f014fe Mon Sep 17 00:00:00 2001
From: joergoster <osterj165@googlemail.com>
Date: Thu, 2 Jul 2020 09:13:16 +0200
Subject: [PATCH 080/583] Explicitly specify SSE41.

---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 5c8a7a90..1de2d243 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -389,7 +389,7 @@ endif
 ifeq ($(sse41),yes)
 	CXXFLAGS += -DUSE_SSE41
 	ifeq ($(comp),$(filter $(comp),gcc clang mingw msys2))
-		CXXFLAGS += -msse4
+		CXXFLAGS += -msse4.1
 	endif
 endif
 

From c679e8f3603e49971bf828736e1ecfdcad3310d1 Mon Sep 17 00:00:00 2001
From: FireFather <firefather@telenet.be>
Date: Wed, 1 Jul 2020 02:41:14 +0200
Subject: [PATCH 081/583] Update search.h

1 line of translation
---
 src/search.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/search.h b/src/search.h
index 7638d822..eae1cafc 100644
--- a/src/search.h
+++ b/src/search.h
@@ -99,8 +99,8 @@ struct LimitsType {
   TimePoint time[COLOR_NB], inc[COLOR_NB], npmsec, movetime, startTime;
   int movestogo, depth, mate, perft, infinite;
   int64_t nodes;
-  // ��ʂɏo�͂��Ȃ��T�C�����g���[�h(�v���Z�X���ł̘A�����ȑΐ�̂Ƃ��p)
-  // ���̂Ƃ�PV���o�͂��Ȃ��B
+  // Silent mode that does not output to the screen (for continuous self-play in process)
+  // Do not output PV at this time.
   bool silent;
 };
 

From 9c190218088c78967d9cb37cb2f3a8903133bc62 Mon Sep 17 00:00:00 2001
From: FireFather <firefather@telenet.be>
Date: Wed, 1 Jul 2020 18:21:07 +0200
Subject: [PATCH 082/583] update translation

1 line src\eval\nnue\features\half_kp.cpp
1 line src\movegen.h
---
 src/eval/nnue/features/half_kp.cpp | 2 +-
 src/movegen.h                      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/eval/nnue/features/half_kp.cpp b/src/eval/nnue/features/half_kp.cpp
index 72156c82..cba2c9cd 100644
--- a/src/eval/nnue/features/half_kp.cpp
+++ b/src/eval/nnue/features/half_kp.cpp
@@ -35,7 +35,7 @@ inline void HalfKP<AssociatedKing>::GetPieces(
 template <Side AssociatedKing>
 void HalfKP<AssociatedKing>::AppendActiveIndices(
     const Position& pos, Color perspective, IndexList* active) {
-  // コンパイラの警告を回避するため、配列サイズが小さい場合は何もしない
+  // do nothing if array size is small to avoid compiler warning
   if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
 
   BonaPiece* pieces;
diff --git a/src/movegen.h b/src/movegen.h
index 838541f1..d5f82f16 100644
--- a/src/movegen.h
+++ b/src/movegen.h
@@ -68,7 +68,7 @@ struct MoveList {
     return std::find(begin(), end(), move) != end();
   }
 
-  // i�Ԗڂ̗v�f��Ԃ�
+  // returns the i th element
   const ExtMove at(size_t i) const { assert(0 <= i && i < size()); return begin()[i]; }
 
 private:

From 9ce0ef3ac093bc139e55ced2d8121f1c29033f7b Mon Sep 17 00:00:00 2001
From: tttak <tttak@users.noreply.github.com>
Date: Fri, 3 Jul 2020 23:01:37 +0900
Subject: [PATCH 083/583] merge "Provide WDL statistics"

https://github.com/official-stockfish/Stockfish/commit/110068808b51344ac59f8c6a0846f5dfdf670392
https://github.com/official-stockfish/Stockfish/pull/2778
https://github.com/official-stockfish/Stockfish/pull/2788
---
 Readme.md         |  5 +++++
 src/search.cpp    |  3 +++
 src/uci.cpp       | 39 +++++++++++++++++++++++++++++++++++++++
 src/uci.h         |  1 +
 src/ucioption.cpp |  1 +
 5 files changed, 49 insertions(+)

diff --git a/Readme.md b/Readme.md
index 2b1de86b..e60ac718 100644
--- a/Readme.md
+++ b/Readme.md
@@ -66,6 +66,11 @@ Currently, Stockfish has the following UCI options:
     If enabled by UCI_LimitStrength, aim for an engine strength of the given Elo.
     This Elo rating has been calibrated at a time control of 60s+0.6s and anchored to CCRL 40/4.
 
+  * #### UCI_ShowWDL
+    If enabled, show approximate WDL statistics as part of the engine output.
+    These WDL numbers model expected game outcomes for a given evaluation and
+    game ply for engine self-play at fishtest LTC conditions (60+0.6s per game).
+
   * #### Move Overhead
     Assume a time delay of x ms due to network and GUI overheads. This is useful to
     avoid losses on time in those cases.
diff --git a/src/search.cpp b/src/search.cpp
index 68f97fca..5990905f 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1841,6 +1841,9 @@ string UCI::pv(const Position& pos, Depth depth, Value alpha, Value beta) {
          << " multipv "  << i + 1
          << " score "    << UCI::value(v);
 
+      if (Options["UCI_ShowWDL"])
+          ss << UCI::wdl(v, pos.game_ply());
+
       if (!tb && i == pvIdx)
           ss << (v >= beta ? " lowerbound" : v <= alpha ? " upperbound" : "");
 
diff --git a/src/uci.cpp b/src/uci.cpp
index 13888d1a..a95a629d 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -19,6 +19,7 @@
 */
 
 #include <cassert>
+#include <cmath>
 #include <iostream>
 #include <sstream>
 #include <string>
@@ -221,6 +222,28 @@ namespace {
          << "\nNodes/second    : " << 1000 * nodes / elapsed << endl;
   }
 
+  // The win rate model returns the probability (per mille) of winning given an eval
+  // and a game-ply. The model fits rather accurately the LTC fishtest statistics.
+  int win_rate_model(Value v, int ply) {
+
+     // The model captures only up to 240 plies, so limit input (and rescale)
+     double m = std::min(240, ply) / 64.0;
+
+     // Coefficients of a 3rd order polynomial fit based on fishtest data
+     // for two parameters needed to transform eval to the argument of a
+     // logistic function.
+     double as[] = {-8.24404295, 64.23892342, -95.73056462, 153.86478679};
+     double bs[] = {-3.37154371, 28.44489198, -56.67657741,  72.05858751};
+     double a = (((as[0] * m + as[1]) * m + as[2]) * m) + as[3];
+     double b = (((bs[0] * m + bs[1]) * m + bs[2]) * m) + bs[3];
+
+     // Transform eval to centipawns with limited range
+     double x = Utility::clamp(double(100 * v) / PawnValueEg, -1000.0, 1000.0);
+
+     // Return win rate in per mille (rounded to nearest)
+     return int(0.5 + 1000 / (1 + std::exp((a - x) / b)));
+  }
+
 // When you calculate check sum, save it and check the consistency later.
   uint64_t eval_sum;
 } // namespace
@@ -437,6 +460,22 @@ string UCI::value(Value v) {
 }
 
 
+/// UCI::wdl() report WDL statistics given an evaluation and a game ply, based on
+/// data gathered for fishtest LTC games.
+
+string UCI::wdl(Value v, int ply) {
+
+  stringstream ss;
+
+  int wdl_w = win_rate_model( v, ply);
+  int wdl_l = win_rate_model(-v, ply);
+  int wdl_d = 1000 - wdl_w - wdl_l;
+  ss << " wdl " << wdl_w << " " << wdl_d << " " << wdl_l;
+
+  return ss.str();
+}
+
+
 /// UCI::square() converts a Square to a string in algebraic notation (g1, a7, etc.)
 
 std::string UCI::square(Square s) {
diff --git a/src/uci.h b/src/uci.h
index d255db76..5073262e 100644
--- a/src/uci.h
+++ b/src/uci.h
@@ -73,6 +73,7 @@ std::string value(Value v);
 std::string square(Square s);
 std::string move(Move m, bool chess960);
 std::string pv(const Position& pos, Depth depth, Value alpha, Value beta);
+std::string wdl(Value v, int ply);
 Move to_move(const Position& pos, std::string& str);
 
 // Flag that read the evaluation function. This is set to false when evaldir is changed.
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index c24884ce..d63caa9f 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -75,6 +75,7 @@ void init(OptionsMap& o) {
   o["UCI_AnalyseMode"]       << Option(false);
   o["UCI_LimitStrength"]     << Option(false);
   o["UCI_Elo"]               << Option(1350, 1350, 2850);
+  o["UCI_ShowWDL"]           << Option(false);
   o["SyzygyPath"]            << Option("<empty>", on_tb_path);
   o["SyzygyProbeDepth"]      << Option(1, 1, 100);
   o["Syzygy50MoveRule"]      << Option(true);

From c964e902c594ec69ab5d82a380ca292ba28d551c Mon Sep 17 00:00:00 2001
From: tttak <tttak@users.noreply.github.com>
Date: Fri, 3 Jul 2020 23:21:49 +0900
Subject: [PATCH 084/583] use winning_percentage_wdl in learn

---
 src/learn/learner.cpp | 40 ++++++++++++++++++++++++++++++++--------
 src/uci.cpp           | 10 +++++++---
 src/uci.h             |  1 +
 3 files changed, 40 insertions(+), 11 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 1d724266..93b54ab2 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -116,6 +116,7 @@ bool use_draw_in_training_data_generation = false;
 bool use_draw_in_training = false;
 bool use_draw_in_validation = false;
 bool use_hash_in_training = true;
+bool use_wdl = false;
 
 // -----------------------------------
 // write phase file
@@ -1025,6 +1026,16 @@ double sigmoid(double x)
 	return 1.0 / (1.0 + std::exp(-x));
 }
 
+// A function that converts the evaluation value to the winning rate [0,1]
+double winning_percentage_wdl(Value value, int ply)
+{
+	double wdl_w = UCI::win_rate_model( value, ply);
+	double wdl_l = UCI::win_rate_model(-value, ply);
+	double wdl_d = 1000.0 - wdl_w - wdl_l;
+
+	return (wdl_w + wdl_d / 2.0) / 1000.0;
+}
+
 // A function that converts the evaluation value to the winning rate [0,1]
 double winning_percentage(double value)
 {
@@ -1033,6 +1044,18 @@ double winning_percentage(double value)
 	// = sigmoid(Eval/4*ln(10))
 	return sigmoid(value / PawnValueEg / 4.0 * log(10.0));
 }
+
+// A function that converts the evaluation value to the winning rate [0,1]
+double winning_percentage(Value value, int ply)
+{
+	if (use_wdl) {
+		return winning_percentage_wdl(value, ply);
+	}
+	else {
+		return winning_percentage(value);
+	}
+}
+
 double dsigmoid(double x)
 {
 	// Sigmoid function
@@ -1069,8 +1092,8 @@ double calc_grad(Value deep, Value shallow, PackedSfenValue& psv)
 	// Also, the coefficient of 1/m is unnecessary if you use the update formula that has the automatic gradient adjustment function like Adam and AdaGrad.
 	// Therefore, it is not necessary to save it in memory.
 
-	double p = winning_percentage(deep);
-	double q = winning_percentage(shallow);
+	double p = winning_percentage(deep, psv.gamePly);
+	double q = winning_percentage(shallow, psv.gamePly);
 	return (q - p) * dsigmoid(double(shallow) / 600.0);
 }
 #endif
@@ -1095,8 +1118,8 @@ double calc_grad(Value deep, Value shallow, const PackedSfenValue& psv)
 	// = ...
 	// = q-p.
 
-	double p = winning_percentage(deep);
-	double q = winning_percentage(shallow);
+	double p = winning_percentage(deep, psv.gamePly);
+	double q = winning_percentage(shallow, psv.gamePly);
 
 	return q - p;
 }
@@ -1127,8 +1150,8 @@ double calc_grad(Value deep, Value shallow , const PackedSfenValue& psv)
 	// elmo (WCSC27) method
 	// Correct with the actual game wins and losses.
 
-	const double q = winning_percentage(shallow);
-	const double p = winning_percentage(deep);
+	const double q = winning_percentage(shallow, psv.gamePly);
+	const double p = winning_percentage(deep, psv.gamePly);
 
 	// Use 1 as the correction term if the expected win rate is 1, 0 if you lose, and 0.5 if you draw.
 	// game_result = 1,0,-1 so add 1 and divide by 2.
@@ -1150,8 +1173,8 @@ void calc_cross_entropy(Value deep, Value shallow, const PackedSfenValue& psv,
 	double& cross_entropy_eval, double& cross_entropy_win, double& cross_entropy,
 	double& entropy_eval, double& entropy_win, double& entropy)
 {
-	const double p /* teacher_winrate */ = winning_percentage(deep);
-	const double q /* eval_winrate    */ = winning_percentage(shallow);
+	const double p /* teacher_winrate */ = winning_percentage(deep, psv.gamePly);
+	const double q /* eval_winrate    */ = winning_percentage(shallow, psv.gamePly);
 	const double t = double(psv.game_result + 1) / 2;
 
 	constexpr double epsilon = 0.000001;
@@ -2920,6 +2943,7 @@ void learn(Position&, istringstream& is)
 		else if (option == "use_draw_in_training") is >> use_draw_in_training;
 		else if (option == "use_draw_in_validation") is >> use_draw_in_validation;
 		else if (option == "use_hash_in_training") is >> use_hash_in_training;
+		else if (option == "use_wdl") is >> use_wdl;
 		// Discount rate
 		else if (option == "discount_rate") is >> discount_rate;
 
diff --git a/src/uci.cpp b/src/uci.cpp
index a95a629d..8dd485b0 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -222,6 +222,12 @@ namespace {
          << "\nNodes/second    : " << 1000 * nodes / elapsed << endl;
   }
 
+// When you calculate check sum, save it and check the consistency later.
+  uint64_t eval_sum;
+} // namespace
+
+
+namespace UCI{
   // The win rate model returns the probability (per mille) of winning given an eval
   // and a game-ply. The model fits rather accurately the LTC fishtest statistics.
   int win_rate_model(Value v, int ply) {
@@ -243,10 +249,8 @@ namespace {
      // Return win rate in per mille (rounded to nearest)
      return int(0.5 + 1000 / (1 + std::exp((a - x) / b)));
   }
+} // namespace UCI
 
-// When you calculate check sum, save it and check the consistency later.
-  uint64_t eval_sum;
-} // namespace
 
 // Make is_ready_cmd() callable from outside. (Because I want to call it from the bench command etc.)
 // Note that the phase is not initialized.
diff --git a/src/uci.h b/src/uci.h
index 5073262e..8e12c856 100644
--- a/src/uci.h
+++ b/src/uci.h
@@ -74,6 +74,7 @@ std::string square(Square s);
 std::string move(Move m, bool chess960);
 std::string pv(const Position& pos, Depth depth, Value alpha, Value beta);
 std::string wdl(Value v, int ply);
+int win_rate_model(Value v, int ply);
 Move to_move(const Position& pos, std::string& str);
 
 // Flag that read the evaluation function. This is set to false when evaldir is changed.

From 5dec3e547e35798f1b47c42f32f5c052aea24644 Mon Sep 17 00:00:00 2001
From: tttak <tttak@users.noreply.github.com>
Date: Fri, 3 Jul 2020 23:01:37 +0900
Subject: [PATCH 085/583] merge "Provide WDL statistics"

https://github.com/official-stockfish/Stockfish/commit/110068808b51344ac59f8c6a0846f5dfdf670392
https://github.com/official-stockfish/Stockfish/pull/2778
https://github.com/official-stockfish/Stockfish/pull/2788
---
 Readme.md         |  5 +++++
 src/search.cpp    |  3 +++
 src/uci.cpp       | 39 +++++++++++++++++++++++++++++++++++++++
 src/uci.h         |  1 +
 src/ucioption.cpp |  1 +
 5 files changed, 49 insertions(+)

diff --git a/Readme.md b/Readme.md
index 2b1de86b..e60ac718 100644
--- a/Readme.md
+++ b/Readme.md
@@ -66,6 +66,11 @@ Currently, Stockfish has the following UCI options:
     If enabled by UCI_LimitStrength, aim for an engine strength of the given Elo.
     This Elo rating has been calibrated at a time control of 60s+0.6s and anchored to CCRL 40/4.
 
+  * #### UCI_ShowWDL
+    If enabled, show approximate WDL statistics as part of the engine output.
+    These WDL numbers model expected game outcomes for a given evaluation and
+    game ply for engine self-play at fishtest LTC conditions (60+0.6s per game).
+
   * #### Move Overhead
     Assume a time delay of x ms due to network and GUI overheads. This is useful to
     avoid losses on time in those cases.
diff --git a/src/search.cpp b/src/search.cpp
index 68f97fca..5990905f 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1841,6 +1841,9 @@ string UCI::pv(const Position& pos, Depth depth, Value alpha, Value beta) {
          << " multipv "  << i + 1
          << " score "    << UCI::value(v);
 
+      if (Options["UCI_ShowWDL"])
+          ss << UCI::wdl(v, pos.game_ply());
+
       if (!tb && i == pvIdx)
           ss << (v >= beta ? " lowerbound" : v <= alpha ? " upperbound" : "");
 
diff --git a/src/uci.cpp b/src/uci.cpp
index 13888d1a..a95a629d 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -19,6 +19,7 @@
 */
 
 #include <cassert>
+#include <cmath>
 #include <iostream>
 #include <sstream>
 #include <string>
@@ -221,6 +222,28 @@ namespace {
          << "\nNodes/second    : " << 1000 * nodes / elapsed << endl;
   }
 
+  // The win rate model returns the probability (per mille) of winning given an eval
+  // and a game-ply. The model fits rather accurately the LTC fishtest statistics.
+  int win_rate_model(Value v, int ply) {
+
+     // The model captures only up to 240 plies, so limit input (and rescale)
+     double m = std::min(240, ply) / 64.0;
+
+     // Coefficients of a 3rd order polynomial fit based on fishtest data
+     // for two parameters needed to transform eval to the argument of a
+     // logistic function.
+     double as[] = {-8.24404295, 64.23892342, -95.73056462, 153.86478679};
+     double bs[] = {-3.37154371, 28.44489198, -56.67657741,  72.05858751};
+     double a = (((as[0] * m + as[1]) * m + as[2]) * m) + as[3];
+     double b = (((bs[0] * m + bs[1]) * m + bs[2]) * m) + bs[3];
+
+     // Transform eval to centipawns with limited range
+     double x = Utility::clamp(double(100 * v) / PawnValueEg, -1000.0, 1000.0);
+
+     // Return win rate in per mille (rounded to nearest)
+     return int(0.5 + 1000 / (1 + std::exp((a - x) / b)));
+  }
+
 // When you calculate check sum, save it and check the consistency later.
   uint64_t eval_sum;
 } // namespace
@@ -437,6 +460,22 @@ string UCI::value(Value v) {
 }
 
 
+/// UCI::wdl() report WDL statistics given an evaluation and a game ply, based on
+/// data gathered for fishtest LTC games.
+
+string UCI::wdl(Value v, int ply) {
+
+  stringstream ss;
+
+  int wdl_w = win_rate_model( v, ply);
+  int wdl_l = win_rate_model(-v, ply);
+  int wdl_d = 1000 - wdl_w - wdl_l;
+  ss << " wdl " << wdl_w << " " << wdl_d << " " << wdl_l;
+
+  return ss.str();
+}
+
+
 /// UCI::square() converts a Square to a string in algebraic notation (g1, a7, etc.)
 
 std::string UCI::square(Square s) {
diff --git a/src/uci.h b/src/uci.h
index d255db76..5073262e 100644
--- a/src/uci.h
+++ b/src/uci.h
@@ -73,6 +73,7 @@ std::string value(Value v);
 std::string square(Square s);
 std::string move(Move m, bool chess960);
 std::string pv(const Position& pos, Depth depth, Value alpha, Value beta);
+std::string wdl(Value v, int ply);
 Move to_move(const Position& pos, std::string& str);
 
 // Flag that read the evaluation function. This is set to false when evaldir is changed.
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index c24884ce..d63caa9f 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -75,6 +75,7 @@ void init(OptionsMap& o) {
   o["UCI_AnalyseMode"]       << Option(false);
   o["UCI_LimitStrength"]     << Option(false);
   o["UCI_Elo"]               << Option(1350, 1350, 2850);
+  o["UCI_ShowWDL"]           << Option(false);
   o["SyzygyPath"]            << Option("<empty>", on_tb_path);
   o["SyzygyProbeDepth"]      << Option(1, 1, 100);
   o["Syzygy50MoveRule"]      << Option(true);

From 85c802d0b9547fc305538c4ae26eee873d02835d Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Mon, 6 Jul 2020 11:07:46 +0900
Subject: [PATCH 086/583] Revert "use winning_percentage_wdl in learn"

This reverts commit c964e902c594ec69ab5d82a380ca292ba28d551c.

# Conflicts:
#	src/uci.cpp
---
 src/learn/learner.cpp | 40 ++++++++--------------------------------
 src/uci.cpp           | 26 --------------------------
 src/uci.h             |  1 -
 3 files changed, 8 insertions(+), 59 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 93b54ab2..1d724266 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -116,7 +116,6 @@ bool use_draw_in_training_data_generation = false;
 bool use_draw_in_training = false;
 bool use_draw_in_validation = false;
 bool use_hash_in_training = true;
-bool use_wdl = false;
 
 // -----------------------------------
 // write phase file
@@ -1026,16 +1025,6 @@ double sigmoid(double x)
 	return 1.0 / (1.0 + std::exp(-x));
 }
 
-// A function that converts the evaluation value to the winning rate [0,1]
-double winning_percentage_wdl(Value value, int ply)
-{
-	double wdl_w = UCI::win_rate_model( value, ply);
-	double wdl_l = UCI::win_rate_model(-value, ply);
-	double wdl_d = 1000.0 - wdl_w - wdl_l;
-
-	return (wdl_w + wdl_d / 2.0) / 1000.0;
-}
-
 // A function that converts the evaluation value to the winning rate [0,1]
 double winning_percentage(double value)
 {
@@ -1044,18 +1033,6 @@ double winning_percentage(double value)
 	// = sigmoid(Eval/4*ln(10))
 	return sigmoid(value / PawnValueEg / 4.0 * log(10.0));
 }
-
-// A function that converts the evaluation value to the winning rate [0,1]
-double winning_percentage(Value value, int ply)
-{
-	if (use_wdl) {
-		return winning_percentage_wdl(value, ply);
-	}
-	else {
-		return winning_percentage(value);
-	}
-}
-
 double dsigmoid(double x)
 {
 	// Sigmoid function
@@ -1092,8 +1069,8 @@ double calc_grad(Value deep, Value shallow, PackedSfenValue& psv)
 	// Also, the coefficient of 1/m is unnecessary if you use the update formula that has the automatic gradient adjustment function like Adam and AdaGrad.
 	// Therefore, it is not necessary to save it in memory.
 
-	double p = winning_percentage(deep, psv.gamePly);
-	double q = winning_percentage(shallow, psv.gamePly);
+	double p = winning_percentage(deep);
+	double q = winning_percentage(shallow);
 	return (q - p) * dsigmoid(double(shallow) / 600.0);
 }
 #endif
@@ -1118,8 +1095,8 @@ double calc_grad(Value deep, Value shallow, const PackedSfenValue& psv)
 	// = ...
 	// = q-p.
 
-	double p = winning_percentage(deep, psv.gamePly);
-	double q = winning_percentage(shallow, psv.gamePly);
+	double p = winning_percentage(deep);
+	double q = winning_percentage(shallow);
 
 	return q - p;
 }
@@ -1150,8 +1127,8 @@ double calc_grad(Value deep, Value shallow , const PackedSfenValue& psv)
 	// elmo (WCSC27) method
 	// Correct with the actual game wins and losses.
 
-	const double q = winning_percentage(shallow, psv.gamePly);
-	const double p = winning_percentage(deep, psv.gamePly);
+	const double q = winning_percentage(shallow);
+	const double p = winning_percentage(deep);
 
 	// Use 1 as the correction term if the expected win rate is 1, 0 if you lose, and 0.5 if you draw.
 	// game_result = 1,0,-1 so add 1 and divide by 2.
@@ -1173,8 +1150,8 @@ void calc_cross_entropy(Value deep, Value shallow, const PackedSfenValue& psv,
 	double& cross_entropy_eval, double& cross_entropy_win, double& cross_entropy,
 	double& entropy_eval, double& entropy_win, double& entropy)
 {
-	const double p /* teacher_winrate */ = winning_percentage(deep, psv.gamePly);
-	const double q /* eval_winrate    */ = winning_percentage(shallow, psv.gamePly);
+	const double p /* teacher_winrate */ = winning_percentage(deep);
+	const double q /* eval_winrate    */ = winning_percentage(shallow);
 	const double t = double(psv.game_result + 1) / 2;
 
 	constexpr double epsilon = 0.000001;
@@ -2943,7 +2920,6 @@ void learn(Position&, istringstream& is)
 		else if (option == "use_draw_in_training") is >> use_draw_in_training;
 		else if (option == "use_draw_in_validation") is >> use_draw_in_validation;
 		else if (option == "use_hash_in_training") is >> use_hash_in_training;
-		else if (option == "use_wdl") is >> use_wdl;
 		// Discount rate
 		else if (option == "discount_rate") is >> discount_rate;
 
diff --git a/src/uci.cpp b/src/uci.cpp
index e28dba32..a95a629d 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -248,32 +248,6 @@ namespace {
   uint64_t eval_sum;
 } // namespace
 
-
-namespace UCI{
-  // The win rate model returns the probability (per mille) of winning given an eval
-  // and a game-ply. The model fits rather accurately the LTC fishtest statistics.
-  int win_rate_model(Value v, int ply) {
-
-     // The model captures only up to 240 plies, so limit input (and rescale)
-     double m = std::min(240, ply) / 64.0;
-
-     // Coefficients of a 3rd order polynomial fit based on fishtest data
-     // for two parameters needed to transform eval to the argument of a
-     // logistic function.
-     double as[] = {-8.24404295, 64.23892342, -95.73056462, 153.86478679};
-     double bs[] = {-3.37154371, 28.44489198, -56.67657741,  72.05858751};
-     double a = (((as[0] * m + as[1]) * m + as[2]) * m) + as[3];
-     double b = (((bs[0] * m + bs[1]) * m + bs[2]) * m) + bs[3];
-
-     // Transform eval to centipawns with limited range
-     double x = Utility::clamp(double(100 * v) / PawnValueEg, -1000.0, 1000.0);
-
-     // Return win rate in per mille (rounded to nearest)
-     return int(0.5 + 1000 / (1 + std::exp((a - x) / b)));
-  }
-} // namespace UCI
-
-
 // Make is_ready_cmd() callable from outside. (Because I want to call it from the bench command etc.)
 // Note that the phase is not initialized.
 void is_ready(bool skipCorruptCheck)
diff --git a/src/uci.h b/src/uci.h
index 8e12c856..5073262e 100644
--- a/src/uci.h
+++ b/src/uci.h
@@ -74,7 +74,6 @@ std::string square(Square s);
 std::string move(Move m, bool chess960);
 std::string pv(const Position& pos, Depth depth, Value alpha, Value beta);
 std::string wdl(Value v, int ply);
-int win_rate_model(Value v, int ply);
 Move to_move(const Position& pos, std::string& str);
 
 // Flag that read the evaluation function. This is set to false when evaldir is changed.

From 288fdc55977cba92de1213159d43ddca1b5b6a48 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Mon, 6 Jul 2020 17:38:43 +0900
Subject: [PATCH 087/583] Added "nodes" option to the "gensfen" command to
 specify the number of the nodes to be searched.

---
 src/learn/learner.cpp | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 1d724266..94991948 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -335,6 +335,10 @@ struct MultiThinkGenSfen : public MultiThink
 	int search_depth;
 	int search_depth2;
 
+	// Number of the nodes to be searched.
+	// 0 represents no limits.
+	uint64_t nodes;
+
 	// Upper limit of evaluation value of generated situation
 	int eval_limit;
 
@@ -553,7 +557,7 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 				// search_depth～search_depth2 Evaluation value of hand reading and PV (best responder row)
 				// There should be no problem if you narrow the search window.
 
-				auto pv_value1 = search(pos, depth);
+				auto pv_value1 = search(pos, depth, 1, nodes);
 
 				auto value1 = pv_value1.first;
 				auto& pv1 = pv_value1.second;
@@ -850,6 +854,9 @@ void gen_sfen(Position&, istringstream& is)
 	int search_depth = 3;
 	int search_depth2 = INT_MIN;
 
+	// Number of nodes to be searched.
+	uint64_t nodes = 0;
+
 	// minimum ply, maximum ply and number of random moves
 	int random_move_minply = 1;
 	int random_move_maxply = 24;
@@ -895,6 +902,8 @@ void gen_sfen(Position&, istringstream& is)
 			is >> search_depth;
 		else if (token == "depth2")
 			is >> search_depth2;
+		else if (token == "nodes")
+			is >> nodes;
 		else if (token == "loop")
 			is >> loop_max;
 		else if (token == "output_file_name")
@@ -964,6 +973,7 @@ void gen_sfen(Position&, istringstream& is)
 
 	std::cout << "gensfen : " << endl
 		<< "  search_depth = " << search_depth << " to " << search_depth2 << endl
+		<< "  nodes = " << nodes << endl
 		<< "  loop_max = " << loop_max << endl
 		<< "  eval_limit = " << eval_limit << endl
 		<< "  thread_num (set by USI setoption) = " << thread_num << endl
@@ -988,6 +998,7 @@ void gen_sfen(Position&, istringstream& is)
 		sw.save_every = save_every;
 
 		MultiThinkGenSfen multi_think(search_depth, search_depth2, sw);
+		multi_think.nodes = nodes;
 		multi_think.set_loop_max(loop_max);
 		multi_think.eval_limit = eval_limit;
 		multi_think.random_move_minply = random_move_minply;

From a5af8510a56c952bf37e4efb5241142f2206b822 Mon Sep 17 00:00:00 2001
From: joergoster <osterj165@googlemail.com>
Date: Mon, 6 Jul 2020 17:10:31 +0200
Subject: [PATCH 088/583] Rework loading the net.

---
 src/eval/nnue/evaluate_nnue.cpp | 38 ++++++++---------
 src/learn/learner.cpp           |  6 +--
 src/learn/multi_think.cpp       |  2 +-
 src/uci.cpp                     | 75 +++++++++++++--------------------
 src/uci.h                       |  2 +-
 src/ucioption.cpp               |  2 +-
 6 files changed, 52 insertions(+), 73 deletions(-)

diff --git a/src/eval/nnue/evaluate_nnue.cpp b/src/eval/nnue/evaluate_nnue.cpp
index 6b3f0b2f..a573c9c7 100644
--- a/src/eval/nnue/evaluate_nnue.cpp
+++ b/src/eval/nnue/evaluate_nnue.cpp
@@ -233,31 +233,27 @@ void prefetch_evalhash(const Key key) {
 // Save and restore Options with bench command etc., so EvalDir is changed at this time,
 // This function may be called twice to flag that the evaluation function needs to be reloaded.
 void load_eval() {
+
+  if (Options["SkipLoadingEval"])
+  {
+      std::cout << "info string SkipLoadingEval set to true, Net not loaded!" << std::endl;
+      return;
+  }
+
   NNUE::Initialize();
 
-  if (!Options["SkipLoadingEval"])
-  {
-    const std::string dir_name = Options["EvalDir"];
-    const std::string file_name = Path::Combine(dir_name, NNUE::kFileName);
-    //{
-    //  std::ofstream stream(file_name, std::ios::binary);
-    //  NNUE::WriteParameters(stream);
-    //}
-    std::ifstream stream(file_name, std::ios::binary);
-    const bool result = NNUE::ReadParameters(stream);
+  const std::string dir_name = Options["EvalDir"];
+  const std::string file_name = Path::Combine(dir_name, NNUE::kFileName);
+
+  std::ifstream stream(file_name, std::ios::binary);
+  const bool result = NNUE::ReadParameters(stream);
+
+  if (!result)
+      // It's a problem if it doesn't finish when there is a read error.
+      std::cout << "Error! " << NNUE::kFileName << " not found or wrong format" << std::endl;
 
-//    ASSERT(result);
-	if (!result)
-	{
-		// It's a problem if it doesn't finish when there is a read error.
-		std::cout << "Error! " << NNUE::kFileName << " not found or wrong format" << std::endl;
-		//my_exit();
-	}
-	else
-	  std::cout << "info string NNUE " << NNUE::kFileName << " found & loaded" << std::endl;
-  }
   else
-    std::cout << "info string NNUE " << NNUE::kFileName << " not loaded" << std::endl;
+      std::cout << "info string NNUE " << NNUE::kFileName << " found & loaded" << std::endl;
 }
 
 // Initialization
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 94991948..58719821 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -3092,7 +3092,7 @@ void learn(Position&, istringstream& is)
 	//}
 	if (use_convert_bin)
 	{
-	  	is_ready(true);
+	  	init_nnue(true);
 		cout << "convert_bin.." << endl;
 		convert_bin(filenames,output_file_name, ply_minimum, ply_maximum, interpolate_eval);
 		return;
@@ -3100,7 +3100,7 @@ void learn(Position&, istringstream& is)
 	}
 	if (use_convert_bin_from_pgn_extract)
 	{
-		is_ready(true);
+		init_nnue(true);
 		cout << "convert_bin_from_pgn-extract.." << endl;
 		convert_bin_from_pgn_extract(filenames, output_file_name, pgn_eval_side_to_move);
 		return;
@@ -3166,7 +3166,7 @@ void learn(Position&, istringstream& is)
 	cout << "init.." << endl;
 
 	// Read evaluation function parameters
-	is_ready(true);
+	init_nnue(true);
 
 #if !defined(EVAL_NNUE)
 	cout << "init_grad.." << endl;
diff --git a/src/learn/multi_think.cpp b/src/learn/multi_think.cpp
index d511c277..ba2c47d4 100644
--- a/src/learn/multi_think.cpp
+++ b/src/learn/multi_think.cpp
@@ -20,7 +20,7 @@ void MultiThink::go_think()
 	// Read evaluation function, etc.
 	// In the case of the learn command, the value of the evaluation function may be corrected after reading the evaluation function, so
 	// Skip memory corruption check.
-	is_ready(true);
+	init_nnue(true);
 
 	// Call the derived class's init().
 	init();
diff --git a/src/uci.cpp b/src/uci.cpp
index a95a629d..6d86ebca 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -73,7 +73,7 @@ namespace Learner
 void test_cmd(Position& pos, istringstream& is)
 {
     // Initialize as it may be searched.
-    is_ready();
+    init_nnue();
 
     std::string param;
     is >> param;
@@ -209,7 +209,14 @@ namespace {
         }
         else if (token == "setoption")  setoption(is);
         else if (token == "position")   position(pos, is, states);
-        else if (token == "ucinewgame") { Search::clear(); elapsed = now(); } // Search::clear() may take some while
+        else if (token == "ucinewgame")
+        {
+#if defined(EVAL_NNUE)
+            init_nnue();
+#endif
+            Search::clear();
+            elapsed = now(); // Search::clear() may take some while
+        }
     }
 
     elapsed = now() - elapsed + 1; // Ensure positivity to avoid a 'divide by zero'
@@ -250,7 +257,7 @@ namespace {
 
 // Make is_ready_cmd() callable from outside. (Because I want to call it from the bench command etc.)
 // Note that the phase is not initialized.
-void is_ready(bool skipCorruptCheck)
+void init_nnue(bool skipCorruptCheck)
 {
 #if defined(EVAL_NNUE)
   // After receiving "isready", modify so that a line feed is sent every 5 seconds until "readyok" is returned. (keep alive processing)
@@ -260,59 +267,29 @@ void is_ready(bool skipCorruptCheck)
   // -Shogi GUI already does so, so MyShogi will follow along.
   //-Also, the engine side of Yaneura King modifies it so that after "isready" is received, a line feed is sent every 5 seconds until "readyok" is returned.
 
-  auto ended = false;
-  auto th = std::thread([&ended] {
-    int count = 0;
-    while (!ended)
-    {
-      std::this_thread::sleep_for(std::chrono::milliseconds(100));
-      if (++count >= 50 /* 5 seconds */)
-      {
-        count = 0;
-        sync_cout << sync_endl; // Send a line break.
-      }
-    }
-    });
-
   // Perform processing that may take time, such as reading the evaluation function, at this timing.
   // If you do a time-consuming process at startup, Shogi place will make a timeout judgment and retire the recognition as a thinking engine.
   if (!UCI::load_eval_finished)
   {
-    // Read evaluation function
-    Eval::load_eval();
+      // Read evaluation function
+      Eval::load_eval();
 
-    // Calculate and save checksum (to check for subsequent memory corruption)
-    eval_sum = Eval::calc_check_sum();
+      // Calculate and save checksum (to check for subsequent memory corruption)
+      eval_sum = Eval::calc_check_sum();
 
-    // display soft name
-    Eval::print_softname(eval_sum);
-
-    UCI::load_eval_finished = true;
+      // display soft name
+      Eval::print_softname(eval_sum);
 
+      UCI::load_eval_finished = true;
   }
   else
   {
-    // Check the checksum every time to see if the memory has been corrupted.
-    // It seems that the time is a little wasteful, but it is good because it is about 0.1 seconds.
-    if (!skipCorruptCheck && eval_sum != Eval::calc_check_sum())
-      sync_cout << "Error! : EVAL memory is corrupted" << sync_endl;
+      // Check the checksum every time to see if the memory has been corrupted.
+      // It seems that the time is a little wasteful, but it is good because it is about 0.1 seconds.
+      if (!skipCorruptCheck && eval_sum != Eval::calc_check_sum())
+          sync_cout << "Error! : EVAL memory is corrupted" << sync_endl;
   }
-
-  // For isready, it is promised that the next command will not come until it returns readyok.
-  // Initialize various variables at this timing.
-
-  TT.resize(Options["Hash"]);
-  Search::clear();
-  Time.availableNodes = 0;
-
-  Threads.stop = false;
-
-  // Terminate the thread created to send keep alive and wait.
-  ended = true;
-  th.join();
 #endif  // defined(EVAL_NNUE)
-
-  sync_cout << "readyok" << sync_endl;
 }
 
 
@@ -399,8 +376,14 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "setoption")  setoption(is);
       else if (token == "go")         go(pos, is, states);
       else if (token == "position")   position(pos, is, states);
-      else if (token == "ucinewgame") Search::clear();
-      else if (token == "isready")    is_ready();
+      else if (token == "ucinewgame")
+      {
+#if defined(EVAL_NNUE)
+          init_nnue();
+#endif
+          Search::clear();
+      }
+      else if (token == "isready")    sync_cout << "readyok" << sync_endl;
 
       // Additional custom non-UCI commands, mainly for debugging.
       // Do not use these commands during a search!
diff --git a/src/uci.h b/src/uci.h
index 5073262e..6529f90c 100644
--- a/src/uci.h
+++ b/src/uci.h
@@ -87,7 +87,7 @@ extern UCI::OptionsMap Options;
 // If skipCorruptCheck == true, skip memory corruption check by check sum when reading the evaluation function a second time.
 // * This function is inconvenient if it is not available in Stockfish, so add it.
 
-void is_ready(bool skipCorruptCheck = false);
+void init_nnue(bool skipCorruptCheck = false);
 
 extern const char* StartFEN;
 
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index d63caa9f..f067a875 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -42,7 +42,7 @@ void on_hash_size(const Option& o) { TT.resize(size_t(o)); }
 void on_logger(const Option& o) { start_logger(o); }
 void on_threads(const Option& o) { Threads.set(size_t(o)); }
 void on_tb_path(const Option& o) { Tablebases::init(o); }
-void on_eval_dir(const Option& o) { load_eval_finished = false; }
+void on_eval_dir(const Option& o) { load_eval_finished = false; init_nnue(); }
 
 
 /// Our case insensitive less() function as required by UCI protocol

From cd55c268cbada104d8fdb55ce8b9221fc59c08b7 Mon Sep 17 00:00:00 2001
From: joergoster <osterj165@googlemail.com>
Date: Tue, 7 Jul 2020 11:27:16 +0200
Subject: [PATCH 089/583] Bugfix. Otherwise creating a new net fails.

---
 src/eval/nnue/evaluate_nnue.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/eval/nnue/evaluate_nnue.cpp b/src/eval/nnue/evaluate_nnue.cpp
index a573c9c7..bc06ea31 100644
--- a/src/eval/nnue/evaluate_nnue.cpp
+++ b/src/eval/nnue/evaluate_nnue.cpp
@@ -234,14 +234,15 @@ void prefetch_evalhash(const Key key) {
 // This function may be called twice to flag that the evaluation function needs to be reloaded.
 void load_eval() {
 
+  // Must be done!
+  NNUE::Initialize();
+
   if (Options["SkipLoadingEval"])
   {
       std::cout << "info string SkipLoadingEval set to true, Net not loaded!" << std::endl;
       return;
   }
 
-  NNUE::Initialize();
-
   const std::string dir_name = Options["EvalDir"];
   const std::string file_name = Path::Combine(dir_name, NNUE::kFileName);
 

From c59583bbf06e69cb367b835b0d5e8f2555bb77c2 Mon Sep 17 00:00:00 2001
From: FireFather <firefather@telenet.be>
Date: Tue, 7 Jul 2020 23:25:20 +0200
Subject: [PATCH 090/583] UCI option EvalFile

Replace EvalDir with EvalFile
Can now browse filesystem for net (eval\nn.bin is default)
nn.bin no longer hard-coded
---
 src/eval/nnue/evaluate_nnue.cpp |  6 +++---
 src/eval/nnue/evaluate_nnue.h   |  2 +-
 src/ucioption.cpp               | 10 +++++-----
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/eval/nnue/evaluate_nnue.cpp b/src/eval/nnue/evaluate_nnue.cpp
index bc06ea31..27c79605 100644
--- a/src/eval/nnue/evaluate_nnue.cpp
+++ b/src/eval/nnue/evaluate_nnue.cpp
@@ -23,7 +23,7 @@ AlignedPtr<FeatureTransformer> feature_transformer;
 AlignedPtr<Network> network;
 
 // Evaluation function file name
-const char* const kFileName = "nn.bin";
+const char* kFileName = "eval\\nn.bin";
 
 // Get a string that represents the structure of the evaluation function
 std::string GetArchitectureString() {
@@ -243,8 +243,8 @@ void load_eval() {
       return;
   }
 
-  const std::string dir_name = Options["EvalDir"];
-  const std::string file_name = Path::Combine(dir_name, NNUE::kFileName);
+  const std::string file_name = Options["EvalFile"];
+  NNUE::kFileName = file_name.c_str();
 
   std::ifstream stream(file_name, std::ios::binary);
   const bool result = NNUE::ReadParameters(stream);
diff --git a/src/eval/nnue/evaluate_nnue.h b/src/eval/nnue/evaluate_nnue.h
index ee498f51..6fb45277 100644
--- a/src/eval/nnue/evaluate_nnue.h
+++ b/src/eval/nnue/evaluate_nnue.h
@@ -36,7 +36,7 @@ extern AlignedPtr<FeatureTransformer> feature_transformer;
 extern AlignedPtr<Network> network;
 
 // Evaluation function file name
-extern const char* const kFileName;
+extern const char* kFileName;
 
 // Get a string that represents the structure of the evaluation function
 std::string GetArchitectureString();
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index f067a875..d98d82b1 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -42,7 +42,7 @@ void on_hash_size(const Option& o) { TT.resize(size_t(o)); }
 void on_logger(const Option& o) { start_logger(o); }
 void on_threads(const Option& o) { Threads.set(size_t(o)); }
 void on_tb_path(const Option& o) { Tablebases::init(o); }
-void on_eval_dir(const Option& o) { load_eval_finished = false; init_nnue(); }
+void on_eval_file(const Option& o) { load_eval_finished = false; init_nnue(); }
 
 
 /// Our case insensitive less() function as required by UCI protocol
@@ -80,12 +80,12 @@ void init(OptionsMap& o) {
   o["SyzygyProbeDepth"]      << Option(1, 1, 100);
   o["Syzygy50MoveRule"]      << Option(true);
   o["SyzygyProbeLimit"]      << Option(7, 0, 7);
-  // Evaluation function folder. When this is changed, it is necessary to reread the evaluation function at the next isready timing.
-  o["EvalDir"]               << Option("eval", on_eval_dir);
-  // When the evaluation function is loaded at the isready timing, it is necessary to convert the new evaluation function.
+  // Evaluation function file name. When this is changed, it is necessary to reread the evaluation function at the next ucinewgame timing.
+  o["EvalFile"]              << Option("eval\\nn.bin", on_eval_file);
+  // When the evaluation function is loaded at the ucinewgame timing, it is necessary to convert the new evaluation function.
   // I want to hit the test eval convert command, but there is no new evaluation function
   // It ends abnormally before executing this command.
-  // Therefore, with this hidden option, you can suppress the loading of the evaluation function when isready,
+  // Therefore, with this hidden option, you can suppress the loading of the evaluation function when ucinewgame,
   // Hit the test eval convert command.
   o["SkipLoadingEval"]       << Option(false);
   // how many moves to use a fixed move

From d1760a1f15a9f82e64b01ab18f6a850d4ddb2e1a Mon Sep 17 00:00:00 2001
From: FireFather <firefather@telenet.be>
Date: Wed, 8 Jul 2020 04:23:50 +0200
Subject: [PATCH 091/583] update evaluate_nnue.cpp

rename kFileName and change to std:string
---
 src/eval/nnue/evaluate_nnue.cpp     | 8 ++++----
 src/eval/nnue/evaluate_nnue.h       | 2 +-
 src/eval/nnue/nnue_test_command.cpp | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/eval/nnue/evaluate_nnue.cpp b/src/eval/nnue/evaluate_nnue.cpp
index 27c79605..d0c04958 100644
--- a/src/eval/nnue/evaluate_nnue.cpp
+++ b/src/eval/nnue/evaluate_nnue.cpp
@@ -23,7 +23,7 @@ AlignedPtr<FeatureTransformer> feature_transformer;
 AlignedPtr<Network> network;
 
 // Evaluation function file name
-const char* kFileName = "eval\\nn.bin";
+std::string fileName = "eval\\nn.bin";
 
 // Get a string that represents the structure of the evaluation function
 std::string GetArchitectureString() {
@@ -244,17 +244,17 @@ void load_eval() {
   }
 
   const std::string file_name = Options["EvalFile"];
-  NNUE::kFileName = file_name.c_str();
+  NNUE::fileName = file_name;
 
   std::ifstream stream(file_name, std::ios::binary);
   const bool result = NNUE::ReadParameters(stream);
 
   if (!result)
       // It's a problem if it doesn't finish when there is a read error.
-      std::cout << "Error! " << NNUE::kFileName << " not found or wrong format" << std::endl;
+      std::cout << "Error! " << NNUE::fileName << " not found or wrong format" << std::endl;
 
   else
-      std::cout << "info string NNUE " << NNUE::kFileName << " found & loaded" << std::endl;
+      std::cout << "info string NNUE " << NNUE::fileName << " found & loaded" << std::endl;
 }
 
 // Initialization
diff --git a/src/eval/nnue/evaluate_nnue.h b/src/eval/nnue/evaluate_nnue.h
index 6fb45277..97ce3df8 100644
--- a/src/eval/nnue/evaluate_nnue.h
+++ b/src/eval/nnue/evaluate_nnue.h
@@ -36,7 +36,7 @@ extern AlignedPtr<FeatureTransformer> feature_transformer;
 extern AlignedPtr<Network> network;
 
 // Evaluation function file name
-extern const char* kFileName;
+extern std::string fileName;
 
 // Get a string that represents the structure of the evaluation function
 std::string GetArchitectureString();
diff --git a/src/eval/nnue/nnue_test_command.cpp b/src/eval/nnue/nnue_test_command.cpp
index 46bc97de..b0c57d4c 100644
--- a/src/eval/nnue/nnue_test_command.cpp
+++ b/src/eval/nnue/nnue_test_command.cpp
@@ -190,7 +190,7 @@ void TestCommand(Position& pos, std::istream& stream) {
   } else {
     std::cout << "usage:" << std::endl;
     std::cout << " test nnue test_features" << std::endl;
-    std::cout << " test nnue info [path/to/" << kFileName << "...]" << std::endl;
+    std::cout << " test nnue info [path/to/" << fileName << "...]" << std::endl;
   }
 }
 

From f7420652b7f9da0df9096e3d721037e3cd011ec4 Mon Sep 17 00:00:00 2001
From: FireFather <firefather@telenet.be>
Date: Tue, 7 Jul 2020 23:25:20 +0200
Subject: [PATCH 092/583] UCI option EvalFile

Replace EvalDir with EvalFile
Can now browse filesystem for net (eval\nn.bin is default)
nn.bin no longer hard-coded
---
 src/eval/nnue/evaluate_nnue.cpp |  6 +++---
 src/eval/nnue/evaluate_nnue.h   |  2 +-
 src/ucioption.cpp               | 10 +++++-----
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/eval/nnue/evaluate_nnue.cpp b/src/eval/nnue/evaluate_nnue.cpp
index bc06ea31..27c79605 100644
--- a/src/eval/nnue/evaluate_nnue.cpp
+++ b/src/eval/nnue/evaluate_nnue.cpp
@@ -23,7 +23,7 @@ AlignedPtr<FeatureTransformer> feature_transformer;
 AlignedPtr<Network> network;
 
 // Evaluation function file name
-const char* const kFileName = "nn.bin";
+const char* kFileName = "eval\\nn.bin";
 
 // Get a string that represents the structure of the evaluation function
 std::string GetArchitectureString() {
@@ -243,8 +243,8 @@ void load_eval() {
       return;
   }
 
-  const std::string dir_name = Options["EvalDir"];
-  const std::string file_name = Path::Combine(dir_name, NNUE::kFileName);
+  const std::string file_name = Options["EvalFile"];
+  NNUE::kFileName = file_name.c_str();
 
   std::ifstream stream(file_name, std::ios::binary);
   const bool result = NNUE::ReadParameters(stream);
diff --git a/src/eval/nnue/evaluate_nnue.h b/src/eval/nnue/evaluate_nnue.h
index ee498f51..6fb45277 100644
--- a/src/eval/nnue/evaluate_nnue.h
+++ b/src/eval/nnue/evaluate_nnue.h
@@ -36,7 +36,7 @@ extern AlignedPtr<FeatureTransformer> feature_transformer;
 extern AlignedPtr<Network> network;
 
 // Evaluation function file name
-extern const char* const kFileName;
+extern const char* kFileName;
 
 // Get a string that represents the structure of the evaluation function
 std::string GetArchitectureString();
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index f067a875..d98d82b1 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -42,7 +42,7 @@ void on_hash_size(const Option& o) { TT.resize(size_t(o)); }
 void on_logger(const Option& o) { start_logger(o); }
 void on_threads(const Option& o) { Threads.set(size_t(o)); }
 void on_tb_path(const Option& o) { Tablebases::init(o); }
-void on_eval_dir(const Option& o) { load_eval_finished = false; init_nnue(); }
+void on_eval_file(const Option& o) { load_eval_finished = false; init_nnue(); }
 
 
 /// Our case insensitive less() function as required by UCI protocol
@@ -80,12 +80,12 @@ void init(OptionsMap& o) {
   o["SyzygyProbeDepth"]      << Option(1, 1, 100);
   o["Syzygy50MoveRule"]      << Option(true);
   o["SyzygyProbeLimit"]      << Option(7, 0, 7);
-  // Evaluation function folder. When this is changed, it is necessary to reread the evaluation function at the next isready timing.
-  o["EvalDir"]               << Option("eval", on_eval_dir);
-  // When the evaluation function is loaded at the isready timing, it is necessary to convert the new evaluation function.
+  // Evaluation function file name. When this is changed, it is necessary to reread the evaluation function at the next ucinewgame timing.
+  o["EvalFile"]              << Option("eval\\nn.bin", on_eval_file);
+  // When the evaluation function is loaded at the ucinewgame timing, it is necessary to convert the new evaluation function.
   // I want to hit the test eval convert command, but there is no new evaluation function
   // It ends abnormally before executing this command.
-  // Therefore, with this hidden option, you can suppress the loading of the evaluation function when isready,
+  // Therefore, with this hidden option, you can suppress the loading of the evaluation function when ucinewgame,
   // Hit the test eval convert command.
   o["SkipLoadingEval"]       << Option(false);
   // how many moves to use a fixed move

From ec3eaad64f32e26404313b7a38a273f9dfae92b7 Mon Sep 17 00:00:00 2001
From: FireFather <firefather@telenet.be>
Date: Wed, 8 Jul 2020 04:23:50 +0200
Subject: [PATCH 093/583] update evaluate_nnue.cpp

rename kFileName and change to std:string
---
 src/eval/nnue/evaluate_nnue.cpp     | 8 ++++----
 src/eval/nnue/evaluate_nnue.h       | 2 +-
 src/eval/nnue/nnue_test_command.cpp | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/eval/nnue/evaluate_nnue.cpp b/src/eval/nnue/evaluate_nnue.cpp
index 27c79605..d0c04958 100644
--- a/src/eval/nnue/evaluate_nnue.cpp
+++ b/src/eval/nnue/evaluate_nnue.cpp
@@ -23,7 +23,7 @@ AlignedPtr<FeatureTransformer> feature_transformer;
 AlignedPtr<Network> network;
 
 // Evaluation function file name
-const char* kFileName = "eval\\nn.bin";
+std::string fileName = "eval\\nn.bin";
 
 // Get a string that represents the structure of the evaluation function
 std::string GetArchitectureString() {
@@ -244,17 +244,17 @@ void load_eval() {
   }
 
   const std::string file_name = Options["EvalFile"];
-  NNUE::kFileName = file_name.c_str();
+  NNUE::fileName = file_name;
 
   std::ifstream stream(file_name, std::ios::binary);
   const bool result = NNUE::ReadParameters(stream);
 
   if (!result)
       // It's a problem if it doesn't finish when there is a read error.
-      std::cout << "Error! " << NNUE::kFileName << " not found or wrong format" << std::endl;
+      std::cout << "Error! " << NNUE::fileName << " not found or wrong format" << std::endl;
 
   else
-      std::cout << "info string NNUE " << NNUE::kFileName << " found & loaded" << std::endl;
+      std::cout << "info string NNUE " << NNUE::fileName << " found & loaded" << std::endl;
 }
 
 // Initialization
diff --git a/src/eval/nnue/evaluate_nnue.h b/src/eval/nnue/evaluate_nnue.h
index 6fb45277..97ce3df8 100644
--- a/src/eval/nnue/evaluate_nnue.h
+++ b/src/eval/nnue/evaluate_nnue.h
@@ -36,7 +36,7 @@ extern AlignedPtr<FeatureTransformer> feature_transformer;
 extern AlignedPtr<Network> network;
 
 // Evaluation function file name
-extern const char* kFileName;
+extern std::string fileName;
 
 // Get a string that represents the structure of the evaluation function
 std::string GetArchitectureString();
diff --git a/src/eval/nnue/nnue_test_command.cpp b/src/eval/nnue/nnue_test_command.cpp
index 46bc97de..b0c57d4c 100644
--- a/src/eval/nnue/nnue_test_command.cpp
+++ b/src/eval/nnue/nnue_test_command.cpp
@@ -190,7 +190,7 @@ void TestCommand(Position& pos, std::istream& stream) {
   } else {
     std::cout << "usage:" << std::endl;
     std::cout << " test nnue test_features" << std::endl;
-    std::cout << " test nnue info [path/to/" << kFileName << "...]" << std::endl;
+    std::cout << " test nnue info [path/to/" << fileName << "...]" << std::endl;
   }
 }
 

From 76d124ed70f0f2be17207779606e1670263e41d2 Mon Sep 17 00:00:00 2001
From: FireFather <firefather@telenet.be>
Date: Wed, 8 Jul 2020 13:42:28 +0200
Subject: [PATCH 094/583] Update evaluate_nnue_learner.cpp

replace NNUE::kFileName with NNUE::fileName
---
 src/eval/nnue/evaluate_nnue_learner.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/eval/nnue/evaluate_nnue_learner.cpp b/src/eval/nnue/evaluate_nnue_learner.cpp
index 0a2077a7..28243871 100644
--- a/src/eval/nnue/evaluate_nnue_learner.cpp
+++ b/src/eval/nnue/evaluate_nnue_learner.cpp
@@ -110,7 +110,7 @@ void SetOptions(const std::string& options) {
 
 // Reread the evaluation function parameters for learning from the file
 void RestoreParameters(const std::string& dir_name) {
-  const std::string file_name = Path::Combine(dir_name, NNUE::kFileName);
+  const std::string file_name = NNUE::fileName;
   std::ifstream stream(file_name, std::ios::binary);
   bool result = ReadParameters(stream);
   assert(result);
@@ -213,7 +213,7 @@ void save_eval(std::string dir_name) {
     NNUE::SendMessages({{"clear_unobserved_feature_weights"}});
   }
 
-  const std::string file_name = Path::Combine(eval_dir, NNUE::kFileName);
+  const std::string file_name = NNUE::fileName;
   std::ofstream stream(file_name, std::ios::binary);
   const bool result = NNUE::WriteParameters(stream);
   assert(result);

From 821aaf3836e639c8521280b7d8f3b976348fcd16 Mon Sep 17 00:00:00 2001
From: FireFather <firefather@telenet.be>
Date: Wed, 8 Jul 2020 14:31:40 +0200
Subject: [PATCH 095/583] Update misc.cpp

do not clutter console window
remove "Windows large pages not used."
only show message when/if successful
---
 src/misc.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/misc.cpp b/src/misc.cpp
index 1d6bbb4f..6a7f178e 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -371,8 +371,8 @@ void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
   {
       if (mem)
           sync_cout << "info string Hash table allocation: Windows large pages used." << sync_endl;
-      else
-          sync_cout << "info string Hash table allocation: Windows large pages not used." << sync_endl;
+      //else
+          //sync_cout << "info string Hash table allocation: Windows large pages not used." << sync_endl;
   }
   firstCall = false;
 

From df9b2a87db1af229394f54129d09da11709442f0 Mon Sep 17 00:00:00 2001
From: FireFather <firefather@telenet.be>
Date: Wed, 8 Jul 2020 16:18:42 +0200
Subject: [PATCH 096/583] Update misc.cpp

change name to Stockfish+NNUE
and add 3 more authors
---
 src/misc.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/misc.cpp b/src/misc.cpp
index 6a7f178e..865e21fb 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -140,7 +140,7 @@ const string engine_info(bool to_uci) {
   string month, day, year;
   stringstream ss, date(__DATE__); // From compiler, format is "Sep 21 2008"
 
-  ss << "Stockfish " << Version << setfill('0');
+  ss << "Stockfish+NNUE " << Version << setfill('0');
 
   if (Version.empty())
   {
@@ -151,7 +151,7 @@ const string engine_info(bool to_uci) {
   ss << (Is64Bit ? " 64" : "")
      << (HasPext ? " BMI2" : (HasPopCnt ? " POPCNT" : ""))
      << (to_uci  ? "\nid author ": " by ")
-     << "T. Romstad, M. Costalba, J. Kiiski, G. Linscott";
+     << "T. Romstad, M. Costalba, J. Kiiski, G. Linscott, H. Noda, Y. Nasu, M. Isozaki";
 
   return ss.str();
 }

From a06234c639ee155c02e77cda92a31a503cf44c52 Mon Sep 17 00:00:00 2001
From: tttak <tttak@users.noreply.github.com>
Date: Wed, 8 Jul 2020 23:44:01 +0900
Subject: [PATCH 097/583] enable convert_plain

learn convert_plain output_file_name xxx.txt xxx.bin
---
 src/learn/learner.cpp | 85 ++++++++++++++++++++++---------------------
 1 file changed, 44 insertions(+), 41 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 58719821..bf4a3034 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -2759,39 +2759,43 @@ void convert_bin_from_pgn_extract(const vector<string>& filenames, const string&
 	ofs.close();
 }
 
-//void convert_plain(const vector<string>& filenames , const string& output_file_name)
-//{
-//	Position tpos;
-//	std::ofstream ofs;
-//	ofs.open(output_file_name, ios::app);
-//	for (auto filename : filenames) {
-//		std::cout << "convert " << filename << " ... ";
-//
-// 		// Just convert packedsfenvalue to text
-//		std::fstream fs;
-//		fs.open(filename, ios::in | ios::binary);
-//		PackedSfenValue p;
-//		while (true)
-//		{
-//			if (fs.read((char*)&p, sizeof(PackedSfenValue))) {
-// 				// write as plain text
-//				ofs << "sfen " << tpos.sfen_unpack(p.sfen) << std::endl;
-//				ofs << "move " << to_usi_string(Move(p.move)) << std::endl;
-//				ofs << "score " << p.score << std::endl;
-//				ofs << "ply " << int(p.gamePly) << std::endl;
-//				ofs << "result " << int(p.game_result) << std::endl;
-//				ofs << "e" << std::endl;
-//			}
-//			else {
-//				break;
-//			}
-//		}
-//		fs.close();
-//		std::cout << "done" << std::endl;
-//	}
-//	ofs.close();
-//	std::cout << "all done" << std::endl;
-//}
+void convert_plain(const vector<string>& filenames, const string& output_file_name)
+{
+	Position tpos;
+	std::ofstream ofs;
+	ofs.open(output_file_name, ios::app);
+	auto th = Threads.main();
+	for (auto filename : filenames) {
+		std::cout << "convert " << filename << " ... ";
+
+		// Just convert packedsfenvalue to text
+		std::fstream fs;
+		fs.open(filename, ios::in | ios::binary);
+		PackedSfenValue p;
+		while (true)
+		{
+			if (fs.read((char*)&p, sizeof(PackedSfenValue))) {
+				StateInfo si;
+				tpos.set_from_packed_sfen(p.sfen, &si, th, false);
+
+				// write as plain text
+				ofs << "fen " << tpos.fen() << std::endl;
+				ofs << "move " << UCI::move(Move(p.move), false) << std::endl;
+				ofs << "score " << p.score << std::endl;
+				ofs << "ply " << int(p.gamePly) << std::endl;
+				ofs << "result " << int(p.game_result) << std::endl;
+				ofs << "e" << std::endl;
+			}
+			else {
+				break;
+			}
+		}
+		fs.close();
+		std::cout << "done" << std::endl;
+	}
+	ofs.close();
+	std::cout << "all done" << std::endl;
+}
 
 // Learning from the generated game record
 void learn(Position&, istringstream& is)
@@ -3082,14 +3086,13 @@ void learn(Position&, istringstream& is)
 		shuffle_files_on_memory(filenames,output_file_name);
 		return;
 	}
-	//if (use_convert_plain)
-	//{
-	// 		is_ready(true);
-	//  cout << "convert_plain.." << endl;
-	//  convert_plain(filenames,output_file_name);
-	//  return;
-	//
-	//}
+	if (use_convert_plain)
+	{
+		init_nnue(true);
+		cout << "convert_plain.." << endl;
+		convert_plain(filenames, output_file_name);
+		return;
+	}
 	if (use_convert_bin)
 	{
 	  	init_nnue(true);

From b9a32fe3313e0369fc636415a7b292d2485d3e8f Mon Sep 17 00:00:00 2001
From: No name <no@email>
Date: Thu, 9 Jul 2020 22:09:26 +0300
Subject: [PATCH 098/583] Define USE_SSE2 for any x86-64 target

Rather than only when popcnt=yes
x86-64 instruction set includes SSE2.
---
 src/Makefile | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 1de2d243..eaa4c867 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -373,9 +373,9 @@ ifeq ($(popcnt),yes)
 	ifeq ($(arch),$(filter $(arch),ppc64 armv8-a))
 		CXXFLAGS += -DUSE_POPCNT
 	else ifeq ($(comp),icc)
-		CXXFLAGS += -msse3 -DUSE_POPCNT -DUSE_SSE2
+		CXXFLAGS += -msse3 -DUSE_POPCNT
 	else
-		CXXFLAGS += -msse3 -mpopcnt -DUSE_POPCNT -DUSE_SSE2
+		CXXFLAGS += -msse3 -mpopcnt -DUSE_POPCNT
 	endif
 endif
 
@@ -393,6 +393,10 @@ ifeq ($(sse41),yes)
 	endif
 endif
 
+ifeq ($(arch),x86_64)
+	CXXFLAGS += -DUSE_SSE2
+endif
+
 ### 3.7 pext
 ifeq ($(pext),yes)
 	CXXFLAGS += -DUSE_PEXT

From 081761d084452c2cb204614d14f919fc1c0cb567 Mon Sep 17 00:00:00 2001
From: No name <no@email>
Date: Thu, 9 Jul 2020 23:18:13 +0300
Subject: [PATCH 099/583] Add support for SSSE3-only compiles

For Core 2 Duo.

To compile:
make ARCH=x86-64 ssse3=yes nnue

No observable difference in speed to SSE4.1 on my machine.
---
 src/Makefile                             |  8 ++++++++
 src/eval/nnue/layers/affine_transform.h  |  4 ++--
 src/eval/nnue/layers/clipped_relu.h      | 15 ++++++++++++---
 src/eval/nnue/nnue_common.h              |  2 ++
 src/eval/nnue/nnue_feature_transformer.h | 18 ++++++++++++++----
 5 files changed, 38 insertions(+), 9 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index eaa4c867..aa5cc1ba 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -387,12 +387,20 @@ ifeq ($(avx2),yes)
 endif
 
 ifeq ($(sse41),yes)
+	ssse3 = yes
 	CXXFLAGS += -DUSE_SSE41
 	ifeq ($(comp),$(filter $(comp),gcc clang mingw msys2))
 		CXXFLAGS += -msse4.1
 	endif
 endif
 
+ifeq ($(ssse3),yes)
+	CXXFLAGS += -DUSE_SSSE3
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw msys2))
+		CXXFLAGS += -mssse3
+	endif
+endif
+
 ifeq ($(arch),x86_64)
 	CXXFLAGS += -DUSE_SSE2
 endif
diff --git a/src/eval/nnue/layers/affine_transform.h b/src/eval/nnue/layers/affine_transform.h
index c06af1a0..cb56b07d 100644
--- a/src/eval/nnue/layers/affine_transform.h
+++ b/src/eval/nnue/layers/affine_transform.h
@@ -86,7 +86,7 @@ class AffineTransform {
     constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
     const __m256i kOnes = _mm256_set1_epi16(1);
     const auto input_vector = reinterpret_cast<const __m256i*>(input);
-#elif defined(USE_SSE41)
+#elif defined(USE_SSSE3)
     constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
     const __m128i kOnes = _mm_set1_epi16(1);
     const auto input_vector = reinterpret_cast<const __m128i*>(input);
@@ -118,7 +118,7 @@ class AffineTransform {
       const __m128i lo = _mm256_extracti128_si256(sum, 0);
       const __m128i hi = _mm256_extracti128_si256(sum, 1);
       output[i] = _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi);
-#elif defined(USE_SSE41)
+#elif defined(USE_SSSE3)
       __m128i sum = _mm_cvtsi32_si128(biases_[i]);
       const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
       for (IndexType j = 0; j < kNumChunks; ++j) {
diff --git a/src/eval/nnue/layers/clipped_relu.h b/src/eval/nnue/layers/clipped_relu.h
index 7c5c1f75..fe4bedaa 100644
--- a/src/eval/nnue/layers/clipped_relu.h
+++ b/src/eval/nnue/layers/clipped_relu.h
@@ -110,9 +110,12 @@ class ClippedReLU {
           _mm256_packs_epi16(words0, words1), kZero), kOffsets));
     }
     constexpr IndexType kStart = kNumChunks * kSimdWidth;
-#elif defined(USE_SSE41)
+#elif defined(USE_SSSE3)
     constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
     const __m128i kZero = _mm_setzero_si128();
+#ifndef USE_SSE41
+    const __m128i k0x80s = _mm_set1_epi8(-128);
+#endif
     const auto in = reinterpret_cast<const __m128i*>(input);
     const auto out = reinterpret_cast<__m128i*>(output);
     for (IndexType i = 0; i < kNumChunks; ++i) {
@@ -122,8 +125,14 @@ class ClippedReLU {
       const __m128i words1 = _mm_srai_epi16(_mm_packs_epi32(
           _mm_load_si128(&in[i * 4 + 2]),
           _mm_load_si128(&in[i * 4 + 3])), kWeightScaleBits);
-      _mm_store_si128(&out[i], _mm_max_epi8(
-          _mm_packs_epi16(words0, words1), kZero));
+      const __m128i packedbytes = _mm_packs_epi16(words0, words1);
+      _mm_store_si128(&out[i], 
+#ifdef USE_SSE41
+        _mm_max_epi8(packedbytes, kZero)
+#else
+        _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
+#endif
+      );
     }
     constexpr IndexType kStart = kNumChunks * kSimdWidth;
 #elif defined(IS_ARM)
diff --git a/src/eval/nnue/nnue_common.h b/src/eval/nnue/nnue_common.h
index bb52bdfe..cffb0098 100644
--- a/src/eval/nnue/nnue_common.h
+++ b/src/eval/nnue/nnue_common.h
@@ -9,6 +9,8 @@
 #include <immintrin.h>
 #elif defined(USE_SSE41)
 #include <smmintrin.h>
+#elif defined(USE_SSSE3)
+#include <tmmintrin.h>
 #elif defined(USE_SSE2)
 #include <emmintrin.h>
 #endif
diff --git a/src/eval/nnue/nnue_feature_transformer.h b/src/eval/nnue/nnue_feature_transformer.h
index 27bbb562..bb1a50bc 100644
--- a/src/eval/nnue/nnue_feature_transformer.h
+++ b/src/eval/nnue/nnue_feature_transformer.h
@@ -87,9 +87,12 @@ class FeatureTransformer {
     constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
     constexpr int kControl = 0b11011000;
     const __m256i kZero = _mm256_setzero_si256();
-#elif defined(USE_SSE41)
+#elif defined(USE_SSSE3)
     constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
     const __m128i kZero = _mm_setzero_si128();
+#ifndef USE_SSE41
+    const __m128i k0x80s = _mm_set1_epi8(-128);
+#endif
 #elif defined(IS_ARM)
     constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
     const int8x8_t kZero = {0};
@@ -133,7 +136,7 @@ class FeatureTransformer {
         (&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
             _mm256_packs_epi16(sum0, sum1), kZero), kControl));
       }
-#elif defined(USE_SSE41)
+#elif defined(USE_SSSE3)
       auto out = reinterpret_cast<__m128i*>(&output[offset]);
       for (IndexType j = 0; j < kNumChunks; ++j) {
         __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
@@ -146,8 +149,15 @@ class FeatureTransformer {
           sum1 = _mm_add_epi16(sum1, reinterpret_cast<const __m128i*>(
               accumulation[perspectives[p]][i])[j * 2 + 1]);
         }
-        _mm_store_si128(&out[j], _mm_max_epi8(
-            _mm_packs_epi16(sum0, sum1), kZero));
+  	const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
+ 
+        _mm_store_si128(&out[j],
+#ifdef USE_SSE41
+          _mm_max_epi8(packedbytes, kZero)
+#else
+          _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
+#endif
+        );
       }
 #elif defined(IS_ARM)
       const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);

From 1de1eb2d0de45dca5c0833cf72f9cd8b1446f8d6 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Fri, 10 Jul 2020 16:13:21 +0900
Subject: [PATCH 100/583] Refactoring: Restructured the architecture list in
 Makefile.

---
 src/Makefile | 49 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 40 insertions(+), 9 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index aa5cc1ba..ccd7c8c6 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -82,6 +82,10 @@ endif
 # prefetch = yes/no   --- -DUSE_PREFETCH   --- Use prefetch asm-instruction
 # popcnt = yes/no     --- -DUSE_POPCNT     --- Use popcnt asm-instruction
 # sse = yes/no        --- -msse            --- Use Intel Streaming SIMD Extensions
+# ssse3 = yes/no      --- -mssse3          --- Use Intel Supplemental Streaming SIMD Extensions 3
+# sse41 = yes/no      --- -msse4.1         --- Use Intel Streaming SIMD Extensions 4.1
+# sse42 = yes/no      --- -msse4.2         --- Use Intel Streaming SIMD Extensions 4.2
+# avx2 = yes/no       --- -mavx2           --- Use Intel Advanced Vector Extensions 2
 # pext = yes/no       --- -DUSE_PEXT       --- Use pext x86_64 asm-instruction
 #
 # Note that Makefile is space sensitive, so when adding new architectures
@@ -96,6 +100,9 @@ bits = 64
 prefetch = no
 popcnt = no
 sse = no
+ssse3 = no
+sse41 = no
+sse42 = no
 avx2 = no
 pext = no
 
@@ -127,11 +134,29 @@ ifeq ($(ARCH),x86-64)
 	sse = yes
 endif
 
-ifeq ($(ARCH),x86-64-modern)
+ifeq ($(ARCH),x86-64-ssse3)
+	arch = x86_64
+	prefetch = yes
+	sse = yes
+	ssse3 = yes
+endif
+
+ifeq ($(ARCH),x86-64-sse41)
+	arch = x86_64
+	prefetch = yes
+	sse = yes
+	ssse3 = yes
+	sse41 = yes
+endif
+
+ifeq ($(ARCH),x86-64-sse42)
 	arch = x86_64
 	prefetch = yes
 	popcnt = yes
 	sse = yes
+	ssse3 = yes
+	sse41 = yes
+	sse42 = yes
 endif
 
 ifeq ($(ARCH),x86-64-avx2)
@@ -140,6 +165,9 @@ ifeq ($(ARCH),x86-64-avx2)
 	prefetch = yes
 	popcnt = yes
 	sse = yes
+	ssse3 = yes
+	sse41 = yes
+	sse42 = yes
 	avx2 = yes
 endif
 
@@ -148,6 +176,9 @@ ifeq ($(ARCH),x86-64-bmi2)
 	prefetch = yes
 	popcnt = yes
 	sse = yes
+	ssse3 = yes
+	sse41 = yes
+	sse42 = yes
 	avx2 = yes
 	pext = yes
 endif
@@ -370,13 +401,7 @@ endif
 
 ### 3.6 popcnt
 ifeq ($(popcnt),yes)
-	ifeq ($(arch),$(filter $(arch),ppc64 armv8-a))
-		CXXFLAGS += -DUSE_POPCNT
-	else ifeq ($(comp),icc)
-		CXXFLAGS += -msse3 -DUSE_POPCNT
-	else
-		CXXFLAGS += -msse3 -mpopcnt -DUSE_POPCNT
-	endif
+	CXXFLAGS += -DUSE_POPCNT
 endif
 
 ifeq ($(avx2),yes)
@@ -386,8 +411,14 @@ ifeq ($(avx2),yes)
 	endif
 endif
 
+ifeq ($(sse42),yes)
+	CXXFLAGS += -DUSE_SSE42
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw msys2))
+		CXXFLAGS += -msse4.2
+	endif
+endif
+
 ifeq ($(sse41),yes)
-	ssse3 = yes
 	CXXFLAGS += -DUSE_SSE41
 	ifeq ($(comp),$(filter $(comp),gcc clang mingw msys2))
 		CXXFLAGS += -msse4.1

From df05ecb1d540b0a56936ab4960404da5f6e0b5d6 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Fri, 10 Jul 2020 16:14:19 +0900
Subject: [PATCH 101/583] Added halfkp_384x2-32-32.

---
 .../nnue/architectures/halfkp_384x2-32-32.h   | 39 +++++++++++++++++++
 src/eval/nnue/nnue_architecture.h             |  1 +
 2 files changed, 40 insertions(+)
 create mode 100644 src/eval/nnue/architectures/halfkp_384x2-32-32.h

diff --git a/src/eval/nnue/architectures/halfkp_384x2-32-32.h b/src/eval/nnue/architectures/halfkp_384x2-32-32.h
new file mode 100644
index 00000000..3d28139a
--- /dev/null
+++ b/src/eval/nnue/architectures/halfkp_384x2-32-32.h
@@ -0,0 +1,39 @@
+﻿// Definition of input features and network structure used in NNUE evaluation function
+
+#ifndef HALFKP_384X2_32_32_H
+#define HALFKP_384X2_32_32_H
+
+#include "../features/feature_set.h"
+#include "../features/half_kp.h"
+
+#include "../layers/input_slice.h"
+#include "../layers/affine_transform.h"
+#include "../layers/clipped_relu.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+// Input features used in evaluation function
+using RawFeatures = Features::FeatureSet<
+    Features::HalfKP<Features::Side::kFriend>>;
+
+// Number of input feature dimensions after conversion
+constexpr IndexType kTransformedFeatureDimensions = 384;
+
+namespace Layers {
+
+// define network structure
+using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+
+}  // namespace Layers
+
+using Network = Layers::OutputLayer;
+
+}  // namespace NNUE
+
+}  // namespace Eval
+#endif // HALFKP_384X2_32_32_H
diff --git a/src/eval/nnue/nnue_architecture.h b/src/eval/nnue/nnue_architecture.h
index 977c67fc..cb53e4f9 100644
--- a/src/eval/nnue/nnue_architecture.h
+++ b/src/eval/nnue/nnue_architecture.h
@@ -11,6 +11,7 @@
 //#include "architectures/k-p-cr-ep_256x2-32-32.h"
 #include "architectures/halfkp_256x2-32-32.h"
 //#include "architectures/halfkp-cr-ep_256x2-32-32.h"
+//#include "architectures/halfkp_384x2-32-32.h"
 
 namespace Eval {
 

From bc6a8d09e9964aafd8361189520047ab5dac435c Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Fri, 10 Jul 2020 16:17:35 +0900
Subject: [PATCH 102/583] Unified the nnue-learn and nnue-learn-use-blas
 targets into nnue-learn.

---
 src/Makefile | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index ccd7c8c6..58c091ae 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -661,9 +661,6 @@ nnue-gen-sfen-from-original-eval: config-sanity
 	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DUSE_EVAL_HASH -DENABLE_TEST_CMD -fopenmp' LDFLAGS='$(LDFLAGS) -fopenmp' build
 
 nnue-learn: config-sanity
-	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DEVAL_NNUE -DUSE_EVAL_HASH -DENABLE_TEST_CMD -fopenmp' LDFLAGS='$(LDFLAGS) -fopenmp' build
-
-nnue-learn-use-blas: config-sanity
 	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DEVAL_NNUE -DUSE_EVAL_HASH -DENABLE_TEST_CMD -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp' LDFLAGS='$(LDFLAGS) -lopenblas -fopenmp' build
 
 .depend:

From d7c358cf19da5e00112804c64f84c0bbc894679e Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Fri, 10 Jul 2020 16:55:32 +0900
Subject: [PATCH 103/583] Fixed descriptions and sanity checks in Makefile.

---
 src/Makefile | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 58c091ae..5d32def0 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -493,9 +493,11 @@ help:
 	@echo ""
 	@echo "Supported archs:"
 	@echo ""
-	@echo "x86-64-bmi2             > x86 64-bit with pext support (also enables SSE4)"
-	@echo "x86-64-avx2             > x86 64-bit with avx2 support (also enables SSE4)"
-	@echo "x86-64-modern           > x86 64-bit with popcnt support (also enables SSE3)"
+	@echo "x86-64-bmi2             > x86 64-bit with bmi2 support"
+	@echo "x86-64-avx2             > x86 64-bit with avx2 support"
+	@echo "x86-64-sse42            > x86 64-bit with sse42 support"
+	@echo "x86-64-sse41            > x86 64-bit with sse41 support"
+	@echo "x86-64-ssse3            > x86 64-bit with ssse3 support"
 	@echo "x86-64                  > x86 64-bit generic"
 	@echo "x86-32                  > x86 32-bit (also enables SSE)"
 	@echo "x86-32-old              > x86 32-bit fall back for old hardware"
@@ -592,6 +594,9 @@ config-sanity:
 	@echo "prefetch: '$(prefetch)'"
 	@echo "popcnt: '$(popcnt)'"
 	@echo "sse: '$(sse)'"
+	@echo "ssse3: '$(ssse3)'"
+	@echo "sse41: '$(sse41)'"
+	@echo "sse42: '$(sse42)'"
 	@echo "avx2: '$(avx2)'"
 	@echo "pext: '$(pext)'"
 	@echo ""
@@ -612,6 +617,10 @@ config-sanity:
 	@test "$(prefetch)" = "yes" || test "$(prefetch)" = "no"
 	@test "$(popcnt)" = "yes" || test "$(popcnt)" = "no"
 	@test "$(sse)" = "yes" || test "$(sse)" = "no"
+	@test "$(ssse3)" = "yes" || test "$(ssse3)" = "no"
+	@test "$(sse41)" = "yes" || test "$(sse41)" = "no"
+	@test "$(sse42)" = "yes" || test "$(sse42)" = "no"
+	@test "$(avx2)" = "yes" || test "$(avx2)" = "no"
 	@test "$(pext)" = "yes" || test "$(pext)" = "no"
 	@test "$(comp)" = "gcc" || test "$(comp)" = "icc" || test "$(comp)" = "mingw" || test "$(comp)" = "clang"
 

From b521e405d32d884392be8cc9dd5de45559b2dce6 Mon Sep 17 00:00:00 2001
From: zz4032 <alg4032@arcor.de>
Date: Fri, 10 Jul 2020 16:40:24 +0200
Subject: [PATCH 104/583] Default network path in Linux.

---
 src/ucioption.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index d98d82b1..8658adb4 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -81,7 +81,11 @@ void init(OptionsMap& o) {
   o["Syzygy50MoveRule"]      << Option(true);
   o["SyzygyProbeLimit"]      << Option(7, 0, 7);
   // Evaluation function file name. When this is changed, it is necessary to reread the evaluation function at the next ucinewgame timing.
+#if defined(__linux__)
+  o["EvalFile"]              << Option("eval/nn.bin", on_eval_file);
+#else
   o["EvalFile"]              << Option("eval\\nn.bin", on_eval_file);
+#endif
   // When the evaluation function is loaded at the ucinewgame timing, it is necessary to convert the new evaluation function.
   // I want to hit the test eval convert command, but there is no new evaluation function
   // It ends abnormally before executing this command.

From d6e8089f5084941a7e73d821cbbb5db799b2c21e Mon Sep 17 00:00:00 2001
From: zz4032 <alg4032@arcor.de>
Date: Sat, 11 Jul 2020 10:33:36 +0200
Subject: [PATCH 105/583] Saving new network in correct path.

---
 src/eval/nnue/evaluate_nnue.cpp         | 5 ++++-
 src/eval/nnue/evaluate_nnue.h           | 3 +++
 src/eval/nnue/evaluate_nnue_learner.cpp | 2 +-
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/eval/nnue/evaluate_nnue.cpp b/src/eval/nnue/evaluate_nnue.cpp
index d0c04958..55e627d0 100644
--- a/src/eval/nnue/evaluate_nnue.cpp
+++ b/src/eval/nnue/evaluate_nnue.cpp
@@ -23,7 +23,10 @@ AlignedPtr<FeatureTransformer> feature_transformer;
 AlignedPtr<Network> network;
 
 // Evaluation function file name
-std::string fileName = "eval\\nn.bin";
+std::string fileName = "nn.bin";
+
+// Saved evaluation function file name
+std::string savedfileName = "nn.bin";
 
 // Get a string that represents the structure of the evaluation function
 std::string GetArchitectureString() {
diff --git a/src/eval/nnue/evaluate_nnue.h b/src/eval/nnue/evaluate_nnue.h
index 97ce3df8..d474a8ae 100644
--- a/src/eval/nnue/evaluate_nnue.h
+++ b/src/eval/nnue/evaluate_nnue.h
@@ -38,6 +38,9 @@ extern AlignedPtr<Network> network;
 // Evaluation function file name
 extern std::string fileName;
 
+// Saved evaluation function file name
+extern std::string savedfileName;
+
 // Get a string that represents the structure of the evaluation function
 std::string GetArchitectureString();
 
diff --git a/src/eval/nnue/evaluate_nnue_learner.cpp b/src/eval/nnue/evaluate_nnue_learner.cpp
index 28243871..37bc79bc 100644
--- a/src/eval/nnue/evaluate_nnue_learner.cpp
+++ b/src/eval/nnue/evaluate_nnue_learner.cpp
@@ -213,7 +213,7 @@ void save_eval(std::string dir_name) {
     NNUE::SendMessages({{"clear_unobserved_feature_weights"}});
   }
 
-  const std::string file_name = NNUE::fileName;
+  const std::string file_name = Path::Combine(eval_dir, NNUE::savedfileName);
   std::ofstream stream(file_name, std::ios::binary);
   const bool result = NNUE::WriteParameters(stream);
   assert(result);

From fcb391919ffa5ac9edbb08a3ffc08e7f1831bdb4 Mon Sep 17 00:00:00 2001
From: No name <no@email>
Date: Fri, 10 Jul 2020 20:39:49 +0300
Subject: [PATCH 106/583] Disable EVAL_HASH for 'nnue' target

Gives a 7% speed gain for me, without any parameter set loaded
(all-zero).
---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 5d32def0..1c3645b6 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -664,7 +664,7 @@ icc-profile-use:
 	all
 
 nnue: config-sanity
-	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_NNUE -DUSE_EVAL_HASH -DENABLE_TEST_CMD -fopenmp' LDFLAGS='$(LDFLAGS) -fopenmp' build
+	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_NNUE -DENABLE_TEST_CMD -fopenmp' LDFLAGS='$(LDFLAGS) -fopenmp' build
 
 nnue-gen-sfen-from-original-eval: config-sanity
 	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DUSE_EVAL_HASH -DENABLE_TEST_CMD -fopenmp' LDFLAGS='$(LDFLAGS) -fopenmp' build

From df40de9486e5efda63ef593da991fdf36e54634e Mon Sep 17 00:00:00 2001
From: Anson Hu <40702929+farmersrice@users.noreply.github.com>
Date: Sat, 11 Jul 2020 23:37:22 -0700
Subject: [PATCH 107/583] game result bugfix

---
 script/pgn_to_plain.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/script/pgn_to_plain.py b/script/pgn_to_plain.py
index 61aa9917..5f9300cb 100644
--- a/script/pgn_to_plain.py
+++ b/script/pgn_to_plain.py
@@ -18,7 +18,7 @@ def parse_result(result_str:str, board:chess.Board) -> int:
         if board.turn == chess.WHITE:
             return 1
         else:
-            return 0
+            return -1
     else:
         print("illeagal result", result_str)
         raise ValueError

From 686a5a0df91f0effb99397a6da2965f257105198 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Mon, 13 Jul 2020 22:25:23 +0900
Subject: [PATCH 108/583] Fixed a bug that gensfen command does not accept the
 use_draw_in_training_data_generation option.

---
 src/learn/learner.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index bf4a3034..e343fde5 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -938,6 +938,8 @@ void gen_sfen(Position&, istringstream& is)
 			is >> save_every;
 		else if (token == "random_file_name")
 			is >> random_file_name;
+		else if (token == "use_draw_in_training_data_generation")
+			is >> use_draw_in_training_data_generation;
 		else
 			cout << "Error! : Illegal token " << token << endl;
 	}
@@ -2931,7 +2933,6 @@ void learn(Position&, istringstream& is)
 		else if (option == "eta3")       is >> eta3;
 		else if (option == "eta1_epoch") is >> eta1_epoch;
 		else if (option == "eta2_epoch") is >> eta2_epoch;
-		else if (option == "use_draw_in_training_data_generation") is >> use_draw_in_training_data_generation;
 		else if (option == "use_draw_in_training") is >> use_draw_in_training;
 		else if (option == "use_draw_in_validation") is >> use_draw_in_validation;
 		else if (option == "use_hash_in_training") is >> use_hash_in_training;

From e29499ee4b99174570fc49ac918f1dbd5bc22660 Mon Sep 17 00:00:00 2001
From: "J. Oster" <osterj165@googlemail.com>
Date: Wed, 15 Jul 2020 12:15:45 +0200
Subject: [PATCH 109/583] Use the path and filename for restoring parameter
 files.

---
 src/eval/nnue/evaluate_nnue_learner.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/eval/nnue/evaluate_nnue_learner.cpp b/src/eval/nnue/evaluate_nnue_learner.cpp
index 37bc79bc..3297037d 100644
--- a/src/eval/nnue/evaluate_nnue_learner.cpp
+++ b/src/eval/nnue/evaluate_nnue_learner.cpp
@@ -110,7 +110,7 @@ void SetOptions(const std::string& options) {
 
 // Reread the evaluation function parameters for learning from the file
 void RestoreParameters(const std::string& dir_name) {
-  const std::string file_name = NNUE::fileName;
+  const std::string file_name = Path::Combine(dir_name, NNUE::savedfileName);
   std::ifstream stream(file_name, std::ios::binary);
   bool result = ReadParameters(stream);
   assert(result);

From 961047ed6ec3f251f9c05c494efc0cdbc3dd30c6 Mon Sep 17 00:00:00 2001
From: No name <no@email>
Date: Wed, 15 Jul 2020 22:52:27 +0300
Subject: [PATCH 110/583] Experimental support for PGO builds of NNUE

Only 'nnue' target and only gcc/mingw.
(does not clean profile data generated by other compilers)
To use:
 make profile-nnue ARCH=arch
(see 'make help' for list of supported archs)
---
 src/Makefile | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 1c3645b6..585d93a4 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -211,7 +211,7 @@ endif
 ### ==========================================================================
 
 ### 3.1 Selecting compiler (default = gcc)
-CXXFLAGS += -Wall -Wcast-qual -fno-exceptions -std=c++17 $(EXTRACXXFLAGS)
+CXXFLAGS += -Wall -Wcast-qual -fno-exceptions -std=c++17 $(EXTRACXXFLAGS) $(NNUECXXFLAGS)
 DEPENDFLAGS += -std=c++17
 LDFLAGS += $(EXTRALDFLAGS)
 
@@ -569,7 +569,7 @@ objclean:
 # clean auxiliary profiling files
 profileclean:
 	@rm -rf profdir
-	@rm -f bench.txt *.gcda *.gcno
+	@rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./learn/*.gcda ./extra/*.gcda ./eval/*.gcda ./eval/nnue/*.gcda ./eval/nnue/features/*.gcda
 	@rm -f stockfish.profdata *.profraw
 
 default:
@@ -666,6 +666,10 @@ icc-profile-use:
 nnue: config-sanity
 	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_NNUE -DENABLE_TEST_CMD -fopenmp' LDFLAGS='$(LDFLAGS) -fopenmp' build
 
+profile-nnue: export NNUECXXFLAGS = -DEVAL_NNUE -DENABLE_TEST_CMD
+profile-nnue: config-sanity
+	$(MAKE) profile-build
+
 nnue-gen-sfen-from-original-eval: config-sanity
 	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DUSE_EVAL_HASH -DENABLE_TEST_CMD -fopenmp' LDFLAGS='$(LDFLAGS) -fopenmp' build
 

From 2fd1c48e6088856fdae9bea8218d419212447a33 Mon Sep 17 00:00:00 2001
From: xXH4CKST3RXx <52459831+xXH4CKST3RXx@users.noreply.github.com>
Date: Wed, 15 Jul 2020 23:15:34 -0400
Subject: [PATCH 111/583] Rename Readme.md to stockfish.md

---
 Readme.md => stockfish.md | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename Readme.md => stockfish.md (100%)

diff --git a/Readme.md b/stockfish.md
similarity index 100%
rename from Readme.md
rename to stockfish.md

From 6118151c6613bfb2fb01329987ce26dd565803ce Mon Sep 17 00:00:00 2001
From: xXH4CKST3RXx <52459831+xXH4CKST3RXx@users.noreply.github.com>
Date: Thu, 16 Jul 2020 00:00:29 -0400
Subject: [PATCH 112/583] Create README.md

Added and cleaned up Gekkehenker's training guide.
---
 README.md | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 README.md

diff --git a/README.md b/README.md
new file mode 100644
index 00000000..9af97bee
--- /dev/null
+++ b/README.md
@@ -0,0 +1,41 @@
+# Stockfish NNUE
+
+## Overview
+Stockfish NNUE is a port of a shogi NN called NNUE (efficiently updateable neural network backwards) to Stockfish 11.
+
+## Training Guide
+### Generating Training Data
+Use the "no-nnue.nnue-gen-sfen-from-original-eval" binary. The given example is generation in its simplest form. There are more commands. 
+```
+uci
+setoption name Threads value x
+setoption name Hash value y
+setoption name SyzygyPath value path
+isready
+gensfen depth a loop b  use_draw_in_training_data_generation 1 eval_limit 32000
+```
+Specify how many threads and how much memory you would like to use with the x and y values. The option SyzygyPath is not necessary, but if you would like to use it, you must first have Syzygy endgame tablebases on your computer, which you can find [here](http://oics.olympuschess.com/tracker/index.php). You will need to have a torrent client to download these tablebases, as that is probably the fastest way to obtain them. The path is the path to the folder containing those tablebases. It does not have to be surrounded in quotes.
+
+This will save a file named "generated_kifu.bin" in the same folder as the binary. Once generation is done, rename the file to something like "1billiondepth12.bin" to remember the depth and quantity of the positions and move it to a folder named "trainingdata" in the same directory as the binaries.
+#### Generation Parameters
+- Depth is the searched depth per move, or how far the engine looks forward. This value is an integer.
+- Loop is the amount of positions generated. This value is also an integer
+### Generating validation data
+The process is the same as the generation of training data, except for the fact that you need to set loop to 1 million, because you don't need a lot of validation data. The depth should be the same as before or a little higher than the depth of the training data. After generation rename the validation data file to val.bin and drop it in a folder named "validationdata" in the same directory to make it easier. 
+### Training a completely new network
+Use the "avx2.halfkp_256x2-32-32.nnue-learn.2020-07-11" binary. Create an empty folder named "evalsave" in the same directory as the binaries.
+```
+uci
+setoption name SkipLoadingEval value true
+setoption name Threads value x
+isready
+learn targetdir trainingdata loop 100 batchsize 1000000 use_draw_in_training 1 use_draw_in_validation 1 eta 1 lambda 1 eval_limit 32000 nn_batch_size 1000 newbob_decay 0.5 eval_save_interval 250000000 loss_output_interval 1000000 mirror_percentage 50 validation_set_file_name validationdata\val.bin
+```
+Nets get saved in the "evalsave" folder. 
+
+#### Training Parameters
+- eta is the learning rate
+- lambda is the amount of weight it puts to eval of learning data vs win/draw/loss results. 1 puts all weight on eval, lambda 0 puts all weight on WDL results.
+
+### Using the Trained Net
+If you want to use your generated net, copy the net located in the "final" folder under the "evalsave" directory and move it into the "eval" folder. You can then use the halfkp_256x2 binaries with a standard chess GUI, such as Cutechess.

From df4da8dc41381a85f7f02dbafddcced5e41c5cce Mon Sep 17 00:00:00 2001
From: xXH4CKST3RXx <52459831+xXH4CKST3RXx@users.noreply.github.com>
Date: Thu, 16 Jul 2020 00:01:02 -0400
Subject: [PATCH 113/583] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 9af97bee..5d30b021 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ setoption name Threads value x
 setoption name Hash value y
 setoption name SyzygyPath value path
 isready
-gensfen depth a loop b  use_draw_in_training_data_generation 1 eval_limit 32000
+gensfen depth a loop b use_draw_in_training_data_generation 1 eval_limit 32000
 ```
 Specify how many threads and how much memory you would like to use with the x and y values. The option SyzygyPath is not necessary, but if you would like to use it, you must first have Syzygy endgame tablebases on your computer, which you can find [here](http://oics.olympuschess.com/tracker/index.php). You will need to have a torrent client to download these tablebases, as that is probably the fastest way to obtain them. The path is the path to the folder containing those tablebases. It does not have to be surrounded in quotes.
 

From ec5ef2b6dfad8b7d33aa504afd8c28bdf2b63396 Mon Sep 17 00:00:00 2001
From: xXH4CKST3RXx <52459831+xXH4CKST3RXx@users.noreply.github.com>
Date: Thu, 16 Jul 2020 00:01:59 -0400
Subject: [PATCH 114/583] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 5d30b021..555b76b1 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # Stockfish NNUE
 
 ## Overview
-Stockfish NNUE is a port of a shogi NN called NNUE (efficiently updateable neural network backwards) to Stockfish 11.
+Stockfish NNUE is a port of a shogi neural network named NNUE (efficiently updateable neural network backwards) to Stockfish 11.
 
 ## Training Guide
 ### Generating Training Data

From be754a237972c2085a39a6947a585a283f971603 Mon Sep 17 00:00:00 2001
From: xXH4CKST3RXx <52459831+xXH4CKST3RXx@users.noreply.github.com>
Date: Thu, 16 Jul 2020 00:10:30 -0400
Subject: [PATCH 115/583] Update README.md

---
 README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 555b76b1..44a8d1e0 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # Stockfish NNUE
 
 ## Overview
-Stockfish NNUE is a port of a shogi neural network named NNUE (efficiently updateable neural network backwards) to Stockfish 11.
+Stockfish NNUE is a port of a shogi neural network named NNUE (efficiently updateable neural network backwards) to Stockfish 11. To learn more about the Stockfish chess engine, look [here](stockfish.md) for an overview and [here](https://github.com/official-stockfish/Stockfish) for the official repository.
 
 ## Training Guide
 ### Generating Training Data
@@ -20,9 +20,9 @@ This will save a file named "generated_kifu.bin" in the same folder as the binar
 #### Generation Parameters
 - Depth is the searched depth per move, or how far the engine looks forward. This value is an integer.
 - Loop is the amount of positions generated. This value is also an integer
-### Generating validation data
-The process is the same as the generation of training data, except for the fact that you need to set loop to 1 million, because you don't need a lot of validation data. The depth should be the same as before or a little higher than the depth of the training data. After generation rename the validation data file to val.bin and drop it in a folder named "validationdata" in the same directory to make it easier. 
-### Training a completely new network
+### Generating Validation Data
+The process is the same as the generation of training data, except for the fact that you need to set loop to 1 million, because you don't need a lot of validation data. The depth should be the same as before or slightly higher than the depth of the training data. After generation rename the validation data file to val.bin and drop it in a folder named "validationdata" in the same directory to make it easier. 
+### Training a Completely New Network
 Use the "avx2.halfkp_256x2-32-32.nnue-learn.2020-07-11" binary. Create an empty folder named "evalsave" in the same directory as the binaries.
 ```
 uci
@@ -38,4 +38,4 @@ Nets get saved in the "evalsave" folder.
 - lambda is the amount of weight it puts to eval of learning data vs win/draw/loss results. 1 puts all weight on eval, lambda 0 puts all weight on WDL results.
 
 ### Using the Trained Net
-If you want to use your generated net, copy the net located in the "final" folder under the "evalsave" directory and move it into the "eval" folder. You can then use the halfkp_256x2 binaries with a standard chess GUI, such as Cutechess.
+If you want to use your generated net, copy the net located in the "final" folder under the "evalsave" directory and move it into a new folder named "eval" under the directory with the binaries. You can then use the halfkp_256x2 binaries pertaining to your CPU with a standard chess GUI, such as Cutechess. Refer to the [releases page](https://github.com/nodchip/Stockfish/releases) to find out which binary is best for your CPU.

From 2b821682aa9d815c00040dd8669bcaa017119e7c Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Fri, 17 Jul 2020 11:55:30 +0900
Subject: [PATCH 116/583] Update README.md

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 44a8d1e0..daa8fefb 100644
--- a/README.md
+++ b/README.md
@@ -39,3 +39,5 @@ Nets get saved in the "evalsave" folder.
 
 ### Using the Trained Net
 If you want to use your generated net, copy the net located in the "final" folder under the "evalsave" directory and move it into a new folder named "eval" under the directory with the binaries. You can then use the halfkp_256x2 binaries pertaining to your CPU with a standard chess GUI, such as Cutechess. Refer to the [releases page](https://github.com/nodchip/Stockfish/releases) to find out which binary is best for your CPU.
+
+If the engine does not load any net file, or shows "Error! *** not found or wrong format", please try to sepcify the net with the full file path by the "EvalFile" option.

From 4d4c80d7fdc4bb44036644f026e6deb25a580aa4 Mon Sep 17 00:00:00 2001
From: xXH4CKST3RXx <52459831+xXH4CKST3RXx@users.noreply.github.com>
Date: Thu, 16 Jul 2020 23:34:38 -0400
Subject: [PATCH 117/583] Update README.md

Added logo, reinforcement learning instructions, and resources list.
---
 README.md | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index daa8fefb..73eec1fb 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,8 @@
-# Stockfish NNUE
+<p align="center">
+  <img src="https://cdn.discordapp.com/attachments/724700045525647420/729135226365804594/SFNNUE2.png">
+</p>
+
+<h1 align="center">Stockfish NNUE</h1>
 
 ## Overview
 Stockfish NNUE is a port of a shogi neural network named NNUE (efficiently updateable neural network backwards) to Stockfish 11. To learn more about the Stockfish chess engine, look [here](stockfish.md) for an overview and [here](https://github.com/official-stockfish/Stockfish) for the official repository.
@@ -37,7 +41,23 @@ Nets get saved in the "evalsave" folder.
 - eta is the learning rate
 - lambda is the amount of weight it puts to eval of learning data vs win/draw/loss results. 1 puts all weight on eval, lambda 0 puts all weight on WDL results.
 
-### Using the Trained Net
+### Reinforcement Learning
+If you would like to do some reinforcement learning on your original network, you must first generate training data using the learn binaries. Make sure that your previously trained network is in the eval folder. Use the commands specified above. Make sure `SkipLoadingEval` is set to false so that the data generated is using the neural net's eval by typing the command `uci setoption name SkipLoadingEval value false` before typing the `isready` command. You should aim to generate less positions than the first run, around 1/10 of the number of positions generated in the first run. The depth should be higher as well. You should also do the same for validation data, with the depth being higher than the last run.
+
+After you have generated the training data, you must move it into your training data folder and delete the older data so that the binary does not accidentally train on the same data again. Do the same for the validation data and name it to val-1.bin to make it less confusing. Make sure the evalsave folder is empty. Then, using the same binary, type in the training commands shown above. Do __NOT__ set `SkipLoadingEval` to true, it must be false or you will get a completely new network, instead of a network trained with reinforcement learning. You should also set eval_save_interval to a number that is lower than the amount of positions in your training data, perhaps also 1/10 of the original value. The validation file should be set to the new validation data, not the old data.
+
+After training is finished, your new net should be located in the "final" folder under the "evalsave" directory. You should test this new network against the older network to see if there are any improvements.
+
+## Using Your Trained Net
 If you want to use your generated net, copy the net located in the "final" folder under the "evalsave" directory and move it into a new folder named "eval" under the directory with the binaries. You can then use the halfkp_256x2 binaries pertaining to your CPU with a standard chess GUI, such as Cutechess. Refer to the [releases page](https://github.com/nodchip/Stockfish/releases) to find out which binary is best for your CPU.
 
-If the engine does not load any net file, or shows "Error! *** not found or wrong format", please try to sepcify the net with the full file path by the "EvalFile" option.
+If the engine does not load any net file, or shows "Error! *** not found or wrong format", please try to sepcify the net with the full file path with the "EvalFile" option by typing the command `setoption name EvalFile value path` where path is the full file path.
+
+## Resources
+- [Stockfish NNUE Wiki](https://www.qhapaq.org/shogi/shogiwiki/stockfish-nnue/)
+- [Training instructions](https://twitter.com/mktakizawa/status/1273042640280252416) from the creator of the Elmo shogi engine
+- [Original Talkchess thread](http://talkchess.com/forum3/viewtopic.php?t=74059) discussing Stockfish NNUE
+- [Guide to Stockfish NNUE](http://yaneuraou.yaneu.com/2020/06/19/stockfish-nnue-the-complete-guide/) 
+- [Unofficial Stockfish Discord](https://discord.gg/nv8gDtt)
+
+A more updated list can be found in the #sf-nnue-resources channel in the Discord.

From 7a13d4ed60b09a9ce1b5aee46aa2a596bc4ca0fd Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Fri, 17 Jul 2020 15:40:01 +0900
Subject: [PATCH 118/583] Changed the default eval file path so that more GUIs
 can use Stockfish NNUE.

---
 src/ucioption.cpp | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 8658adb4..ac5a6a16 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -81,11 +81,8 @@ void init(OptionsMap& o) {
   o["Syzygy50MoveRule"]      << Option(true);
   o["SyzygyProbeLimit"]      << Option(7, 0, 7);
   // Evaluation function file name. When this is changed, it is necessary to reread the evaluation function at the next ucinewgame timing.
-#if defined(__linux__)
-  o["EvalFile"]              << Option("eval/nn.bin", on_eval_file);
-#else
-  o["EvalFile"]              << Option("eval\\nn.bin", on_eval_file);
-#endif
+  // Without the preceding "./", some GUIs can not load he net file.
+  o["EvalFile"]              << Option("./eval/nn.bin", on_eval_file);
   // When the evaluation function is loaded at the ucinewgame timing, it is necessary to convert the new evaluation function.
   // I want to hit the test eval convert command, but there is no new evaluation function
   // It ends abnormally before executing this command.

From 961a4dad5ce83a7795a5e60f4f34dd56212621db Mon Sep 17 00:00:00 2001
From: mstembera <MissingEmail@email>
Date: Sat, 18 Jul 2020 19:21:46 -0700
Subject: [PATCH 119/583] Add AVX512 support. bench: 3909820

---
 src/Makefile                            | 28 ++++++++++++++-
 src/eval/nnue/layers/affine_transform.h | 47 ++++++++++++++++++++++---
 2 files changed, 70 insertions(+), 5 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 585d93a4..254f9bac 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -87,6 +87,7 @@ endif
 # sse42 = yes/no      --- -msse4.2         --- Use Intel Streaming SIMD Extensions 4.2
 # avx2 = yes/no       --- -mavx2           --- Use Intel Advanced Vector Extensions 2
 # pext = yes/no       --- -DUSE_PEXT       --- Use pext x86_64 asm-instruction
+# avx512 = yes/no     --- -mavx512vbmi     --- Use Intel Advanced Vector Extensions 512
 #
 # Note that Makefile is space sensitive, so when adding new architectures
 # or modifying existing flags, you have to make sure there are no extra spaces
@@ -105,6 +106,7 @@ sse41 = no
 sse42 = no
 avx2 = no
 pext = no
+avx512 = no
 
 ### 2.2 Architecture specific
 ifeq ($(ARCH),general-32)
@@ -183,6 +185,20 @@ ifeq ($(ARCH),x86-64-bmi2)
 	pext = yes
 endif
 
+ifeq ($(ARCH),x86-64-avx512)
+	arch = x86_64
+	bits = 64
+	prefetch = yes
+	popcnt = yes
+	sse = yes
+	ssse3 = yes
+	sse41 = yes
+	sse42 = yes
+	avx2 = yes
+	pext = yes
+	avx512 = yes
+endif
+
 ifeq ($(ARCH),armv7)
 	arch = armv7
 	prefetch = yes
@@ -407,7 +423,14 @@ endif
 ifeq ($(avx2),yes)
 	CXXFLAGS += -DUSE_AVX2
 	ifeq ($(comp),$(filter $(comp),gcc clang mingw msys2))
-		CXXFLAGS += -mavx2
+	CXXFLAGS += -mavx2
+	endif
+endif
+
+ifeq ($(avx512),yes)
+	CXXFLAGS += -DUSE_AVX512
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw msys2))
+	CXXFLAGS += -mavx512vbmi
 	endif
 endif
 
@@ -493,6 +516,7 @@ help:
 	@echo ""
 	@echo "Supported archs:"
 	@echo ""
+	@echo "x86-64-avx512           > x86 64-bit with avx512 support"
 	@echo "x86-64-bmi2             > x86 64-bit with bmi2 support"
 	@echo "x86-64-avx2             > x86 64-bit with avx2 support"
 	@echo "x86-64-sse42            > x86 64-bit with sse42 support"
@@ -599,6 +623,7 @@ config-sanity:
 	@echo "sse42: '$(sse42)'"
 	@echo "avx2: '$(avx2)'"
 	@echo "pext: '$(pext)'"
+	@echo "avx512: '$(avx512)'"
 	@echo ""
 	@echo "Flags:"
 	@echo "CXX: $(CXX)"
@@ -622,6 +647,7 @@ config-sanity:
 	@test "$(sse42)" = "yes" || test "$(sse42)" = "no"
 	@test "$(avx2)" = "yes" || test "$(avx2)" = "no"
 	@test "$(pext)" = "yes" || test "$(pext)" = "no"
+	@test "$(avx512)" = "yes" || test "$(avx512)" = "no"
 	@test "$(comp)" = "gcc" || test "$(comp)" = "icc" || test "$(comp)" = "mingw" || test "$(comp)" = "clang"
 
 $(EXE): $(OBJS)
diff --git a/src/eval/nnue/layers/affine_transform.h b/src/eval/nnue/layers/affine_transform.h
index cb56b07d..2db7f731 100644
--- a/src/eval/nnue/layers/affine_transform.h
+++ b/src/eval/nnue/layers/affine_transform.h
@@ -82,7 +82,11 @@ class AffineTransform {
     const auto input = previous_layer_.Propagate(
         transformed_features, buffer + kSelfBufferSize);
     const auto output = reinterpret_cast<OutputType*>(buffer);
-#if defined(USE_AVX2)
+#if defined(USE_AVX512)
+    constexpr IndexType kNumChunks = kPaddedInputDimensions / (kSimdWidth * 2);
+    const __m512i kOnes = _mm512_set1_epi16(1);
+    const auto input_vector = reinterpret_cast<const __m512i*>(input);
+#elif defined(USE_AVX2)
     constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
     const __m256i kOnes = _mm256_set1_epi16(1);
     const auto input_vector = reinterpret_cast<const __m256i*>(input);
@@ -96,8 +100,43 @@ class AffineTransform {
 #endif
     for (IndexType i = 0; i < kOutputDimensions; ++i) {
       const IndexType offset = i * kPaddedInputDimensions;
-#if defined(USE_AVX2)
-      __m256i sum = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, biases_[i]);
+#if defined(USE_AVX512)
+      __m512i sum = _mm512_setzero_si512();
+      const auto row = reinterpret_cast<const __m512i*>(&weights_[offset]);
+      for (IndexType j = 0; j < kNumChunks; ++j) {
+#if defined(__MINGW32__) || defined(__MINGW64__)
+          __m512i product = _mm512_maddubs_epi16(_mm512_loadu_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
+#else
+          __m512i product = _mm512_maddubs_epi16(_mm512_load_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
+#endif
+          product = _mm512_madd_epi16(product, kOnes);
+          sum = _mm512_add_epi32(sum, product);
+      }
+      output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];
+      
+      // Note: Changing kMaxSimdWidth from 32 to 64 breaks loading existing networks.
+      // As a result kPaddedInputDimensions may not be an even multiple of 64(512bit)
+      // and we have to do one more 256bit chunk.
+      if (kPaddedInputDimensions != kNumChunks * kSimdWidth * 2)
+      {
+          const auto iv_256  = reinterpret_cast<const __m256i*>(input);
+          const auto row_256 = reinterpret_cast<const __m256i*>(&weights_[offset]);
+          int j = kNumChunks * 2;
+#if defined(__MINGW32__) || defined(__MINGW64__)  // See HACK comment below in AVX2.
+          __m256i sum256 = _mm256_maddubs_epi16(_mm256_loadu_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
+#else
+          __m256i sum256 = _mm256_maddubs_epi16(_mm256_load_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
+#endif
+          sum256 = _mm256_madd_epi16(sum256, _mm256_set1_epi16(1));
+
+          sum256 = _mm256_hadd_epi32(sum256, sum256);
+          sum256 = _mm256_hadd_epi32(sum256, sum256);
+          const __m128i lo = _mm256_extracti128_si256(sum256, 0);
+          const __m128i hi = _mm256_extracti128_si256(sum256, 1);
+          output[i] += _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi);
+      }
+#elif defined(USE_AVX2)
+      __m256i sum = _mm256_setzero_si256();
       const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
       for (IndexType j = 0; j < kNumChunks; ++j) {
         __m256i product = _mm256_maddubs_epi16(
@@ -117,7 +156,7 @@ class AffineTransform {
       sum = _mm256_hadd_epi32(sum, sum);
       const __m128i lo = _mm256_extracti128_si256(sum, 0);
       const __m128i hi = _mm256_extracti128_si256(sum, 1);
-      output[i] = _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi);
+      output[i] = _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi) + biases_[i];
 #elif defined(USE_SSSE3)
       __m128i sum = _mm_cvtsi32_si128(biases_[i]);
       const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);

From c24ad8d8b5cfa4a6b3b47b087d3fa32dfb3337c0 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sun, 19 Jul 2020 12:26:37 +0900
Subject: [PATCH 120/583] Supported sse3 build.

---
 src/Makefile | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/src/Makefile b/src/Makefile
index 254f9bac..245fda0a 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -101,6 +101,7 @@ bits = 64
 prefetch = no
 popcnt = no
 sse = no
+sse3 = no
 ssse3 = no
 sse41 = no
 sse42 = no
@@ -136,10 +137,19 @@ ifeq ($(ARCH),x86-64)
 	sse = yes
 endif
 
+ifeq ($(ARCH),x86-64-sse3)
+	arch = x86_64
+	prefetch = yes
+	sse = yes
+	sse3 = yes
+	ssse3 = yes
+endif
+
 ifeq ($(ARCH),x86-64-ssse3)
 	arch = x86_64
 	prefetch = yes
 	sse = yes
+	sse3 = yes
 	ssse3 = yes
 endif
 
@@ -147,6 +157,7 @@ ifeq ($(ARCH),x86-64-sse41)
 	arch = x86_64
 	prefetch = yes
 	sse = yes
+	sse3 = yes
 	ssse3 = yes
 	sse41 = yes
 endif
@@ -156,6 +167,7 @@ ifeq ($(ARCH),x86-64-sse42)
 	prefetch = yes
 	popcnt = yes
 	sse = yes
+	sse3 = yes
 	ssse3 = yes
 	sse41 = yes
 	sse42 = yes
@@ -167,6 +179,7 @@ ifeq ($(ARCH),x86-64-avx2)
 	prefetch = yes
 	popcnt = yes
 	sse = yes
+	sse3 = yes
 	ssse3 = yes
 	sse41 = yes
 	sse42 = yes
@@ -178,6 +191,7 @@ ifeq ($(ARCH),x86-64-bmi2)
 	prefetch = yes
 	popcnt = yes
 	sse = yes
+	sse3 = yes
 	ssse3 = yes
 	sse41 = yes
 	sse42 = yes
@@ -191,6 +205,7 @@ ifeq ($(ARCH),x86-64-avx512)
 	prefetch = yes
 	popcnt = yes
 	sse = yes
+	sse3 = yes
 	ssse3 = yes
 	sse41 = yes
 	sse42 = yes
@@ -455,6 +470,13 @@ ifeq ($(ssse3),yes)
 	endif
 endif
 
+ifeq ($(sse3),yes)
+	CXXFLAGS += -DUSE_SSE3
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw msys2))
+		CXXFLAGS += -msse3
+	endif
+endif
+
 ifeq ($(arch),x86_64)
 	CXXFLAGS += -DUSE_SSE2
 endif
@@ -522,6 +544,7 @@ help:
 	@echo "x86-64-sse42            > x86 64-bit with sse42 support"
 	@echo "x86-64-sse41            > x86 64-bit with sse41 support"
 	@echo "x86-64-ssse3            > x86 64-bit with ssse3 support"
+	@echo "x86-64-sse3             > x86 64-bit with ssse3 support"
 	@echo "x86-64                  > x86 64-bit generic"
 	@echo "x86-32                  > x86 32-bit (also enables SSE)"
 	@echo "x86-32-old              > x86 32-bit fall back for old hardware"
@@ -618,6 +641,7 @@ config-sanity:
 	@echo "prefetch: '$(prefetch)'"
 	@echo "popcnt: '$(popcnt)'"
 	@echo "sse: '$(sse)'"
+	@echo "sse3: '$(sse3)'"
 	@echo "ssse3: '$(ssse3)'"
 	@echo "sse41: '$(sse41)'"
 	@echo "sse42: '$(sse42)'"
@@ -642,6 +666,7 @@ config-sanity:
 	@test "$(prefetch)" = "yes" || test "$(prefetch)" = "no"
 	@test "$(popcnt)" = "yes" || test "$(popcnt)" = "no"
 	@test "$(sse)" = "yes" || test "$(sse)" = "no"
+	@test "$(sse3)" = "yes" || test "$(sse3)" = "no"
 	@test "$(ssse3)" = "yes" || test "$(ssse3)" = "no"
 	@test "$(sse41)" = "yes" || test "$(sse41)" = "no"
 	@test "$(sse42)" = "yes" || test "$(sse42)" = "no"

From a4786db4c2a0270d215550e7fec22edea691b123 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sun, 19 Jul 2020 12:41:50 +0900
Subject: [PATCH 121/583] Added support for architectures which supports
 SSE3+POPCNT, SSSE3+POPCNT and SSE41+POPCNT.

---
 src/Makefile | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 245fda0a..c1b03dd8 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -145,6 +145,15 @@ ifeq ($(ARCH),x86-64-sse3)
 	ssse3 = yes
 endif
 
+ifeq ($(ARCH),x86-64-sse3-popcnt)
+	arch = x86_64
+	prefetch = yes
+	popcnt = yes
+	sse = yes
+	sse3 = yes
+	ssse3 = yes
+endif
+
 ifeq ($(ARCH),x86-64-ssse3)
 	arch = x86_64
 	prefetch = yes
@@ -153,6 +162,15 @@ ifeq ($(ARCH),x86-64-ssse3)
 	ssse3 = yes
 endif
 
+ifeq ($(ARCH),x86-64-ssse3-popcnt)
+	arch = x86_64
+	prefetch = yes
+	popcnt = yes
+	sse = yes
+	sse3 = yes
+	ssse3 = yes
+endif
+
 ifeq ($(ARCH),x86-64-sse41)
 	arch = x86_64
 	prefetch = yes
@@ -162,6 +180,16 @@ ifeq ($(ARCH),x86-64-sse41)
 	sse41 = yes
 endif
 
+ifeq ($(ARCH),x86-64-sse41-popcnt)
+	arch = x86_64
+	prefetch = yes
+	popcnt = yes
+	sse = yes
+	sse3 = yes
+	ssse3 = yes
+	sse41 = yes
+endif
+
 ifeq ($(ARCH),x86-64-sse42)
 	arch = x86_64
 	prefetch = yes
@@ -433,19 +461,22 @@ endif
 ### 3.6 popcnt
 ifeq ($(popcnt),yes)
 	CXXFLAGS += -DUSE_POPCNT
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw msys2))
+		CXXFLAGS += -mpopcnt
+	endif
 endif
 
 ifeq ($(avx2),yes)
 	CXXFLAGS += -DUSE_AVX2
 	ifeq ($(comp),$(filter $(comp),gcc clang mingw msys2))
-	CXXFLAGS += -mavx2
+		CXXFLAGS += -mavx2
 	endif
 endif
 
 ifeq ($(avx512),yes)
 	CXXFLAGS += -DUSE_AVX512
 	ifeq ($(comp),$(filter $(comp),gcc clang mingw msys2))
-	CXXFLAGS += -mavx512vbmi
+		CXXFLAGS += -mavx512vbmi
 	endif
 endif
 

From 92c21674812fc1d7dcb9baf3d7e0b0999071a17b Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sun, 19 Jul 2020 12:52:20 +0900
Subject: [PATCH 122/583] Removed x86-64-ssse3-popcnt and x86-64-sse41-popcnt.

---
 src/Makefile | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index c1b03dd8..a504ce27 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -162,15 +162,6 @@ ifeq ($(ARCH),x86-64-ssse3)
 	ssse3 = yes
 endif
 
-ifeq ($(ARCH),x86-64-ssse3-popcnt)
-	arch = x86_64
-	prefetch = yes
-	popcnt = yes
-	sse = yes
-	sse3 = yes
-	ssse3 = yes
-endif
-
 ifeq ($(ARCH),x86-64-sse41)
 	arch = x86_64
 	prefetch = yes
@@ -180,16 +171,6 @@ ifeq ($(ARCH),x86-64-sse41)
 	sse41 = yes
 endif
 
-ifeq ($(ARCH),x86-64-sse41-popcnt)
-	arch = x86_64
-	prefetch = yes
-	popcnt = yes
-	sse = yes
-	sse3 = yes
-	ssse3 = yes
-	sse41 = yes
-endif
-
 ifeq ($(ARCH),x86-64-sse42)
 	arch = x86_64
 	prefetch = yes

From 1536e31065df90060b9053acdbc21b4319da7de9 Mon Sep 17 00:00:00 2001
From: No name <no@email>
Date: Fri, 17 Jul 2020 10:25:23 +0300
Subject: [PATCH 123/583] Load the parameter set on an `isready' as well

Unbreaks Scid vs. PC, which doesn't send `ucinewgame'.
---
 src/uci.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/uci.cpp b/src/uci.cpp
index 6d86ebca..c775f333 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -383,8 +383,12 @@ void UCI::loop(int argc, char* argv[]) {
 #endif
           Search::clear();
       }
-      else if (token == "isready")    sync_cout << "readyok" << sync_endl;
-
+      else if (token == "isready") {
+#if defined(EVAL_NNUE)
+          init_nnue(true);
+#endif
+          sync_cout << "readyok" << sync_endl;
+      }
       // Additional custom non-UCI commands, mainly for debugging.
       // Do not use these commands during a search!
       else if (token == "flip")     pos.flip();

From c001a4e62d8d63de6145a45f19cd35d855444e5c Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sun, 19 Jul 2020 13:58:19 +0900
Subject: [PATCH 124/583] Revert "Removed x86-64-ssse3-popcnt and
 x86-64-sse41-popcnt."

This reverts commit 92c21674812fc1d7dcb9baf3d7e0b0999071a17b.
---
 src/Makefile | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/Makefile b/src/Makefile
index a504ce27..c1b03dd8 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -162,6 +162,15 @@ ifeq ($(ARCH),x86-64-ssse3)
 	ssse3 = yes
 endif
 
+ifeq ($(ARCH),x86-64-ssse3-popcnt)
+	arch = x86_64
+	prefetch = yes
+	popcnt = yes
+	sse = yes
+	sse3 = yes
+	ssse3 = yes
+endif
+
 ifeq ($(ARCH),x86-64-sse41)
 	arch = x86_64
 	prefetch = yes
@@ -171,6 +180,16 @@ ifeq ($(ARCH),x86-64-sse41)
 	sse41 = yes
 endif
 
+ifeq ($(ARCH),x86-64-sse41-popcnt)
+	arch = x86_64
+	prefetch = yes
+	popcnt = yes
+	sse = yes
+	sse3 = yes
+	ssse3 = yes
+	sse41 = yes
+endif
+
 ifeq ($(ARCH),x86-64-sse42)
 	arch = x86_64
 	prefetch = yes

From 3bbe4802b12bb7dd4298173ae002f87d2e1de476 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sun, 19 Jul 2020 14:02:49 +0900
Subject: [PATCH 125/583] Removed the sse41-popcnt architecture.

---
 src/Makefile | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index c1b03dd8..984f5871 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -180,16 +180,6 @@ ifeq ($(ARCH),x86-64-sse41)
 	sse41 = yes
 endif
 
-ifeq ($(ARCH),x86-64-sse41-popcnt)
-	arch = x86_64
-	prefetch = yes
-	popcnt = yes
-	sse = yes
-	sse3 = yes
-	ssse3 = yes
-	sse41 = yes
-endif
-
 ifeq ($(ARCH),x86-64-sse42)
 	arch = x86_64
 	prefetch = yes

From 36092b855a5e2bcfb587a36e2055ea068e4bd8e5 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sun, 19 Jul 2020 14:17:35 +0900
Subject: [PATCH 126/583] Removed the x86-64-ssse3-popcnt architecture.

---
 src/Makefile | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 984f5871..a504ce27 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -162,15 +162,6 @@ ifeq ($(ARCH),x86-64-ssse3)
 	ssse3 = yes
 endif
 
-ifeq ($(ARCH),x86-64-ssse3-popcnt)
-	arch = x86_64
-	prefetch = yes
-	popcnt = yes
-	sse = yes
-	sse3 = yes
-	ssse3 = yes
-endif
-
 ifeq ($(ARCH),x86-64-sse41)
 	arch = x86_64
 	prefetch = yes

From afd7d0ea4d8ac031386ffc27f178c6dee49e0f89 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sun, 19 Jul 2020 18:34:35 +0900
Subject: [PATCH 127/583] Fixed a bug that Makefile specifies -mpopcnt for
 armv8-a.

---
 src/Makefile | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index a504ce27..4d56fc01 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -442,8 +442,10 @@ endif
 ### 3.6 popcnt
 ifeq ($(popcnt),yes)
 	CXXFLAGS += -DUSE_POPCNT
-	ifeq ($(comp),$(filter $(comp),gcc clang mingw msys2))
-		CXXFLAGS += -mpopcnt
+	ifneq ($(arch),$(filter $(arch),ppc64 armv8-a))
+		ifeq ($(comp),$(filter $(comp),gcc clang mingw msys2))
+			CXXFLAGS += -mpopcnt
+		endif
 	endif
 endif
 

From fd78fb05f6fbf3cab18160ce4f0bfba9d40bf5eb Mon Sep 17 00:00:00 2001
From: No name <no@email>
Date: Sun, 19 Jul 2020 13:50:00 +0300
Subject: [PATCH 128/583] Hide NNUE options if building without NNUE support

Also remove an unused option.
---
 src/ucioption.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index ac5a6a16..e145c34b 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -80,6 +80,7 @@ void init(OptionsMap& o) {
   o["SyzygyProbeDepth"]      << Option(1, 1, 100);
   o["Syzygy50MoveRule"]      << Option(true);
   o["SyzygyProbeLimit"]      << Option(7, 0, 7);
+#ifdef EVAL_NNUE
   // Evaluation function file name. When this is changed, it is necessary to reread the evaluation function at the next ucinewgame timing.
   // Without the preceding "./", some GUIs can not load he net file.
   o["EvalFile"]              << Option("./eval/nn.bin", on_eval_file);
@@ -90,8 +91,8 @@ void init(OptionsMap& o) {
   // Hit the test eval convert command.
   o["SkipLoadingEval"]       << Option(false);
   // how many moves to use a fixed move
-  o["BookMoves"] << Option(16, 0, 10000);
-
+  // o["BookMoves"] << Option(16, 0, 10000);
+#endif
 #if defined(EVAL_LEARN)
   // When learning the evaluation function, you can change the folder to save the evaluation function.
   // Evalsave by default. This folder shall be prepared in advance.

From 77018c77cc736854367a918eb14b45615a1f7587 Mon Sep 17 00:00:00 2001
From: mstembera <MissingEmail@email>
Date: Sun, 19 Jul 2020 05:16:13 -0700
Subject: [PATCH 129/583] Fix profile builds for AVX512.

---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 4d56fc01..cfa96694 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -459,7 +459,7 @@ endif
 ifeq ($(avx512),yes)
 	CXXFLAGS += -DUSE_AVX512
 	ifeq ($(comp),$(filter $(comp),gcc clang mingw msys2))
-		CXXFLAGS += -mavx512vbmi
+		CXXFLAGS += -mavx512bw
 	endif
 endif
 

From fbdb373b6482db2b462b30d7399c3c7fed1d4f26 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Mon, 20 Jul 2020 17:17:50 +0900
Subject: [PATCH 130/583] Changed to set the binary directory to the current
 working directory.

---
 src/main.cpp | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/src/main.cpp b/src/main.cpp
index fafefee2..6001432d 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -20,6 +20,15 @@
 
 #include <iostream>
 
+#ifdef _WIN32
+#include <filesystem>
+
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <Windows.h>
+#endif
+
 #include "bitboard.h"
 #include "endgame.h"
 #include "position.h"
@@ -34,6 +43,17 @@ namespace PSQT {
 }
 
 int main(int argc, char* argv[]) {
+  // Change the current working directory to the binary directory.  So that a
+  // net file path can be specified with a relative path from the binary
+  // directory.
+  // TODO(someone): Implement the logic for other OS.
+#ifdef _WIN32
+  TCHAR filename[_MAX_PATH];
+  ::GetModuleFileName(NULL, filename, sizeof(filename) / sizeof(filename[0]));
+  std::filesystem::path current_path = filename;
+  current_path.remove_filename();
+  std::filesystem::current_path(current_path);
+#endif
 
   std::cout << engine_info() << std::endl;
 

From 74049a450c99393912aa3e60da48f4fb7622fb95 Mon Sep 17 00:00:00 2001
From: No name <no@email>
Date: Mon, 20 Jul 2020 09:53:44 +0300
Subject: [PATCH 131/583] Add NNUE targets to the output of 'make help'

---
 src/Makefile | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index cfa96694..9c9aed5c 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -544,8 +544,17 @@ help:
 	@echo ""
 	@echo "Supported targets:"
 	@echo ""
-	@echo "build                   > Standard build"
-	@echo "profile-build           > PGO build"
+	@echo "build                   > Standard (without NNUE) build"
+	@echo "profile-build           > Standard build with PGO"
+	@echo "nnue                    > NNUE-enabled build"
+	@echo "profile-nnue            > NNUE-enabled build with PGO"
+	@echo "nnue-learn              > Produces or refines a NNUE parameter set."
+	@echo "                            Requires training data that can be"
+	@echo "                            generated by itself using an existing"
+	@echo "                            parameter set, or with the next tool"
+	@echo "nnue-gen-sfen-from-original-eval"
+	@echo "                        > Produces training data for 'nnue-learn'"
+	@echo "                        >   without using a NNUE parameter set"
 	@echo "strip                   > Strip executable"
 	@echo "install                 > Install executable"
 	@echo "clean                   > Clean up"

From c0e1235fef49b3338660c076e55cccf67331d77f Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Mon, 20 Jul 2020 17:36:09 +0900
Subject: [PATCH 132/583] Added a description to Makefile.

---
 src/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/Makefile b/src/Makefile
index 9c9aed5c..2e6c415d 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -567,6 +567,7 @@ help:
 	@echo "x86-64-sse42            > x86 64-bit with sse42 support"
 	@echo "x86-64-sse41            > x86 64-bit with sse41 support"
 	@echo "x86-64-ssse3            > x86 64-bit with ssse3 support"
+	@echo "x86-64-sse3-popcnt      > x86 64-bit with ssse3 and popcnt support"
 	@echo "x86-64-sse3             > x86 64-bit with ssse3 support"
 	@echo "x86-64                  > x86 64-bit generic"
 	@echo "x86-32                  > x86 32-bit (also enables SSE)"

From 1c2346538396c1046564ddc411bc9dac72939f8a Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Fri, 7 Aug 2020 22:34:53 +0900
Subject: [PATCH 133/583] Moved the nnue folder.

---
 src/{eval => }/nnue/architectures/halfkp-cr-ep_256x2-32-32.h  | 0
 src/{eval => }/nnue/architectures/halfkp_256x2-32-32.h        | 0
 src/{eval => }/nnue/architectures/halfkp_384x2-32-32.h        | 0
 src/{eval => }/nnue/architectures/k-p-cr-ep_256x2-32-32.h     | 0
 src/{eval => }/nnue/architectures/k-p-cr_256x2-32-32.h        | 0
 src/{eval => }/nnue/architectures/k-p_256x2-32-32.h           | 0
 src/{eval => }/nnue/evaluate_nnue.cpp                         | 0
 src/{eval => }/nnue/evaluate_nnue.h                           | 0
 src/{eval => }/nnue/evaluate_nnue_learner.cpp                 | 0
 src/{eval => }/nnue/evaluate_nnue_learner.h                   | 0
 src/{eval => }/nnue/features/castling_right.cpp               | 0
 src/{eval => }/nnue/features/castling_right.h                 | 0
 src/{eval => }/nnue/features/enpassant.cpp                    | 0
 src/{eval => }/nnue/features/enpassant.h                      | 0
 src/{eval => }/nnue/features/feature_set.h                    | 0
 src/{eval => }/nnue/features/features_common.h                | 0
 src/{eval => }/nnue/features/half_kp.cpp                      | 0
 src/{eval => }/nnue/features/half_kp.h                        | 0
 src/{eval => }/nnue/features/half_relative_kp.cpp             | 0
 src/{eval => }/nnue/features/half_relative_kp.h               | 0
 src/{eval => }/nnue/features/index_list.h                     | 0
 src/{eval => }/nnue/features/k.cpp                            | 0
 src/{eval => }/nnue/features/k.h                              | 0
 src/{eval => }/nnue/features/p.cpp                            | 0
 src/{eval => }/nnue/features/p.h                              | 0
 src/{eval => }/nnue/layers/affine_transform.h                 | 0
 src/{eval => }/nnue/layers/clipped_relu.h                     | 0
 src/{eval => }/nnue/layers/input_slice.h                      | 0
 src/{eval => }/nnue/layers/sum.h                              | 0
 src/{eval => }/nnue/nnue_accumulator.h                        | 0
 src/{eval => }/nnue/nnue_architecture.h                       | 0
 src/{eval => }/nnue/nnue_common.h                             | 0
 src/{eval => }/nnue/nnue_feature_transformer.h                | 0
 src/{eval => }/nnue/nnue_test_command.cpp                     | 0
 src/{eval => }/nnue/nnue_test_command.h                       | 0
 src/{eval => }/nnue/trainer/features/factorizer.h             | 0
 src/{eval => }/nnue/trainer/features/factorizer_feature_set.h | 0
 src/{eval => }/nnue/trainer/features/factorizer_half_kp.h     | 0
 src/{eval => }/nnue/trainer/trainer.h                         | 0
 src/{eval => }/nnue/trainer/trainer_affine_transform.h        | 0
 src/{eval => }/nnue/trainer/trainer_clipped_relu.h            | 0
 src/{eval => }/nnue/trainer/trainer_feature_transformer.h     | 0
 src/{eval => }/nnue/trainer/trainer_input_slice.h             | 0
 src/{eval => }/nnue/trainer/trainer_sum.h                     | 0
 44 files changed, 0 insertions(+), 0 deletions(-)
 rename src/{eval => }/nnue/architectures/halfkp-cr-ep_256x2-32-32.h (100%)
 rename src/{eval => }/nnue/architectures/halfkp_256x2-32-32.h (100%)
 rename src/{eval => }/nnue/architectures/halfkp_384x2-32-32.h (100%)
 rename src/{eval => }/nnue/architectures/k-p-cr-ep_256x2-32-32.h (100%)
 rename src/{eval => }/nnue/architectures/k-p-cr_256x2-32-32.h (100%)
 rename src/{eval => }/nnue/architectures/k-p_256x2-32-32.h (100%)
 rename src/{eval => }/nnue/evaluate_nnue.cpp (100%)
 rename src/{eval => }/nnue/evaluate_nnue.h (100%)
 rename src/{eval => }/nnue/evaluate_nnue_learner.cpp (100%)
 rename src/{eval => }/nnue/evaluate_nnue_learner.h (100%)
 rename src/{eval => }/nnue/features/castling_right.cpp (100%)
 rename src/{eval => }/nnue/features/castling_right.h (100%)
 rename src/{eval => }/nnue/features/enpassant.cpp (100%)
 rename src/{eval => }/nnue/features/enpassant.h (100%)
 rename src/{eval => }/nnue/features/feature_set.h (100%)
 rename src/{eval => }/nnue/features/features_common.h (100%)
 rename src/{eval => }/nnue/features/half_kp.cpp (100%)
 rename src/{eval => }/nnue/features/half_kp.h (100%)
 rename src/{eval => }/nnue/features/half_relative_kp.cpp (100%)
 rename src/{eval => }/nnue/features/half_relative_kp.h (100%)
 rename src/{eval => }/nnue/features/index_list.h (100%)
 rename src/{eval => }/nnue/features/k.cpp (100%)
 rename src/{eval => }/nnue/features/k.h (100%)
 rename src/{eval => }/nnue/features/p.cpp (100%)
 rename src/{eval => }/nnue/features/p.h (100%)
 rename src/{eval => }/nnue/layers/affine_transform.h (100%)
 rename src/{eval => }/nnue/layers/clipped_relu.h (100%)
 rename src/{eval => }/nnue/layers/input_slice.h (100%)
 rename src/{eval => }/nnue/layers/sum.h (100%)
 rename src/{eval => }/nnue/nnue_accumulator.h (100%)
 rename src/{eval => }/nnue/nnue_architecture.h (100%)
 rename src/{eval => }/nnue/nnue_common.h (100%)
 rename src/{eval => }/nnue/nnue_feature_transformer.h (100%)
 rename src/{eval => }/nnue/nnue_test_command.cpp (100%)
 rename src/{eval => }/nnue/nnue_test_command.h (100%)
 rename src/{eval => }/nnue/trainer/features/factorizer.h (100%)
 rename src/{eval => }/nnue/trainer/features/factorizer_feature_set.h (100%)
 rename src/{eval => }/nnue/trainer/features/factorizer_half_kp.h (100%)
 rename src/{eval => }/nnue/trainer/trainer.h (100%)
 rename src/{eval => }/nnue/trainer/trainer_affine_transform.h (100%)
 rename src/{eval => }/nnue/trainer/trainer_clipped_relu.h (100%)
 rename src/{eval => }/nnue/trainer/trainer_feature_transformer.h (100%)
 rename src/{eval => }/nnue/trainer/trainer_input_slice.h (100%)
 rename src/{eval => }/nnue/trainer/trainer_sum.h (100%)

diff --git a/src/eval/nnue/architectures/halfkp-cr-ep_256x2-32-32.h b/src/nnue/architectures/halfkp-cr-ep_256x2-32-32.h
similarity index 100%
rename from src/eval/nnue/architectures/halfkp-cr-ep_256x2-32-32.h
rename to src/nnue/architectures/halfkp-cr-ep_256x2-32-32.h
diff --git a/src/eval/nnue/architectures/halfkp_256x2-32-32.h b/src/nnue/architectures/halfkp_256x2-32-32.h
similarity index 100%
rename from src/eval/nnue/architectures/halfkp_256x2-32-32.h
rename to src/nnue/architectures/halfkp_256x2-32-32.h
diff --git a/src/eval/nnue/architectures/halfkp_384x2-32-32.h b/src/nnue/architectures/halfkp_384x2-32-32.h
similarity index 100%
rename from src/eval/nnue/architectures/halfkp_384x2-32-32.h
rename to src/nnue/architectures/halfkp_384x2-32-32.h
diff --git a/src/eval/nnue/architectures/k-p-cr-ep_256x2-32-32.h b/src/nnue/architectures/k-p-cr-ep_256x2-32-32.h
similarity index 100%
rename from src/eval/nnue/architectures/k-p-cr-ep_256x2-32-32.h
rename to src/nnue/architectures/k-p-cr-ep_256x2-32-32.h
diff --git a/src/eval/nnue/architectures/k-p-cr_256x2-32-32.h b/src/nnue/architectures/k-p-cr_256x2-32-32.h
similarity index 100%
rename from src/eval/nnue/architectures/k-p-cr_256x2-32-32.h
rename to src/nnue/architectures/k-p-cr_256x2-32-32.h
diff --git a/src/eval/nnue/architectures/k-p_256x2-32-32.h b/src/nnue/architectures/k-p_256x2-32-32.h
similarity index 100%
rename from src/eval/nnue/architectures/k-p_256x2-32-32.h
rename to src/nnue/architectures/k-p_256x2-32-32.h
diff --git a/src/eval/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
similarity index 100%
rename from src/eval/nnue/evaluate_nnue.cpp
rename to src/nnue/evaluate_nnue.cpp
diff --git a/src/eval/nnue/evaluate_nnue.h b/src/nnue/evaluate_nnue.h
similarity index 100%
rename from src/eval/nnue/evaluate_nnue.h
rename to src/nnue/evaluate_nnue.h
diff --git a/src/eval/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
similarity index 100%
rename from src/eval/nnue/evaluate_nnue_learner.cpp
rename to src/nnue/evaluate_nnue_learner.cpp
diff --git a/src/eval/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
similarity index 100%
rename from src/eval/nnue/evaluate_nnue_learner.h
rename to src/nnue/evaluate_nnue_learner.h
diff --git a/src/eval/nnue/features/castling_right.cpp b/src/nnue/features/castling_right.cpp
similarity index 100%
rename from src/eval/nnue/features/castling_right.cpp
rename to src/nnue/features/castling_right.cpp
diff --git a/src/eval/nnue/features/castling_right.h b/src/nnue/features/castling_right.h
similarity index 100%
rename from src/eval/nnue/features/castling_right.h
rename to src/nnue/features/castling_right.h
diff --git a/src/eval/nnue/features/enpassant.cpp b/src/nnue/features/enpassant.cpp
similarity index 100%
rename from src/eval/nnue/features/enpassant.cpp
rename to src/nnue/features/enpassant.cpp
diff --git a/src/eval/nnue/features/enpassant.h b/src/nnue/features/enpassant.h
similarity index 100%
rename from src/eval/nnue/features/enpassant.h
rename to src/nnue/features/enpassant.h
diff --git a/src/eval/nnue/features/feature_set.h b/src/nnue/features/feature_set.h
similarity index 100%
rename from src/eval/nnue/features/feature_set.h
rename to src/nnue/features/feature_set.h
diff --git a/src/eval/nnue/features/features_common.h b/src/nnue/features/features_common.h
similarity index 100%
rename from src/eval/nnue/features/features_common.h
rename to src/nnue/features/features_common.h
diff --git a/src/eval/nnue/features/half_kp.cpp b/src/nnue/features/half_kp.cpp
similarity index 100%
rename from src/eval/nnue/features/half_kp.cpp
rename to src/nnue/features/half_kp.cpp
diff --git a/src/eval/nnue/features/half_kp.h b/src/nnue/features/half_kp.h
similarity index 100%
rename from src/eval/nnue/features/half_kp.h
rename to src/nnue/features/half_kp.h
diff --git a/src/eval/nnue/features/half_relative_kp.cpp b/src/nnue/features/half_relative_kp.cpp
similarity index 100%
rename from src/eval/nnue/features/half_relative_kp.cpp
rename to src/nnue/features/half_relative_kp.cpp
diff --git a/src/eval/nnue/features/half_relative_kp.h b/src/nnue/features/half_relative_kp.h
similarity index 100%
rename from src/eval/nnue/features/half_relative_kp.h
rename to src/nnue/features/half_relative_kp.h
diff --git a/src/eval/nnue/features/index_list.h b/src/nnue/features/index_list.h
similarity index 100%
rename from src/eval/nnue/features/index_list.h
rename to src/nnue/features/index_list.h
diff --git a/src/eval/nnue/features/k.cpp b/src/nnue/features/k.cpp
similarity index 100%
rename from src/eval/nnue/features/k.cpp
rename to src/nnue/features/k.cpp
diff --git a/src/eval/nnue/features/k.h b/src/nnue/features/k.h
similarity index 100%
rename from src/eval/nnue/features/k.h
rename to src/nnue/features/k.h
diff --git a/src/eval/nnue/features/p.cpp b/src/nnue/features/p.cpp
similarity index 100%
rename from src/eval/nnue/features/p.cpp
rename to src/nnue/features/p.cpp
diff --git a/src/eval/nnue/features/p.h b/src/nnue/features/p.h
similarity index 100%
rename from src/eval/nnue/features/p.h
rename to src/nnue/features/p.h
diff --git a/src/eval/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
similarity index 100%
rename from src/eval/nnue/layers/affine_transform.h
rename to src/nnue/layers/affine_transform.h
diff --git a/src/eval/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h
similarity index 100%
rename from src/eval/nnue/layers/clipped_relu.h
rename to src/nnue/layers/clipped_relu.h
diff --git a/src/eval/nnue/layers/input_slice.h b/src/nnue/layers/input_slice.h
similarity index 100%
rename from src/eval/nnue/layers/input_slice.h
rename to src/nnue/layers/input_slice.h
diff --git a/src/eval/nnue/layers/sum.h b/src/nnue/layers/sum.h
similarity index 100%
rename from src/eval/nnue/layers/sum.h
rename to src/nnue/layers/sum.h
diff --git a/src/eval/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h
similarity index 100%
rename from src/eval/nnue/nnue_accumulator.h
rename to src/nnue/nnue_accumulator.h
diff --git a/src/eval/nnue/nnue_architecture.h b/src/nnue/nnue_architecture.h
similarity index 100%
rename from src/eval/nnue/nnue_architecture.h
rename to src/nnue/nnue_architecture.h
diff --git a/src/eval/nnue/nnue_common.h b/src/nnue/nnue_common.h
similarity index 100%
rename from src/eval/nnue/nnue_common.h
rename to src/nnue/nnue_common.h
diff --git a/src/eval/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
similarity index 100%
rename from src/eval/nnue/nnue_feature_transformer.h
rename to src/nnue/nnue_feature_transformer.h
diff --git a/src/eval/nnue/nnue_test_command.cpp b/src/nnue/nnue_test_command.cpp
similarity index 100%
rename from src/eval/nnue/nnue_test_command.cpp
rename to src/nnue/nnue_test_command.cpp
diff --git a/src/eval/nnue/nnue_test_command.h b/src/nnue/nnue_test_command.h
similarity index 100%
rename from src/eval/nnue/nnue_test_command.h
rename to src/nnue/nnue_test_command.h
diff --git a/src/eval/nnue/trainer/features/factorizer.h b/src/nnue/trainer/features/factorizer.h
similarity index 100%
rename from src/eval/nnue/trainer/features/factorizer.h
rename to src/nnue/trainer/features/factorizer.h
diff --git a/src/eval/nnue/trainer/features/factorizer_feature_set.h b/src/nnue/trainer/features/factorizer_feature_set.h
similarity index 100%
rename from src/eval/nnue/trainer/features/factorizer_feature_set.h
rename to src/nnue/trainer/features/factorizer_feature_set.h
diff --git a/src/eval/nnue/trainer/features/factorizer_half_kp.h b/src/nnue/trainer/features/factorizer_half_kp.h
similarity index 100%
rename from src/eval/nnue/trainer/features/factorizer_half_kp.h
rename to src/nnue/trainer/features/factorizer_half_kp.h
diff --git a/src/eval/nnue/trainer/trainer.h b/src/nnue/trainer/trainer.h
similarity index 100%
rename from src/eval/nnue/trainer/trainer.h
rename to src/nnue/trainer/trainer.h
diff --git a/src/eval/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
similarity index 100%
rename from src/eval/nnue/trainer/trainer_affine_transform.h
rename to src/nnue/trainer/trainer_affine_transform.h
diff --git a/src/eval/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
similarity index 100%
rename from src/eval/nnue/trainer/trainer_clipped_relu.h
rename to src/nnue/trainer/trainer_clipped_relu.h
diff --git a/src/eval/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
similarity index 100%
rename from src/eval/nnue/trainer/trainer_feature_transformer.h
rename to src/nnue/trainer/trainer_feature_transformer.h
diff --git a/src/eval/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
similarity index 100%
rename from src/eval/nnue/trainer/trainer_input_slice.h
rename to src/nnue/trainer/trainer_input_slice.h
diff --git a/src/eval/nnue/trainer/trainer_sum.h b/src/nnue/trainer/trainer_sum.h
similarity index 100%
rename from src/eval/nnue/trainer/trainer_sum.h
rename to src/nnue/trainer/trainer_sum.h

From bf7d02578e94296667f159a46fe5fdc8e2f3b94b Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Fri, 7 Aug 2020 22:47:45 +0900
Subject: [PATCH 134/583] Fixed build errors.

---
 src/evaluate.cpp                               |  2 +-
 src/learn/learner.cpp                          |  2 +-
 src/nnue/evaluate_nnue.cpp                     |  8 ++++----
 src/nnue/evaluate_nnue_learner.cpp             | 14 +++++++-------
 src/nnue/evaluate_nnue_learner.h               |  2 +-
 src/nnue/features/castling_right.h             |  2 +-
 src/nnue/features/enpassant.h                  |  2 +-
 src/nnue/features/features_common.h            |  2 +-
 src/nnue/features/half_kp.h                    |  2 +-
 src/nnue/features/half_relative_kp.h           |  2 +-
 src/nnue/features/index_list.h                 |  2 +-
 src/nnue/features/k.h                          |  2 +-
 src/nnue/features/p.h                          |  2 +-
 src/nnue/nnue_test_command.cpp                 |  4 ++--
 src/nnue/trainer/trainer_affine_transform.h    |  2 +-
 src/nnue/trainer/trainer_clipped_relu.h        |  2 +-
 src/nnue/trainer/trainer_feature_transformer.h |  2 +-
 src/nnue/trainer/trainer_input_slice.h         |  2 +-
 src/nnue/trainer/trainer_sum.h                 |  2 +-
 src/position.h                                 |  2 +-
 src/uci.cpp                                    |  2 +-
 21 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 12ecff00..384c081d 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -30,7 +30,7 @@
 #include "material.h"
 #include "pawns.h"
 #include "thread.h"
-#include "eval/nnue/evaluate_nnue.h"
+#include "nnue/evaluate_nnue.h"
 
 namespace Trace {
 
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index e343fde5..aff32ee8 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -81,7 +81,7 @@
 #include "multi_think.h"
 
 #if defined(EVAL_NNUE)
-#include "../eval/nnue/evaluate_nnue_learner.h"
+#include "../nnue/evaluate_nnue_learner.h"
 #include <shared_mutex>
 #endif
 
diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index 55e627d0..6a664907 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -5,10 +5,10 @@
 #include <fstream>
 #include <iostream>
 
-#include "../../evaluate.h"
-#include "../../position.h"
-#include "../../misc.h"
-#include "../../uci.h"
+#include "../evaluate.h"
+#include "../position.h"
+#include "../misc.h"
+#include "../uci.h"
 
 #include "evaluate_nnue.h"
 
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 3297037d..650f443e 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -5,15 +5,15 @@
 #include <random>
 #include <fstream>
 
-#include "../../learn/learn.h"
-#include "../../learn/learning_tools.h"
+#include "../learn/learn.h"
+#include "../learn/learning_tools.h"
 
-#include "../../position.h"
-#include "../../uci.h"
-#include "../../misc.h"
-#include "../../thread_win32_osx.h"
+#include "../position.h"
+#include "../uci.h"
+#include "../misc.h"
+#include "../thread_win32_osx.h"
 
-#include "../evaluate_common.h"
+#include "../eval/evaluate_common.h"
 
 #include "evaluate_nnue.h"
 #include "evaluate_nnue_learner.h"
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index ace66524..1e4a463e 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -5,7 +5,7 @@
 
 #if defined(EVAL_LEARN) && defined(EVAL_NNUE)
 
-#include "../../learn/learn.h"
+#include "../learn/learn.h"
 
 namespace Eval {
 
diff --git a/src/nnue/features/castling_right.h b/src/nnue/features/castling_right.h
index 709d4688..3af5b074 100644
--- a/src/nnue/features/castling_right.h
+++ b/src/nnue/features/castling_right.h
@@ -5,7 +5,7 @@
 
 #if defined(EVAL_NNUE)
 
-#include "../../../evaluate.h"
+#include "../../evaluate.h"
 #include "features_common.h"
 
 namespace Eval {
diff --git a/src/nnue/features/enpassant.h b/src/nnue/features/enpassant.h
index 51880bb4..f77f9c4f 100644
--- a/src/nnue/features/enpassant.h
+++ b/src/nnue/features/enpassant.h
@@ -5,7 +5,7 @@
 
 #if defined(EVAL_NNUE)
 
-#include "../../../evaluate.h"
+#include "../../evaluate.h"
 #include "features_common.h"
 
 namespace Eval {
diff --git a/src/nnue/features/features_common.h b/src/nnue/features/features_common.h
index 8d2ca4a2..2fabbd4f 100644
--- a/src/nnue/features/features_common.h
+++ b/src/nnue/features/features_common.h
@@ -5,7 +5,7 @@
 
 #if defined(EVAL_NNUE)
 
-#include "../../../evaluate.h"
+#include "../../evaluate.h"
 #include "../nnue_common.h"
 
 namespace Eval {
diff --git a/src/nnue/features/half_kp.h b/src/nnue/features/half_kp.h
index cc9cd660..d2e8e521 100644
--- a/src/nnue/features/half_kp.h
+++ b/src/nnue/features/half_kp.h
@@ -5,7 +5,7 @@
 
 #if defined(EVAL_NNUE)
 
-#include "../../../evaluate.h"
+#include "../../evaluate.h"
 #include "features_common.h"
 
 namespace Eval {
diff --git a/src/nnue/features/half_relative_kp.h b/src/nnue/features/half_relative_kp.h
index 2f967745..49b31499 100644
--- a/src/nnue/features/half_relative_kp.h
+++ b/src/nnue/features/half_relative_kp.h
@@ -5,7 +5,7 @@
 
 #if defined(EVAL_NNUE)
 
-#include "../../../evaluate.h"
+#include "../../evaluate.h"
 #include "features_common.h"
 
 namespace Eval {
diff --git a/src/nnue/features/index_list.h b/src/nnue/features/index_list.h
index 39e66a09..0374b19d 100644
--- a/src/nnue/features/index_list.h
+++ b/src/nnue/features/index_list.h
@@ -5,7 +5,7 @@
 
 #if defined(EVAL_NNUE)
 
-#include "../../../position.h"
+#include "../../position.h"
 #include "../nnue_architecture.h"
 
 namespace Eval {
diff --git a/src/nnue/features/k.h b/src/nnue/features/k.h
index d7a6f4aa..28431010 100644
--- a/src/nnue/features/k.h
+++ b/src/nnue/features/k.h
@@ -5,7 +5,7 @@
 
 #if defined(EVAL_NNUE)
 
-#include "../../../evaluate.h"
+#include "../../evaluate.h"
 #include "features_common.h"
 
 namespace Eval {
diff --git a/src/nnue/features/p.h b/src/nnue/features/p.h
index 27a944fa..d07acb59 100644
--- a/src/nnue/features/p.h
+++ b/src/nnue/features/p.h
@@ -5,7 +5,7 @@
 
 #if defined(EVAL_NNUE)
 
-#include "../../../evaluate.h"
+#include "../../evaluate.h"
 #include "features_common.h"
 
 namespace Eval {
diff --git a/src/nnue/nnue_test_command.cpp b/src/nnue/nnue_test_command.cpp
index b0c57d4c..311c5ded 100644
--- a/src/nnue/nnue_test_command.cpp
+++ b/src/nnue/nnue_test_command.cpp
@@ -2,8 +2,8 @@
 
 #if defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
 
-#include "../../thread.h"
-#include "../../uci.h"
+#include "../thread.h"
+#include "../uci.h"
 #include "evaluate_nnue.h"
 #include "nnue_test_command.h"
 
diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index f5b208a3..db56c1c0 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -5,7 +5,7 @@
 
 #if defined(EVAL_LEARN) && defined(EVAL_NNUE)
 
-#include "../../../learn/learn.h"
+#include "../../learn/learn.h"
 #include "../layers/affine_transform.h"
 #include "trainer.h"
 
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index 566ed777..fd7b1a07 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -5,7 +5,7 @@
 
 #if defined(EVAL_LEARN) && defined(EVAL_NNUE)
 
-#include "../../../learn/learn.h"
+#include "../../learn/learn.h"
 #include "../layers/clipped_relu.h"
 #include "trainer.h"
 
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 0139d534..97dbeff4 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -5,7 +5,7 @@
 
 #if defined(EVAL_LEARN) && defined(EVAL_NNUE)
 
-#include "../../../learn/learn.h"
+#include "../../learn/learn.h"
 #include "../nnue_feature_transformer.h"
 #include "trainer.h"
 #include "features/factorizer_feature_set.h"
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index f5b263d3..33e39244 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -5,7 +5,7 @@
 
 #if defined(EVAL_LEARN) && defined(EVAL_NNUE)
 
-#include "../../../learn/learn.h"
+#include "../../learn/learn.h"
 #include "../layers/input_slice.h"
 #include "trainer.h"
 
diff --git a/src/nnue/trainer/trainer_sum.h b/src/nnue/trainer/trainer_sum.h
index 2efdff67..fb5b1532 100644
--- a/src/nnue/trainer/trainer_sum.h
+++ b/src/nnue/trainer/trainer_sum.h
@@ -5,7 +5,7 @@
 
 #if defined(EVAL_LEARN) && defined(EVAL_NNUE)
 
-#include "../../../learn/learn.h"
+#include "../../learn/learn.h"
 #include "../layers/sum.h"
 #include "trainer.h"
 
diff --git a/src/position.h b/src/position.h
index 725be527..52cc0254 100644
--- a/src/position.h
+++ b/src/position.h
@@ -32,7 +32,7 @@
 #include "misc.h"
 #include "types.h"
 
-#include "eval/nnue/nnue_accumulator.h"
+#include "nnue/nnue_accumulator.h"
 
 
 /// StateInfo struct stores information needed to restore a Position object to
diff --git a/src/uci.cpp b/src/uci.cpp
index c775f333..3b0a08a9 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -35,7 +35,7 @@
 #include "syzygy/tbprobe.h"
 
 #if defined(EVAL_NNUE) && defined(ENABLE_TEST_CMD)
-#include "eval/nnue/nnue_test_command.h"
+#include "nnue/nnue_test_command.h"
 #endif
 
 using namespace std;

From 1abae04cebac252c9728f16a997e0aedaf939277 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Fri, 7 Aug 2020 23:00:11 +0900
Subject: [PATCH 135/583] Fixed Makefile.

---
 src/Makefile | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 2e6c415d..1e07bba4 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -40,15 +40,15 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
 	material.cpp misc.cpp movegen.cpp movepick.cpp pawns.cpp position.cpp psqt.cpp \
 	search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
 	eval/evaluate_mir_inv_tools.cpp \
-	eval/nnue/evaluate_nnue.cpp \
-	eval/nnue/evaluate_nnue_learner.cpp \
-	eval/nnue/features/half_kp.cpp \
-	eval/nnue/features/half_relative_kp.cpp \
-	eval/nnue/features/k.cpp \
-	eval/nnue/features/p.cpp \
-	eval/nnue/features/castling_right.cpp \
-	eval/nnue/features/enpassant.cpp \
-	eval/nnue/nnue_test_command.cpp \
+	nnue/evaluate_nnue.cpp \
+	nnue/evaluate_nnue_learner.cpp \
+	nnue/features/half_kp.cpp \
+	nnue/features/half_relative_kp.cpp \
+	nnue/features/k.cpp \
+	nnue/features/p.cpp \
+	nnue/features/castling_right.cpp \
+	nnue/features/enpassant.cpp \
+	nnue/nnue_test_command.cpp \
 	extra/sfen_packer.cpp \
 	learn/gensfen2019.cpp \
 	learn/learner.cpp \
@@ -635,12 +635,12 @@ clean: objclean profileclean
 
 # clean binaries and objects
 objclean:
-	@rm -f $(EXE) *.o ./syzygy/*.o ./learn/*.o ./extra/*.o ./eval/*.o ./eval/nnue/*.o ./eval/nnue/features/*.o
+	@rm -f $(EXE) *.o ./syzygy/*.o ./learn/*.o ./extra/*.o ./eval/*.o ./nnue/*.o ./nnue/features/*.o
 
 # clean auxiliary profiling files
 profileclean:
 	@rm -rf profdir
-	@rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./learn/*.gcda ./extra/*.gcda ./eval/*.gcda ./eval/nnue/*.gcda ./eval/nnue/features/*.gcda
+	@rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./learn/*.gcda ./extra/*.gcda ./eval/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda
 	@rm -f stockfish.profdata *.profraw
 
 default:

From 2c9075e91966895390bf1e5818a0498ad4e8af7a Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sat, 8 Aug 2020 16:05:05 +0900
Subject: [PATCH 136/583] Fixed Makefile to fix build.

---
 src/Makefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 79ce61a3..85f33b0f 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -57,7 +57,7 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
 
 OBJS = $(notdir $(SRCS:.cpp=.o))
 
-VPATH = syzygy:nnue:nnue/features
+VPATH = syzygy:nnue:nnue/features:eval:extra:learn
 
 ### Establish the operating system name
 KERNEL = $(shell uname -s)
@@ -745,8 +745,8 @@ icc-profile-use:
 	EXTRACXXFLAGS='-prof_use -prof_dir ./profdir' \
 	all
 
-nnue-learn: config-sanity
-	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DEVAL_NNUE -DUSE_EVAL_HASH -DENABLE_TEST_CMD -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp' LDFLAGS='$(LDFLAGS) -lopenblas -fopenmp' build
+learn: config-sanity
+	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp' LDFLAGS='$(LDFLAGS) -lopenblas -fopenmp' build
 
 .depend:
 	-@$(CXX) $(DEPENDFLAGS) -MM $(SRCS) > $@ 2> /dev/null

From fa649ba1e2b2f6e47a6ece33448f64150db8563a Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sat, 8 Aug 2020 16:17:55 +0900
Subject: [PATCH 137/583] Removed a compile warning.

---
 src/misc.h | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/src/misc.h b/src/misc.h
index e51d5f3f..fe5990e0 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -158,13 +158,6 @@ extern void sleep(int ms);
 // Returns a string that represents the current time. (Used for log output when learning evaluation function)
 std::string now_string();
 
-// wrapper for end processing on the way
-static void my_exit()
-{
-	sleep(3000); // It is bad to finish before the error message is output, so put wait.
-	exit(EXIT_FAILURE);
-}
-
 // When compiled with gcc/clang such as msys2, Windows Subsystem for Linux,
 // In C++ std::ifstream, ::read() is a wrapper for that because it is not possible to read and write files larger than 2GB in one shot.
 //

From 2395833c07a40a3cc83e211025fcb05a3014c9ea Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sat, 8 Aug 2020 16:52:18 +0900
Subject: [PATCH 138/583] Re-added commands for training data generator and
 trainer.

---
 src/uci.cpp       | 97 ++++++++++++++++++++++++++++++++++++++++++++++-
 src/ucioption.cpp | 16 ++++++++
 2 files changed, 112 insertions(+), 1 deletion(-)

diff --git a/src/uci.cpp b/src/uci.cpp
index 24073369..b0d7b6f9 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -32,6 +32,10 @@
 #include "uci.h"
 #include "syzygy/tbprobe.h"
 
+#if defined(EVAL_NNUE) && defined(ENABLE_TEST_CMD)
+#include "nnue/nnue_test_command.h"
+#endif
+
 using namespace std;
 
 extern vector<string> setup_bench(const Position&, istream&);
@@ -39,8 +43,44 @@ extern vector<string> setup_bench(const Position&, istream&);
 // FEN string of the initial position, normal chess
 const char* StartFEN = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1";
 
-namespace {
+// Command to automatically generate a game record
+#if defined (EVAL_LEARN)
+namespace Learner
+{
+  // Automatic generation of teacher position
+  void gen_sfen(Position& pos, istringstream& is);
 
+  // Learning from the generated game record
+  void learn(Position& pos, istringstream& is);
+
+#if defined(GENSFEN2019)
+  // Automatic generation command of teacher phase under development
+  void gen_sfen2019(Position& pos, istringstream& is);
+#endif
+
+  // A pair of reader and evaluation value. Returned by Learner::search(),Learner::qsearch().
+  typedef std::pair<Value, std::vector<Move> > ValueAndPV;
+
+  ValueAndPV qsearch(Position& pos);
+  ValueAndPV search(Position& pos, int depth_, size_t multiPV = 1, uint64_t nodesLimit = 0);
+
+}
+#endif
+
+#if defined(EVAL_NNUE) && defined(ENABLE_TEST_CMD)
+void test_cmd(Position& pos, istringstream& is)
+{
+    // Initialize as it may be searched.
+    Eval::init_NNUE();
+
+    std::string param;
+    is >> param;
+
+    if (param == "nnue") Eval::NNUE::TestCommand(pos, is);
+}
+#endif
+
+namespace {
   // position() is called when engine receives the "position" UCI command.
   // The function sets up the position described in the given FEN string ("fen")
   // or the starting position ("startpos") and then makes the moves given in the
@@ -218,6 +258,43 @@ namespace {
 
 } // namespace
 
+// --------------------
+// Call qsearch(),search() directly for testing
+// --------------------
+
+#if defined(EVAL_LEARN)
+void qsearch_cmd(Position& pos)
+{
+  cout << "qsearch : ";
+  auto pv = Learner::qsearch(pos);
+  cout << "Value = " << pv.first << " , " << UCI::value(pv.first) << " , PV = ";
+  for (auto m : pv.second)
+    cout << UCI::move(m, false) << " ";
+  cout << endl;
+}
+
+void search_cmd(Position& pos, istringstream& is)
+{
+  string token;
+  int depth = 1;
+  int multi_pv = (int)Options["MultiPV"];
+  while (is >> token)
+  {
+    if (token == "depth")
+      is >> depth;
+    if (token == "multipv")
+      is >> multi_pv;
+  }
+
+  cout << "search depth = " << depth << " , multi_pv = " << multi_pv << " : ";
+  auto pv = Learner::search(pos, depth, multi_pv);
+  cout << "Value = " << pv.first << " , " << UCI::value(pv.first) << " , PV = ";
+  for (auto m : pv.second)
+    cout << UCI::move(m, false) << " ";
+  cout << endl;
+}
+
+#endif
 
 /// UCI::loop() waits for a command from stdin, parses it and calls the appropriate
 /// function. Also intercepts EOF from stdin to ensure gracefully exiting if the
@@ -274,6 +351,24 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "d")        sync_cout << pos << sync_endl;
       else if (token == "eval")     trace_eval(pos);
       else if (token == "compiler") sync_cout << compiler_info() << sync_endl;
+#if defined (EVAL_LEARN)
+      else if (token == "gensfen") Learner::gen_sfen(pos, is);
+      else if (token == "learn") Learner::learn(pos, is);
+
+#if defined (GENSFEN2019)
+	  // Command to generate teacher phase under development
+      else if (token == "gensfen2019") Learner::gen_sfen2019(pos, is);
+#endif
+      // Command to call qsearch(),search() directly for testing
+      else if (token == "qsearch") qsearch_cmd(pos);
+      else if (token == "search") search_cmd(pos, is);
+
+#endif
+
+#if defined(EVAL_NNUE) && defined(ENABLE_TEST_CMD)
+      // test command
+      else if (token == "test") test_cmd(pos, is);
+#endif
       else
           sync_cout << "Unknown command: " << cmd << sync_endl;
 
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index faeb78ae..168e73a9 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -80,6 +80,22 @@ void init(OptionsMap& o) {
   o["SyzygyProbeLimit"]      << Option(7, 0, 7);
   o["Use NNUE"]              << Option(false, on_use_NNUE);
   o["EvalFile"]              << Option("nn-9931db908a9b.nnue", on_eval_file);
+#ifdef EVAL_NNUE
+  // When the evaluation function is loaded at the ucinewgame timing, it is necessary to convert the new evaluation function.
+  // I want to hit the test eval convert command, but there is no new evaluation function
+  // It ends abnormally before executing this command.
+  // Therefore, with this hidden option, you can suppress the loading of the evaluation function when ucinewgame,
+  // Hit the test eval convert command.
+  o["SkipLoadingEval"]       << Option(false);
+  // how many moves to use a fixed move
+  // o["BookMoves"] << Option(16, 0, 10000);
+#endif
+#if defined(EVAL_LEARN)
+  // When learning the evaluation function, you can change the folder to save the evaluation function.
+  // Evalsave by default. This folder shall be prepared in advance.
+  // Automatically dig a folder under this folder like "0/", "1/", ... and save the evaluation function file there.
+  o["EvalSaveDir"] << Option("evalsave");
+#endif
 }
 
 
From ed4d007e3cfb012671e09723c160dbc2e3d98d54 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sat, 8 Aug 2020 18:21:38 +0900
Subject: [PATCH 139/583] Fixed a bug that the training data generator crahses
 on memory allocation.

---
 src/misc.h                 | 2 +-
 src/nnue/trainer/trainer.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/misc.h b/src/misc.h
index fe5990e0..9ea57fa8 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -271,7 +271,7 @@ public:
 
   template <typename U> AlignedAllocator(const AlignedAllocator<U>&) {}
 
-  T* allocate(std::size_t n) { return (T*)std_aligned_alloc(n * sizeof(T), alignof(T)); }
+  T* allocate(std::size_t n) { return (T*)std_aligned_alloc(alignof(T), n * sizeof(T)); }
   void deallocate(T* p, std::size_t n) { std_aligned_free(p); }
 };
 
diff --git a/src/nnue/trainer/trainer.h b/src/nnue/trainer/trainer.h
index b42cb4fa..4b467041 100644
--- a/src/nnue/trainer/trainer.h
+++ b/src/nnue/trainer/trainer.h
@@ -111,7 +111,7 @@ IntType Round(double value) {
 // make_shared with alignment
 template <typename T, typename... ArgumentTypes>
 std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments) {
-  const auto ptr = new(std_aligned_alloc(sizeof(T), alignof(T)))
+  const auto ptr = new(std_aligned_alloc(alignof(T), sizeof(T)))
       T(std::forward<ArgumentTypes>(arguments)...);
   return std::shared_ptr<T>(ptr, AlignedDeleter<T>());
 }

From 70d88364fe1f0d55474313ad8b2d5a2b7b28e18f Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sat, 8 Aug 2020 18:22:29 +0900
Subject: [PATCH 140/583] Fixed a bug that the training data generation
 crashes.

---
 src/learn/learner.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index eaddbb8a..fea25503 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -635,10 +635,10 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 					// cout << pos;
 
 					auto v = Eval::evaluate(pos);
-					// evaluate() returns the evaluation value on the turn side, so
-					// If it's a turn different from root_color, you must invert v and return it.
-					if (rootColor != pos.side_to_move())
-						v = -v;
+						// evaluate() returns the evaluation value on the turn side, so
+						// If it's a turn different from root_color, you must invert v and return it.
+						if (rootColor != pos.side_to_move())
+							v = -v;
 
 					// Rewind.
 					// Is it C++x14, and isn't there even foreach to turn in reverse?
@@ -979,7 +979,7 @@ void gen_sfen(Position&, istringstream& is)
 		<< "  loop_max = " << loop_max << endl
 		<< "  eval_limit = " << eval_limit << endl
 		<< "  thread_num (set by USI setoption) = " << thread_num << endl
-		<< "  book_moves (set by USI setoption) = " << Options["BookMoves"] << endl
+		//<< "  book_moves (set by USI setoption) = " << Options["BookMoves"] << endl
 		<< "  random_move_minply     = " << random_move_minply << endl
 		<< "  random_move_maxply     = " << random_move_maxply << endl
 		<< "  random_move_count      = " << random_move_count << endl

From b0d28ac3abcac8db4708055b380edc49f63637aa Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sat, 8 Aug 2020 18:23:11 +0900
Subject: [PATCH 141/583] Fixed a bug that the training data generation crashes
 if eval_limit is high.

---
 src/learn/learner.cpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index fea25503..85da2ac3 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -634,11 +634,19 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 					// reach leaf
 					// cout << pos;
 
-					auto v = Eval::evaluate(pos);
+					Value v;
+					if (pos.checkers()) {
+						// HACK: If a king is checkmated, Eval::evalute(pos) crashes with an
+						// assertion.  To avoid crashes, return value1 instead.
+						v = value1;
+					}
+					else {
+						v = Eval::evaluate(pos);
 						// evaluate() returns the evaluation value on the turn side, so
 						// If it's a turn different from root_color, you must invert v and return it.
 						if (rootColor != pos.side_to_move())
 							v = -v;
+					}
 
 					// Rewind.
 					// Is it C++x14, and isn't there even foreach to turn in reverse?

From 9a0b20d3fc95e66157db121232d9a025ec86f385 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sat, 8 Aug 2020 18:24:09 +0900
Subject: [PATCH 142/583] Changed to show if NNUE is used in the training data
 generator.

---
 src/learn/learner.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 85da2ac3..b5821395 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -1002,6 +1002,9 @@ void gen_sfen(Position&, istringstream& is)
 		<< "  save_every             = " << save_every << endl
 		<< "  random_file_name       = " << random_file_name << endl;
 
+	// Show if the training data generator uses NNUE.
+	Eval::verify_NNUE();
+
 	// Create and execute threads as many as Options["Threads"].
 	{
 		SfenWriter sw(output_file_name, thread_num);

From 4f94f29f390c958dc8638cfbf9b4f7d75957fcc6 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sat, 8 Aug 2020 18:38:02 +0900
Subject: [PATCH 143/583] Revert "Fixed a bug that the training data generation
 crashes if eval_limit is high."

This reverts commit b0d28ac3abcac8db4708055b380edc49f63637aa.
---
 src/learn/learner.cpp | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index b5821395..afc82447 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -634,19 +634,11 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 					// reach leaf
 					// cout << pos;
 
-					Value v;
-					if (pos.checkers()) {
-						// HACK: If a king is checkmated, Eval::evalute(pos) crashes with an
-						// assertion.  To avoid crashes, return value1 instead.
-						v = value1;
-					}
-					else {
-						v = Eval::evaluate(pos);
+					auto v = Eval::evaluate(pos);
 						// evaluate() returns the evaluation value on the turn side, so
 						// If it's a turn different from root_color, you must invert v and return it.
 						if (rootColor != pos.side_to_move())
 							v = -v;
-					}
 
 					// Rewind.
 					// Is it C++x14, and isn't there even foreach to turn in reverse?

From 22b85810fe42b6d198ff8732c97c0ed84c3784b6 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sat, 8 Aug 2020 19:04:08 +0900
Subject: [PATCH 144/583] Re-added the code to skip loading a net file.

---
 src/nnue/evaluate_nnue.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index 66e5ff57..a28a4573 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -184,6 +184,13 @@ namespace Eval::NNUE {
   bool load_eval_file(const std::string& evalFile) {
 
     Initialize();
+
+    if (Options["SkipLoadingEval"])
+    {
+      std::cout << "info string SkipLoadingEval set to true, Net not loaded!" << std::endl;
+      return true;
+    }
+
     fileName = evalFile;
 
     std::ifstream stream(evalFile, std::ios::binary);

From fcd70a3c814a75484bbe57c066b8dc992a9e659e Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sat, 8 Aug 2020 21:00:19 +0900
Subject: [PATCH 145/583] Updated README.md.

Bench: 4067325
---
 README.md | 8 +++++---
 Readme.md | 8 +++++---
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 73eec1fb..7a237480 100644
--- a/README.md
+++ b/README.md
@@ -9,9 +9,10 @@ Stockfish NNUE is a port of a shogi neural network named NNUE (efficiently updat
 
 ## Training Guide
 ### Generating Training Data
-Use the "no-nnue.nnue-gen-sfen-from-original-eval" binary. The given example is generation in its simplest form. There are more commands. 
+To generate training data from the classic eval, use gensfen command with setting "Use NNUE" to "false". The given example is generation in its simplest form. There are more commands. 
 ```
 uci
+setoption name Use NNUE value false
 setoption name Threads value x
 setoption name Hash value y
 setoption name SyzygyPath value path
@@ -27,10 +28,11 @@ This will save a file named "generated_kifu.bin" in the same folder as the binar
 ### Generating Validation Data
 The process is the same as the generation of training data, except for the fact that you need to set loop to 1 million, because you don't need a lot of validation data. The depth should be the same as before or slightly higher than the depth of the training data. After generation rename the validation data file to val.bin and drop it in a folder named "validationdata" in the same directory to make it easier. 
 ### Training a Completely New Network
-Use the "avx2.halfkp_256x2-32-32.nnue-learn.2020-07-11" binary. Create an empty folder named "evalsave" in the same directory as the binaries.
+Use the "learn" binary. Create an empty folder named "evalsave" in the same directory as the binaries.
 ```
 uci
 setoption name SkipLoadingEval value true
+setoption name Use NNUE value true
 setoption name Threads value x
 isready
 learn targetdir trainingdata loop 100 batchsize 1000000 use_draw_in_training 1 use_draw_in_validation 1 eta 1 lambda 1 eval_limit 32000 nn_batch_size 1000 newbob_decay 0.5 eval_save_interval 250000000 loss_output_interval 1000000 mirror_percentage 50 validation_set_file_name validationdata\val.bin
@@ -42,7 +44,7 @@ Nets get saved in the "evalsave" folder.
 - lambda is the amount of weight it puts to eval of learning data vs win/draw/loss results. 1 puts all weight on eval, lambda 0 puts all weight on WDL results.
 
 ### Reinforcement Learning
-If you would like to do some reinforcement learning on your original network, you must first generate training data using the learn binaries. Make sure that your previously trained network is in the eval folder. Use the commands specified above. Make sure `SkipLoadingEval` is set to false so that the data generated is using the neural net's eval by typing the command `uci setoption name SkipLoadingEval value false` before typing the `isready` command. You should aim to generate less positions than the first run, around 1/10 of the number of positions generated in the first run. The depth should be higher as well. You should also do the same for validation data, with the depth being higher than the last run.
+If you would like to do some reinforcement learning on your original network, you must first generate training data using the learn binaries with setting `Use NNUE` to true. Make sure that your previously trained network is in the eval folder. Use the commands specified above. Make sure `SkipLoadingEval` is set to false so that the data generated is using the neural net's eval by typing the command `uci setoption name SkipLoadingEval value false` before typing the `isready` command. You should aim to generate less positions than the first run, around 1/10 of the number of positions generated in the first run. The depth should be higher as well. You should also do the same for validation data, with the depth being higher than the last run.
 
 After you have generated the training data, you must move it into your training data folder and delete the older data so that the binary does not accidentally train on the same data again. Do the same for the validation data and name it to val-1.bin to make it less confusing. Make sure the evalsave folder is empty. Then, using the same binary, type in the training commands shown above. Do __NOT__ set `SkipLoadingEval` to true, it must be false or you will get a completely new network, instead of a network trained with reinforcement learning. You should also set eval_save_interval to a number that is lower than the amount of positions in your training data, perhaps also 1/10 of the original value. The validation file should be set to the new validation data, not the old data.
 
diff --git a/Readme.md b/Readme.md
index 73eec1fb..7a237480 100644
--- a/Readme.md
+++ b/Readme.md
@@ -9,9 +9,10 @@ Stockfish NNUE is a port of a shogi neural network named NNUE (efficiently updat
 
 ## Training Guide
 ### Generating Training Data
-Use the "no-nnue.nnue-gen-sfen-from-original-eval" binary. The given example is generation in its simplest form. There are more commands. 
+To generate training data from the classic eval, use gensfen command with setting "Use NNUE" to "false". The given example is generation in its simplest form. There are more commands. 
 ```
 uci
+setoption name Use NNUE value false
 setoption name Threads value x
 setoption name Hash value y
 setoption name SyzygyPath value path
@@ -27,10 +28,11 @@ This will save a file named "generated_kifu.bin" in the same folder as the binar
 ### Generating Validation Data
 The process is the same as the generation of training data, except for the fact that you need to set loop to 1 million, because you don't need a lot of validation data. The depth should be the same as before or slightly higher than the depth of the training data. After generation rename the validation data file to val.bin and drop it in a folder named "validationdata" in the same directory to make it easier. 
 ### Training a Completely New Network
-Use the "avx2.halfkp_256x2-32-32.nnue-learn.2020-07-11" binary. Create an empty folder named "evalsave" in the same directory as the binaries.
+Use the "learn" binary. Create an empty folder named "evalsave" in the same directory as the binaries.
 ```
 uci
 setoption name SkipLoadingEval value true
+setoption name Use NNUE value true
 setoption name Threads value x
 isready
 learn targetdir trainingdata loop 100 batchsize 1000000 use_draw_in_training 1 use_draw_in_validation 1 eta 1 lambda 1 eval_limit 32000 nn_batch_size 1000 newbob_decay 0.5 eval_save_interval 250000000 loss_output_interval 1000000 mirror_percentage 50 validation_set_file_name validationdata\val.bin
@@ -42,7 +44,7 @@ Nets get saved in the "evalsave" folder.
 - lambda is the amount of weight it puts to eval of learning data vs win/draw/loss results. 1 puts all weight on eval, lambda 0 puts all weight on WDL results.
 
 ### Reinforcement Learning
-If you would like to do some reinforcement learning on your original network, you must first generate training data using the learn binaries. Make sure that your previously trained network is in the eval folder. Use the commands specified above. Make sure `SkipLoadingEval` is set to false so that the data generated is using the neural net's eval by typing the command `uci setoption name SkipLoadingEval value false` before typing the `isready` command. You should aim to generate less positions than the first run, around 1/10 of the number of positions generated in the first run. The depth should be higher as well. You should also do the same for validation data, with the depth being higher than the last run.
+If you would like to do some reinforcement learning on your original network, you must first generate training data using the learn binaries with setting `Use NNUE` to true. Make sure that your previously trained network is in the eval folder. Use the commands specified above. Make sure `SkipLoadingEval` is set to false so that the data generated is using the neural net's eval by typing the command `uci setoption name SkipLoadingEval value false` before typing the `isready` command. You should aim to generate less positions than the first run, around 1/10 of the number of positions generated in the first run. The depth should be higher as well. You should also do the same for validation data, with the depth being higher than the last run.
 
 After you have generated the training data, you must move it into your training data folder and delete the older data so that the binary does not accidentally train on the same data again. Do the same for the validation data and name it to val-1.bin to make it less confusing. Make sure the evalsave folder is empty. Then, using the same binary, type in the training commands shown above. Do __NOT__ set `SkipLoadingEval` to true, it must be false or you will get a completely new network, instead of a network trained with reinforcement learning. You should also set eval_save_interval to a number that is lower than the amount of positions in your training data, perhaps also 1/10 of the original value. The validation file should be set to the new validation data, not the old data.
 

From 4f97d3446de7eb8e5f3d52558dc66ceb54ef8151 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Mon, 10 Aug 2020 08:52:34 +0900
Subject: [PATCH 146/583] Cleaned up source code.

---
 src/Makefile                       |  2 +-
 src/learn/multi_think.h            |  1 +
 src/main.cpp                       | 20 --------------------
 src/misc.h                         |  5 -----
 src/nnue/features/feature_set.h    |  4 ++--
 src/nnue/layers/affine_transform.h |  2 +-
 src/nnue/layers/clipped_relu.h     |  2 +-
 src/position.h                     |  4 ++--
 src/search.cpp                     | 12 ++++++++----
 src/search.h                       |  4 ++++
 src/tt.cpp                         |  4 ----
 src/uci.cpp                        |  3 ++-
 12 files changed, 22 insertions(+), 41 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 2097fecc..59fb90c5 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -442,7 +442,7 @@ endif
 ### 3.6 popcnt
 ifeq ($(popcnt),yes)
 	ifeq ($(arch),$(filter $(arch),ppc64 armv8-a arm64))
-	CXXFLAGS += -DUSE_POPCNT
+		CXXFLAGS += -DUSE_POPCNT
 	else ifeq ($(comp),icc)
 		CXXFLAGS += -msse3 -DUSE_POPCNT
 	else
diff --git a/src/learn/multi_think.h b/src/learn/multi_think.h
index 55edb049..c22b7e8d 100644
--- a/src/learn/multi_think.h
+++ b/src/learn/multi_think.h
@@ -4,6 +4,7 @@
 #if defined(EVAL_LEARN)
 
 #include <functional>
+#include <mutex>
 
 #include "../misc.h"
 #include "../learn/learn.h"
diff --git a/src/main.cpp b/src/main.cpp
index e8324186..fbad6622 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -18,15 +18,6 @@
 
 #include <iostream>
 
-#ifdef _WIN32
-#include <filesystem>
-
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif
-#include <Windows.h>
-#endif
-
 #include "bitboard.h"
 #include "endgame.h"
 #include "position.h"
@@ -41,17 +32,6 @@ namespace PSQT {
 }
 
 int main(int argc, char* argv[]) {
-  // Change the current working directory to the binary directory.  So that a
-  // net file path can be specified with a relative path from the binary
-  // directory.
-  // TODO(someone): Implement the logic for other OS.
-#ifdef _WIN32
-  TCHAR filename[_MAX_PATH];
-  ::GetModuleFileName(NULL, filename, sizeof(filename) / sizeof(filename[0]));
-  std::filesystem::path current_path = filename;
-  current_path.remove_filename();
-  std::filesystem::current_path(current_path);
-#endif
 
   std::cout << engine_info() << std::endl;
 
diff --git a/src/misc.h b/src/misc.h
index 9ea57fa8..ecef028f 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -19,7 +19,6 @@
 #ifndef MISC_H_INCLUDED
 #define MISC_H_INCLUDED
 
-#include <algorithm>
 #include <cassert>
 #include <chrono>
 #include <functional>
@@ -27,12 +26,8 @@
 #include <ostream>
 #include <string>
 #include <vector>
-#ifndef _MSC_VER
-#include <mm_malloc.h>
-#endif
 
 #include "types.h"
-#include "thread_win32_osx.h"
 
 const std::string engine_info(bool to_uci = false);
 const std::string compiler_info();
diff --git a/src/nnue/features/feature_set.h b/src/nnue/features/feature_set.h
index c46da462..ec34a486 100644
--- a/src/nnue/features/feature_set.h
+++ b/src/nnue/features/feature_set.h
@@ -1,4 +1,4 @@
-﻿/*
+/*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
   Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
@@ -171,7 +171,7 @@ namespace Eval::NNUE::Features {
       }
     }
 
-    // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+    // Get a list of indices whose values have changed from the previous one in the feature quantity
     template <typename IndexListType>
     static void CollectChangedIndices(
       const Position& pos, const TriggerEvent trigger, const Color perspective,
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index cdf26cb8..057de8e1 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -67,7 +67,7 @@ namespace Eval::NNUE::Layers {
         PreviousLayer::GetStructureString() + ")";
     }
     
-    // Read network parameters
+   // Read network parameters
     bool ReadParameters(std::istream& stream) {
       if (!previous_layer_.ReadParameters(stream)) return false;
       stream.read(reinterpret_cast<char*>(biases_),
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h
index 8c648526..822e60b0 100644
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -193,7 +193,7 @@ namespace Eval::NNUE::Layers {
      // Make the learning class a friend
      friend class Trainer<ClippedReLU>;
      
-     PreviousLayer previous_layer_;
+    PreviousLayer previous_layer_;
   };
 
 }  // namespace Eval::NNUE::Layers
diff --git a/src/position.h b/src/position.h
index 535bf29e..b5dbaf59 100644
--- a/src/position.h
+++ b/src/position.h
@@ -1,4 +1,4 @@
-﻿/*
+/*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
   Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
@@ -182,7 +182,7 @@ public:
   // Do not include gamePly in pack.
   void sfen_pack(PackedSfen& sfen);
 
-  // ª It is slow to go through sfen, so I made a function to set packed sfen directly.
+  // It is slow to go through sfen, so I made a function to set packed sfen directly.
   // Equivalent to pos.set(sfen_unpack(data),si,th);.
   // If there is a problem with the passed phase and there is an error, non-zero is returned.
   // PackedSfen does not include gamePly so it cannot be restored. If you want to set it, specify it with an argument.
diff --git a/src/search.cpp b/src/search.cpp
index 7c666772..fe1771a3 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -992,7 +992,11 @@ moves_loop: // When in check, search starts from here
 
       ss->moveCount = ++moveCount;
 
-      if (rootNode && thisThread == Threads.main() && Time.elapsed() > 3000 && !Limits.silent)
+      if (rootNode && thisThread == Threads.main() && Time.elapsed() > 3000
+#if defined(EVAL_LEARN)
+          && !Limits.silent
+#endif
+          )
           sync_cout << "info depth " << depth
                     << " currmove " << UCI::move(move, pos.is_chess960())
                     << " currmovenumber " << moveCount + thisThread->pvIdx << sync_endl;
@@ -2066,10 +2070,10 @@ namespace Learner
       // Increase the generation of the substitution table for this thread because it is a new search.
             //TT.new_search(th->thread_id());
 
-            // ↑ If you call new_search here, it may be a loss because you can't use the previous search result.
+            // �� If you call new_search here, it may be a loss because you can't use the previous search result.
             // Do not do this here, but caller should do TT.new_search(th->thread_id()) for each station ...
 
-            // →Because we want to avoid reaching the same final diagram, use the substitution table commonly for all threads when generating teachers.
+            // ��Because we want to avoid reaching the same final diagram, use the substitution table commonly for all threads when generating teachers.
       //#endif
     }
   }
@@ -2259,7 +2263,7 @@ namespace Learner
     }
 
     // Pass PV_is(ok) to eliminate this PV, there may be NULL_MOVE in the middle.
-    // → PV should not be NULL_MOVE because it is PV
+    // �� PV should not be NULL_MOVE because it is PV
     // MOVE_WIN has never been thrust. (For now)
     for (Move move : rootMoves[0].pv)
     {
diff --git a/src/search.h b/src/search.h
index 0d4e5399..01d8a4c1 100644
--- a/src/search.h
+++ b/src/search.h
@@ -86,7 +86,9 @@ struct LimitsType {
     time[WHITE] = time[BLACK] = inc[WHITE] = inc[BLACK] = npmsec = movetime = TimePoint(0);
     movestogo = depth = mate = perft = infinite = 0;
     nodes = 0;
+#if defined (EVAL_LEARN)
     silent = false;
+#endif
   }
 
   bool use_time_management() const {
@@ -97,9 +99,11 @@ struct LimitsType {
   TimePoint time[COLOR_NB], inc[COLOR_NB], npmsec, movetime, startTime;
   int movestogo, depth, mate, perft, infinite;
   int64_t nodes;
+#if defined (EVAL_LEARN)
   // Silent mode that does not output to the screen (for continuous self-play in process)
   // Do not output PV at this time.
   bool silent;
+#endif
 };
 
 extern LimitsType Limits;
diff --git a/src/tt.cpp b/src/tt.cpp
index 54c7f6b9..d494c27d 100644
--- a/src/tt.cpp
+++ b/src/tt.cpp
@@ -114,9 +114,6 @@ void TranspositionTable::clear() {
 /// TTEntry t2 if its replace value is greater than that of t2.
 
 TTEntry* TranspositionTable::probe(const Key key, bool& found) const {
-#if defined(DISABLE_TT)
-  return found = false, first_entry(0);
-#else
 
   TTEntry* const tte = first_entry(key);
   const uint16_t key16 = (uint16_t)key;  // Use the low 16 bits as key inside the cluster
@@ -141,7 +138,6 @@ TTEntry* TranspositionTable::probe(const Key key, bool& found) const {
           replace = &tte[i];
 
   return found = false, replace;
-#endif
 }
 
 
diff --git a/src/uci.cpp b/src/uci.cpp
index b0d7b6f9..8972cec9 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -81,6 +81,7 @@ void test_cmd(Position& pos, istringstream& is)
 #endif
 
 namespace {
+
   // position() is called when engine receives the "position" UCI command.
   // The function sets up the position described in the given FEN string ("fen")
   // or the starting position ("startpos") and then makes the moves given in the
@@ -462,4 +463,4 @@ Move UCI::to_move(const Position& pos, string& str) {
           return m;
 
   return MOVE_NONE;
-}
\ No newline at end of file
+}

From 31d4f46f5ed239ad1d8dce462fbe2f26e91143b8 Mon Sep 17 00:00:00 2001
From: tttak <tttak@users.noreply.github.com>
Date: Mon, 10 Aug 2020 01:43:59 +0900
Subject: [PATCH 147/583] update convert_bin

learn convert_bin in.txt output_file_name out.bin check_illegal_move 1
convert in.txt ... done 16 parsed 4 is filtered (illegal fen:1, illegal move:2, illegal ply:1)
---
 src/learn/learner.cpp | 76 ++++++++++++++++++++++++++++++-------------
 1 file changed, 53 insertions(+), 23 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index afc82447..17737c6d 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -2433,11 +2433,14 @@ void shuffle_files_on_memory(const vector<string>& filenames,const string output
 	std::cout << "..shuffle_on_memory done." << std::endl;
 }
 
-void convert_bin(const vector<string>& filenames, const string& output_file_name, const int ply_minimum, const int ply_maximum, const int interpolate_eval)
+void convert_bin(const vector<string>& filenames, const string& output_file_name, const int ply_minimum, const int ply_maximum, const int interpolate_eval, const bool check_illegal_move)
 {
 	std::fstream fs;
 	uint64_t data_size=0;
 	uint64_t filtered_size = 0;
+	uint64_t filtered_size_fen = 0;
+	uint64_t filtered_size_move = 0;
+	uint64_t filtered_size_ply = 0;
 	auto th = Threads.main();
 	auto &tpos = th->rootPos;
 	// convert plain rag to packed sfenvalue for Yaneura king
@@ -2451,34 +2454,56 @@ void convert_bin(const vector<string>& filenames, const string& output_file_name
 		PackedSfenValue p;
 		data_size = 0;
 		filtered_size = 0;
+		filtered_size_fen = 0;
+		filtered_size_move = 0;
+		filtered_size_ply = 0;
 		p.gamePly = 1; // Not included in apery format. Should be initialized
-		bool ignore_flag = false;
+		bool ignore_flag_fen = false;
+		bool ignore_flag_move = false;
+		bool ignore_flag_ply = false;
 		while (std::getline(ifs, line)) {
 			std::stringstream ss(line);
 			std::string token;
 			std::string value;
 			ss >> token;
 			if (token == "fen") {
-			  states = StateListPtr(new std::deque<StateInfo>(1)); // Drop old and create a new one
-			  tpos.set(line.substr(4), false, &states->back(), Threads.main());
-			  tpos.sfen_pack(p.sfen);
+				states = StateListPtr(new std::deque<StateInfo>(1)); // Drop old and create a new one
+				std::string input_fen = line.substr(4);
+				tpos.set(input_fen, false, &states->back(), Threads.main());
+				if (!tpos.pos_is_ok() || tpos.fen() != input_fen) {
+					ignore_flag_fen = true;
+					filtered_size_fen++;
+				}
+				else {
+					tpos.sfen_pack(p.sfen);
+				}
 			}
 			else if (token == "move") {
 				ss >> value;
-				p.move = UCI::to_move(tpos, value);
+				Move move = UCI::to_move(tpos, value);
+				if (check_illegal_move && move == MOVE_NONE) {
+					ignore_flag_move = true;
+					filtered_size_move++;
+				}
+				else {
+					p.move = move;
+				}
 			}
 			else if (token == "score") {
-				ss >> p.score;
+				int32_t score;
+				ss >> score;
+				p.score = Math::clamp(score , -(int32_t)VALUE_MATE , (int32_t)VALUE_MATE);
 			}
 			else if (token == "ply") {
 				int temp;
 				ss >> temp;
 				if(temp < ply_minimum || temp > ply_maximum){
-				  ignore_flag = true;
+					ignore_flag_ply = true;
+					filtered_size_ply++;
 				}
 				p.gamePly = uint16_t(temp); // No cast here?
 				if (interpolate_eval != 0){
-				  p.score = min(3000, interpolate_eval * temp);
+					p.score = min(3000, interpolate_eval * temp);
 				}
 			}
 			else if (token == "result") {
@@ -2486,24 +2511,27 @@ void convert_bin(const vector<string>& filenames, const string& output_file_name
 				ss >> temp;
 				p.game_result = int8_t(temp); // Do you need a cast here?
 				if (interpolate_eval){
-				  p.score = p.score * p.game_result;
+					p.score = p.score * p.game_result;
 				}
 			}
 			else if (token == "e") {
-			  if(!ignore_flag){
-				fs.write((char*)&p, sizeof(PackedSfenValue));
-				data_size+=1;
-				// debug
-				// std::cout<<tpos<<std::endl;
-				// std::cout<<p.score<<","<<int(p.gamePly)<<","<<int(p.game_result)<<std::endl;
-			  }else{
-			    ignore_flag = false;
-			    filtered_size += 1;
-			  }
-				
+				if(!(ignore_flag_fen || ignore_flag_move || ignore_flag_ply)){
+					fs.write((char*)&p, sizeof(PackedSfenValue));
+					data_size+=1;
+					// debug
+					// std::cout<<tpos<<std::endl;
+					// std::cout<<p.score<<","<<int(p.gamePly)<<","<<int(p.game_result)<<std::endl;
+				}
+				else {
+					filtered_size++;
+				}
+				ignore_flag_fen = false;
+				ignore_flag_move = false;
+				ignore_flag_ply = false;
 			}
 		}
-		std::cout << "done" << data_size <<" parsed " << filtered_size<<" is filtered"<< std::endl;
+		std::cout << "done " << data_size << " parsed " << filtered_size << " is filtered"
+				  << " (illegal fen:" << filtered_size_fen << ", illegal move:" << filtered_size_move << ", illegal ply:" << filtered_size_ply << ")" << std::endl;
 		ifs.close();
 	}
 	std::cout << "all done" << std::endl;
@@ -2851,6 +2879,7 @@ void learn(Position&, istringstream& is)
 	int ply_minimum = 0;
 	int ply_maximum = 114514;
 	bool interpolate_eval = 0;
+	bool check_illegal_move = false;
 	// convert teacher in pgn-extract format to Yaneura King's bin
 	bool use_convert_bin_from_pgn_extract = false;
 	bool pgn_eval_side_to_move = false;
@@ -2987,6 +3016,7 @@ void learn(Position&, istringstream& is)
 		else if (option == "convert_plain") use_convert_plain = true;
 		else if (option == "convert_bin") use_convert_bin = true;
 		else if (option == "interpolate_eval") is >> interpolate_eval;
+		else if (option == "check_illegal_move") is >> check_illegal_move;
 		else if (option == "convert_bin_from_pgn-extract") use_convert_bin_from_pgn_extract = true;
 		else if (option == "pgn_eval_side_to_move") is >> pgn_eval_side_to_move;
 
@@ -3098,7 +3128,7 @@ void learn(Position&, istringstream& is)
 	{
 		Eval::init_NNUE();
 		cout << "convert_bin.." << endl;
-		convert_bin(filenames,output_file_name, ply_minimum, ply_maximum, interpolate_eval);
+		convert_bin(filenames,output_file_name, ply_minimum, ply_maximum, interpolate_eval, check_illegal_move);
 		return;
 		
 	}

From 643be3c6f98018032c70cf59db0c495c1235f510 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Mon, 10 Aug 2020 10:45:03 +0900
Subject: [PATCH 148/583] Changed not to use std::random_device().  Because it
 always returns the same integers on MingW. #68

---
 src/learn/learner.cpp | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 17737c6d..5120200f 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -15,6 +15,7 @@
 
 #if defined(EVAL_LEARN)
 
+#include <chrono>
 #include <filesystem>
 #include <random>
 #include <regex>
@@ -959,8 +960,8 @@ void gen_sfen(Position&, istringstream& is)
 	if (random_file_name)
 	{
 		// Give a random number to output_file_name at this point.
-    std::random_device seed_gen;
-    PRNG r(seed_gen());
+		// Do not use std::random_device().  Because it always the same integers on MinGW.
+		PRNG r(std::chrono::system_clock::now().time_since_epoch().count());
 		// Just in case, reassign the random numbers.
 		for(int i=0;i<10;++i)
 			r.rand(1);
@@ -1205,7 +1206,8 @@ double calc_grad(Value shallow, const PackedSfenValue& psv) {
 // Sfen reader
 struct SfenReader
 {
-	SfenReader(int thread_num) : prng((std::random_device())())
+	// Do not use std::random_device().  Because it always the same integers on MinGW.
+	SfenReader(int thread_num) : prng(std::chrono::system_clock::now().time_since_epoch().count())
 	{
 		packed_sfens.resize(thread_num);
 		total_read = 0;
@@ -2283,7 +2285,8 @@ void shuffle_files(const vector<string>& filenames , const string& output_file_n
 	uint64_t write_file_count = 0;
 
 	// random number to shuffle
-	PRNG prng((std::random_device())());
+	// Do not use std::random_device().  Because it always the same integers on MinGW.
+	PRNG prng(std::chrono::system_clock::now().time_since_epoch().count());
 
 	// generate the name of the temporary file
 	auto make_filename = [](uint64_t i)
@@ -2361,7 +2364,8 @@ void shuffle_files_quick(const vector<string>& filenames, const string& output_f
 	uint64_t read_sfen_count = 0;
 
 	// random number to shuffle
-	PRNG prng((std::random_device())());
+	// Do not use std::random_device().  Because it always the same integers on MinGW.
+	PRNG prng(std::chrono::system_clock::now().time_since_epoch().count());
 
 	// number of files
 	size_t file_count = filenames.size();
@@ -2419,7 +2423,8 @@ void shuffle_files_on_memory(const vector<string>& filenames,const string output
 	}
 
 	// shuffle from buf[0] to buf[size-1]
-	PRNG prng((std::random_device())());
+	// Do not use std::random_device().  Because it always the same integers on MinGW.
+	PRNG prng(std::chrono::system_clock::now().time_since_epoch().count());
 	uint64_t size = (uint64_t)buf.size();
 	std::cout << "shuffle buf.size() = " << size << std::endl;
 	for (uint64_t i = 0; i < size; ++i)

From e65c515d6b1ac3c1e8e7e9c35b8f6be6002a4c46 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Mon, 10 Aug 2020 12:09:21 +0900
Subject: [PATCH 149/583] Changed to specify the current tick as a random seed.
 #68

---
 src/learn/multi_think.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/learn/multi_think.h b/src/learn/multi_think.h
index c22b7e8d..6e6c695c 100644
--- a/src/learn/multi_think.h
+++ b/src/learn/multi_think.h
@@ -17,7 +17,7 @@
 // Derive and use this class.
 struct MultiThink
 {
-	MultiThink() : prng(21120903)
+	MultiThink() : prng(std::chrono::system_clock::now().time_since_epoch().count())
 	{
 		loop_count = 0;
 	}

From bac96aa04a8f91089f982d62452f5b9240451c03 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Mon, 10 Aug 2020 12:17:26 +0900
Subject: [PATCH 150/583] Changed to use TB in the training data generator. #67

---
 src/learn/learner.cpp | 50 ++++++++++++++++++++++++-------------------
 1 file changed, 28 insertions(+), 22 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 5120200f..4b76e444 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -23,6 +23,7 @@
 #include "learn.h"
 #include "multi_think.h"
 #include "../uci.h"
+#include "../syzygy/tbprobe.h"
 
 // evaluate header for learning
 #include "../eval/evaluate_common.h"
@@ -522,13 +523,18 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
         break;
       }
 
-			// Isn't all pieces stuck and stuck?
-			if (MoveList<LEGAL>(pos).size() == 0)
-			{
-        // (write up to the previous phase of this phase)
-        // Write the positions other than this position if checkmated.
-                if (pos.checkers()) // Mate
-                    flush_psv(-1);
+			// Initialize the Syzygy Ending Tablebase and sort the moves.
+			Search::RootMoves rootMoves;
+			for (const auto& m : MoveList<LEGAL>(pos))
+				rootMoves.emplace_back(m);
+			if (!rootMoves.empty())
+				Tablebases::rank_root_moves(pos, rootMoves);
+
+			// If there is no legal move, terminate the game if position
+			// is mate or a stalemate.
+			else {
+				if (pos.checkers()) // Mate
+					flush_psv(-1);
 				else if (use_draw_in_training_data_generation) {
 					flush_psv(0); // Stalemate
 				}
@@ -636,10 +642,10 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 					// cout << pos;
 
 					auto v = Eval::evaluate(pos);
-						// evaluate() returns the evaluation value on the turn side, so
-						// If it's a turn different from root_color, you must invert v and return it.
-						if (rootColor != pos.side_to_move())
-							v = -v;
+					// evaluate() returns the evaluation value on the turn side, so
+					// If it's a turn different from root_color, you must invert v and return it.
+					if (rootColor != pos.side_to_move())
+						v = -v;
 
 					// Rewind.
 					// Is it C++x14, and isn't there even foreach to turn in reverse?
@@ -2472,7 +2478,7 @@ void convert_bin(const vector<string>& filenames, const string& output_file_name
 			std::string value;
 			ss >> token;
 			if (token == "fen") {
-				states = StateListPtr(new std::deque<StateInfo>(1)); // Drop old and create a new one
+			  states = StateListPtr(new std::deque<StateInfo>(1)); // Drop old and create a new one
 				std::string input_fen = line.substr(4);
 				tpos.set(input_fen, false, &states->back(), Threads.main());
 				if (!tpos.pos_is_ok() || tpos.fen() != input_fen) {
@@ -2480,8 +2486,8 @@ void convert_bin(const vector<string>& filenames, const string& output_file_name
 					filtered_size_fen++;
 				}
 				else {
-					tpos.sfen_pack(p.sfen);
-				}
+			  tpos.sfen_pack(p.sfen);
+			}
 			}
 			else if (token == "move") {
 				ss >> value;
@@ -2508,7 +2514,7 @@ void convert_bin(const vector<string>& filenames, const string& output_file_name
 				}
 				p.gamePly = uint16_t(temp); // No cast here?
 				if (interpolate_eval != 0){
-					p.score = min(3000, interpolate_eval * temp);
+				  p.score = min(3000, interpolate_eval * temp);
 				}
 			}
 			else if (token == "result") {
@@ -2516,17 +2522,17 @@ void convert_bin(const vector<string>& filenames, const string& output_file_name
 				ss >> temp;
 				p.game_result = int8_t(temp); // Do you need a cast here?
 				if (interpolate_eval){
-					p.score = p.score * p.game_result;
+				  p.score = p.score * p.game_result;
 				}
 			}
 			else if (token == "e") {
 				if(!(ignore_flag_fen || ignore_flag_move || ignore_flag_ply)){
-					fs.write((char*)&p, sizeof(PackedSfenValue));
-					data_size+=1;
-					// debug
-					// std::cout<<tpos<<std::endl;
-					// std::cout<<p.score<<","<<int(p.gamePly)<<","<<int(p.game_result)<<std::endl;
-				}
+				fs.write((char*)&p, sizeof(PackedSfenValue));
+				data_size+=1;
+				// debug
+				// std::cout<<tpos<<std::endl;
+				// std::cout<<p.score<<","<<int(p.gamePly)<<","<<int(p.game_result)<<std::endl;
+			  }
 				else {
 					filtered_size++;
 				}

From 12c6c2f550dab77dcbc5381900f59a7f9694126b Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Mon, 10 Aug 2020 13:07:22 +0900
Subject: [PATCH 151/583] Chagned to use the search value instead of the value
 of the PV leaf to avoid crash by assertion.

---
 src/learn/learner.cpp | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 4b76e444..998c82eb 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -639,13 +639,21 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 					}
 
 					// reach leaf
-					// cout << pos;
-
-					auto v = Eval::evaluate(pos);
-					// evaluate() returns the evaluation value on the turn side, so
-					// If it's a turn different from root_color, you must invert v and return it.
-					if (rootColor != pos.side_to_move())
-						v = -v;
+					Value v;
+					if (pos.checkers()) {
+						// Sometime a king is checked.  An example is a case that a checkmate is
+						// found in the search.  If Eval::evaluate() is called whne a king is
+						// checked, classic eval crashes by an assertion.  To avoid crashes, return
+						// value1 instead of the score of the PV leaf.
+						v = value1;
+					}
+					else {
+						v = Eval::evaluate(pos);
+						// evaluate() returns the evaluation value on the turn side, so
+						// If it's a turn different from root_color, you must invert v and return it.
+						if (rootColor != pos.side_to_move())
+							v = -v;
+					}
 
 					// Rewind.
 					// Is it C++x14, and isn't there even foreach to turn in reverse?

From 4a87d7b78712c02b38cb1132b2a705dabd087106 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Mon, 10 Aug 2020 15:44:58 +0900
Subject: [PATCH 152/583] Added the use_game_draw_adjudication option.

---
 src/learn/learner.cpp | 51 +++++++++++++++++++++++++++++++++++++++----
 1 file changed, 47 insertions(+), 4 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 998c82eb..68bbc835 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -118,6 +118,7 @@ bool use_draw_in_training_data_generation = false;
 bool use_draw_in_training = false;
 bool use_draw_in_validation = false;
 bool use_hash_in_training = true;
+bool use_game_draw_adjudication = false;
 
 // -----------------------------------
 // write phase file
@@ -394,6 +395,12 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 	// end flag
 	bool quit = false;
 
+	// Variables for draw adjudication.
+	// Todo: Make this as an option.
+	int adj_draw_ply = 80; // start the adjudication when ply reaches this value
+	int adj_draw_cnt = 8;  // 4 move scores for each side have to be checked
+	int adj_draw_score = 0;  // move score in CP
+
 	// repeat until the specified number of times
 	while (!quit)
 	{
@@ -495,6 +502,9 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 		// When random_move_minply == -1, random moves are performed continuously, so use it at this time.
 		int random_move_c = 0;
 
+		// Save history of move scores for adjudication
+		vector<int> move_hist_scores;
+
 		// ply: steps from the initial stage
 		for (int ply = 0; ; ++ply)
 		{
@@ -541,6 +551,34 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 				break;
 			}
 
+			// Adjudicate game to a draw if the last 4 scores of each engine is 0.
+			if (use_game_draw_adjudication) {
+				if (ply >= adj_draw_ply) {
+					int draw_cnt = 0;
+					bool is_adj_draw = false;
+
+					for (vector<int>::reverse_iterator it = move_hist_scores.rbegin();
+						it != move_hist_scores.rend(); ++it) 
+					{
+						if (abs(*it) <= adj_draw_score)
+							draw_cnt++;
+						else
+							break;  // score should be successive
+
+						if (draw_cnt >= adj_draw_cnt) {
+							is_adj_draw = true;
+							break;
+						}
+					}
+
+					if (is_adj_draw) {
+						if (use_draw_in_training_data_generation)
+							flush_psv(0);
+						break;
+					}
+				}
+			}
+
 			//// constant track
 			//if ((m = book.probe(pos)) != MOVE_NONE)
 			//{
@@ -597,6 +635,9 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 					break;
 				}
 
+				// Save the move score for adjudication.
+				move_hist_scores.push_back(value1);
+
 				// Processing according to each thousand-day hand.
 
         if (pos.is_draw(0)) {
@@ -649,10 +690,10 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 					}
 					else {
 						v = Eval::evaluate(pos);
-						// evaluate() returns the evaluation value on the turn side, so
-						// If it's a turn different from root_color, you must invert v and return it.
-						if (rootColor != pos.side_to_move())
-							v = -v;
+					// evaluate() returns the evaluation value on the turn side, so
+					// If it's a turn different from root_color, you must invert v and return it.
+					if (rootColor != pos.side_to_move())
+						v = -v;
 					}
 
 					// Rewind.
@@ -955,6 +996,8 @@ void gen_sfen(Position&, istringstream& is)
 			is >> random_file_name;
 		else if (token == "use_draw_in_training_data_generation")
 			is >> use_draw_in_training_data_generation;
+		else if (token == "use_game_draw_adjudication")
+			is >> use_game_draw_adjudication;
 		else
 			cout << "Error! : Illegal token " << token << endl;
 	}

From 3bd3ef0aeabb59ffa0584a95369b79fdf85822be Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Mon, 10 Aug 2020 15:47:11 +0900
Subject: [PATCH 153/583] Implemented the code to detect draw by insufficient
 mating material.

---
 src/learn/learner.cpp | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 68bbc835..b81009c7 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -579,6 +579,48 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 				}
 			}
 
+			// Draw by insufficient mating material
+			if (pos.count<ALL_PIECES>() <= 4) {
+				int pcnt = pos.count<ALL_PIECES>();
+				// (1) KvK
+				if (pcnt == 2) {
+					if (use_draw_in_training_data_generation)
+						flush_psv(0);
+					break;
+				}
+				// (2) KvK + 1 minor piece
+				if (pcnt == 3) {
+					int minor_pc = pos.count<BISHOP>(WHITE) + pos.count<KNIGHT>(WHITE) +
+						pos.count<BISHOP>(BLACK) + pos.count<KNIGHT>(BLACK);
+					if (minor_pc == 1) {
+						if (use_draw_in_training_data_generation)
+							flush_psv(0);
+						break;
+					}
+				}
+				// (3) KBvKB, bishops of the same color
+				else if (pcnt == 4) {
+					if (pos.count<BISHOP>(WHITE) == 1 && pos.count<BISHOP>(BLACK) == 1) {
+						// Color of bishops is black.
+						if ((pos.pieces(WHITE, BISHOP) & DarkSquares)
+							&& (pos.pieces(BLACK, BISHOP) & DarkSquares))
+						{
+							if (use_draw_in_training_data_generation)
+								flush_psv(0);
+							break;
+						}
+						// Color of bishops is white.
+						if ((pos.pieces(WHITE, BISHOP) & ~DarkSquares)
+							&& (pos.pieces(BLACK, BISHOP) & ~DarkSquares))
+						{
+							if (use_draw_in_training_data_generation)
+								flush_psv(0);
+							break;
+						}
+					}
+				}
+			}
+
 			//// constant track
 			//if ((m = book.probe(pos)) != MOVE_NONE)
 			//{

From a41cbb9ca9a13f7f851e0235ed04364954864f4e Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Mon, 10 Aug 2020 15:49:24 +0900
Subject: [PATCH 154/583] Renamed use_draw_in_training_data_generation option
 to write_out_draw_game_in_training_data_generation.

---
 src/learn/learner.cpp | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index b81009c7..2c46b374 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -114,7 +114,7 @@ namespace Learner
 // Phase array: PSVector stands for packed sfen vector.
 typedef std::vector<PackedSfenValue> PSVector;
 
-bool use_draw_in_training_data_generation = false;
+bool write_out_draw_game_in_training_data_generation = false;
 bool use_draw_in_training = false;
 bool use_draw_in_validation = false;
 bool use_hash_in_training = true;
@@ -517,7 +517,7 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 			// has it reached the length
 			if (ply >= MAX_PLY2)
 			{
-				if (use_draw_in_training_data_generation) {
+				if (write_out_draw_game_in_training_data_generation) {
 				// Write out as win/loss = draw.
 				// This way it is harder to allow the opponent to enter the ball when I enter (may)
 				flush_psv(0);
@@ -526,7 +526,7 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 			}
 
       if (pos.is_draw(ply)) {
-		  if (use_draw_in_training_data_generation) {
+		  if (write_out_draw_game_in_training_data_generation) {
 			  // Write if draw.
 			  flush_psv(0);
 		  }
@@ -545,7 +545,7 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 			else {
 				if (pos.checkers()) // Mate
 					flush_psv(-1);
-				else if (use_draw_in_training_data_generation) {
+				else if (write_out_draw_game_in_training_data_generation) {
 					flush_psv(0); // Stalemate
 				}
 				break;
@@ -572,7 +572,7 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 					}
 
 					if (is_adj_draw) {
-						if (use_draw_in_training_data_generation)
+						if (write_out_draw_game_in_training_data_generation)
 							flush_psv(0);
 						break;
 					}
@@ -584,7 +584,7 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 				int pcnt = pos.count<ALL_PIECES>();
 				// (1) KvK
 				if (pcnt == 2) {
-					if (use_draw_in_training_data_generation)
+					if (write_out_draw_game_in_training_data_generation)
 						flush_psv(0);
 					break;
 				}
@@ -593,7 +593,7 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 					int minor_pc = pos.count<BISHOP>(WHITE) + pos.count<KNIGHT>(WHITE) +
 						pos.count<BISHOP>(BLACK) + pos.count<KNIGHT>(BLACK);
 					if (minor_pc == 1) {
-						if (use_draw_in_training_data_generation)
+						if (write_out_draw_game_in_training_data_generation)
 							flush_psv(0);
 						break;
 					}
@@ -605,7 +605,7 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 						if ((pos.pieces(WHITE, BISHOP) & DarkSquares)
 							&& (pos.pieces(BLACK, BISHOP) & DarkSquares))
 						{
-							if (use_draw_in_training_data_generation)
+							if (write_out_draw_game_in_training_data_generation)
 								flush_psv(0);
 							break;
 						}
@@ -613,7 +613,7 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 						if ((pos.pieces(WHITE, BISHOP) & ~DarkSquares)
 							&& (pos.pieces(BLACK, BISHOP) & ~DarkSquares))
 						{
-							if (use_draw_in_training_data_generation)
+							if (write_out_draw_game_in_training_data_generation)
 								flush_psv(0);
 							break;
 						}
@@ -683,7 +683,7 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 				// Processing according to each thousand-day hand.
 
         if (pos.is_draw(0)) {
-			if (use_draw_in_training_data_generation) {
+			if (write_out_draw_game_in_training_data_generation) {
 				// Write if draw.
 				flush_psv(0);
 			}
@@ -1036,8 +1036,8 @@ void gen_sfen(Position&, istringstream& is)
 			is >> save_every;
 		else if (token == "random_file_name")
 			is >> random_file_name;
-		else if (token == "use_draw_in_training_data_generation")
-			is >> use_draw_in_training_data_generation;
+		else if (token == "use_draw_in_training_data_generation" || token == "write_out_draw_game_in_training_data_generation")
+			is >> write_out_draw_game_in_training_data_generation;
 		else if (token == "use_game_draw_adjudication")
 			is >> use_game_draw_adjudication;
 		else

From fa5b2aec3a45449f6da8fa466ffd527c9e7a6a4e Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Mon, 10 Aug 2020 15:51:23 +0900
Subject: [PATCH 155/583] Renamed use_draw_in_training to
 use_draw_games_in_training.

---
 src/learn/learner.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 2c46b374..b8d29162 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -115,7 +115,7 @@ namespace Learner
 typedef std::vector<PackedSfenValue> PSVector;
 
 bool write_out_draw_game_in_training_data_generation = false;
-bool use_draw_in_training = false;
+bool use_draw_games_in_training = false;
 bool use_draw_in_validation = false;
 bool use_hash_in_training = true;
 bool use_game_draw_adjudication = false;
@@ -2056,7 +2056,7 @@ void LearnerThink::thread_worker(size_t thread_id)
 			goto RetryRead;
 
 
-		if (!use_draw_in_training && ps.game_result == 0)
+		if (!use_draw_games_in_training && ps.game_result == 0)
 			goto RetryRead;
 
 
@@ -3066,7 +3066,7 @@ void learn(Position&, istringstream& is)
 		else if (option == "eta3")       is >> eta3;
 		else if (option == "eta1_epoch") is >> eta1_epoch;
 		else if (option == "eta2_epoch") is >> eta2_epoch;
-		else if (option == "use_draw_in_training") is >> use_draw_in_training;
+		else if (option == "use_draw_in_training" || option == "use_draw_games_in_training") is >> use_draw_games_in_training;
 		else if (option == "use_draw_in_validation") is >> use_draw_in_validation;
 		else if (option == "use_hash_in_training") is >> use_hash_in_training;
 		// Discount rate

From 87c50c5cbc92ce1cc9ca332a298b0155bae27033 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Mon, 10 Aug 2020 15:55:34 +0900
Subject: [PATCH 156/583] Renamed use_draw_in_validation to
 use_draw_games_in_validation. Added comments.

---
 src/learn/learner.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index b8d29162..7a0f01a2 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -116,7 +116,7 @@ typedef std::vector<PackedSfenValue> PSVector;
 
 bool write_out_draw_game_in_training_data_generation = false;
 bool use_draw_games_in_training = false;
-bool use_draw_in_validation = false;
+bool use_draw_games_in_validation = false;
 bool use_hash_in_training = true;
 bool use_game_draw_adjudication = false;
 
@@ -1036,6 +1036,7 @@ void gen_sfen(Position&, istringstream& is)
 			is >> save_every;
 		else if (token == "random_file_name")
 			is >> random_file_name;
+		// Accept also the old option name.
 		else if (token == "use_draw_in_training_data_generation" || token == "write_out_draw_game_in_training_data_generation")
 			is >> write_out_draw_game_in_training_data_generation;
 		else if (token == "use_game_draw_adjudication")
@@ -1370,7 +1371,7 @@ struct SfenReader
 			{
 				if (eval_limit < abs(p.score))
 					continue;
-				if (!use_draw_in_validation && p.game_result == 0)
+				if (!use_draw_games_in_validation && p.game_result == 0)
 					continue;
 				sfen_for_mse.push_back(p);
 			} else {
@@ -3066,8 +3067,10 @@ void learn(Position&, istringstream& is)
 		else if (option == "eta3")       is >> eta3;
 		else if (option == "eta1_epoch") is >> eta1_epoch;
 		else if (option == "eta2_epoch") is >> eta2_epoch;
+		// Accept also the old option name.
 		else if (option == "use_draw_in_training" || option == "use_draw_games_in_training") is >> use_draw_games_in_training;
-		else if (option == "use_draw_in_validation") is >> use_draw_in_validation;
+		// Accept also the old option name.
+		else if (option == "use_draw_in_validation" || option == "use_draw_games_in_validation") is >> use_draw_games_in_validation;
 		else if (option == "use_hash_in_training") is >> use_hash_in_training;
 		// Discount rate
 		else if (option == "discount_rate") is >> discount_rate;

From 5467ba3c2388ba50aeebf9c2839532f620f964ec Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Mon, 10 Aug 2020 15:58:17 +0900
Subject: [PATCH 157/583] Renamed use_hash_in_training to
 skip_duplicated_positions_in_training.

---
 src/learn/learner.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 7a0f01a2..464c949f 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -117,7 +117,7 @@ typedef std::vector<PackedSfenValue> PSVector;
 bool write_out_draw_game_in_training_data_generation = false;
 bool use_draw_games_in_training = false;
 bool use_draw_games_in_validation = false;
-bool use_hash_in_training = true;
+bool skip_duplicated_positions_in_training = true;
 bool use_game_draw_adjudication = false;
 
 // -----------------------------------
@@ -2083,13 +2083,13 @@ void LearnerThink::thread_worker(size_t thread_id)
 		{
 			auto key = pos.key();
 			// Exclude the phase used for rmse calculation.
-			if (sr.is_for_rmse(key) && use_hash_in_training)
+			if (sr.is_for_rmse(key) && skip_duplicated_positions_in_training)
 				goto RetryRead;
 
 			// Exclude the most recently used aspect.
 			auto hash_index = size_t(key & (sr.READ_SFEN_HASH_SIZE - 1));
 			auto key2 = sr.hash[hash_index];
-			if (key == key2 && use_hash_in_training)
+			if (key == key2 && skip_duplicated_positions_in_training)
 				goto RetryRead;
 			sr.hash[hash_index] = key; // Replace with the current key.
 		}
@@ -3071,7 +3071,8 @@ void learn(Position&, istringstream& is)
 		else if (option == "use_draw_in_training" || option == "use_draw_games_in_training") is >> use_draw_games_in_training;
 		// Accept also the old option name.
 		else if (option == "use_draw_in_validation" || option == "use_draw_games_in_validation") is >> use_draw_games_in_validation;
-		else if (option == "use_hash_in_training") is >> use_hash_in_training;
+		// Accept also the old option name.
+		else if (option == "use_hash_in_training" || option == "skip_duplicated_positions_in_training") is >> skip_duplicated_positions_in_training;
 		// Discount rate
 		else if (option == "discount_rate") is >> discount_rate;
 

From 84070c02e60c0665934bbca619d4e3f560e21995 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Mon, 10 Aug 2020 16:02:18 +0900
Subject: [PATCH 158/583] Renamed use_game_draw_adjudication to
 detect_draw_by_consecutive_low_score.

---
 src/learn/learner.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 464c949f..1e873a4c 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -118,7 +118,7 @@ bool write_out_draw_game_in_training_data_generation = false;
 bool use_draw_games_in_training = false;
 bool use_draw_games_in_validation = false;
 bool skip_duplicated_positions_in_training = true;
-bool use_game_draw_adjudication = false;
+bool detect_draw_by_consecutive_low_score = false;
 
 // -----------------------------------
 // write phase file
@@ -552,7 +552,7 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 			}
 
 			// Adjudicate game to a draw if the last 4 scores of each engine is 0.
-			if (use_game_draw_adjudication) {
+			if (detect_draw_by_consecutive_low_score) {
 				if (ply >= adj_draw_ply) {
 					int draw_cnt = 0;
 					bool is_adj_draw = false;
@@ -1039,8 +1039,9 @@ void gen_sfen(Position&, istringstream& is)
 		// Accept also the old option name.
 		else if (token == "use_draw_in_training_data_generation" || token == "write_out_draw_game_in_training_data_generation")
 			is >> write_out_draw_game_in_training_data_generation;
-		else if (token == "use_game_draw_adjudication")
-			is >> use_game_draw_adjudication;
+		// Accept also the old option name.
+		else if (token == "use_game_draw_adjudication" || token == "detect_draw_by_consecutive_low_score")
+			is >> detect_draw_by_consecutive_low_score;
 		else
 			cout << "Error! : Illegal token " << token << endl;
 	}

From 8c0429d1e595bdbf4a6e1e3c51a1b89176cb0140 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Mon, 10 Aug 2020 16:14:56 +0900
Subject: [PATCH 159/583] Added detect_draw_by_insufficient_mating_material
 option.

---
 src/learn/learner.cpp | 61 +++++++++++++++++++++++--------------------
 1 file changed, 33 insertions(+), 28 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 1e873a4c..c5e23b38 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -119,6 +119,7 @@ bool use_draw_games_in_training = false;
 bool use_draw_games_in_validation = false;
 bool skip_duplicated_positions_in_training = true;
 bool detect_draw_by_consecutive_low_score = false;
+bool detect_draw_by_insufficient_mating_material = false;
 
 // -----------------------------------
 // write phase file
@@ -580,42 +581,44 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 			}
 
 			// Draw by insufficient mating material
-			if (pos.count<ALL_PIECES>() <= 4) {
-				int pcnt = pos.count<ALL_PIECES>();
-				// (1) KvK
-				if (pcnt == 2) {
-					if (write_out_draw_game_in_training_data_generation)
-						flush_psv(0);
-					break;
-				}
-				// (2) KvK + 1 minor piece
-				if (pcnt == 3) {
-					int minor_pc = pos.count<BISHOP>(WHITE) + pos.count<KNIGHT>(WHITE) +
-						pos.count<BISHOP>(BLACK) + pos.count<KNIGHT>(BLACK);
-					if (minor_pc == 1) {
+			if (detect_draw_by_insufficient_mating_material) {
+				if (pos.count<ALL_PIECES>() <= 4) {
+					int pcnt = pos.count<ALL_PIECES>();
+					// (1) KvK
+					if (pcnt == 2) {
 						if (write_out_draw_game_in_training_data_generation)
 							flush_psv(0);
 						break;
 					}
-				}
-				// (3) KBvKB, bishops of the same color
-				else if (pcnt == 4) {
-					if (pos.count<BISHOP>(WHITE) == 1 && pos.count<BISHOP>(BLACK) == 1) {
-						// Color of bishops is black.
-						if ((pos.pieces(WHITE, BISHOP) & DarkSquares)
-							&& (pos.pieces(BLACK, BISHOP) & DarkSquares))
-						{
+					// (2) KvK + 1 minor piece
+					if (pcnt == 3) {
+						int minor_pc = pos.count<BISHOP>(WHITE) + pos.count<KNIGHT>(WHITE) +
+							pos.count<BISHOP>(BLACK) + pos.count<KNIGHT>(BLACK);
+						if (minor_pc == 1) {
 							if (write_out_draw_game_in_training_data_generation)
 								flush_psv(0);
 							break;
 						}
-						// Color of bishops is white.
-						if ((pos.pieces(WHITE, BISHOP) & ~DarkSquares)
-							&& (pos.pieces(BLACK, BISHOP) & ~DarkSquares))
-						{
-							if (write_out_draw_game_in_training_data_generation)
-								flush_psv(0);
-							break;
+					}
+					// (3) KBvKB, bishops of the same color
+					else if (pcnt == 4) {
+						if (pos.count<BISHOP>(WHITE) == 1 && pos.count<BISHOP>(BLACK) == 1) {
+							// Color of bishops is black.
+							if ((pos.pieces(WHITE, BISHOP) & DarkSquares)
+								&& (pos.pieces(BLACK, BISHOP) & DarkSquares))
+							{
+								if (write_out_draw_game_in_training_data_generation)
+									flush_psv(0);
+								break;
+							}
+							// Color of bishops is white.
+							if ((pos.pieces(WHITE, BISHOP) & ~DarkSquares)
+								&& (pos.pieces(BLACK, BISHOP) & ~DarkSquares))
+							{
+								if (write_out_draw_game_in_training_data_generation)
+									flush_psv(0);
+								break;
+							}
 						}
 					}
 				}
@@ -1042,6 +1045,8 @@ void gen_sfen(Position&, istringstream& is)
 		// Accept also the old option name.
 		else if (token == "use_game_draw_adjudication" || token == "detect_draw_by_consecutive_low_score")
 			is >> detect_draw_by_consecutive_low_score;
+		else if (token == "detect_draw_by_insufficient_mating_material")
+			is >> detect_draw_by_insufficient_mating_material;
 		else
 			cout << "Error! : Illegal token " << token << endl;
 	}

From c420b327bfeb5b191aace380ca3b2b120609bba4 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Mon, 10 Aug 2020 16:23:04 +0900
Subject: [PATCH 160/583] Added output messages.

---
 src/learn/learner.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index c5e23b38..d5a8eae7 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -1099,7 +1099,10 @@ void gen_sfen(Position&, istringstream& is)
 		<< "  output_file_name       = " << output_file_name << endl
 		<< "  use_eval_hash          = " << use_eval_hash << endl
 		<< "  save_every             = " << save_every << endl
-		<< "  random_file_name       = " << random_file_name << endl;
+		<< "  random_file_name       = " << random_file_name << endl
+		<< "  write_out_draw_game_in_training_data_generation = " << write_out_draw_game_in_training_data_generation << endl
+		<< "  detect_draw_by_consecutive_low_score = " << detect_draw_by_consecutive_low_score << endl
+		<< "  detect_draw_by_insufficient_mating_material = " << detect_draw_by_insufficient_mating_material << endl;
 
 	// Show if the training data generator uses NNUE.
 	Eval::verify_NNUE();
@@ -3276,6 +3279,9 @@ void learn(Position&, istringstream& is)
 #endif
 	cout << "learning rate     : " << eta1 << " , " << eta2 << " , " << eta3 << endl;
 	cout << "eta_epoch         : " << eta1_epoch << " , " << eta2_epoch << endl;
+	cout << "use_draw_games_in_training : " << use_draw_games_in_training << endl;
+	cout << "use_draw_games_in_validation : " << use_draw_games_in_validation << endl;
+	cout << "skip_duplicated_positions_in_training : " << skip_duplicated_positions_in_training << endl;
 #if defined(EVAL_NNUE)
 	if (newbob_decay != 1.0) {
 		cout << "scheduling        : newbob with decay = " << newbob_decay

From 35f04aaf24d05ec7b71c2cc1710d00642aaf35ec Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Mon, 10 Aug 2020 19:42:39 +0900
Subject: [PATCH 161/583] Removed an unnecessary call for pos.is_draw().

---
 src/learn/learner.cpp | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index d5a8eae7..1e51eeb5 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -683,16 +683,6 @@ void MultiThinkGenSfen::thread_worker(size_t thread_id)
 				// Save the move score for adjudication.
 				move_hist_scores.push_back(value1);
 
-				// Processing according to each thousand-day hand.
-
-        if (pos.is_draw(0)) {
-			if (write_out_draw_game_in_training_data_generation) {
-				// Write if draw.
-				flush_psv(0);
-			}
-          break;
-        }
-
 				// Use PV's move to the leaf node and use the value that evaluated() is called on that leaf node.
 				auto evaluate_leaf = [&](Position& pos , vector<Move>& pv)
 				{

From 75b9d6f6b100121904cb8acc24fdf87891a93aae Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Tue, 11 Aug 2020 16:37:47 +0900
Subject: [PATCH 162/583] Fixed build parameters.

---
 src/Makefile | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index e871f267..453967ad 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -759,7 +759,10 @@ icc-profile-use:
 	all
 
 learn: config-sanity
-	$(MAKE) CXXFLAGS='$(CXXFLAGS) -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp' LDFLAGS='$(LDFLAGS) -lopenblas -fopenmp' build
+	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
+	EXTRACXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp ' \
+	EXTRALDFLAGS=' -lopenblas -fopenmp -Wl,-s ' \
+	all
 
 .depend:
 	-@$(CXX) $(DEPENDFLAGS) -MM $(SRCS) > $@ 2> /dev/null

From 62228e6b18091e8ab89c25ad4fb39a186d727368 Mon Sep 17 00:00:00 2001
From: xXH4CKST3RXx <52459831+xXH4CKST3RXx@users.noreply.github.com>
Date: Tue, 11 Aug 2020 21:27:15 -0400
Subject: [PATCH 163/583] Update README.md

Grammar, changed link.
---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 7a237480..9b0cac59 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ Stockfish NNUE is a port of a shogi neural network named NNUE (efficiently updat
 
 ## Training Guide
 ### Generating Training Data
-To generate training data from the classic eval, use gensfen command with setting "Use NNUE" to "false". The given example is generation in its simplest form. There are more commands. 
+To generate training data from the classic eval, use the gensfen command with the setting "Use NNUE" set to "false". The given example is generation in its simplest form. There are more commands. 
 ```
 uci
 setoption name Use NNUE value false
@@ -44,14 +44,14 @@ Nets get saved in the "evalsave" folder.
 - lambda is the amount of weight it puts to eval of learning data vs win/draw/loss results. 1 puts all weight on eval, lambda 0 puts all weight on WDL results.
 
 ### Reinforcement Learning
-If you would like to do some reinforcement learning on your original network, you must first generate training data using the learn binaries with setting `Use NNUE` to true. Make sure that your previously trained network is in the eval folder. Use the commands specified above. Make sure `SkipLoadingEval` is set to false so that the data generated is using the neural net's eval by typing the command `uci setoption name SkipLoadingEval value false` before typing the `isready` command. You should aim to generate less positions than the first run, around 1/10 of the number of positions generated in the first run. The depth should be higher as well. You should also do the same for validation data, with the depth being higher than the last run.
+If you would like to do some reinforcement learning on your original network, you must first generate training data using the learn binaries with the setting `Use NNUE` set to true. Make sure that your previously trained network is in the eval folder. Use the commands specified above. Make sure `SkipLoadingEval` is set to false so that the data generated is using the neural net's eval by typing the command `setoption name SkipLoadingEval value false` before typing the `isready` command. You should aim to generate less positions than the first run, around 1/10 of the number of positions generated in the first run. The depth should be higher as well. You should also do the same for validation data, with the depth being higher than the last run.
 
 After you have generated the training data, you must move it into your training data folder and delete the older data so that the binary does not accidentally train on the same data again. Do the same for the validation data and name it to val-1.bin to make it less confusing. Make sure the evalsave folder is empty. Then, using the same binary, type in the training commands shown above. Do __NOT__ set `SkipLoadingEval` to true, it must be false or you will get a completely new network, instead of a network trained with reinforcement learning. You should also set eval_save_interval to a number that is lower than the amount of positions in your training data, perhaps also 1/10 of the original value. The validation file should be set to the new validation data, not the old data.
 
 After training is finished, your new net should be located in the "final" folder under the "evalsave" directory. You should test this new network against the older network to see if there are any improvements.
 
 ## Using Your Trained Net
-If you want to use your generated net, copy the net located in the "final" folder under the "evalsave" directory and move it into a new folder named "eval" under the directory with the binaries. You can then use the halfkp_256x2 binaries pertaining to your CPU with a standard chess GUI, such as Cutechess. Refer to the [releases page](https://github.com/nodchip/Stockfish/releases) to find out which binary is best for your CPU.
+If you want to use your generated net, copy the net located in the "final" folder under the "evalsave" directory and move it into a new folder named "eval" under the directory with the binaries. You can then use the halfkp_256x2 binaries pertaining to your CPU with a standard chess GUI, such as Cutechess. Refer to the [releases page](https://abrok.eu/stockfish) to find out which binary is best for your CPU.
 
 If the engine does not load any net file, or shows "Error! *** not found or wrong format", please try to sepcify the net with the full file path with the "EvalFile" option by typing the command `setoption name EvalFile value path` where path is the full file path.
 

From c3224dd9a15b4add5e9b2deacd5a5a00350caf84 Mon Sep 17 00:00:00 2001
From: xXH4CKST3RXx <52459831+xXH4CKST3RXx@users.noreply.github.com>
Date: Tue, 11 Aug 2020 21:27:51 -0400
Subject: [PATCH 164/583] Update README.md

Typo
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 9b0cac59..415ff3ca 100644
--- a/README.md
+++ b/README.md
@@ -53,7 +53,7 @@ After training is finished, your new net should be located in the "final" folder
 ## Using Your Trained Net
 If you want to use your generated net, copy the net located in the "final" folder under the "evalsave" directory and move it into a new folder named "eval" under the directory with the binaries. You can then use the halfkp_256x2 binaries pertaining to your CPU with a standard chess GUI, such as Cutechess. Refer to the [releases page](https://abrok.eu/stockfish) to find out which binary is best for your CPU.
 
-If the engine does not load any net file, or shows "Error! *** not found or wrong format", please try to sepcify the net with the full file path with the "EvalFile" option by typing the command `setoption name EvalFile value path` where path is the full file path.
+If the engine does not load any net file, or shows "Error! *** not found or wrong format", please try to specify the net with the full file path with the "EvalFile" option by typing the command `setoption name EvalFile value path` where path is the full file path.
 
 ## Resources
 - [Stockfish NNUE Wiki](https://www.qhapaq.org/shogi/shogiwiki/stockfish-nnue/)

From e12a0cd9ebefc4328409ad19b3df64e933a9c483 Mon Sep 17 00:00:00 2001
From: xXH4CKST3RXx <52459831+xXH4CKST3RXx@users.noreply.github.com>
Date: Tue, 11 Aug 2020 21:31:12 -0400
Subject: [PATCH 165/583] Update README.md

Additional instruction.
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 415ff3ca..8cacd1b3 100644
--- a/README.md
+++ b/README.md
@@ -53,7 +53,7 @@ After training is finished, your new net should be located in the "final" folder
 ## Using Your Trained Net
 If you want to use your generated net, copy the net located in the "final" folder under the "evalsave" directory and move it into a new folder named "eval" under the directory with the binaries. You can then use the halfkp_256x2 binaries pertaining to your CPU with a standard chess GUI, such as Cutechess. Refer to the [releases page](https://abrok.eu/stockfish) to find out which binary is best for your CPU.
 
-If the engine does not load any net file, or shows "Error! *** not found or wrong format", please try to specify the net with the full file path with the "EvalFile" option by typing the command `setoption name EvalFile value path` where path is the full file path.
+If the engine does not load any net file, or shows "Error! *** not found or wrong format", please try to specify the net with the full file path with the "EvalFile" option by typing the command `setoption name EvalFile value path` where path is the full file path. The "Use NNUE" option must be set to true with the command `setoption name Use NNUE value true`.
 
 ## Resources
 - [Stockfish NNUE Wiki](https://www.qhapaq.org/shogi/shogiwiki/stockfish-nnue/)

From 44a54b63f1ef637a2d35c75b7db2a6559fb34208 Mon Sep 17 00:00:00 2001
From: Joseph Ellis <jhellis3@gmail.com>
Date: Tue, 11 Aug 2020 13:35:47 -0500
Subject: [PATCH 166/583] Don't allow LMP on PvNodes

I mentioned this a while back in discord, but nothing seems to have ever come from it.  Anyway, to the best of my knowledge most current training data gen is being done at relatively low fixed depths.  With this in mind, the change to not allow LMP in PvNodes should result in a fairly significant increase in strength and reliability of the PV.
---
 src/search.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index fe1771a3..267672ff 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1012,7 +1012,7 @@ moves_loop: // When in check, search starts from here
       newDepth = depth - 1;
 
       // Step 13. Pruning at shallow depth (~200 Elo)
-      if (  !rootNode
+      if (  !PvNode
           && pos.non_pawn_material(us)
           && bestValue > VALUE_TB_LOSS_IN_MAX_PLY)
       {
@@ -2070,10 +2070,10 @@ namespace Learner
       // Increase the generation of the substitution table for this thread because it is a new search.
             //TT.new_search(th->thread_id());
 
-            // �� If you call new_search here, it may be a loss because you can't use the previous search result.
+            // ª If you call new_search here, it may be a loss because you can't use the previous search result.
             // Do not do this here, but caller should do TT.new_search(th->thread_id()) for each station ...
 
-            // ��Because we want to avoid reaching the same final diagram, use the substitution table commonly for all threads when generating teachers.
+            // ¨Because we want to avoid reaching the same final diagram, use the substitution table commonly for all threads when generating teachers.
       //#endif
     }
   }
@@ -2263,7 +2263,7 @@ namespace Learner
     }
 
     // Pass PV_is(ok) to eliminate this PV, there may be NULL_MOVE in the middle.
-    // �� PV should not be NULL_MOVE because it is PV
+    // ¨ PV should not be NULL_MOVE because it is PV
     // MOVE_WIN has never been thrust. (For now)
     for (Move move : rootMoves[0].pv)
     {

From 430467db1c341c99b47f96dfbc77c25d0171782a Mon Sep 17 00:00:00 2001
From: Joseph Ellis <jhellis3@gmail.com>
Date: Wed, 12 Aug 2020 13:44:07 -0500
Subject: [PATCH 167/583] Create a UCI Training option

Creates a UCI Training option and uses it to disable LMP on PV nodes.
---
 src/search.cpp    | 7 ++++++-
 src/ucioption.cpp | 1 +
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/search.cpp b/src/search.cpp
index 267672ff..b7561a96 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -68,6 +68,8 @@ namespace {
     return Value(227 * (d - improving));
   }
 
+  bool training;
+
   // Reductions lookup table, initialized at startup
   int Reductions[MAX_MOVES]; // [depth or moveNumber]
 
@@ -193,6 +195,8 @@ void Search::init() {
 
   for (int i = 1; i < MAX_MOVES; ++i)
       Reductions[i] = int((24.8 + std::log(Threads.size())) * std::log(i));
+
+  training = Options["Training"];
 }
 
 
@@ -1012,7 +1016,8 @@ moves_loop: // When in check, search starts from here
       newDepth = depth - 1;
 
       // Step 13. Pruning at shallow depth (~200 Elo)
-      if (  !PvNode
+      if (  !rootNode
+          && !(training && PvNode)
           && pos.non_pawn_material(us)
           && bestValue > VALUE_TB_LOSS_IN_MAX_PLY)
       {
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 168e73a9..ef40fe82 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -69,6 +69,7 @@ void init(OptionsMap& o) {
   o["Move Overhead"]         << Option(10, 0, 5000);
   o["Slow Mover"]            << Option(100, 10, 1000);
   o["nodestime"]             << Option(0, 0, 10000);
+  o["Training"]              << Option(false);
   o["UCI_Chess960"]          << Option(false);
   o["UCI_AnalyseMode"]       << Option(false);
   o["UCI_LimitStrength"]     << Option(false);

From ee823afdad97848c6398dbbf69aa8d9629af465f Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Fri, 14 Aug 2020 23:33:28 +0900
Subject: [PATCH 168/583] Deleted a duplicated Readme.md.

---
 Readme.md | 65 -------------------------------------------------------
 1 file changed, 65 deletions(-)
 delete mode 100644 Readme.md

diff --git a/Readme.md b/Readme.md
deleted file mode 100644
index 7a237480..00000000
--- a/Readme.md
+++ /dev/null
@@ -1,65 +0,0 @@
-<p align="center">
-  <img src="https://cdn.discordapp.com/attachments/724700045525647420/729135226365804594/SFNNUE2.png">
-</p>
-
-<h1 align="center">Stockfish NNUE</h1>
-
-## Overview
-Stockfish NNUE is a port of a shogi neural network named NNUE (efficiently updateable neural network backwards) to Stockfish 11. To learn more about the Stockfish chess engine, look [here](stockfish.md) for an overview and [here](https://github.com/official-stockfish/Stockfish) for the official repository.
-
-## Training Guide
-### Generating Training Data
-To generate training data from the classic eval, use gensfen command with setting "Use NNUE" to "false". The given example is generation in its simplest form. There are more commands. 
-```
-uci
-setoption name Use NNUE value false
-setoption name Threads value x
-setoption name Hash value y
-setoption name SyzygyPath value path
-isready
-gensfen depth a loop b use_draw_in_training_data_generation 1 eval_limit 32000
-```
-Specify how many threads and how much memory you would like to use with the x and y values. The option SyzygyPath is not necessary, but if you would like to use it, you must first have Syzygy endgame tablebases on your computer, which you can find [here](http://oics.olympuschess.com/tracker/index.php). You will need to have a torrent client to download these tablebases, as that is probably the fastest way to obtain them. The path is the path to the folder containing those tablebases. It does not have to be surrounded in quotes.
-
-This will save a file named "generated_kifu.bin" in the same folder as the binary. Once generation is done, rename the file to something like "1billiondepth12.bin" to remember the depth and quantity of the positions and move it to a folder named "trainingdata" in the same directory as the binaries.
-#### Generation Parameters
-- Depth is the searched depth per move, or how far the engine looks forward. This value is an integer.
-- Loop is the amount of positions generated. This value is also an integer
-### Generating Validation Data
-The process is the same as the generation of training data, except for the fact that you need to set loop to 1 million, because you don't need a lot of validation data. The depth should be the same as before or slightly higher than the depth of the training data. After generation rename the validation data file to val.bin and drop it in a folder named "validationdata" in the same directory to make it easier. 
-### Training a Completely New Network
-Use the "learn" binary. Create an empty folder named "evalsave" in the same directory as the binaries.
-```
-uci
-setoption name SkipLoadingEval value true
-setoption name Use NNUE value true
-setoption name Threads value x
-isready
-learn targetdir trainingdata loop 100 batchsize 1000000 use_draw_in_training 1 use_draw_in_validation 1 eta 1 lambda 1 eval_limit 32000 nn_batch_size 1000 newbob_decay 0.5 eval_save_interval 250000000 loss_output_interval 1000000 mirror_percentage 50 validation_set_file_name validationdata\val.bin
-```
-Nets get saved in the "evalsave" folder. 
-
-#### Training Parameters
-- eta is the learning rate
-- lambda is the amount of weight it puts to eval of learning data vs win/draw/loss results. 1 puts all weight on eval, lambda 0 puts all weight on WDL results.
-
-### Reinforcement Learning
-If you would like to do some reinforcement learning on your original network, you must first generate training data using the learn binaries with setting `Use NNUE` to true. Make sure that your previously trained network is in the eval folder. Use the commands specified above. Make sure `SkipLoadingEval` is set to false so that the data generated is using the neural net's eval by typing the command `uci setoption name SkipLoadingEval value false` before typing the `isready` command. You should aim to generate less positions than the first run, around 1/10 of the number of positions generated in the first run. The depth should be higher as well. You should also do the same for validation data, with the depth being higher than the last run.
-
-After you have generated the training data, you must move it into your training data folder and delete the older data so that the binary does not accidentally train on the same data again. Do the same for the validation data and name it to val-1.bin to make it less confusing. Make sure the evalsave folder is empty. Then, using the same binary, type in the training commands shown above. Do __NOT__ set `SkipLoadingEval` to true, it must be false or you will get a completely new network, instead of a network trained with reinforcement learning. You should also set eval_save_interval to a number that is lower than the amount of positions in your training data, perhaps also 1/10 of the original value. The validation file should be set to the new validation data, not the old data.
-
-After training is finished, your new net should be located in the "final" folder under the "evalsave" directory. You should test this new network against the older network to see if there are any improvements.
-
-## Using Your Trained Net
-If you want to use your generated net, copy the net located in the "final" folder under the "evalsave" directory and move it into a new folder named "eval" under the directory with the binaries. You can then use the halfkp_256x2 binaries pertaining to your CPU with a standard chess GUI, such as Cutechess. Refer to the [releases page](https://github.com/nodchip/Stockfish/releases) to find out which binary is best for your CPU.
-
-If the engine does not load any net file, or shows "Error! *** not found or wrong format", please try to sepcify the net with the full file path with the "EvalFile" option by typing the command `setoption name EvalFile value path` where path is the full file path.
-
-## Resources
-- [Stockfish NNUE Wiki](https://www.qhapaq.org/shogi/shogiwiki/stockfish-nnue/)
-- [Training instructions](https://twitter.com/mktakizawa/status/1273042640280252416) from the creator of the Elmo shogi engine
-- [Original Talkchess thread](http://talkchess.com/forum3/viewtopic.php?t=74059) discussing Stockfish NNUE
-- [Guide to Stockfish NNUE](http://yaneuraou.yaneu.com/2020/06/19/stockfish-nnue-the-complete-guide/) 
-- [Unofficial Stockfish Discord](https://discord.gg/nv8gDtt)
-
-A more updated list can be found in the #sf-nnue-resources channel in the Discord.

From 69a95e431b0733651a2b1a5c84d8526037ac9b04 Mon Sep 17 00:00:00 2001
From: tttak <tttak@users.noreply.github.com>
Date: Sat, 15 Aug 2020 00:04:10 +0900
Subject: [PATCH 169/583] update convert_bin

learn convert_bin in.txt output_file_name out.bin check_invalid_fen 1 check_illegal_move 1
convert in.txt ... done 16 parsed 3 is filtered (invalid fen:1, illegal move:2, invalid ply:0)
---
 src/learn/learner.cpp | 52 ++++++++++++++++++++++++++++++-------------
 1 file changed, 36 insertions(+), 16 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 1e51eeb5..a9e742a0 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -2537,8 +2537,26 @@ void shuffle_files_on_memory(const vector<string>& filenames,const string output
 	std::cout << "..shuffle_on_memory done." << std::endl;
 }
 
-void convert_bin(const vector<string>& filenames, const string& output_file_name, const int ply_minimum, const int ply_maximum, const int interpolate_eval, const bool check_illegal_move)
+bool fen_is_ok(Position& pos, std::string input_fen) {
+	std::string pos_fen = pos.fen();
+	std::istringstream ss_input(input_fen);
+	std::istringstream ss_pos(pos_fen);
+
+	// example : "2r4r/4kpp1/nb1np3/p2p3p/B2P1BP1/PP6/4NPKP/2R1R3 w - h6 0 24"
+	//       --> "2r4r/4kpp1/nb1np3/p2p3p/B2P1BP1/PP6/4NPKP/2R1R3"
+	std::string str_input, str_pos;
+	ss_input >> str_input;
+	ss_pos >> str_pos;
+
+	// Only compare "Piece placement field" between input_fen and pos.fen().
+	return str_input == str_pos;
+}
+
+void convert_bin(const vector<string>& filenames, const string& output_file_name, const int ply_minimum, const int ply_maximum, const int interpolate_eval, const bool check_invalid_fen, const bool check_illegal_move)
 {
+	std::cout << "check_invalid_fen=" << check_invalid_fen << std::endl;
+	std::cout << "check_illegal_move=" << check_illegal_move << std::endl;
+
 	std::fstream fs;
 	uint64_t data_size=0;
 	uint64_t filtered_size = 0;
@@ -2571,16 +2589,16 @@ void convert_bin(const vector<string>& filenames, const string& output_file_name
 			std::string value;
 			ss >> token;
 			if (token == "fen") {
-			  states = StateListPtr(new std::deque<StateInfo>(1)); // Drop old and create a new one
+				states = StateListPtr(new std::deque<StateInfo>(1)); // Drop old and create a new one
 				std::string input_fen = line.substr(4);
 				tpos.set(input_fen, false, &states->back(), Threads.main());
-				if (!tpos.pos_is_ok() || tpos.fen() != input_fen) {
+				if (check_invalid_fen && !fen_is_ok(tpos, input_fen)) {
 					ignore_flag_fen = true;
 					filtered_size_fen++;
 				}
 				else {
-			  tpos.sfen_pack(p.sfen);
-			}
+					tpos.sfen_pack(p.sfen);
+				}
 			}
 			else if (token == "move") {
 				ss >> value;
@@ -2607,7 +2625,7 @@ void convert_bin(const vector<string>& filenames, const string& output_file_name
 				}
 				p.gamePly = uint16_t(temp); // No cast here?
 				if (interpolate_eval != 0){
-				  p.score = min(3000, interpolate_eval * temp);
+					p.score = min(3000, interpolate_eval * temp);
 				}
 			}
 			else if (token == "result") {
@@ -2615,17 +2633,17 @@ void convert_bin(const vector<string>& filenames, const string& output_file_name
 				ss >> temp;
 				p.game_result = int8_t(temp); // Do you need a cast here?
 				if (interpolate_eval){
-				  p.score = p.score * p.game_result;
+					p.score = p.score * p.game_result;
 				}
 			}
 			else if (token == "e") {
-				if(!(ignore_flag_fen || ignore_flag_move || ignore_flag_ply)){
-				fs.write((char*)&p, sizeof(PackedSfenValue));
-				data_size+=1;
-				// debug
-				// std::cout<<tpos<<std::endl;
-				// std::cout<<p.score<<","<<int(p.gamePly)<<","<<int(p.game_result)<<std::endl;
-			  }
+				if (!(ignore_flag_fen || ignore_flag_move || ignore_flag_ply)) {
+					fs.write((char*)&p, sizeof(PackedSfenValue));
+					data_size+=1;
+					// debug
+					// std::cout<<tpos<<std::endl;
+					// std::cout<<p.score<<","<<int(p.gamePly)<<","<<int(p.game_result)<<std::endl;
+				}
 				else {
 					filtered_size++;
 				}
@@ -2635,7 +2653,7 @@ void convert_bin(const vector<string>& filenames, const string& output_file_name
 			}
 		}
 		std::cout << "done " << data_size << " parsed " << filtered_size << " is filtered"
-				  << " (illegal fen:" << filtered_size_fen << ", illegal move:" << filtered_size_move << ", illegal ply:" << filtered_size_ply << ")" << std::endl;
+				  << " (invalid fen:" << filtered_size_fen << ", illegal move:" << filtered_size_move << ", invalid ply:" << filtered_size_ply << ")" << std::endl;
 		ifs.close();
 	}
 	std::cout << "all done" << std::endl;
@@ -2983,6 +3001,7 @@ void learn(Position&, istringstream& is)
 	int ply_minimum = 0;
 	int ply_maximum = 114514;
 	bool interpolate_eval = 0;
+	bool check_invalid_fen = false;
 	bool check_illegal_move = false;
 	// convert teacher in pgn-extract format to Yaneura King's bin
 	bool use_convert_bin_from_pgn_extract = false;
@@ -3123,6 +3142,7 @@ void learn(Position&, istringstream& is)
 		else if (option == "convert_plain") use_convert_plain = true;
 		else if (option == "convert_bin") use_convert_bin = true;
 		else if (option == "interpolate_eval") is >> interpolate_eval;
+		else if (option == "check_invalid_fen") is >> check_invalid_fen;
 		else if (option == "check_illegal_move") is >> check_illegal_move;
 		else if (option == "convert_bin_from_pgn-extract") use_convert_bin_from_pgn_extract = true;
 		else if (option == "pgn_eval_side_to_move") is >> pgn_eval_side_to_move;
@@ -3235,7 +3255,7 @@ void learn(Position&, istringstream& is)
 	{
 		Eval::init_NNUE();
 		cout << "convert_bin.." << endl;
-		convert_bin(filenames,output_file_name, ply_minimum, ply_maximum, interpolate_eval, check_illegal_move);
+		convert_bin(filenames,output_file_name, ply_minimum, ply_maximum, interpolate_eval, check_invalid_fen, check_illegal_move);
 		return;
 		
 	}

From 79654ac5095af6f063a4d0f73b3621be1b8ec20f Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sat, 15 Aug 2020 11:57:08 +0900
Subject: [PATCH 170/583] Added winning_probability_coefficient option to
 specify the coefficient to calculate the winning probability from a value.
 #71

---
 src/learn/learner.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 1e51eeb5..fc178c92 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -120,6 +120,8 @@ bool use_draw_games_in_validation = false;
 bool skip_duplicated_positions_in_training = true;
 bool detect_draw_by_consecutive_low_score = false;
 bool detect_draw_by_insufficient_mating_material = false;
+// 1.0 / PawnValueEg / 4.0 * log(10.0)
+double winning_probability_coefficient = 0.00276753015984861260098316280611;
 
 // -----------------------------------
 // write phase file
@@ -1147,7 +1149,7 @@ double winning_percentage(double value)
 	// 1/(1+10^(-Eval/4))
 	// = 1/(1+e^(-Eval/4*ln(10))
 	// = sigmoid(Eval/4*ln(10))
-	return sigmoid(value / PawnValueEg / 4.0 * log(10.0));
+	return sigmoid(value * winning_probability_coefficient);
 }
 double dsigmoid(double x)
 {
@@ -3072,6 +3074,7 @@ void learn(Position&, istringstream& is)
 		else if (option == "use_draw_in_validation" || option == "use_draw_games_in_validation") is >> use_draw_games_in_validation;
 		// Accept also the old option name.
 		else if (option == "use_hash_in_training" || option == "skip_duplicated_positions_in_training") is >> skip_duplicated_positions_in_training;
+		else if (option == "winning_probability_coefficient") is >> winning_probability_coefficient;
 		// Discount rate
 		else if (option == "discount_rate") is >> discount_rate;
 

From 2fb3f76399a9d7fd32ac7f039ef6c98bd2915504 Mon Sep 17 00:00:00 2001
From: Serianol <65077855+Serianol@users.noreply.github.com>
Date: Mon, 17 Aug 2020 12:36:13 +0200
Subject: [PATCH 171/583] Update Makefile

---
 src/Makefile | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 453967ad..a8736a15 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -34,6 +34,7 @@ BINDIR = $(PREFIX)/bin
 
 ### Built-in benchmark for pgo-builds
 PGOBENCH = ./$(EXE) bench
+PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 100000
 
 ### Source and object files
 SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \
@@ -270,9 +271,9 @@ endif
 ### ==========================================================================
 
 ### 3.1 Selecting compiler (default = gcc)
-CXXFLAGS += -Wall -Wcast-qual -fno-exceptions -std=c++17 $(EXTRACXXFLAGS)
+CXXFLAGS += -Wall -Wcast-qual -fno-exceptions -std=c++17 $(EXTRACXXFLAGS) $(LEARNCXXFLAGS)
 DEPENDFLAGS += -std=c++17
-LDFLAGS += $(EXTRALDFLAGS)
+LDFLAGS += $(EXTRALDFLAGS) $(LEARNLDFLAGS)
 
 ifeq ($(COMP),)
 	COMP=gcc
@@ -763,6 +764,26 @@ learn: config-sanity
 	EXTRACXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp ' \
 	EXTRALDFLAGS=' -lopenblas -fopenmp -Wl,-s ' \
 	all
+	
+profile-learn: config-sanity objclean profileclean
+	@echo ""
+	@echo "Step 1/4. Building instrumented executable ..."
+	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) \
+	LEARNCXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp ' \
+	LEARNLDFLAGS=' -lopenblas -fopenmp -Wl,-s '
+	@echo ""
+	@echo "Step 2/4. Running benchmark for pgo-build ..."
+	$(PGOGENSFEN) 
+	@echo ""
+	@echo "Step 3/4. Building optimized executable ..."
+	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) objclean
+	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_use) \
+	LEARNCXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp ' \
+	LEARNLDFLAGS=' -lopenblas -fopenmp -Wl,-s '
+	@echo ""
+	@echo "Step 4/4. Deleting profile data ..."
+	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) profileclean
+	rm generated_kifu.bin
 
 .depend:
 	-@$(CXX) $(DEPENDFLAGS) -MM $(SRCS) > $@ 2> /dev/null

From aaa73b2569df4af1b8e3a5148dcef83a3c709105 Mon Sep 17 00:00:00 2001
From: tttak <tttak@users.noreply.github.com>
Date: Wed, 19 Aug 2020 22:47:22 +0900
Subject: [PATCH 172/583] modify convert_bin_from_pgn-extract

---
 src/learn/learner.cpp | 217 ++++++++++++++++++++++++------------------
 1 file changed, 126 insertions(+), 91 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index acc028f5..6161f974 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -2732,9 +2732,25 @@ Value parse_score_from_pgn_extract(std::string eval, bool& success) {
 	}
 }
 
-void convert_bin_from_pgn_extract(const vector<string>& filenames, const string& output_file_name, const bool pgn_eval_side_to_move)
+// for Debug
+//#define DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT
+
+bool is_like_fen(std::string fen) {
+	int count_space = std::count(fen.cbegin(), fen.cend(), ' ');
+	int count_slash = std::count(fen.cbegin(), fen.cend(), '/');
+
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+	//std::cout << "count_space=" << count_space << std::endl;
+	//std::cout << "count_slash=" << count_slash << std::endl;
+#endif
+
+	return count_space == 5 && count_slash == 7;
+}
+
+void convert_bin_from_pgn_extract(const vector<string>& filenames, const string& output_file_name, const bool pgn_eval_side_to_move, const bool convert_no_eval_fens_as_score_zero)
 {
 	std::cout << "pgn_eval_side_to_move=" << pgn_eval_side_to_move << std::endl;
+	std::cout << "convert_no_eval_fens_as_score_zero=" << convert_no_eval_fens_as_score_zero << std::endl;
 
 	auto th = Threads.main();
 	auto &pos = th->rootPos;
@@ -2766,8 +2782,9 @@ void convert_bin_from_pgn_extract(const vector<string>& filenames, const string&
 				// example: [Result "1-0"]
 				if (std::regex_search(line, match, pattern_result)) {
 					game_result = parse_game_result_from_pgn_extract(match.str(1));
-					//std::cout << "game_result=" << game_result << std::endl;
-
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+					std::cout << "game_result=" << game_result << std::endl;
+#endif
 					game_count++;
 					if (game_count % 10000 == 0) {
 						std::cout << now_string() << " game_count=" << game_count << ", fen_count=" << fen_count << std::endl;
@@ -2778,80 +2795,131 @@ void convert_bin_from_pgn_extract(const vector<string>& filenames, const string&
 			}
 
 			else {
-				int gamePly = 0;
-				bool first = true;
-
-				PackedSfenValue psv;
-				memset((char*)&psv, 0, sizeof(PackedSfenValue));
-
+				int gamePly = 1;
 				auto itr = line.cbegin();
 
 				while (true) {
 					gamePly++;
 
-					std::regex pattern_bracket(R"(\{(.+?)\})");
+					PackedSfenValue psv;
+					memset((char*)&psv, 0, sizeof(PackedSfenValue));
 
-					std::regex pattern_eval1(R"(\[\%eval (.+?)\])");
-					std::regex pattern_eval2(R"((.+?)\/)");
+					// fen
+					{
+						bool fen_found = false;
 
-					// very slow
-					//std::regex pattern_eval1(R"(\[\%eval (#?[+-]?(?:\d+\.?\d*|\.\d+))\])");
-					//std::regex pattern_eval2(R"((#?[+-]?(?:\d+\.?\d*|\.\d+)\/))");
+						while (!fen_found) {
+							std::regex pattern_bracket(R"(\{(.+?)\})");
+							std::smatch match;
+							if (!std::regex_search(itr, line.cend(), match, pattern_bracket)) {
+								break;
+							}
 
-					std::regex pattern_move(R"((.+?)\{)");
-					std::smatch match;
+							itr += match.position(0) + match.length(0) - 1;
+							std::string str_fen = match.str(1);
+							trim(str_fen);
 
-					// example: { [%eval 0.25] [%clk 0:10:00] }
-					// example: { +0.71/22 1.2s }
-					// example: { book }
-					if (!std::regex_search(itr, line.cend(), match, pattern_bracket)) {
-						break;
+							if (is_like_fen(str_fen)) {
+								fen_found = true;
+
+								StateInfo si;
+								pos.set(str_fen, false, &si, th);
+								pos.sfen_pack(psv.sfen);
+							}
+
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+							std::cout << "str_fen=" << str_fen << std::endl;
+							std::cout << "fen_found=" << fen_found << std::endl;
+#endif
+						}
+
+						if (!fen_found) {
+							break;
+						}
 					}
 
-					itr += match.position(0) + match.length(0);
-					std::string str_eval_clk = match.str(1);
-					trim(str_eval_clk);
-					//std::cout << "str_eval_clk="<< str_eval_clk << std::endl;
+					// move
+					{
+						std::regex pattern_move(R"(\}(.+?)\{)");
+						std::smatch match;
+						if (!std::regex_search(itr, line.cend(), match, pattern_move)) {
+							break;
+						}
 
-					if (str_eval_clk == "book") {
-						//std::cout << "book" << std::endl;
+						itr += match.position(0) + match.length(0) - 1;
+						std::string str_move = match.str(1);
+						trim(str_move);
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+						std::cout << "str_move=" << str_move << std::endl;
+#endif
+						psv.move = UCI::to_move(pos, str_move);
+					}
 
-						// example: { rnbqkbnr/pppppppp/8/8/8/4P3/PPPP1PPP/RNBQKBNR b KQkq - 0 1 }
+					// eval
+					bool eval_found = false;
+					{
+						std::regex pattern_bracket(R"(\{(.+?)\})");
+						std::smatch match;
 						if (!std::regex_search(itr, line.cend(), match, pattern_bracket)) {
 							break;
 						}
-						itr += match.position(0) + match.length(0);
-						continue;
-					}
 
-					// example: [%eval 0.25]
-					// example: [%eval #-4]
-					// example: [%eval #3]
-					// example: +0.71/
-					if (std::regex_search(str_eval_clk, match, pattern_eval1) ||
-						std::regex_search(str_eval_clk, match, pattern_eval2)) {
-						std::string str_eval = match.str(1);
-						trim(str_eval);
-						//std::cout << "str_eval=" << str_eval << std::endl;
+						std::string str_eval_clk = match.str(1);
+						trim(str_eval_clk);
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+						std::cout << "str_eval_clk=" << str_eval_clk << std::endl;
+#endif
 
-						bool success = false;
-						psv.score = Math::clamp(parse_score_from_pgn_extract(str_eval, success), -VALUE_MATE , VALUE_MATE);
-						//std::cout << "success=" << success << ", psv.score=" << psv.score << std::endl;
+						// example: { [%eval 0.25] [%clk 0:10:00] }
+						// example: { [%eval #-4] [%clk 0:10:00] }
+						// example: { [%eval #3] [%clk 0:10:00] }
+						// example: { +0.71/22 1.2s }
+						// example: { -M4/7 0.003s }
+						// example: { M3/245 0.017s }
+						// example: { +M1/245 0.010s, White mates }
+						// example: { 0.60 }
+						// example: { book }
+						// example: { rnbqkb1r/pp3ppp/2p1pn2/3p4/2PP4/2N2N2/PP2PPPP/R1BQKB1R w KQkq - 0 5 }
 
-						if (!success) {
-							//std::cout << "str_eval=" << str_eval << std::endl;
-							//std::cout << "success=" << success << ", psv.score=" << psv.score << std::endl;
-							break;
+						// Considering the absence of eval
+						if (!is_like_fen(str_eval_clk)) {
+							itr += match.position(0) + match.length(0) - 1;
+
+							if (str_eval_clk != "book") {
+								std::regex pattern_eval1(R"(\[\%eval (.+?)\])");
+								std::regex pattern_eval2(R"((.+?)\/)");
+
+								std::string str_eval;
+								if (std::regex_search(str_eval_clk, match, pattern_eval1) ||
+									std::regex_search(str_eval_clk, match, pattern_eval2)) {
+									str_eval = match.str(1);
+									trim(str_eval);
+								}
+								else {
+									str_eval = str_eval_clk;
+								}
+
+								bool success = false;
+								Value value = parse_score_from_pgn_extract(str_eval, success);
+								if (success) {
+									eval_found = true;
+									psv.score = Math::clamp(value, -VALUE_MATE , VALUE_MATE);
+								}
+
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+								std::cout << "str_eval=" << str_eval << std::endl;
+								std::cout << "success=" << success << ", psv.score=" << psv.score << std::endl;
+#endif
+							}
 						}
 					}
-					else {
-						break;
-					}
 
-					if (first) {
-						first = false;
-					}
-					else {
+					// write
+					if (eval_found || convert_no_eval_fens_as_score_zero) {
+						if (!eval_found && convert_no_eval_fens_as_score_zero) {
+							psv.score = 0;
+						}
+
 						psv.gamePly = gamePly;
 						psv.game_result = game_result;
 
@@ -2862,45 +2930,10 @@ void convert_bin_from_pgn_extract(const vector<string>& filenames, const string&
 							psv.game_result *= -1;
 						}
 
-#if 0
-						std::cout << "write: "
-								  << "score=" << psv.score
-								  << ", move=" << psv.move
-								  << ", gamePly=" << psv.gamePly
-								  << ", game_result=" << (int)psv.game_result
-								  << std::endl;
-#endif
-
 						ofs.write((char*)&psv, sizeof(PackedSfenValue));
-						memset((char*)&psv, 0, sizeof(PackedSfenValue));
 
 						fen_count++;
 					}
-
-					// example: { rnbqkbnr/pppppppp/8/8/3P4/8/PPP1PPPP/RNBQKBNR b KQkq d3 0 1 }
-					if (!std::regex_search(itr, line.cend(), match, pattern_bracket)) {
-						break;
-					}
-
-					itr += match.position(0) + match.length(0);
-					std::string str_fen = match.str(1);
-					trim(str_fen);
-					//std::cout << "str_fen=" << str_fen << std::endl;
-
-					StateInfo si;
-					pos.set(str_fen, false, &si, th);
-					pos.sfen_pack(psv.sfen);
-
-					// example: d7d5 {
-					if (!std::regex_search(itr, line.cend(), match, pattern_move)) {
-						break;
-					}
-
-					itr += match.position(0) + match.length(0) - 1;
-					std::string str_move = match.str(1);
-					trim(str_move);
-					//std::cout << "str_move=" << str_move << std::endl;
-					psv.move = UCI::to_move(pos, str_move);
 				}
 
 				game_result = 0;
@@ -3008,6 +3041,7 @@ void learn(Position&, istringstream& is)
 	// convert teacher in pgn-extract format to Yaneura King's bin
 	bool use_convert_bin_from_pgn_extract = false;
 	bool pgn_eval_side_to_move = false;
+	bool convert_no_eval_fens_as_score_zero = false;
 	// File name to write in those cases (default is "shuffled_sfen.bin")
 	string output_file_name = "shuffled_sfen.bin";
 
@@ -3149,6 +3183,7 @@ void learn(Position&, istringstream& is)
 		else if (option == "check_illegal_move") is >> check_illegal_move;
 		else if (option == "convert_bin_from_pgn-extract") use_convert_bin_from_pgn_extract = true;
 		else if (option == "pgn_eval_side_to_move") is >> pgn_eval_side_to_move;
+		else if (option == "convert_no_eval_fens_as_score_zero") is >> convert_no_eval_fens_as_score_zero;
 
 		// Otherwise, it's a filename.
 		else
@@ -3266,7 +3301,7 @@ void learn(Position&, istringstream& is)
 	{
 		Eval::init_NNUE();
 		cout << "convert_bin_from_pgn-extract.." << endl;
-		convert_bin_from_pgn_extract(filenames, output_file_name, pgn_eval_side_to_move);
+		convert_bin_from_pgn_extract(filenames, output_file_name, pgn_eval_side_to_move, convert_no_eval_fens_as_score_zero);
 		return;
 	}
 

From 11752d4e63e30ae401995ff7566be109098d367c Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Fri, 21 Aug 2020 21:15:59 +0900
Subject: [PATCH 173/583] Added options to scale the scores in training data.
 #71

---
 src/learn/learner.cpp | 40 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 35 insertions(+), 5 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index acc028f5..617f5af3 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -122,6 +122,13 @@ bool detect_draw_by_consecutive_low_score = false;
 bool detect_draw_by_insufficient_mating_material = false;
 // 1.0 / PawnValueEg / 4.0 * log(10.0)
 double winning_probability_coefficient = 0.00276753015984861260098316280611;
+// Score scale factors.  ex) If we set src_score_min_value = 0.0,
+// src_score_max_value = 1.0, dest_score_min_value = 0.0,
+// dest_score_max_value = 10000.0, [0.0, 1.0] will be scaled to [0, 10000].
+double src_score_min_value = 0.0;
+double src_score_max_value = 1.0;
+double dest_score_min_value = 0.0;
+double dest_score_max_value = 1.0;
 
 // -----------------------------------
 // write phase file
@@ -1245,8 +1252,15 @@ double calc_grad(Value deep, Value shallow , const PackedSfenValue& psv)
 	// elmo (WCSC27) method
 	// Correct with the actual game wins and losses.
 
+	// Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
+	double scaled_deep = deep;
+	// Normalize to [0.0, 1.0].
+	scaled_deep = (scaled_deep - src_score_min_value) / (src_score_max_value - src_score_min_value);
+	// Scale to [dest_score_min_value, dest_score_max_value].
+	scaled_deep = scaled_deep * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
+
 	const double q = winning_percentage(shallow);
-	const double p = winning_percentage(deep);
+	const double p = winning_percentage(scaled_deep);
 
 	// Use 1 as the correction term if the expected win rate is 1, 0 if you lose, and 0.5 if you draw.
 	// game_result = 1,0,-1 so add 1 and divide by 2.
@@ -1268,7 +1282,14 @@ void calc_cross_entropy(Value deep, Value shallow, const PackedSfenValue& psv,
 	double& cross_entropy_eval, double& cross_entropy_win, double& cross_entropy,
 	double& entropy_eval, double& entropy_win, double& entropy)
 {
-	const double p /* teacher_winrate */ = winning_percentage(deep);
+	// Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
+	double scaled_deep = deep;
+	// Normalize to [0.0, 1.0].
+	scaled_deep = (scaled_deep - src_score_min_value) / (src_score_max_value - src_score_min_value);
+	// Scale to [dest_score_min_value, dest_score_max_value].
+	scaled_deep = scaled_deep * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
+
+	const double p /* teacher_winrate */ = winning_percentage(scaled_deep);
 	const double q /* eval_winrate    */ = winning_percentage(shallow);
 	const double t = double(psv.game_result + 1) / 2;
 
@@ -2614,9 +2635,14 @@ void convert_bin(const vector<string>& filenames, const string& output_file_name
 				}
 			}
 			else if (token == "score") {
-				int32_t score;
+				double score;
 				ss >> score;
-				p.score = Math::clamp(score , -(int32_t)VALUE_MATE , (int32_t)VALUE_MATE);
+				// Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
+				// Normalize to [0.0, 1.0].
+				score = (score - src_score_min_value) / (src_score_max_value - src_score_min_value);
+				// Scale to [dest_score_min_value, dest_score_max_value].
+				score = score * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
+				p.score = Math::clamp((int32_t)std::round(score) , -(int32_t)VALUE_MATE , (int32_t)VALUE_MATE);
 			}
 			else if (token == "ply") {
 				int temp;
@@ -3149,6 +3175,10 @@ void learn(Position&, istringstream& is)
 		else if (option == "check_illegal_move") is >> check_illegal_move;
 		else if (option == "convert_bin_from_pgn-extract") use_convert_bin_from_pgn_extract = true;
 		else if (option == "pgn_eval_side_to_move") is >> pgn_eval_side_to_move;
+		else if (option == "src_score_min_value") is >> src_score_min_value;
+		else if (option == "src_score_max_value") is >> src_score_max_value;
+		else if (option == "dest_score_min_value") is >> dest_score_min_value;
+		else if (option == "dest_score_max_value") is >> dest_score_max_value;
 
 		// Otherwise, it's a filename.
 		else
@@ -3260,7 +3290,7 @@ void learn(Position&, istringstream& is)
 		cout << "convert_bin.." << endl;
 		convert_bin(filenames,output_file_name, ply_minimum, ply_maximum, interpolate_eval, check_invalid_fen, check_illegal_move);
 		return;
-		
+
 	}
 	if (use_convert_bin_from_pgn_extract)
 	{

From 87633b876c502b46a85ca0b8f6e92eef247b7461 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sat, 22 Aug 2020 12:19:29 +0900
Subject: [PATCH 174/583] Added an option to convert teacher signals to winning
 probabilities.

---
 src/learn/learner.cpp | 37 +++++++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 617f5af3..182064fa 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -129,6 +129,10 @@ double src_score_min_value = 0.0;
 double src_score_max_value = 1.0;
 double dest_score_min_value = 0.0;
 double dest_score_max_value = 1.0;
+// Assume teacher signals are the scores of deep searches, and convert them into winning
+// probabilities in the trainer. Sometimes we want to use the winning probabilities in the training
+// data directly. In those cases, we set false to this variable.
+bool convert_teacher_signal_to_winning_probability = true;
 
 // -----------------------------------
 // write phase file
@@ -1247,27 +1251,31 @@ double ELMO_LAMBDA = 0.33;
 double ELMO_LAMBDA2 = 0.33;
 double ELMO_LAMBDA_LIMIT = 32000;
 
-double calc_grad(Value deep, Value shallow , const PackedSfenValue& psv)
+double calc_grad(Value teacher_signal, Value shallow , const PackedSfenValue& psv)
 {
 	// elmo (WCSC27) method
 	// Correct with the actual game wins and losses.
 
 	// Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
-	double scaled_deep = deep;
+	double scaled_teacher_signal = teacher_signal;
 	// Normalize to [0.0, 1.0].
-	scaled_deep = (scaled_deep - src_score_min_value) / (src_score_max_value - src_score_min_value);
+	scaled_teacher_signal = (scaled_teacher_signal - src_score_min_value) / (src_score_max_value - src_score_min_value);
 	// Scale to [dest_score_min_value, dest_score_max_value].
-	scaled_deep = scaled_deep * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
+	scaled_teacher_signal = scaled_teacher_signal * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
 
 	const double q = winning_percentage(shallow);
-	const double p = winning_percentage(scaled_deep);
+	// Teacher winning probability.
+	double p = scaled_teacher_signal;
+	if (convert_teacher_signal_to_winning_probability) {
+		p = winning_percentage(scaled_teacher_signal);
+	}
 
 	// Use 1 as the correction term if the expected win rate is 1, 0 if you lose, and 0.5 if you draw.
 	// game_result = 1,0,-1 so add 1 and divide by 2.
 	const double t = double(psv.game_result + 1) / 2;
 
 	// If the evaluation value in deep search exceeds ELMO_LAMBDA_LIMIT, apply ELMO_LAMBDA2 instead of ELMO_LAMBDA.
-	const double lambda = (abs(deep) >= ELMO_LAMBDA_LIMIT) ? ELMO_LAMBDA2 : ELMO_LAMBDA;
+	const double lambda = (abs(teacher_signal) >= ELMO_LAMBDA_LIMIT) ? ELMO_LAMBDA2 : ELMO_LAMBDA;
 
 	// Use the actual win rate as a correction term.
 	// This is the idea of ​​elmo (WCSC27), modern O-parts.
@@ -1278,25 +1286,29 @@ double calc_grad(Value deep, Value shallow , const PackedSfenValue& psv)
 
 // Calculate cross entropy during learning
 // The individual cross entropy of the win/loss term and win rate term of the elmo expression is returned to the arguments cross_entropy_eval and cross_entropy_win.
-void calc_cross_entropy(Value deep, Value shallow, const PackedSfenValue& psv,
+void calc_cross_entropy(Value teacher_signal, Value shallow, const PackedSfenValue& psv,
 	double& cross_entropy_eval, double& cross_entropy_win, double& cross_entropy,
 	double& entropy_eval, double& entropy_win, double& entropy)
 {
 	// Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
-	double scaled_deep = deep;
+	double scaled_teacher_signal = teacher_signal;
 	// Normalize to [0.0, 1.0].
-	scaled_deep = (scaled_deep - src_score_min_value) / (src_score_max_value - src_score_min_value);
+	scaled_teacher_signal = (scaled_teacher_signal - src_score_min_value) / (src_score_max_value - src_score_min_value);
 	// Scale to [dest_score_min_value, dest_score_max_value].
-	scaled_deep = scaled_deep * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
+	scaled_teacher_signal = scaled_teacher_signal * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
 
-	const double p /* teacher_winrate */ = winning_percentage(scaled_deep);
+	// Teacher winning probability.
+	double p = scaled_teacher_signal;
+	if (convert_teacher_signal_to_winning_probability) {
+		p = winning_percentage(scaled_teacher_signal);
+	}
 	const double q /* eval_winrate    */ = winning_percentage(shallow);
 	const double t = double(psv.game_result + 1) / 2;
 
 	constexpr double epsilon = 0.000001;
 
 	// If the evaluation value in deep search exceeds ELMO_LAMBDA_LIMIT, apply ELMO_LAMBDA2 instead of ELMO_LAMBDA.
-	const double lambda = (abs(deep) >= ELMO_LAMBDA_LIMIT) ? ELMO_LAMBDA2 : ELMO_LAMBDA;
+	const double lambda = (abs(teacher_signal) >= ELMO_LAMBDA_LIMIT) ? ELMO_LAMBDA2 : ELMO_LAMBDA;
 
 	const double m = (1.0 - lambda) * t + lambda * p;
 
@@ -3179,6 +3191,7 @@ void learn(Position&, istringstream& is)
 		else if (option == "src_score_max_value") is >> src_score_max_value;
 		else if (option == "dest_score_min_value") is >> dest_score_min_value;
 		else if (option == "dest_score_max_value") is >> dest_score_max_value;
+		else if (option == "convert_teacher_signal_to_winning_probability") is >> convert_teacher_signal_to_winning_probability;
 
 		// Otherwise, it's a filename.
 		else

From 4ce30d952221a58d65be8518a917be6ef5333f92 Mon Sep 17 00:00:00 2001
From: tttak <tttak@users.noreply.github.com>
Date: Mon, 24 Aug 2020 22:56:08 +0900
Subject: [PATCH 175/583] Use winning_percentage_wdl in learn

---
 src/learn/learner.cpp | 61 +++++++++++++++++++++++++++++++++++++++----
 src/uci.cpp           | 39 +++++++++++++++------------
 src/uci.h             |  1 +
 3 files changed, 80 insertions(+), 21 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 3eb97923..abfcd87b 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -133,6 +133,8 @@ double dest_score_max_value = 1.0;
 // probabilities in the trainer. Sometimes we want to use the winning probabilities in the training
 // data directly. In those cases, we set false to this variable.
 bool convert_teacher_signal_to_winning_probability = true;
+// Using WDL with win rate model instead of sigmoid
+bool use_wdl = false;
 
 // -----------------------------------
 // write phase file
@@ -1162,6 +1164,45 @@ double winning_percentage(double value)
 	// = sigmoid(Eval/4*ln(10))
 	return sigmoid(value * winning_probability_coefficient);
 }
+
+// A function that converts the evaluation value to the winning rate [0,1]
+double winning_percentage_wdl(double value, int ply)
+{
+	double wdl_w = UCI::win_rate_model_double( value, ply);
+	double wdl_l = UCI::win_rate_model_double(-value, ply);
+	double wdl_d = 1000.0 - wdl_w - wdl_l;
+
+	return (wdl_w + wdl_d / 2.0) / 1000.0;
+}
+
+// A function that converts the evaluation value to the winning rate [0,1]
+double winning_percentage(double value, int ply)
+{
+	if (use_wdl) {
+		return winning_percentage_wdl(value, ply);
+	}
+	else {
+		return winning_percentage(value);
+	}
+}
+
+double calc_cross_entropy_of_winning_percentage(double deep_win_rate, double shallow_eval, int ply)
+{
+	double p = deep_win_rate;
+	double q = winning_percentage(shallow_eval, ply);
+	return -p * std::log(q) - (1 - p) * std::log(1 - q);
+}
+
+double calc_d_cross_entropy_of_winning_percentage(double deep_win_rate, double shallow_eval, int ply)
+{
+	constexpr double epsilon = 0.000001;
+	double y1 = calc_cross_entropy_of_winning_percentage(deep_win_rate, shallow_eval          , ply);
+	double y2 = calc_cross_entropy_of_winning_percentage(deep_win_rate, shallow_eval + epsilon, ply);
+
+	// Divide by the winning_probability_coefficient to match scale with the sigmoidal win rate
+	return ((y2 - y1) / epsilon) / winning_probability_coefficient;
+}
+
 double dsigmoid(double x)
 {
 	// Sigmoid function
@@ -1263,11 +1304,11 @@ double calc_grad(Value teacher_signal, Value shallow , const PackedSfenValue& ps
 	// Scale to [dest_score_min_value, dest_score_max_value].
 	scaled_teacher_signal = scaled_teacher_signal * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
 
-	const double q = winning_percentage(shallow);
+	const double q = winning_percentage(shallow, psv.gamePly);
 	// Teacher winning probability.
 	double p = scaled_teacher_signal;
 	if (convert_teacher_signal_to_winning_probability) {
-		p = winning_percentage(scaled_teacher_signal);
+		p = winning_percentage(scaled_teacher_signal, psv.gamePly);
 	}
 
 	// Use 1 as the correction term if the expected win rate is 1, 0 if you lose, and 0.5 if you draw.
@@ -1277,9 +1318,17 @@ double calc_grad(Value teacher_signal, Value shallow , const PackedSfenValue& ps
 	// If the evaluation value in deep search exceeds ELMO_LAMBDA_LIMIT, apply ELMO_LAMBDA2 instead of ELMO_LAMBDA.
 	const double lambda = (abs(teacher_signal) >= ELMO_LAMBDA_LIMIT) ? ELMO_LAMBDA2 : ELMO_LAMBDA;
 
-	// Use the actual win rate as a correction term.
-	// This is the idea of ​​elmo (WCSC27), modern O-parts.
-	const double grad = lambda * (q - p) + (1.0 - lambda) * (q - t);
+	double grad;
+	if (use_wdl) {
+		double dce_p = calc_d_cross_entropy_of_winning_percentage(p, shallow, psv.gamePly);
+		double dce_t = calc_d_cross_entropy_of_winning_percentage(t, shallow, psv.gamePly);
+		grad = lambda * dce_p + (1.0 - lambda) * dce_t;
+	}
+	else {
+		// Use the actual win rate as a correction term.
+		// This is the idea of ​​elmo (WCSC27), modern O-parts.
+		grad = lambda * (q - p) + (1.0 - lambda) * (q - t);
+	}
 
 	return grad;
 }
@@ -3168,6 +3217,8 @@ void learn(Position&, istringstream& is)
 		else if (option == "winning_probability_coefficient") is >> winning_probability_coefficient;
 		// Discount rate
 		else if (option == "discount_rate") is >> discount_rate;
+		// Using WDL with win rate model instead of sigmoid
+		else if (option == "use_wdl") is >> use_wdl;
 
 		// No learning of KK/KKP/KPP/KPPP.
 		else if (option == "freeze_kk")    is >> freeze[0];
diff --git a/src/uci.cpp b/src/uci.cpp
index 8972cec9..00941040 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -238,27 +238,34 @@ namespace {
   // The win rate model returns the probability (per mille) of winning given an eval
   // and a game-ply. The model fits rather accurately the LTC fishtest statistics.
   int win_rate_model(Value v, int ply) {
-
-     // The model captures only up to 240 plies, so limit input (and rescale)
-     double m = std::min(240, ply) / 64.0;
-
-     // Coefficients of a 3rd order polynomial fit based on fishtest data
-     // for two parameters needed to transform eval to the argument of a
-     // logistic function.
-     double as[] = {-8.24404295, 64.23892342, -95.73056462, 153.86478679};
-     double bs[] = {-3.37154371, 28.44489198, -56.67657741,  72.05858751};
-     double a = (((as[0] * m + as[1]) * m + as[2]) * m) + as[3];
-     double b = (((bs[0] * m + bs[1]) * m + bs[2]) * m) + bs[3];
-
-     // Transform eval to centipawns with limited range
-     double x = Utility::clamp(double(100 * v) / PawnValueEg, -1000.0, 1000.0);
-
      // Return win rate in per mille (rounded to nearest)
-     return int(0.5 + 1000 / (1 + std::exp((a - x) / b)));
+     return int(0.5 + UCI::win_rate_model_double(v, ply));
   }
 
 } // namespace
 
+// The win rate model returns the probability (per mille) of winning given an eval
+// and a game-ply. The model fits rather accurately the LTC fishtest statistics.
+double UCI::win_rate_model_double(double v, int ply) {
+
+   // The model captures only up to 240 plies, so limit input (and rescale)
+   double m = std::min(240, ply) / 64.0;
+
+   // Coefficients of a 3rd order polynomial fit based on fishtest data
+   // for two parameters needed to transform eval to the argument of a
+   // logistic function.
+   double as[] = {-8.24404295, 64.23892342, -95.73056462, 153.86478679};
+   double bs[] = {-3.37154371, 28.44489198, -56.67657741,  72.05858751};
+   double a = (((as[0] * m + as[1]) * m + as[2]) * m) + as[3];
+   double b = (((bs[0] * m + bs[1]) * m + bs[2]) * m) + bs[3];
+
+   // Transform eval to centipawns with limited range
+   double x = Utility::clamp(double(100 * v) / PawnValueEg, -1000.0, 1000.0);
+
+   // Return win rate in per mille
+   return 1000.0 / (1 + std::exp((a - x) / b));
+}
+
 // --------------------
 // Call qsearch(),search() directly for testing
 // --------------------
diff --git a/src/uci.h b/src/uci.h
index 27a50fb9..c0e8372f 100644
--- a/src/uci.h
+++ b/src/uci.h
@@ -72,6 +72,7 @@ std::string square(Square s);
 std::string move(Move m, bool chess960);
 std::string pv(const Position& pos, Depth depth, Value alpha, Value beta);
 std::string wdl(Value v, int ply);
+double win_rate_model_double(double v, int ply);
 Move to_move(const Position& pos, std::string& str);
 
 } // namespace UCI

From 9fc3ff4c309191a2dfdd06b6f4369377727e8f25 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Thu, 27 Aug 2020 23:48:28 +0900
Subject: [PATCH 176/583] Added use_raw_nnue_eval option to return raw NNUE
 eval value in evaluate().

---
 src/evaluate.cpp      | 12 ++++++++++++
 src/learn/learner.cpp |  7 +++++++
 2 files changed, 19 insertions(+)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 9b3b58c3..4ba89675 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -32,6 +32,13 @@
 #include "thread.h"
 #include "uci.h"
 
+#ifdef EVAL_LEARN
+namespace Learner
+{
+    extern bool use_raw_nnue_eval;
+}
+#endif
+
 namespace Eval {
 
   bool useNNUE;
@@ -941,6 +948,11 @@ make_v:
 /// evaluation of the position from the point of view of the side to move.
 
 Value Eval::evaluate(const Position& pos) {
+#ifdef EVAL_LEARN
+  if (Learner::use_raw_nnue_eval) {
+      return NNUE::evaluate(pos);
+  }
+#endif
 
   if (Eval::useNNUE)
   {
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 3eb97923..029285e8 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -133,6 +133,10 @@ double dest_score_max_value = 1.0;
 // probabilities in the trainer. Sometimes we want to use the winning probabilities in the training
 // data directly. In those cases, we set false to this variable.
 bool convert_teacher_signal_to_winning_probability = true;
+// Enable hybrid eval. If hybrid eval is enabled, training data generation and
+// training don't work well.
+// https://discordapp.com/channels/435943710472011776/733545871911813221/748524079761326192
+bool use_raw_nnue_eval = false;
 
 // -----------------------------------
 // write phase file
@@ -1050,6 +1054,8 @@ void gen_sfen(Position&, istringstream& is)
 			is >> detect_draw_by_consecutive_low_score;
 		else if (token == "detect_draw_by_insufficient_mating_material")
 			is >> detect_draw_by_insufficient_mating_material;
+		else if (token == "use_raw_nnue_eval")
+			is >> use_raw_nnue_eval;
 		else
 			cout << "Error! : Illegal token " << token << endl;
 	}
@@ -3227,6 +3233,7 @@ void learn(Position&, istringstream& is)
 		else if (option == "dest_score_min_value") is >> dest_score_min_value;
 		else if (option == "dest_score_max_value") is >> dest_score_max_value;
 		else if (option == "convert_teacher_signal_to_winning_probability") is >> convert_teacher_signal_to_winning_probability;
+		else if (option == "use_raw_nnue_eval") is >> use_raw_nnue_eval;
 
 		// Otherwise, it's a filename.
 		else

From 763e72cc9f1c9752be32a215d1dfb28adcfd22c6 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Thu, 27 Aug 2020 23:49:58 +0900
Subject: [PATCH 177/583] Changed the default value of use_raw_nnue_eval. 
 Updated a source code comment.

---
 src/learn/learner.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 029285e8..4f8b3fee 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -133,10 +133,10 @@ double dest_score_max_value = 1.0;
 // probabilities in the trainer. Sometimes we want to use the winning probabilities in the training
 // data directly. In those cases, we set false to this variable.
 bool convert_teacher_signal_to_winning_probability = true;
-// Enable hybrid eval. If hybrid eval is enabled, training data generation and
-// training don't work well.
+// Use raw NNUE eval value in the Eval::evaluate(). If hybrid eval is enabled, training data
+// generation and training don't work well.
 // https://discordapp.com/channels/435943710472011776/733545871911813221/748524079761326192
-bool use_raw_nnue_eval = false;
+bool use_raw_nnue_eval = true;
 
 // -----------------------------------
 // write phase file

From 5637996f79566a2d720ff3eb9236c361f7c22998 Mon Sep 17 00:00:00 2001
From: hero2017 <futurehaker@hotmail.com>
Date: Wed, 26 Aug 2020 00:20:30 -0400
Subject: [PATCH 178/583] Resolve #92

If we're defining something in a header then we should declare it.
---
 src/learn/learner.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index a139bb5f..9f02a594 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -84,6 +84,7 @@
 
 #if defined(EVAL_NNUE)
 #include "../nnue/evaluate_nnue_learner.h"
+#include <climits>
 #include <shared_mutex>
 #endif
 

From f7bc4e6e45788b61517d5a669ca6e6d75d1b0d17 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sat, 29 Aug 2020 00:56:05 +0900
Subject: [PATCH 179/583] Fixed compilation errors.

---
 src/eval/evaluate_mir_inv_tools.cpp           | 190 ----
 src/eval/evaluate_mir_inv_tools.h             |  47 -
 src/evaluate.cpp                              |  99 ---
 src/extra/sfen_packer.cpp                     |  18 -
 src/learn/learning_tools.cpp                  | 231 -----
 src/learn/learning_tools.h                    | 834 ------------------
 src/nnue/features/half_relative_kp.cpp        |  75 +-
 src/nnue/features/half_relative_kp.h          |  13 +-
 src/nnue/features/k.cpp                       |  39 +-
 src/nnue/features/k.h                         |   4 +
 src/nnue/features/p.cpp                       |  38 +-
 src/nnue/features/p.h                         |   8 +-
 .../trainer/features/factorizer_half_kp.h     |   6 +-
 src/types.h                                   |   5 +
 14 files changed, 96 insertions(+), 1511 deletions(-)
 delete mode 100644 src/eval/evaluate_mir_inv_tools.cpp
 delete mode 100644 src/eval/evaluate_mir_inv_tools.h

diff --git a/src/eval/evaluate_mir_inv_tools.cpp b/src/eval/evaluate_mir_inv_tools.cpp
deleted file mode 100644
index 3667b9f5..00000000
--- a/src/eval/evaluate_mir_inv_tools.cpp
+++ /dev/null
@@ -1,190 +0,0 @@
-﻿#if defined(EVAL_NNUE) || defined(EVAL_LEARN)
-
-#include "evaluate_mir_inv_tools.h"
-
-namespace Eval
-{
-
-	// --- tables
-
-	// Value when a certain PieceSquare is seen from the other side
-	// BONA_PIECE_INIT is -1, so it must be a signed type.
-	// Even if KPPT is expanded, PieceSquare will not exceed 2^15 for the time being, so int16_t is good.
-	int16_t inv_piece_[PieceSquare::PS_END];
-
-	// Returns the one at the position where a PieceSquare on the board is mirrored.
-	int16_t mir_piece_[PieceSquare::PS_END];
-
-
-	// --- methods
-
-// Returns the value when a certain PieceSquare is seen from the other side
-	PieceSquare inv_piece(PieceSquare p) { return (PieceSquare)inv_piece_[p]; }
-
-	// Returns the one at the position where a PieceSquare on the board is mirrored.
-	PieceSquare mir_piece(PieceSquare p) { return (PieceSquare)mir_piece_[p]; }
-
-	std::function<void()> mir_piece_init_function;
-
-	void init_mir_inv_tables()
-	{
-		// Initialize the mirror and inverse tables.
-
-		// Initialization is limited to once.
-		static bool first = true;
-		if (!first) return;
-		first = false;
-
-		// exchange f and e
-		int t[] = {
-			PieceSquare::PS_W_PAWN             , PieceSquare::PS_B_PAWN            ,
-			PieceSquare::PS_W_KNIGHT           , PieceSquare::PS_B_KNIGHT          ,
-			PieceSquare::PS_W_BISHOP           , PieceSquare::PS_B_BISHOP          ,
-			PieceSquare::PS_W_ROOK             , PieceSquare::PS_B_ROOK            ,
-			PieceSquare::PS_W_QUEEN            , PieceSquare::PS_B_QUEEN           ,
-		};
-
-		// Insert uninitialized value.
-		for (PieceSquare p = PieceSquare::PS_NONE; p < PieceSquare::PS_END; ++p)
-		{
-			inv_piece_[p] = PieceSquare::PS_NOT_INIT;
-
-			// mirror does not work for hand pieces. Just return the original value.
-			mir_piece_[p] = (p < PieceSquare::PS_W_PAWN) ? p : PieceSquare::PS_NOT_INIT;
-		}
-
-		for (PieceSquare p = PieceSquare::PS_NONE; p < PieceSquare::PS_END; ++p)
-		{
-			for (int i = 0; i < 32 /* t.size() */; i += 2)
-			{
-				if (t[i] <= p && p < t[i + 1])
-				{
-					Square sq = (Square)(p - t[i]);
-
-					// found!!
-					PieceSquare q = (p < PieceSquare::PS_W_PAWN) ? PieceSquare(sq + t[i + 1]) : (PieceSquare)(rotate180(sq) + t[i + 1]);
-					inv_piece_[p] = q;
-					inv_piece_[q] = p;
-
-					/*
-					It's a bit tricky, but regarding p
-										p >= PieceSquare::PS_W_PAWN
-										When.
-
-					For this p, let n be an integer (i in the above code can only be an even number),
-					a) When t[2n + 0] <= p <t[2n + 1], the first piece
-					b) When t[2n + 1] <= p <t[2n + 2], the back piece
-					Is.
-
-					Therefore, if p in the range of a) is set to q = rotate180(p-t[2n+0]) + t[2n+1], it becomes the back piece in the box rotated 180 degrees.
-					So inv_piece[] is initialized by swapping p and q.
-					*/
-
-					// There is no mirror for hand pieces.
-					if (p < PieceSquare::PS_W_PAWN)
-						continue;
-
-					PieceSquare r1 = (PieceSquare)(flip_file(sq) + t[i]);
-					mir_piece_[p] = r1;
-					mir_piece_[r1] = p;
-
-					PieceSquare p2 = (PieceSquare)(sq + t[i + 1]);
-					PieceSquare r2 = (PieceSquare)(flip_file(sq) + t[i + 1]);
-					mir_piece_[p2] = r2;
-					mir_piece_[r2] = p2;
-
-					break;
-				}
-			}
-		}
-
-		if (mir_piece_init_function)
-			mir_piece_init_function();
-
-		for (PieceSquare p = PieceSquare::PS_NONE; p < PieceSquare::PS_END; ++p)
-		{
-			// It remains uninitialized. The initialization code in the table above is incorrect.
-			assert(mir_piece_[p] != PieceSquare::PS_NOT_INIT && mir_piece_[p] < PieceSquare::PS_END);
-			assert(inv_piece_[p] != PieceSquare::PS_NOT_INIT && inv_piece_[p] < PieceSquare::PS_END);
-
-			// mir and inv return to their original coordinates after being applied twice.
-			assert(mir_piece_[mir_piece_[p]] == p);
-			assert(inv_piece_[inv_piece_[p]] == p);
-
-			// mir->inv->mir->inv must be the original location.
-			assert(p == inv_piece(mir_piece(inv_piece(mir_piece(p)))));
-
-			// inv->mir->inv->mir must be the original location.
-			assert(p == mir_piece(inv_piece(mir_piece(inv_piece(p)))));
-		}
-
-#if 0
-		// Pre-verification that it is okay to mirror the evaluation function
-		// When writing a value, there is an assertion, so if you can't mirror it,
-		// Should get caught in the assert.
-
-		// Apery's WCSC26 evaluation function, kpp p1==0 or p1==20 (0th step on the back)
-		// There is dust in it, and if you don't avoid it, it will get caught in the assert.
-
-		std::unordered_set<PieceSquare> s;
-		vector<int> a = {
-			f_hand_pawn - 1,e_hand_pawn - 1,
-			f_hand_lance - 1, e_hand_lance - 1,
-			f_hand_knight - 1, e_hand_knight - 1,
-			f_hand_silver - 1, e_hand_silver - 1,
-			f_hand_gold - 1, e_hand_gold - 1,
-			f_hand_bishop - 1, e_hand_bishop - 1,
-			f_hand_rook - 1, e_hand_rook - 1,
-		};
-		for (auto b : a)
-			s.insert((PieceSquare)b);
-
-		// Excludes walks, incense, and katsura on the board that do not appear further (Apery also contains garbage here)
-		for (Rank r = RANK_1; r <= RANK_2; ++r)
-			for (File f = FILE_1; f <= FILE_9; ++f)
-			{
-				if (r == RANK_1)
-				{
-					// first step
-					PieceSquare b1 = PieceSquare(PieceSquare::PS_W_PAWN + (f | r));
-					s.insert(b1);
-					s.insert(inv_piece[b1]);
-
-					// 1st stage incense
-					PieceSquare b2 = PieceSquare(f_lance + (f | r));
-					s.insert(b2);
-					s.insert(inv_piece[b2]);
-				}
-
-				// Katsura on the 1st and 2nd steps
-				PieceSquare b = PieceSquare(PieceSquare::PS_W_KNIGHT + (f | r));
-				s.insert(b);
-				s.insert(inv_piece[b]);
-			}
-
-		cout << "\nchecking kpp_write()..";
-		for (auto sq : SQ)
-		{
-			cout << sq << ' ';
-			for (PieceSquare p1 = PieceSquare::PS_NONE; p1 < PieceSquare::PS_END; ++p1)
-				for (PieceSquare p2 = PieceSquare::PS_NONE; p2 < PieceSquare::PS_END; ++p2)
-					if (!s.count(p1) && !s.count(p2))
-						kpp_write(sq, p1, p2, kpp[sq][p1][p2]);
-		}
-		cout << "\nchecking kkp_write()..";
-
-		for (auto sq1 : SQ)
-		{
-			cout << sq1 << ' ';
-			for (auto sq2 : SQ)
-				for (PieceSquare p1 = PieceSquare::PS_NONE; p1 < PieceSquare::PS_END; ++p1)
-					if (!s.count(p1))
-						kkp_write(sq1, sq2, p1, kkp[sq1][sq2][p1]);
-		}
-		cout << "..done!" << endl;
-#endif
-	}
-
-}
-
-#endif  // defined(EVAL_NNUE) || defined(EVAL_LEARN)
diff --git a/src/eval/evaluate_mir_inv_tools.h b/src/eval/evaluate_mir_inv_tools.h
deleted file mode 100644
index 1f193b17..00000000
--- a/src/eval/evaluate_mir_inv_tools.h
+++ /dev/null
@@ -1,47 +0,0 @@
-﻿#ifndef _EVALUATE_MIR_INV_TOOLS_
-#define _EVALUATE_MIR_INV_TOOLS_
-
-#if defined(EVAL_NNUE) || defined(EVAL_LEARN)
-
-// PieceSquare's mirror (horizontal flip) and inverse (180° on the board) tools to get pieces.
-
-#include "../types.h"
-#include "../evaluate.h"
-#include <functional>
-
-namespace Eval
-{
-	// -------------------------------------------------
-	//                  tables
-	// -------------------------------------------------
-
-	// --- Provide Mirror and Inverse to PieceSquare.
-
-	// These arrays are initialized by calling init() or init_mir_inv_tables();.
-	// If you want to use only this table from the evaluation function,
-	// Call init_mir_inv_tables().
-	// These arrays are referenced from the KK/KKP/KPP classes below.
-
-	// Returns the value when a certain PieceSquare is seen from the other side
-	extern PieceSquare inv_piece(PieceSquare p);
-
-	// Returns the one at the position where a PieceSquare on the board is mirrored.
-	extern PieceSquare mir_piece(PieceSquare p);
-
-
-	// callback called when initializing mir_piece/inv_piece
-	// Used when extending fe_end on the user side.
-	// Inv_piece_ and inv_piece_ are exposed because they are necessary for this initialization.
-	// At the timing when mir_piece_init_function is called, until fe_old_end
-	// It is guaranteed that these tables have been initialized.
-	extern std::function<void()> mir_piece_init_function;
-	extern int16_t mir_piece_[PieceSquare::PS_END];
-	extern int16_t inv_piece_[PieceSquare::PS_END];
-
-	// The table above will be initialized when you call this function explicitly or call init().
-	extern void init_mir_inv_tables();
-}
-
-#endif  // defined(EVAL_NNUE) || defined(EVAL_LEARN)
-
-#endif
diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 5cbf821d..8edc9bb8 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -1028,102 +1028,3 @@ std::string Eval::trace(const Position& pos) {
 
   return ss.str();
 }
-
-// Check whether the pieceListFw[] held internally is a correct PieceSquare.
-// Note: For debugging. slow.
-bool EvalList::is_valid(const Position& pos)
-{
-    std::set<PieceId> piece_numbers;
-    for (Square sq = SQ_A1; sq != SQUARE_NB; ++sq) {
-        auto piece_number = piece_id_list[sq];
-        if (piece_number == PieceId::PIECE_ID_NONE) {
-            continue;
-        }
-        assert(!piece_numbers.count(piece_number));
-        piece_numbers.insert(piece_number);
-    }
-
-    for (int i = 0; i < PieceId::PIECE_ID_KING; ++i)
-    {
-        PieceSquare fw = pieceListFw[i];
-        // Go to the Position class to see if this fw really exists.
-
-        if (fw == PieceSquare::PS_NONE) {
-            continue;
-        }
-
-        // Out of range
-        if (!(0 <= fw && fw < PieceSquare::PS_END))
-            return false;
-
-        // Since it is a piece on the board, I will check if this piece really exists.
-        for (Piece pc = NO_PIECE; pc < PIECE_NB; ++pc)
-        {
-            auto pt = type_of(pc);
-            if (pt == NO_PIECE_TYPE || pt == 7) // non-existing piece
-                continue;
-
-            // PieceSquare start number of piece pc
-            auto s = PieceSquare(kpp_board_index[pc].from[Color::WHITE]);
-            if (s <= fw && fw < s + SQUARE_NB)
-            {
-                // Since it was found, check if this piece is at sq.
-                Square sq = (Square)(fw - s);
-                Piece pc2 = pos.piece_on(sq);
-
-                if (pc2 != pc)
-                    return false;
-
-                goto Found;
-            }
-        }
-        // It was a piece that did not exist for some reason..
-        return false;
-    Found:;
-    }
-
-    // Validate piece_id_list
-    for (auto sq = SQUARE_ZERO; sq < SQUARE_NB; ++sq) {
-        Piece expected_piece = pos.piece_on(sq);
-        PieceId piece_number = piece_id_list[sq];
-        if (piece_number == PieceId::PIECE_ID_NONE) {
-            assert(expected_piece == NO_PIECE);
-            if (expected_piece != NO_PIECE) {
-                return false;
-            }
-            continue;
-        }
-
-        PieceSquare bona_piece_white = pieceListFw[piece_number];
-        Piece actual_piece;
-        for (actual_piece = NO_PIECE; actual_piece < PIECE_NB; ++actual_piece) {
-            if (kpp_board_index[actual_piece].from[Color::WHITE] == PieceSquare::PS_NONE) {
-                continue;
-            }
-
-            if (kpp_board_index[actual_piece].from[Color::WHITE] <= bona_piece_white
-                && bona_piece_white < kpp_board_index[actual_piece].from[Color::WHITE] + SQUARE_NB) {
-                break;
-            }
-        }
-
-        assert(actual_piece != PIECE_NB);
-        if (actual_piece == PIECE_NB) {
-            return false;
-        }
-
-        assert(actual_piece == expected_piece);
-        if (actual_piece != expected_piece) {
-            return false;
-        }
-
-        Square actual_square = static_cast<Square>(
-            bona_piece_white - kpp_board_index[actual_piece].from[Color::WHITE]);
-        assert(sq == actual_square);
-        if (sq != actual_square) {
-            return false;
-        }
-    }
-
-    return true;
-}
diff --git a/src/extra/sfen_packer.cpp b/src/extra/sfen_packer.cpp
index 68576c82..ac789ce8 100644
--- a/src/extra/sfen_packer.cpp
+++ b/src/extra/sfen_packer.cpp
@@ -276,13 +276,6 @@ int Position::set_from_packed_sfen(const PackedSfen& sfen , StateInfo * si, Thre
 	// Active color
 	sideToMove = (Color)stream.read_one_bit();
 
-	// clear evalList. It is cleared when memset is cleared to zero above...
-	evalList.clear();
-
-	// In updating the PieceList, we have to set which piece is where,
-	// A counter of how much each piece has been used
-  PieceId next_piece_number = PieceId::PIECE_ID_ZERO;
-
   pieceList[W_KING][0] = SQUARE_NB;
   pieceList[B_KING][0] = SQUARE_NB;
 
@@ -327,14 +320,6 @@ int Position::set_from_packed_sfen(const PackedSfen& sfen , StateInfo * si, Thre
 
       put_piece(Piece(pc), sq);
 
-      // update evalList
-      PieceId piece_no =
-        (pc == B_KING) ?PieceId::PIECE_ID_BKING :// Move ball
-        (pc == W_KING) ?PieceId::PIECE_ID_WKING :// Backing ball
-        next_piece_number++; // otherwise
-
-      evalList.put_piece(piece_no, sq, pc); // Place the pc piece in the sq box
-
       //cout << sq << ' ' << board[sq] << ' ' << stream.get_cursor() << endl;
 
       if (stream.get_cursor()> 256)
@@ -402,9 +387,6 @@ set_state(st);
   //std::cout << *this << std::endl;
 
   assert(pos_is_ok());
-#if defined(EVAL_NNUE)
-  assert(evalList.is_valid(*this));
-#endif  // defined(EVAL_NNUE)
 
 	return 0;
 }
diff --git a/src/learn/learning_tools.cpp b/src/learn/learning_tools.cpp
index c97b4910..de6da9c5 100644
--- a/src/learn/learning_tools.cpp
+++ b/src/learn/learning_tools.cpp
@@ -20,237 +20,6 @@ namespace EvalLearningTools
 	double Weight::eta3;
 	uint64_t Weight::eta1_epoch;
 	uint64_t Weight::eta2_epoch;
-
-	std::vector<bool> min_index_flag;
-
-	// --- initialization for each individual table
-
-	void init_min_index_flag()
-	{
-		// Initialization of mir_piece and inv_piece must be completed.
-		assert(Eval::mir_piece(PieceSquare::PS_W_PAWN) == PieceSquare::PS_B_PAWN);
-
-		// Initialize the flag array for dimension reduction
-		// Not involved in KPPP.
-
-		KK g_kk;
-		g_kk.set(SQUARE_NB, PieceSquare::PS_END, 0);
-		KKP g_kkp;
-		g_kkp.set(SQUARE_NB, PieceSquare::PS_END, g_kk.max_index());
-		KPP g_kpp;
-		g_kpp.set(SQUARE_NB, PieceSquare::PS_END, g_kkp.max_index());
-
-		uint64_t size = g_kpp.max_index();
-		min_index_flag.resize(size);
-
-#pragma omp parallel
-		{
-#if defined(_OPENMP)
-			// To prevent the logical 64 cores from being used when there are two CPUs under Windows
-			// explicitly assign to CPU here
-			int thread_index = omp_get_thread_num(); // get your thread number
-			WinProcGroup::bindThisThread(thread_index);
-#endif
-
-#pragma omp for schedule(dynamic,20000)
-
-			for (int64_t index_ = 0; index_ < (int64_t)size; ++index_)
-			{
-				// It seems that the loop variable must be a sign type due to OpenMP restrictions, but
-				// It's really difficult to use.
-				uint64_t index = (uint64_t)index_;
-
-				if (g_kk.is_ok(index))
-				{
-					// Make sure that the original index will be restored by conversion from index and reverse conversion.
-					// It is a process that is executed only once at startup, so write it in assert.
-					assert(g_kk.fromIndex(index).toIndex() == index);
-
-					KK a[KK_LOWER_COUNT];
-					g_kk.fromIndex(index).toLowerDimensions(a);
-
-					// Make sure that the first element of dimension reduction is the same as the original index.
-					assert(a[0].toIndex() == index);
-
-					uint64_t min_index = UINT64_MAX;
-					for (auto& e : a)
-						min_index = std::min(min_index, e.toIndex());
-					min_index_flag[index] = (min_index == index);
-				}
-				else if (g_kkp.is_ok(index))
-				{
-					assert(g_kkp.fromIndex(index).toIndex() == index);
-
-					KKP x = g_kkp.fromIndex(index);
-					KKP a[KKP_LOWER_COUNT];
-					x.toLowerDimensions(a);
-
-					assert(a[0].toIndex() == index);
-
-					uint64_t min_index = UINT64_MAX;
-					for (auto& e : a)
-						min_index = std::min(min_index, e.toIndex());
-					min_index_flag[index] = (min_index == index);
-				}
-				else if (g_kpp.is_ok(index))
-				{
-					assert(g_kpp.fromIndex(index).toIndex() == index);
-
-					KPP x = g_kpp.fromIndex(index);
-					KPP a[KPP_LOWER_COUNT];
-					x.toLowerDimensions(a);
-
-					assert(a[0].toIndex() == index);
-
-					uint64_t min_index = UINT64_MAX;
-					for (auto& e : a)
-						min_index = std::min(min_index, e.toIndex());
-					min_index_flag[index] = (min_index == index);
-				}
-				else
-				{
-					assert(false);
-				}
-			}
-		}
-	}
-
-	void learning_tools_unit_test_kpp()
-	{
-
-		// test KPP triangulation for bugs
-		// All combinations of k-p0-p1 are properly handled by KPP, and the dimension reduction at that time is
-		// Determine if it is correct.
-
-		KK g_kk;
-		g_kk.set(SQUARE_NB, PieceSquare::PS_END, 0);
-		KKP g_kkp;
-		g_kkp.set(SQUARE_NB, PieceSquare::PS_END, g_kk.max_index());
-		KPP g_kpp;
-		g_kpp.set(SQUARE_NB, PieceSquare::PS_END, g_kkp.max_index());
-
-		std::vector<bool> f;
-		f.resize(g_kpp.max_index() - g_kpp.min_index());
-
-		for(auto k = SQUARE_ZERO ; k < SQUARE_NB ; ++k)
-			for(auto p0 = PieceSquare::PS_NONE; p0 < PieceSquare::PS_END ; ++p0)
-				for (auto p1 = PieceSquare::PS_NONE; p1 < PieceSquare::PS_END; ++p1)
-				{
-					KPP kpp_org = g_kpp.fromKPP(k,p0,p1);
-					KPP kpp0;
-					KPP kpp1 = g_kpp.fromKPP(flip_file(k), mir_piece(p0), mir_piece(p1));
-					KPP kpp_array[2];
-
-					auto index = kpp_org.toIndex();
-					assert(g_kpp.is_ok(index));
-
-					kpp0 = g_kpp.fromIndex(index);
-
-					//if (kpp0 != kpp_org)
-					//	std::cout << "index = " << index << "," << kpp_org << "," << kpp0 << std::endl;
-
-					kpp0.toLowerDimensions(kpp_array);
-
-					assert(kpp_array[0] == kpp0);
-					assert(kpp0 == kpp_org);
-					assert(kpp_array[1] == kpp1);
-
-					auto index2 = kpp1.toIndex();
-					f[index - g_kpp.min_index()] = f[index2-g_kpp.min_index()] = true;
-				}
-
-		// Check if there is no missing index.
-		for(size_t index = 0 ; index < f.size(); index++)
-			if (!f[index])
-			{
-				std::cout << index << g_kpp.fromIndex(index + g_kpp.min_index()) << std::endl;
-			}
-	}
-
-	void learning_tools_unit_test_kppp()
-	{
-		// Test for missing KPPP calculations
-
-		KPPP g_kppp;
-		g_kppp.set(15, PieceSquare::PS_END,0);
-		uint64_t min_index = g_kppp.min_index();
-		uint64_t max_index = g_kppp.max_index();
-
-		// Confirm last element.
-		//KPPP x = KPPP::fromIndex(max_index-1);
-		//std::cout << x << std::endl;
-
-		for (uint64_t index = min_index; index < max_index; ++index)
-		{
-			KPPP x = g_kppp.fromIndex(index);
-			//std::cout << x << std::endl;
-
-#if 0
-			if ((index % 10000000) == 0)
-				std::cout << "index = " << index << std::endl;
-
-			// index = 9360000000
-			//	done.
-
-			if (x.toIndex() != index)
-			{
-				std::cout << "assertion failed , index = " << index << std::endl;
-			}
-#endif
-
-			assert(x.toIndex() == index);
-
-//			ASSERT((&kppp_ksq_pcpcpc(x.king(), x.piece0(), x.piece1(), x.piece2()) - &kppp[0][0]) == (index - min_index));
-		}
-
-	}
-
-	void learning_tools_unit_test_kkpp()
-	{
-		KKPP g_kkpp;
-		g_kkpp.set(SQUARE_NB, 10000, 0);
-		uint64_t n = 0;
-		for (int k = 0; k<SQUARE_NB; ++k)
-			for (int i = 0; i<10000; ++i) // As a test, assuming a large fe_end, try turning at 10000.
-				for (int j = 0; j < i; ++j)
-				{
-					auto kkpp = g_kkpp.fromKKPP(k, (PieceSquare)i, (PieceSquare)j);
-					auto r = kkpp.toRawIndex();
-					assert(n++ == r);
-					auto kkpp2 = g_kkpp.fromIndex(r + g_kkpp.min_index());
-					assert(kkpp2.king() == k && kkpp2.piece0() == i && kkpp2.piece1() == j);
-				}
-	}
-
-	// Initialize this entire EvalLearningTools
-	void init()
-	{
-		// Initialization is required only once after startup, so a flag for that.
-		static bool first = true;
-
-		if (first)
-		{
-			std::cout << "EvalLearningTools init..";
-
-			// Make mir_piece() and inv_piece() available.
-			// After this, the min_index_flag is initialized, but
-			// It depends on this, so you need to do this first.
-			init_mir_inv_tables();
-
-			//learning_tools_unit_test_kpp();
-			//learning_tools_unit_test_kppp();
-			//learning_tools_unit_test_kkpp();
-
-			// It may be the last time to execute UnitTest, but since init_min_index_flag() takes a long time,
-			// I want to do this at the time of debugging.
-
-			init_min_index_flag();
-
-			std::cout << "done." << std::endl;
-
-			first = false;
-		}
-	}
 }
 
 #endif
diff --git a/src/learn/learning_tools.h b/src/learn/learning_tools.h
index becd8db4..3c4be08a 100644
--- a/src/learn/learning_tools.h
+++ b/src/learn/learning_tools.h
@@ -7,8 +7,6 @@
 #if defined (EVAL_LEARN)
 #include <array>
 
-#include "../eval/evaluate_mir_inv_tools.h"
-
 #if defined(SGD_UPDATE) || defined(USE_KPPP_MIRROR_WRITE)
 #include "../misc.h"  // PRNG , my_insertion_sort
 #endif
@@ -17,27 +15,6 @@
 
 namespace EvalLearningTools
 {
-	// -------------------------------------------------
-	//                  Initialization
-	// -------------------------------------------------
-
-	// Initialize the tables in this EvalLearningTools namespace.
-	// Be sure to call once before learning starts.
-	// In this function, we also call init_mir_inv_tables().
-	// (It is not necessary to call init_mir_inv_tables() when calling this function.)
-	void init();
-
-	// -------------------------------------------------
-	//                     flags
-	// -------------------------------------------------
-
-	// When the dimension is lowered, it may become the smallest index among them
-	// A flag array that is true for the known index.
-	// This array is also initialized by init().
-	// KPPP is not involved.
-	// Therefore, the valid index range of this array is from KK::min_index() to KPP::max_index().
-	extern std::vector<bool> min_index_flag;
-
 	// -------------------------------------------------
 	//   Array for learning that stores gradients etc.
 	// -------------------------------------------------
@@ -217,817 +194,6 @@ namespace EvalLearningTools
 
 		std::array<LearnFloatType, 2> get_grad() const { return std::array<LearnFloatType, 2>{w[0].get_grad(), w[1].get_grad()}; }
 	};
-
-	// ------------------------------------------------ -
-	// A helper that calculates the index when the Weight array is serialized.
-	// ------------------------------------------------ -
-
-	// Base class for KK,KKP,KPP,KKPP
-	// How to use these classes
-	//
-	// 1. Initialize with set() first. Example) KK g_kk; g_kk.set(SQUARE_NB,fe_end,0);
-	// 2. Next create an instance with fromIndex(), fromKK(), etc.
-	// 3. Access using properties such as king(), piece0(), piece1().
-	//
-	// It may be difficult to understand just by this explanation, but if you look at init_grad(), add_grad(), update_weights() etc. in the learning part
-	// I think you can understand it including the necessity.
-	//
-	// Note: this derived class may indirectly reference the above inv_piece/mir_piece for dimension reduction, so
-	// Initialize by calling EvalLearningTools::init() or init_mir_inv_tables() first.
-	//
-	// Remarks) /*final*/ is written for the function name that should not be overridden on the derived class side.
-	// The function that should be overridden on the derived class side is a pure virtual function with "= 0".
-	// Only virtual functions are added to the derived class that may or may not be overridden.
-	//
-	struct SerializerBase
-	{
-
-		// Minimum value and maximum value of serial number +1 when serializing KK, KKP, KPP arrays.
-		/*final*/ uint64_t min_index() const { return min_index_; }
-		/*final*/ uint64_t max_index() const { return min_index() + max_raw_index_; }
-
-		// max_index() - min_index() the value of.
-		// Calculate the value from max_king_sq_,fe_end_ etc. on the derived class side and return it.
-		virtual uint64_t size() const = 0;
-
-		// Determine if the given index is more than min_index() and less than max_index().
-		/*final*/ bool is_ok(uint64_t index) { return min_index() <= index && index < max_index(); }
-
-		// Make sure to call this set(). Otherwise, construct an instance using fromKK()/fromIndex() etc. on the derived class side.
-		virtual void set(int max_king_sq, uint64_t fe_end, uint64_t min_index)
-		{
-			max_king_sq_ = max_king_sq;
-			fe_end_ = fe_end;
-			min_index_ = min_index;
-			max_raw_index_ = size();
-		}
-
-		// Get the index when serialized, based on the value of the current member.
-		/*final*/ uint64_t toIndex() const {
-			return min_index() + toRawIndex();
-		}
-
-		// Returns the index when serializing. (The value of min_index() is before addition)
-		virtual uint64_t toRawIndex() const = 0;
-
-	protected:
-		// The value of min_index() returned by this class
-		uint64_t min_index_;
-
-		// The value of max_index() returned by this class = min_index() + max_raw_index_
-		// This variable is calculated by size() of the derived class.
-		uint64_t max_raw_index_;
-
-		// The number of balls to support (normally SQUARE_NB)
-		int max_king_sq_;
-
-		// Maximum PieceSquare value supported
-		uint64_t fe_end_;
-
-	};
-
-	struct KK : public SerializerBase
-	{
-	protected:
-		KK(Square king0, Square king1,bool inverse) : king0_(king0), king1_(king1) , inverse_sign(inverse) {}
-	public:
-		KK() {}
-
-		virtual uint64_t size() const { return max_king_sq_ * max_king_sq_; }
-
-		// builder that creates KK object from index (serial number)
-		KK fromIndex(uint64_t index) const { assert(index >= min_index()); return fromRawIndex(index - min_index()); }
-
-		// builder that creates KK object from raw_index (number starting from 0, not serial number)
-		KK fromRawIndex(uint64_t raw_index) const
-		{
-			int king1 = (int)(raw_index % SQUARE_NB);
-			raw_index /= SQUARE_NB;
-			int king0 = (int)(raw_index  /* % SQUARE_NB */);
-			assert(king0 < SQUARE_NB);
-			return fromKK((Square)king0, (Square)king1 , false);
-		}
-		KK fromKK(Square king0, Square king1 , bool inverse) const
-		{
-			// The variable name kk is used in the Eval::kk array etc., so it needs to be different. (The same applies to KKP, KPP classes, etc.)
-			KK my_kk(king0, king1, inverse);
-			my_kk.set(max_king_sq_, fe_end_, min_index());
-			return my_kk;
-		}
-		KK fromKK(Square king0, Square king1) const { return fromKK(king0, king1, false); }
-
-		// When you construct this object using fromIndex(), you can get information with the following accessors.
-		Square king0() const { return king0_; }
-		Square king1() const { return king1_; }
-
-// number of dimension reductions
-#if defined(USE_KK_INVERSE_WRITE)
-	#define KK_LOWER_COUNT 4
-#elif defined(USE_KK_MIRROR_WRITE)
-	#define KK_LOWER_COUNT 2
-#else 
-	#define KK_LOWER_COUNT 1
-#endif
-
-#if defined(USE_KK_INVERSE_WRITE) && !defined(USE_KK_MIRROR_WRITE) 
-		// USE_KK_INVERSE_WRITE If you use it, please also define USE_KK_MIRROR_WRITE.
-		static_assert(false, "define also USE_KK_MIRROR_WRITE!");
-#endif
-
-		// Get the index of the low-dimensional array.
-		// When USE_KK_INVERSE_WRITE is enabled, the inverse of them will be in [2] and [3].
-		// Note that the sign of grad must be reversed for this dimension reduction.
-		// You can use is_inverse() because it can be determined.
-		void toLowerDimensions(/*out*/KK kk_[KK_LOWER_COUNT]) const {
-			kk_[0] = fromKK(king0_, king1_,false);
-#if defined(USE_KK_MIRROR_WRITE)
-			kk_[1] = fromKK(flip_file(king0_),flip_file(king1_),false);
-#if defined(USE_KK_INVERSE_WRITE)
-			kk_[2] = fromKK(rotate180(king1_), rotate180(king0_),true);
-			kk_[3] = fromKK(rotate180(flip_file(king1_)) , rotate180(flip_file(king0_)),true);
-#endif
-#endif
-		}
-
-		// Get the index when counting the value of min_index() of this class as 0.
-		virtual uint64_t toRawIndex() const {
-			return (uint64_t)king0_ * (uint64_t)max_king_sq_ + (uint64_t)king1_;
-		}
-
-		// Returns whether or not the dimension lowered with toLowerDimensions is inverse.
-		bool is_inverse() const {
-			return inverse_sign;
-		}
-
-		// When is_inverse() == true, reverse the sign that is not grad's turn and return it.
-		template <typename T>
-		std::array<T, 2> apply_inverse_sign(const std::array<T, 2>& rhs)
-		{
-			return !is_inverse() ? rhs : std::array<T, 2>{-rhs[0], rhs[1]};
-		}
-
-		// comparison operator
-		bool operator==(const KK& rhs) { return king0() == rhs.king0() && king1() == rhs.king1(); }
-		bool operator!=(const KK& rhs) { return !(*this == rhs); }
-
-	private:
-		Square king0_, king1_ ;
-		bool inverse_sign;
-	};
-
-	// Output for debugging.
-	static std::ostream& operator<<(std::ostream& os, KK rhs)
-	{
-		os << "KK(" << rhs.king0() << "," << rhs.king1() << ")";
-		return os;
-	}
-
-		// Same as KK. For KKP.
-	struct KKP : public SerializerBase
-	{
-	protected:
-		KKP(Square king0, Square king1, PieceSquare p) : king0_(king0), king1_(king1), piece_(p), inverse_sign(false) {}
-		KKP(Square king0, Square king1, PieceSquare p, bool inverse) : king0_(king0), king1_(king1), piece_(p),inverse_sign(inverse) {}
-	public:
-		KKP() {}
-
-		virtual uint64_t size() const { return (uint64_t)max_king_sq_*(uint64_t)max_king_sq_*(uint64_t)fe_end_; }
-
-		// builder that creates KKP object from index (serial number)
-		KKP fromIndex(uint64_t index) const { assert(index >= min_index()); return fromRawIndex(index - min_index()); }
-
-		// A builder that creates a KKP object from raw_index (a number that starts from 0, not a serial number)
-		KKP fromRawIndex(uint64_t raw_index) const
-		{
-			int piece = (int)(raw_index % PieceSquare::PS_END);
-			raw_index /= PieceSquare::PS_END;
-			int king1 = (int)(raw_index % SQUARE_NB);
-			raw_index /= SQUARE_NB;
-			int king0 = (int)(raw_index  /* % SQUARE_NB */);
-			assert(king0 < SQUARE_NB);
-			return fromKKP((Square)king0, (Square)king1, (PieceSquare)piece,false);
-		}
-
-		KKP fromKKP(Square king0, Square king1, PieceSquare p, bool inverse) const
-		{
-			KKP my_kkp(king0, king1, p, inverse);
-			my_kkp.set(max_king_sq_,fe_end_,min_index());
-			return my_kkp;
-		}
-		KKP fromKKP(Square king0, Square king1, PieceSquare p) const { return fromKKP(king0, king1, p, false); }
-
-		// When you construct this object using fromIndex(), you can get information with the following accessors.
-		Square king0() const { return king0_; }
-		Square king1() const { return king1_; }
-		PieceSquare piece() const { return piece_; }
-
-		// Number of KKP dimension reductions
-#if defined(USE_KKP_INVERSE_WRITE)
-		#define KKP_LOWER_COUNT 4
-#elif defined(USE_KKP_MIRROR_WRITE)
-		#define KKP_LOWER_COUNT 2
-#else
-		#define KKP_LOWER_COUNT 1
-#endif
-
-#if defined(USE_KKP_INVERSE_WRITE) && !defined(USE_KKP_MIRROR_WRITE) 
-		// USE_KKP_INVERSE_WRITE If you use it, please also define USE_KKP_MIRROR_WRITE.
-		static_assert(false, "define also USE_KKP_MIRROR_WRITE!");
-#endif
-
-		// Get the index of the low-dimensional array. The mirrored one is returned to kkp_[1].
-		// When USE_KKP_INVERSE_WRITE is enabled, the inverse of them will be in [2] and [3].
-		// Note that the sign of grad must be reversed for this dimension reduction.
-		// You can use is_inverse() because it can be determined.
-		void toLowerDimensions(/*out*/ KKP kkp_[KKP_LOWER_COUNT]) const {
-			kkp_[0] = fromKKP(king0_, king1_, piece_,false);
-#if defined(USE_KKP_MIRROR_WRITE)
-			kkp_[1] = fromKKP(flip_file(king0_), flip_file(king1_), Eval::mir_piece(piece_),false);
-#if defined(USE_KKP_INVERSE_WRITE)
-			kkp_[2] = fromKKP( rotate180(king1_), rotate180(king0_), Eval::inv_piece(piece_),true);
-			kkp_[3] = fromKKP( rotate180(flip_file(king1_)), rotate180(flip_file(king0_)) , Eval::inv_piece(Eval::mir_piece(piece_)),true);
-#endif
-#endif
-		}
-
-		// Get the index when counting the value of min_index() of this class as 0.
-		virtual uint64_t toRawIndex() const {
-			return  ((uint64_t)king0_ * (uint64_t)max_king_sq_ + (uint64_t)king1_) * (uint64_t)fe_end_ + (uint64_t)piece_;
-		}
-
-		// Returns whether or not the dimension lowered with toLowerDimensions is inverse.
-		bool is_inverse() const {
-			return inverse_sign;
-		}
-
-		// When is_inverse() == true, reverse the sign that is not grad's turn and return it.
-		template <typename T>
-		std::array<T, 2> apply_inverse_sign(const std::array<T, 2>& rhs)
-		{
-			return !is_inverse() ? rhs : std::array<T, 2>{-rhs[0], rhs[1]};
-		}
-
-		// comparison operator
-		bool operator==(const KKP& rhs) { return king0() == rhs.king0() && king1() == rhs.king1() && piece() == rhs.piece(); }
-		bool operator!=(const KKP& rhs) { return !(*this == rhs); }
-
-	private:
-		Square king0_, king1_;
-		PieceSquare piece_;
-		bool inverse_sign;
-	};
-
-	// Output for debugging.
-	static std::ostream& operator<<(std::ostream& os, KKP rhs)
-	{
-		os << "KKP(" << rhs.king0() << "," << rhs.king1() << "," << rhs.piece() << ")";
-		return os;
-	}
-
-
-	// Same as KK and KKP. For KPP
-	struct KPP : public SerializerBase
-	{
-	protected:
-		KPP(Square king, PieceSquare p0, PieceSquare p1) : king_(king), piece0_(p0), piece1_(p1) {}
-
-	public:
-		KPP() {}
-
-		// The minimum and maximum KPP values ​​of serial numbers when serializing KK, KKP, KPP arrays.
-#if !defined(USE_TRIANGLE_WEIGHT_ARRAY)
-		virtual uint64_t size() const { return (uint64_t)max_king_sq_*(uint64_t)fe_end_*(uint64_t)fe_end_; }
-#else
-		// Triangularize the square array part of [fe_end][fe_end] of kpp[SQUARE_NB][fe_end][fe_end].
-		// If kpp[SQUARE_NB][triangle_fe_end], the first row of this triangular array has one element, the second row has two elements, and so on.
-		// hence triangle_fe_end = 1 + 2 + .. + fe_end = fe_end * (fe_end + 1) / 2
-		virtual uint64_t size() const { return (uint64_t)max_king_sq_*(uint64_t)triangle_fe_end; }
-#endif
-
-		virtual void set(int max_king_sq, uint64_t fe_end, uint64_t min_index)
-		{
-		// This value is used in size(), and size() is used in SerializerBase::set(), so calculate first.
-			triangle_fe_end = (uint64_t)fe_end*((uint64_t)fe_end + 1) / 2;
-
-			SerializerBase::set(max_king_sq, fe_end, min_index);
-		}
-
-		// builder that creates KPP object from index (serial number)
-		KPP fromIndex(uint64_t index) const { assert(index >= min_index()); return fromRawIndex(index - min_index()); }
-
-		// A builder that creates KPP objects from raw_index (a number that starts from 0, not a serial number)
-		KPP fromRawIndex(uint64_t raw_index) const
-		{
-			const uint64_t triangle_fe_end = (uint64_t)fe_end_*((uint64_t)fe_end_ + 1) / 2;
-
-#if !defined(USE_TRIANGLE_WEIGHT_ARRAY)
-			int piece1 = (int)(raw_index % fe_end_);
-			raw_index /= fe_end_;
-			int piece0 = (int)(raw_index % fe_end_);
-			raw_index /= fe_end_;
-#else
-			uint64_t index2 = raw_index % triangle_fe_end;
-
-			// Write the expression to find piece0, piece1 from index2 here.
-			// This is the inverse function of index2 = i * (i+1) / 2 + j.
-			// If j = 0, i^2 + i-2 * index2 == 0
-			// From the solution formula of the quadratic equation i = (sqrt(8*index2+1)-1) / 2.
-			// After i is converted into an integer, j can be calculated as j = index2-i * (i + 1) / 2.
-
-			// PieceSquare assumes 32bit (may not fit in 16bit), so this multiplication must be 64bit.
-			int piece1 = int(sqrt(8 * index2 + 1) - 1) / 2;
-			int piece0 = int(index2 - (uint64_t)piece1*((uint64_t)piece1 + 1) / 2);
-
-			assert(piece1 < (int)fe_end_);
-			assert(piece0 < (int)fe_end_);
-			assert(piece0 > piece1);
-
-			raw_index /= triangle_fe_end;
-#endif
-			int king = (int)(raw_index  /* % SQUARE_NB */);
-			assert(king < max_king_sq_);
-			return fromKPP((Square)king, (PieceSquare)piece0, (PieceSquare)piece1);
-		}
-
-		KPP fromKPP(Square king, PieceSquare p0, PieceSquare p1) const
-		{
-			KPP my_kpp(king, p0, p1);
-			my_kpp.set(max_king_sq_,fe_end_,min_index());
-			return my_kpp;
-		}
-
-		// When you construct this object using fromIndex(), you can get information with the following accessors.
-		Square king() const { return king_; }
-		PieceSquare piece0() const { return piece0_; }
-		PieceSquare piece1() const { return piece1_; }
-
-
-// number of dimension reductions
-#if defined(USE_KPP_MIRROR_WRITE)
-	#if !defined(USE_TRIANGLE_WEIGHT_ARRAY)
-		#define KPP_LOWER_COUNT 4
-	#else
-		#define KPP_LOWER_COUNT 2
-	#endif
-#else
-	#if !defined(USE_TRIANGLE_WEIGHT_ARRAY)
-		#define KPP_LOWER_COUNT 2
-	#else
-		#define KPP_LOWER_COUNT 1
-	#endif
-#endif
-
-		// Get the index of the low-dimensional array. The ones with p1 and p2 swapped, the ones mirrored, etc. are returned.
-		void toLowerDimensions(/*out*/ KPP kpp_[KPP_LOWER_COUNT]) const {
-
-#if defined(USE_TRIANGLE_WEIGHT_ARRAY)
-			// Note that if you use a triangular array, the swapped piece0 and piece1 will not be returned.
-			kpp_[0] = fromKPP(king_, piece0_, piece1_);
-#if defined(USE_KPP_MIRROR_WRITE)
-			kpp_[1] = fromKPP(flip_file(king_), Eval::mir_piece(piece0_), Eval::mir_piece(piece1_));
-#endif
-
-#else
-			// When not using triangular array
-			kpp_[0] = fromKPP(king_, piece0_, piece1_);
-			kpp_[1] = fromKPP(king_, piece1_, piece0_);
-#if defined(USE_KPP_MIRROR_WRITE)
-			kpp_[2] = fromKPP(flip_file(king_), mir_piece(piece0_), mir_piece(piece1_));
-			kpp_[3] = fromKPP(flip_file(king_), mir_piece(piece1_), mir_piece(piece0_));
-#endif
-#endif
-		}
-
-		// Get the index when counting the value of min_index() of this class as 0.
-		virtual uint64_t toRawIndex() const {
-
-#if !defined(USE_TRIANGLE_WEIGHT_ARRAY)
-
-			return ((uint64_t)king_ * (uint64_t)fe_end_ + (uint64_t)piece0_) * (uint64_t)fe_end_ + (uint64_t)piece1_;
-
-#else
-			// Macro similar to that used in Bonanza 6.0
-			auto PcPcOnSq = [&](Square k, PieceSquare i, PieceSquare j)
-			{
-
-				// (i,j) in this triangular array is the element in the i-th row and the j-th column.
-				// 1st row + 2 + ... + i = i * (i+1) / 2 because the i-th row and 0th column is the total of the elements up to that point
-				// The i-th row and the j-th column is j plus this. i*(i+1)/2+j
-
-				// PieceSquare type is assumed to be 32 bits, so if you do not pay attention to multiplication, it will overflow.
-				return (uint64_t)k * triangle_fe_end + (uint64_t)(uint64_t(i)*(uint64_t(i)+1) / 2 + uint64_t(j));
-			};
-
-			auto k = king_;
-			auto i = piece0_;
-			auto j = piece1_;
-
-			return (i >= j) ? PcPcOnSq(k, i, j) : PcPcOnSq(k, j, i);
-#endif
-		}
-
-		// Returns whether or not the dimension lowered with toLowerDimensions is inverse.
-		// Prepared to match KK, KKP and interface. This method always returns false for this KPP class.
-		bool is_inverse() const {
-			return false;
-		}
-
-		// comparison operator
-		bool operator==(const KPP& rhs) {
-			return king() == rhs.king() &&
-				((piece0() == rhs.piece0() && piece1() == rhs.piece1())
-#if defined(USE_TRIANGLE_WEIGHT_ARRAY)
-					// When using a triangular array, allow swapping of piece0 and piece1.
-				|| (piece0() == rhs.piece1() && piece1() == rhs.piece0())
-#endif
-					); }
-		bool operator!=(const KPP& rhs) { return !(*this == rhs); }
-
-
-	private:
-		Square king_;
-		PieceSquare piece0_, piece1_;
-
-		uint64_t triangle_fe_end; // = (uint64_t)fe_end_*((uint64_t)fe_end_ + 1) / 2;
-	};
-
-	// Output for debugging.
-	static std::ostream& operator<<(std::ostream& os, KPP rhs)
-	{
-		os << "KPP(" << rhs.king() << "," << rhs.piece0() << "," << rhs.piece1() << ")";
-		return os;
-	}
-
-	// 4 pieces related to KPPP. However, if there is a turn and you do not consider mirrors etc., memory of 2 TB or more is required for learning.
-	// Even if you use a triangular array, you need 50GB x 12 bytes = 600GB for learning.
-	// It takes about half as much as storing only the mirrored one.
-	// Here, the triangular array is always used and the mirrored one is stored.
-	//
-	// Also, king() of this class is not limited to Square of the actual king, but a value from 0 to (king_sq-1) is simply returned.
-	// This needs to be converted to an appropriate ball position on the user side when performing compression using a mirror.
-	//
-	// Later, regarding the pieces0,1,2 returned by this class,
-	// piece0() >piece1() >piece2()
-	// It is, and it is necessary to keep this constraint when passing piece0,1,2 in the constructor.
-	struct KPPP : public SerializerBase
-	{
-	protected:
-		KPPP(int king, PieceSquare p0, PieceSquare p1, PieceSquare p2) :
-			king_(king), piece0_(p0), piece1_(p1), piece2_(p2)
-		{
-			assert(piece0_ > piece1_ && piece1_ > piece2_);
-			/* sort_piece(); */
-		}
-
-	public:
-		KPPP() {}
-
-		virtual uint64_t size() const { return (uint64_t)max_king_sq_*triangle_fe_end; }
-
-		// Set fe_end and king_sq.
-		// fe_end: fe_end assumed by this KPPP class
-		// king_sq: Number of balls to handle in KPPP.
-		// 3 layers x 3 mirrors = 3 layers x 5 lines = 15
-		// 2 steps x 2 mirrors without mirror = 18
-		// Set this first using set() on the side that uses this KPPP class.
-		virtual void set(int max_king_sq, uint64_t fe_end,uint64_t min_index) {
-			// This value is used in size(), and size() is used in SerializerBase::set(), so calculate first.
-			triangle_fe_end = fe_end * (fe_end - 1) * (fe_end - 2) / 6;
-
-			SerializerBase::set(max_king_sq, fe_end, min_index);
-		}
-
-		// number of dimension reductions
-		// For the time being, the dimension reduction of the mirror is not supported. I wonder if I'll do it here...
-/*
-#if defined(USE_KPPP_MIRROR_WRITE)
-#define KPPP_LOWER_COUNT 2
-#else
-#define KPPP_LOWER_COUNT 1
-#endif
-*/
-#define KPPP_LOWER_COUNT 1
-
-		// Get the index of the low-dimensional array.
-		// Note that the one with p0,p1,p2 swapped will not be returned.
-		// Also, the mirrored one is returned only when USE_KPPP_MIRROR_WRITE is enabled.
-		void toLowerDimensions(/*out*/ KPPP kppp_[KPPP_LOWER_COUNT]) const
-		{
-			kppp_[0] = fromKPPP(king_, piece0_, piece1_,piece2_);
-#if KPPP_LOWER_COUNT > 1
-			// If mir_piece is done, it will be in a state not sorted. Need code to sort.
-			PieceSquare p_list[3] = { mir_piece(piece2_), mir_piece(piece1_), mir_piece(piece0_) };
-			my_insertion_sort(p_list, 0, 3);
-			kppp_[1] = fromKPPP((int)flip_file((Square)king_), p_list[2] , p_list[1], p_list[0]);
-#endif
-		}
-
-		// builder that creates KPPP object from index (serial number)
-		KPPP fromIndex(uint64_t index) const { assert(index >= min_index()); return fromRawIndex(index - min_index()); }
-
-		// A builder that creates KPPP objects from raw_index (a number that starts from 0, not a serial number)
-		KPPP fromRawIndex(uint64_t raw_index) const
-		{
-			uint64_t index2 = raw_index % triangle_fe_end;
-
-			// Write the expression to find piece0, piece1, piece2 from index2 here.
-			// This is the inverse function of index2 = i(i-1)(i-2)/6-1 + j(j+1)/2 + k.
-			// For j = k = 0, the real root is i = ... from the solution formula of the cubic equation. (The following formula)
-			// However, if index2 is 0 or 1, there are multiple real solutions. You have to consider this. It is necessary to take measures against insufficient calculation accuracy.
-			// After i is calculated, i can be converted into an integer, then put in the first expression and then j can be calculated in the same way as in KPP.
-
-			// This process is a relatively difficult numerical calculation. Various ideas are needed.
-
-			int piece0;
-			if (index2 <= 1)
-			{
-				// There are multiple real solutions only when index2 == 0,1.
-				piece0 = (int)index2 + 2;
-
-			} else {
-
-				//double t = pow(sqrt((243 *index2 * index2-1) * 3) + 27 * index2, 1.0 / 3);
-				// → In this case, the content of sqrt() will overflow if index2 becomes large.
-
-				// Since the contents of sqrt() overflow, do not multiply 3.0 in sqrt, but multiply sqrt(3.0) outside sqrt.
-				// Since the contents of sqrt() will overflow, use an approximate expression when index2 is large.
-
-				double t;
-				
-				if (index2 < 100000000)
-					t = pow(sqrt((243.0 *index2 * index2 - 1)) * sqrt(3.0) + 27 * index2, 1.0 / 3);
-				else
-					// If index2 is very large, we can think of the contents of sqrt as approximately √243 * index2.
-					t = pow( index2 * sqrt(243 * 3.0) + 27 * index2, 1.0 / 3);
-
-				// Add deltas to avoid a slight calculation error when rounding.
-				// If it is too large, it may increase by 1 so adjustment is necessary.
-
-				const double delta = 0.000000001;
-
-				piece0 = int(t / pow(3.0, 2.0 / 3) + 1.0 / (pow(3.0, 1.0 / 3) * t) + delta) + 1;
-				// Uuu. Is it really like this? ('Ω`)
-			}
-
-			//Since piece2 is obtained, substitute piece2 for i of i(i-1)(i-2)/6 (=a) in the above formula. Also substitute k = 0.
-			// j(j+1)/2 = index2-a
-			// This is from the solution formula of the quadratic equation..
-
-			uint64_t a = (uint64_t)piece0*((uint64_t)piece0 - 1)*((uint64_t)piece0 - 2) / 6;
-			int piece1 = int((1 + sqrt(8.0 * (index2 - a ) + 1)) / 2);
-			uint64_t b = (uint64_t)piece1 * (piece1 - 1) / 2;
-			int piece2 = int(index2 - a - b);
-
-#if 0
-			if (!((piece0 > piece1 && piece1 > piece2)))
-			{
-				std::cout << index << " , " << index2 << "," << a << "," << sqrt(8.0 * (index2 - a) + 1);
-			}
-#endif
-
-			assert(piece0 > piece1 && piece1 > piece2);
-
-			assert(piece2 < (int)fe_end_);
-			assert(piece1 < (int)fe_end_);
-			assert(piece0 < (int)fe_end_);
-
-			raw_index /= triangle_fe_end;
-
-			int king = (int)(raw_index  /* % SQUARE_NB */);
-			assert(king < max_king_sq_);
-
-			// Propagate king_sq and fe_end.
-			return fromKPPP((Square)king, (PieceSquare)piece0, (PieceSquare)piece1 , (PieceSquare)piece2);
-		}
-
-		// Specify k,p0,p1,p2 to build KPPP instance.
-		// The king_sq and fe_end passed by set() which is internally retained are inherited.
-		KPPP fromKPPP(int king, PieceSquare p0, PieceSquare p1, PieceSquare p2) const
-		{
-			KPPP kppp(king, p0, p1, p2);
-			kppp.set(max_king_sq_, fe_end_,min_index());
-			return kppp;
-		}
-
-		// Get the index when counting the value of min_index() of this class as 0.
-		virtual uint64_t toRawIndex() const {
-
-			// Macro similar to the one used in Bonanza 6.0
-			// Precondition) i> j> k.
-			// NG in case of i==j,j==k.
-			auto PcPcPcOnSq = [this](int king, PieceSquare i, PieceSquare j , PieceSquare k)
-			{
-				// (i,j,k) in this triangular array is the element in the i-th row and the j-th column.
-				// 0th row 0th column 0th is the sum of the elements up to that point, so 0 + 0 + 1 + 3 + 6 + ... + (i)*(i-1)/2 = i*( i-1)*(i-2)/6
-				// i-th row, j-th column, 0-th is j with j added. + j*(j-1) / 2
-				// i-th row, j-th column and k-th row is k plus it. + k
-				assert(i > j && j > k);
-
-				// PieceSquare type is assumed to be 32 bits, so if you do not pay attention to multiplication, it will overflow.
-				return (uint64_t)king * triangle_fe_end + (uint64_t)(
-						  uint64_t(i)*(uint64_t(i) - 1) * (uint64_t(i) - 2) / 6
-						+ uint64_t(j)*(uint64_t(j) - 1) / 2
-						+ uint64_t(k)
-					);
-			};
-
-			return PcPcPcOnSq(king_, piece0_, piece1_, piece2_);
-		}
-
-		// When you construct this object using fromIndex(), you can get information with the following accessors.
-		int king() const { return king_; }
-		PieceSquare piece0() const { return piece0_; }
-		PieceSquare piece1() const { return piece1_; }
-		PieceSquare piece2() const { return piece2_; }
-		// Returns whether or not the dimension lowered with toLowerDimensions is inverse.
-		// Prepared to match KK, KKP and interface. This method always returns false for this KPPP class.
-		bool is_inverse() const {
-			return false;
-		}
-
-		// Returns the number of elements in a triangular array. It is assumed that the kppp array is the following two-dimensional array.
-		// kppp[king_sq][triangle_fe_end];
-		uint64_t get_triangle_fe_end() const { return triangle_fe_end; }
-
-		// comparison operator
-		bool operator==(const KPPP& rhs) {
-			// piece0> piece1> piece2 is assumed, so there is no possibility of replacement.
-			return king() == rhs.king() && piece0() == rhs.piece0() && piece1() == rhs.piece1() && piece2() == rhs.piece2();
-		}
-		bool operator!=(const KPPP& rhs) { return !(*this == rhs); }
-
-	private:
-
-		int king_;
-		PieceSquare piece0_, piece1_,piece2_;
-
-		// The part of the square array of [fe_end][fe_end][fe_end] of kppp[king_sq][fe_end][fe_end][fe_end] is made into a triangular array.
-		// If kppp[king_sq][triangle_fe_end], the number of elements from the 0th row of this triangular array is 0,0,1,3,..., The nth row is n(n-1)/2.
-		// therefore,
-		// triangle_fe_end = Σn(n-1)/2 , n=0..fe_end-1
-		//                 =  fe_end * (fe_end - 1) * (fe_end - 2) / 6
-		uint64_t triangle_fe_end; // ((uint64_t)PieceSquare::PS_END)*((uint64_t)PieceSquare::PS_END - 1)*((uint64_t)PieceSquare::PS_END - 2) / 6;
-	};
-
-	// Output for debugging.
-	static std::ostream& operator<<(std::ostream& os, KPPP rhs)
-	{
-		os << "KPPP(" << rhs.king() << "," << rhs.piece0() << "," << rhs.piece1() << "," << rhs.piece2() << ")";
-		return os;
-	}
-
-	// For learning about 4 pieces by KKPP.
-	//
-	// Same design as KPPP class. In KPPP class, treat as one with less p.
-	// The positions of the two balls are encoded as values ​​from 0 to king_sq-1.
-	//
-	// Later, regarding the pieces0 and 1 returned by this class,
-	// piece0() >piece1()
-	// It is, and it is necessary to keep this constraint even when passing piece0,1 in the constructor.
-	//
-	// Due to this constraint, PieceSquareZero cannot be assigned to piece0 and piece1 at the same time and passed.
-	// If you want to support learning of dropped frames, you need to devise with evaluate().
-	struct KKPP: SerializerBase
-	{
-	protected:
-		KKPP(int king, PieceSquare p0, PieceSquare p1) :
-			king_(king), piece0_(p0), piece1_(p1)
-		{
-			assert(piece0_ > piece1_);
-			/* sort_piece(); */
-		}
-
-	public:
-		KKPP() {}
-
-		virtual uint64_t size() const { return (uint64_t)max_king_sq_*triangle_fe_end; }
-
-		// Set fe_end and king_sq.
-		// fe_end: fe_end assumed by this KPPP class
-		// king_sq: Number of balls to handle in KPPP.
-		// 9 steps x mirrors 9 steps x 5 squared squares (balls before and after) = 45*45 = 2025.
-		// Set this first using set() on the side that uses this KKPP class.
-		void set(int max_king_sq, uint64_t fe_end , uint64_t min_index) {
-			// This value is used in size(), and size() is used in SerializerBase::set(), so calculate first.
-			triangle_fe_end = fe_end * (fe_end - 1) / 2;
-
-			SerializerBase::set(max_king_sq, fe_end, min_index);
-		}
-
-		// number of dimension reductions
-		// For the time being, the dimension reduction of the mirror is not supported. I wonder if I'll do it here... (Because the memory for learning is a waste)
-#define KKPP_LOWER_COUNT 1
-
-		// Get the index of the low-dimensional array.
-		//Note that the one with p0,p1,p2 swapped will not be returned.
-		// Also, the mirrored one is returned only when USE_KPPP_MIRROR_WRITE is enabled.
-		void toLowerDimensions(/*out*/ KKPP kkpp_[KPPP_LOWER_COUNT]) const
-		{
-			kkpp_[0] = fromKKPP(king_, piece0_, piece1_);
-
-			// When mirroring, mir_piece will not be sorted. Need code to sort.
-			// We also need to define a mirror for king_.
-		}
-
-		// builder that creates KKPP object from index (serial number)
-		KKPP fromIndex(uint64_t index) const { assert(index >= min_index()); return fromRawIndex(index - min_index()); }
-
-		// builder that creates KKPP object from raw_index (number starting from 0, not serial number)
-		KKPP fromRawIndex(uint64_t raw_index) const
-		{
-			uint64_t index2 = raw_index % triangle_fe_end;
-
-			// Write the expression to find piece0, piece1, piece2 from index2 here.
-			// This is the inverse function of index2 = i(i-1)/2 + j.
-			// Use the formula of the solution of the quadratic equation with j=0.
-			// When index2=0, it is a double root, but the smaller one does not satisfy i>j and is ignored.
-
-			int piece0 = (int(sqrt(8 * index2 + 1)) + 1)/2;
-			int piece1 = int(index2 - piece0 * (piece0 - 1) /2 );
-
-			assert(piece0 > piece1);
-
-			assert(piece1 < (int)fe_end_);
-			assert(piece0 < (int)fe_end_);
-
-			raw_index /= triangle_fe_end;
-
-			int king = (int)(raw_index  /* % SQUARE_NB */);
-			assert(king < max_king_sq_);
-
-			// Propagate king_sq and fe_end.
-			return fromKKPP(king, (PieceSquare)piece0, (PieceSquare)piece1);
-		}
-
-		// Specify k,p0,p1 to build KKPP instance.
-		// The king_sq and fe_end passed by set() which is internally retained are inherited.
-		KKPP fromKKPP(int king, PieceSquare p0, PieceSquare p1) const
-		{
-			KKPP kkpp(king, p0, p1);
-			kkpp.set(max_king_sq_, fe_end_,min_index());
-			return kkpp;
-		}
-
-		// Get the index when counting the value of min_index() of this class as 0.
-		virtual uint64_t toRawIndex() const {
-
-			// Macro similar to the one used in Bonanza 6.0
-			// Precondition) i> j.
-			// NG in case of i==j,j==k.
-			auto PcPcOnSq = [this](int king, PieceSquare i, PieceSquare j)
-			{
-				assert(i > j);
-
-				// PieceSquare type is assumed to be 32 bits, so if you do not pay attention to multiplication, it will overflow.
-				return (uint64_t)king * triangle_fe_end + (uint64_t)(
-					+ uint64_t(i)*(uint64_t(i) - 1) / 2
-					+ uint64_t(j)
-					);
-			};
-
-			return PcPcOnSq(king_, piece0_, piece1_);
-		}
-
-		// When you construct this object using fromIndex(), fromKKPP(), you can get information with the following accessors.
-		int king() const { return king_; }
-		PieceSquare piece0() const { return piece0_; }
-		PieceSquare piece1() const { return piece1_; }
-
-		// Returns whether or not the dimension lowered with toLowerDimensions is inverse.
-		// Prepared to match KK, KKP and interface. In this KKPP class, this method always returns false.
-		bool is_inverse() const {
-			return false;
-		}
-
-		//Returns the number of elements in a triangular array. It is assumed that the kkpp array is the following two-dimensional array.
-		//   kkpp[king_sq][triangle_fe_end];
-		uint64_t get_triangle_fe_end() const { return triangle_fe_end; }
-
-		// comparison operator
-		bool operator==(const KKPP& rhs) {
-			// Since piece0> piece1 is assumed, there is no possibility of replacement.
-			return king() == rhs.king() && piece0() == rhs.piece0() && piece1() == rhs.piece1();
-		}
-		bool operator!=(const KKPP& rhs) { return !(*this == rhs); }
-
-	private:
-
-		int king_;
-		PieceSquare piece0_, piece1_;
-
-		// Triangularize the square array part of [fe_end][fe_end] of kppp[king_sq][fe_end][fe_end].
-		uint64_t triangle_fe_end = 0;
-		
-	};
-
-	// Output for debugging.
-	static std::ostream& operator<<(std::ostream& os, KKPP rhs)
-	{
-		os << "KKPP(" << rhs.king() << "," << rhs.piece0() << "," << rhs.piece1() << ")";
-		return os;
-	}
-
-
 }
 
 #endif // defined (EVAL_LEARN)
diff --git a/src/nnue/features/half_relative_kp.cpp b/src/nnue/features/half_relative_kp.cpp
index 7f15ff39..015ecb73 100644
--- a/src/nnue/features/half_relative_kp.cpp
+++ b/src/nnue/features/half_relative_kp.cpp
@@ -11,49 +11,41 @@ namespace NNUE {
 
 namespace Features {
 
+// Orient a square according to perspective (rotates by 180 for black)
+inline Square orient(Color perspective, Square s) {
+  return Square(int(s) ^ (bool(perspective) * 63));
+}
+
 // Find the index of the feature quantity from the ball position and PieceSquare
 template <Side AssociatedKing>
 inline IndexType HalfRelativeKP<AssociatedKing>::MakeIndex(
-    Square sq_k, PieceSquare p) {
+  Color perspective, Square s, Piece pc, Square sq_k) {
+  const IndexType p = IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
+  return MakeIndex(sq_k, p);
+}
+
+// Find the index of the feature quantity from the ball position and PieceSquare
+template <Side AssociatedKing>
+inline IndexType HalfRelativeKP<AssociatedKing>::MakeIndex(
+    Square sq_k, IndexType p) {
   constexpr IndexType W = kBoardWidth;
   constexpr IndexType H = kBoardHeight;
-  const IndexType piece_index = (p - PieceSquare::PS_W_PAWN) / SQUARE_NB;
-  const Square sq_p = static_cast<Square>((p - PieceSquare::PS_W_PAWN) % SQUARE_NB);
+  const IndexType piece_index = (p - PS_W_PAWN) / SQUARE_NB;
+  const Square sq_p = static_cast<Square>((p - PS_W_PAWN) % SQUARE_NB);
   const IndexType relative_file = file_of(sq_p) - file_of(sq_k) + (W / 2);
   const IndexType relative_rank = rank_of(sq_p) - rank_of(sq_k) + (H / 2);
   return H * W * piece_index + H * relative_file + relative_rank;
 }
 
-// Get the piece information
-template <Side AssociatedKing>
-inline void HalfRelativeKP<AssociatedKing>::GetPieces(
-    const Position& pos, Color perspective,
-    PieceSquare** pieces, Square* sq_target_k) {
-  *pieces = (perspective == BLACK) ?
-      pos.eval_list()->piece_list_fb() :
-      pos.eval_list()->piece_list_fw();
-  const PieceId target = (AssociatedKing == Side::kFriend) ?
-      static_cast<PieceId>(PieceId::PIECE_ID_KING + perspective) :
-      static_cast<PieceId>(PieceId::PIECE_ID_KING + ~perspective);
-  *sq_target_k = static_cast<Square>(((*pieces)[target] - PieceSquare::PS_W_KING) % SQUARE_NB);
-}
-
 // Get a list of indices with a value of 1 among the features
 template <Side AssociatedKing>
 void HalfRelativeKP<AssociatedKing>::AppendActiveIndices(
     const Position& pos, Color perspective, IndexList* active) {
-  // do nothing if array size is small to avoid compiler warning
-  if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
-
-  PieceSquare* pieces;
-  Square sq_target_k;
-  GetPieces(pos, perspective, &pieces, &sq_target_k);
-  for (PieceId i = PieceId::PIECE_ID_ZERO; i < PieceId::PIECE_ID_KING; ++i) {
-    if (pieces[i] >= PieceSquare::PS_W_PAWN) {
-      if (pieces[i] != PieceSquare::PS_NONE) {
-        active->push_back(MakeIndex(sq_target_k, pieces[i]));
-      }
-    }
+  Square ksq = orient(perspective, pos.square<KING>(perspective));
+  Bitboard bb = pos.pieces() & ~pos.pieces(KING);
+  while (bb) {
+    Square s = pop_lsb(&bb);
+    active->push_back(MakeIndex(perspective, s, pos.piece_on(s), ksq));
   }
 }
 
@@ -62,26 +54,15 @@ template <Side AssociatedKing>
 void HalfRelativeKP<AssociatedKing>::AppendChangedIndices(
     const Position& pos, Color perspective,
     IndexList* removed, IndexList* added) {
-  PieceSquare* pieces;
-  Square sq_target_k;
-  GetPieces(pos, perspective, &pieces, &sq_target_k);
+  Square ksq = orient(perspective, pos.square<KING>(perspective));
   const auto& dp = pos.state()->dirtyPiece;
   for (int i = 0; i < dp.dirty_num; ++i) {
-    if (dp.pieceId[i] >= PieceId::PIECE_ID_KING) continue;
-    const auto old_p = static_cast<PieceSquare>(
-        dp.old_piece[i].from[perspective]);
-    if (old_p >= PieceSquare::PS_W_PAWN) {
-      if (old_p != PieceSquare::PS_NONE) {
-        removed->push_back(MakeIndex(sq_target_k, old_p));
-      }
-    }
-    const auto new_p = static_cast<PieceSquare>(
-        dp.new_piece[i].from[perspective]);
-    if (new_p >= PieceSquare::PS_W_PAWN) {
-      if (new_p != PieceSquare::PS_NONE) {
-        added->push_back(MakeIndex(sq_target_k, new_p));
-      }
-    }
+    Piece pc = dp.piece[i];
+    if (type_of(pc) == KING) continue;
+    if (dp.from[i] != SQ_NONE)
+      removed->push_back(MakeIndex(perspective, dp.from[i], pc, ksq));
+    if (dp.to[i] != SQ_NONE)
+      added->push_back(MakeIndex(perspective, dp.to[i], pc, ksq));
   }
 }
 
diff --git a/src/nnue/features/half_relative_kp.h b/src/nnue/features/half_relative_kp.h
index 9561ab91..2d4182e4 100644
--- a/src/nnue/features/half_relative_kp.h
+++ b/src/nnue/features/half_relative_kp.h
@@ -25,7 +25,7 @@ class HalfRelativeKP {
   static constexpr std::uint32_t kHashValue =
       0xF9180919u ^ (AssociatedKing == Side::kFriend);
   // Piece type excluding balls
-  static constexpr IndexType kNumPieceKinds = (PieceSquare::PS_END - PieceSquare::PS_W_PAWN) / SQUARE_NB;
+  static constexpr IndexType kNumPieceKinds = 5 * 2;
   // width of the virtual board with the ball in the center
   static constexpr IndexType kBoardWidth = FILE_NB * 2 - 1;
   // height of a virtual board with balls in the center
@@ -34,7 +34,7 @@ class HalfRelativeKP {
   static constexpr IndexType kDimensions =
       kNumPieceKinds * kBoardHeight * kBoardWidth;
   // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-  static constexpr IndexType kMaxActiveDimensions = PieceId::PIECE_ID_KING;
+  static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
   // Timing of full calculation instead of difference calculation
   static constexpr TriggerEvent kRefreshTrigger =
       (AssociatedKing == Side::kFriend) ?
@@ -49,12 +49,9 @@ class HalfRelativeKP {
                                    IndexList* removed, IndexList* added);
 
   // Find the index of the feature quantity from the ball position and PieceSquare
-  static IndexType MakeIndex(Square sq_k, PieceSquare p);
-
- private:
-  // Get the piece information
-  static void GetPieces(const Position& pos, Color perspective,
-                        PieceSquare** pieces, Square* sq_target_k);
+  static IndexType MakeIndex(Square s, IndexType p);
+  // Find the index of the feature quantity from the ball position and PieceSquare
+  static IndexType MakeIndex(Color perspective, Square s, Piece pc, Square sq_k);
 };
 
 }  // namespace Features
diff --git a/src/nnue/features/k.cpp b/src/nnue/features/k.cpp
index 001e4b98..314b1338 100644
--- a/src/nnue/features/k.cpp
+++ b/src/nnue/features/k.cpp
@@ -11,19 +11,21 @@ namespace NNUE {
 
 namespace Features {
 
+// Orient a square according to perspective (rotates by 180 for black)
+inline Square orient(Color perspective, Square s) {
+  return Square(int(s) ^ (bool(perspective) * 63));
+}
+
+// Index of a feature for a given king position.
+IndexType K::MakeIndex(Color perspective, Square s, Color king_color) {
+  return IndexType(orient(perspective, s) + bool(perspective ^ king_color) * 64);
+}
+
 // Get a list of indices with a value of 1 among the features
 void K::AppendActiveIndices(
     const Position& pos, Color perspective, IndexList* active) {
-  // do nothing if array size is small to avoid compiler warning
-  if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
-
-  const PieceSquare* pieces = (perspective == BLACK) ?
-      pos.eval_list()->piece_list_fb() :
-      pos.eval_list()->piece_list_fw();
-  assert(pieces[PieceId::PIECE_ID_BKING] != PieceSquare::PS_NONE);
-  assert(pieces[PieceId::PIECE_ID_WKING] != PieceSquare::PS_NONE);
-  for (PieceId i = PieceId::PIECE_ID_KING; i < PieceId::PIECE_ID_NONE; ++i) {
-    active->push_back(pieces[i] - PieceSquare::PS_END);
+  for (auto color : Colors) {
+    active->push_back(MakeIndex(perspective, pos.square<KING>(color), color));
   }
 }
 
@@ -32,12 +34,19 @@ void K::AppendChangedIndices(
     const Position& pos, Color perspective,
     IndexList* removed, IndexList* added) {
   const auto& dp = pos.state()->dirtyPiece;
-  if (dp.pieceId[0] >= PieceId::PIECE_ID_KING) {
-    removed->push_back(
-        dp.old_piece[0].from[perspective] - PieceSquare::PS_END);
-    added->push_back(
-        dp.new_piece[0].from[perspective] - PieceSquare::PS_END);
+  Color king_color;
+  if (dp.piece[0] == Piece::W_KING) {
+    king_color = WHITE;
   }
+  else if (dp.piece[0] == Piece::B_KING) {
+    king_color = BLACK;
+  }
+  else {
+    return;
+  }
+
+  removed->push_back(MakeIndex(perspective, dp.from[0], king_color));
+  added->push_back(MakeIndex(perspective, dp.to[0], king_color));
 }
 
 }  // namespace Features
diff --git a/src/nnue/features/k.h b/src/nnue/features/k.h
index 28431010..0c394f4e 100644
--- a/src/nnue/features/k.h
+++ b/src/nnue/features/k.h
@@ -35,6 +35,10 @@ class K {
   // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
   static void AppendChangedIndices(const Position& pos, Color perspective,
                                    IndexList* removed, IndexList* added);
+
+private:
+  // Index of a feature for a given king position.
+  static IndexType MakeIndex(Color perspective, Square s, Color king_color);
 };
 
 }  // namespace Features
diff --git a/src/nnue/features/p.cpp b/src/nnue/features/p.cpp
index 8b24f544..b4a6faf9 100644
--- a/src/nnue/features/p.cpp
+++ b/src/nnue/features/p.cpp
@@ -11,19 +11,24 @@ namespace NNUE {
 
 namespace Features {
 
+// Orient a square according to perspective (rotates by 180 for black)
+inline Square orient(Color perspective, Square s) {
+  return Square(int(s) ^ (bool(perspective) * 63));
+}
+
+// Find the index of the feature quantity from the king position and PieceSquare
+inline IndexType P::MakeIndex(
+  Color perspective, Square s, Piece pc) {
+  return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
+}
+
 // Get a list of indices with a value of 1 among the features
 void P::AppendActiveIndices(
     const Position& pos, Color perspective, IndexList* active) {
-  // do nothing if array size is small to avoid compiler warning
-  if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
-
-  const PieceSquare* pieces = (perspective == BLACK) ?
-      pos.eval_list()->piece_list_fb() :
-      pos.eval_list()->piece_list_fw();
-  for (PieceId i = PieceId::PIECE_ID_ZERO; i < PieceId::PIECE_ID_KING; ++i) {
-    if (pieces[i] != PieceSquare::PS_NONE) {
-      active->push_back(pieces[i]);
-    }
+  Bitboard bb = pos.pieces() & ~pos.pieces(KING);
+  while (bb) {
+    Square s = pop_lsb(&bb);
+    active->push_back(MakeIndex(perspective, s, pos.piece_on(s)));
   }
 }
 
@@ -33,13 +38,12 @@ void P::AppendChangedIndices(
     IndexList* removed, IndexList* added) {
   const auto& dp = pos.state()->dirtyPiece;
   for (int i = 0; i < dp.dirty_num; ++i) {
-    if (dp.pieceId[i] >= PieceId::PIECE_ID_KING) continue;
-    if (dp.old_piece[i].from[perspective] != PieceSquare::PS_NONE) {
-      removed->push_back(dp.old_piece[i].from[perspective]);
-    }
-    if (dp.new_piece[i].from[perspective] != PieceSquare::PS_NONE) {
-      added->push_back(dp.new_piece[i].from[perspective]);
-    }
+    Piece pc = dp.piece[i];
+    if (type_of(pc) == KING) continue;
+    if (dp.from[i] != SQ_NONE)
+      removed->push_back(MakeIndex(perspective, dp.from[i], pc));
+    if (dp.to[i] != SQ_NONE)
+      added->push_back(MakeIndex(perspective, dp.to[i], pc));
   }
 }
 
diff --git a/src/nnue/features/p.h b/src/nnue/features/p.h
index 2a83c4ad..b3d4191e 100644
--- a/src/nnue/features/p.h
+++ b/src/nnue/features/p.h
@@ -22,9 +22,9 @@ class P {
   // Hash value embedded in the evaluation function file
   static constexpr std::uint32_t kHashValue = 0x764CFB4Bu;
   // number of feature dimensions
-  static constexpr IndexType kDimensions = PieceSquare::PS_END;
+  static constexpr IndexType kDimensions = PS_END;
   // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-  static constexpr IndexType kMaxActiveDimensions = PieceId::PIECE_ID_KING;
+  static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
   // Timing of full calculation instead of difference calculation
   static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
 
@@ -35,6 +35,10 @@ class P {
   // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
   static void AppendChangedIndices(const Position& pos, Color perspective,
                                    IndexList* removed, IndexList* added);
+
+ private:
+  // Index of a feature for a given piece on some square
+  static IndexType MakeIndex(Color perspective, Square s, Piece pc);
 };
 
 }  // namespace Features
diff --git a/src/nnue/trainer/features/factorizer_half_kp.h b/src/nnue/trainer/features/factorizer_half_kp.h
index 48a99797..955894e8 100644
--- a/src/nnue/trainer/features/factorizer_half_kp.h
+++ b/src/nnue/trainer/features/factorizer_half_kp.h
@@ -62,8 +62,8 @@ class Factorizer<HalfKP<AssociatedKing>> {
     IndexType index_offset = AppendBaseFeature<FeatureType>(
         kProperties[kFeaturesHalfKP], base_index, training_features);
 
-    const auto sq_k = static_cast<Square>(base_index / PieceSquare::PS_END);
-    const auto p = static_cast<PieceSquare>(base_index % PieceSquare::PS_END);
+    const auto sq_k = static_cast<Square>(base_index / PS_END);
+    const auto p = static_cast<IndexType>(base_index % PS_END);
     // kFeaturesHalfK
     {
       const auto& properties = kProperties[kFeaturesHalfK];
@@ -76,7 +76,7 @@ class Factorizer<HalfKP<AssociatedKing>> {
     index_offset += InheritFeaturesIfRequired<P>(
         index_offset, kProperties[kFeaturesP], p, training_features);
     // kFeaturesHalfRelativeKP
-    if (p >= PieceSquare::PS_W_PAWN) {
+    if (p >= PS_W_PAWN) {
       index_offset += InheritFeaturesIfRequired<HalfRelativeKP<AssociatedKing>>(
           index_offset, kProperties[kFeaturesHalfRelativeKP],
           HalfRelativeKP<AssociatedKing>::MakeIndex(sq_k, p),
diff --git a/src/types.h b/src/types.h
index d34781e5..bcc4f77f 100644
--- a/src/types.h
+++ b/src/types.h
@@ -444,6 +444,11 @@ constexpr Square to_sq(Move m) {
   return Square(m & 0x3F);
 }
 
+// Return relative square when turning the board 180 degrees
+constexpr Square rotate180(Square sq) {
+    return (Square)(sq ^ 0x3F);
+}
+
 constexpr int from_to(Move m) {
  return m & 0xFFF;
 }

From 9f2f31632cbb9fc617e7044aebbfb0f2ca87c7ab Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sat, 29 Aug 2020 08:17:24 +0900
Subject: [PATCH 180/583] Fixed build errors.

---
 src/Makefile | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 75e39557..9372b915 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -40,7 +40,6 @@ PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 100000
 SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \
 	material.cpp misc.cpp movegen.cpp movepick.cpp pawns.cpp position.cpp psqt.cpp \
 	search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
-	eval/evaluate_mir_inv_tools.cpp \
 	nnue/evaluate_nnue.cpp \
 	nnue/evaluate_nnue_learner.cpp \
 	nnue/features/half_kp.cpp \

From d2586623833411823fb9e88234b391470d3fce87 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sat, 29 Aug 2020 09:12:10 +0900
Subject: [PATCH 181/583] Update README.md

---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8cacd1b3..6d28a998 100644
--- a/README.md
+++ b/README.md
@@ -17,10 +17,12 @@ setoption name Threads value x
 setoption name Hash value y
 setoption name SyzygyPath value path
 isready
-gensfen depth a loop b use_draw_in_training_data_generation 1 eval_limit 32000
+gensfen depth a loop b use_draw_in_training_data_generation 1 eval_limit 32000 use_raw_nnue_eval 0
 ```
 Specify how many threads and how much memory you would like to use with the x and y values. The option SyzygyPath is not necessary, but if you would like to use it, you must first have Syzygy endgame tablebases on your computer, which you can find [here](http://oics.olympuschess.com/tracker/index.php). You will need to have a torrent client to download these tablebases, as that is probably the fastest way to obtain them. The path is the path to the folder containing those tablebases. It does not have to be surrounded in quotes.
 
+use_raw_nnue_eval controls if the training data generator or trainer uses raw NNUE eval values.  Don't forget to set use_raw_nnue_eval 0 when initial training data are generated.  Otherwise, the gensfen command will crash.
+
 This will save a file named "generated_kifu.bin" in the same folder as the binary. Once generation is done, rename the file to something like "1billiondepth12.bin" to remember the depth and quantity of the positions and move it to a folder named "trainingdata" in the same directory as the binaries.
 #### Generation Parameters
 - Depth is the searched depth per move, or how far the engine looks forward. This value is an integer.

From 7d6668515c5b044df66ad1cdc3a1f75843cf5f56 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sun, 30 Aug 2020 14:54:07 +0900
Subject: [PATCH 182/583] Added -static link option to the learn and
 profile-learn targets.

---
 src/Makefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 9372b915..cc63ab15 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -889,7 +889,7 @@ icc-profile-use:
 learn: config-sanity
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
 	EXTRACXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp ' \
-	EXTRALDFLAGS=' -lopenblas -fopenmp -Wl,-s ' \
+	EXTRALDFLAGS=' -lopenblas -fopenmp -Wl,-s -static ' \
 	all
 	
 profile-learn: config-sanity objclean profileclean
@@ -897,7 +897,7 @@ profile-learn: config-sanity objclean profileclean
 	@echo "Step 1/4. Building instrumented executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) \
 	LEARNCXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp ' \
-	LEARNLDFLAGS=' -lopenblas -fopenmp -Wl,-s '
+	LEARNLDFLAGS=' -lopenblas -fopenmp -Wl,-s -static '
 	@echo ""
 	@echo "Step 2/4. Running benchmark for pgo-build ..."
 	$(PGOGENSFEN) 
@@ -906,7 +906,7 @@ profile-learn: config-sanity objclean profileclean
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) objclean
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_use) \
 	LEARNCXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp ' \
-	LEARNLDFLAGS=' -lopenblas -fopenmp -Wl,-s '
+	LEARNLDFLAGS=' -lopenblas -fopenmp -Wl,-s -static '
 	@echo ""
 	@echo "Step 4/4. Deleting profile data ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) profileclean

From c17f2b15fdfdb44fc4ef2ca73c58d1d1097f101e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 2 Sep 2020 15:17:18 +0200
Subject: [PATCH 183/583] General cleanup of learner.cpp.

---
 src/Makefile               |    6 +-
 src/eval/evaluate_common.h |    2 +
 src/learn/convert.cpp      |  515 ++++
 src/learn/gensfen.cpp      | 1181 +++++++++
 src/learn/learn.h          |   42 +
 src/learn/learner.cpp      | 4886 +++++++++++++-----------------------
 src/learn/multi_think.h    |    5 +-
 7 files changed, 3452 insertions(+), 3185 deletions(-)
 create mode 100644 src/learn/convert.cpp
 create mode 100644 src/learn/gensfen.cpp

diff --git a/src/Makefile b/src/Makefile
index cc63ab15..0c6b21e5 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -52,6 +52,8 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
 	extra/sfen_packer.cpp \
 	learn/gensfen2019.cpp \
 	learn/learner.cpp \
+	learn/gensfen.cpp \
+	learn/convert.cpp \
 	learn/learning_tools.cpp \
 	learn/multi_think.cpp
 
@@ -891,7 +893,7 @@ learn: config-sanity
 	EXTRACXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp ' \
 	EXTRALDFLAGS=' -lopenblas -fopenmp -Wl,-s -static ' \
 	all
-	
+
 profile-learn: config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 1/4. Building instrumented executable ..."
@@ -900,7 +902,7 @@ profile-learn: config-sanity objclean profileclean
 	LEARNLDFLAGS=' -lopenblas -fopenmp -Wl,-s -static '
 	@echo ""
 	@echo "Step 2/4. Running benchmark for pgo-build ..."
-	$(PGOGENSFEN) 
+	$(PGOGENSFEN)
 	@echo ""
 	@echo "Step 3/4. Building optimized executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) objclean
diff --git a/src/eval/evaluate_common.h b/src/eval/evaluate_common.h
index b043f2e1..dacbd2ba 100644
--- a/src/eval/evaluate_common.h
+++ b/src/eval/evaluate_common.h
@@ -15,6 +15,8 @@
 // KPP file name
 #define KPP_BIN "KPP_synthesized.bin"
 
+#include "../position.h"
+
 namespace Eval
 {
 
diff --git a/src/learn/convert.cpp b/src/learn/convert.cpp
new file mode 100644
index 00000000..ebee8a96
--- /dev/null
+++ b/src/learn/convert.cpp
@@ -0,0 +1,515 @@
+#define EVAL_LEARN
+
+#if defined(EVAL_LEARN)
+
+// evaluate header for learning
+#include "../eval/evaluate_common.h"
+
+#include "learn.h"
+#include "multi_think.h"
+#include "../uci.h"
+#include "../syzygy/tbprobe.h"
+#include "../misc.h"
+#include "../thread.h"
+#include "../position.h"
+#include "../tt.h"
+
+#include <sstream>
+#include <fstream>
+#include <unordered_set>
+#include <iomanip>
+#include <list>
+#include <cmath>    // std::exp(),std::pow(),std::log()
+#include <cstring>  // memcpy()
+#include <memory>
+#include <limits>
+#include <optional>
+#include <chrono>
+#include <random>
+#include <regex>
+
+#if defined (_OPENMP)
+#include <omp.h>
+#endif
+
+#if defined(_MSC_VER)
+// The C++ filesystem cannot be used unless it is C++17 or later or MSVC.
+// I tried to use windows.h, but with g++ of msys2 I can not get the files in the folder well.
+// Use dirent.h because there is no help for it.
+#include <filesystem>
+#elif defined(__GNUC__)
+#include <dirent.h>
+#endif
+
+using namespace std;
+
+namespace Learner
+{
+    bool fen_is_ok(Position& pos, std::string input_fen) {
+        std::string pos_fen = pos.fen();
+        std::istringstream ss_input(input_fen);
+        std::istringstream ss_pos(pos_fen);
+
+        // example : "2r4r/4kpp1/nb1np3/p2p3p/B2P1BP1/PP6/4NPKP/2R1R3 w - h6 0 24"
+        //       --> "2r4r/4kpp1/nb1np3/p2p3p/B2P1BP1/PP6/4NPKP/2R1R3"
+        std::string str_input, str_pos;
+        ss_input >> str_input;
+        ss_pos >> str_pos;
+
+        // Only compare "Piece placement field" between input_fen and pos.fen().
+        return str_input == str_pos;
+    }
+
+    void convert_bin(
+        const vector<string>& filenames, 
+        const string& output_file_name, 
+        const int ply_minimum, 
+        const int ply_maximum, 
+        const int interpolate_eval, 
+        const int src_score_min_value,
+        const int src_score_max_value,
+        const int dest_score_min_value,
+        const int dest_score_max_value,
+        const bool check_invalid_fen, 
+        const bool check_illegal_move)
+    {
+        std::cout << "check_invalid_fen=" << check_invalid_fen << std::endl;
+        std::cout << "check_illegal_move=" << check_illegal_move << std::endl;
+
+        std::fstream fs;
+        uint64_t data_size = 0;
+        uint64_t filtered_size = 0;
+        uint64_t filtered_size_fen = 0;
+        uint64_t filtered_size_move = 0;
+        uint64_t filtered_size_ply = 0;
+        auto th = Threads.main();
+        auto& tpos = th->rootPos;
+        // convert plain rag to packed sfenvalue for Yaneura king
+        fs.open(output_file_name, ios::app | ios::binary);
+        StateListPtr states;
+        for (auto filename : filenames) {
+            std::cout << "convert " << filename << " ... ";
+            std::string line;
+            ifstream ifs;
+            ifs.open(filename);
+            PackedSfenValue p;
+            data_size = 0;
+            filtered_size = 0;
+            filtered_size_fen = 0;
+            filtered_size_move = 0;
+            filtered_size_ply = 0;
+            p.gamePly = 1; // Not included in apery format. Should be initialized
+            bool ignore_flag_fen = false;
+            bool ignore_flag_move = false;
+            bool ignore_flag_ply = false;
+            while (std::getline(ifs, line)) {
+                std::stringstream ss(line);
+                std::string token;
+                std::string value;
+                ss >> token;
+                if (token == "fen") {
+                    states = StateListPtr(new std::deque<StateInfo>(1)); // Drop old and create a new one
+                    std::string input_fen = line.substr(4);
+                    tpos.set(input_fen, false, &states->back(), Threads.main());
+                    if (check_invalid_fen && !fen_is_ok(tpos, input_fen)) {
+                        ignore_flag_fen = true;
+                        filtered_size_fen++;
+                    }
+                    else {
+                        tpos.sfen_pack(p.sfen);
+                    }
+                }
+                else if (token == "move") {
+                    ss >> value;
+                    Move move = UCI::to_move(tpos, value);
+                    if (check_illegal_move && move == MOVE_NONE) {
+                        ignore_flag_move = true;
+                        filtered_size_move++;
+                    }
+                    else {
+                        p.move = move;
+                    }
+                }
+                else if (token == "score") {
+                    double score;
+                    ss >> score;
+                    // Training Formula � Issue #71 � nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
+                    // Normalize to [0.0, 1.0].
+                    score = (score - src_score_min_value) / (src_score_max_value - src_score_min_value);
+                    // Scale to [dest_score_min_value, dest_score_max_value].
+                    score = score * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
+                    p.score = Math::clamp((int32_t)std::round(score), -(int32_t)VALUE_MATE, (int32_t)VALUE_MATE);
+                }
+                else if (token == "ply") {
+                    int temp;
+                    ss >> temp;
+                    if (temp < ply_minimum || temp > ply_maximum) {
+                        ignore_flag_ply = true;
+                        filtered_size_ply++;
+                    }
+                    p.gamePly = uint16_t(temp); // No cast here?
+                    if (interpolate_eval != 0) {
+                        p.score = min(3000, interpolate_eval * temp);
+                    }
+                }
+                else if (token == "result") {
+                    int temp;
+                    ss >> temp;
+                    p.game_result = int8_t(temp); // Do you need a cast here?
+                    if (interpolate_eval) {
+                        p.score = p.score * p.game_result;
+                    }
+                }
+                else if (token == "e") {
+                    if (!(ignore_flag_fen || ignore_flag_move || ignore_flag_ply)) {
+                        fs.write((char*)&p, sizeof(PackedSfenValue));
+                        data_size += 1;
+                        // debug
+                        // std::cout<<tpos<<std::endl;
+                        // std::cout<<p.score<<","<<int(p.gamePly)<<","<<int(p.game_result)<<std::endl;
+                    }
+                    else {
+                        filtered_size++;
+                    }
+                    ignore_flag_fen = false;
+                    ignore_flag_move = false;
+                    ignore_flag_ply = false;
+                }
+            }
+            std::cout << "done " << data_size << " parsed " << filtered_size << " is filtered"
+                << " (invalid fen:" << filtered_size_fen << ", illegal move:" << filtered_size_move << ", invalid ply:" << filtered_size_ply << ")" << std::endl;
+            ifs.close();
+        }
+        std::cout << "all done" << std::endl;
+        fs.close();
+    }
+
+    static inline void ltrim(std::string& s) {
+        s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](int ch) {
+            return !std::isspace(ch);
+            }));
+    }
+
+    static inline void rtrim(std::string& s) {
+        s.erase(std::find_if(s.rbegin(), s.rend(), [](int ch) {
+            return !std::isspace(ch);
+            }).base(), s.end());
+    }
+
+    static inline void trim(std::string& s) {
+        ltrim(s);
+        rtrim(s);
+    }
+
+    int parse_game_result_from_pgn_extract(std::string result) {
+        // White Win
+        if (result == "\"1-0\"") {
+            return 1;
+        }
+        // Black Win
+        else if (result == "\"0-1\"") {
+            return -1;
+        }
+        // Draw
+        else {
+            return 0;
+        }
+    }
+
+    // 0.25 -->  0.25 * PawnValueEg
+    // #-4  --> -mate_in(4)
+    // #3   -->  mate_in(3)
+    // -M4  --> -mate_in(4)
+    // +M3  -->  mate_in(3)
+    Value parse_score_from_pgn_extract(std::string eval, bool& success) {
+        success = true;
+
+        if (eval.substr(0, 1) == "#") {
+            if (eval.substr(1, 1) == "-") {
+                return -mate_in(stoi(eval.substr(2, eval.length() - 2)));
+            }
+            else {
+                return mate_in(stoi(eval.substr(1, eval.length() - 1)));
+            }
+        }
+        else if (eval.substr(0, 2) == "-M") {
+            //std::cout << "eval=" << eval << std::endl;
+            return -mate_in(stoi(eval.substr(2, eval.length() - 2)));
+        }
+        else if (eval.substr(0, 2) == "+M") {
+            //std::cout << "eval=" << eval << std::endl;
+            return mate_in(stoi(eval.substr(2, eval.length() - 2)));
+        }
+        else {
+            char* endptr;
+            double value = strtod(eval.c_str(), &endptr);
+
+            if (*endptr != '\0') {
+                success = false;
+                return VALUE_ZERO;
+            }
+            else {
+                return Value(value * static_cast<double>(PawnValueEg));
+            }
+        }
+    }
+
+    // for Debug
+    //#define DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT
+
+    bool is_like_fen(std::string fen) {
+        int count_space = std::count(fen.cbegin(), fen.cend(), ' ');
+        int count_slash = std::count(fen.cbegin(), fen.cend(), '/');
+
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+        //std::cout << "count_space=" << count_space << std::endl;
+        //std::cout << "count_slash=" << count_slash << std::endl;
+#endif
+
+        return count_space == 5 && count_slash == 7;
+    }
+
+    void convert_bin_from_pgn_extract(
+        const vector<string>& filenames, 
+        const string& output_file_name, 
+        const bool pgn_eval_side_to_move, 
+        const bool convert_no_eval_fens_as_score_zero)
+    {
+        std::cout << "pgn_eval_side_to_move=" << pgn_eval_side_to_move << std::endl;
+        std::cout << "convert_no_eval_fens_as_score_zero=" << convert_no_eval_fens_as_score_zero << std::endl;
+
+        auto th = Threads.main();
+        auto& pos = th->rootPos;
+
+        std::fstream ofs;
+        ofs.open(output_file_name, ios::out | ios::binary);
+
+        int game_count = 0;
+        int fen_count = 0;
+
+        for (auto filename : filenames) {
+            std::cout << now_string() << " convert " << filename << std::endl;
+            ifstream ifs;
+            ifs.open(filename);
+
+            int game_result = 0;
+
+            std::string line;
+            while (std::getline(ifs, line)) {
+
+                if (line.empty()) {
+                    continue;
+                }
+
+                else if (line.substr(0, 1) == "[") {
+                    std::regex pattern_result(R"(\[Result (.+?)\])");
+                    std::smatch match;
+
+                    // example: [Result "1-0"]
+                    if (std::regex_search(line, match, pattern_result)) {
+                        game_result = parse_game_result_from_pgn_extract(match.str(1));
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+                        std::cout << "game_result=" << game_result << std::endl;
+#endif
+                        game_count++;
+                        if (game_count % 10000 == 0) {
+                            std::cout << now_string() << " game_count=" << game_count << ", fen_count=" << fen_count << std::endl;
+                        }
+                    }
+
+                    continue;
+                }
+
+                else {
+                    int gamePly = 1;
+                    auto itr = line.cbegin();
+
+                    while (true) {
+                        gamePly++;
+
+                        PackedSfenValue psv;
+                        memset((char*)&psv, 0, sizeof(PackedSfenValue));
+
+                        // fen
+                        {
+                            bool fen_found = false;
+
+                            while (!fen_found) {
+                                std::regex pattern_bracket(R"(\{(.+?)\})");
+                                std::smatch match;
+                                if (!std::regex_search(itr, line.cend(), match, pattern_bracket)) {
+                                    break;
+                                }
+
+                                itr += match.position(0) + match.length(0) - 1;
+                                std::string str_fen = match.str(1);
+                                trim(str_fen);
+
+                                if (is_like_fen(str_fen)) {
+                                    fen_found = true;
+
+                                    StateInfo si;
+                                    pos.set(str_fen, false, &si, th);
+                                    pos.sfen_pack(psv.sfen);
+                                }
+
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+                                std::cout << "str_fen=" << str_fen << std::endl;
+                                std::cout << "fen_found=" << fen_found << std::endl;
+#endif
+                            }
+
+                            if (!fen_found) {
+                                break;
+                            }
+                        }
+
+                        // move
+                        {
+                            std::regex pattern_move(R"(\}(.+?)\{)");
+                            std::smatch match;
+                            if (!std::regex_search(itr, line.cend(), match, pattern_move)) {
+                                break;
+                            }
+
+                            itr += match.position(0) + match.length(0) - 1;
+                            std::string str_move = match.str(1);
+                            trim(str_move);
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+                            std::cout << "str_move=" << str_move << std::endl;
+#endif
+                            psv.move = UCI::to_move(pos, str_move);
+                        }
+
+                        // eval
+                        bool eval_found = false;
+                        {
+                            std::regex pattern_bracket(R"(\{(.+?)\})");
+                            std::smatch match;
+                            if (!std::regex_search(itr, line.cend(), match, pattern_bracket)) {
+                                break;
+                            }
+
+                            std::string str_eval_clk = match.str(1);
+                            trim(str_eval_clk);
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+                            std::cout << "str_eval_clk=" << str_eval_clk << std::endl;
+#endif
+
+                            // example: { [%eval 0.25] [%clk 0:10:00] }
+                            // example: { [%eval #-4] [%clk 0:10:00] }
+                            // example: { [%eval #3] [%clk 0:10:00] }
+                            // example: { +0.71/22 1.2s }
+                            // example: { -M4/7 0.003s }
+                            // example: { M3/245 0.017s }
+                            // example: { +M1/245 0.010s, White mates }
+                            // example: { 0.60 }
+                            // example: { book }
+                            // example: { rnbqkb1r/pp3ppp/2p1pn2/3p4/2PP4/2N2N2/PP2PPPP/R1BQKB1R w KQkq - 0 5 }
+
+                            // Considering the absence of eval
+                            if (!is_like_fen(str_eval_clk)) {
+                                itr += match.position(0) + match.length(0) - 1;
+
+                                if (str_eval_clk != "book") {
+                                    std::regex pattern_eval1(R"(\[\%eval (.+?)\])");
+                                    std::regex pattern_eval2(R"((.+?)\/)");
+
+                                    std::string str_eval;
+                                    if (std::regex_search(str_eval_clk, match, pattern_eval1) ||
+                                        std::regex_search(str_eval_clk, match, pattern_eval2)) {
+                                        str_eval = match.str(1);
+                                        trim(str_eval);
+                                    }
+                                    else {
+                                        str_eval = str_eval_clk;
+                                    }
+
+                                    bool success = false;
+                                    Value value = parse_score_from_pgn_extract(str_eval, success);
+                                    if (success) {
+                                        eval_found = true;
+                                        psv.score = Math::clamp(value, -VALUE_MATE, VALUE_MATE);
+                                    }
+
+#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
+                                    std::cout << "str_eval=" << str_eval << std::endl;
+                                    std::cout << "success=" << success << ", psv.score=" << psv.score << std::endl;
+#endif
+                                }
+                            }
+                        }
+
+                        // write
+                        if (eval_found || convert_no_eval_fens_as_score_zero) {
+                            if (!eval_found && convert_no_eval_fens_as_score_zero) {
+                                psv.score = 0;
+                            }
+
+                            psv.gamePly = gamePly;
+                            psv.game_result = game_result;
+
+                            if (pos.side_to_move() == BLACK) {
+                                if (!pgn_eval_side_to_move) {
+                                    psv.score *= -1;
+                                }
+                                psv.game_result *= -1;
+                            }
+
+                            ofs.write((char*)&psv, sizeof(PackedSfenValue));
+
+                            fen_count++;
+                        }
+                    }
+
+                    game_result = 0;
+                }
+            }
+        }
+
+        std::cout << now_string() << " game_count=" << game_count << ", fen_count=" << fen_count << std::endl;
+        std::cout << now_string() << " all done" << std::endl;
+        ofs.close();
+    }
+
+    void convert_plain(
+        const vector<string>& filenames, 
+        const string& output_file_name)
+    {
+        Position tpos;
+        std::ofstream ofs;
+        ofs.open(output_file_name, ios::app);
+        auto th = Threads.main();
+        for (auto filename : filenames) {
+            std::cout << "convert " << filename << " ... ";
+
+            // Just convert packedsfenvalue to text
+            std::fstream fs;
+            fs.open(filename, ios::in | ios::binary);
+            PackedSfenValue p;
+            while (true)
+            {
+                if (fs.read((char*)&p, sizeof(PackedSfenValue))) {
+                    StateInfo si;
+                    tpos.set_from_packed_sfen(p.sfen, &si, th, false);
+
+                    // write as plain text
+                    ofs << "fen " << tpos.fen() << std::endl;
+                    ofs << "move " << UCI::move(Move(p.move), false) << std::endl;
+                    ofs << "score " << p.score << std::endl;
+                    ofs << "ply " << int(p.gamePly) << std::endl;
+                    ofs << "result " << int(p.game_result) << std::endl;
+                    ofs << "e" << std::endl;
+                }
+                else {
+                    break;
+                }
+            }
+            fs.close();
+            std::cout << "done" << std::endl;
+        }
+        ofs.close();
+        std::cout << "all done" << std::endl;
+    }
+}
+#endif
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
new file mode 100644
index 00000000..38bed2d5
--- /dev/null
+++ b/src/learn/gensfen.cpp
@@ -0,0 +1,1181 @@
+﻿#define EVAL_LEARN
+
+#if defined(EVAL_LEARN)
+
+#include "../eval/evaluate_common.h"
+
+#include "learn.h"
+#include "multi_think.h"
+#include "../misc.h"
+#include "../thread.h"
+#include "../position.h"
+#include "../tt.h"
+#include "../uci.h"
+#include "../syzygy/tbprobe.h"
+
+#if defined(USE_BOOK)
+#include "../extra/book/book.h"
+#endif
+
+#include <chrono>
+#include <random>
+#include <regex>
+#include <sstream>
+#include <fstream>
+#include <unordered_set>
+#include <iomanip>
+#include <list>
+#include <cmath>
+#include <cstring>
+#include <memory>
+#include <limits>
+#include <optional>
+
+#if defined (_OPENMP)
+#include <omp.h>
+#endif
+
+#if defined(_MSC_VER)
+// std::filesystem doesn't work on GCC even though it claims to support C++17.
+#include <filesystem>
+#elif defined(__GNUC__)
+#include <dirent.h>
+#endif
+
+#if defined(EVAL_NNUE)
+#include "../nnue/evaluate_nnue_learner.h"
+#include <climits>
+#include <shared_mutex>
+#endif
+
+using namespace std; 
+
+namespace Learner
+{
+    static bool write_out_draw_game_in_training_data_generation = false;
+    static bool detect_draw_by_consecutive_low_score = false;
+    static bool detect_draw_by_insufficient_mating_material = false;
+
+    // Use raw NNUE eval value in the Eval::evaluate(). 
+    // If hybrid eval is enabled, training data
+    // generation and training don't work well.
+    // https://discordapp.com/channels/435943710472011776/733545871911813221/748524079761326192
+    static bool use_raw_nnue_eval = true;
+
+    // Helper class for exporting Sfen
+    struct SfenWriter
+    {
+        // Amount of sfens required to flush the buffer.
+        static constexpr size_t SFEN_WRITE_SIZE = 5000;
+
+        // Current status is output after 
+        // each (SFEN_WRITE_SIZE * STATUS_OUTPUT_PERIOD) sfens
+        static constexpr uint64_t STATUS_OUTPUT_PERIOD = 40;
+
+        // File name to write and number of threads to create
+        SfenWriter(string filename_, int thread_num)
+        {
+            sfen_buffers_pool.reserve((size_t)thread_num * 10);
+            sfen_buffers.resize(thread_num);
+
+            output_file_stream.open(filename_, ios::out | ios::binary | ios::app);
+            filename = filename_;
+
+            finished = false;
+        }
+
+        ~SfenWriter()
+        {
+            finished = true;
+            file_worker_thread.join();
+            output_file_stream.close();
+
+#if !defined(DNDEBUG)
+            {
+                // All buffers should be empty since file_worker_thread
+                // should have written everything before exiting.
+                for (const auto& p : sfen_buffers) { assert(p == nullptr); }
+                assert(sfen_buffers_pool.empty());
+            }
+#endif
+        }
+
+        void write(size_t thread_id, const PackedSfenValue& psv)
+        {
+            // We have a buffer for each thread and add it there.
+            // If the buffer overflows, write it to a file.
+
+            // This buffer is prepared for each thread.
+            auto& buf = sfen_buffers[thread_id];
+
+            // Secure since there is no buf at the first time 
+            // and immediately after writing the thread buffer.
+            if (!buf)
+            {
+                buf = std::make_unique<PSVector>();
+                buf->reserve(SFEN_WRITE_SIZE);
+            }
+
+            // Buffer is exclusive to this thread.
+            // There is no need for a critical section.
+            buf->push_back(psv);
+
+            if (buf->size() >= SFEN_WRITE_SIZE)
+            {
+                // If you load it in sfen_buffers_pool, the worker will do the rest.
+
+                // Critical section since sfen_buffers_pool is shared among threads.
+                std::unique_lock<std::mutex> lk(mutex);
+                sfen_buffers_pool.emplace_back(std::move(buf));
+            }
+        }
+
+        // Move what remains in the buffer for your thread to a buffer for writing to a file.
+        void finalize(size_t thread_id)
+        {
+            std::unique_lock<std::mutex> lk(mutex);
+
+            auto& buf = sfen_buffers[thread_id];
+
+            // There is a case that buf==nullptr, so that check is necessary.
+            if (buf && buf->size() != 0)
+            {
+                sfen_buffers_pool.emplace_back(std::move(buf));
+            }
+        }
+
+        // Start the write_worker thread.
+        void start_file_write_worker()
+        {
+            file_worker_thread = std::thread([&] { this->file_write_worker(); });
+        }
+
+        // Dedicated thread to write to file
+        void file_write_worker()
+        {
+            auto output_status = [&]()
+            {
+                // Also output the current time to console.
+                sync_cout << endl << sfen_write_count << " sfens , at " << now_string() << sync_endl;
+
+                // This is enough for flush().
+                output_file_stream.flush();
+            };
+
+            while (!finished || sfen_buffers_pool.size())
+            {
+                vector<std::unique_ptr<PSVector>> buffers;
+                {
+                    std::unique_lock<std::mutex> lk(mutex);
+
+                    // Atomically swap take the filled buffers and
+                    // create a new buffer pool for threads to fill.
+                    buffers = std::move(sfen_buffers_pool);
+                    sfen_buffers_pool = std::vector<std::unique_ptr<PSVector>>();
+                }
+
+                if (!buffers.size())
+                {
+                    // Poor man's condition variable.
+                    sleep(100);
+                }
+                else
+                {
+                    for (auto& buf : buffers)
+                    {
+                        output_file_stream.write(reinterpret_cast<const char*>(buf->data()), sizeof(PackedSfenValue) * buf->size());
+
+                        sfen_write_count += buf->size();
+#if 1
+                        // Add the processed number here, and if it exceeds save_every, 
+                        // change the file name and reset this counter.
+                        sfen_write_count_current_file += buf->size();
+                        if (sfen_write_count_current_file >= save_every)
+                        {
+                            sfen_write_count_current_file = 0;
+
+                            output_file_stream.close();
+
+                            // Sequential number attached to the file
+                            int n = (int)(sfen_write_count / save_every);
+
+                            // Rename the file and open it again. 
+                            // Add ios::app in consideration of overwriting. 
+                            // (Depending on the operation, it may not be necessary.)
+                            string new_filename = filename + "_" + std::to_string(n);
+                            output_file_stream.open(new_filename, ios::out | ios::binary | ios::app);
+                            cout << endl << "output sfen file = " << new_filename << endl;
+                        }
+#endif
+                        // Output '.' every time when writing a game record.
+                        std::cout << ".";
+
+                        // Output the number of phases processed 
+                        // every STATUS_OUTPUT_PERIOD times
+                        // Finally, the remainder of the teacher phase 
+                        // of each thread is written out, 
+                        // so halfway numbers are displayed, but is it okay?
+                        // If you overuse the threads to the maximum number 
+                        // of logical cores, the console will be clogged, 
+                        // so it may be beneficial to increase that value.
+                        if ((++batch_counter % STATUS_OUTPUT_PERIOD) == 0)
+                        {
+                            output_status();
+                        }
+                    }
+                }
+            }
+
+            // Output the status again after whole processing is done.
+            output_status();
+        }
+
+        void set_save_interval(uint64_t v)
+        {
+            save_every = v;
+        }
+
+    private:
+
+        fstream output_file_stream;
+
+        // A new net is saved after every save_every sfens are processed.
+        uint64_t save_every = std::numeric_limits<uint64_t>::max();
+
+        // File name passed in the constructor
+        std::string filename;
+
+        // Thread to write to the file
+        std::thread file_worker_thread;
+
+        // Flag that all threads have finished
+        atomic<bool> finished;
+
+        // Counter for time stamp output
+        uint64_t batch_counter = 0;
+
+        // buffer before writing to file
+        // sfen_buffers is the buffer for each thread
+        // sfen_buffers_pool is a buffer for writing.
+        // After loading the phase in the former buffer by SFEN_WRITE_SIZE, 
+        // transfer it to the latter.
+        std::vector<std::unique_ptr<PSVector>> sfen_buffers;
+        std::vector<std::unique_ptr<PSVector>> sfen_buffers_pool;
+
+        // Mutex required to access sfen_buffers_pool
+        std::mutex mutex;
+
+        // Number of sfens written in total, and the 
+        // number of sfens written in the current file.
+        uint64_t sfen_write_count = 0;
+        uint64_t sfen_write_count_current_file = 0;
+    };
+
+    // -----------------------------------
+    // worker that creates the game record (for each thread)
+    // -----------------------------------
+
+    // Class to generate sfen with multiple threads
+    struct MultiThinkGenSfen : public MultiThink
+    {
+        // Hash to limit the export of identical sfens
+        static constexpr uint64_t GENSFEN_HASH_SIZE = 64 * 1024 * 1024;
+        // It must be 2**N because it will be used as the mask to calculate hash_index.
+        static_assert((GENSFEN_HASH_SIZE& (GENSFEN_HASH_SIZE - 1)) == 0);
+
+        MultiThinkGenSfen(int search_depth_min_, int search_depth_max_, SfenWriter& sw_) : 
+            search_depth_min(search_depth_min_), 
+            search_depth_max(search_depth_max_), 
+            sfen_writer(sw_)
+        {
+            hash.resize(GENSFEN_HASH_SIZE);
+
+            // Output seed to veryfy by the user if it's not identical by chance.
+            std::cout << prng << std::endl;
+        }
+
+        void start_file_write_worker()
+        {
+            sfen_writer.start_file_write_worker();
+        }
+
+        void thread_worker(size_t thread_id) override;
+
+        optional<int8_t> get_current_game_result(
+            Position& pos,
+            const vector<int>& move_hist_scores) const;
+
+        vector<uint8_t> generate_random_move_flags();
+
+        bool commit_psv(PSVector& a_psv, size_t thread_id, int8_t lastTurnIsWin);
+
+        optional<Move> choose_random_move(
+            Position& pos,
+            std::vector<uint8_t>& random_move_flag,
+            int ply,
+            int& random_move_c);
+
+        Value evaluate_leaf(
+            Position& pos,
+            std::vector<StateInfo, AlignedAllocator<StateInfo>>& states,
+            int ply,
+            vector<Move>& pv);
+
+        // Min and max depths for search during gensfen
+        int search_depth_min;
+        int search_depth_max;
+
+        // Number of the nodes to be searched.
+        // 0 represents no limits.
+        uint64_t nodes;
+
+        // Upper limit of evaluation value of generated situation
+        int eval_limit;
+
+        // minimum ply with random move
+        // maximum ply with random move
+        // Number of random moves in one station
+        int random_move_minply;
+        int random_move_maxply;
+        int random_move_count;
+
+        // Move kings with a probability of 1/N when randomly moving like Apery software.
+        // When you move the king again, there is a 1/N chance that it will randomly moved
+        // once in the opponent's turn.
+        // Apery has N=2. Specifying 0 here disables this function.
+        int random_move_like_apery;
+
+        // For when using multi pv instead of random move.
+        // random_multi_pv is the number of candidates for MultiPV.
+        // When adopting the move of the candidate move, the difference 
+        // between the evaluation value of the move of the 1st place 
+        // and the evaluation value of the move of the Nth place is.
+        // Must be in the range random_multi_pv_diff.
+        // random_multi_pv_depth is the search depth for MultiPV.
+        int random_multi_pv;
+        int random_multi_pv_diff;
+        int random_multi_pv_depth;
+
+        // The minimum and maximum ply (number of steps from 
+        // the initial phase) of the sfens to write out.
+        int write_minply;
+        int write_maxply;
+
+        // sfen exporter
+        SfenWriter& sfen_writer;
+
+        vector<Key> hash; // 64MB*sizeof(HASH_KEY) = 512MB
+    };
+
+    optional<int8_t> MultiThinkGenSfen::get_current_game_result(
+        Position& pos,
+        const vector<int>& move_hist_scores) const
+    {
+        // Variables for draw adjudication.
+        // Todo: Make this as an option.
+
+        // start the adjudication when ply reaches this value
+        constexpr int adj_draw_ply = 80;
+
+        // 4 move scores for each side have to be checked
+        constexpr int adj_draw_cnt = 8;
+
+        // move score in CP
+        constexpr int adj_draw_score = 0;
+
+        // For the time being, it will be treated as a 
+        // draw at the maximum number of steps to write.
+        const int ply = move_hist_scores.size();
+
+        // has it reached the max length or is a draw
+        if (ply >= write_maxply || pos.is_draw(ply))
+        {
+            return 0;
+        }
+
+        // Initialize the Syzygy Ending Tablebase and sort the moves.
+        Search::RootMoves rootMoves;
+        for (const auto& m : MoveList<LEGAL>(pos))
+        {
+            rootMoves.emplace_back(m);
+        }
+
+        if (!rootMoves.empty())
+        {
+            Tablebases::rank_root_moves(pos, rootMoves);
+        }
+        else 
+        {
+            // If there is no legal move
+            return pos.checkers() 
+                ? -1 /* mate */ 
+                : 0 /* stalemate */;
+        }
+
+        // Adjudicate game to a draw if the last 4 scores of each engine is 0.
+        if (detect_draw_by_consecutive_low_score) 
+        {
+            if (ply >= adj_draw_ply) 
+            {
+                int num_cons_plies_within_draw_score = 0;
+                bool is_adj_draw = false;
+
+                for (auto it = move_hist_scores.rbegin();
+                    it != move_hist_scores.rend(); ++it)
+                {
+                    if (abs(*it) <= adj_draw_score)
+                    {
+                        num_cons_plies_within_draw_score++;
+                    }
+                    else
+                    {
+                        // Draw scores must happen on consecutive plies
+                        break;
+                    }
+
+                    if (num_cons_plies_within_draw_score >= adj_draw_cnt) 
+                    {
+                        is_adj_draw = true;
+                        break;
+                    }
+                }
+
+                if (is_adj_draw) 
+                {
+                    return 0;
+                }
+            }
+        }
+
+        // Draw by insufficient mating material
+        if (detect_draw_by_insufficient_mating_material) 
+        {
+            if (pos.count<ALL_PIECES>() <= 4) 
+            {
+                int num_pieces = pos.count<ALL_PIECES>();
+
+                // (1) KvK
+                if (num_pieces == 2) 
+                {
+                    return 0;
+                }
+
+                // (2) KvK + 1 minor piece
+                if (num_pieces == 3) 
+                {
+                    int minor_pc = pos.count<BISHOP>(WHITE) + pos.count<KNIGHT>(WHITE) +
+                        pos.count<BISHOP>(BLACK) + pos.count<KNIGHT>(BLACK);
+                    if (minor_pc == 1) 
+                    {
+                        return 0;
+                    }
+                }
+
+                // (3) KBvKB, bishops of the same color
+                else if (num_pieces == 4) 
+                {
+                    if (pos.count<BISHOP>(WHITE) == 1 && pos.count<BISHOP>(BLACK) == 1) 
+                    {
+                        // Color of bishops is black.
+                        if ((pos.pieces(WHITE, BISHOP) & DarkSquares)
+                            && (pos.pieces(BLACK, BISHOP) & DarkSquares))
+                        {
+                            return 0;
+                        }
+                        // Color of bishops is white.
+                        if ((pos.pieces(WHITE, BISHOP) & ~DarkSquares)
+                            && (pos.pieces(BLACK, BISHOP) & ~DarkSquares))
+                        {
+                            return 0;
+                        }
+                    }
+                }
+            }
+        }
+
+        return nullopt;
+    }
+
+    // Write out the phases loaded in sfens to a file.
+    // lastTurnIsWin: win/loss in the next phase after the final phase in sfens
+    // 1 when winning. -1 when losing. Pass 0 for a draw.
+    // Return value: true if the specified number of 
+    // sfens has already been reached and the process ends.
+    bool MultiThinkGenSfen::commit_psv(PSVector& sfens, size_t thread_id, int8_t lastTurnIsWin)
+    {
+        int8_t is_win = lastTurnIsWin;
+
+        // From the final stage (one step before) to the first stage, give information on the outcome of the game for each stage.
+        // The phases stored in sfens are assumed to be continuous (in order).
+        for (auto it = sfens.rbegin(); it != sfens.rend(); ++it)
+        {
+            // If is_win == 0 (draw), multiply by -1 and it will remain 0 (draw)
+            is_win = -is_win;
+            it->game_result = is_win;
+
+            // See how many sfens were already written and get the next id.
+            // Exit if requested number of sfens reached.
+            auto now_loop_count = get_next_loop_count();
+            if (now_loop_count == LOOP_COUNT_FINISHED)
+            {
+                return true;
+            }
+
+            // Write out one sfen.
+            sfen_writer.write(thread_id, *it);
+
+#if 0
+            pos.set_from_packed_sfen(it->sfen);
+            cout << pos << "Win : " << it->is_win << " , " << it->score << endl;
+#endif
+        }
+
+        return false;
+    }
+
+    optional<Move> MultiThinkGenSfen::choose_random_move(
+        Position& pos,
+        std::vector<uint8_t>& random_move_flag,
+        int ply,
+        int& random_move_c)
+    {
+        optional<Move> random_move;
+
+        // Randomly choose one from legal move
+        if (
+            // 1. Random move of random_move_count times from random_move_minply to random_move_maxply
+            (random_move_minply != -1 && ply < (int)random_move_flag.size() && random_move_flag[ply]) ||
+            // 2. A mode to perform random move of random_move_count times after leaving the startpos
+            (random_move_minply == -1 && random_move_c < random_move_count))
+        {
+            ++random_move_c;
+
+            // It's not a mate, so there should be one legal move...
+            if (random_multi_pv == 0)
+            {
+                // Normal random move
+                MoveList<LEGAL> list(pos);
+
+                // I don't really know the goodness and badness of making this the Apery method.
+                if (random_move_like_apery == 0
+                    || prng.rand(random_move_like_apery) != 0)
+                {
+                    // Normally one move from legal move
+                    random_move = list.at((size_t)prng.rand((uint64_t)list.size()));
+                }
+                else 
+                {
+                    // if you can move the king, move the king
+                    Move moves[8]; // Near 8
+                    Move* p = &moves[0];
+                    for (auto& m : list)
+                    {
+                        if (type_of(pos.moved_piece(m)) == KING)
+                        {
+                            *(p++) = m;
+                        }
+                    }
+
+                    size_t n = p - &moves[0];
+                    if (n != 0)
+                    {
+                        // move to move the king
+                        random_move = moves[prng.rand(n)];
+
+                        // In Apery method, at this time there is a 1/2 chance 
+                        // that the opponent will also move randomly
+                        if (prng.rand(2) == 0)
+                        {
+                            // Is it a simple hack to add a "1" next to random_move_flag[ply]?
+                            random_move_flag.insert(random_move_flag.begin() + ply + 1, 1, true);
+                        }
+                    }
+                    else
+                    {
+                        // Normally one move from legal move
+                        random_move = list.at((size_t)prng.rand((uint64_t)list.size()));
+                    }
+                }
+            }
+            else 
+            {
+                Learner::search(pos, random_multi_pv_depth, random_multi_pv);
+
+                // Select one from the top N hands of root Moves
+                auto& rm = pos.this_thread()->rootMoves;
+
+                uint64_t s = min((uint64_t)rm.size(), (uint64_t)random_multi_pv);
+                for (uint64_t i = 1; i < s; ++i)
+                {
+                    // The difference from the evaluation value of rm[0] must 
+                    // be within the range of random_multi_pv_diff.
+                    // It can be assumed that rm[x].score is arranged in descending order.
+                    if (rm[0].score > rm[i].score + random_multi_pv_diff)
+                    {
+                        s = i;
+                        break;
+                    }
+                }
+
+                random_move = rm[prng.rand(s)].pv[0];
+            }
+        }
+
+        return random_move;
+    }
+
+    vector<uint8_t> MultiThinkGenSfen::generate_random_move_flags()
+    {
+        vector<uint8_t> random_move_flag;
+
+        // Depending on random move selection parameters setup
+        // the array of flags that indicates whether a random move
+        // be taken at a given ply.
+
+        // Make an array like a[0] = 0 ,a[1] = 1, ...
+        // Fisher-Yates shuffle and take out the first N items.
+        // Actually, I only want N pieces, so I only need 
+        // to shuffle the first N pieces with Fisher-Yates.
+
+        vector<int> a;
+        a.reserve((size_t)random_move_maxply);
+
+        // random_move_minply ,random_move_maxply is specified by 1 origin,
+        // Note that we are handling 0 origin here.
+        for (int i = std::max(random_move_minply - 1, 0); i < random_move_maxply; ++i)
+        {
+            a.push_back(i);
+        }
+
+        // In case of Apery random move, insert() may be called random_move_count times.
+        // Reserve only the size considering it.
+        random_move_flag.resize((size_t)random_move_maxply + random_move_count);
+
+        // A random move that exceeds the size() of a[] cannot be applied, so limit it.
+        for (int i = 0; i < std::min(random_move_count, (int)a.size()); ++i)
+        {
+            swap(a[i], a[prng.rand((uint64_t)a.size() - i) + i]);
+            random_move_flag[a[i]] = true;
+        }
+
+        return random_move_flag;
+    }
+
+    Value MultiThinkGenSfen::evaluate_leaf(
+        Position& pos, 
+        std::vector<StateInfo, AlignedAllocator<StateInfo>>& states,
+        int ply,
+        vector<Move>& pv)
+    {
+        auto rootColor = pos.side_to_move();
+
+        for (auto m : pv)
+        {
+#if 1
+            // There should be no illegal move. This is as a debugging precaution.
+            if (!pos.pseudo_legal(m) || !pos.legal(m))
+            {
+                cout << "Error! : " << pos.fen() << m << endl;
+            }
+#endif
+            pos.do_move(m, states[ply++]);
+
+            // Because the difference calculation of evaluate() cannot be 
+            // performed unless each node evaluate() is called!
+            // If the depth is 8 or more, it seems 
+            // faster not to calculate this difference.
+#if defined(EVAL_NNUE)
+            if (depth < 8)
+            {
+                Eval::NNUE::update_eval(pos);
+            }
+#endif  // defined(EVAL_NNUE)
+        }
+
+        // Reach leaf
+        Value v;
+        if (pos.checkers()) {
+            // Sometime a king is checked.  An example is a case that a checkmate is
+            // found in the search.  If Eval::evaluate() is called whne a king is
+            // checked, classic eval crashes by an assertion. To avoid crashes, return
+            // VALUE_NONE and let the caller assign a value to the position.
+            return VALUE_NONE;
+        }
+        else 
+        {
+            v = Eval::evaluate(pos);
+
+            // evaluate() returns the evaluation value on the turn side, so
+            // If it's a turn different from root_color, you must invert v and return it.
+            if (rootColor != pos.side_to_move())
+            {
+                v = -v;
+            }
+        }
+
+        // Rewind the pv moves.
+        for (auto it = pv.rbegin(); it != pv.rend(); ++it)
+        {
+            pos.undo_move(*it);
+        }
+
+        return v;
+    }
+
+    // thread_id = 0..Threads.size()-1
+    void MultiThinkGenSfen::thread_worker(size_t thread_id)
+    {
+        // For the time being, it will be treated as a draw 
+        // at the maximum number of steps to write.
+        // Maximum StateInfo + Search PV to advance to leaf buffer
+        std::vector<StateInfo, AlignedAllocator<StateInfo>> states(
+            write_maxply + MAX_PLY /* == search_depth_min + α */);
+
+        StateInfo si;
+
+        // end flag
+        bool quit = false;
+
+        // repeat until the specified number of times
+        while (!quit)
+        {
+            // It is necessary to set a dependent thread for Position.
+            // When parallelizing, Threads (since this is a vector<Thread*>,
+            // Do the same for up to Threads[0]...Threads[thread_num-1].
+            auto th = Threads[thread_id];
+
+            auto& pos = th->rootPos;
+            pos.set(StartFEN, false, &si, th);
+
+#if defined(USE_BOOK)
+            // Refer to the members of BookMoveSelector defined in the search section.
+            auto& book = ::book;
+#endif
+
+            // Vector for holding the sfens in the current simulated game.
+            PSVector a_psv;
+            a_psv.reserve(write_maxply + MAX_PLY);
+
+            // Precomputed flags. Used internally by choose_random_move.
+            vector<uint8_t> random_move_flag = generate_random_move_flags();
+
+            // A counter that keeps track of the number of random moves
+            // When random_move_minply == -1, random moves are 
+            // performed continuously, so use it at this time.
+            // Used internally by choose_random_move.
+            int actual_random_move_count = 0;
+
+            // Save history of move scores for adjudication
+            vector<int> move_hist_scores;
+
+            auto flush_psv = [&](int8_t result) {
+                quit = commit_psv(a_psv, thread_id, result);
+            };
+
+            for (int ply = 0; ; ++ply)
+            {
+                Move next_move = MOVE_NONE;
+
+                // Current search depth
+                const int depth = search_depth_min + (int)prng.rand(search_depth_max - search_depth_min + 1);
+
+                const auto result = get_current_game_result(pos, move_hist_scores);
+                if (result.has_value())
+                {
+                    flush_psv(result.value());
+                    break;
+                }
+#if defined(USE_BOOK)
+                if ((next_move = book.probe(pos)) != MOVE_NONE)
+                {
+                    // Hit the constant track.
+                    // The move was stored in next_move.
+
+                    // Do not use the fixed phase for learning.
+                    sfens.clear();
+
+                    if (random_move_minply != -1)
+                    {
+                        // Random move is performed with a certain 
+                        // probability even in the constant phase.
+                        goto RANDOM_MOVE;
+                    }
+                    else
+                    {
+                        // When -1 is specified as random_move_minply, 
+                        // it points according to the standard until 
+                        // it goes out of the standard.
+                        // Prepare an innumerable number of situations 
+                        // that have left the constant as 
+                        // ConsiderationBookMoveCount true using a huge constant
+                        // Used for purposes such as performing 
+                        // a random move 5 times from there.
+                        goto DO_MOVE;
+                    }
+                }
+#endif
+                {
+                    auto [search_value, search_pv] = search(pos, depth, 1, nodes);
+
+                    // Always adjudivate by eval limit.
+                    // Also because of this we don't have to check for TB/MATE scores
+                    if (abs(search_value) >= eval_limit)
+                    {
+                        const auto wdl = (search_value >= eval_limit) ? 1 : -1;
+                        flush_psv(wdl);
+                        break;
+                    }
+
+                    // Verification of a strange move
+                    if (search_pv.size() > 0
+                        && (search_pv[0] == MOVE_NONE || search_pv[0] == MOVE_NULL))
+                    {
+                        // (???)
+                        // MOVE_WIN is checking if it is the declaration victory stage before this
+                        // The declarative winning move should never come back here.
+                        // Also, when MOVE_RESIGN, search_value is a one-stop score, which should be the minimum value of eval_limit (-31998)...
+                        cout << "Error! : " << pos.fen() << next_move << search_value << endl;
+                        break;
+                    }
+
+                    // Save the move score for adjudication.
+                    move_hist_scores.push_back(search_value);
+
+#if 0
+                    dbg_hit_on(search_value == leaf_value);
+                    // gensfen depth 3 eval_limit 32000
+                    // Total 217749 Hits 203579 hit rate (%) 93.490
+                    // gensfen depth 6 eval_limit 32000
+                    // Total 78407 Hits 69190 hit rate (%) 88.245
+                    // gensfen depth 6 eval_limit 3000
+                    // Total 53879 Hits 43713 hit rate (%) 81.132
+
+                    // Problems such as pruning with moves in the substitution table.
+                    // This is a little uncomfortable as a teacher...
+#endif
+
+                    // If depth 0, pv is not obtained, so search again at depth 2.
+                    if (search_depth_min <= 0)
+                    {
+                        auto [research_value, research_pv] = search(pos, 2);
+                        search_pv = research_pv;
+                    }
+
+                    // Discard stuff before write_minply is reached
+                    // because it can harm training due to overfitting.
+                    // Initial positions would be too common.
+                    if (ply < write_minply - 1)
+                    {
+                        a_psv.clear();
+                        goto SKIP_SAVE;
+                    }
+
+                    // Look into the position hashtable to see if the same
+                    // position was seen before.
+                    // This is a good heuristic to exlude already seen
+                    // positions without many false positives.
+                    {
+                        auto key = pos.key();
+                        auto hash_index = (size_t)(key & (GENSFEN_HASH_SIZE - 1));
+                        auto old_key = hash[hash_index];
+                        if (key == old_key)
+                        {
+                            a_psv.clear();
+                            goto SKIP_SAVE;
+                        }
+                        else
+                        {
+                            // Replace with the current key.
+                            hash[hash_index] = key;
+                        }
+                    }
+
+                    // Pack the current position into a packed sfen and save it into the buffer.
+                    {
+                        a_psv.emplace_back(PackedSfenValue());
+                        auto& psv = a_psv.back();
+
+                        // Here we only write the position data.
+                        // Result is added after the whole game is done.
+                        pos.sfen_pack(psv.sfen);
+
+                        // Get the value of evaluate() as seen from the 
+                        // root color on the leaf node of the PV line.
+                        // I don't know the goodness and badness of using the 
+                        // return value of search() as it is.
+                        // TODO: Consider using search value instead of evaluate_leaf.
+                        //       Maybe give it as an option.
+                        
+                        // Use PV moves to reach the leaf node and use the value 
+                        // that evaluated() is called on that leaf node.
+                        const auto leaf_value = evaluate_leaf(pos, states, ply, search_pv);
+
+                        // If for some reason the leaf node couldn't yield an eval
+                        // we fallback to search value.
+                        psv.score = leaf_value == VALUE_NONE ? search_value : leaf_value;
+
+                        psv.gamePly = ply;
+
+                        // Take out the first PV move. This should be present unless depth 0.
+                        assert(search_pv.size() >= 1);
+                        psv.move = search_pv[0];
+                    }
+
+                SKIP_SAVE:;
+
+                    // For some reason, We could not get PV (hit the substitution table etc. and got stuck?) 
+                    // so go to the next game. It's a rare case, so you can ignore it.
+                    if (search_pv.size() == 0)
+                    {
+                        break;
+                    }
+
+                    // Update the next move according to best search result.
+                    next_move = search_pv[0];
+                }
+
+            RANDOM_MOVE:;
+
+                auto random_move = choose_random_move(pos, random_move_flag, ply, actual_random_move_count);
+                if (random_move.has_value())
+                {
+                    next_move = random_move.value();
+
+                    // We don't have the whole game yet, but it ended, 
+                    // so the writing process ends and the next game starts.
+                    if (!is_ok(next_move))
+                    {
+                        break;
+                    }
+
+                    // Clear the sfens that were written before the random move.
+                    // (???) why?
+                    a_psv.clear();
+                }
+
+            DO_MOVE:;
+                pos.do_move(next_move, states[ply]);
+
+                // Call node evaluate() for each difference calculation.
+                Eval::NNUE::update_eval(pos);
+
+            } // for (int ply = 0; ; ++ply)
+
+        } // while(!quit)
+
+        sfen_writer.finalize(thread_id);
+    }
+
+    // -----------------------------------
+    // Command to generate a game record (master thread)
+    // -----------------------------------
+
+    // Command to generate a game record
+    void gen_sfen(Position&, istringstream& is)
+    {
+        // number of threads (given by USI setoption)
+        uint32_t thread_num = (uint32_t)Options["Threads"];
+
+        // Number of generated game records default = 8 billion phases (Ponanza specification)
+        uint64_t loop_max = 8000000000UL;
+
+        // Stop the generation when the evaluation value reaches this value.
+        int eval_limit = 3000;
+
+        // search depth
+        int search_depth_min = 3;
+        int search_depth_max = INT_MIN;
+
+        // Number of nodes to be searched.
+        uint64_t nodes = 0;
+
+        // minimum ply, maximum ply and number of random moves
+        int random_move_minply = 1;
+        int random_move_maxply = 24;
+        int random_move_count = 5;
+
+        // A function to move the random move mainly like Apery
+        // If this is set to 3, the ball will move with a probability of 1/3.
+        int random_move_like_apery = 0;
+
+        // If you search with multipv instead of random move and choose from among them randomly, set random_multi_pv = 1 or more.
+        int random_multi_pv = 0;
+        int random_multi_pv_diff = 32000;
+        int random_multi_pv_depth = INT_MIN;
+
+        // The minimum and maximum ply (number of steps from the initial phase) of the phase to write out.
+        int write_minply = 16;
+        int write_maxply = 400;
+
+        // File name to write
+        string output_file_name = "generated_kifu.bin";
+
+        string token;
+
+        // When hit to eval hash, as a evaluation value near the initial stage, if a hash collision occurs and a large value is written
+        // When eval_limit is set small, eval_limit will be exceeded every time in the initial phase, and phase generation will not proceed.
+        // Therefore, eval hash needs to be disabled.
+        // After that, when the hash of the eval hash collides, the evaluation value of a strange value is used, and it may be unpleasant to use it for the teacher.
+        bool use_eval_hash = false;
+
+        // Save to file in this unit.
+        // File names are serialized like file_1.bin, file_2.bin.
+        uint64_t save_every = UINT64_MAX;
+
+        // Add a random number to the end of the file name.
+        bool random_file_name = false;
+
+        while (true)
+        {
+            token = "";
+            is >> token;
+            if (token == "")
+                break;
+
+            if (token == "depth")
+                is >> search_depth_min;
+            else if (token == "depth2")
+                is >> search_depth_max;
+            else if (token == "nodes")
+                is >> nodes;
+            else if (token == "loop")
+                is >> loop_max;
+            else if (token == "output_file_name")
+                is >> output_file_name;
+            else if (token == "eval_limit")
+            {
+                is >> eval_limit;
+                // Limit the maximum to a one-stop score. (Otherwise you might not end the loop)
+                eval_limit = std::min(eval_limit, (int)mate_in(2));
+            }
+            else if (token == "random_move_minply")
+                is >> random_move_minply;
+            else if (token == "random_move_maxply")
+                is >> random_move_maxply;
+            else if (token == "random_move_count")
+                is >> random_move_count;
+            else if (token == "random_move_like_apery")
+                is >> random_move_like_apery;
+            else if (token == "random_multi_pv")
+                is >> random_multi_pv;
+            else if (token == "random_multi_pv_diff")
+                is >> random_multi_pv_diff;
+            else if (token == "random_multi_pv_depth")
+                is >> random_multi_pv_depth;
+            else if (token == "write_minply")
+                is >> write_minply;
+            else if (token == "write_maxply")
+                is >> write_maxply;
+            else if (token == "use_eval_hash")
+                is >> use_eval_hash;
+            else if (token == "save_every")
+                is >> save_every;
+            else if (token == "random_file_name")
+                is >> random_file_name;
+            // Accept also the old option name.
+            else if (token == "use_draw_in_training_data_generation" || token == "write_out_draw_game_in_training_data_generation")
+                is >> write_out_draw_game_in_training_data_generation;
+            // Accept also the old option name.
+            else if (token == "use_game_draw_adjudication" || token == "detect_draw_by_consecutive_low_score")
+                is >> detect_draw_by_consecutive_low_score;
+            else if (token == "detect_draw_by_insufficient_mating_material")
+                is >> detect_draw_by_insufficient_mating_material;
+            else if (token == "use_raw_nnue_eval")
+                is >> use_raw_nnue_eval;
+            else
+                cout << "Error! : Illegal token " << token << endl;
+        }
+
+#if defined(USE_GLOBAL_OPTIONS)
+        // Save it for later restore.
+        auto oldGlobalOptions = GlobalOptions;
+        GlobalOptions.use_eval_hash = use_eval_hash;
+#endif
+
+        // If search depth2 is not set, leave it the same as search depth.
+        if (search_depth_max == INT_MIN)
+            search_depth_max = search_depth_min;
+        if (random_multi_pv_depth == INT_MIN)
+            random_multi_pv_depth = search_depth_min;
+
+        if (random_file_name)
+        {
+            // Give a random number to output_file_name at this point.
+            // Do not use std::random_device().  Because it always the same integers on MinGW.
+            PRNG r(std::chrono::system_clock::now().time_since_epoch().count());
+            // Just in case, reassign the random numbers.
+            for (int i = 0; i < 10; ++i)
+                r.rand(1);
+            auto to_hex = [](uint64_t u) {
+                std::stringstream ss;
+                ss << std::hex << u;
+                return ss.str();
+            };
+            // I don't want to wear 64bit numbers by accident, so I'next_move going to make a 64bit number 2 just in case.
+            output_file_name = output_file_name + "_" + to_hex(r.rand<uint64_t>()) + to_hex(r.rand<uint64_t>());
+        }
+
+        std::cout << "gensfen : " << endl
+            << "  search_depth_min = " << search_depth_min << " to " << search_depth_max << endl
+            << "  nodes = " << nodes << endl
+            << "  loop_max = " << loop_max << endl
+            << "  eval_limit = " << eval_limit << endl
+            << "  thread_num (set by USI setoption) = " << thread_num << endl
+#if defined(USE_BOOK)
+            << "  book_moves (set by USI setoption) = " << Options["BookMoves"] << endl
+#endif
+            << "  random_move_minply     = " << random_move_minply << endl
+            << "  random_move_maxply     = " << random_move_maxply << endl
+            << "  random_move_count      = " << random_move_count << endl
+            << "  random_move_like_apery = " << random_move_like_apery << endl
+            << "  random_multi_pv        = " << random_multi_pv << endl
+            << "  random_multi_pv_diff   = " << random_multi_pv_diff << endl
+            << "  random_multi_pv_depth  = " << random_multi_pv_depth << endl
+            << "  write_minply           = " << write_minply << endl
+            << "  write_maxply           = " << write_maxply << endl
+            << "  output_file_name       = " << output_file_name << endl
+            << "  use_eval_hash          = " << use_eval_hash << endl
+            << "  save_every             = " << save_every << endl
+            << "  random_file_name       = " << random_file_name << endl
+            << "  write_out_draw_game_in_training_data_generation = " << write_out_draw_game_in_training_data_generation << endl
+            << "  detect_draw_by_consecutive_low_score = " << detect_draw_by_consecutive_low_score << endl
+            << "  detect_draw_by_insufficient_mating_material = " << detect_draw_by_insufficient_mating_material << endl;
+
+        // Show if the training data generator uses NNUE.
+        Eval::verify_NNUE();
+
+        // Create and execute threads as many as Options["Threads"].
+        {
+            SfenWriter sfen_writer(output_file_name, thread_num);
+            sfen_writer.set_save_interval(save_every);
+
+            MultiThinkGenSfen multi_think(search_depth_min, search_depth_max, sfen_writer);
+            multi_think.nodes = nodes;
+            multi_think.set_loop_max(loop_max);
+            multi_think.eval_limit = eval_limit;
+            multi_think.random_move_minply = random_move_minply;
+            multi_think.random_move_maxply = random_move_maxply;
+            multi_think.random_move_count = random_move_count;
+            multi_think.random_move_like_apery = random_move_like_apery;
+            multi_think.random_multi_pv = random_multi_pv;
+            multi_think.random_multi_pv_diff = random_multi_pv_diff;
+            multi_think.random_multi_pv_depth = random_multi_pv_depth;
+            multi_think.write_minply = write_minply;
+            multi_think.write_maxply = write_maxply;
+            multi_think.start_file_write_worker();
+            multi_think.go_think();
+
+            // Since we are joining with the destructor of SfenWriter, please give a message that it has finished after the join
+            // Enclose this in a block because it should be displayed.
+        }
+
+        std::cout << "gensfen finished." << endl;
+
+#if defined(USE_GLOBAL_OPTIONS)
+        // Restore Global Options.
+        GlobalOptions = oldGlobalOptions;
+#endif
+
+    }
+}
+#endif
diff --git a/src/learn/learn.h b/src/learn/learn.h
index eda2bb32..e29ed74a 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -180,6 +180,23 @@ typedef float LearnFloatType;
 #define ADA_GRAD_UPDATE
 #endif
 
+// Character string according to update formula. (Output for debugging.)
+// Implemented various update expressions, but concluded that AdaGrad is the best in terms of speed and memory.
+#if defined(ADA_GRAD_UPDATE)
+#define LEARN_UPDATE "AdaGrad"
+#elif defined(SGD_UPDATE)
+#define LEARN_UPDATE "SGD"
+#endif
+
+#if defined(LOSS_FUNCTION_IS_WINNING_PERCENTAGE)
+#define LOSS_FUNCTION "WINNING_PERCENTAGE"
+#elif defined(LOSS_FUNCTION_IS_CROSS_ENTOROPY)
+#define LOSS_FUNCTION "CROSS_ENTOROPY"
+#elif defined(LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE)
+#define LOSS_FUNCTION "CROSS_ENTOROPY_FOR_VALUE"
+#elif defined(LOSS_FUNCTION_IS_ELMO_METHOD)
+#define LOSS_FUNCTION "ELMO_METHOD(WCSC27)"
+#endif
 
 // ----------------------
 // Definition of struct used in Learner
@@ -223,13 +240,38 @@ namespace Learner
 	// Used in Learner::search(), Learner::qsearch().
 	typedef std::pair<Value, std::vector<Move> > ValueAndPV;
 
+	// Phase array: PSVector stands for packed sfen vector.
+	typedef std::vector<PackedSfenValue> PSVector;
+
 	// So far, only Yaneura King 2018 Otafuku has this stub
 	// This stub is required if EVAL_LEARN is defined.
 	extern Learner::ValueAndPV  search(Position& pos, int depth , size_t multiPV = 1 , uint64_t NodesLimit = 0);
 	extern Learner::ValueAndPV qsearch(Position& pos);
 
 	double calc_grad(Value shallow, const PackedSfenValue& psv);
+	
+	void convert_bin_from_pgn_extract(
+		const std::vector<std::string>& filenames,
+		const std::string& output_file_name,
+		const bool pgn_eval_side_to_move,
+		const bool convert_no_eval_fens_as_score_zero);
+	
+	void convert_bin(
+		const std::vector<std::string>& filenames,
+		const std::string& output_file_name,
+		const int ply_minimum,
+		const int ply_maximum,
+		const int interpolate_eval,
+		const int src_score_min_value,
+		const int src_score_max_value,
+		const int dest_score_min_value,
+		const int dest_score_max_value,
+		const bool check_invalid_fen,
+		const bool check_illegal_move);
 
+	void convert_plain(
+		const std::vector<std::string>& filenames,
+		const std::string& output_file_name);
 }
 
 #endif
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 9f02a594..c897dd93 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -13,54 +13,34 @@
 // → I will not be involved in the engine because it is a problem that the GUI should assist.
 // etc..
 
+#define EVAL_LEARN
+
 #if defined(EVAL_LEARN)
 
-#include <chrono>
-#include <filesystem>
-#include <random>
-#include <regex>
+#include "../eval/evaluate_common.h"
 
 #include "learn.h"
 #include "multi_think.h"
 #include "../uci.h"
 #include "../syzygy/tbprobe.h"
+#include "../misc.h"
+#include "../thread.h"
+#include "../position.h"
+#include "../tt.h"
 
-// evaluate header for learning
-#include "../eval/evaluate_common.h"
-
-// ----------------------
-// constant string based on the settings
-// ----------------------
-
-// Character string according to update formula. (Output for debugging.)
-// Implemented various update expressions, but concluded that AdaGrad is the best in terms of speed and memory.
-#if defined(ADA_GRAD_UPDATE)
-#define LEARN_UPDATE "AdaGrad"
-#elif defined(SGD_UPDATE)
-#define LEARN_UPDATE "SGD"
-#endif
-
-#if defined(LOSS_FUNCTION_IS_WINNING_PERCENTAGE)
-#define LOSS_FUNCTION "WINNING_PERCENTAGE"
-#elif defined(LOSS_FUNCTION_IS_CROSS_ENTOROPY)
-#define LOSS_FUNCTION "CROSS_ENTOROPY"
-#elif defined(LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE)
-#define LOSS_FUNCTION "CROSS_ENTOROPY_FOR_VALUE"
-#elif defined(LOSS_FUNCTION_IS_ELMO_METHOD)
-#define LOSS_FUNCTION "ELMO_METHOD(WCSC27)"
-#endif
-
-// -----------------------------------
-// Below, the implementation section.
-// -----------------------------------
-
+#include <chrono>
+#include <random>
+#include <regex>
 #include <sstream>
 #include <fstream>
 #include <unordered_set>
 #include <iomanip>
 #include <list>
-#include <cmath>	// std::exp(),std::pow(),std::log()
-#include <cstring>	// memcpy()
+#include <cmath>    // std::exp(),std::pow(),std::log()
+#include <cstring>  // memcpy()
+#include <memory>
+#include <limits>
+#include <optional>
 
 #if defined (_OPENMP)
 #include <omp.h>
@@ -75,13 +55,6 @@
 #include <dirent.h>
 #endif
 
-#include "../misc.h"
-#include "../thread.h"
-#include "../position.h"
-//#include "../extra/book/book.h"
-#include "../tt.h"
-#include "multi_think.h"
-
 #if defined(EVAL_NNUE)
 #include "../nnue/evaluate_nnue_learner.h"
 #include <climits>
@@ -93,3470 +66,2019 @@ using namespace std;
 //// This is defined in the search section.
 //extern Book::BookMoveSelector book;
 
-// Addition and subtraction definition for atomic<T>
-// Aligned with atomicAdd() in Apery/learner.hpp.
 template <typename T>
 T operator += (std::atomic<T>& x, const T rhs)
 {
-	T old = x.load(std::memory_order_consume);
-	// It is allowed that the value is rewritten from other thread at this timing.
-	// The idea that the value is not destroyed is good.
-	T desired = old + rhs;
-	while (!x.compare_exchange_weak(old, desired, std::memory_order_release, std::memory_order_consume))
-		desired = old + rhs;
-	return desired;
+    T old = x.load(std::memory_order_consume);
+    // It is allowed that the value is rewritten from other thread at this timing.
+    // The idea that the value is not destroyed is good.
+    T desired = old + rhs;
+    while (!x.compare_exchange_weak(old, desired, std::memory_order_release, std::memory_order_consume))
+        desired = old + rhs;
+    return desired;
 }
 template <typename T>
 T operator -= (std::atomic<T>& x, const T rhs) { return x += -rhs; }
 
 namespace Learner
 {
-
-// Phase array: PSVector stands for packed sfen vector.
-typedef std::vector<PackedSfenValue> PSVector;
-
-bool write_out_draw_game_in_training_data_generation = false;
-bool use_draw_games_in_training = false;
-bool use_draw_games_in_validation = false;
-bool skip_duplicated_positions_in_training = true;
-bool detect_draw_by_consecutive_low_score = false;
-bool detect_draw_by_insufficient_mating_material = false;
-// 1.0 / PawnValueEg / 4.0 * log(10.0)
-double winning_probability_coefficient = 0.00276753015984861260098316280611;
-// Score scale factors.  ex) If we set src_score_min_value = 0.0,
-// src_score_max_value = 1.0, dest_score_min_value = 0.0,
-// dest_score_max_value = 10000.0, [0.0, 1.0] will be scaled to [0, 10000].
-double src_score_min_value = 0.0;
-double src_score_max_value = 1.0;
-double dest_score_min_value = 0.0;
-double dest_score_max_value = 1.0;
-// Assume teacher signals are the scores of deep searches, and convert them into winning
-// probabilities in the trainer. Sometimes we want to use the winning probabilities in the training
-// data directly. In those cases, we set false to this variable.
-bool convert_teacher_signal_to_winning_probability = true;
-// Use raw NNUE eval value in the Eval::evaluate(). If hybrid eval is enabled, training data
-// generation and training don't work well.
-// https://discordapp.com/channels/435943710472011776/733545871911813221/748524079761326192
-bool use_raw_nnue_eval = true;
-// Using WDL with win rate model instead of sigmoid
-bool use_wdl = false;
-
-// -----------------------------------
-// write phase file
-// -----------------------------------
-
-// Helper class for exporting Sfen
-struct SfenWriter
-{
-		// File name to write and number of threads to create
-	SfenWriter(string filename, int thread_num)
-	{
-		sfen_buffers_pool.reserve((size_t)thread_num * 10);
-		sfen_buffers.resize(thread_num);
-
-		// When performing additional learning, the quality of the teacher generated after learning the evaluation function does not change much and I want to earn more teacher positions.
-		// Since it is preferable that old teachers also use it, it has such a specification.
-		fs.open(filename, ios::out | ios::binary | ios::app);
-		filename_ = filename;
-
-		finished = false;
-	}
-
-	~SfenWriter()
-	{
-		finished = true;
-		file_worker_thread.join();
-		fs.close();
-
-		// all buffers should be empty since file_worker_thread has written all..
-		for (auto p : sfen_buffers) { assert(p == nullptr); }
-		assert(sfen_buffers_pool.empty());
-	}
-
-	// For each thread, flush the file by this number of phases.
-	const size_t SFEN_WRITE_SIZE = 5000;
-
-	// write one by pairing the phase and evaluation value (in packed sfen format)
-	void write(size_t thread_id, const PackedSfenValue& psv)
-	{
-		// We have a buffer for each thread and add it there.
-		// If the buffer overflows, write it to a file.
-
-		// This buffer is prepared for each thread.
-		auto& buf = sfen_buffers[thread_id];
-
-		// Secure since there is no buf at the first time and immediately after writing the thread buffer.
-		if (!buf)
-		{
-			buf = new PSVector();
-			buf->reserve(SFEN_WRITE_SIZE);
-		}
-
-		// It is prepared for each thread, so one thread does not call this write() function at the same time.
-		// There is no need to exclude at this point.
-		buf->push_back(psv);
-
-		if (buf->size() >= SFEN_WRITE_SIZE)
-		{
-			// If you load it in sfen_buffers_pool, the worker will do the rest.
-
-			// Mutex lock is required when changing the contents of sfen_buffers_pool.
-			std::unique_lock<std::mutex> lk(mutex);
-			sfen_buffers_pool.push_back(buf);
-
-			buf = nullptr;
-			// If you set buf == nullptr, the buffer will be allocated the next time this function is called.
-		}
-	}
-
-	// Move what remains in the buffer for your thread to a buffer for writing to a file.
-	void finalize(size_t thread_id)
-	{
-		std::unique_lock<std::mutex> lk(mutex);
-
-		auto& buf = sfen_buffers[thread_id];
-
-		// There is a case that buf==nullptr, so that check is necessary.
-		if (buf && buf->size() != 0)
-			sfen_buffers_pool.push_back(buf);
-
-		buf = nullptr;
-	}
-
-	// Start the write_worker thread.
-	void start_file_write_worker()
-	{
-		file_worker_thread = std::thread([&] { this->file_write_worker(); });
-	}
-
-	// Dedicated thread to write to file
-	void file_write_worker()
-	{
-		auto output_status = [&]()
-		{
-			// also output the current time
-			sync_cout << endl << sfen_write_count << " sfens , at " << now_string() << sync_endl;
-
-			// This is enough for flush().
-			fs.flush();
-		};
-
-		while (!finished || sfen_buffers_pool.size())
-		{
-			vector<PSVector*> buffers;
-			{
-				std::unique_lock<std::mutex> lk(mutex);
-
-				// copy the whole
-				buffers = sfen_buffers_pool;
-				sfen_buffers_pool.clear();
-			}
-
-			// sleep() if you didn't get anything
-			if (!buffers.size())
-				sleep(100);
-			else
-			{
-				for (auto ptr : buffers)
-				{
-					fs.write((const char*)&((*ptr)[0]), sizeof(PackedSfenValue) * ptr->size());
-
-					sfen_write_count += ptr->size();
-
-#if 1
-					// Add the processed number here, and if it exceeds save_every, change the file name and reset this counter.
-					save_every_counter += ptr->size();
-					if (save_every_counter >= save_every)
-					{
-						save_every_counter = 0;
-						// Change the file name.
-
-						fs.close();
-
-						// Sequential number attached to the file
-						int n = (int)(sfen_write_count / save_every);
-						// Rename the file and open it again. Add ios::app in consideration of overwriting. (Depending on the operation, it may not be necessary.)
-						string filename = filename_ + "_" + std::to_string(n);
-						fs.open(filename, ios::out | ios::binary | ios::app);
-						cout << endl << "output sfen file = " << filename << endl;
-					}
-#endif
-
-					// Output'.' every time when writing a game record.
-					std::cout << ".";
-
-					// Output the number of phases processed every 40 times
-					// Finally, the remainder of the teacher phase of each thread is written out, so halfway numbers are displayed, but is it okay?
-					// If you overuse the threads to the maximum number of logical cores, the console will be clogged, so it may be a little more loose.
-					if ((++time_stamp_count % 40) == 0)
-						output_status();
-
-					// Since this memory is unnecessary, release it at this timing.
-					delete ptr;
-				}
-			}
-		}
-
-		// Output the time stamp again before the end.
-		output_status();
-	}
-
-	// Change the file name in this unit.
-	uint64_t save_every = UINT64_MAX;
-
-private:
-
-	fstream fs;
-
-	// File name passed in the constructor
-	std::string filename_;
-
-	// Add the processed number here, and if it exceeds save_every, change the file name and reset this counter.
-	uint64_t save_every_counter = 0;
-
-	// thread to write to the file
-	std::thread file_worker_thread;
-	// Flag that all threads have finished
-	atomic<bool> finished;
-
-	// Counter for time stamp output
-	uint64_t time_stamp_count = 0;
-
-	// buffer before writing to file
-	// sfen_buffers is the buffer for each thread
-	// sfen_buffers_pool is a buffer for writing.
-	// After loading the phase in the former buffer by SFEN_WRITE_SIZE, transfer it to the latter.
-	std::vector<PSVector*> sfen_buffers;
-	std::vector<PSVector*> sfen_buffers_pool;
-
-	// Mutex required to access sfen_buffers_pool
-	std::mutex mutex;
-
-	// number of written phases
-	uint64_t sfen_write_count = 0;
-};
-
-// -----------------------------------
-// worker that creates the game record (for each thread)
-// -----------------------------------
-
-// Class to generate sfen with multiple threads
-struct MultiThinkGenSfen : public MultiThink
-{
-	MultiThinkGenSfen(int search_depth_, int search_depth2_, SfenWriter& sw_)
-		: search_depth(search_depth_), search_depth2(search_depth2_), sw(sw_)
-	{
-		hash.resize(GENSFEN_HASH_SIZE);
-
-		// Output for confirmation if the same random seed is not drawn when parallelizing and gensfening the PC.
-		std::cout << prng << std::endl;
-	}
-
-	virtual void thread_worker(size_t thread_id);
-	void start_file_write_worker() { sw.start_file_write_worker(); }
-
-	// search_depth = search depth for normal search
-	int search_depth;
-	int search_depth2;
-
-	// Number of the nodes to be searched.
-	// 0 represents no limits.
-	uint64_t nodes;
-
-	// Upper limit of evaluation value of generated situation
-	int eval_limit;
-
-	// minimum ply with random move
-	int random_move_minply;
-	// maximum ply with random move
-	int random_move_maxply;
-	// Number of random moves in one station
-	int random_move_count;
-	// Move balls with a probability of 1/N when randomly moving like Apery.
-	// When you move the ball again, there is a 1/N chance that it will randomly move once in the opponent's number.
-	// Apery has N=2. Specifying 0 here disables this function.
-	int random_move_like_apery;
-
-	// For when using multi pv instead of random move.
-	// random_multi_pv is the number of candidates for MultiPV.
-	// When adopting the move of the candidate move, the difference between the evaluation value of the move of the 1st place and the evaluation value of the move of the Nth place is
-	// Must be in the range random_multi_pv_diff.
-	// random_multi_pv_depth is the search depth for MultiPV.
-	int random_multi_pv;
-	int random_multi_pv_diff;
-	int random_multi_pv_depth;
-
-	// The minimum and maximum ply (number of steps from the initial phase) of the phase to write out.
-	int write_minply;
-	int write_maxply;
-
-	// sfen exporter
-	SfenWriter& sw;
-
-	// hash to limit the export of the same phase
-	// It must be 2**N because it will be used as the mask to calculate hash_index.
-	static const uint64_t GENSFEN_HASH_SIZE = 64 * 1024 * 1024;
-
-	vector<Key> hash; // 64MB*sizeof(HASH_KEY) = 512MB
-};
-
-//  thread_id    = 0..Threads.size()-1
-void MultiThinkGenSfen::thread_worker(size_t thread_id)
-{
-	// For the time being, it will be treated as a draw at the maximum number of steps to write.
-	const int MAX_PLY2 = write_maxply;
-
-	//Maximum StateInfo + Search PV to advance to leaf buffer
-	std::vector<StateInfo,AlignedAllocator<StateInfo>> states(MAX_PLY2 + MAX_PLY /* == search_depth + α */);
-	StateInfo si;
-
-	// This move. Use this move to advance the stage.
-	Move m = MOVE_NONE;
-
-	// end flag
-	bool quit = false;
-
-	// Variables for draw adjudication.
-	// Todo: Make this as an option.
-	int adj_draw_ply = 80; // start the adjudication when ply reaches this value
-	int adj_draw_cnt = 8;  // 4 move scores for each side have to be checked
-	int adj_draw_score = 0;  // move score in CP
-
-	// repeat until the specified number of times
-	while (!quit)
-	{
-		// It is necessary to set a dependent thread for Position.
-		// When parallelizing, Threads (since this is a vector<Thread*>,
-		// Do the same for up to Threads[0]...Threads[thread_num-1].
-		auto th = Threads[thread_id];
-
-		auto& pos = th->rootPos;
-    pos.set(StartFEN, false, &si, th);
-
-    // Test cod for Packed SFEN.
-    //{
-    //  PackedSfen packed_sfen;
-    //  pos.sfen_pack(packed_sfen);
-    //  std::cout << pos << std::endl;
-    //  pos.set_from_packed_sfen(packed_sfen, &si, th);
-    //  std::string actual = pos.fen();
-    //  assert(actual == StartFEN);
-    //}
-
-		// Refer to the members of BookMoveSelector defined in the search section.
-		//auto& book = ::book;
-
-		// Save the situation for one station, and write it out including the winning and losing at the end.
-		// The function to write is flush_psv() below this.
-		PSVector a_psv;
-		a_psv.reserve(MAX_PLY2 + MAX_PLY);
-
-		// Write out the phases loaded in a_psv to a file.
-		// lastTurnIsWin: win/loss in the next phase after the final phase in a_psv
-		// 1 when winning. -1 when losing. Pass 0 for a draw.
-		// Return value: true if the specified number of phases has already been reached and the process ends.
-		auto flush_psv = [&](int8_t lastTurnIsWin)
-		{
-			int8_t isWin = lastTurnIsWin;
-
-			// From the final stage (one step before) to the first stage, give information on the outcome of the game for each stage.
-			// The phases stored in a_psv are assumed to be continuous (in order).
-			for (auto it = a_psv.rbegin(); it != a_psv.rend(); ++it)
-			{
-				// If isWin == 0 (draw), multiply by -1 and it will remain 0 (draw)
-				isWin = - isWin;
-				it->game_result = isWin;
-
-				// When I tried to write out the phase, it reached the specified number of times.
-				// Because the counter is added in get_next_loop_count()
-				// If you don't call this when the phase is output, the counter goes crazy.
-				auto loop_count = get_next_loop_count();
-				if (loop_count == UINT64_MAX)
-				{
-					// Set the end flag.
-					quit = true;
-					return;
-				}
-
-				// Write out one aspect.
-				sw.write(thread_id, *it);
-
-#if 0
-				pos.set_from_packed_sfen(it->sfen);
-				cout << pos << "Win : " << it->isWin << " , " << it->score << endl;
-#endif
-			}
-		};
-
-		// ply flag for whether or not to randomly move by eyes
-		vector<bool> random_move_flag;
-		{
-			// If you want to add a random move, random_move_maxply be sure to enter random_move_count times before the first move.
-			// I want you to disperse so much.
-			// I'm not sure how best it is. Experimenting under various conditions.
-
-			// Make an array like a[0] = 0 ,a[1] = 1, ...
-			// Fisher-Yates shuffle and take out the first N items.
-			// Actually, I only want N pieces, so I only need to shuffle the first N pieces with Fisher-Yates.
-
-			vector<int> a;
-			a.reserve((size_t)random_move_maxply);
-
-			// random_move_minply ,random_move_maxply is specified by 1 origin,
-			// Note that we are handling 0 origin here.
-			for (int i = std::max(random_move_minply - 1 , 0) ; i < random_move_maxply; ++i)
-				a.push_back(i);
-
-			// In case of Apery random move, insert() may be called random_move_count times.
-			// Reserve only the size considering it.
-			random_move_flag.resize((size_t)random_move_maxply + random_move_count);
-
-			// A random move that exceeds the size() of a[] cannot be applied, so limit it.
-			for (int i = 0 ; i < std::min(random_move_count, (int)a.size()) ; ++i)
-			{
-				swap(a[i], a[prng.rand((uint64_t)a.size() - i) + i]);
-				random_move_flag[a[i]] = true;
-			}
-		}
-
-		// A counter that keeps track of the number of random moves
-		// When random_move_minply == -1, random moves are performed continuously, so use it at this time.
-		int random_move_c = 0;
-
-		// Save history of move scores for adjudication
-		vector<int> move_hist_scores;
-
-		// ply: steps from the initial stage
-		for (int ply = 0; ; ++ply)
-		{
-			//cout << pos << endl;
-
-			// Current search depth
-			// Goto will fly, so declare it first.
-			int depth = search_depth + (int)prng.rand(search_depth2 - search_depth + 1);
-
-			// has it reached the length
-			if (ply >= MAX_PLY2)
-			{
-				if (write_out_draw_game_in_training_data_generation) {
-				// Write out as win/loss = draw.
-				// This way it is harder to allow the opponent to enter the ball when I enter (may)
-				flush_psv(0);
-				}
-				break;
-			}
-
-      if (pos.is_draw(ply)) {
-		  if (write_out_draw_game_in_training_data_generation) {
-			  // Write if draw.
-			  flush_psv(0);
-		  }
-        break;
-      }
-
-			// Initialize the Syzygy Ending Tablebase and sort the moves.
-			Search::RootMoves rootMoves;
-			for (const auto& m : MoveList<LEGAL>(pos))
-				rootMoves.emplace_back(m);
-			if (!rootMoves.empty())
-				Tablebases::rank_root_moves(pos, rootMoves);
-
-			// If there is no legal move, terminate the game if position
-			// is mate or a stalemate.
-			else {
-				if (pos.checkers()) // Mate
-					flush_psv(-1);
-				else if (write_out_draw_game_in_training_data_generation) {
-					flush_psv(0); // Stalemate
-				}
-				break;
-			}
-
-			// Adjudicate game to a draw if the last 4 scores of each engine is 0.
-			if (detect_draw_by_consecutive_low_score) {
-				if (ply >= adj_draw_ply) {
-					int draw_cnt = 0;
-					bool is_adj_draw = false;
-
-					for (vector<int>::reverse_iterator it = move_hist_scores.rbegin();
-						it != move_hist_scores.rend(); ++it) 
-					{
-						if (abs(*it) <= adj_draw_score)
-							draw_cnt++;
-						else
-							break;  // score should be successive
-
-						if (draw_cnt >= adj_draw_cnt) {
-							is_adj_draw = true;
-							break;
-						}
-					}
-
-					if (is_adj_draw) {
-						if (write_out_draw_game_in_training_data_generation)
-							flush_psv(0);
-						break;
-					}
-				}
-			}
-
-			// Draw by insufficient mating material
-			if (detect_draw_by_insufficient_mating_material) {
-				if (pos.count<ALL_PIECES>() <= 4) {
-					int pcnt = pos.count<ALL_PIECES>();
-					// (1) KvK
-					if (pcnt == 2) {
-						if (write_out_draw_game_in_training_data_generation)
-							flush_psv(0);
-						break;
-					}
-					// (2) KvK + 1 minor piece
-					if (pcnt == 3) {
-						int minor_pc = pos.count<BISHOP>(WHITE) + pos.count<KNIGHT>(WHITE) +
-							pos.count<BISHOP>(BLACK) + pos.count<KNIGHT>(BLACK);
-						if (minor_pc == 1) {
-							if (write_out_draw_game_in_training_data_generation)
-								flush_psv(0);
-							break;
-						}
-					}
-					// (3) KBvKB, bishops of the same color
-					else if (pcnt == 4) {
-						if (pos.count<BISHOP>(WHITE) == 1 && pos.count<BISHOP>(BLACK) == 1) {
-							// Color of bishops is black.
-							if ((pos.pieces(WHITE, BISHOP) & DarkSquares)
-								&& (pos.pieces(BLACK, BISHOP) & DarkSquares))
-							{
-								if (write_out_draw_game_in_training_data_generation)
-									flush_psv(0);
-								break;
-							}
-							// Color of bishops is white.
-							if ((pos.pieces(WHITE, BISHOP) & ~DarkSquares)
-								&& (pos.pieces(BLACK, BISHOP) & ~DarkSquares))
-							{
-								if (write_out_draw_game_in_training_data_generation)
-									flush_psv(0);
-								break;
-							}
-						}
-					}
-				}
-			}
-
-			//// constant track
-			//if ((m = book.probe(pos)) != MOVE_NONE)
-			//{
-			//  // Hit the constant track.
-			//  // The move was stored in m.
-
-			//  // Do not use the fixed phase for learning.
-			//  a_psv.clear();
-
-			//  if (random_move_minply != -1)
-			// 		// Random move is performed with a certain probability even in the constant phase.
-			// 		goto RANDOM_MOVE;
-			//  else
-			// 		// When -1 is specified as random_move_minply, it points according to the standard until it goes out of the standard.
-			// 		// Prepare an innumerable number of situations that have left the constant as ConsiderationBookMoveCount true using a huge constant
-			// 		// Used for purposes such as performing a random move 5 times from there.
-			// 		goto DO_MOVE;
-			//}
-
-			{
-				// search_depth～search_depth2 Evaluation value of hand reading and PV (best responder row)
-				// There should be no problem if you narrow the search window.
-
-				auto pv_value1 = search(pos, depth, 1, nodes);
-
-				auto value1 = pv_value1.first;
-				auto& pv1 = pv_value1.second;
-
-				// For situations where the absolute evaluation value is greater than or equal to this value
-				// It doesn't make much sense to use that aspect for learning, so this game ends.
-				// Treat this as having won or lost.
-
-				// If you win one move, declarative win, mate_in(2) will be returned here, so it will be the same value as the upper limit of eval_limit,
-				// This if expression is always true. The same applies to resign.
-
-				if (abs(value1) >= eval_limit)
-				{
-					// sync_cout << pos << "eval limit = "<< eval_limit << "over ,move = "<< pv1[0] << sync_endl;
-
-					// If value1 >= eval_limit in this aspect, you win (the turn side of this aspect).
-					flush_psv((value1 >= eval_limit) ? 1 : -1);
-					break;
-				}
-
-				// Verification of a strange move
-				if (pv1.size() > 0
-					&& (pv1[0] == MOVE_NONE || pv1[0] == MOVE_NULL)
-					)
-				{
-					// MOVE_WIN is checking if it is the declaration victory stage before this
-					// The declarative winning move should never come back here.
-					// Also, when MOVE_RESIGN, value1 is a one-stop score, which should be the minimum value of eval_limit (-31998)...
-					cout << "Error! : " << pos.fen() << m << value1 << endl;
-					break;
-				}
-
-				// Save the move score for adjudication.
-				move_hist_scores.push_back(value1);
-
-				// Use PV's move to the leaf node and use the value that evaluated() is called on that leaf node.
-				auto evaluate_leaf = [&](Position& pos , vector<Move>& pv)
-				{
-					auto rootColor = pos.side_to_move();
-
-					int ply2 = ply;
-					for (auto m : pv)
-					{
-						// As a verification for debugging, make sure there are no illegal players in the middle.
-						// NULL_MOVE does not come.
-
-						// I tested it out enough so I can comment it out.
-#if 1
-						// I shouldn't be an illegal player.
-						// declarative win and not mated() are tested above so
-						// It is guaranteed that MOVE_WIN and MOVE_RESIGN do not come as a reader. (Should...)
-						if (!pos.pseudo_legal(m) || !pos.legal(m))
-						{
-							cout << "Error! : " << pos.fen() << m << endl;
-						}
-#endif
-						pos.do_move(m, states[ply2++]);
-						
-						//Because the difference calculation of evaluate() cannot be performed unless each node evaluate() is called!
-						// If the depth is 8 or more, it seems faster not to calculate this difference.
-#if defined(EVAL_NNUE)
-            if (depth < 8)
-              Eval::NNUE::update_eval(pos);
-#endif  // defined(EVAL_NNUE)
-					}
-
-					// reach leaf
-					Value v;
-					if (pos.checkers()) {
-						// Sometime a king is checked.  An example is a case that a checkmate is
-						// found in the search.  If Eval::evaluate() is called whne a king is
-						// checked, classic eval crashes by an assertion.  To avoid crashes, return
-						// value1 instead of the score of the PV leaf.
-						v = value1;
-					}
-					else {
-						v = Eval::evaluate(pos);
-					// evaluate() returns the evaluation value on the turn side, so
-					// If it's a turn different from root_color, you must invert v and return it.
-					if (rootColor != pos.side_to_move())
-						v = -v;
-					}
-
-					// Rewind.
-					// Is it C++x14, and isn't there even foreach to turn in reverse?
-					//  for (auto it : boost::adaptors::reverse(pv))
-
-					for (auto it = pv.rbegin(); it != pv.rend(); ++it)
-						pos.undo_move(*it);
-
-					return v;
-				};
-
-#if 0
-				dbg_hit_on(pv_value1.first == leaf_value);
-				// gensfen depth 3 eval_limit 32000
-				// Total 217749 Hits 203579 hit rate (%) 93.490
-				// gensfen depth 6 eval_limit 32000
-				// Total 78407 Hits 69190 hit rate (%) 88.245
-				// gensfen depth 6 eval_limit 3000
-				// Total 53879 Hits 43713 hit rate (%) 81.132
-
-				// Problems such as pruning with moves in the substitution table.
-				// This is a little uncomfortable as a teacher...
-#endif
-
-				//If depth 0, pv is not obtained, so search again at depth 2.
-				if (search_depth <= 0)
-				{
-					pv_value1 = search(pos, 2);
-					pv1 = pv_value1.second;
-				}
-
-				// The surroundings of the initial stage are all similar
-				// Do not write it out because it can lead to overlearning when used for learning.
-				// → comparative experiment should be done
-				if (ply < write_minply - 1)
-				{
-					a_psv.clear();
-					goto SKIP_SAVE;
-				}
-
-				// Did you just write the same phase?
-				// This may include the same aspect as it is generated in parallel on multiple PCs, so
-				// It is better to do the same process when reading.
-				{
-					auto key = pos.key();
-					auto hash_index = (size_t)(key & (GENSFEN_HASH_SIZE - 1));
-					auto key2 = hash[hash_index];
-					if (key == key2)
-					{
-						// when skipping regarding earlier
-						// Clear the saved situation because the win/loss information will be incorrect.
-						// anyway, when the hash matches, it's likely that the previous phases also match
-						// Not worth writing out.
-						a_psv.clear();
-						goto SKIP_SAVE;
-					}
-					hash[hash_index] = key; // Replace with the current key.
-				}
-
-				// Temporary saving of the situation.
-				{
-					a_psv.emplace_back(PackedSfenValue());
-					auto &psv = a_psv.back();
-
-					// If pack is requested, write the packed sfen and the evaluation value at that time.
-					// The final writing is after winning or losing.
-					pos.sfen_pack(psv.sfen);
-
-          //{
-          //  std::string before_fen = pos.fen();
-          //  pos.set_from_packed_sfen(psv.sfen, &si, th);
-          //  std::string after_fen = pos.fen();
-          //  assert(before_fen == after_fen);
-          //}
-
-					// Get the value of evaluate() as seen from the root color on the leaf node of the PV line.
-					//I don't know the goodness and badness of using the return value of search() as it is.
-					psv.score = evaluate_leaf(pos, pv1);
-					psv.gamePly = ply;
-
-					// Take out the first PV hand. This should be present unless depth 0.
-					assert(pv_value1.second.size() >= 1);
-					Move pv_move1 = pv_value1.second[0];
-					psv.move = pv_move1;
-				}
-
-			SKIP_SAVE:;
-
-				// For some reason, I could not get PV (hit the substitution table etc. and got stuck?) so go to the next game.
-				// It's a rare case, so you can ignore it.
-				if (pv1.size() == 0)
-					break;
-
-				// search_depth Advance the phase by hand reading.
-				m = pv1[0];
-			}
-
-		RANDOM_MOVE:;
-
-			// Phase to randomly choose one from legal hands
-			if (
-				// 1. Random move of random_move_count times from random_move_minply to random_move_maxply
-				(random_move_minply != -1 && ply <(int)random_move_flag.size() && random_move_flag[ply]) ||
-				// 2. A mode to perform random move of random_move_count times after leaving the track
-				(random_move_minply == -1 && random_move_c <random_move_count))
-			{
-				++random_move_c;
-
-				// It's not a mate, so there should be one legal hand...
-				if (random_multi_pv == 0)
-				{
-					// normal random move
-
-					MoveList<LEGAL> list(pos);
-
-					// I don't really know the goodness and badness of making this the Apery method.
-					if (random_move_like_apery == 0
-						|| prng.rand(random_move_like_apery) != 0
-					)
-					{
-						// Normally one move from legal move
-						m = list.at((size_t)prng.rand((uint64_t)list.size()));
-					}
-					else {
-						// if you can move the ball, move the ball
-						Move moves[8]; // Near 8
-						Move* p = &moves[0];
-						for (auto& m : list)
-							if (type_of(pos.moved_piece(m)) == KING)
-								*(p++) = m;
-						size_t n = p - &moves[0];
-						if (n != 0)
-						{
-							// move to move the ball
-							m = moves[prng.rand(n)];
-
-							// In Apery method, at this time there is a 1/2 chance that the opponent will also move randomly
-							if (prng.rand(2) == 0)
-							{
-								// Is it a simple hack to add a "1" next to random_move_flag[ply]?
-								random_move_flag.insert(random_move_flag.begin() + ply + 1, 1, true);
-							}
-						}
-						else
-							// Normally one move from legal move
-							m = list.at((size_t)prng.rand((uint64_t)list.size()));
-					}
-
-					// I put in the code of two handed balls, but if you choose one from legal hands, it should be equivalent to that
-					// I decided it's unnecessary because it just makes the code more complicated.
-				}
-				else {
-					// Since the logic becomes complicated, I'm sorry, I will search again with MultiPV here.
-					Learner::search(pos, random_multi_pv_depth, random_multi_pv);
-					// Select one from the top N hands of root Moves
-
-					auto& rm = pos.this_thread()->rootMoves;
-
-					uint64_t s = min((uint64_t)rm.size(), (uint64_t)random_multi_pv);
-					for (uint64_t i = 1; i < s; ++i)
-					{
-						// The difference from the evaluation value of rm[0] must be within the range of random_multi_pv_diff.
-						// It can be assumed that rm[x].score is arranged in descending order.
-						if (rm[0].score > rm[i].score + random_multi_pv_diff)
-						{
-							s = i;
-							break;
-						}
-					}
-
-					m = rm[prng.rand(s)].pv[0];
-
-					// I haven't written one phase yet, but it ended, so the writing process ends and the next game starts.
-					if (!is_ok(m))
-						break;
-				}
-
-				// When trying to evaluate the move from the outcome of the game,
-				// There is a random move this time, so try not to fall below this.
-				a_psv.clear(); // clear saved aspect
-			}
-
-		DO_MOVE:;
-			pos.do_move(m, states[ply]);
-
-			// Call node evaluate() for each difference calculation.
-			Eval::NNUE::update_eval(pos);
-
-		} // for (int ply = 0; ; ++ply)
-
-	} // while(!quit)
-
-	sw.finalize(thread_id);
-}
-
-// -----------------------------------
-// Command to generate a game record (master thread)
-// -----------------------------------
-
-// Command to generate a game record
-void gen_sfen(Position&, istringstream& is)
-{
-	// number of threads (given by USI setoption)
-	uint32_t thread_num = (uint32_t)Options["Threads"];
-
-	// Number of generated game records default = 8 billion phases (Ponanza specification)
-	uint64_t loop_max = 8000000000UL;
-
-	// Stop the generation when the evaluation value reaches this value.
-	int eval_limit = 3000;
-
-	// search depth
-	int search_depth = 3;
-	int search_depth2 = INT_MIN;
-
-	// Number of nodes to be searched.
-	uint64_t nodes = 0;
-
-	// minimum ply, maximum ply and number of random moves
-	int random_move_minply = 1;
-	int random_move_maxply = 24;
-	int random_move_count = 5;
-	// A function to move the random move mainly like Apery
-	// If this is set to 3, the ball will move with a probability of 1/3.
-	int random_move_like_apery = 0;
-	// If you search with multipv instead of random move and choose from among them randomly, set random_multi_pv = 1 or more.
-	int random_multi_pv = 0;
-	int random_multi_pv_diff = 32000;
-	int random_multi_pv_depth = INT_MIN;
-
-	// The minimum and maximum ply (number of steps from the initial phase) of the phase to write out.
-	int write_minply = 16;
-	int write_maxply = 400;
-
-	// File name to write
-	string output_file_name = "generated_kifu.bin";
-
-	string token;
-
-	// When hit to eval hash, as a evaluation value near the initial stage, if a hash collision occurs and a large value is written
-	// When eval_limit is set small, eval_limit will be exceeded every time in the initial phase, and phase generation will not proceed.
-	// Therefore, eval hash needs to be disabled.
-	// After that, when the hash of the eval hash collides, the evaluation value of a strange value is used, and it may be unpleasant to use it for the teacher.
-	bool use_eval_hash = false;
-
-	// Save to file in this unit.
-	// File names are serialized like file_1.bin, file_2.bin.
-	uint64_t save_every = UINT64_MAX;
-
-	// Add a random number to the end of the file name.
-	bool random_file_name = false;
-
-	while (true)
-	{
-		token = "";
-		is >> token;
-		if (token == "")
-			break;
-
-		if (token == "depth")
-			is >> search_depth;
-		else if (token == "depth2")
-			is >> search_depth2;
-		else if (token == "nodes")
-			is >> nodes;
-		else if (token == "loop")
-			is >> loop_max;
-		else if (token == "output_file_name")
-			is >> output_file_name;
-		else if (token == "eval_limit")
-		{
-			is >> eval_limit;
-			// Limit the maximum to a one-stop score. (Otherwise you might not end the loop)
-			eval_limit = std::min(eval_limit, (int)mate_in(2));
-		}
-		else if (token == "random_move_minply")
-			is >> random_move_minply;
-		else if (token == "random_move_maxply")
-			is >> random_move_maxply;
-		else if (token == "random_move_count")
-			is >> random_move_count;
-		else if (token == "random_move_like_apery")
-			is >> random_move_like_apery;
-		else if (token == "random_multi_pv")
-			is >> random_multi_pv;
-		else if (token == "random_multi_pv_diff")
-			is >> random_multi_pv_diff;
-		else if (token == "random_multi_pv_depth")
-			is >> random_multi_pv_depth;
-		else if (token == "write_minply")
-			is >> write_minply;
-		else if (token == "write_maxply")
-			is >> write_maxply;
-		else if (token == "use_eval_hash")
-			is >> use_eval_hash;
-		else if (token == "save_every")
-			is >> save_every;
-		else if (token == "random_file_name")
-			is >> random_file_name;
-		// Accept also the old option name.
-		else if (token == "use_draw_in_training_data_generation" || token == "write_out_draw_game_in_training_data_generation")
-			is >> write_out_draw_game_in_training_data_generation;
-		// Accept also the old option name.
-		else if (token == "use_game_draw_adjudication" || token == "detect_draw_by_consecutive_low_score")
-			is >> detect_draw_by_consecutive_low_score;
-		else if (token == "detect_draw_by_insufficient_mating_material")
-			is >> detect_draw_by_insufficient_mating_material;
-		else if (token == "use_raw_nnue_eval")
-			is >> use_raw_nnue_eval;
-		else
-			cout << "Error! : Illegal token " << token << endl;
-	}
-
-#if defined(USE_GLOBAL_OPTIONS)
-	// Save it for later restore.
-	auto oldGlobalOptions = GlobalOptions;
-	GlobalOptions.use_eval_hash = use_eval_hash;
-#endif
-
-	// If search depth2 is not set, leave it the same as search depth.
-	if (search_depth2 == INT_MIN)
-		search_depth2 = search_depth;
-	if (random_multi_pv_depth == INT_MIN)
-		random_multi_pv_depth = search_depth;
-
-	if (random_file_name)
-	{
-		// Give a random number to output_file_name at this point.
-		// Do not use std::random_device().  Because it always the same integers on MinGW.
-		PRNG r(std::chrono::system_clock::now().time_since_epoch().count());
-		// Just in case, reassign the random numbers.
-		for(int i=0;i<10;++i)
-			r.rand(1);
-		auto to_hex = [](uint64_t u){
-			std::stringstream ss;
-			ss << std::hex << u;
-			return ss.str();
-		};
-		// I don't want to wear 64bit numbers by accident, so I'm going to make a 64bit number 2 just in case.
-		output_file_name = output_file_name + "_" + to_hex(r.rand<uint64_t>()) + to_hex(r.rand<uint64_t>());
-	}
-
-	std::cout << "gensfen : " << endl
-		<< "  search_depth = " << search_depth << " to " << search_depth2 << endl
-		<< "  nodes = " << nodes << endl
-		<< "  loop_max = " << loop_max << endl
-		<< "  eval_limit = " << eval_limit << endl
-		<< "  thread_num (set by USI setoption) = " << thread_num << endl
-		//<< "  book_moves (set by USI setoption) = " << Options["BookMoves"] << endl
-		<< "  random_move_minply     = " << random_move_minply << endl
-		<< "  random_move_maxply     = " << random_move_maxply << endl
-		<< "  random_move_count      = " << random_move_count << endl
-		<< "  random_move_like_apery = " << random_move_like_apery << endl
-		<< "  random_multi_pv        = " << random_multi_pv << endl
-		<< "  random_multi_pv_diff   = " << random_multi_pv_diff << endl
-		<< "  random_multi_pv_depth  = " << random_multi_pv_depth << endl
-		<< "  write_minply           = " << write_minply << endl
-		<< "  write_maxply           = " << write_maxply << endl
-		<< "  output_file_name       = " << output_file_name << endl
-		<< "  use_eval_hash          = " << use_eval_hash << endl
-		<< "  save_every             = " << save_every << endl
-		<< "  random_file_name       = " << random_file_name << endl
-		<< "  write_out_draw_game_in_training_data_generation = " << write_out_draw_game_in_training_data_generation << endl
-		<< "  detect_draw_by_consecutive_low_score = " << detect_draw_by_consecutive_low_score << endl
-		<< "  detect_draw_by_insufficient_mating_material = " << detect_draw_by_insufficient_mating_material << endl;
-
-	// Show if the training data generator uses NNUE.
-	Eval::verify_NNUE();
-
-	// Create and execute threads as many as Options["Threads"].
-	{
-		SfenWriter sw(output_file_name, thread_num);
-		sw.save_every = save_every;
-
-		MultiThinkGenSfen multi_think(search_depth, search_depth2, sw);
-		multi_think.nodes = nodes;
-		multi_think.set_loop_max(loop_max);
-		multi_think.eval_limit = eval_limit;
-		multi_think.random_move_minply = random_move_minply;
-		multi_think.random_move_maxply = random_move_maxply;
-		multi_think.random_move_count = random_move_count;
-		multi_think.random_move_like_apery = random_move_like_apery;
-		multi_think.random_multi_pv = random_multi_pv;
-		multi_think.random_multi_pv_diff = random_multi_pv_diff;
-		multi_think.random_multi_pv_depth = random_multi_pv_depth;
-		multi_think.write_minply = write_minply;
-		multi_think.write_maxply = write_maxply;
-		multi_think.start_file_write_worker();
-		multi_think.go_think();
-
-		// Since we are joining with the destructor of SfenWriter, please give a message that it has finished after the join
-		// Enclose this in a block because it should be displayed.
-	}
-
-	std::cout << "gensfen finished." << endl;
-
-#if defined(USE_GLOBAL_OPTIONS)
-	// Restore Global Options.
-	GlobalOptions = oldGlobalOptions;
-#endif
-
-}
-
-// -----------------------------------
-// command to learn from the generated game (learn)
-// -----------------------------------
-
-// ordinary sigmoid function
-double sigmoid(double x)
-{
-	return 1.0 / (1.0 + std::exp(-x));
-}
-
-// A function that converts the evaluation value to the winning rate [0,1]
-double winning_percentage(double value)
-{
-	// 1/(1+10^(-Eval/4))
-	// = 1/(1+e^(-Eval/4*ln(10))
-	// = sigmoid(Eval/4*ln(10))
-	return sigmoid(value * winning_probability_coefficient);
-}
-
-// A function that converts the evaluation value to the winning rate [0,1]
-double winning_percentage_wdl(double value, int ply)
-{
-	double wdl_w = UCI::win_rate_model_double( value, ply);
-	double wdl_l = UCI::win_rate_model_double(-value, ply);
-	double wdl_d = 1000.0 - wdl_w - wdl_l;
-
-	return (wdl_w + wdl_d / 2.0) / 1000.0;
-}
-
-// A function that converts the evaluation value to the winning rate [0,1]
-double winning_percentage(double value, int ply)
-{
-	if (use_wdl) {
-		return winning_percentage_wdl(value, ply);
-	}
-	else {
-		return winning_percentage(value);
-	}
-}
-
-double calc_cross_entropy_of_winning_percentage(double deep_win_rate, double shallow_eval, int ply)
-{
-	double p = deep_win_rate;
-	double q = winning_percentage(shallow_eval, ply);
-	return -p * std::log(q) - (1 - p) * std::log(1 - q);
-}
-
-double calc_d_cross_entropy_of_winning_percentage(double deep_win_rate, double shallow_eval, int ply)
-{
-	constexpr double epsilon = 0.000001;
-	double y1 = calc_cross_entropy_of_winning_percentage(deep_win_rate, shallow_eval          , ply);
-	double y2 = calc_cross_entropy_of_winning_percentage(deep_win_rate, shallow_eval + epsilon, ply);
-
-	// Divide by the winning_probability_coefficient to match scale with the sigmoidal win rate
-	return ((y2 - y1) / epsilon) / winning_probability_coefficient;
-}
-
-double dsigmoid(double x)
-{
-	// Sigmoid function
-	// f(x) = 1/(1+exp(-x))
-	// the first derivative is
-	// f'(x) = df/dx = f(x)・{ 1-f(x)}
-	// becomes
-
-	return sigmoid(x) * (1.0 - sigmoid(x));
-}
-
-// When the objective function is the sum of squares of the difference in winning percentage
+    static bool use_draw_games_in_training = false;
+    static bool use_draw_games_in_validation = false;
+    static bool skip_duplicated_positions_in_training = true;
+    // 1.0 / PawnValueEg / 4.0 * log(10.0)
+    static double winning_probability_coefficient = 0.00276753015984861260098316280611;
+    // Score scale factors.  ex) If we set src_score_min_value = 0.0,
+    // src_score_max_value = 1.0, dest_score_min_value = 0.0,
+    // dest_score_max_value = 10000.0, [0.0, 1.0] will be scaled to [0, 10000].
+    static double src_score_min_value = 0.0;
+    static double src_score_max_value = 1.0;
+    static double dest_score_min_value = 0.0;
+    static double dest_score_max_value = 1.0;
+    // Assume teacher signals are the scores of deep searches, and convert them into winning
+    // probabilities in the trainer. Sometimes we want to use the winning probabilities in the training
+    // data directly. In those cases, we set false to this variable.
+    static bool convert_teacher_signal_to_winning_probability = true;
+    // Use raw NNUE eval value in the Eval::evaluate(). If hybrid eval is enabled, training data
+    // generation and training don't work well.
+    // https://discordapp.com/channels/435943710472011776/733545871911813221/748524079761326192
+    static bool use_raw_nnue_eval = true;
+    // Using WDL with win rate model instead of sigmoid
+    static bool use_wdl = false;
+
+    // -----------------------------------
+    // command to learn from the generated game (learn)
+    // -----------------------------------
+
+    // ordinary sigmoid function
+    double sigmoid(double x)
+    {
+        return 1.0 / (1.0 + std::exp(-x));
+    }
+
+    // A function that converts the evaluation value to the winning rate [0,1]
+    double winning_percentage(double value)
+    {
+        // 1/(1+10^(-Eval/4))
+        // = 1/(1+e^(-Eval/4*ln(10))
+        // = sigmoid(Eval/4*ln(10))
+        return sigmoid(value * winning_probability_coefficient);
+    }
+
+    // A function that converts the evaluation value to the winning rate [0,1]
+    double winning_percentage_wdl(double value, int ply)
+    {
+        double wdl_w = UCI::win_rate_model_double(value, ply);
+        double wdl_l = UCI::win_rate_model_double(-value, ply);
+        double wdl_d = 1000.0 - wdl_w - wdl_l;
+
+        return (wdl_w + wdl_d / 2.0) / 1000.0;
+    }
+
+    // A function that converts the evaluation value to the winning rate [0,1]
+    double winning_percentage(double value, int ply)
+    {
+        if (use_wdl) {
+            return winning_percentage_wdl(value, ply);
+        }
+        else {
+            return winning_percentage(value);
+        }
+    }
+
+    double calc_cross_entropy_of_winning_percentage(double deep_win_rate, double shallow_eval, int ply)
+    {
+        double p = deep_win_rate;
+        double q = winning_percentage(shallow_eval, ply);
+        return -p * std::log(q) - (1 - p) * std::log(1 - q);
+    }
+
+    double calc_d_cross_entropy_of_winning_percentage(double deep_win_rate, double shallow_eval, int ply)
+    {
+        constexpr double epsilon = 0.000001;
+        double y1 = calc_cross_entropy_of_winning_percentage(deep_win_rate, shallow_eval, ply);
+        double y2 = calc_cross_entropy_of_winning_percentage(deep_win_rate, shallow_eval + epsilon, ply);
+
+        // Divide by the winning_probability_coefficient to match scale with the sigmoidal win rate
+        return ((y2 - y1) / epsilon) / winning_probability_coefficient;
+    }
+
+    double dsigmoid(double x)
+    {
+        // Sigmoid function
+        // f(x) = 1/(1+exp(-x))
+        // the first derivative is
+        // f'(x) = df/dx = f(x)・{ 1-f(x)}
+        // becomes
+
+        return sigmoid(x) * (1.0 - sigmoid(x));
+    }
+
+    // When the objective function is the sum of squares of the difference in winning percentage
 #if defined (LOSS_FUNCTION_IS_WINNING_PERCENTAGE)
 // function to calculate the gradient
-double calc_grad(Value deep, Value shallow, PackedSfenValue& psv)
-{
-	// The square of the win rate difference minimizes it in the objective function.
-	// Objective function J = 1/2m Σ (win_rate(shallow)-win_rate(deep) )^2
-	// However, σ is a sigmoid function that converts the evaluation value into the difference in the winning percentage.
-	// m is the number of samples. shallow is the evaluation value for a shallow search (qsearch()). deep is the evaluation value for deep search.
-	// If W is the feature vector (parameter of the evaluation function) and Xi and Yi are teachers
-	// shallow = W*Xi // * is the Hadamard product, transposing W and meaning X
-	// f(Xi) = win_rate(W*Xi)
-	// If σ(i th deep) = Yi,
-	// J = m/2 Σ (f(Xi)-Yi )^2
-	// becomes a common expression.
-	// W is a vector, and if we write the jth element as Wj, from the chain rule
-	// ∂J/∂Wj = ∂J/∂f ・∂f/∂W ・∂W/∂Wj
-	// = 1/m Σ (f(Xi)-y) ・f'(Xi) ・ 1
+    double calc_grad(Value deep, Value shallow, PackedSfenValue& psv)
+    {
+        // The square of the win rate difference minimizes it in the objective function.
+        // Objective function J = 1/2m Σ (win_rate(shallow)-win_rate(deep) )^2
+        // However, σ is a sigmoid function that converts the evaluation value into the difference in the winning percentage.
+        // m is the number of samples. shallow is the evaluation value for a shallow search (qsearch()). deep is the evaluation value for deep search.
+        // If W is the feature vector (parameter of the evaluation function) and Xi and Yi are teachers
+        // shallow = W*Xi // * is the Hadamard product, transposing W and meaning X
+        // f(Xi) = win_rate(W*Xi)
+        // If σ(i th deep) = Yi,
+        // J = m/2 Σ (f(Xi)-Yi )^2
+        // becomes a common expression.
+        // W is a vector, and if we write the jth element as Wj, from the chain rule
+        // ∂J/∂Wj = ∂J/∂f ・∂f/∂W ・∂W/∂Wj
+        // = 1/m Σ (f(Xi)-y) ・f'(Xi) ・ 1
 
-	// 1/m will be multiplied later, but the contents of Σ can be retained in the array as the value of the gradient.
-	// f'(Xi) = win_rate'(shallow) = sigmoid'(shallow/600) = dsigmoid(shallow / 600) / 600
-	// This /600 at the end is adjusted by the learning rate, so do not write it..
-	// Also, the coefficient of 1/m is unnecessary if you use the update formula that has the automatic gradient adjustment function like Adam and AdaGrad.
-	// Therefore, it is not necessary to save it in memory.
+        // 1/m will be multiplied later, but the contents of Σ can be retained in the array as the value of the gradient.
+        // f'(Xi) = win_rate'(shallow) = sigmoid'(shallow/600) = dsigmoid(shallow / 600) / 600
+        // This /600 at the end is adjusted by the learning rate, so do not write it..
+        // Also, the coefficient of 1/m is unnecessary if you use the update formula that has the automatic gradient adjustment function like Adam and AdaGrad.
+        // Therefore, it is not necessary to save it in memory.
 
-	double p = winning_percentage(deep);
-	double q = winning_percentage(shallow);
-	return (q - p) * dsigmoid(double(shallow) / 600.0);
-}
+        double p = winning_percentage(deep);
+        double q = winning_percentage(shallow);
+        return (q - p) * dsigmoid(double(shallow) / 600.0);
+    }
 #endif
 
 #if defined (LOSS_FUNCTION_IS_CROSS_ENTOROPY)
-double calc_grad(Value deep, Value shallow, const PackedSfenValue& psv)
-{
-	// Objective function with cross entropy
+    double calc_grad(Value deep, Value shallow, const PackedSfenValue& psv)
+    {
+        // Objective function with cross entropy
 
-	// For the concept and nature of cross entropy,
-	// http://nnadl-ja.github.io/nnadl_site_ja/chap3.html#the_cross-entropy_cost_function
-	// http://postd.cc/visual-information-theory-3/
-	// Refer to etc.
+        // For the concept and nature of cross entropy,
+        // http://nnadl-ja.github.io/nnadl_site_ja/chap3.html#the_cross-entropy_cost_function
+        // http://postd.cc/visual-information-theory-3/
+        // Refer to etc.
 
-	// Objective function design)
-	// We want to make the distribution of p closer to the distribution of q → Think of it as the problem of minimizing the cross entropy between the probability distributions of p and q.
-	// J = H(p,q) =-Σ p(x) log(q(x)) = -p log q-(1-p) log(1-q)
-	// x
+        // Objective function design)
+        // We want to make the distribution of p closer to the distribution of q → Think of it as the problem of minimizing the cross entropy between the probability distributions of p and q.
+        // J = H(p,q) =-Σ p(x) log(q(x)) = -p log q-(1-p) log(1-q)
+        // x
 
-	// p is a constant and q is a Wi function (q = σ(W・Xi) ).
-	// ∂J/∂Wi = -p・q'/q-(1-p)(1-q)'/(1-q)
-	// = ...
-	// = q-p.
+        // p is a constant and q is a Wi function (q = σ(W・Xi) ).
+        // ∂J/∂Wi = -p・q'/q-(1-p)(1-q)'/(1-q)
+        // = ...
+        // = q-p.
 
-	double p = winning_percentage(deep);
-	double q = winning_percentage(shallow);
+        double p = winning_percentage(deep);
+        double q = winning_percentage(shallow);
 
-	return q - p;
-}
+        return q - p;
+    }
 #endif
 
 #if defined ( LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE )
-double calc_grad(Value deep, Value shallow, const PackedSfenValue& psv)
-{
-	// Version that does not pass the winning percentage function
-	// This, unless EVAL_LIMIT is set low, trying to match the evaluation value with the shape of the end stage
-	// eval may exceed the range of eval.
-	return shallow - deep;
-}
+    double calc_grad(Value deep, Value shallow, const PackedSfenValue& psv)
+    {
+        // Version that does not pass the winning percentage function
+        // This, unless EVAL_LIMIT is set low, trying to match the evaluation value with the shape of the end stage
+        // eval may exceed the range of eval.
+        return shallow - deep;
+    }
 #endif
 
 #if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
 
-// A constant used in elmo (WCSC27). Adjustment required.
-// Since elmo does not internally divide the expression, the value is different.
-// You can set this value with the learn command.
-// 0.33 is equivalent to the constant (0.5) used in elmo (WCSC27)
-double ELMO_LAMBDA = 0.33;
-double ELMO_LAMBDA2 = 0.33;
-double ELMO_LAMBDA_LIMIT = 32000;
+    // A constant used in elmo (WCSC27). Adjustment required.
+    // Since elmo does not internally divide the expression, the value is different.
+    // You can set this value with the learn command.
+    // 0.33 is equivalent to the constant (0.5) used in elmo (WCSC27)
+    double ELMO_LAMBDA = 0.33;
+    double ELMO_LAMBDA2 = 0.33;
+    double ELMO_LAMBDA_LIMIT = 32000;
 
-double calc_grad(Value teacher_signal, Value shallow , const PackedSfenValue& psv)
-{
-	// elmo (WCSC27) method
-	// Correct with the actual game wins and losses.
+    double calc_grad(Value teacher_signal, Value shallow, const PackedSfenValue& psv)
+    {
+        // elmo (WCSC27) method
+        // Correct with the actual game wins and losses.
 
-	// Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
-	double scaled_teacher_signal = teacher_signal;
-	// Normalize to [0.0, 1.0].
-	scaled_teacher_signal = (scaled_teacher_signal - src_score_min_value) / (src_score_max_value - src_score_min_value);
-	// Scale to [dest_score_min_value, dest_score_max_value].
-	scaled_teacher_signal = scaled_teacher_signal * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
+        // Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
+        double scaled_teacher_signal = teacher_signal;
+        // Normalize to [0.0, 1.0].
+        scaled_teacher_signal = (scaled_teacher_signal - src_score_min_value) / (src_score_max_value - src_score_min_value);
+        // Scale to [dest_score_min_value, dest_score_max_value].
+        scaled_teacher_signal = scaled_teacher_signal * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
 
-	const double q = winning_percentage(shallow, psv.gamePly);
-	// Teacher winning probability.
-	double p = scaled_teacher_signal;
-	if (convert_teacher_signal_to_winning_probability) {
-		p = winning_percentage(scaled_teacher_signal, psv.gamePly);
-	}
+        const double q = winning_percentage(shallow, psv.gamePly);
+        // Teacher winning probability.
+        double p = scaled_teacher_signal;
+        if (convert_teacher_signal_to_winning_probability) {
+            p = winning_percentage(scaled_teacher_signal, psv.gamePly);
+        }
 
-	// Use 1 as the correction term if the expected win rate is 1, 0 if you lose, and 0.5 if you draw.
-	// game_result = 1,0,-1 so add 1 and divide by 2.
-	const double t = double(psv.game_result + 1) / 2;
+        // Use 1 as the correction term if the expected win rate is 1, 0 if you lose, and 0.5 if you draw.
+        // game_result = 1,0,-1 so add 1 and divide by 2.
+        const double t = double(psv.game_result + 1) / 2;
 
-	// If the evaluation value in deep search exceeds ELMO_LAMBDA_LIMIT, apply ELMO_LAMBDA2 instead of ELMO_LAMBDA.
-	const double lambda = (abs(teacher_signal) >= ELMO_LAMBDA_LIMIT) ? ELMO_LAMBDA2 : ELMO_LAMBDA;
+        // If the evaluation value in deep search exceeds ELMO_LAMBDA_LIMIT, apply ELMO_LAMBDA2 instead of ELMO_LAMBDA.
+        const double lambda = (abs(teacher_signal) >= ELMO_LAMBDA_LIMIT) ? ELMO_LAMBDA2 : ELMO_LAMBDA;
 
-	double grad;
-	if (use_wdl) {
-		double dce_p = calc_d_cross_entropy_of_winning_percentage(p, shallow, psv.gamePly);
-		double dce_t = calc_d_cross_entropy_of_winning_percentage(t, shallow, psv.gamePly);
-		grad = lambda * dce_p + (1.0 - lambda) * dce_t;
-	}
-	else {
-		// Use the actual win rate as a correction term.
-		// This is the idea of ​​elmo (WCSC27), modern O-parts.
-		grad = lambda * (q - p) + (1.0 - lambda) * (q - t);
-	}
+        double grad;
+        if (use_wdl) {
+            double dce_p = calc_d_cross_entropy_of_winning_percentage(p, shallow, psv.gamePly);
+            double dce_t = calc_d_cross_entropy_of_winning_percentage(t, shallow, psv.gamePly);
+            grad = lambda * dce_p + (1.0 - lambda) * dce_t;
+        }
+        else {
+            // Use the actual win rate as a correction term.
+            // This is the idea of ​​elmo (WCSC27), modern O-parts.
+            grad = lambda * (q - p) + (1.0 - lambda) * (q - t);
+        }
 
-	return grad;
-}
+        return grad;
+    }
 
-// Calculate cross entropy during learning
-// The individual cross entropy of the win/loss term and win rate term of the elmo expression is returned to the arguments cross_entropy_eval and cross_entropy_win.
-void calc_cross_entropy(Value teacher_signal, Value shallow, const PackedSfenValue& psv,
-	double& cross_entropy_eval, double& cross_entropy_win, double& cross_entropy,
-	double& entropy_eval, double& entropy_win, double& entropy)
-{
-	// Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
-	double scaled_teacher_signal = teacher_signal;
-	// Normalize to [0.0, 1.0].
-	scaled_teacher_signal = (scaled_teacher_signal - src_score_min_value) / (src_score_max_value - src_score_min_value);
-	// Scale to [dest_score_min_value, dest_score_max_value].
-	scaled_teacher_signal = scaled_teacher_signal * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
+    // Calculate cross entropy during learning
+    // The individual cross entropy of the win/loss term and win rate term of the elmo expression is returned to the arguments cross_entropy_eval and cross_entropy_win.
+    void calc_cross_entropy(Value teacher_signal, Value shallow, const PackedSfenValue& psv,
+        double& cross_entropy_eval, double& cross_entropy_win, double& cross_entropy,
+        double& entropy_eval, double& entropy_win, double& entropy)
+    {
+        // Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
+        double scaled_teacher_signal = teacher_signal;
+        // Normalize to [0.0, 1.0].
+        scaled_teacher_signal = (scaled_teacher_signal - src_score_min_value) / (src_score_max_value - src_score_min_value);
+        // Scale to [dest_score_min_value, dest_score_max_value].
+        scaled_teacher_signal = scaled_teacher_signal * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
 
-	// Teacher winning probability.
-	double p = scaled_teacher_signal;
-	if (convert_teacher_signal_to_winning_probability) {
-		p = winning_percentage(scaled_teacher_signal);
-	}
-	const double q /* eval_winrate    */ = winning_percentage(shallow);
-	const double t = double(psv.game_result + 1) / 2;
+        // Teacher winning probability.
+        double p = scaled_teacher_signal;
+        if (convert_teacher_signal_to_winning_probability) {
+            p = winning_percentage(scaled_teacher_signal);
+        }
+        const double q /* eval_winrate    */ = winning_percentage(shallow);
+        const double t = double(psv.game_result + 1) / 2;
 
-	constexpr double epsilon = 0.000001;
+        constexpr double epsilon = 0.000001;
 
-	// If the evaluation value in deep search exceeds ELMO_LAMBDA_LIMIT, apply ELMO_LAMBDA2 instead of ELMO_LAMBDA.
-	const double lambda = (abs(teacher_signal) >= ELMO_LAMBDA_LIMIT) ? ELMO_LAMBDA2 : ELMO_LAMBDA;
+        // If the evaluation value in deep search exceeds ELMO_LAMBDA_LIMIT, apply ELMO_LAMBDA2 instead of ELMO_LAMBDA.
+        const double lambda = (abs(teacher_signal) >= ELMO_LAMBDA_LIMIT) ? ELMO_LAMBDA2 : ELMO_LAMBDA;
 
-	const double m = (1.0 - lambda) * t + lambda * p;
+        const double m = (1.0 - lambda) * t + lambda * p;
 
-	cross_entropy_eval =
-		(-p * std::log(q + epsilon) - (1.0 - p) * std::log(1.0 - q + epsilon));
-	cross_entropy_win =
-		(-t * std::log(q + epsilon) - (1.0 - t) * std::log(1.0 - q + epsilon));
-	entropy_eval =
-		(-p * std::log(p + epsilon) - (1.0 - p) * std::log(1.0 - p + epsilon));
-	entropy_win =
-		(-t * std::log(t + epsilon) - (1.0 - t) * std::log(1.0 - t + epsilon));
+        cross_entropy_eval =
+            (-p * std::log(q + epsilon) - (1.0 - p) * std::log(1.0 - q + epsilon));
+        cross_entropy_win =
+            (-t * std::log(q + epsilon) - (1.0 - t) * std::log(1.0 - q + epsilon));
+        entropy_eval =
+            (-p * std::log(p + epsilon) - (1.0 - p) * std::log(1.0 - p + epsilon));
+        entropy_win =
+            (-t * std::log(t + epsilon) - (1.0 - t) * std::log(1.0 - t + epsilon));
 
-	cross_entropy =
-		(-m * std::log(q + epsilon) - (1.0 - m) * std::log(1.0 - q + epsilon));
-	entropy =
-		(-m * std::log(m + epsilon) - (1.0 - m) * std::log(1.0 - m + epsilon));
-}
+        cross_entropy =
+            (-m * std::log(q + epsilon) - (1.0 - m) * std::log(1.0 - q + epsilon));
+        entropy =
+            (-m * std::log(m + epsilon) - (1.0 - m) * std::log(1.0 - m + epsilon));
+    }
 
 #endif
 
 
-// Other variations may be prepared as the objective function..
-
-
-double calc_grad(Value shallow, const PackedSfenValue& psv) {
-	return calc_grad((Value)psv.score, shallow, psv);
-}
-
-// Sfen reader
-struct SfenReader
-{
-	// Do not use std::random_device().  Because it always the same integers on MinGW.
-	SfenReader(int thread_num) : prng(std::chrono::system_clock::now().time_since_epoch().count())
-	{
-		packed_sfens.resize(thread_num);
-		total_read = 0;
-		total_done = 0;
-		last_done = 0;
-		next_update_weights = 0;
-		save_count = 0;
-		end_of_files = false;
-		no_shuffle = false;
-		stop_flag = false;
-
-		hash.resize(READ_SFEN_HASH_SIZE);
-	}
-
-	~SfenReader()
-	{
-		if (file_worker_thread.joinable())
-			file_worker_thread.join();
-
-		for (auto p : packed_sfens)
-			delete p;
-		for (auto p : packed_sfens_pool)
-			delete p;
-	}
-
-	// number of phases used for calculation such as mse
-	// mini-batch size = 1M is standard, so 0.2% of that should be negligible in terms of time.
-	//Since search() is performed with depth = 1 in calculation of move match rate, simple comparison is not possible...
-	const uint64_t sfen_for_mse_size = 2000;
-
-	// Load the phase for calculation such as mse.
-	void read_for_mse()
-	{
-		auto th = Threads.main();
-		Position& pos = th->rootPos;
-		for (uint64_t i = 0; i < sfen_for_mse_size; ++i)
-		{
-			PackedSfenValue ps;
-			if (!read_to_thread_buffer(0, ps))
-			{
-				cout << "Error! read packed sfen , failed." << endl;
-				break;
-			}
-			sfen_for_mse.push_back(ps);
-
-			// Get the hash key.
-			StateInfo si;
-			pos.set_from_packed_sfen(ps.sfen,&si,th);
-			sfen_for_mse_hash.insert(pos.key());
-		}
-	}
-
-	void read_validation_set(const string file_name, int eval_limit)
-	{
-		ifstream fs(file_name, ios::binary);
-
-		while (fs)
-		{
-			PackedSfenValue p;
-			if (fs.read((char*)&p, sizeof(PackedSfenValue)))
-			{
-				if (eval_limit < abs(p.score))
-					continue;
-				if (!use_draw_games_in_validation && p.game_result == 0)
-					continue;
-				sfen_for_mse.push_back(p);
-			} else {
-				break;
-			}
-		}
-	}
-
-	// Number of phases buffered by each thread 0.1M phases. 4M phase at 40HT
-	const size_t THREAD_BUFFER_SIZE = 10 * 1000;
-
-	// Buffer for reading files (If this is made larger, the shuffle becomes larger and the phases may vary.
-	// If it is too large, the memory consumption will increase.
-	// SFEN_READ_SIZE is a multiple of THREAD_BUFFER_SIZE.
-	const size_t SFEN_READ_SIZE = LEARN_SFEN_READ_SIZE;
-
-	// [ASYNC] Thread returns one aspect. Otherwise returns false.
-	bool read_to_thread_buffer(size_t thread_id, PackedSfenValue& ps)
-	{
-		// If there are any positions left in the thread buffer, retrieve one and return it.
-		auto& thread_ps = packed_sfens[thread_id];
-
-		// Fill the read buffer if there is no remaining buffer, but if it doesn't even exist, finish.
-		if ((thread_ps == nullptr || thread_ps->size() == 0) // If the buffer is empty, fill it.
-			&& !read_to_thread_buffer_impl(thread_id))
-			return false;
-
-		// read_to_thread_buffer_impl() returned true,
-		// Since the filling of the thread buffer with the phase has been completed successfully
-		// thread_ps->rbegin() is alive.
-
-		ps = *(thread_ps->rbegin());
-		thread_ps->pop_back();
-
-		// If you've run out of buffers, call delete yourself to free this buffer.
-		if (thread_ps->size() == 0)
-		{
-
-			delete thread_ps;
-			thread_ps = nullptr;
-		}
-
-		return true;
-	}
-
-	// [ASYNC] Read some aspects into thread buffer.
-	bool read_to_thread_buffer_impl(size_t thread_id)
-	{
-		while (true)
-		{
-			{
-				std::unique_lock<std::mutex> lk(mutex);
-				// If you can fill from the file buffer, that's fine.
-				if (packed_sfens_pool.size() != 0)
-				{
-					// It seems that filling is possible, so fill and finish.
-
-					packed_sfens[thread_id] = packed_sfens_pool.front();
-					packed_sfens_pool.pop_front();
-
-					total_read += THREAD_BUFFER_SIZE;
-
-					return true;
-				}
-			}
-
-			// The file to read is already gone. No more use.
-			if (end_of_files)
-				return false;
-
-			// Waiting for file worker to fill packed_sfens_pool.
-			// The mutex isn't locked, so it should fill up soon.
-			sleep(1);
-		}
-
-	}
-
-	// Start a thread that loads the phase file in the background.
-	void start_file_read_worker()
-	{
-		file_worker_thread = std::thread([&] { this->file_read_worker(); });
-	}
-
-	// for file read-only threads
-	void file_read_worker()
-	{
-		auto open_next_file = [&]()
-		{
-			if (fs.is_open())
-				fs.close();
-
-			// no more
-			if (filenames.size() == 0)
-				return false;
-
-			// Get the next file name.
-			string filename = *filenames.rbegin();
-			filenames.pop_back();
-
-			fs.open(filename, ios::in | ios::binary);
-			cout << "open filename = " << filename << endl;
-			assert(fs);
-
-			return true;
-		};
-
-		while (true)
-		{
-			// Wait for the buffer to run out.
-			// This size() is read only, so you don't need to lock it.
-			while (!stop_flag && packed_sfens_pool.size() >= SFEN_READ_SIZE / THREAD_BUFFER_SIZE)
-				sleep(100);
-			if (stop_flag)
-				return;
-
-			PSVector sfens;
-			sfens.reserve(SFEN_READ_SIZE);
-
-			// Read from the file into the file buffer.
-			while (sfens.size() < SFEN_READ_SIZE)
-			{
-				PackedSfenValue p;
-				if (fs.read((char*)&p, sizeof(PackedSfenValue)))
-				{
-					sfens.push_back(p);
-				} else
-				{
-					// read failure
-					if (!open_next_file())
-					{
-						// There was no next file. Abon.
-						cout << "..end of files." << endl;
-						end_of_files = true;
-						return;
-					}
-				}
-			}
-
-			// Shuffle the read phase data.
-			// random shuffle by Fisher-Yates algorithm
-
-			if (!no_shuffle)
-			{
-				auto size = sfens.size();
-				for (size_t i = 0; i < size; ++i)
-					swap(sfens[i], sfens[(size_t)(prng.rand((uint64_t)size - i) + i)]);
-			}
-
-			// Divide this by THREAD_BUFFER_SIZE. There should be size pieces.
-			// SFEN_READ_SIZE shall be a multiple of THREAD_BUFFER_SIZE.
-			assert((SFEN_READ_SIZE % THREAD_BUFFER_SIZE)==0);
-
-			auto size = size_t(SFEN_READ_SIZE / THREAD_BUFFER_SIZE);
-			std::vector<PSVector*> ptrs;
-			ptrs.reserve(size);
-
-			for (size_t i = 0; i < size; ++i)
-			{
-				// Delete this pointer on the receiving side.
-				PSVector* ptr = new PSVector();
-				ptr->resize(THREAD_BUFFER_SIZE);
-				memcpy(&((*ptr)[0]), &sfens[i * THREAD_BUFFER_SIZE], sizeof(PackedSfenValue) * THREAD_BUFFER_SIZE);
-
-				ptrs.push_back(ptr);
-			}
-
-			// Since sfens is ready, look at the occasion and copy
-			{
-				std::unique_lock<std::mutex> lk(mutex);
-
-				// You can ignore this time because you just copy the pointer...
-				// The mutex lock is required because the contents of packed_sfens_pool are changed.
-
-				for (size_t i = 0; i < size; ++i)
-					packed_sfens_pool.push_back(ptrs[i]);
-			}
-		}
-	}
-
-	// sfen files
-	vector<string> filenames;
-
-	// number of phases read (file to memory buffer)
-	atomic<uint64_t> total_read;
-
-	// number of processed phases
-	atomic<uint64_t> total_done;
-
-	// number of cases processed so far
-	uint64_t last_done;
-
-	// If total_read exceeds this value, update_weights() and calculate mse.
-	uint64_t next_update_weights;
-
-	uint64_t save_count;
-
-	// Do not shuffle when reading the phase.
-	bool no_shuffle;
-
-	bool stop_flag;
-
-	// Determine if it is a phase for calculating rmse.
-	// (The computational aspects of rmse should not be used for learning.)
-	bool is_for_rmse(Key key) const
-	{
-			return sfen_for_mse_hash.count(key) != 0;
-	}
-
-	// hash to limit the reading of the same situation
-	// Is there too many 64 million phases? Or Not really..
-	// It must be 2**N because it will be used as the mask to calculate hash_index.
-	static const uint64_t READ_SFEN_HASH_SIZE = 64 * 1024 * 1024;
-	vector<Key> hash; // 64MB*8 = 512MB
-
-	// test phase for mse calculation
-	PSVector sfen_for_mse;
-
-protected:
-
-	// worker thread reading file in background
-	std::thread file_worker_thread;
-
-	// Random number to shuffle when reading the phase
-	PRNG prng;
-
-	// Did you read the files and reached the end?
-	atomic<bool> end_of_files;
+    // Other variations may be prepared as the objective function..
+
+
+    double calc_grad(Value shallow, const PackedSfenValue& psv) {
+        return calc_grad((Value)psv.score, shallow, psv);
+    }
+
+    // Sfen reader
+    struct SfenReader
+    {
+        // number of phases used for calculation such as mse
+        // mini-batch size = 1M is standard, so 0.2% of that should be negligible in terms of time.
+        //Since search() is performed with depth = 1 in calculation of move match rate, simple comparison is not possible...
+        static constexpr uint64_t sfen_for_mse_size = 2000;
+
+        // Number of phases buffered by each thread 0.1M phases. 4M phase at 40HT
+        static constexpr size_t THREAD_BUFFER_SIZE = 10 * 1000;
+
+        // Buffer for reading files (If this is made larger, the shuffle becomes larger and the phases may vary.
+        // If it is too large, the memory consumption will increase.
+        // SFEN_READ_SIZE is a multiple of THREAD_BUFFER_SIZE.
+        static constexpr const size_t SFEN_READ_SIZE = LEARN_SFEN_READ_SIZE;
+
+        // Do not use std::random_device().  Because it always the same integers on MinGW.
+        SfenReader(int thread_num) : prng(std::chrono::system_clock::now().time_since_epoch().count())
+        {
+            packed_sfens.resize(thread_num);
+            total_read = 0;
+            total_done = 0;
+            last_done = 0;
+            next_update_weights = 0;
+            save_count = 0;
+            end_of_files = false;
+            no_shuffle = false;
+            stop_flag = false;
+
+            hash.resize(READ_SFEN_HASH_SIZE);
+        }
+
+        ~SfenReader()
+        {
+            if (file_worker_thread.joinable())
+                file_worker_thread.join();
+        }
+
+        // Load the phase for calculation such as mse.
+        void read_for_mse()
+        {
+            auto th = Threads.main();
+            Position& pos = th->rootPos;
+            for (uint64_t i = 0; i < sfen_for_mse_size; ++i)
+            {
+                PackedSfenValue ps;
+                if (!read_to_thread_buffer(0, ps))
+                {
+                    cout << "Error! read packed sfen , failed." << endl;
+                    break;
+                }
+                sfen_for_mse.push_back(ps);
+
+                // Get the hash key.
+                StateInfo si;
+                pos.set_from_packed_sfen(ps.sfen, &si, th);
+                sfen_for_mse_hash.insert(pos.key());
+            }
+        }
+
+        void read_validation_set(const string& file_name, int eval_limit)
+        {
+            ifstream input(file_name, ios::binary);
+
+            while (input)
+            {
+                PackedSfenValue p;
+                if (input.read(reinterpret_cast<char*>(&p), sizeof(PackedSfenValue)))
+                {
+                    if (eval_limit < abs(p.score))
+                        continue;
+                    if (!use_draw_games_in_validation && p.game_result == 0)
+                        continue;
+                    sfen_for_mse.push_back(p);
+                }
+                else 
+                {
+                    break;
+                }
+            }
+        }
+
+        // [ASYNC] Thread returns one aspect. Otherwise returns false.
+        bool read_to_thread_buffer(size_t thread_id, PackedSfenValue& ps)
+        {
+            // If there are any positions left in the thread buffer, retrieve one and return it.
+            auto& thread_ps = packed_sfens[thread_id];
+
+            // Fill the read buffer if there is no remaining buffer, but if it doesn't even exist, finish.
+            if ((thread_ps == nullptr || thread_ps->size() == 0) // If the buffer is empty, fill it.
+                && !read_to_thread_buffer_impl(thread_id))
+                return false;
+
+            // read_to_thread_buffer_impl() returned true,
+            // Since the filling of the thread buffer with the phase has been completed successfully
+            // thread_ps->rbegin() is alive.
+
+            ps = *(thread_ps->rbegin());
+            thread_ps->pop_back();
+
+            // If you've run out of buffers, call delete yourself to free this buffer.
+            if (thread_ps->size() == 0)
+            {
+                thread_ps.reset();
+            }
+
+            return true;
+        }
+
+        // [ASYNC] Read some aspects into thread buffer.
+        bool read_to_thread_buffer_impl(size_t thread_id)
+        {
+            while (true)
+            {
+                {
+                    std::unique_lock<std::mutex> lk(mutex);
+                    // If you can fill from the file buffer, that's fine.
+                    if (packed_sfens_pool.size() != 0)
+                    {
+                        // It seems that filling is possible, so fill and finish.
+
+                        packed_sfens[thread_id] = std::move(packed_sfens_pool.front());
+                        packed_sfens_pool.pop_front();
+
+                        total_read += THREAD_BUFFER_SIZE;
+
+                        return true;
+                    }
+                }
+
+                // The file to read is already gone. No more use.
+                if (end_of_files)
+                    return false;
+
+                // Waiting for file worker to fill packed_sfens_pool.
+                // The mutex isn't locked, so it should fill up soon.
+                sleep(1);
+            }
+
+        }
+
+        // Start a thread that loads the phase file in the background.
+        void start_file_read_worker()
+        {
+            file_worker_thread = std::thread([&] { this->file_read_worker(); });
+        }
+
+        // for file read-only threads
+        void file_read_worker()
+        {
+            auto open_next_file = [&]()
+            {
+                if (fs.is_open())
+                    fs.close();
+
+                // no more
+                if (filenames.size() == 0)
+                    return false;
+
+                // Get the next file name.
+                string filename = *filenames.rbegin();
+                filenames.pop_back();
+
+                fs.open(filename, ios::in | ios::binary);
+                cout << "open filename = " << filename << endl;
+                assert(fs);
+
+                return true;
+            };
+
+            while (true)
+            {
+                // Wait for the buffer to run out.
+                // This size() is read only, so you don't need to lock it.
+                while (!stop_flag && packed_sfens_pool.size() >= SFEN_READ_SIZE / THREAD_BUFFER_SIZE)
+                    sleep(100);
+                if (stop_flag)
+                    return;
+
+                PSVector sfens;
+                sfens.reserve(SFEN_READ_SIZE);
+
+                // Read from the file into the file buffer.
+                while (sfens.size() < SFEN_READ_SIZE)
+                {
+                    PackedSfenValue p;
+                    if (fs.read(reinterpret_cast<char*>(&p), sizeof(PackedSfenValue)))
+                    {
+                        sfens.push_back(p);
+                    }
+                    else
+                    {
+                        // read failure
+                        if (!open_next_file())
+                        {
+                            // There was no next file. Abon.
+                            cout << "..end of files." << endl;
+                            end_of_files = true;
+                            return;
+                        }
+                    }
+                }
+
+                // Shuffle the read phase data.
+                // random shuffle by Fisher-Yates algorithm
+
+                if (!no_shuffle)
+                {
+                    auto size = sfens.size();
+                    for (size_t i = 0; i < size; ++i)
+                        swap(sfens[i], sfens[(size_t)(prng.rand((uint64_t)size - i) + i)]);
+                }
+
+                // Divide this by THREAD_BUFFER_SIZE. There should be size pieces.
+                // SFEN_READ_SIZE shall be a multiple of THREAD_BUFFER_SIZE.
+                assert((SFEN_READ_SIZE % THREAD_BUFFER_SIZE) == 0);
+
+                auto size = size_t(SFEN_READ_SIZE / THREAD_BUFFER_SIZE);
+                std::vector<std::unique_ptr<PSVector>> buffers;
+                buffers.reserve(size);
+
+                for (size_t i = 0; i < size; ++i)
+                {
+                    // Delete this pointer on the receiving side.
+                    auto buf = std::make_unique<PSVector>();
+                    buf->resize(THREAD_BUFFER_SIZE);
+                    memcpy(buf->data(), &sfens[i * THREAD_BUFFER_SIZE], sizeof(PackedSfenValue) * THREAD_BUFFER_SIZE);
+
+                    buffers.emplace_back(std::move(buf));
+                }
+
+                // Since sfens is ready, look at the occasion and copy
+                {
+                    std::unique_lock<std::mutex> lk(mutex);
+
+                    // You can ignore this time because you just copy the pointer...
+                    // The mutex lock is required because the contents of packed_sfens_pool are changed.
+
+                    for (auto& buf : buffers)
+                        packed_sfens_pool.emplace_back(std::move(buf));
+                }
+            }
+        }
+
+        // sfen files
+        vector<string> filenames;
+
+        // number of phases read (file to memory buffer)
+        atomic<uint64_t> total_read;
+
+        // number of processed phases
+        atomic<uint64_t> total_done;
+
+        // number of cases processed so far
+        uint64_t last_done;
+
+        // If total_read exceeds this value, update_weights() and calculate mse.
+        uint64_t next_update_weights;
+
+        uint64_t save_count;
+
+        // Do not shuffle when reading the phase.
+        bool no_shuffle;
+
+        bool stop_flag;
+
+        // Determine if it is a phase for calculating rmse.
+        // (The computational aspects of rmse should not be used for learning.)
+        bool is_for_rmse(Key key) const
+        {
+            return sfen_for_mse_hash.count(key) != 0;
+        }
+
+        // hash to limit the reading of the same situation
+        // Is there too many 64 million phases? Or Not really..
+        // It must be 2**N because it will be used as the mask to calculate hash_index.
+        static const uint64_t READ_SFEN_HASH_SIZE = 64 * 1024 * 1024;
+        vector<Key> hash; // 64MB*8 = 512MB
+
+        // test phase for mse calculation
+        PSVector sfen_for_mse;
+
+    protected:
+
+        // worker thread reading file in background
+        std::thread file_worker_thread;
+
+        // Random number to shuffle when reading the phase
+        PRNG prng;
+
+        // Did you read the files and reached the end?
+        atomic<bool> end_of_files;
 
 
-	// handle of sfen file
-	std::fstream fs;
+        // handle of sfen file
+        std::fstream fs;
 
-	// sfen for each thread
-	// (When the thread is used up, the thread should call delete to release it.)
-	std::vector<PSVector*> packed_sfens;
+        // sfen for each thread
+        // (When the thread is used up, the thread should call delete to release it.)
+        std::vector<std::unique_ptr<PSVector>> packed_sfens;
 
-	// Mutex when accessing packed_sfens_pool
-	std::mutex mutex;
+        // Mutex when accessing packed_sfens_pool
+        std::mutex mutex;
 
-	// pool of sfen. The worker thread read from the file is added here.
-	// Each worker thread fills its own packed_sfens[thread_id] from here.
-	// * Lock and access the mutex.
-	std::list<PSVector*> packed_sfens_pool;
+        // pool of sfen. The worker thread read from the file is added here.
+        // Each worker thread fills its own packed_sfens[thread_id] from here.
+        // * Lock and access the mutex.
+        std::list<std::unique_ptr<PSVector>> packed_sfens_pool;
 
-	// Hold the hash key so that the mse calculation phase is not used for learning.
-	std::unordered_set<Key> sfen_for_mse_hash;
-};
+        // Hold the hash key so that the mse calculation phase is not used for learning.
+        std::unordered_set<Key> sfen_for_mse_hash;
+    };
 
-// Class to generate sfen with multiple threads
-struct LearnerThink: public MultiThink
-{
-	LearnerThink(SfenReader& sr_):sr(sr_),stop_flag(false), save_only_once(false)
-	{
+    // Class to generate sfen with multiple threads
+    struct LearnerThink : public MultiThink
+    {
+        LearnerThink(SfenReader& sr_) :sr(sr_), stop_flag(false), save_only_once(false)
+        {
 #if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
-		learn_sum_cross_entropy_eval = 0.0;
-		learn_sum_cross_entropy_win = 0.0;
-		learn_sum_cross_entropy = 0.0;
-		learn_sum_entropy_eval = 0.0;
-		learn_sum_entropy_win = 0.0;
-		learn_sum_entropy = 0.0;
+            learn_sum_cross_entropy_eval = 0.0;
+            learn_sum_cross_entropy_win = 0.0;
+            learn_sum_cross_entropy = 0.0;
+            learn_sum_entropy_eval = 0.0;
+            learn_sum_entropy_win = 0.0;
+            learn_sum_entropy = 0.0;
 #endif
 #if defined(EVAL_NNUE)
-		newbob_scale = 1.0;
-		newbob_decay = 1.0;
-		newbob_num_trials = 2;
-		best_loss = std::numeric_limits<double>::infinity();
-		latest_loss_sum = 0.0;
-		latest_loss_count = 0;
+            newbob_scale = 1.0;
+            newbob_decay = 1.0;
+            newbob_num_trials = 2;
+            best_loss = std::numeric_limits<double>::infinity();
+            latest_loss_sum = 0.0;
+            latest_loss_count = 0;
 #endif
-	}
+        }
 
-	virtual void thread_worker(size_t thread_id);
+        virtual void thread_worker(size_t thread_id);
 
-	// Start a thread that loads the phase file in the background.
-	void start_file_read_worker() { sr.start_file_read_worker(); }
+        // Start a thread that loads the phase file in the background.
+        void start_file_read_worker() { sr.start_file_read_worker(); }
 
-	// save merit function parameters to a file
-	bool save(bool is_final=false);
+        // save merit function parameters to a file
+        bool save(bool is_final = false);
 
-	// sfen reader
-	SfenReader& sr;
+        // sfen reader
+        SfenReader& sr;
 
-	// Learning iteration counter
-	uint64_t epoch = 0;
+        // Learning iteration counter
+        uint64_t epoch = 0;
 
-	// Mini batch size size. Be sure to set it on the side that uses this class.
-	uint64_t mini_batch_size = 1000*1000;
+        // Mini batch size size. Be sure to set it on the side that uses this class.
+        uint64_t mini_batch_size = 1000 * 1000;
 
-	bool stop_flag;
+        bool stop_flag;
 
-	// Discount rate
-	double discount_rate;
+        // Discount rate
+        double discount_rate;
 
-	// Option to exclude early stage from learning
-	int reduction_gameply;
+        // Option to exclude early stage from learning
+        int reduction_gameply;
 
-	// Option not to learn kk/kkp/kpp/kppp
-	std::array<bool,4> freeze;
+        // Option not to learn kk/kkp/kpp/kppp
+        std::array<bool, 4> freeze;
 
-	// If the absolute value of the evaluation value of the deep search of the teacher phase exceeds this value, discard the teacher phase.
-	int eval_limit;
+        // If the absolute value of the evaluation value of the deep search of the teacher phase exceeds this value, discard the teacher phase.
+        int eval_limit;
 
-	// Flag whether to dig a folder each time the evaluation function is saved.
-	// If true, do not dig the folder.
-	bool save_only_once;
+        // Flag whether to dig a folder each time the evaluation function is saved.
+        // If true, do not dig the folder.
+        bool save_only_once;
 
-	// --- loss calculation
+        // --- loss calculation
 
 #if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
-	// For calculation of learning data loss
-	atomic<double> learn_sum_cross_entropy_eval;
-	atomic<double> learn_sum_cross_entropy_win;
-	atomic<double> learn_sum_cross_entropy;
-	atomic<double> learn_sum_entropy_eval;
-	atomic<double> learn_sum_entropy_win;
-	atomic<double> learn_sum_entropy;
+    // For calculation of learning data loss
+        atomic<double> learn_sum_cross_entropy_eval;
+        atomic<double> learn_sum_cross_entropy_win;
+        atomic<double> learn_sum_cross_entropy;
+        atomic<double> learn_sum_entropy_eval;
+        atomic<double> learn_sum_entropy_win;
+        atomic<double> learn_sum_entropy;
 #endif
 
 #if defined(EVAL_NNUE)
-	shared_timed_mutex nn_mutex;
-	double newbob_scale;
-	double newbob_decay;
-	int newbob_num_trials;
-	double best_loss;
-	double latest_loss_sum;
-	uint64_t latest_loss_count;
-	std::string best_nn_directory;
+        shared_timed_mutex nn_mutex;
+        double newbob_scale;
+        double newbob_decay;
+        int newbob_num_trials;
+        double best_loss;
+        double latest_loss_sum;
+        uint64_t latest_loss_count;
+        std::string best_nn_directory;
 #endif
 
-	uint64_t eval_save_interval;
-	uint64_t loss_output_interval;
-	uint64_t mirror_percentage;
+        uint64_t eval_save_interval;
+        uint64_t loss_output_interval;
+        uint64_t mirror_percentage;
 
-	// Loss calculation.
-	// done: Number of phases targeted this time
-	void calc_loss(size_t thread_id , uint64_t done);
+        // Loss calculation.
+        // done: Number of phases targeted this time
+        void calc_loss(size_t thread_id, uint64_t done);
 
-	// Define the loss calculation in ↑ as a task and execute it
-	TaskDispatcher task_dispatcher;
-};
+        // Define the loss calculation in ↑ as a task and execute it
+        TaskDispatcher task_dispatcher;
+    };
 
-void LearnerThink::calc_loss(size_t thread_id, uint64_t done)
-{
-	// There is no point in hitting the replacement table, so at this timing the generation of the replacement table is updated.
-	// It doesn't matter if you have disabled the substitution table.
-	TT.new_search();
+    void LearnerThink::calc_loss(size_t thread_id, uint64_t done)
+    {
+        // There is no point in hitting the replacement table, so at this timing the generation of the replacement table is updated.
+        // It doesn't matter if you have disabled the substitution table.
+        TT.new_search();
 
 
 #if defined(EVAL_NNUE)
-	std::cout << "PROGRESS: " << now_string() << ", ";
-	std::cout << sr.total_done << " sfens";
-	std::cout << ", iteration " << epoch;
-	std::cout << ", eta = " << Eval::get_eta() << ", ";
+        std::cout << "PROGRESS: " << now_string() << ", ";
+        std::cout << sr.total_done << " sfens";
+        std::cout << ", iteration " << epoch;
+        std::cout << ", eta = " << Eval::get_eta() << ", ";
 #endif
 
 #if !defined(LOSS_FUNCTION_IS_ELMO_METHOD)
-	double sum_error = 0;
-	double sum_error2 = 0;
-	double sum_error3 = 0;
+        double sum_error = 0;
+        double sum_error2 = 0;
+        double sum_error3 = 0;
 #endif
 
 #if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
-	// For calculation of verification data loss
-	atomic<double> test_sum_cross_entropy_eval,test_sum_cross_entropy_win,test_sum_cross_entropy;
-	atomic<double> test_sum_entropy_eval,test_sum_entropy_win,test_sum_entropy;
-	test_sum_cross_entropy_eval = 0;
-	test_sum_cross_entropy_win = 0;
-	test_sum_cross_entropy = 0;
-	test_sum_entropy_eval = 0;
-	test_sum_entropy_win = 0;
-	test_sum_entropy = 0;
+        // For calculation of verification data loss
+        atomic<double> test_sum_cross_entropy_eval, test_sum_cross_entropy_win, test_sum_cross_entropy;
+        atomic<double> test_sum_entropy_eval, test_sum_entropy_win, test_sum_entropy;
+        test_sum_cross_entropy_eval = 0;
+        test_sum_cross_entropy_win = 0;
+        test_sum_cross_entropy = 0;
+        test_sum_entropy_eval = 0;
+        test_sum_entropy_win = 0;
+        test_sum_entropy = 0;
 
-	// norm for learning
-	atomic<double> sum_norm;
-	sum_norm = 0;
+        // norm for learning
+        atomic<double> sum_norm;
+        sum_norm = 0;
 #endif
 
-	// The number of times the pv first move of deep search matches the pv first move of search(1).
-	atomic<int> move_accord_count;
-	move_accord_count = 0;
+        // The number of times the pv first move of deep search matches the pv first move of search(1).
+        atomic<int> move_accord_count;
+        move_accord_count = 0;
 
-	// Display the value of eval() in the initial stage of Hirate and see the shaking.
-	auto th = Threads[thread_id];
-	auto& pos = th->rootPos;
-	StateInfo si;
-  pos.set(StartFEN, false, &si, th);
-  std::cout << "hirate eval = " << Eval::evaluate(pos);
+        // Display the value of eval() in the initial stage of Hirate and see the shaking.
+        auto th = Threads[thread_id];
+        auto& pos = th->rootPos;
+        StateInfo si;
+        pos.set(StartFEN, false, &si, th);
+        std::cout << "hirate eval = " << Eval::evaluate(pos);
 
-	//Eval::print_eval_stat(pos);
+        //Eval::print_eval_stat(pos);
 
-	// It's better to parallelize here, but it's a bit troublesome because the search before slave has not finished.
-	// I created a mechanism to call task, so I will use it.
+        // It's better to parallelize here, but it's a bit troublesome because the search before slave has not finished.
+        // I created a mechanism to call task, so I will use it.
 
-	// The number of tasks to do.
-	atomic<int> task_count;
-	task_count = (int)sr.sfen_for_mse.size();
-	task_dispatcher.task_reserve(task_count);
+        // The number of tasks to do.
+        atomic<int> task_count;
+        task_count = (int)sr.sfen_for_mse.size();
+        task_dispatcher.task_reserve(task_count);
 
-	// Create a task to search for the situation and give it to each thread.
-	for (const auto& ps : sr.sfen_for_mse)
-	{
-		// Assign work to each thread using TaskDispatcher.
-		// A task definition for that.
-		// It is not possible to capture pos used in ↑, so specify the variables you want to capture one by one.
-		auto task = [&ps,&test_sum_cross_entropy_eval,&test_sum_cross_entropy_win,&test_sum_cross_entropy,&test_sum_entropy_eval,&test_sum_entropy_win,&test_sum_entropy, &sum_norm,&task_count ,&move_accord_count](size_t thread_id)
-		{
-			// Does C++ properly capture a new ps instance for each loop?.
-			auto th = Threads[thread_id];
-			auto& pos = th->rootPos;
-			StateInfo si;
-			if (pos.set_from_packed_sfen(ps.sfen ,&si, th) != 0)
-			{
-				// Unfortunately, as an sfen for rmse calculation, an invalid sfen was drawn.
-				cout << "Error! : illegal packed sfen " << pos.fen() << endl;
-			}
+        // Create a task to search for the situation and give it to each thread.
+        for (const auto& ps : sr.sfen_for_mse)
+        {
+            // Assign work to each thread using TaskDispatcher.
+            // A task definition for that.
+            // It is not possible to capture pos used in ↑, so specify the variables you want to capture one by one.
+            auto task = 
+                [
+                    &ps, 
+                    &test_sum_cross_entropy_eval, 
+                    &test_sum_cross_entropy_win, 
+                    &test_sum_cross_entropy, 
+                    &test_sum_entropy_eval, 
+                    &test_sum_entropy_win, 
+                    &test_sum_entropy, 
+                    &sum_norm, 
+                    &task_count, 
+                    &move_accord_count
+                ](size_t task_thread_id)
+            {
+                // Does C++ properly capture a new ps instance for each loop?.
+                auto task_th = Threads[task_thread_id];
+                auto& task_pos = task_th->rootPos;
+                StateInfo task_si;
+                if (task_pos.set_from_packed_sfen(ps.sfen, &task_si, task_th) != 0)
+                {
+                    // Unfortunately, as an sfen for rmse calculation, an invalid sfen was drawn.
+                    cout << "Error! : illegal packed sfen " << task_pos.fen() << endl;
+                }
 
-			// Evaluation value for shallow search
-			// The value of evaluate() may be used, but when calculating loss, learn_cross_entropy and
-			// Use qsearch() because it is difficult to compare the values.
-			// EvalHash has been disabled in advance. (If not, the same value will be returned every time)
-			auto r = qsearch(pos);
+                // Evaluation value for shallow search
+                // The value of evaluate() may be used, but when calculating loss, learn_cross_entropy and
+                // Use qsearch() because it is difficult to compare the values.
+                // EvalHash has been disabled in advance. (If not, the same value will be returned every time)
+                auto task_search_result = qsearch(task_pos);
 
-			auto shallow_value = r.first;
-			{
-				const auto rootColor = pos.side_to_move();
-				const auto pv = r.second;
-				std::vector<StateInfo,AlignedAllocator<StateInfo>> states(pv.size());
-				for (size_t i = 0; i < pv.size(); ++i)
-				{
-					pos.do_move(pv[i], states[i]);
-					Eval::NNUE::update_eval(pos);
-				}
-				shallow_value = (rootColor == pos.side_to_move()) ? Eval::evaluate(pos) : -Eval::evaluate(pos);
-				for (auto it = pv.rbegin(); it != pv.rend(); ++it)
-					pos.undo_move(*it);
-			}
+                auto shallow_value = task_search_result.first;
+                {
+                    const auto rootColor = task_pos.side_to_move();
+                    const auto pv = task_search_result.second;
+                    std::vector<StateInfo, AlignedAllocator<StateInfo>> states(pv.size());
+                    for (size_t i = 0; i < pv.size(); ++i)
+                    {
+                        task_pos.do_move(pv[i], states[i]);
+                        Eval::NNUE::update_eval(task_pos);
+                    }
+                    shallow_value = (rootColor == task_pos.side_to_move()) ? Eval::evaluate(task_pos) : -Eval::evaluate(task_pos);
+                    for (auto it = pv.rbegin(); it != pv.rend(); ++it)
+                        task_pos.undo_move(*it);
+                }
 
-			// Evaluation value of deep search
-			auto deep_value = (Value)ps.score;
+                // Evaluation value of deep search
+                auto deep_value = (Value)ps.score;
 
-			// Note) This code does not consider when eval_limit is specified in the learn command.
+                // Note) This code does not consider when eval_limit is specified in the learn command.
 
-			// --- error calculation
+                // --- error calculation
 
 #if !defined(LOSS_FUNCTION_IS_ELMO_METHOD)
-			auto grad = calc_grad(deep_value, shallow_value, ps);
+                auto grad = calc_grad(deep_value, shallow_value, ps);
 
-			// something like rmse
-			sum_error += grad*grad;
-			// Add the absolute value of the gradient
-			sum_error2 += abs(grad);
-			// Add the absolute value of the difference between the evaluation values
-			sum_error3 += abs(shallow_value - deep_value);
+                // something like rmse
+                sum_error += grad * grad;
+                // Add the absolute value of the gradient
+                sum_error2 += abs(grad);
+                // Add the absolute value of the difference between the evaluation values
+                sum_error3 += abs(shallow_value - deep_value);
 #endif
 
-			// --- calculation of cross entropy
+                // --- calculation of cross entropy
 
-			// For the time being, regarding the win rate and loss terms only in the elmo method
-			// Calculate and display the cross entropy.
+                // For the time being, regarding the win rate and loss terms only in the elmo method
+                // Calculate and display the cross entropy.
 
 #if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
-			double test_cross_entropy_eval, test_cross_entropy_win, test_cross_entropy;
-			double test_entropy_eval, test_entropy_win, test_entropy;
-			calc_cross_entropy(deep_value, shallow_value, ps, test_cross_entropy_eval, test_cross_entropy_win, test_cross_entropy, test_entropy_eval, test_entropy_win, test_entropy);
-			// The total cross entropy need not be abs() by definition.
-			test_sum_cross_entropy_eval += test_cross_entropy_eval;
-			test_sum_cross_entropy_win += test_cross_entropy_win;
-			test_sum_cross_entropy += test_cross_entropy;
-			test_sum_entropy_eval += test_entropy_eval;
-			test_sum_entropy_win += test_entropy_win;
-			test_sum_entropy += test_entropy;
-			sum_norm += (double)abs(shallow_value);
+                double test_cross_entropy_eval, test_cross_entropy_win, test_cross_entropy;
+                double test_entropy_eval, test_entropy_win, test_entropy;
+                calc_cross_entropy(deep_value, shallow_value, ps, test_cross_entropy_eval, test_cross_entropy_win, test_cross_entropy, test_entropy_eval, test_entropy_win, test_entropy);
+                // The total cross entropy need not be abs() by definition.
+                test_sum_cross_entropy_eval += test_cross_entropy_eval;
+                test_sum_cross_entropy_win += test_cross_entropy_win;
+                test_sum_cross_entropy += test_cross_entropy;
+                test_sum_entropy_eval += test_entropy_eval;
+                test_sum_entropy_win += test_entropy_win;
+                test_sum_entropy += test_entropy;
+                sum_norm += (double)abs(shallow_value);
 #endif
 
-			// Determine if the teacher's move and the score of the shallow search match
-			{
-				auto r = search(pos,1);
-				if ((uint16_t)r.second[0] == ps.move)
-					move_accord_count.fetch_add(1, std::memory_order_relaxed);
-			}
+                // Determine if the teacher's move and the score of the shallow search match
+                {
+                    auto r = search(task_pos, 1);
+                    if ((uint16_t)r.second[0] == ps.move)
+                        move_accord_count.fetch_add(1, std::memory_order_relaxed);
+                }
 
-			// Reduced one task because I did it
-			--task_count;
-		};
+                // Reduced one task because I did it
+                --task_count;
+            };
 
-		// Throw the defined task to slave.
-		task_dispatcher.push_task_async(task);
-	}
+            // Throw the defined task to slave.
+            task_dispatcher.push_task_async(task);
+        }
 
-	// join yourself as a slave
-	task_dispatcher.on_idle(thread_id);
+        // join yourself as a slave
+        task_dispatcher.on_idle(thread_id);
 
-	// wait for all tasks to complete
-	while (task_count)
-		sleep(1);
+        // wait for all tasks to complete
+        while (task_count)
+            sleep(1);
 
 #if !defined(LOSS_FUNCTION_IS_ELMO_METHOD)
-	// rmse = root mean square error: mean square error
-	// mae = mean absolute error: mean absolute error
-	auto dsig_rmse = std::sqrt(sum_error / (sfen_for_mse.size() + epsilon));
-	auto dsig_mae = sum_error2 / (sfen_for_mse.size() + epsilon);
-	auto eval_mae = sum_error3 / (sfen_for_mse.size() + epsilon);
-	cout << " , dsig rmse = " << dsig_rmse << " , dsig mae = " << dsig_mae
-		<< " , eval mae = " << eval_mae;
+        // rmse = root mean square error: mean square error
+        // mae = mean absolute error: mean absolute error
+        auto dsig_rmse = std::sqrt(sum_error / (sfen_for_mse.size() + epsilon));
+        auto dsig_mae = sum_error2 / (sfen_for_mse.size() + epsilon);
+        auto eval_mae = sum_error3 / (sfen_for_mse.size() + epsilon);
+        cout << " , dsig rmse = " << dsig_rmse << " , dsig mae = " << dsig_mae
+            << " , eval mae = " << eval_mae;
 #endif
 
 #if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
 #if defined(EVAL_NNUE)
-	latest_loss_sum += test_sum_cross_entropy - test_sum_entropy;
-	latest_loss_count += sr.sfen_for_mse.size();
+        latest_loss_sum += test_sum_cross_entropy - test_sum_entropy;
+        latest_loss_count += sr.sfen_for_mse.size();
 #endif
 
-// learn_cross_entropy may be called train cross entropy in the world of machine learning,
-// When omitting the acronym, it is nice to be able to distinguish it from test cross entropy(tce) by writing it as lce.
+        // learn_cross_entropy may be called train cross entropy in the world of machine learning,
+        // When omitting the acronym, it is nice to be able to distinguish it from test cross entropy(tce) by writing it as lce.
 
-	if (sr.sfen_for_mse.size() && done)
-	{
-		cout
-			<< " , test_cross_entropy_eval = "  << test_sum_cross_entropy_eval / sr.sfen_for_mse.size()
-			<< " , test_cross_entropy_win = "   << test_sum_cross_entropy_win / sr.sfen_for_mse.size()
-			<< " , test_entropy_eval = "        << test_sum_entropy_eval / sr.sfen_for_mse.size()
-			<< " , test_entropy_win = "         << test_sum_entropy_win / sr.sfen_for_mse.size()
-			<< " , test_cross_entropy = "       << test_sum_cross_entropy / sr.sfen_for_mse.size()
-			<< " , test_entropy = "             << test_sum_entropy / sr.sfen_for_mse.size()
-			<< " , norm = "						<< sum_norm
-			<< " , move accuracy = "			<< (move_accord_count * 100.0 / sr.sfen_for_mse.size()) << "%";
-		if (done != static_cast<uint64_t>(-1))
-		{
-			cout
-				<< " , learn_cross_entropy_eval = " << learn_sum_cross_entropy_eval / done
-				<< " , learn_cross_entropy_win = "  << learn_sum_cross_entropy_win / done
-				<< " , learn_entropy_eval = "       << learn_sum_entropy_eval / done
-				<< " , learn_entropy_win = "        << learn_sum_entropy_win / done
-				<< " , learn_cross_entropy = "      << learn_sum_cross_entropy / done
-				<< " , learn_entropy = "            << learn_sum_entropy / done;
-		}
-		cout << endl;
-	}
-	else {
-		cout << "Error! : sr.sfen_for_mse.size() = " << sr.sfen_for_mse.size() << " ,  done = " << done << endl;
-	}
+        if (sr.sfen_for_mse.size() && done)
+        {
+            cout
+                << " , test_cross_entropy_eval = " << test_sum_cross_entropy_eval / sr.sfen_for_mse.size()
+                << " , test_cross_entropy_win = " << test_sum_cross_entropy_win / sr.sfen_for_mse.size()
+                << " , test_entropy_eval = " << test_sum_entropy_eval / sr.sfen_for_mse.size()
+                << " , test_entropy_win = " << test_sum_entropy_win / sr.sfen_for_mse.size()
+                << " , test_cross_entropy = " << test_sum_cross_entropy / sr.sfen_for_mse.size()
+                << " , test_entropy = " << test_sum_entropy / sr.sfen_for_mse.size()
+                << " , norm = " << sum_norm
+                << " , move accuracy = " << (move_accord_count * 100.0 / sr.sfen_for_mse.size()) << "%";
+            if (done != static_cast<uint64_t>(-1))
+            {
+                cout
+                    << " , learn_cross_entropy_eval = " << learn_sum_cross_entropy_eval / done
+                    << " , learn_cross_entropy_win = " << learn_sum_cross_entropy_win / done
+                    << " , learn_entropy_eval = " << learn_sum_entropy_eval / done
+                    << " , learn_entropy_win = " << learn_sum_entropy_win / done
+                    << " , learn_cross_entropy = " << learn_sum_cross_entropy / done
+                    << " , learn_entropy = " << learn_sum_entropy / done;
+            }
+            cout << endl;
+        }
+        else {
+            cout << "Error! : sr.sfen_for_mse.size() = " << sr.sfen_for_mse.size() << " ,  done = " << done << endl;
+        }
 
-	// Clear 0 for next time.
-	learn_sum_cross_entropy_eval = 0.0;
-	learn_sum_cross_entropy_win = 0.0;
-	learn_sum_cross_entropy = 0.0;
-	learn_sum_entropy_eval = 0.0;
-	learn_sum_entropy_win = 0.0;
-	learn_sum_entropy = 0.0;
+        // Clear 0 for next time.
+        learn_sum_cross_entropy_eval = 0.0;
+        learn_sum_cross_entropy_win = 0.0;
+        learn_sum_cross_entropy = 0.0;
+        learn_sum_entropy_eval = 0.0;
+        learn_sum_entropy_win = 0.0;
+        learn_sum_entropy = 0.0;
 #else
-	<< endl;
+        << endl;
 #endif
-}
+    }
 
 
-void LearnerThink::thread_worker(size_t thread_id)
-{
+    void LearnerThink::thread_worker(size_t thread_id)
+    {
 #if defined(_OPENMP)
-	omp_set_num_threads((int)Options["Threads"]);
+        omp_set_num_threads((int)Options["Threads"]);
 #endif
 
-	auto th = Threads[thread_id];
-	auto& pos = th->rootPos;
+        auto th = Threads[thread_id];
+        auto& pos = th->rootPos;
 
-	while (true)
-	{
-	// display mse (this is sometimes done only for thread 0)
-	// Immediately after being read from the file...
+        while (true)
+        {
+            // display mse (this is sometimes done only for thread 0)
+            // Immediately after being read from the file...
 
 #if defined(EVAL_NNUE)
-		// Lock the evaluation function so that it is not used during updating.
-		shared_lock<shared_timed_mutex> read_lock(nn_mutex, defer_lock);
-		if (sr.next_update_weights <= sr.total_done ||
-		    (thread_id != 0 && !read_lock.try_lock()))
+        // Lock the evaluation function so that it is not used during updating.
+            shared_lock<shared_timed_mutex> read_lock(nn_mutex, defer_lock);
+            if (sr.next_update_weights <= sr.total_done ||
+                (thread_id != 0 && !read_lock.try_lock()))
 #else
-		if (sr.next_update_weights <= sr.total_done)
+            if (sr.next_update_weights <= sr.total_done)
 #endif
-		{
-			if (thread_id != 0)
-			{
-				// Wait except thread_id == 0.
+            {
+                if (thread_id != 0)
+                {
+                    // Wait except thread_id == 0.
 
-				if (stop_flag)
-					break;
+                    if (stop_flag)
+                        break;
 
-				// I want to parallelize rmse calculation etc., so if task() is loaded, process it.
-				task_dispatcher.on_idle(thread_id);
-				continue;
-			}
-			else
-			{
-				// Only thread_id == 0 performs the following update process.
+                    // I want to parallelize rmse calculation etc., so if task() is loaded, process it.
+                    task_dispatcher.on_idle(thread_id);
+                    continue;
+                }
+                else
+                {
+                    // Only thread_id == 0 performs the following update process.
 
-				// The weight array is not updated for the first time.
-				if (sr.next_update_weights == 0)
-				{
-					sr.next_update_weights += mini_batch_size;
-					continue;
-				}
+                    // The weight array is not updated for the first time.
+                    if (sr.next_update_weights == 0)
+                    {
+                        sr.next_update_weights += mini_batch_size;
+                        continue;
+                    }
 
 #if !defined(EVAL_NNUE)
-				// Output the current time. Output every time.
-				std::cout << sr.total_done << " sfens , at " << now_string() << std::endl;
+                    // Output the current time. Output every time.
+                    std::cout << sr.total_done << " sfens , at " << now_string() << std::endl;
 
-				// Reflect the gradient in the weight array at this timing. The calculation of the gradient is just right for each 1M phase in terms of mini-batch.
-				Eval::update_weights(epoch , freeze);
+                    // Reflect the gradient in the weight array at this timing. The calculation of the gradient is just right for each 1M phase in terms of mini-batch.
+                    Eval::update_weights(epoch, freeze);
 
-				// Display epoch and current eta for debugging.
-				std::cout << "epoch = " << epoch << " , eta = " << Eval::get_eta() << std::endl;
+                    // Display epoch and current eta for debugging.
+                    std::cout << "epoch = " << epoch << " , eta = " << Eval::get_eta() << std::endl;
 #else
-				{
-					// update parameters
+                    {
+                        // update parameters
 
-					// Lock the evaluation function so that it is not used during updating.
-					lock_guard<shared_timed_mutex> write_lock(nn_mutex);
-					Eval::NNUE::UpdateParameters(epoch);
-				}
+                        // Lock the evaluation function so that it is not used during updating.
+                        lock_guard<shared_timed_mutex> write_lock(nn_mutex);
+                        Eval::NNUE::UpdateParameters(epoch);
+                    }
 #endif
-				++epoch;
+                    ++epoch;
 
-				// Save once every 1 billion phases.
+                    // Save once every 1 billion phases.
 
-				// However, the elapsed time during update_weights() and calc_rmse() is ignored.
-				if (++sr.save_count * mini_batch_size >= eval_save_interval)
-				{
-					sr.save_count = 0;
+                    // However, the elapsed time during update_weights() and calc_rmse() is ignored.
+                    if (++sr.save_count * mini_batch_size >= eval_save_interval)
+                    {
+                        sr.save_count = 0;
 
-					// During this time, as the gradient calculation proceeds, the value becomes too large and I feel annoyed, so stop other threads.
-					const bool converged = save();
-					if (converged)
-					{
-						stop_flag = true;
-						sr.stop_flag = true;
-						break;
-					}
-				}
+                        // During this time, as the gradient calculation proceeds, the value becomes too large and I feel annoyed, so stop other threads.
+                        const bool converged = save();
+                        if (converged)
+                        {
+                            stop_flag = true;
+                            sr.stop_flag = true;
+                            break;
+                        }
+                    }
 
-				// Calculate rmse. This is done for samples of 10,000 phases.
-				// If you do with 40 cores, update_weights every 1 million phases
-				// I don't think it's so good to be tiring.
-				static uint64_t loss_output_count = 0;
-				if (++loss_output_count * mini_batch_size >= loss_output_interval)
-				{
-					loss_output_count = 0;
+                    // Calculate rmse. This is done for samples of 10,000 phases.
+                    // If you do with 40 cores, update_weights every 1 million phases
+                    // I don't think it's so good to be tiring.
+                    static uint64_t loss_output_count = 0;
+                    if (++loss_output_count * mini_batch_size >= loss_output_interval)
+                    {
+                        loss_output_count = 0;
 
-					// Number of cases processed this time
-					uint64_t done = sr.total_done - sr.last_done;
+                        // Number of cases processed this time
+                        uint64_t done = sr.total_done - sr.last_done;
 
-					// loss calculation
-					calc_loss(thread_id , done);
+                        // loss calculation
+                        calc_loss(thread_id, done);
 
 #if defined(EVAL_NNUE)
-					Eval::NNUE::CheckHealth();
+                        Eval::NNUE::CheckHealth();
 #endif
 
-					// Make a note of how far you have totaled.
-					sr.last_done = sr.total_done;
-				}
+                        // Make a note of how far you have totaled.
+                        sr.last_done = sr.total_done;
+                    }
 
-				// Next time, I want you to do this series of processing again when you process only mini_batch_size.
-				sr.next_update_weights += mini_batch_size;
+                    // Next time, I want you to do this series of processing again when you process only mini_batch_size.
+                    sr.next_update_weights += mini_batch_size;
 
-				// Since I was waiting for the update of this sr.next_update_weights except the main thread,
-				// Once this value is updated, it will start moving again.
-			}
-		}
+                    // Since I was waiting for the update of this sr.next_update_weights except the main thread,
+                    // Once this value is updated, it will start moving again.
+                }
+            }
 
-		PackedSfenValue ps;
-	RetryRead:;
-		if (!sr.read_to_thread_buffer(thread_id, ps))
-		{
-			// ran out of thread pool for my thread.
-			// Because there are almost no phases left,
-			// Terminate all other threads.
+            PackedSfenValue ps;
+        RetryRead:;
+            if (!sr.read_to_thread_buffer(thread_id, ps))
+            {
+                // ran out of thread pool for my thread.
+                // Because there are almost no phases left,
+                // Terminate all other threads.
 
-			stop_flag = true;
-			break;
-		}
+                stop_flag = true;
+                break;
+            }
 
-		// The evaluation value exceeds the learning target value.
-		// Ignore this aspect information.
-		if (eval_limit <abs(ps.score))
-			goto RetryRead;
+            // The evaluation value exceeds the learning target value.
+            // Ignore this aspect information.
+            if (eval_limit < abs(ps.score))
+                goto RetryRead;
 
 
-		if (!use_draw_games_in_training && ps.game_result == 0)
-			goto RetryRead;
+            if (!use_draw_games_in_training && ps.game_result == 0)
+                goto RetryRead;
 
 
-		// Skip over the opening phase
-		if (ps.gamePly < prng.rand(reduction_gameply))
-			goto RetryRead;
+            // Skip over the opening phase
+            if (ps.gamePly < prng.rand(reduction_gameply))
+                goto RetryRead;
 
 #if 0
-		auto sfen = pos.sfen_unpack(ps.data);
-		pos.set(sfen);
+            auto sfen = pos.sfen_unpack(ps.data);
+            pos.set(sfen);
 #endif
-		// ↑ Since it is slow when passing through sfen, I made a dedicated function.
-		StateInfo si;
-		const bool mirror = prng.rand(100) < mirror_percentage;
-		if (pos.set_from_packed_sfen(ps.sfen,&si,th,mirror) != 0)
-		{
-			// I got a strange sfen. Should be debugged!
-			// Since it is an illegal sfen, it may not be displayed with pos.sfen(), but it is better than not.
-			cout << "Error! : illigal packed sfen = " << pos.fen() << endl;
-			goto RetryRead;
-		}
+            // ↑ Since it is slow when passing through sfen, I made a dedicated function.
+            StateInfo si;
+            const bool mirror = prng.rand(100) < mirror_percentage;
+            if (pos.set_from_packed_sfen(ps.sfen, &si, th, mirror) != 0)
+            {
+                // I got a strange sfen. Should be debugged!
+                // Since it is an illegal sfen, it may not be displayed with pos.sfen(), but it is better than not.
+                cout << "Error! : illigal packed sfen = " << pos.fen() << endl;
+                goto RetryRead;
+            }
 #if !defined(EVAL_NNUE)
-		{
-			auto key = pos.key();
-			// Exclude the phase used for rmse calculation.
-			if (sr.is_for_rmse(key) && skip_duplicated_positions_in_training)
-				goto RetryRead;
+            {
+                auto key = pos.key();
+                // Exclude the phase used for rmse calculation.
+                if (sr.is_for_rmse(key) && skip_duplicated_positions_in_training)
+                    goto RetryRead;
 
-			// Exclude the most recently used aspect.
-			auto hash_index = size_t(key & (sr.READ_SFEN_HASH_SIZE - 1));
-			auto key2 = sr.hash[hash_index];
-			if (key == key2 && skip_duplicated_positions_in_training)
-				goto RetryRead;
-			sr.hash[hash_index] = key; // Replace with the current key.
-		}
+                // Exclude the most recently used aspect.
+                auto hash_index = size_t(key & (sr.READ_SFEN_HASH_SIZE - 1));
+                auto key2 = sr.hash[hash_index];
+                if (key == key2 && skip_duplicated_positions_in_training)
+                    goto RetryRead;
+                sr.hash[hash_index] = key; // Replace with the current key.
+            }
 #endif
 
-		// There is a possibility that all the pieces are blocked and stuck.
-		// Also, the declaration win phase is excluded from learning because you cannot go to leaf with PV moves.
-		// (shouldn't write out such teacher aspect itself, but may have written it out with an old generation routine)
-	// Skip the position if there are no legal moves (=checkmated or stalemate).
-		if (MoveList<LEGAL>(pos).size() == 0)
-			goto RetryRead;
+            // There is a possibility that all the pieces are blocked and stuck.
+            // Also, the declaration win phase is excluded from learning because you cannot go to leaf with PV moves.
+            // (shouldn't write out such teacher aspect itself, but may have written it out with an old generation routine)
+        // Skip the position if there are no legal moves (=checkmated or stalemate).
+            if (MoveList<LEGAL>(pos).size() == 0)
+                goto RetryRead;
 
-		// I can read it, so try displaying it.
-		//		cout << pos << value << endl;
+            // I can read it, so try displaying it.
+            //      cout << pos << value << endl;
 
-		// Evaluation value of shallow search (qsearch)
-		auto r = qsearch(pos);
-		auto pv = r.second;
+            // Evaluation value of shallow search (qsearch)
+            auto r = qsearch(pos);
+            auto pv = r.second;
 
-		// Evaluation value of deep search
-		auto deep_value = (Value)ps.score;
+            // Evaluation value of deep search
+            auto deep_value = (Value)ps.score;
 
-		// I feel that the mini batch has a better gradient.
-		// Go to the leaf node as it is, add only to the gradient array, and later try AdaGrad at the time of rmse aggregation.
+            // I feel that the mini batch has a better gradient.
+            // Go to the leaf node as it is, add only to the gradient array, and later try AdaGrad at the time of rmse aggregation.
 
-		auto rootColor = pos.side_to_move();
+            auto rootColor = pos.side_to_move();
 
-		// If the initial PV is different, it is better not to use it for learning.
-		// If it is the result of searching a completely different place, it may become noise.
-		// It may be better not to study where the difference in evaluation values ​​is too large.
+            // If the initial PV is different, it is better not to use it for learning.
+            // If it is the result of searching a completely different place, it may become noise.
+            // It may be better not to study where the difference in evaluation values ​​is too large.
 
 #if 0
-		// If you do this, about 13% of the phases will be excluded from the learning target. Good and bad are subtle.
-		if (pv.size() >= 1 && (uint16_t)pv[0] != ps.move)
-		{
-			// dbg_hit_on(false);
-			continue;
-		}
+        // If you do this, about 13% of the phases will be excluded from the learning target. Good and bad are subtle.
+            if (pv.size() >= 1 && (uint16_t)pv[0] != ps.move)
+            {
+                // dbg_hit_on(false);
+                continue;
+            }
 #endif
 
 #if 0
-		// It may be better not to study where the difference in evaluation values ​​is too large.
-		// → It's okay because it passes the win rate function... About 30% of the phases are out of the scope of learning...
-		if (abs((int16_t)r.first - ps.score) >= Eval::PawnValue * 4)
-		{
-//			dbg_hit_on(false);
-			continue;
-		}
-		//		dbg_hit_on(true);
+            // It may be better not to study where the difference in evaluation values ​​is too large.
+            // → It's okay because it passes the win rate function... About 30% of the phases are out of the scope of learning...
+            if (abs((int16_t)r.first - ps.score) >= Eval::PawnValue * 4)
+            {
+                //          dbg_hit_on(false);
+                continue;
+            }
+            //      dbg_hit_on(true);
 #endif
 
-		int ply = 0;
+            int ply = 0;
 
-		// A helper function that adds the gradient to the current phase.
-		auto pos_add_grad = [&]() {
-			// Use the value of evaluate in leaf as shallow_value.
-			// Using the return value of qsearch() as shallow_value,
-			// If PV is interrupted in the middle, the phase where evaluate() is called to calculate the gradient, and
-			// I don't think this is a very desirable property, as the aspect that gives that gradient will be different.
-			// I have turned off the substitution table, but since the pv array has not been updated due to one stumbling block etc...
+            // A helper function that adds the gradient to the current phase.
+            auto pos_add_grad = [&]() {
+                // Use the value of evaluate in leaf as shallow_value.
+                // Using the return value of qsearch() as shallow_value,
+                // If PV is interrupted in the middle, the phase where evaluate() is called to calculate the gradient, and
+                // I don't think this is a very desirable property, as the aspect that gives that gradient will be different.
+                // I have turned off the substitution table, but since the pv array has not been updated due to one stumbling block etc...
 
-			Value shallow_value = (rootColor == pos.side_to_move()) ? Eval::evaluate(pos) : -Eval::evaluate(pos);
+                Value shallow_value = (rootColor == pos.side_to_move()) ? Eval::evaluate(pos) : -Eval::evaluate(pos);
 
 #if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
-			// Calculate loss for training data
-			double learn_cross_entropy_eval, learn_cross_entropy_win, learn_cross_entropy;
-			double learn_entropy_eval, learn_entropy_win, learn_entropy;
-			calc_cross_entropy(deep_value, shallow_value, ps, learn_cross_entropy_eval, learn_cross_entropy_win, learn_cross_entropy, learn_entropy_eval, learn_entropy_win, learn_entropy);
-			learn_sum_cross_entropy_eval += learn_cross_entropy_eval;
-			learn_sum_cross_entropy_win += learn_cross_entropy_win;
-			learn_sum_cross_entropy += learn_cross_entropy;
-			learn_sum_entropy_eval += learn_entropy_eval;
-			learn_sum_entropy_win += learn_entropy_win;
-			learn_sum_entropy += learn_entropy;
+                // Calculate loss for training data
+                double learn_cross_entropy_eval, learn_cross_entropy_win, learn_cross_entropy;
+                double learn_entropy_eval, learn_entropy_win, learn_entropy;
+                calc_cross_entropy(deep_value, shallow_value, ps, learn_cross_entropy_eval, learn_cross_entropy_win, learn_cross_entropy, learn_entropy_eval, learn_entropy_win, learn_entropy);
+                learn_sum_cross_entropy_eval += learn_cross_entropy_eval;
+                learn_sum_cross_entropy_win += learn_cross_entropy_win;
+                learn_sum_cross_entropy += learn_cross_entropy;
+                learn_sum_entropy_eval += learn_entropy_eval;
+                learn_sum_entropy_win += learn_entropy_win;
+                learn_sum_entropy += learn_entropy;
 #endif
 
 #if !defined(EVAL_NNUE)
-			// Slope
-			double dj_dw = calc_grad(deep_value, shallow_value, ps);
+                // Slope
+                double dj_dw = calc_grad(deep_value, shallow_value, ps);
 
-			// Add jd_dw as the gradient (∂J/∂Wj) for the feature vector currently appearing in the leaf node.
+                // Add jd_dw as the gradient (∂J/∂Wj) for the feature vector currently appearing in the leaf node.
 
-			// If it is not PV termination, apply a discount rate.
-			if (discount_rate != 0 && ply != (int)pv.size())
-				dj_dw *= discount_rate;
+                // If it is not PV termination, apply a discount rate.
+                if (discount_rate != 0 && ply != (int)pv.size())
+                    dj_dw *= discount_rate;
 
-			// Since we have reached leaf, add the gradient to the features that appear in this phase.
-			// Update based on gradient later.
-			Eval::add_grad(pos, rootColor, dj_dw, freeze);
+                // Since we have reached leaf, add the gradient to the features that appear in this phase.
+                // Update based on gradient later.
+                Eval::add_grad(pos, rootColor, dj_dw, freeze);
 #else
-			const double example_weight =
-			    (discount_rate != 0 && ply != (int)pv.size()) ? discount_rate : 1.0;
-			Eval::NNUE::AddExample(pos, rootColor, ps, example_weight);
+                const double example_weight =
+                    (discount_rate != 0 && ply != (int)pv.size()) ? discount_rate : 1.0;
+                Eval::NNUE::AddExample(pos, rootColor, ps, example_weight);
 #endif
 
-			// Since the processing is completed, the counter of the processed number is incremented
-			sr.total_done++;
-		};
+                // Since the processing is completed, the counter of the processed number is incremented
+                sr.total_done++;
+            };
 
-		StateInfo state[MAX_PLY]; // PV of qsearch cannot be so long.
-		bool illegal_move = false;
-		for (auto m : pv)
-		{
-			// I shouldn't be an illegal player.
-			// An illegal move sometimes comes here...
-			if (!pos.pseudo_legal(m) || !pos.legal(m))
-			{
-				//cout << pos << m << endl;
-				//assert(false);
-				illegal_move = true;
-				break;
-			}
+            StateInfo state[MAX_PLY]; // PV of qsearch cannot be so long.
+            bool illegal_move = false;
+            for (auto m : pv)
+            {
+                // I shouldn't be an illegal player.
+                // An illegal move sometimes comes here...
+                if (!pos.pseudo_legal(m) || !pos.legal(m))
+                {
+                    //cout << pos << m << endl;
+                    //assert(false);
+                    illegal_move = true;
+                    break;
+                }
 
-			// Processing when adding the gradient to the node on each PV.
-			//If discount_rate is 0, this process is not performed.
-			if (discount_rate != 0)
-				pos_add_grad();
+                // Processing when adding the gradient to the node on each PV.
+                //If discount_rate is 0, this process is not performed.
+                if (discount_rate != 0)
+                    pos_add_grad();
 
-			pos.do_move(m, state[ply++]);
+                pos.do_move(m, state[ply++]);
 
-			// Since the value of evaluate in leaf is used, the difference is updated.
-			Eval::NNUE::update_eval(pos);
-		}
+                // Since the value of evaluate in leaf is used, the difference is updated.
+                Eval::NNUE::update_eval(pos);
+            }
 
-		if (illegal_move) {
-			sync_cout << "An illical move was detected... Excluded the position from the learning data..." << sync_endl;
-			continue;
-		}
+            if (illegal_move) {
+                sync_cout << "An illical move was detected... Excluded the position from the learning data..." << sync_endl;
+                continue;
+            }
 
-		// Since we have reached the end phase of PV, add the slope here.
-		pos_add_grad();
+            // Since we have reached the end phase of PV, add the slope here.
+            pos_add_grad();
 
-		// rewind the phase
-		for (auto it = pv.rbegin(); it != pv.rend(); ++it)
-			pos.undo_move(*it);
+            // rewind the phase
+            for (auto it = pv.rbegin(); it != pv.rend(); ++it)
+                pos.undo_move(*it);
 
 #if 0
-		// When adding the gradient to the root phase
-		shallow_value = (rootColor == pos.side_to_move()) ? Eval::evaluate(pos) : -Eval::evaluate(pos);
-		dj_dw = calc_grad(deep_value, shallow_value, ps);
-		Eval::add_grad(pos, rootColor, dj_dw , without_kpp);
+            // When adding the gradient to the root phase
+            shallow_value = (rootColor == pos.side_to_move()) ? Eval::evaluate(pos) : -Eval::evaluate(pos);
+            dj_dw = calc_grad(deep_value, shallow_value, ps);
+            Eval::add_grad(pos, rootColor, dj_dw, without_kpp);
 #endif
 
-	}
+        }
 
-}
+    }
 
-// Write evaluation function file.
-bool LearnerThink::save(bool is_final)
-{
-	// Each time you save, change the extension part of the file name like "0","1","2",..
-	// (Because I want to compare the winning rate for each evaluation function parameter later)
+    // Write evaluation function file.
+    bool LearnerThink::save(bool is_final)
+    {
+        // Each time you save, change the extension part of the file name like "0","1","2",..
+        // (Because I want to compare the winning rate for each evaluation function parameter later)
 
-	if (save_only_once)
-	{
-		// When EVAL_SAVE_ONLY_ONCE is defined,
-		// Do not dig a subfolder because I want to save it only once.
-		Eval::save_eval("");
-	}
-	else if (is_final) {
-		Eval::save_eval("final");
-		return true;
-	}
-	else {
-		static int dir_number = 0;
-		const std::string dir_name = std::to_string(dir_number++);
-		Eval::save_eval(dir_name);
+        if (save_only_once)
+        {
+            // When EVAL_SAVE_ONLY_ONCE is defined,
+            // Do not dig a subfolder because I want to save it only once.
+            Eval::save_eval("");
+        }
+        else if (is_final) {
+            Eval::save_eval("final");
+            return true;
+        }
+        else {
+            static int dir_number = 0;
+            const std::string dir_name = std::to_string(dir_number++);
+            Eval::save_eval(dir_name);
 #if defined(EVAL_NNUE)
-		if (newbob_decay != 1.0 && latest_loss_count > 0) {
-			static int trials = newbob_num_trials;
-			const double latest_loss = latest_loss_sum / latest_loss_count;
-			latest_loss_sum = 0.0;
-			latest_loss_count = 0;
-			cout << "loss: " << latest_loss;
-			if (latest_loss < best_loss) {
-				cout << " < best (" << best_loss << "), accepted" << endl;
-				best_loss = latest_loss;
-				best_nn_directory = Path::Combine((std::string)Options["EvalSaveDir"], dir_name);
-				trials = newbob_num_trials;
-			} else {
-				cout << " >= best (" << best_loss << "), rejected" << endl;
-				if (best_nn_directory.empty()) {
-					cout << "WARNING: no improvement from initial model" << endl;
-				} else {
-					cout << "restoring parameters from " << best_nn_directory << endl;
-					Eval::NNUE::RestoreParameters(best_nn_directory);
-				}
-				if (--trials > 0 && !is_final) {
-					cout << "reducing learning rate scale from " << newbob_scale
-					     << " to " << (newbob_scale * newbob_decay)
-					     << " (" << trials << " more trials)" << endl;
-					newbob_scale *= newbob_decay;
-					Eval::NNUE::SetGlobalLearningRateScale(newbob_scale);
-				}
-			}
-			if (trials == 0) {
-				cout << "converged" << endl;
-				return true;
-			}
-		}
+            if (newbob_decay != 1.0 && latest_loss_count > 0) {
+                static int trials = newbob_num_trials;
+                const double latest_loss = latest_loss_sum / latest_loss_count;
+                latest_loss_sum = 0.0;
+                latest_loss_count = 0;
+                cout << "loss: " << latest_loss;
+                if (latest_loss < best_loss) {
+                    cout << " < best (" << best_loss << "), accepted" << endl;
+                    best_loss = latest_loss;
+                    best_nn_directory = Path::Combine((std::string)Options["EvalSaveDir"], dir_name);
+                    trials = newbob_num_trials;
+                }
+                else {
+                    cout << " >= best (" << best_loss << "), rejected" << endl;
+                    if (best_nn_directory.empty()) {
+                        cout << "WARNING: no improvement from initial model" << endl;
+                    }
+                    else {
+                        cout << "restoring parameters from " << best_nn_directory << endl;
+                        Eval::NNUE::RestoreParameters(best_nn_directory);
+                    }
+                    if (--trials > 0 && !is_final) {
+                        cout << "reducing learning rate scale from " << newbob_scale
+                            << " to " << (newbob_scale * newbob_decay)
+                            << " (" << trials << " more trials)" << endl;
+                        newbob_scale *= newbob_decay;
+                        Eval::NNUE::SetGlobalLearningRateScale(newbob_scale);
+                    }
+                }
+                if (trials == 0) {
+                    cout << "converged" << endl;
+                    return true;
+                }
+            }
 #endif
-	}
-	return false;
-}
-
-// Shuffle_files(), shuffle_files_quick() subcontracting, writing part.
-// output_file_name: Name of the file to write
-// prng: random number
-// afs: fstream of each teacher phase file
-// a_count: The number of teacher positions inherent in each file.
-void shuffle_write(const string& output_file_name , PRNG& prng , vector<fstream>& afs , vector<uint64_t>& a_count)
-{
-	uint64_t total_sfen_count = 0;
-	for (auto c : a_count)
-		total_sfen_count += c;
-
-	// number of exported phases
-	uint64_t write_sfen_count = 0;
-
-	// Output the progress on the screen for each phase.
-	const uint64_t buffer_size = 10000000;
-
-	auto print_status = [&]()
-	{
-		// Output progress every 10M phase or when all writing is completed
-		if (((write_sfen_count % buffer_size) == 0) ||
-			(write_sfen_count == total_sfen_count))
-			cout << write_sfen_count << " / " << total_sfen_count << endl;
-	};
-
-
-	cout << endl <<  "write : " << output_file_name << endl;
-
-	fstream fs(output_file_name, ios::out | ios::binary);
-
-	// total teacher positions
-	uint64_t sum = 0;
-	for (auto c : a_count)
-		sum += c;
-
-	while (sum != 0)
-	{
-		auto r = prng.rand(sum);
-
-		// Aspects stored in fs[0] file ... Aspects stored in fs[1] file ...
-		//Think of it as a series like, and determine in which file r is pointing.
-		// The contents of the file are shuffled, so you can take the next element from that file.
-		// Each file has a_count[x] phases, so this process can be written as follows.
-
-		uint64_t n = 0;
-		while (a_count[n] <= r)
-			r -= a_count[n++];
-
-		// This confirms n. Before you forget it, reduce the remaining number.
-
-		--a_count[n];
-		--sum;
-
-		PackedSfenValue psv;
-		// It's better to read and write all at once until the performance is not so good...
-		if (afs[n].read((char*)&psv, sizeof(PackedSfenValue)))
-		{
-			fs.write((char*)&psv, sizeof(PackedSfenValue));
-			++write_sfen_count;
-			print_status();
-		}
-	}
-	print_status();
-	fs.close();
-	cout << "done!" << endl;
-}
-
-// Subcontracting the teacher shuffle "learn shuffle" command.
-// output_file_name: name of the output file where the shuffled teacher positions will be written
-void shuffle_files(const vector<string>& filenames , const string& output_file_name , uint64_t buffer_size )
-{
-	// The destination folder is
-	// tmp/ for temporary writing
-
-	// Temporary file is written to tmp/ folder for each buffer_size phase.
-	// For example, if buffer_size = 20M, you need a buffer of 20M*40bytes = 800MB.
-	// In a PC with a small memory, it would be better to reduce this.
-	// However, if the number of files increases too much, it will not be possible to open at the same time due to OS restrictions.
-	// There should have been a limit of 512 per process on Windows, so you can open here as 500,
-	// The current setting is 500 files x 20M = 10G = 10 billion phases.
-
-	PSVector buf;
-	buf.resize(buffer_size);
-	// ↑ buffer, a marker that indicates how much you have used
-	uint64_t buf_write_marker = 0;
-
-	// File name to write (incremental counter because it is a serial number)
-	uint64_t write_file_count = 0;
-
-	// random number to shuffle
-	// Do not use std::random_device().  Because it always the same integers on MinGW.
-	PRNG prng(std::chrono::system_clock::now().time_since_epoch().count());
-
-	// generate the name of the temporary file
-	auto make_filename = [](uint64_t i)
-	{
-		return "tmp/" + to_string(i) + ".bin";
-	};
-
-	// Exported files in tmp/ folder, number of teacher positions stored in each
-	vector<uint64_t> a_count;
-
-	auto write_buffer = [&](uint64_t size)
-	{
-		// shuffle from buf[0] to buf[size-1]
-		for (uint64_t i = 0; i < size; ++i)
-			swap(buf[i], buf[(uint64_t)(prng.rand(size - i) + i)]);
-
-		// write to a file
-		fstream fs;
-		fs.open(make_filename(write_file_count++), ios::out | ios::binary);
-		fs.write((char*)&buf[0], size * sizeof(PackedSfenValue));
-		fs.close();
-		a_count.push_back(size);
-
-		buf_write_marker = 0;
-		cout << ".";
-	};
-
-	Dependency::mkdir("tmp");
-
-	// Shuffle and export as a 10M phase shredded file.
-	for (auto filename : filenames)
-	{
-		fstream fs(filename, ios::in | ios::binary);
-		cout << endl << "open file = " << filename;
-		while (fs.read((char*)&buf[buf_write_marker], sizeof(PackedSfenValue)))
-			if (++buf_write_marker == buffer_size)
-				write_buffer(buffer_size);
-
-		// Read in units of sizeof(PackedSfenValue),
-		// Ignore the last remaining fraction. (Fails in fs.read, so exit while)
-		// (The remaining fraction seems to be half-finished data that was created because it was stopped halfway during teacher generation.)
-
-	}
-
-	if (buf_write_marker != 0)
-		write_buffer(buf_write_marker);
-
-	// Only shuffled files have been written write_file_count.
-	// As a second pass, if you open all of them at the same time, select one at random and load one phase at a time
-	// Now you have shuffled.
-
-	// Original file for shirt full + tmp file + file to write requires 3 times the storage capacity of the original file.
-	// 1 billion SSD is not enough for shuffling because it is 400GB for 10 billion phases.
-	// If you want to delete (or delete by hand) the original file at this point after writing to tmp,
-	// The storage capacity is about twice that of the original file.
-	// So, maybe we should have an option to delete the original file.
-
-	// Files are opened at the same time. It is highly possible that this will exceed FOPEN_MAX.
-	// In that case, rather than adjusting buffer_size to reduce the number of files.
-
-	vector<fstream> afs;
-	for (uint64_t i = 0; i < write_file_count; ++i)
-		afs.emplace_back(fstream(make_filename(i),ios::in | ios::binary));
-
-	// Throw to the subcontract function and end.
-	shuffle_write(output_file_name, prng, afs, a_count);
-}
-
-// Subcontracting the teacher shuffle "learn shuffleq" command.
-// This is written in 1 pass.
-// output_file_name: name of the output file where the shuffled teacher positions will be written
-void shuffle_files_quick(const vector<string>& filenames, const string& output_file_name)
-{
-	// number of phases read
-	uint64_t read_sfen_count = 0;
-
-	// random number to shuffle
-	// Do not use std::random_device().  Because it always the same integers on MinGW.
-	PRNG prng(std::chrono::system_clock::now().time_since_epoch().count());
-
-	// number of files
-	size_t file_count = filenames.size();
-
-	// Number of teacher positions stored in each file in filenames
-	vector<uint64_t> a_count(file_count);
-
-	// Count the number of teacher aspects in each file.
-	vector<fstream> afs(file_count);
-
-	for (size_t i = 0; i <file_count ;++i)
-	{
-		auto filename = filenames[i];
-		auto& fs = afs[i];
-
-		fs.open(filename, ios::in | ios::binary);
-		fs.seekg(0, fstream::end);
-		uint64_t eofPos = (uint64_t)fs.tellg();
-		fs.clear(); // Otherwise, the next seek may fail.
-		fs.seekg(0, fstream::beg);
-		uint64_t begPos = (uint64_t)fs.tellg();
-		uint64_t file_size = eofPos - begPos;
-		uint64_t sfen_count = file_size / sizeof(PackedSfenValue);
-		a_count[i] = sfen_count;
-
-		// Output the number of sfen stored in each file.
-		cout << filename << " = " << sfen_count << " sfens." << endl;
-	}
-
-	// Since we know the file size of each file,
-	// open them all at once (already open),
-	// Select one at a time and load one phase at a time
-	// Now you have shuffled.
-
-	// Throw to the subcontract function and end.
-	shuffle_write(output_file_name, prng, afs, a_count);
-}
-
-// Subcontracting the teacher shuffle "learn shufflem" command.
-// Read the whole memory and write it out with the specified file name.
-void shuffle_files_on_memory(const vector<string>& filenames,const string output_file_name)
-{
-	PSVector buf;
-
-	for (auto filename : filenames)
-	{
-		std::cout << "read : " << filename << std::endl;
-		read_file_to_memory(filename, [&buf](uint64_t size) {
-			assert((size % sizeof(PackedSfenValue)) == 0);
-			// Expand the buffer and read after the last end.
-			uint64_t last = buf.size();
-			buf.resize(last + size / sizeof(PackedSfenValue));
-			return (void*)&buf[last];
-		});
-	}
-
-	// shuffle from buf[0] to buf[size-1]
-	// Do not use std::random_device().  Because it always the same integers on MinGW.
-	PRNG prng(std::chrono::system_clock::now().time_since_epoch().count());
-	uint64_t size = (uint64_t)buf.size();
-	std::cout << "shuffle buf.size() = " << size << std::endl;
-	for (uint64_t i = 0; i < size; ++i)
-		swap(buf[i], buf[(uint64_t)(prng.rand(size - i) + i)]);
-
-	std::cout << "write : " << output_file_name << endl;
-
-	// If the file to be written exceeds 2GB, it cannot be written in one shot with fstream::write, so use wrapper.
-	write_memory_to_file(output_file_name, (void*)&buf[0], (uint64_t)sizeof(PackedSfenValue)*(uint64_t)buf.size());
-
-	std::cout << "..shuffle_on_memory done." << std::endl;
-}
-
-bool fen_is_ok(Position& pos, std::string input_fen) {
-	std::string pos_fen = pos.fen();
-	std::istringstream ss_input(input_fen);
-	std::istringstream ss_pos(pos_fen);
-
-	// example : "2r4r/4kpp1/nb1np3/p2p3p/B2P1BP1/PP6/4NPKP/2R1R3 w - h6 0 24"
-	//       --> "2r4r/4kpp1/nb1np3/p2p3p/B2P1BP1/PP6/4NPKP/2R1R3"
-	std::string str_input, str_pos;
-	ss_input >> str_input;
-	ss_pos >> str_pos;
-
-	// Only compare "Piece placement field" between input_fen and pos.fen().
-	return str_input == str_pos;
-}
-
-void convert_bin(const vector<string>& filenames, const string& output_file_name, const int ply_minimum, const int ply_maximum, const int interpolate_eval, const bool check_invalid_fen, const bool check_illegal_move)
-{
-	std::cout << "check_invalid_fen=" << check_invalid_fen << std::endl;
-	std::cout << "check_illegal_move=" << check_illegal_move << std::endl;
-
-	std::fstream fs;
-	uint64_t data_size=0;
-	uint64_t filtered_size = 0;
-	uint64_t filtered_size_fen = 0;
-	uint64_t filtered_size_move = 0;
-	uint64_t filtered_size_ply = 0;
-	auto th = Threads.main();
-	auto &tpos = th->rootPos;
-	// convert plain rag to packed sfenvalue for Yaneura king
-	fs.open(output_file_name, ios::app | ios::binary);
-	StateListPtr states;
-	for (auto filename : filenames) {
-		std::cout << "convert " << filename << " ... ";
-		std::string line;
-		ifstream ifs;
-		ifs.open(filename);
-		PackedSfenValue p;
-		data_size = 0;
-		filtered_size = 0;
-		filtered_size_fen = 0;
-		filtered_size_move = 0;
-		filtered_size_ply = 0;
-		p.gamePly = 1; // Not included in apery format. Should be initialized
-		bool ignore_flag_fen = false;
-		bool ignore_flag_move = false;
-		bool ignore_flag_ply = false;
-		while (std::getline(ifs, line)) {
-			std::stringstream ss(line);
-			std::string token;
-			std::string value;
-			ss >> token;
-			if (token == "fen") {
-				states = StateListPtr(new std::deque<StateInfo>(1)); // Drop old and create a new one
-				std::string input_fen = line.substr(4);
-				tpos.set(input_fen, false, &states->back(), Threads.main());
-				if (check_invalid_fen && !fen_is_ok(tpos, input_fen)) {
-					ignore_flag_fen = true;
-					filtered_size_fen++;
-				}
-				else {
-					tpos.sfen_pack(p.sfen);
-				}
-			}
-			else if (token == "move") {
-				ss >> value;
-				Move move = UCI::to_move(tpos, value);
-				if (check_illegal_move && move == MOVE_NONE) {
-					ignore_flag_move = true;
-					filtered_size_move++;
-				}
-				else {
-					p.move = move;
-				}
-			}
-			else if (token == "score") {
-				double score;
-				ss >> score;
-				// Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
-				// Normalize to [0.0, 1.0].
-				score = (score - src_score_min_value) / (src_score_max_value - src_score_min_value);
-				// Scale to [dest_score_min_value, dest_score_max_value].
-				score = score * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
-				p.score = Math::clamp((int32_t)std::round(score) , -(int32_t)VALUE_MATE , (int32_t)VALUE_MATE);
-			}
-			else if (token == "ply") {
-				int temp;
-				ss >> temp;
-				if(temp < ply_minimum || temp > ply_maximum){
-					ignore_flag_ply = true;
-					filtered_size_ply++;
-				}
-				p.gamePly = uint16_t(temp); // No cast here?
-				if (interpolate_eval != 0){
-					p.score = min(3000, interpolate_eval * temp);
-				}
-			}
-			else if (token == "result") {
-				int temp;
-				ss >> temp;
-				p.game_result = int8_t(temp); // Do you need a cast here?
-				if (interpolate_eval){
-					p.score = p.score * p.game_result;
-				}
-			}
-			else if (token == "e") {
-				if (!(ignore_flag_fen || ignore_flag_move || ignore_flag_ply)) {
-					fs.write((char*)&p, sizeof(PackedSfenValue));
-					data_size+=1;
-					// debug
-					// std::cout<<tpos<<std::endl;
-					// std::cout<<p.score<<","<<int(p.gamePly)<<","<<int(p.game_result)<<std::endl;
-				}
-				else {
-					filtered_size++;
-				}
-				ignore_flag_fen = false;
-				ignore_flag_move = false;
-				ignore_flag_ply = false;
-			}
-		}
-		std::cout << "done " << data_size << " parsed " << filtered_size << " is filtered"
-				  << " (invalid fen:" << filtered_size_fen << ", illegal move:" << filtered_size_move << ", invalid ply:" << filtered_size_ply << ")" << std::endl;
-		ifs.close();
-	}
-	std::cout << "all done" << std::endl;
-	fs.close();
-}
-
-static inline void ltrim(std::string &s) {
-	s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](int ch) {
-		return !std::isspace(ch);
-	}));
-}
-
-static inline void rtrim(std::string &s) {
-	s.erase(std::find_if(s.rbegin(), s.rend(), [](int ch) {
-		return !std::isspace(ch);
-	}).base(), s.end());
-}
-
-static inline void trim(std::string &s) {
-	ltrim(s);
-	rtrim(s);
-}
-
-int parse_game_result_from_pgn_extract(std::string result) {
-	// White Win
-	if (result == "\"1-0\"") {
-		return 1;
-	}
-	// Black Win
-	else if (result == "\"0-1\"") {
-		return -1;
-	}
-	// Draw
-	else {
-		return 0;
-	}
-}
-
-// 0.25 -->  0.25 * PawnValueEg
-// #-4  --> -mate_in(4)
-// #3   -->  mate_in(3)
-// -M4  --> -mate_in(4)
-// +M3  -->  mate_in(3)
-Value parse_score_from_pgn_extract(std::string eval, bool& success) {
-	success = true;
-
-	if (eval.substr(0, 1) == "#") {
-		if (eval.substr(1, 1) == "-") {
-			return -mate_in(stoi(eval.substr(2, eval.length() - 2)));
-		}
-		else {
-			return mate_in(stoi(eval.substr(1, eval.length() - 1)));
-		}
-	}
-	else if (eval.substr(0, 2) == "-M") {
-		//std::cout << "eval=" << eval << std::endl;
-		return -mate_in(stoi(eval.substr(2, eval.length() - 2)));
-	}
-	else if (eval.substr(0, 2) == "+M") {
-		//std::cout << "eval=" << eval << std::endl;
-		return mate_in(stoi(eval.substr(2, eval.length() - 2)));
-	}
-	else {
-		char *endptr;
-		double value = strtod(eval.c_str(), &endptr);
-
-		if (*endptr != '\0') {
-			success = false;
-			return VALUE_ZERO;
-		}
-		else {
-			return Value(value * static_cast<double>(PawnValueEg));
-		}
-	}
-}
-
-// for Debug
-//#define DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT
-
-bool is_like_fen(std::string fen) {
-	int count_space = std::count(fen.cbegin(), fen.cend(), ' ');
-	int count_slash = std::count(fen.cbegin(), fen.cend(), '/');
-
-#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
-	//std::cout << "count_space=" << count_space << std::endl;
-	//std::cout << "count_slash=" << count_slash << std::endl;
-#endif
-
-	return count_space == 5 && count_slash == 7;
-}
-
-void convert_bin_from_pgn_extract(const vector<string>& filenames, const string& output_file_name, const bool pgn_eval_side_to_move, const bool convert_no_eval_fens_as_score_zero)
-{
-	std::cout << "pgn_eval_side_to_move=" << pgn_eval_side_to_move << std::endl;
-	std::cout << "convert_no_eval_fens_as_score_zero=" << convert_no_eval_fens_as_score_zero << std::endl;
-
-	auto th = Threads.main();
-	auto &pos = th->rootPos;
-
-	std::fstream ofs;
-	ofs.open(output_file_name, ios::out | ios::binary);
-
-	int game_count = 0;
-	int fen_count = 0;
-
-	for (auto filename : filenames) {
-		std::cout << now_string() << " convert " << filename << std::endl;
-		ifstream ifs;
-		ifs.open(filename);
-
-		int game_result = 0;
-
-		std::string line;
-		while (std::getline(ifs, line)) {
-
-			if (line.empty()) {
-				continue;
-			}
-
-			else if (line.substr(0, 1) == "[") {
-				std::regex pattern_result(R"(\[Result (.+?)\])");
-				std::smatch match;
-
-				// example: [Result "1-0"]
-				if (std::regex_search(line, match, pattern_result)) {
-					game_result = parse_game_result_from_pgn_extract(match.str(1));
-#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
-					std::cout << "game_result=" << game_result << std::endl;
-#endif
-					game_count++;
-					if (game_count % 10000 == 0) {
-						std::cout << now_string() << " game_count=" << game_count << ", fen_count=" << fen_count << std::endl;
-					}
-				}
-
-				continue;
-			}
-
-			else {
-				int gamePly = 1;
-				auto itr = line.cbegin();
-
-				while (true) {
-					gamePly++;
-
-					PackedSfenValue psv;
-					memset((char*)&psv, 0, sizeof(PackedSfenValue));
-
-					// fen
-					{
-						bool fen_found = false;
-
-						while (!fen_found) {
-							std::regex pattern_bracket(R"(\{(.+?)\})");
-							std::smatch match;
-							if (!std::regex_search(itr, line.cend(), match, pattern_bracket)) {
-								break;
-							}
-
-							itr += match.position(0) + match.length(0) - 1;
-							std::string str_fen = match.str(1);
-							trim(str_fen);
-
-							if (is_like_fen(str_fen)) {
-								fen_found = true;
-
-								StateInfo si;
-								pos.set(str_fen, false, &si, th);
-								pos.sfen_pack(psv.sfen);
-							}
-
-#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
-							std::cout << "str_fen=" << str_fen << std::endl;
-							std::cout << "fen_found=" << fen_found << std::endl;
-#endif
-						}
-
-						if (!fen_found) {
-							break;
-						}
-					}
-
-					// move
-					{
-						std::regex pattern_move(R"(\}(.+?)\{)");
-						std::smatch match;
-						if (!std::regex_search(itr, line.cend(), match, pattern_move)) {
-							break;
-						}
-
-						itr += match.position(0) + match.length(0) - 1;
-						std::string str_move = match.str(1);
-						trim(str_move);
-#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
-						std::cout << "str_move=" << str_move << std::endl;
-#endif
-						psv.move = UCI::to_move(pos, str_move);
-					}
-
-					// eval
-					bool eval_found = false;
-					{
-						std::regex pattern_bracket(R"(\{(.+?)\})");
-						std::smatch match;
-						if (!std::regex_search(itr, line.cend(), match, pattern_bracket)) {
-							break;
-						}
-
-						std::string str_eval_clk = match.str(1);
-						trim(str_eval_clk);
-#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
-						std::cout << "str_eval_clk=" << str_eval_clk << std::endl;
-#endif
-
-						// example: { [%eval 0.25] [%clk 0:10:00] }
-						// example: { [%eval #-4] [%clk 0:10:00] }
-						// example: { [%eval #3] [%clk 0:10:00] }
-						// example: { +0.71/22 1.2s }
-						// example: { -M4/7 0.003s }
-						// example: { M3/245 0.017s }
-						// example: { +M1/245 0.010s, White mates }
-						// example: { 0.60 }
-						// example: { book }
-						// example: { rnbqkb1r/pp3ppp/2p1pn2/3p4/2PP4/2N2N2/PP2PPPP/R1BQKB1R w KQkq - 0 5 }
-
-						// Considering the absence of eval
-						if (!is_like_fen(str_eval_clk)) {
-							itr += match.position(0) + match.length(0) - 1;
-
-							if (str_eval_clk != "book") {
-								std::regex pattern_eval1(R"(\[\%eval (.+?)\])");
-								std::regex pattern_eval2(R"((.+?)\/)");
-
-								std::string str_eval;
-								if (std::regex_search(str_eval_clk, match, pattern_eval1) ||
-									std::regex_search(str_eval_clk, match, pattern_eval2)) {
-									str_eval = match.str(1);
-									trim(str_eval);
-								}
-								else {
-									str_eval = str_eval_clk;
-								}
-
-								bool success = false;
-								Value value = parse_score_from_pgn_extract(str_eval, success);
-								if (success) {
-									eval_found = true;
-									psv.score = Math::clamp(value, -VALUE_MATE , VALUE_MATE);
-								}
-
-#if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
-								std::cout << "str_eval=" << str_eval << std::endl;
-								std::cout << "success=" << success << ", psv.score=" << psv.score << std::endl;
-#endif
-							}
-						}
-					}
-
-					// write
-					if (eval_found || convert_no_eval_fens_as_score_zero) {
-						if (!eval_found && convert_no_eval_fens_as_score_zero) {
-							psv.score = 0;
-						}
-
-						psv.gamePly = gamePly;
-						psv.game_result = game_result;
-
-						if (pos.side_to_move() == BLACK) {
-							if (!pgn_eval_side_to_move) {
-								psv.score *= -1;
-							}
-							psv.game_result *= -1;
-						}
-
-						ofs.write((char*)&psv, sizeof(PackedSfenValue));
-
-						fen_count++;
-					}
-				}
-
-				game_result = 0;
-			}
-		}
-	}
-
-	std::cout << now_string() << " game_count=" << game_count << ", fen_count=" << fen_count << std::endl;
-	std::cout << now_string() << " all done" << std::endl;
-	ofs.close();
-}
-
-void convert_plain(const vector<string>& filenames, const string& output_file_name)
-{
-	Position tpos;
-	std::ofstream ofs;
-	ofs.open(output_file_name, ios::app);
-	auto th = Threads.main();
-	for (auto filename : filenames) {
-		std::cout << "convert " << filename << " ... ";
-
-		// Just convert packedsfenvalue to text
-		std::fstream fs;
-		fs.open(filename, ios::in | ios::binary);
-		PackedSfenValue p;
-		while (true)
-		{
-			if (fs.read((char*)&p, sizeof(PackedSfenValue))) {
-				StateInfo si;
-				tpos.set_from_packed_sfen(p.sfen, &si, th, false);
-
-				// write as plain text
-				ofs << "fen " << tpos.fen() << std::endl;
-				ofs << "move " << UCI::move(Move(p.move), false) << std::endl;
-				ofs << "score " << p.score << std::endl;
-				ofs << "ply " << int(p.gamePly) << std::endl;
-				ofs << "result " << int(p.game_result) << std::endl;
-				ofs << "e" << std::endl;
-			}
-			else {
-				break;
-			}
-		}
-		fs.close();
-		std::cout << "done" << std::endl;
-	}
-	ofs.close();
-	std::cout << "all done" << std::endl;
-}
-
-// Learning from the generated game record
-void learn(Position&, istringstream& is)
-{
-	auto thread_num = (int)Options["Threads"];
-	SfenReader sr(thread_num);
-
-	LearnerThink learn_think(sr);
-	vector<string> filenames;
-
-	// mini_batch_size 1M aspect by default. This can be increased.
-	auto mini_batch_size = LEARN_MINI_BATCH_SIZE;
-
-	// Number of loops (read the game record file this number of times)
-	int loop = 1;
-
-	// Game file storage folder (get game file with relative path from here)
-	string base_dir;
-
-	string target_dir;
-
-	// If 0, it will be the default value.
-	double eta1 = 0.0;
-	double eta2 = 0.0;
-	double eta3 = 0.0;
-	uint64_t eta1_epoch = 0; // eta2 is not applied by default
-	uint64_t eta2_epoch = 0; // eta3 is not applied by default
+        }
+        return false;
+    }
+
+    // Shuffle_files(), shuffle_files_quick() subcontracting, writing part.
+    // output_file_name: Name of the file to write
+    // prng: random number
+    // afs: fstream of each teacher phase file
+    // a_count: The number of teacher positions inherent in each file.
+    void shuffle_write(const string& output_file_name, PRNG& prng, vector<fstream>& afs, vector<uint64_t>& a_count)
+    {
+        uint64_t total_sfen_count = 0;
+        for (auto c : a_count)
+            total_sfen_count += c;
+
+        // number of exported phases
+        uint64_t write_sfen_count = 0;
+
+        // Output the progress on the screen for each phase.
+        const uint64_t buffer_size = 10000000;
+
+        auto print_status = [&]()
+        {
+            // Output progress every 10M phase or when all writing is completed
+            if (((write_sfen_count % buffer_size) == 0) ||
+                (write_sfen_count == total_sfen_count))
+                cout << write_sfen_count << " / " << total_sfen_count << endl;
+        };
+
+
+        cout << endl << "write : " << output_file_name << endl;
+
+        fstream fs(output_file_name, ios::out | ios::binary);
+
+        // total teacher positions
+        uint64_t sum = 0;
+        for (auto c : a_count)
+            sum += c;
+
+        while (sum != 0)
+        {
+            auto r = prng.rand(sum);
+
+            // Aspects stored in fs[0] file ... Aspects stored in fs[1] file ...
+            //Think of it as a series like, and determine in which file r is pointing.
+            // The contents of the file are shuffled, so you can take the next element from that file.
+            // Each file has a_count[x] phases, so this process can be written as follows.
+
+            uint64_t n = 0;
+            while (a_count[n] <= r)
+                r -= a_count[n++];
+
+            // This confirms n. Before you forget it, reduce the remaining number.
+
+            --a_count[n];
+            --sum;
+
+            PackedSfenValue psv;
+            // It's better to read and write all at once until the performance is not so good...
+            if (afs[n].read((char*)&psv, sizeof(PackedSfenValue)))
+            {
+                fs.write((char*)&psv, sizeof(PackedSfenValue));
+                ++write_sfen_count;
+                print_status();
+            }
+        }
+        print_status();
+        fs.close();
+        cout << "done!" << endl;
+    }
+
+    // Subcontracting the teacher shuffle "learn shuffle" command.
+    // output_file_name: name of the output file where the shuffled teacher positions will be written
+    void shuffle_files(const vector<string>& filenames, const string& output_file_name, uint64_t buffer_size)
+    {
+        // The destination folder is
+        // tmp/ for temporary writing
+
+        // Temporary file is written to tmp/ folder for each buffer_size phase.
+        // For example, if buffer_size = 20M, you need a buffer of 20M*40bytes = 800MB.
+        // In a PC with a small memory, it would be better to reduce this.
+        // However, if the number of files increases too much, it will not be possible to open at the same time due to OS restrictions.
+        // There should have been a limit of 512 per process on Windows, so you can open here as 500,
+        // The current setting is 500 files x 20M = 10G = 10 billion phases.
+
+        PSVector buf;
+        buf.resize(buffer_size);
+        // ↑ buffer, a marker that indicates how much you have used
+        uint64_t buf_write_marker = 0;
+
+        // File name to write (incremental counter because it is a serial number)
+        uint64_t write_file_count = 0;
+
+        // random number to shuffle
+        // Do not use std::random_device().  Because it always the same integers on MinGW.
+        PRNG prng(std::chrono::system_clock::now().time_since_epoch().count());
+
+        // generate the name of the temporary file
+        auto make_filename = [](uint64_t i)
+        {
+            return "tmp/" + to_string(i) + ".bin";
+        };
+
+        // Exported files in tmp/ folder, number of teacher positions stored in each
+        vector<uint64_t> a_count;
+
+        auto write_buffer = [&](uint64_t size)
+        {
+            // shuffle from buf[0] to buf[size-1]
+            for (uint64_t i = 0; i < size; ++i)
+                swap(buf[i], buf[(uint64_t)(prng.rand(size - i) + i)]);
+
+            // write to a file
+            fstream fs;
+            fs.open(make_filename(write_file_count++), ios::out | ios::binary);
+            fs.write((char*)&buf[0], size * sizeof(PackedSfenValue));
+            fs.close();
+            a_count.push_back(size);
+
+            buf_write_marker = 0;
+            cout << ".";
+        };
+
+        Dependency::mkdir("tmp");
+
+        // Shuffle and export as a 10M phase shredded file.
+        for (auto filename : filenames)
+        {
+            fstream fs(filename, ios::in | ios::binary);
+            cout << endl << "open file = " << filename;
+            while (fs.read((char*)&buf[buf_write_marker], sizeof(PackedSfenValue)))
+                if (++buf_write_marker == buffer_size)
+                    write_buffer(buffer_size);
+
+            // Read in units of sizeof(PackedSfenValue),
+            // Ignore the last remaining fraction. (Fails in fs.read, so exit while)
+            // (The remaining fraction seems to be half-finished data that was created because it was stopped halfway during teacher generation.)
+
+        }
+
+        if (buf_write_marker != 0)
+            write_buffer(buf_write_marker);
+
+        // Only shuffled files have been written write_file_count.
+        // As a second pass, if you open all of them at the same time, select one at random and load one phase at a time
+        // Now you have shuffled.
+
+        // Original file for shirt full + tmp file + file to write requires 3 times the storage capacity of the original file.
+        // 1 billion SSD is not enough for shuffling because it is 400GB for 10 billion phases.
+        // If you want to delete (or delete by hand) the original file at this point after writing to tmp,
+        // The storage capacity is about twice that of the original file.
+        // So, maybe we should have an option to delete the original file.
+
+        // Files are opened at the same time. It is highly possible that this will exceed FOPEN_MAX.
+        // In that case, rather than adjusting buffer_size to reduce the number of files.
+
+        vector<fstream> afs;
+        for (uint64_t i = 0; i < write_file_count; ++i)
+            afs.emplace_back(fstream(make_filename(i), ios::in | ios::binary));
+
+        // Throw to the subcontract function and end.
+        shuffle_write(output_file_name, prng, afs, a_count);
+    }
+
+    // Subcontracting the teacher shuffle "learn shuffleq" command.
+    // This is written in 1 pass.
+    // output_file_name: name of the output file where the shuffled teacher positions will be written
+    void shuffle_files_quick(const vector<string>& filenames, const string& output_file_name)
+    {
+        // random number to shuffle
+        // Do not use std::random_device().  Because it always the same integers on MinGW.
+        PRNG prng(std::chrono::system_clock::now().time_since_epoch().count());
+
+        // number of files
+        size_t file_count = filenames.size();
+
+        // Number of teacher positions stored in each file in filenames
+        vector<uint64_t> a_count(file_count);
+
+        // Count the number of teacher aspects in each file.
+        vector<fstream> afs(file_count);
+
+        for (size_t i = 0; i < file_count; ++i)
+        {
+            auto filename = filenames[i];
+            auto& fs = afs[i];
+
+            fs.open(filename, ios::in | ios::binary);
+            fs.seekg(0, fstream::end);
+            uint64_t eofPos = (uint64_t)fs.tellg();
+            fs.clear(); // Otherwise, the next seek may fail.
+            fs.seekg(0, fstream::beg);
+            uint64_t begPos = (uint64_t)fs.tellg();
+            uint64_t file_size = eofPos - begPos;
+            uint64_t sfen_count = file_size / sizeof(PackedSfenValue);
+            a_count[i] = sfen_count;
+
+            // Output the number of sfen stored in each file.
+            cout << filename << " = " << sfen_count << " sfens." << endl;
+        }
+
+        // Since we know the file size of each file,
+        // open them all at once (already open),
+        // Select one at a time and load one phase at a time
+        // Now you have shuffled.
+
+        // Throw to the subcontract function and end.
+        shuffle_write(output_file_name, prng, afs, a_count);
+    }
+
+    // Subcontracting the teacher shuffle "learn shufflem" command.
+    // Read the whole memory and write it out with the specified file name.
+    void shuffle_files_on_memory(const vector<string>& filenames, const string output_file_name)
+    {
+        PSVector buf;
+
+        for (auto filename : filenames)
+        {
+            std::cout << "read : " << filename << std::endl;
+            read_file_to_memory(filename, [&buf](uint64_t size) {
+                assert((size % sizeof(PackedSfenValue)) == 0);
+                // Expand the buffer and read after the last end.
+                uint64_t last = buf.size();
+                buf.resize(last + size / sizeof(PackedSfenValue));
+                return (void*)&buf[last];
+                });
+        }
+
+        // shuffle from buf[0] to buf[size-1]
+        // Do not use std::random_device().  Because it always the same integers on MinGW.
+        PRNG prng(std::chrono::system_clock::now().time_since_epoch().count());
+        uint64_t size = (uint64_t)buf.size();
+        std::cout << "shuffle buf.size() = " << size << std::endl;
+        for (uint64_t i = 0; i < size; ++i)
+            swap(buf[i], buf[(uint64_t)(prng.rand(size - i) + i)]);
+
+        std::cout << "write : " << output_file_name << endl;
+
+        // If the file to be written exceeds 2GB, it cannot be written in one shot with fstream::write, so use wrapper.
+        write_memory_to_file(output_file_name, (void*)&buf[0], (uint64_t)sizeof(PackedSfenValue) * (uint64_t)buf.size());
+
+        std::cout << "..shuffle_on_memory done." << std::endl;
+    }
+
+    // Learning from the generated game record
+    void learn(Position&, istringstream& is)
+    {
+        auto thread_num = (int)Options["Threads"];
+        SfenReader sr(thread_num);
+
+        LearnerThink learn_think(sr);
+        vector<string> filenames;
+
+        // mini_batch_size 1M aspect by default. This can be increased.
+        auto mini_batch_size = LEARN_MINI_BATCH_SIZE;
+
+        // Number of loops (read the game record file this number of times)
+        int loop = 1;
+
+        // Game file storage folder (get game file with relative path from here)
+        string base_dir;
+
+        string target_dir;
+
+        // If 0, it will be the default value.
+        double eta1 = 0.0;
+        double eta2 = 0.0;
+        double eta3 = 0.0;
+        uint64_t eta1_epoch = 0; // eta2 is not applied by default
+        uint64_t eta2_epoch = 0; // eta3 is not applied by default
 
 #if defined(USE_GLOBAL_OPTIONS)
-	// Save it for later restore.
-	auto oldGlobalOptions = GlobalOptions;
-	// If you hit the eval hash, you can not calculate rmse etc. so turn it off.
-	GlobalOptions.use_eval_hash = false;
-	// If you hit the replacement table, pruning may occur at the previous evaluation value, so turn it off.
-	GlobalOptions.use_hash_probe = false;
+    // Save it for later restore.
+        auto oldGlobalOptions = GlobalOptions;
+        // If you hit the eval hash, you can not calculate rmse etc. so turn it off.
+        GlobalOptions.use_eval_hash = false;
+        // If you hit the replacement table, pruning may occur at the previous evaluation value, so turn it off.
+        GlobalOptions.use_hash_probe = false;
 #endif
 
-	// --- Function that only shuffles the teacher aspect
+        // --- Function that only shuffles the teacher aspect
 
-	// normal shuffle
-	bool shuffle_normal = false;
-	uint64_t buffer_size = 20000000;
-	// fast shuffling assuming each file is shuffled
-	bool shuffle_quick = false;
-	// A function to read the entire file in memory and shuffle it. (Requires file size memory)
-	bool shuffle_on_memory = false;
-	// Conversion of packed sfen. In plain, it consists of sfen(string), evaluation value (integer), move (eg 7g7f, string), result (loss-1, win 1, draw 0)
-	bool use_convert_plain = false;
-	// convert plain format teacher to Yaneura King's bin
-	bool use_convert_bin = false;
-	int ply_minimum = 0;
-	int ply_maximum = 114514;
-	bool interpolate_eval = 0;
-	bool check_invalid_fen = false;
-	bool check_illegal_move = false;
-	// convert teacher in pgn-extract format to Yaneura King's bin
-	bool use_convert_bin_from_pgn_extract = false;
-	bool pgn_eval_side_to_move = false;
-	bool convert_no_eval_fens_as_score_zero = false;
-	// File name to write in those cases (default is "shuffled_sfen.bin")
-	string output_file_name = "shuffled_sfen.bin";
+        // normal shuffle
+        bool shuffle_normal = false;
+        uint64_t buffer_size = 20000000;
+        // fast shuffling assuming each file is shuffled
+        bool shuffle_quick = false;
+        // A function to read the entire file in memory and shuffle it. (Requires file size memory)
+        bool shuffle_on_memory = false;
+        // Conversion of packed sfen. In plain, it consists of sfen(string), evaluation value (integer), move (eg 7g7f, string), result (loss-1, win 1, draw 0)
+        bool use_convert_plain = false;
+        // convert plain format teacher to Yaneura King's bin
+        bool use_convert_bin = false;
+        int ply_minimum = 0;
+        int ply_maximum = 114514;
+        bool interpolate_eval = 0;
+        bool check_invalid_fen = false;
+        bool check_illegal_move = false;
+        // convert teacher in pgn-extract format to Yaneura King's bin
+        bool use_convert_bin_from_pgn_extract = false;
+        bool pgn_eval_side_to_move = false;
+        bool convert_no_eval_fens_as_score_zero = false;
+        // File name to write in those cases (default is "shuffled_sfen.bin")
+        string output_file_name = "shuffled_sfen.bin";
 
-	// If the absolute value of the evaluation value in the deep search of the teacher phase exceeds this value, that phase is discarded.
-	int eval_limit = 32000;
+        // If the absolute value of the evaluation value in the deep search of the teacher phase exceeds this value, that phase is discarded.
+        int eval_limit = 32000;
 
-	// Flag to save the evaluation function file only once near the end.
-	bool save_only_once = false;
+        // Flag to save the evaluation function file only once near the end.
+        bool save_only_once = false;
 
-	// Shuffle about what you are pre-reading on the teacher aspect. (Shuffle of about 10 million phases)
-	// Turn on if you want to pass a pre-shuffled file.
-	bool no_shuffle = false;
+        // Shuffle about what you are pre-reading on the teacher aspect. (Shuffle of about 10 million phases)
+        // Turn on if you want to pass a pre-shuffled file.
+        bool no_shuffle = false;
 
 #if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
-	// elmo lambda
-	ELMO_LAMBDA = 0.33;
-	ELMO_LAMBDA2 = 0.33;
-	ELMO_LAMBDA_LIMIT = 32000;
+        // elmo lambda
+        ELMO_LAMBDA = 0.33;
+        ELMO_LAMBDA2 = 0.33;
+        ELMO_LAMBDA_LIMIT = 32000;
 #endif
 
-	// Discount rate. If this is set to a value other than 0, the slope will be added even at other than the PV termination. (At that time, apply this discount rate)
-	double discount_rate = 0;
+        // Discount rate. If this is set to a value other than 0, the slope will be added even at other than the PV termination. (At that time, apply this discount rate)
+        double discount_rate = 0;
 
-	// if (gamePly <rand(reduction_gameply)) continue;
-	// An option to exclude the early stage from the learning target moderately like
-	// If set to 1, rand(1)==0, so nothing is excluded.
-	int reduction_gameply = 1;
+        // if (gamePly <rand(reduction_gameply)) continue;
+        // An option to exclude the early stage from the learning target moderately like
+        // If set to 1, rand(1)==0, so nothing is excluded.
+        int reduction_gameply = 1;
 
-	// Optional item that does not let you learn KK/KKP/KPP/KPPP
-	array<bool,4> freeze = {};
+        // Optional item that does not let you learn KK/KKP/KPP/KPPP
+        array<bool, 4> freeze = {};
 
 #if defined(EVAL_NNUE)
-	uint64_t nn_batch_size = 1000;
-	double newbob_decay = 1.0;
-	int newbob_num_trials = 2;
-	string nn_options;
+        uint64_t nn_batch_size = 1000;
+        double newbob_decay = 1.0;
+        int newbob_num_trials = 2;
+        string nn_options;
 #endif
 
-	uint64_t eval_save_interval = LEARN_EVAL_SAVE_INTERVAL;
-	uint64_t loss_output_interval = 0;
-	uint64_t mirror_percentage = 0;
+        uint64_t eval_save_interval = LEARN_EVAL_SAVE_INTERVAL;
+        uint64_t loss_output_interval = 0;
+        uint64_t mirror_percentage = 0;
 
-	string validation_set_file_name;
+        string validation_set_file_name;
 
-	// Assume the filenames are staggered.
-	while (true)
-	{
-		string option;
-		is >> option;
+        // Assume the filenames are staggered.
+        while (true)
+        {
+            string option;
+            is >> option;
 
-		if (option == "")
-			break;
+            if (option == "")
+                break;
 
-		// specify the number of phases of mini-batch
-		if (option == "bat")
-		{
-			is >> mini_batch_size;
-			mini_batch_size *= 10000; // Unit is ten thousand
-		}
+            // specify the number of phases of mini-batch
+            if (option == "bat")
+            {
+                is >> mini_batch_size;
+                mini_batch_size *= 10000; // Unit is ten thousand
+            }
 
-		// Specify the folder in which the game record is stored and make it the rooting target.
-		else if (option == "targetdir") is >> target_dir;
+            // Specify the folder in which the game record is stored and make it the rooting target.
+            else if (option == "targetdir") is >> target_dir;
 
-		// Specify the number of loops
-		else if (option == "loop")      is >> loop;
+            // Specify the number of loops
+            else if (option == "loop")      is >> loop;
 
-		// Game file storage folder (get game file with relative path from here)
-		else if (option == "basedir")   is >> base_dir;
+            // Game file storage folder (get game file with relative path from here)
+            else if (option == "basedir")   is >> base_dir;
 
-		// Mini batch size
-		else if (option == "batchsize") is >> mini_batch_size;
+            // Mini batch size
+            else if (option == "batchsize") is >> mini_batch_size;
 
-		// learning rate
-		else if (option == "eta")        is >> eta1;
-		else if (option == "eta1")       is >> eta1; // alias
-		else if (option == "eta2")       is >> eta2;
-		else if (option == "eta3")       is >> eta3;
-		else if (option == "eta1_epoch") is >> eta1_epoch;
-		else if (option == "eta2_epoch") is >> eta2_epoch;
-		// Accept also the old option name.
-		else if (option == "use_draw_in_training" || option == "use_draw_games_in_training") is >> use_draw_games_in_training;
-		// Accept also the old option name.
-		else if (option == "use_draw_in_validation" || option == "use_draw_games_in_validation") is >> use_draw_games_in_validation;
-		// Accept also the old option name.
-		else if (option == "use_hash_in_training" || option == "skip_duplicated_positions_in_training") is >> skip_duplicated_positions_in_training;
-		else if (option == "winning_probability_coefficient") is >> winning_probability_coefficient;
-		// Discount rate
-		else if (option == "discount_rate") is >> discount_rate;
-		// Using WDL with win rate model instead of sigmoid
-		else if (option == "use_wdl") is >> use_wdl;
+            // learning rate
+            else if (option == "eta")        is >> eta1;
+            else if (option == "eta1")       is >> eta1; // alias
+            else if (option == "eta2")       is >> eta2;
+            else if (option == "eta3")       is >> eta3;
+            else if (option == "eta1_epoch") is >> eta1_epoch;
+            else if (option == "eta2_epoch") is >> eta2_epoch;
+            // Accept also the old option name.
+            else if (option == "use_draw_in_training" || option == "use_draw_games_in_training") is >> use_draw_games_in_training;
+            // Accept also the old option name.
+            else if (option == "use_draw_in_validation" || option == "use_draw_games_in_validation") is >> use_draw_games_in_validation;
+            // Accept also the old option name.
+            else if (option == "use_hash_in_training" || option == "skip_duplicated_positions_in_training") is >> skip_duplicated_positions_in_training;
+            else if (option == "winning_probability_coefficient") is >> winning_probability_coefficient;
+            // Discount rate
+            else if (option == "discount_rate") is >> discount_rate;
+            // Using WDL with win rate model instead of sigmoid
+            else if (option == "use_wdl") is >> use_wdl;
 
-		// No learning of KK/KKP/KPP/KPPP.
-		else if (option == "freeze_kk")    is >> freeze[0];
-		else if (option == "freeze_kkp")   is >> freeze[1];
-		else if (option == "freeze_kpp")   is >> freeze[2];
+            // No learning of KK/KKP/KPP/KPPP.
+            else if (option == "freeze_kk")    is >> freeze[0];
+            else if (option == "freeze_kkp")   is >> freeze[1];
+            else if (option == "freeze_kpp")   is >> freeze[2];
 
 #if defined(EVAL_KPPT) || defined(EVAL_KPP_KKPT) || defined(EVAL_KPP_KKPT_FV_VAR) || defined(EVAL_NABLA)
 
 #elif defined(EVAL_KPPPT) || defined(EVAL_KPPP_KKPT) || defined(EVAL_HELICES)
-		else if (option == "freeze_kppp")  is >> freeze[3];
+            else if (option == "freeze_kppp")  is >> freeze[3];
 #elif defined(EVAL_KKPP_KKPT) || defined(EVAL_KKPPT)
-		else if (option == "freeze_kkpp")  is >> freeze[3];
+            else if (option == "freeze_kkpp")  is >> freeze[3];
 #endif
 
 #if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
-		// LAMBDA
-		else if (option == "lambda")       is >> ELMO_LAMBDA;
-		else if (option == "lambda2")      is >> ELMO_LAMBDA2;
-		else if (option == "lambda_limit") is >> ELMO_LAMBDA_LIMIT;
+            // LAMBDA
+            else if (option == "lambda")       is >> ELMO_LAMBDA;
+            else if (option == "lambda2")      is >> ELMO_LAMBDA2;
+            else if (option == "lambda_limit") is >> ELMO_LAMBDA_LIMIT;
 
 #endif
-		else if (option == "reduction_gameply") is >> reduction_gameply;
+            else if (option == "reduction_gameply") is >> reduction_gameply;
 
-		// shuffle related
-		else if (option == "shuffle")	shuffle_normal = true;
-		else if (option == "buffer_size") is >> buffer_size;
-		else if (option == "shuffleq")	shuffle_quick = true;
-		else if (option == "shufflem")	shuffle_on_memory = true;
-		else if (option == "output_file_name") is >> output_file_name;
+            // shuffle related
+            else if (option == "shuffle")   shuffle_normal = true;
+            else if (option == "buffer_size") is >> buffer_size;
+            else if (option == "shuffleq")  shuffle_quick = true;
+            else if (option == "shufflem")  shuffle_on_memory = true;
+            else if (option == "output_file_name") is >> output_file_name;
 
-		else if (option == "eval_limit") is >> eval_limit;
-		else if (option == "save_only_once") save_only_once = true;
-		else if (option == "no_shuffle") no_shuffle = true;
+            else if (option == "eval_limit") is >> eval_limit;
+            else if (option == "save_only_once") save_only_once = true;
+            else if (option == "no_shuffle") no_shuffle = true;
 
 #if defined(EVAL_NNUE)
-		else if (option == "nn_batch_size") is >> nn_batch_size;
-		else if (option == "newbob_decay") is >> newbob_decay;
-		else if (option == "newbob_num_trials") is >> newbob_num_trials;
-		else if (option == "nn_options") is >> nn_options;
+            else if (option == "nn_batch_size") is >> nn_batch_size;
+            else if (option == "newbob_decay") is >> newbob_decay;
+            else if (option == "newbob_num_trials") is >> newbob_num_trials;
+            else if (option == "nn_options") is >> nn_options;
 #endif
-		else if (option == "eval_save_interval") is >> eval_save_interval;
-		else if (option == "loss_output_interval") is >> loss_output_interval;
-		else if (option == "mirror_percentage") is >> mirror_percentage;
-		else if (option == "validation_set_file_name") is >> validation_set_file_name;
+            else if (option == "eval_save_interval") is >> eval_save_interval;
+            else if (option == "loss_output_interval") is >> loss_output_interval;
+            else if (option == "mirror_percentage") is >> mirror_percentage;
+            else if (option == "validation_set_file_name") is >> validation_set_file_name;
 
-		// Rabbit convert related
-		else if (option == "convert_plain") use_convert_plain = true;
-		else if (option == "convert_bin") use_convert_bin = true;
-		else if (option == "interpolate_eval") is >> interpolate_eval;
-		else if (option == "check_invalid_fen") is >> check_invalid_fen;
-		else if (option == "check_illegal_move") is >> check_illegal_move;
-		else if (option == "convert_bin_from_pgn-extract") use_convert_bin_from_pgn_extract = true;
-		else if (option == "pgn_eval_side_to_move") is >> pgn_eval_side_to_move;
-		else if (option == "convert_no_eval_fens_as_score_zero") is >> convert_no_eval_fens_as_score_zero;
-		else if (option == "src_score_min_value") is >> src_score_min_value;
-		else if (option == "src_score_max_value") is >> src_score_max_value;
-		else if (option == "dest_score_min_value") is >> dest_score_min_value;
-		else if (option == "dest_score_max_value") is >> dest_score_max_value;
-		else if (option == "convert_teacher_signal_to_winning_probability") is >> convert_teacher_signal_to_winning_probability;
-		else if (option == "use_raw_nnue_eval") is >> use_raw_nnue_eval;
+            // Rabbit convert related
+            else if (option == "convert_plain") use_convert_plain = true;
+            else if (option == "convert_bin") use_convert_bin = true;
+            else if (option == "interpolate_eval") is >> interpolate_eval;
+            else if (option == "check_invalid_fen") is >> check_invalid_fen;
+            else if (option == "check_illegal_move") is >> check_illegal_move;
+            else if (option == "convert_bin_from_pgn-extract") use_convert_bin_from_pgn_extract = true;
+            else if (option == "pgn_eval_side_to_move") is >> pgn_eval_side_to_move;
+            else if (option == "convert_no_eval_fens_as_score_zero") is >> convert_no_eval_fens_as_score_zero;
+            else if (option == "src_score_min_value") is >> src_score_min_value;
+            else if (option == "src_score_max_value") is >> src_score_max_value;
+            else if (option == "dest_score_min_value") is >> dest_score_min_value;
+            else if (option == "dest_score_max_value") is >> dest_score_max_value;
+            else if (option == "convert_teacher_signal_to_winning_probability") is >> convert_teacher_signal_to_winning_probability;
+            else if (option == "use_raw_nnue_eval") is >> use_raw_nnue_eval;
 
-		// Otherwise, it's a filename.
-		else
-			filenames.push_back(option);
-	}
-	if (loss_output_interval == 0)
-		loss_output_interval = LEARN_RMSE_OUTPUT_INTERVAL * mini_batch_size;
+            // Otherwise, it's a filename.
+            else
+                filenames.push_back(option);
+        }
+        if (loss_output_interval == 0)
+            loss_output_interval = LEARN_RMSE_OUTPUT_INTERVAL * mini_batch_size;
 
-	cout << "learn command , ";
+        cout << "learn command , ";
 
-	// Issue a warning if OpenMP is disabled.
+        // Issue a warning if OpenMP is disabled.
 #if !defined(_OPENMP)
-	cout << "Warning! OpenMP disabled." << endl;
+        cout << "Warning! OpenMP disabled." << endl;
 #endif
 
-	// Display learning game file
-	if (target_dir != "")
-	{
-		string kif_base_dir = Path::Combine(base_dir, target_dir);
+        // Display learning game file
+        if (target_dir != "")
+        {
+            string kif_base_dir = Path::Combine(base_dir, target_dir);
 
-		// Remove this folder. Keep it relative to base_dir.
+            // Remove this folder. Keep it relative to base_dir.
 #if defined(_MSC_VER)
-		// If you use std::tr2, warning C4996 will appear, so suppress it.
-		// * std::tr2 issued a deprecation warning by default under std:c++14, and was deleted by default in /std:c++17.
-		#pragma warning(push)
-		#pragma warning(disable:4996)
+        // If you use std::tr2, warning C4996 will appear, so suppress it.
+        // * std::tr2 issued a deprecation warning by default under std:c++14, and was deleted by default in /std:c++17.
+#pragma warning(push)
+#pragma warning(disable:4996)
 
-		namespace sys = std::filesystem;
-		sys::path p(kif_base_dir); // Origin of enumeration
-		std::for_each(sys::directory_iterator(p), sys::directory_iterator(),
-			[&](const sys::path& p) {
-			if (sys::is_regular_file(p))
-				filenames.push_back(Path::Combine(target_dir, p.filename().generic_string()));
-		});
-		#pragma warning(pop)
+            namespace sys = std::filesystem;
+            sys::path p(kif_base_dir); // Origin of enumeration
+            std::for_each(sys::directory_iterator(p), sys::directory_iterator(),
+                [&](const sys::path& p) {
+                    if (sys::is_regular_file(p))
+                        filenames.push_back(Path::Combine(target_dir, p.filename().generic_string()));
+                });
+#pragma warning(pop)
 
 #elif defined(__GNUC__)
 
-		auto ends_with = [](std::string const & value, std::string const & ending)
-		{
-			if (ending.size() > value.size()) return false;
-			return std::equal(ending.rbegin(), ending.rend(), value.rbegin());
-		};
+            auto ends_with = [](std::string const& value, std::string const& ending)
+            {
+                if (ending.size() > value.size()) return false;
+                return std::equal(ending.rbegin(), ending.rend(), value.rbegin());
+            };
 
-		// It can't be helped, so read it using dirent.h.
-		DIR *dp; // pointer to directory
-		dirent* entry; // entry point returned by readdir()
+            // It can't be helped, so read it using dirent.h.
+            DIR* dp; // pointer to directory
+            dirent* entry; // entry point returned by readdir()
 
-		dp = opendir(kif_base_dir.c_str());
-		if (dp != NULL)
-		{
-			do {
-				entry = readdir(dp);
-				// Only list files ending with ".bin"
-				// →I hate this restriction when generating files with serial numbers...
-				if (entry != NULL  && ends_with(entry->d_name, ".bin")  )
-				{
-					//cout << entry->d_name << endl;
-					filenames.push_back(Path::Combine(target_dir, entry->d_name));
-				}
-			} while (entry != NULL);
-			closedir(dp);
-		}
+            dp = opendir(kif_base_dir.c_str());
+            if (dp != NULL)
+            {
+                do {
+                    entry = readdir(dp);
+                    // Only list files ending with ".bin"
+                    // →I hate this restriction when generating files with serial numbers...
+                    if (entry != NULL && ends_with(entry->d_name, ".bin"))
+                    {
+                        //cout << entry->d_name << endl;
+                        filenames.push_back(Path::Combine(target_dir, entry->d_name));
+                    }
+                } while (entry != NULL);
+                closedir(dp);
+            }
 #endif
-	}
+        }
 
-	cout << "learn from ";
-	for (auto s : filenames)
-		cout << s << " , ";
-	cout << endl;
-	if (!validation_set_file_name.empty())
-	{
-		cout << "validation set  : " << validation_set_file_name << endl;
-	}
+        cout << "learn from ";
+        for (auto s : filenames)
+            cout << s << " , ";
+        cout << endl;
+        if (!validation_set_file_name.empty())
+        {
+            cout << "validation set  : " << validation_set_file_name << endl;
+        }
 
-	cout << "base dir        : " << base_dir   << endl;
-	cout << "target dir      : " << target_dir << endl;
+        cout << "base dir        : " << base_dir << endl;
+        cout << "target dir      : " << target_dir << endl;
 
-	// shuffle mode
-	if (shuffle_normal)
-	{
-		cout << "buffer_size     : " << buffer_size << endl;
-		cout << "shuffle mode.." << endl;
-		shuffle_files(filenames,output_file_name , buffer_size);
-		return;
-	}
-	if (shuffle_quick)
-	{
-		cout << "quick shuffle mode.." << endl;
-		shuffle_files_quick(filenames, output_file_name);
-		return;
-	}
-	if (shuffle_on_memory)
-	{
-		cout << "shuffle on memory.." << endl;
-		shuffle_files_on_memory(filenames,output_file_name);
-		return;
-	}
-	if (use_convert_plain)
-	{
-		Eval::init_NNUE();
-		cout << "convert_plain.." << endl;
-		convert_plain(filenames, output_file_name);
-		return;
-	}
-	if (use_convert_bin)
-	{
-		Eval::init_NNUE();
-		cout << "convert_bin.." << endl;
-		convert_bin(filenames,output_file_name, ply_minimum, ply_maximum, interpolate_eval, check_invalid_fen, check_illegal_move);
-		return;
+        // shuffle mode
+        if (shuffle_normal)
+        {
+            cout << "buffer_size     : " << buffer_size << endl;
+            cout << "shuffle mode.." << endl;
+            shuffle_files(filenames, output_file_name, buffer_size);
+            return;
+        }
+        if (shuffle_quick)
+        {
+            cout << "quick shuffle mode.." << endl;
+            shuffle_files_quick(filenames, output_file_name);
+            return;
+        }
+        if (shuffle_on_memory)
+        {
+            cout << "shuffle on memory.." << endl;
+            shuffle_files_on_memory(filenames, output_file_name);
+            return;
+        }
+        if (use_convert_plain)
+        {
+            Eval::init_NNUE();
+            cout << "convert_plain.." << endl;
+            convert_plain(filenames, output_file_name);
+            return;
+        }
+        if (use_convert_bin)
+        {
+            Eval::init_NNUE();
+            cout << "convert_bin.." << endl;
+            convert_bin(
+                filenames, 
+                output_file_name, 
+                ply_minimum, 
+                ply_maximum, 
+                interpolate_eval, 
+                src_score_min_value,
+                src_score_max_value,
+                dest_score_min_value,
+                dest_score_max_value,
+                check_invalid_fen, 
+                check_illegal_move);
+            return;
 
-	}
-	if (use_convert_bin_from_pgn_extract)
-	{
-		Eval::init_NNUE();
-		cout << "convert_bin_from_pgn-extract.." << endl;
-		convert_bin_from_pgn_extract(filenames, output_file_name, pgn_eval_side_to_move, convert_no_eval_fens_as_score_zero);
-		return;
-	}
+        }
+        if (use_convert_bin_from_pgn_extract)
+        {
+            Eval::init_NNUE();
+            cout << "convert_bin_from_pgn-extract.." << endl;
+            convert_bin_from_pgn_extract(filenames, output_file_name, pgn_eval_side_to_move, convert_no_eval_fens_as_score_zero);
+            return;
+        }
 
-	cout << "loop              : " << loop << endl;
-	cout << "eval_limit        : " << eval_limit << endl;
-	cout << "save_only_once    : " << (save_only_once ? "true" : "false") << endl;
-	cout << "no_shuffle        : " << (no_shuffle ? "true" : "false") << endl;
+        cout << "loop              : " << loop << endl;
+        cout << "eval_limit        : " << eval_limit << endl;
+        cout << "save_only_once    : " << (save_only_once ? "true" : "false") << endl;
+        cout << "no_shuffle        : " << (no_shuffle ? "true" : "false") << endl;
 
-	// Insert the file name for the number of loops.
-	for (int i = 0; i < loop; ++i)
-		// sfen reader, I'll read it in reverse order so I'll reverse it here. I'm sorry.
-		for (auto it = filenames.rbegin(); it != filenames.rend(); ++it)
-			sr.filenames.push_back(Path::Combine(base_dir, *it));
+        // Insert the file name for the number of loops.
+        for (int i = 0; i < loop; ++i)
+            // sfen reader, I'll read it in reverse order so I'll reverse it here. I'm sorry.
+            for (auto it = filenames.rbegin(); it != filenames.rend(); ++it)
+                sr.filenames.push_back(Path::Combine(base_dir, *it));
 
 #if !defined(EVAL_NNUE)
-	cout << "Gradient Method   : " << LEARN_UPDATE      << endl;
+        cout << "Gradient Method   : " << LEARN_UPDATE << endl;
 #endif
-	cout << "Loss Function     : " << LOSS_FUNCTION     << endl;
-	cout << "mini-batch size   : " << mini_batch_size   << endl;
+        cout << "Loss Function     : " << LOSS_FUNCTION << endl;
+        cout << "mini-batch size   : " << mini_batch_size << endl;
 #if defined(EVAL_NNUE)
-	cout << "nn_batch_size     : " << nn_batch_size     << endl;
-	cout << "nn_options        : " << nn_options        << endl;
+        cout << "nn_batch_size     : " << nn_batch_size << endl;
+        cout << "nn_options        : " << nn_options << endl;
 #endif
-	cout << "learning rate     : " << eta1 << " , " << eta2 << " , " << eta3 << endl;
-	cout << "eta_epoch         : " << eta1_epoch << " , " << eta2_epoch << endl;
-	cout << "use_draw_games_in_training : " << use_draw_games_in_training << endl;
-	cout << "use_draw_games_in_validation : " << use_draw_games_in_validation << endl;
-	cout << "skip_duplicated_positions_in_training : " << skip_duplicated_positions_in_training << endl;
+        cout << "learning rate     : " << eta1 << " , " << eta2 << " , " << eta3 << endl;
+        cout << "eta_epoch         : " << eta1_epoch << " , " << eta2_epoch << endl;
+        cout << "use_draw_games_in_training : " << use_draw_games_in_training << endl;
+        cout << "use_draw_games_in_validation : " << use_draw_games_in_validation << endl;
+        cout << "skip_duplicated_positions_in_training : " << skip_duplicated_positions_in_training << endl;
 #if defined(EVAL_NNUE)
-	if (newbob_decay != 1.0) {
-		cout << "scheduling        : newbob with decay = " << newbob_decay
-		     << ", " << newbob_num_trials << " trials" << endl;
-	} else {
-		cout << "scheduling        : default" << endl;
-	}
+        if (newbob_decay != 1.0) {
+            cout << "scheduling        : newbob with decay = " << newbob_decay
+                << ", " << newbob_num_trials << " trials" << endl;
+        }
+        else {
+            cout << "scheduling        : default" << endl;
+        }
 #endif
-	cout << "discount rate     : " << discount_rate     << endl;
+        cout << "discount rate     : " << discount_rate << endl;
 
-	// If reduction_gameply is set to 0, rand(0) will be divided by 0, so correct it to 1.
-	reduction_gameply = max(reduction_gameply, 1);
-	cout << "reduction_gameply : " << reduction_gameply << endl;
+        // If reduction_gameply is set to 0, rand(0) will be divided by 0, so correct it to 1.
+        reduction_gameply = max(reduction_gameply, 1);
+        cout << "reduction_gameply : " << reduction_gameply << endl;
 
 #if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
-	cout << "LAMBDA            : " << ELMO_LAMBDA       << endl;
-	cout << "LAMBDA2           : " << ELMO_LAMBDA2      << endl;
-	cout << "LAMBDA_LIMIT      : " << ELMO_LAMBDA_LIMIT << endl;
+        cout << "LAMBDA            : " << ELMO_LAMBDA << endl;
+        cout << "LAMBDA2           : " << ELMO_LAMBDA2 << endl;
+        cout << "LAMBDA_LIMIT      : " << ELMO_LAMBDA_LIMIT << endl;
 #endif
-	cout << "mirror_percentage : " << mirror_percentage << endl;
-	cout << "eval_save_interval  : " << eval_save_interval << " sfens" << endl;
-	cout << "loss_output_interval: " << loss_output_interval << " sfens" << endl;
+        cout << "mirror_percentage : " << mirror_percentage << endl;
+        cout << "eval_save_interval  : " << eval_save_interval << " sfens" << endl;
+        cout << "loss_output_interval: " << loss_output_interval << " sfens" << endl;
 
 #if defined(EVAL_KPPT) || defined(EVAL_KPP_KKPT) || defined(EVAL_KPP_KKPT_FV_VAR) || defined(EVAL_NABLA)
-	cout << "freeze_kk/kkp/kpp      : " << freeze[0] << " , " << freeze[1] << " , " << freeze[2] << endl;
+        cout << "freeze_kk/kkp/kpp      : " << freeze[0] << " , " << freeze[1] << " , " << freeze[2] << endl;
 #elif defined(EVAL_KPPPT) || defined(EVAL_KPPP_KKPT) || defined(EVAL_HELICES)
-	cout << "freeze_kk/kkp/kpp/kppp : " << freeze[0] << " , " << freeze[1] << " , " << freeze[2] << " , " << freeze[3] << endl;
+        cout << "freeze_kk/kkp/kpp/kppp : " << freeze[0] << " , " << freeze[1] << " , " << freeze[2] << " , " << freeze[3] << endl;
 #elif defined(EVAL_KKPP_KKPT) || defined(EVAL_KKPPT)
-	cout << "freeze_kk/kkp/kpp/kkpp : " << freeze[0] << " , " << freeze[1] << " , " << freeze[2] << " , " << freeze[3] << endl;
+        cout << "freeze_kk/kkp/kpp/kkpp : " << freeze[0] << " , " << freeze[1] << " , " << freeze[2] << " , " << freeze[3] << endl;
 #endif
 
-	// -----------------------------------
-	// various initialization
-	// -----------------------------------
+        // -----------------------------------
+        // various initialization
+        // -----------------------------------
 
-	cout << "init.." << endl;
+        cout << "init.." << endl;
 
-	// Read evaluation function parameters
-	Eval::init_NNUE();
+        // Read evaluation function parameters
+        Eval::init_NNUE();
 
 #if !defined(EVAL_NNUE)
-	cout << "init_grad.." << endl;
+        cout << "init_grad.." << endl;
 
-	// Initialize gradient array of merit function parameters
-	Eval::init_grad(eta1,eta1_epoch,eta2,eta2_epoch,eta3);
+        // Initialize gradient array of merit function parameters
+        Eval::init_grad(eta1, eta1_epoch, eta2, eta2_epoch, eta3);
 #else
-	cout << "init_training.." << endl;
-	Eval::NNUE::InitializeTraining(eta1,eta1_epoch,eta2,eta2_epoch,eta3);
-	Eval::NNUE::SetBatchSize(nn_batch_size);
-	Eval::NNUE::SetOptions(nn_options);
-	if (newbob_decay != 1.0 && !Options["SkipLoadingEval"]) {
-		learn_think.best_nn_directory = std::string(Options["EvalDir"]);
-	}
+        cout << "init_training.." << endl;
+        Eval::NNUE::InitializeTraining(eta1, eta1_epoch, eta2, eta2_epoch, eta3);
+        Eval::NNUE::SetBatchSize(nn_batch_size);
+        Eval::NNUE::SetOptions(nn_options);
+        if (newbob_decay != 1.0 && !Options["SkipLoadingEval"]) {
+            learn_think.best_nn_directory = std::string(Options["EvalDir"]);
+        }
 #endif
 
 #if 0
-	// A test to give a gradient of 1.0 to the initial stage of Hirate.
-	pos.set_hirate();
-	cout << Eval::evaluate(pos) << endl;
-	//Eval::print_eval_stat(pos);
-	Eval::add_grad(pos, BLACK, 32.0 , false);
-	Eval::update_weights(1);
-	pos.state()->sum.p[2][0] = VALUE_NOT_EVALUATED;
-	cout << Eval::evaluate(pos) << endl;
-	//Eval::print_eval_stat(pos);
+        // A test to give a gradient of 1.0 to the initial stage of Hirate.
+        pos.set_hirate();
+        cout << Eval::evaluate(pos) << endl;
+        //Eval::print_eval_stat(pos);
+        Eval::add_grad(pos, BLACK, 32.0, false);
+        Eval::update_weights(1);
+        pos.state()->sum.p[2][0] = VALUE_NOT_EVALUATED;
+        cout << Eval::evaluate(pos) << endl;
+        //Eval::print_eval_stat(pos);
 #endif
 
-	cout << "init done." << endl;
+        cout << "init done." << endl;
 
-	// Reflect other option settings.
-	learn_think.discount_rate = discount_rate;
-	learn_think.eval_limit = eval_limit;
-	learn_think.save_only_once = save_only_once;
-	learn_think.sr.no_shuffle = no_shuffle;
-	learn_think.freeze = freeze;
-	learn_think.reduction_gameply = reduction_gameply;
+        // Reflect other option settings.
+        learn_think.discount_rate = discount_rate;
+        learn_think.eval_limit = eval_limit;
+        learn_think.save_only_once = save_only_once;
+        learn_think.sr.no_shuffle = no_shuffle;
+        learn_think.freeze = freeze;
+        learn_think.reduction_gameply = reduction_gameply;
 #if defined(EVAL_NNUE)
-	learn_think.newbob_scale = 1.0;
-	learn_think.newbob_decay = newbob_decay;
-	learn_think.newbob_num_trials = newbob_num_trials;
+        learn_think.newbob_scale = 1.0;
+        learn_think.newbob_decay = newbob_decay;
+        learn_think.newbob_num_trials = newbob_num_trials;
 #endif
-	learn_think.eval_save_interval = eval_save_interval;
-	learn_think.loss_output_interval = loss_output_interval;
-	learn_think.mirror_percentage = mirror_percentage;
+        learn_think.eval_save_interval = eval_save_interval;
+        learn_think.loss_output_interval = loss_output_interval;
+        learn_think.mirror_percentage = mirror_percentage;
 
-	// Start a thread that loads the phase file in the background
-	// (If this is not started, mse cannot be calculated.)
-	learn_think.start_file_read_worker();
+        // Start a thread that loads the phase file in the background
+        // (If this is not started, mse cannot be calculated.)
+        learn_think.start_file_read_worker();
 
-	learn_think.mini_batch_size = mini_batch_size;
+        learn_think.mini_batch_size = mini_batch_size;
 
-	if (validation_set_file_name.empty()) {
-	// Get about 10,000 data for mse calculation.
-		sr.read_for_mse();
-	} else {
-		sr.read_validation_set(validation_set_file_name, eval_limit);
-	}
+        if (validation_set_file_name.empty()) {
+            // Get about 10,000 data for mse calculation.
+            sr.read_for_mse();
+        }
+        else {
+            sr.read_validation_set(validation_set_file_name, eval_limit);
+        }
 
-	// Calculate rmse once at this point (timing of 0 sfen)
-	// sr.calc_rmse();
+        // Calculate rmse once at this point (timing of 0 sfen)
+        // sr.calc_rmse();
 #if defined(EVAL_NNUE)
-	if (newbob_decay != 1.0) {
-		learn_think.calc_loss(0, -1);
-		learn_think.best_loss = learn_think.latest_loss_sum / learn_think.latest_loss_count;
-		learn_think.latest_loss_sum = 0.0;
-		learn_think.latest_loss_count = 0;
-		cout << "initial loss: " << learn_think.best_loss << endl;
-	}
+        if (newbob_decay != 1.0) {
+            learn_think.calc_loss(0, -1);
+            learn_think.best_loss = learn_think.latest_loss_sum / learn_think.latest_loss_count;
+            learn_think.latest_loss_sum = 0.0;
+            learn_think.latest_loss_count = 0;
+            cout << "initial loss: " << learn_think.best_loss << endl;
+        }
 #endif
 
-	// -----------------------------------
-	// start learning evaluation function parameters
-	// -----------------------------------
+        // -----------------------------------
+        // start learning evaluation function parameters
+        // -----------------------------------
 
-	// Start learning.
-	learn_think.go_think();
+        // Start learning.
+        learn_think.go_think();
 
-	// Save once at the end.
-	learn_think.save(true);
+        // Save once at the end.
+        learn_think.save(true);
 
 #if defined(USE_GLOBAL_OPTIONS)
-	// Restore Global Options.
-	GlobalOptions = oldGlobalOptions;
+        // Restore Global Options.
+        GlobalOptions = oldGlobalOptions;
 #endif
-}
+    }
 
 
 } // namespace Learner
diff --git a/src/learn/multi_think.h b/src/learn/multi_think.h
index 6e6c695c..6225144c 100644
--- a/src/learn/multi_think.h
+++ b/src/learn/multi_think.h
@@ -11,12 +11,15 @@
 #include "../thread_win32_osx.h"
 
 #include <atomic>
+#include <limits>
 
 // Learning from a game record, when making yourself think and generating a fixed track, etc.
 // Helper class used when multiple threads want to call Search::think() individually.
 // Derive and use this class.
 struct MultiThink
 {
+	static constexpr std::uint64_t LOOP_COUNT_FINISHED = std::numeric_limits<std::uint64_t>::max();
+
 	MultiThink() : prng(std::chrono::system_clock::now().time_since_epoch().count())
 	{
 		loop_count = 0;
@@ -62,7 +65,7 @@ struct MultiThink
 	uint64_t get_next_loop_count() {
 		std::unique_lock<std::mutex> lk(loop_mutex);
 		if (loop_count >= loop_max)
-			return UINT64_MAX;
+			return LOOP_COUNT_FINISHED;
 		return loop_count++;
 	}
 

From 9d5dc3d33f5774284a2854d5bf223fc55f91af51 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 3 Sep 2020 11:45:14 +0200
Subject: [PATCH 184/583] Fix compilation issues.

---
 src/learn/convert.cpp |  2 --
 src/learn/gensfen.cpp | 18 +++++++++---------
 src/learn/learner.cpp |  7 ++++---
 3 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/src/learn/convert.cpp b/src/learn/convert.cpp
index ebee8a96..387ac39b 100644
--- a/src/learn/convert.cpp
+++ b/src/learn/convert.cpp
@@ -1,5 +1,3 @@
-#define EVAL_LEARN
-
 #if defined(EVAL_LEARN)
 
 // evaluate header for learning
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 38bed2d5..e69528ac 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -1,6 +1,4 @@
-﻿#define EVAL_LEARN
-
-#if defined(EVAL_LEARN)
+﻿#if defined(EVAL_LEARN)
 
 #include "../eval/evaluate_common.h"
 
@@ -319,6 +317,7 @@ namespace Learner
             Position& pos,
             std::vector<StateInfo, AlignedAllocator<StateInfo>>& states,
             int ply,
+            int depth,
             vector<Move>& pv);
 
         // Min and max depths for search during gensfen
@@ -662,9 +661,10 @@ namespace Learner
     }
 
     Value MultiThinkGenSfen::evaluate_leaf(
-        Position& pos, 
+        Position& pos,
         std::vector<StateInfo, AlignedAllocator<StateInfo>>& states,
         int ply,
+        int depth,
         vector<Move>& pv)
     {
         auto rootColor = pos.side_to_move();
@@ -899,16 +899,16 @@ namespace Learner
                         // Result is added after the whole game is done.
                         pos.sfen_pack(psv.sfen);
 
-                        // Get the value of evaluate() as seen from the 
+                        // Get the value of evaluate() as seen from the
                         // root color on the leaf node of the PV line.
-                        // I don't know the goodness and badness of using the 
+                        // I don't know the goodness and badness of using the
                         // return value of search() as it is.
                         // TODO: Consider using search value instead of evaluate_leaf.
                         //       Maybe give it as an option.
-                        
-                        // Use PV moves to reach the leaf node and use the value 
+
+                        // Use PV moves to reach the leaf node and use the value
                         // that evaluated() is called on that leaf node.
-                        const auto leaf_value = evaluate_leaf(pos, states, ply, search_pv);
+                        const auto leaf_value = evaluate_leaf(pos, states, ply, depth, search_pv);
 
                         // If for some reason the leaf node couldn't yield an eval
                         // we fallback to search value.
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index c897dd93..2cf9d9f5 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -13,8 +13,6 @@
 // → I will not be involved in the engine because it is a problem that the GUI should assist.
 // etc..
 
-#define EVAL_LEARN
-
 #if defined(EVAL_LEARN)
 
 #include "../eval/evaluate_common.h"
@@ -98,10 +96,13 @@ namespace Learner
     // probabilities in the trainer. Sometimes we want to use the winning probabilities in the training
     // data directly. In those cases, we set false to this variable.
     static bool convert_teacher_signal_to_winning_probability = true;
+
     // Use raw NNUE eval value in the Eval::evaluate(). If hybrid eval is enabled, training data
     // generation and training don't work well.
     // https://discordapp.com/channels/435943710472011776/733545871911813221/748524079761326192
-    static bool use_raw_nnue_eval = true;
+    // This CANNOT be static since it's used elsewhere.
+    bool use_raw_nnue_eval = true;
+
     // Using WDL with win rate model instead of sigmoid
     static bool use_wdl = false;
 

From 2688194d44b9971ca3755d3dc7eba984b8c13350 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 3 Sep 2020 11:47:36 +0200
Subject: [PATCH 185/583] Fix #91

---
 src/learn/gensfen.cpp | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index e69528ac..8b6bf951 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -506,6 +506,8 @@ namespace Learner
 
         // From the final stage (one step before) to the first stage, give information on the outcome of the game for each stage.
         // The phases stored in sfens are assumed to be continuous (in order).
+        bool quit = false;
+        int num_sfens_to_commit = 0;
         for (auto it = sfens.rbegin(); it != sfens.rend(); ++it)
         {
             // If is_win == 0 (draw), multiply by -1 and it will remain 0 (draw)
@@ -517,19 +519,25 @@ namespace Learner
             auto now_loop_count = get_next_loop_count();
             if (now_loop_count == LOOP_COUNT_FINISHED)
             {
-                return true;
+                quit = true;
+                break;
             }
 
+            ++num_sfens_to_commit;
+        }
+
+        // Write sfens in move order to make potential compression easier
+        for (auto it = sfens.end() - num_sfens_to_commit; it != sfens.end(); ++it)
+        {
             // Write out one sfen.
             sfen_writer.write(thread_id, *it);
-
 #if 0
             pos.set_from_packed_sfen(it->sfen);
             cout << pos << "Win : " << it->is_win << " , " << it->score << endl;
 #endif
         }
 
-        return false;
+        return quit;
     }
 
     optional<Move> MultiThinkGenSfen::choose_random_move(

From 327e92aefe3eae316af9400b3e5d106c7e1de09c Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 3 Sep 2020 11:47:58 +0200
Subject: [PATCH 186/583] Remove trailing whitespaces.

---
 src/learn/convert.cpp |  20 ++++----
 src/learn/gensfen.cpp | 108 +++++++++++++++++++++---------------------
 src/learn/learner.cpp |  34 ++++++-------
 3 files changed, 81 insertions(+), 81 deletions(-)

diff --git a/src/learn/convert.cpp b/src/learn/convert.cpp
index 387ac39b..b84dc2f8 100644
--- a/src/learn/convert.cpp
+++ b/src/learn/convert.cpp
@@ -59,16 +59,16 @@ namespace Learner
     }
 
     void convert_bin(
-        const vector<string>& filenames, 
-        const string& output_file_name, 
-        const int ply_minimum, 
-        const int ply_maximum, 
-        const int interpolate_eval, 
+        const vector<string>& filenames,
+        const string& output_file_name,
+        const int ply_minimum,
+        const int ply_maximum,
+        const int interpolate_eval,
         const int src_score_min_value,
         const int src_score_max_value,
         const int dest_score_min_value,
         const int dest_score_max_value,
-        const bool check_invalid_fen, 
+        const bool check_invalid_fen,
         const bool check_illegal_move)
     {
         std::cout << "check_invalid_fen=" << check_invalid_fen << std::endl;
@@ -268,9 +268,9 @@ namespace Learner
     }
 
     void convert_bin_from_pgn_extract(
-        const vector<string>& filenames, 
-        const string& output_file_name, 
-        const bool pgn_eval_side_to_move, 
+        const vector<string>& filenames,
+        const string& output_file_name,
+        const bool pgn_eval_side_to_move,
         const bool convert_no_eval_fens_as_score_zero)
     {
         std::cout << "pgn_eval_side_to_move=" << pgn_eval_side_to_move << std::endl;
@@ -471,7 +471,7 @@ namespace Learner
     }
 
     void convert_plain(
-        const vector<string>& filenames, 
+        const vector<string>& filenames,
         const string& output_file_name)
     {
         Position tpos;
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 8b6bf951..89fa49e0 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -46,7 +46,7 @@
 #include <shared_mutex>
 #endif
 
-using namespace std; 
+using namespace std;
 
 namespace Learner
 {
@@ -54,7 +54,7 @@ namespace Learner
     static bool detect_draw_by_consecutive_low_score = false;
     static bool detect_draw_by_insufficient_mating_material = false;
 
-    // Use raw NNUE eval value in the Eval::evaluate(). 
+    // Use raw NNUE eval value in the Eval::evaluate().
     // If hybrid eval is enabled, training data
     // generation and training don't work well.
     // https://discordapp.com/channels/435943710472011776/733545871911813221/748524079761326192
@@ -66,7 +66,7 @@ namespace Learner
         // Amount of sfens required to flush the buffer.
         static constexpr size_t SFEN_WRITE_SIZE = 5000;
 
-        // Current status is output after 
+        // Current status is output after
         // each (SFEN_WRITE_SIZE * STATUS_OUTPUT_PERIOD) sfens
         static constexpr uint64_t STATUS_OUTPUT_PERIOD = 40;
 
@@ -106,7 +106,7 @@ namespace Learner
             // This buffer is prepared for each thread.
             auto& buf = sfen_buffers[thread_id];
 
-            // Secure since there is no buf at the first time 
+            // Secure since there is no buf at the first time
             // and immediately after writing the thread buffer.
             if (!buf)
             {
@@ -185,7 +185,7 @@ namespace Learner
 
                         sfen_write_count += buf->size();
 #if 1
-                        // Add the processed number here, and if it exceeds save_every, 
+                        // Add the processed number here, and if it exceeds save_every,
                         // change the file name and reset this counter.
                         sfen_write_count_current_file += buf->size();
                         if (sfen_write_count_current_file >= save_every)
@@ -197,8 +197,8 @@ namespace Learner
                             // Sequential number attached to the file
                             int n = (int)(sfen_write_count / save_every);
 
-                            // Rename the file and open it again. 
-                            // Add ios::app in consideration of overwriting. 
+                            // Rename the file and open it again.
+                            // Add ios::app in consideration of overwriting.
                             // (Depending on the operation, it may not be necessary.)
                             string new_filename = filename + "_" + std::to_string(n);
                             output_file_stream.open(new_filename, ios::out | ios::binary | ios::app);
@@ -208,13 +208,13 @@ namespace Learner
                         // Output '.' every time when writing a game record.
                         std::cout << ".";
 
-                        // Output the number of phases processed 
+                        // Output the number of phases processed
                         // every STATUS_OUTPUT_PERIOD times
-                        // Finally, the remainder of the teacher phase 
-                        // of each thread is written out, 
+                        // Finally, the remainder of the teacher phase
+                        // of each thread is written out,
                         // so halfway numbers are displayed, but is it okay?
-                        // If you overuse the threads to the maximum number 
-                        // of logical cores, the console will be clogged, 
+                        // If you overuse the threads to the maximum number
+                        // of logical cores, the console will be clogged,
                         // so it may be beneficial to increase that value.
                         if ((++batch_counter % STATUS_OUTPUT_PERIOD) == 0)
                         {
@@ -255,7 +255,7 @@ namespace Learner
         // buffer before writing to file
         // sfen_buffers is the buffer for each thread
         // sfen_buffers_pool is a buffer for writing.
-        // After loading the phase in the former buffer by SFEN_WRITE_SIZE, 
+        // After loading the phase in the former buffer by SFEN_WRITE_SIZE,
         // transfer it to the latter.
         std::vector<std::unique_ptr<PSVector>> sfen_buffers;
         std::vector<std::unique_ptr<PSVector>> sfen_buffers_pool;
@@ -263,7 +263,7 @@ namespace Learner
         // Mutex required to access sfen_buffers_pool
         std::mutex mutex;
 
-        // Number of sfens written in total, and the 
+        // Number of sfens written in total, and the
         // number of sfens written in the current file.
         uint64_t sfen_write_count = 0;
         uint64_t sfen_write_count_current_file = 0;
@@ -281,9 +281,9 @@ namespace Learner
         // It must be 2**N because it will be used as the mask to calculate hash_index.
         static_assert((GENSFEN_HASH_SIZE& (GENSFEN_HASH_SIZE - 1)) == 0);
 
-        MultiThinkGenSfen(int search_depth_min_, int search_depth_max_, SfenWriter& sw_) : 
-            search_depth_min(search_depth_min_), 
-            search_depth_max(search_depth_max_), 
+        MultiThinkGenSfen(int search_depth_min_, int search_depth_max_, SfenWriter& sw_) :
+            search_depth_min(search_depth_min_),
+            search_depth_max(search_depth_max_),
             sfen_writer(sw_)
         {
             hash.resize(GENSFEN_HASH_SIZE);
@@ -346,8 +346,8 @@ namespace Learner
 
         // For when using multi pv instead of random move.
         // random_multi_pv is the number of candidates for MultiPV.
-        // When adopting the move of the candidate move, the difference 
-        // between the evaluation value of the move of the 1st place 
+        // When adopting the move of the candidate move, the difference
+        // between the evaluation value of the move of the 1st place
         // and the evaluation value of the move of the Nth place is.
         // Must be in the range random_multi_pv_diff.
         // random_multi_pv_depth is the search depth for MultiPV.
@@ -355,7 +355,7 @@ namespace Learner
         int random_multi_pv_diff;
         int random_multi_pv_depth;
 
-        // The minimum and maximum ply (number of steps from 
+        // The minimum and maximum ply (number of steps from
         // the initial phase) of the sfens to write out.
         int write_minply;
         int write_maxply;
@@ -382,7 +382,7 @@ namespace Learner
         // move score in CP
         constexpr int adj_draw_score = 0;
 
-        // For the time being, it will be treated as a 
+        // For the time being, it will be treated as a
         // draw at the maximum number of steps to write.
         const int ply = move_hist_scores.size();
 
@@ -403,18 +403,18 @@ namespace Learner
         {
             Tablebases::rank_root_moves(pos, rootMoves);
         }
-        else 
+        else
         {
             // If there is no legal move
-            return pos.checkers() 
-                ? -1 /* mate */ 
+            return pos.checkers()
+                ? -1 /* mate */
                 : 0 /* stalemate */;
         }
 
         // Adjudicate game to a draw if the last 4 scores of each engine is 0.
-        if (detect_draw_by_consecutive_low_score) 
+        if (detect_draw_by_consecutive_low_score)
         {
-            if (ply >= adj_draw_ply) 
+            if (ply >= adj_draw_ply)
             {
                 int num_cons_plies_within_draw_score = 0;
                 bool is_adj_draw = false;
@@ -432,14 +432,14 @@ namespace Learner
                         break;
                     }
 
-                    if (num_cons_plies_within_draw_score >= adj_draw_cnt) 
+                    if (num_cons_plies_within_draw_score >= adj_draw_cnt)
                     {
                         is_adj_draw = true;
                         break;
                     }
                 }
 
-                if (is_adj_draw) 
+                if (is_adj_draw)
                 {
                     return 0;
                 }
@@ -447,33 +447,33 @@ namespace Learner
         }
 
         // Draw by insufficient mating material
-        if (detect_draw_by_insufficient_mating_material) 
+        if (detect_draw_by_insufficient_mating_material)
         {
-            if (pos.count<ALL_PIECES>() <= 4) 
+            if (pos.count<ALL_PIECES>() <= 4)
             {
                 int num_pieces = pos.count<ALL_PIECES>();
 
                 // (1) KvK
-                if (num_pieces == 2) 
+                if (num_pieces == 2)
                 {
                     return 0;
                 }
 
                 // (2) KvK + 1 minor piece
-                if (num_pieces == 3) 
+                if (num_pieces == 3)
                 {
                     int minor_pc = pos.count<BISHOP>(WHITE) + pos.count<KNIGHT>(WHITE) +
                         pos.count<BISHOP>(BLACK) + pos.count<KNIGHT>(BLACK);
-                    if (minor_pc == 1) 
+                    if (minor_pc == 1)
                     {
                         return 0;
                     }
                 }
 
                 // (3) KBvKB, bishops of the same color
-                else if (num_pieces == 4) 
+                else if (num_pieces == 4)
                 {
-                    if (pos.count<BISHOP>(WHITE) == 1 && pos.count<BISHOP>(BLACK) == 1) 
+                    if (pos.count<BISHOP>(WHITE) == 1 && pos.count<BISHOP>(BLACK) == 1)
                     {
                         // Color of bishops is black.
                         if ((pos.pieces(WHITE, BISHOP) & DarkSquares)
@@ -498,7 +498,7 @@ namespace Learner
     // Write out the phases loaded in sfens to a file.
     // lastTurnIsWin: win/loss in the next phase after the final phase in sfens
     // 1 when winning. -1 when losing. Pass 0 for a draw.
-    // Return value: true if the specified number of 
+    // Return value: true if the specified number of
     // sfens has already been reached and the process ends.
     bool MultiThinkGenSfen::commit_psv(PSVector& sfens, size_t thread_id, int8_t lastTurnIsWin)
     {
@@ -570,7 +570,7 @@ namespace Learner
                     // Normally one move from legal move
                     random_move = list.at((size_t)prng.rand((uint64_t)list.size()));
                 }
-                else 
+                else
                 {
                     // if you can move the king, move the king
                     Move moves[8]; // Near 8
@@ -589,7 +589,7 @@ namespace Learner
                         // move to move the king
                         random_move = moves[prng.rand(n)];
 
-                        // In Apery method, at this time there is a 1/2 chance 
+                        // In Apery method, at this time there is a 1/2 chance
                         // that the opponent will also move randomly
                         if (prng.rand(2) == 0)
                         {
@@ -604,7 +604,7 @@ namespace Learner
                     }
                 }
             }
-            else 
+            else
             {
                 Learner::search(pos, random_multi_pv_depth, random_multi_pv);
 
@@ -614,7 +614,7 @@ namespace Learner
                 uint64_t s = min((uint64_t)rm.size(), (uint64_t)random_multi_pv);
                 for (uint64_t i = 1; i < s; ++i)
                 {
-                    // The difference from the evaluation value of rm[0] must 
+                    // The difference from the evaluation value of rm[0] must
                     // be within the range of random_multi_pv_diff.
                     // It can be assumed that rm[x].score is arranged in descending order.
                     if (rm[0].score > rm[i].score + random_multi_pv_diff)
@@ -641,7 +641,7 @@ namespace Learner
 
         // Make an array like a[0] = 0 ,a[1] = 1, ...
         // Fisher-Yates shuffle and take out the first N items.
-        // Actually, I only want N pieces, so I only need 
+        // Actually, I only want N pieces, so I only need
         // to shuffle the first N pieces with Fisher-Yates.
 
         vector<int> a;
@@ -688,9 +688,9 @@ namespace Learner
 #endif
             pos.do_move(m, states[ply++]);
 
-            // Because the difference calculation of evaluate() cannot be 
+            // Because the difference calculation of evaluate() cannot be
             // performed unless each node evaluate() is called!
-            // If the depth is 8 or more, it seems 
+            // If the depth is 8 or more, it seems
             // faster not to calculate this difference.
 #if defined(EVAL_NNUE)
             if (depth < 8)
@@ -709,7 +709,7 @@ namespace Learner
             // VALUE_NONE and let the caller assign a value to the position.
             return VALUE_NONE;
         }
-        else 
+        else
         {
             v = Eval::evaluate(pos);
 
@@ -733,7 +733,7 @@ namespace Learner
     // thread_id = 0..Threads.size()-1
     void MultiThinkGenSfen::thread_worker(size_t thread_id)
     {
-        // For the time being, it will be treated as a draw 
+        // For the time being, it will be treated as a draw
         // at the maximum number of steps to write.
         // Maximum StateInfo + Search PV to advance to leaf buffer
         std::vector<StateInfo, AlignedAllocator<StateInfo>> states(
@@ -768,7 +768,7 @@ namespace Learner
             vector<uint8_t> random_move_flag = generate_random_move_flags();
 
             // A counter that keeps track of the number of random moves
-            // When random_move_minply == -1, random moves are 
+            // When random_move_minply == -1, random moves are
             // performed continuously, so use it at this time.
             // Used internally by choose_random_move.
             int actual_random_move_count = 0;
@@ -804,19 +804,19 @@ namespace Learner
 
                     if (random_move_minply != -1)
                     {
-                        // Random move is performed with a certain 
+                        // Random move is performed with a certain
                         // probability even in the constant phase.
                         goto RANDOM_MOVE;
                     }
                     else
                     {
-                        // When -1 is specified as random_move_minply, 
-                        // it points according to the standard until 
+                        // When -1 is specified as random_move_minply,
+                        // it points according to the standard until
                         // it goes out of the standard.
-                        // Prepare an innumerable number of situations 
-                        // that have left the constant as 
+                        // Prepare an innumerable number of situations
+                        // that have left the constant as
                         // ConsiderationBookMoveCount true using a huge constant
-                        // Used for purposes such as performing 
+                        // Used for purposes such as performing
                         // a random move 5 times from there.
                         goto DO_MOVE;
                     }
@@ -931,7 +931,7 @@ namespace Learner
 
                 SKIP_SAVE:;
 
-                    // For some reason, We could not get PV (hit the substitution table etc. and got stuck?) 
+                    // For some reason, We could not get PV (hit the substitution table etc. and got stuck?)
                     // so go to the next game. It's a rare case, so you can ignore it.
                     if (search_pv.size() == 0)
                     {
@@ -949,7 +949,7 @@ namespace Learner
                 {
                     next_move = random_move.value();
 
-                    // We don't have the whole game yet, but it ended, 
+                    // We don't have the whole game yet, but it ended,
                     // so the writing process ends and the next game starts.
                     if (!is_ok(next_move))
                     {
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 2cf9d9f5..88f2a0c3 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -421,7 +421,7 @@ namespace Learner
                         continue;
                     sfen_for_mse.push_back(p);
                 }
-                else 
+                else
                 {
                     break;
                 }
@@ -815,17 +815,17 @@ namespace Learner
             // Assign work to each thread using TaskDispatcher.
             // A task definition for that.
             // It is not possible to capture pos used in ↑, so specify the variables you want to capture one by one.
-            auto task = 
+            auto task =
                 [
-                    &ps, 
-                    &test_sum_cross_entropy_eval, 
-                    &test_sum_cross_entropy_win, 
-                    &test_sum_cross_entropy, 
-                    &test_sum_entropy_eval, 
-                    &test_sum_entropy_win, 
-                    &test_sum_entropy, 
-                    &sum_norm, 
-                    &task_count, 
+                    &ps,
+                    &test_sum_cross_entropy_eval,
+                    &test_sum_cross_entropy_win,
+                    &test_sum_cross_entropy,
+                    &test_sum_entropy_eval,
+                    &test_sum_entropy_win,
+                    &test_sum_entropy,
+                    &sum_norm,
+                    &task_count,
                     &move_accord_count
                 ](size_t task_thread_id)
             {
@@ -1906,16 +1906,16 @@ namespace Learner
             Eval::init_NNUE();
             cout << "convert_bin.." << endl;
             convert_bin(
-                filenames, 
-                output_file_name, 
-                ply_minimum, 
-                ply_maximum, 
-                interpolate_eval, 
+                filenames,
+                output_file_name,
+                ply_minimum,
+                ply_maximum,
+                interpolate_eval,
                 src_score_min_value,
                 src_score_max_value,
                 dest_score_min_value,
                 dest_score_max_value,
-                check_invalid_fen, 
+                check_invalid_fen,
                 check_illegal_move);
             return;
 

From 0612adec41f26ff618da76f57f7049d0cb2a38f8 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 4 Sep 2020 20:53:40 +0200
Subject: [PATCH 187/583] Fix incorrect early exit in evaluate_leaf.

---
 src/learn/gensfen.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 89fa49e0..23d7e2c6 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -702,12 +702,13 @@ namespace Learner
 
         // Reach leaf
         Value v;
-        if (pos.checkers()) {
+        if (pos.checkers())
+        {
             // Sometime a king is checked.  An example is a case that a checkmate is
             // found in the search.  If Eval::evaluate() is called whne a king is
             // checked, classic eval crashes by an assertion. To avoid crashes, return
             // VALUE_NONE and let the caller assign a value to the position.
-            return VALUE_NONE;
+            v = VALUE_NONE;
         }
         else
         {

From e9e6e47a93c5512204a55aa7416bf133b8ef6671 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 6 Sep 2020 12:47:37 +0200
Subject: [PATCH 188/583] Fix write_out_draw_game_in_training_data_generation
 flag not being respected.

---
 src/learn/gensfen.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 23d7e2c6..39edc699 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -502,6 +502,12 @@ namespace Learner
     // sfens has already been reached and the process ends.
     bool MultiThinkGenSfen::commit_psv(PSVector& sfens, size_t thread_id, int8_t lastTurnIsWin)
     {
+        if (!write_out_draw_game_in_training_data_generation && lastTurnIsWin == 0)
+        {
+            // We didn't write anything so why quit.
+            return false;
+        }
+
         int8_t is_win = lastTurnIsWin;
 
         // From the final stage (one step before) to the first stage, give information on the outcome of the game for each stage.

From 3bf418e63f93036d7ec5049e73ac945e75a901a0 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Sun, 6 Sep 2020 20:38:29 +0200
Subject: [PATCH 189/583] Fix some uninitialized variables with gensfen

fixes valgrind errors as seen with:

```
setoption name Use NNUE value true
isready
gensfen depth 6 loop 10 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data/training_data.bin use_raw_nnue_eval 0
quit
```

the latter script now runs without valgrind errors on linux
---
 src/learn/gensfen.cpp | 2 ++
 src/search.cpp        | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 39edc699..eeeb7b2e 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -1159,6 +1159,8 @@ namespace Learner
         // Show if the training data generator uses NNUE.
         Eval::verify_NNUE();
 
+        Threads.main()->ponder = false;
+
         // Create and execute threads as many as Options["Threads"].
         {
             SfenWriter sfen_writer(output_file_name, thread_num);
diff --git a/src/search.cpp b/src/search.cpp
index 2d848bcd..8f258ae4 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -2035,6 +2035,8 @@ namespace Learner
       th->completedDepth = 0;
       th->selDepth = 0;
       th->rootDepth = 0;
+      th->nmpMinPly = th->bestMoveChanges = 0;
+      th->ttHitAverage = TtHitAverageWindow * TtHitAverageResolution / 2;
 
 	  // Zero initialization of the number of search nodes
       th->nodes = 0;

From 3a06de298b06a1d2aed43d7c968dbd49ea44b662 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Sun, 6 Sep 2020 21:46:08 +0200
Subject: [PATCH 190/583] Define BLAS variables in Makefile

makes it a little easier to change the BLAS library used,
doesn't hardcode the mingw headers. Works on Linux with openblas installed.
Should be no change on Windows.
---
 src/Makefile | 35 ++++++++++++++++++++++-------------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 0c6b21e5..eef17406 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -28,6 +28,21 @@ else
 EXE = stockfish
 endif
 
+### Establish the operating system name
+KERNEL = $(shell uname -s)
+ifeq ($(KERNEL),Linux)
+	OS = $(shell uname -o)
+endif
+
+### BLAS libraries
+ifeq ($(KERNEL),Linux)
+	BLASCXXFLAGS =
+	BLASLDFLAGS = -lopenblas
+else
+	BLASCXXFLAGS = -I/mingw64/include/OpenBLAS
+	BLASLDFLAGS = -lopenblas -Wl,-s -static
+endif
+
 ### Installation dir definitions
 PREFIX = /usr/local
 BINDIR = $(PREFIX)/bin
@@ -61,12 +76,6 @@ OBJS = $(notdir $(SRCS:.cpp=.o))
 
 VPATH = syzygy:nnue:nnue/features:eval:extra:learn
 
-### Establish the operating system name
-KERNEL = $(shell uname -s)
-ifeq ($(KERNEL),Linux)
-	OS = $(shell uname -o)
-endif
-
 ### ==========================================================================
 ### Section 2. High-level Configuration
 ### ==========================================================================
@@ -308,7 +317,7 @@ endif
 ### ==========================================================================
 
 ### 3.1 Selecting compiler (default = gcc)
-CXXFLAGS += -Wall -Wcast-qual -fno-exceptions -std=c++17 $(EXTRACXXFLAGS) $(LEARNCXXFLAGS)
+CXXFLAGS += -g -Wall -Wcast-qual -fno-exceptions -std=c++17 $(EXTRACXXFLAGS) $(LEARNCXXFLAGS)
 DEPENDFLAGS += -std=c++17
 LDFLAGS += $(EXTRALDFLAGS) $(LEARNLDFLAGS)
 
@@ -890,16 +899,16 @@ icc-profile-use:
 
 learn: config-sanity
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
-	EXTRACXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp ' \
-	EXTRALDFLAGS=' -lopenblas -fopenmp -Wl,-s -static ' \
+	EXTRACXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	EXTRALDFLAGS=' $(BLASLDFLAGS) -fopenmp  ' \
 	all
 
 profile-learn: config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 1/4. Building instrumented executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) \
-	LEARNCXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp ' \
-	LEARNLDFLAGS=' -lopenblas -fopenmp -Wl,-s -static '
+	LEARNCXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	LEARNLDFLAGS='  $(BLASLDLAGS) -fopenmp '
 	@echo ""
 	@echo "Step 2/4. Running benchmark for pgo-build ..."
 	$(PGOGENSFEN)
@@ -907,8 +916,8 @@ profile-learn: config-sanity objclean profileclean
 	@echo "Step 3/4. Building optimized executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) objclean
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_use) \
-	LEARNCXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp ' \
-	LEARNLDFLAGS=' -lopenblas -fopenmp -Wl,-s -static '
+	LEARNCXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	LEARNLDFLAGS=' $(BLASLDFLAGS) -fopenmp '
 	@echo ""
 	@echo "Step 4/4. Deleting profile data ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) profileclean

From edbbc1a4df941b7e41bb0b4b34adfe7db90f3ec7 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Sun, 6 Sep 2020 22:13:42 +0200
Subject: [PATCH 191/583] Remove some warnings

---
 src/learn/gensfen.cpp                       |  2 +-
 src/misc.h                                  |  2 +-
 src/nnue/trainer/trainer_affine_transform.h |  8 ++++----
 src/nnue/trainer/trainer_clipped_relu.h     |  8 ++++----
 src/nnue/trainer/trainer_input_slice.h      | 16 ++++++++--------
 src/nnue/trainer/trainer_sum.h              | 18 +++++++++---------
 6 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index eeeb7b2e..6c8c455e 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -92,7 +92,7 @@ namespace Learner
             {
                 // All buffers should be empty since file_worker_thread
                 // should have written everything before exiting.
-                for (const auto& p : sfen_buffers) { assert(p == nullptr); }
+                for (const auto& p : sfen_buffers) { assert(p == nullptr); (void)p ; }
                 assert(sfen_buffers_pool.empty());
             }
 #endif
diff --git a/src/misc.h b/src/misc.h
index 19bb008c..d73d0633 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -259,7 +259,7 @@ public:
   template <typename U> AlignedAllocator(const AlignedAllocator<U>&) {}
 
   T* allocate(std::size_t n) { return (T*)std_aligned_alloc(alignof(T), n * sizeof(T)); }
-  void deallocate(T* p, std::size_t n) { std_aligned_free(p); }
+  void deallocate(T* p, std::size_t ) { std_aligned_free(p); }
 };
 
 // --------------------
diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index db56c1c0..da11ca29 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -25,9 +25,9 @@ class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
  public:
   // factory function
   static std::shared_ptr<Trainer> Create(
-      LayerType* target_layer, FeatureTransformer* feature_transformer) {
+      LayerType* target_layer, FeatureTransformer* ft) {
     return std::shared_ptr<Trainer>(
-        new Trainer(target_layer, feature_transformer));
+        new Trainer(target_layer, ft));
   }
 
   // Set options such as hyperparameters
@@ -186,11 +186,11 @@ class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
 
  private:
   // constructor
-  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer) :
+  Trainer(LayerType* target_layer, FeatureTransformer* ft) :
       batch_size_(0),
       batch_input_(nullptr),
       previous_layer_trainer_(Trainer<PreviousLayer>::Create(
-          &target_layer->previous_layer_, feature_transformer)),
+          &target_layer->previous_layer_, ft)),
       target_layer_(target_layer),
       biases_(),
       weights_(),
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index fd7b1a07..bd59a02d 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -23,9 +23,9 @@ class Trainer<Layers::ClippedReLU<PreviousLayer>> {
  public:
   // factory function
   static std::shared_ptr<Trainer> Create(
-      LayerType* target_layer, FeatureTransformer* feature_transformer) {
+      LayerType* target_layer, FeatureTransformer* ft) {
     return std::shared_ptr<Trainer>(
-        new Trainer(target_layer, feature_transformer));
+        new Trainer(target_layer, ft));
   }
 
   // Set options such as hyperparameters
@@ -78,10 +78,10 @@ class Trainer<Layers::ClippedReLU<PreviousLayer>> {
 
  private:
   // constructor
-  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer) :
+  Trainer(LayerType* target_layer, FeatureTransformer* ft) :
       batch_size_(0),
       previous_layer_trainer_(Trainer<PreviousLayer>::Create(
-          &target_layer->previous_layer_, feature_transformer)),
+          &target_layer->previous_layer_, ft)),
       target_layer_(target_layer) {
     std::fill(std::begin(min_activations_), std::end(min_activations_),
               std::numeric_limits<LearnFloatType>::max());
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index 33e39244..7d9e76c3 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -18,10 +18,10 @@ class SharedInputTrainer {
  public:
   // factory function
   static std::shared_ptr<SharedInputTrainer> Create(
-      FeatureTransformer* feature_transformer) {
+      FeatureTransformer* ft) {
     static std::shared_ptr<SharedInputTrainer> instance;
     if (!instance) {
-      instance.reset(new SharedInputTrainer(feature_transformer));
+      instance.reset(new SharedInputTrainer(ft));
     }
     ++instance->num_referrers_;
     return instance;
@@ -105,13 +105,13 @@ class SharedInputTrainer {
 
  private:
   // constructor
-  SharedInputTrainer(FeatureTransformer* feature_transformer) :
+  SharedInputTrainer(FeatureTransformer* ft) :
       batch_size_(0),
       num_referrers_(0),
       num_calls_(0),
       current_operation_(Operation::kNone),
       feature_transformer_trainer_(Trainer<FeatureTransformer>::Create(
-          feature_transformer)),
+          ft)),
       output_(nullptr) {
   }
 
@@ -161,8 +161,8 @@ class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
  public:
   // factory function
   static std::shared_ptr<Trainer> Create(
-      LayerType* /*target_layer*/, FeatureTransformer* feature_transformer) {
-    return std::shared_ptr<Trainer>(new Trainer(feature_transformer));
+      LayerType* /*target_layer*/, FeatureTransformer* ft) {
+    return std::shared_ptr<Trainer>(new Trainer(ft));
   }
 
   // Set options such as hyperparameters
@@ -218,9 +218,9 @@ class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
 
  private:
   // constructor
-  Trainer(FeatureTransformer* feature_transformer):
+  Trainer(FeatureTransformer* ft):
       batch_size_(0),
-      shared_input_trainer_(SharedInputTrainer::Create(feature_transformer)) {
+      shared_input_trainer_(SharedInputTrainer::Create(ft)) {
   }
 
   // number of input/output dimensions
diff --git a/src/nnue/trainer/trainer_sum.h b/src/nnue/trainer/trainer_sum.h
index fb5b1532..f7bf3b3d 100644
--- a/src/nnue/trainer/trainer_sum.h
+++ b/src/nnue/trainer/trainer_sum.h
@@ -25,9 +25,9 @@ class Trainer<Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>> :
  public:
   // factory function
   static std::shared_ptr<Trainer> Create(
-      LayerType* target_layer, FeatureTransformer* feature_transformer) {
+      LayerType* target_layer, FeatureTransformer* ft) {
     return std::shared_ptr<Trainer>(
-        new Trainer(target_layer, feature_transformer));
+        new Trainer(target_layer, ft));
   }
 
   // Set options such as hyperparameters
@@ -74,11 +74,11 @@ class Trainer<Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>> :
 
  private:
   // constructor
-  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer):
-      Tail(target_layer, feature_transformer),
+  Trainer(LayerType* target_layer, FeatureTransformer* ft):
+      Tail(target_layer, ft),
       batch_size_(0),
       previous_layer_trainer_(Trainer<FirstPreviousLayer>::Create(
-          &target_layer->previous_layer_, feature_transformer)),
+          &target_layer->previous_layer_, ft)),
       target_layer_(target_layer) {
   }
 
@@ -110,9 +110,9 @@ class Trainer<Layers::Sum<PreviousLayer>> {
  public:
   // factory function
   static std::shared_ptr<Trainer> Create(
-      LayerType* target_layer, FeatureTransformer* feature_transformer) {
+      LayerType* target_layer, FeatureTransformer* ft) {
     return std::shared_ptr<Trainer>(
-        new Trainer(target_layer, feature_transformer));
+        new Trainer(target_layer, ft));
   }
 
   // Set options such as hyperparameters
@@ -154,10 +154,10 @@ class Trainer<Layers::Sum<PreviousLayer>> {
 
  private:
   // constructor
-  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer) :
+  Trainer(LayerType* target_layer, FeatureTransformer* ft) :
       batch_size_(0),
       previous_layer_trainer_(Trainer<PreviousLayer>::Create(
-          &target_layer->previous_layer_, feature_transformer)),
+          &target_layer->previous_layer_, ft)),
       target_layer_(target_layer) {
   }
 

From e9e52faae7f85a0b4ff96ae1c457556fcf5ce5ae Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Mon, 7 Sep 2020 08:08:43 +0200
Subject: [PATCH 192/583] Typo fix

---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index eef17406..a8d7a13c 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -908,7 +908,7 @@ profile-learn: config-sanity objclean profileclean
 	@echo "Step 1/4. Building instrumented executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) \
 	LEARNCXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
-	LEARNLDFLAGS='  $(BLASLDLAGS) -fopenmp '
+	LEARNLDFLAGS='  $(BLASLDFLAGS) -fopenmp '
 	@echo ""
 	@echo "Step 2/4. Running benchmark for pgo-build ..."
 	$(PGOGENSFEN)

From 31e8be3008a87716447582d3f0e7e4cabc3d4e22 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Mon, 7 Sep 2020 08:38:14 +0200
Subject: [PATCH 193/583] First little CI step for the learner

---
 .travis.yml | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 092c7f53..a689702a 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -7,7 +7,7 @@ matrix:
       compiler: gcc
       addons:
         apt:
-          packages: ['g++-8', 'g++-8-multilib', 'g++-multilib', 'valgrind', 'expect', 'curl']
+          packages: ['g++-8', 'g++-8-multilib', 'g++-multilib', 'valgrind', 'expect', 'curl', 'openblas']
       env:
         - COMPILER=g++-8
         - COMP=gcc
@@ -16,7 +16,7 @@ matrix:
       compiler: clang
       addons:
         apt:
-          packages: ['clang-10', 'llvm-10-dev', 'g++-multilib', 'valgrind', 'expect', 'curl']
+          packages: ['clang-10', 'llvm-10-dev', 'g++-multilib', 'valgrind', 'expect', 'curl', 'openblas']
       env:
         - COMPILER=clang++-10
         - COMP=clang
@@ -74,6 +74,9 @@ script:
   # workaround: exclude a custom version of llvm+clang, which doesn't find llvm-profdata on ubuntu
   - if [[ "$TRAVIS_OS_NAME" != "linux" || "$COMP" == "gcc" ]]; then make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref; fi
 
+  # start some basic learner CI
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64-modern learn; fi
+
   # compile only for some more advanced architectures (might not run in travis)
   - make clean && make -j2 ARCH=x86-64-avx2 build
   - make clean && make -j2 ARCH=x86-64-bmi2 build

From bccc71afb412253507adcb64bf2acfc6618321e8 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Mon, 7 Sep 2020 08:50:59 +0200
Subject: [PATCH 194/583] fix openblas package name?

---
 .travis.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index a689702a..501c2d4b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -7,7 +7,7 @@ matrix:
       compiler: gcc
       addons:
         apt:
-          packages: ['g++-8', 'g++-8-multilib', 'g++-multilib', 'valgrind', 'expect', 'curl', 'openblas']
+          packages: ['g++-8', 'g++-8-multilib', 'g++-multilib', 'valgrind', 'expect', 'curl', 'libopenblas-dev']
       env:
         - COMPILER=g++-8
         - COMP=gcc
@@ -16,7 +16,7 @@ matrix:
       compiler: clang
       addons:
         apt:
-          packages: ['clang-10', 'llvm-10-dev', 'g++-multilib', 'valgrind', 'expect', 'curl', 'openblas']
+          packages: ['clang-10', 'llvm-10-dev', 'g++-multilib', 'valgrind', 'expect', 'curl', 'libopenblas-dev']
       env:
         - COMPILER=clang++-10
         - COMP=clang

From e004e47e5a16689fcdaa34a7d2b38016feeb83d1 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Mon, 7 Sep 2020 16:21:40 +0900
Subject: [PATCH 195/583] Commented out an unused function parameter to remove
 a compile warning.

---
 src/misc.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/misc.cpp b/src/misc.cpp
index 851280fe..5c2a4637 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -743,7 +743,7 @@ namespace Dependency {
 // The function to dig a folder on linux is good for the time being... Only used to save the evaluation function file...
 
 namespace Dependency {
-    int mkdir(std::string dir_name)
+    int mkdir(std::string /* dir_name */)
     {
         return 0;
     }

From 4cc98d80f8a5e6eb0b716a47bc4eb8b877b5a979 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Mon, 7 Sep 2020 18:56:41 +0900
Subject: [PATCH 196/583] Replaced the utility function to create a directory
 to std::filesystem.

---
 src/learn/learner.cpp              |  3 +-
 src/misc.cpp                       | 63 ------------------------------
 src/misc.h                         |  5 ---
 src/nnue/evaluate_nnue_learner.cpp |  3 +-
 4 files changed, 4 insertions(+), 70 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 88f2a0c3..7021fd7f 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -39,6 +39,7 @@
 #include <memory>
 #include <limits>
 #include <optional>
+#include <filesystem>
 
 #if defined (_OPENMP)
 #include <omp.h>
@@ -1467,7 +1468,7 @@ namespace Learner
             cout << ".";
         };
 
-        Dependency::mkdir("tmp");
+        std::filesystem::create_directory("tmp");
 
         // Shuffle and export as a 10M phase shredded file.
         for (auto filename : filenames)
diff --git a/src/misc.cpp b/src/misc.cpp
index 5c2a4637..a23b1205 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -687,66 +687,3 @@ int write_memory_to_file(std::string filename, void* ptr, uint64_t size)
     fs.close();
     return 0;
 }
-
-// ----------------------------
-//     mkdir wrapper
-// ----------------------------
-
-// Specify relative to the current folder. Returns 0 on success, non-zero on failure.
-// Create a folder. Japanese is not used.
-// In case of gcc under msys2 environment, folder creation fails with _wmkdir(). Cause unknown.
-// Use _mkdir() because there is no help for it.
-
-#if defined(_WIN32)
-// for Windows
-
-#if defined(_MSC_VER)
-#include <codecvt> // I need this because I want wstring to mkdir
-#include <locale> // This is required for wstring_convert.
-
-namespace Dependency {
-    int mkdir(std::string dir_name)
-    {
-        std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> cv;
-        return _wmkdir(cv.from_bytes(dir_name).c_str());
-        //	::CreateDirectory(cv.from_bytes(dir_name).c_str(),NULL);
-    }
-}
-
-#elif defined(__GNUC__) 
-
-#include <direct.h>
-namespace Dependency {
-    int mkdir(std::string dir_name)
-    {
-        return _mkdir(dir_name.c_str());
-    }
-}
-
-#endif
-#elif defined(__linux__)
-
-// In the linux environment, this symbol _LINUX is defined in the makefile.
-
-// mkdir implementation for Linux.
-#include "sys/stat.h"
-
-namespace Dependency {
-    int mkdir(std::string dir_name)
-    {
-        return ::mkdir(dir_name.c_str(), 0777);
-    }
-}
-#else
-
-// In order to judge whether it is a Linux environment, we have to divide the makefile..
-// The function to dig a folder on linux is good for the time being... Only used to save the evaluation function file...
-
-namespace Dependency {
-    int mkdir(std::string /* dir_name */)
-    {
-        return 0;
-    }
-}
-
-#endif
diff --git a/src/misc.h b/src/misc.h
index d73d0633..c918a351 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -273,11 +273,6 @@ namespace Dependency
   // So when calling getline() on fstream,
   // just write getline() instead of std::getline() and use this function.
   extern bool getline(std::ifstream& fs, std::string& s);
-
-  // Create a folder.
-  // Specify relative to the current folder. Japanese is not used for dir_name.
-  // Returns 0 on success, non-zero on failure.
-  extern int mkdir(std::string dir_name);
 }
 
 #endif // #ifndef MISC_H_INCLUDED
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 650f443e..13d9d578 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -4,6 +4,7 @@
 
 #include <random>
 #include <fstream>
+#include <filesystem>
 
 #include "../learn/learn.h"
 #include "../learn/learning_tools.h"
@@ -207,7 +208,7 @@ void save_eval(std::string dir_name) {
   // mkdir() will fail if this folder already exists, but
   // Apart from that. If not, I just want you to make it.
   // Also, assume that the folders up to EvalSaveDir have been dug.
-  Dependency::mkdir(eval_dir);
+  std::filesystem::create_directories(eval_dir);
 
   if (Options["SkipLoadingEval"] && NNUE::trainer) {
     NNUE::SendMessages({{"clear_unobserved_feature_weights"}});

From e638d66bbe2b155a7498ca717e44942726125503 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 7 Sep 2020 19:54:25 +0200
Subject: [PATCH 197/583] Only add -s flag to the linker if debug=no

---
 src/Makefile | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index a8d7a13c..db8213c0 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -34,15 +34,6 @@ ifeq ($(KERNEL),Linux)
 	OS = $(shell uname -o)
 endif
 
-### BLAS libraries
-ifeq ($(KERNEL),Linux)
-	BLASCXXFLAGS =
-	BLASLDFLAGS = -lopenblas
-else
-	BLASCXXFLAGS = -I/mingw64/include/OpenBLAS
-	BLASLDFLAGS = -lopenblas -Wl,-s -static
-endif
-
 ### Installation dir definitions
 PREFIX = /usr/local
 BINDIR = $(PREFIX)/bin
@@ -141,6 +132,20 @@ neon = no
 ARCH = x86-64-modern
 STRIP = strip
 
+### BLAS libraries
+ifeq ($(KERNEL),Linux)
+	BLASCXXFLAGS =
+	BLASLDFLAGS = -lopenblas
+else
+	BLASCXXFLAGS = -I/mingw64/include/OpenBLAS
+
+	ifeq ($(debug),yes)
+		BLASLDFLAGS = -lopenblas -Wl,-static
+	else
+		BLASLDFLAGS = -lopenblas -Wl,-s -static
+	endif
+endif
+
 ### 2.2 Architecture specific
 
 ifeq ($(findstring x86,$(ARCH)),x86)

From 6e8f82ad76c4fb48f34c27bf7fd5185759b2e087 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Mon, 7 Sep 2020 20:14:21 +0200
Subject: [PATCH 198/583] Fix small CI failures

1) Only access UCI option if defined
2) disable -Werror for now.
3) disable a few target that don't have _mm_malloc.
4) Add profile-learn target, with small speedup.
5) just test on Linux + gcc (skip macOS, unclear openblas, skip linux+clang, unclear omp/std::filesystem).
---
 .travis.yml                | 59 +++++++++++++++++++++-----------------
 src/Makefile               |  4 +--
 src/nnue/evaluate_nnue.cpp |  2 ++
 src/ucioption.cpp          |  2 +-
 4 files changed, 37 insertions(+), 30 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 501c2d4b..5859f97b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -12,28 +12,28 @@ matrix:
         - COMPILER=g++-8
         - COMP=gcc
 
-    - os: linux
-      compiler: clang
-      addons:
-        apt:
-          packages: ['clang-10', 'llvm-10-dev', 'g++-multilib', 'valgrind', 'expect', 'curl', 'libopenblas-dev']
-      env:
-        - COMPILER=clang++-10
-        - COMP=clang
-
-    - os: osx
-      osx_image: xcode12
-      compiler: gcc
-      env:
-        - COMPILER=g++
-        - COMP=gcc
-
-    - os: osx
-      osx_image: xcode12
-      compiler: clang
-      env:
-        - COMPILER=clang++
-        - COMP=clang
+#    - os: linux
+#      compiler: clang
+#      addons:
+#        apt:
+#          packages: ['clang-10', 'llvm-10-dev', 'g++-multilib', 'valgrind', 'expect', 'curl', 'openblas']
+#      env:
+#        - COMPILER=clang++-10
+#        - COMP=clang
+#
+#    - os: osx
+#      osx_image: xcode12
+#      compiler: gcc
+#      env:
+#        - COMPILER=g++
+#        - COMP=gcc
+#
+#    - os: osx
+#      osx_image: xcode12
+#      compiler: clang
+#      env:
+#        - COMPILER=clang++
+#        - COMP=clang
 
 branches:
   only:
@@ -65,17 +65,22 @@ script:
   - make clean && make -j2 ARCH=x86-64-ssse3 build && ../tests/signature.sh $benchref
   - make clean && make -j2 ARCH=x86-64-sse3-popcnt build && ../tests/signature.sh $benchref
   - make clean && make -j2 ARCH=x86-64 build && ../tests/signature.sh $benchref
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-64 build && ../tests/signature.sh $benchref; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 optimize=no debug=yes build && ../tests/signature.sh $benchref; fi
+  # TODO avoid _mm_malloc
+  # - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-64 build && ../tests/signature.sh $benchref; fi
+  # - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 optimize=no debug=yes build && ../tests/signature.sh $benchref; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-sse41-popcnt build && ../tests/signature.sh $benchref; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-sse2 build && ../tests/signature.sh $benchref; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 build && ../tests/signature.sh $benchref; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-32 build && ../tests/signature.sh $benchref; fi
+  # TODO avoid _mm_malloc
+  # - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 build && ../tests/signature.sh $benchref; fi
+  # - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-32 build && ../tests/signature.sh $benchref; fi
   # workaround: exclude a custom version of llvm+clang, which doesn't find llvm-profdata on ubuntu
   - if [[ "$TRAVIS_OS_NAME" != "linux" || "$COMP" == "gcc" ]]; then make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref; fi
 
   # start some basic learner CI
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64-modern learn; fi
+  #TODO enable -Werror
+  - export CXXFLAGS=""
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && LDFLAGS="-lstdc++fs" make -j2 ARCH=x86-64-modern learn; fi
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && LDFLAGS="-lstdc++fs" make -j2 ARCH=x86-64-modern profile-learn; fi
 
   # compile only for some more advanced architectures (might not run in travis)
   - make clean && make -j2 ARCH=x86-64-avx2 build
diff --git a/src/Makefile b/src/Makefile
index db8213c0..9db13e44 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -40,7 +40,7 @@ BINDIR = $(PREFIX)/bin
 
 ### Built-in benchmark for pgo-builds
 PGOBENCH = ./$(EXE) bench
-PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 100000
+PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 1000
 
 ### Source and object files
 SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \
@@ -908,7 +908,7 @@ learn: config-sanity
 	EXTRALDFLAGS=' $(BLASLDFLAGS) -fopenmp  ' \
 	all
 
-profile-learn: config-sanity objclean profileclean
+profile-learn: net config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 1/4. Building instrumented executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) \
diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index a2845c96..5c8cee71 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -184,11 +184,13 @@ namespace Eval::NNUE {
 
     Initialize();
 
+#if defined(EVAL_NNUE)
     if (Options["SkipLoadingEval"])
     {
       std::cout << "info string SkipLoadingEval set to true, Net not loaded!" << std::endl;
       return true;
     }
+#endif
 
     fileName = evalFile;
 
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 519160cf..0007b559 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -96,7 +96,7 @@ void init(OptionsMap& o) {
 #if defined(EVAL_LEARN)
   // When learning the evaluation function, you can change the folder to save the evaluation function.
   // Evalsave by default. This folder shall be prepared in advance.
-  // Automatically dig a folder under this folder like "0/", "1/", ... and save the evaluation function file there.
+  // Automatically create a folder under this folder like "0/", "1/", ... and save the evaluation function file there.
   o["EvalSaveDir"] << Option("evalsave");
 #endif
 }

From e5f05fa2b9f60503e121102ba94390e6974ced1e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 7 Sep 2020 14:32:05 +0200
Subject: [PATCH 199/583] Add a script to extract a contiguous range of entries
 from a .bin file.

---
 script/extract_bin.py | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 script/extract_bin.py

diff --git a/script/extract_bin.py b/script/extract_bin.py
new file mode 100644
index 00000000..9574aa17
--- /dev/null
+++ b/script/extract_bin.py
@@ -0,0 +1,42 @@
+import sys
+
+ENTRY_SIZE = 40
+NUM_ENTRIES_IN_CHUNK = 1024*1024
+
+def copy(infile, outfile, count, times):
+    if times > 1:
+        outfile.write(infile.read(count*ENTRY_SIZE)*times)
+    else:
+        offset = 0
+        while offset < count:
+            to_read = NUM_ENTRIES_IN_CHUNK if offset + NUM_ENTRIES_IN_CHUNK <= count else count - offset
+
+            outfile.write(infile.read(to_read*ENTRY_SIZE))
+
+            offset += NUM_ENTRIES_IN_CHUNK
+
+def work():
+    filename = sys.argv[1]
+    offset = int(sys.argv[2])
+    count = int(sys.argv[3])
+    times = int(sys.argv[4]) if len(sys.argv) >= 5 else 1
+
+    with open(filename, 'rb') as infile:
+        infile.seek(offset * ENTRY_SIZE)
+        filename_parts = filename.split('.')
+        out_path = '.'.join(filename_parts[:-1]) + '_' + str(offset) + '_' + str(count) + '_' + str(times) + '.' + filename_parts[-1]
+        with open(out_path, 'wb') as outfile:
+            copy(infile, outfile, count, times)
+
+def show_help():
+    print('Usage: python extract_bin.py filename offset count [times]')
+    print('filename - the path to the .bin file to process')
+    print('offset - the number of sfens to skip')
+    print('count - the number of sfens to extract')
+    print('times - the number of times to repeat the extracted sfens. Default = 1')
+    print('The result is saved in a new file named `filename.stem`_`offset`_`count`_`times`.bin')
+
+if len(sys.argv) < 4:
+    show_help()
+else:
+    work()

From 58863c32436c22ea05121e039850253510d923d1 Mon Sep 17 00:00:00 2001
From: noobpwnftw <noobpwnftw@users.noreply.github.com>
Date: Tue, 8 Sep 2020 11:39:21 +0800
Subject: [PATCH 200/583] Update gensfen.cpp

---
 src/learn/gensfen.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 6c8c455e..4214233b 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -58,7 +58,7 @@ namespace Learner
     // If hybrid eval is enabled, training data
     // generation and training don't work well.
     // https://discordapp.com/channels/435943710472011776/733545871911813221/748524079761326192
-    static bool use_raw_nnue_eval = true;
+    extern bool use_raw_nnue_eval;
 
     // Helper class for exporting Sfen
     struct SfenWriter

From 832c414b0d78263595b4e7cd6d19c87e61519010 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 7 Sep 2020 23:03:53 +0200
Subject: [PATCH 201/583] First batch of reorganization.

---
 src/learn/learner.cpp | 402 +++++++++++++++++++++++++-----------------
 src/misc.cpp          |  21 ++-
 src/misc.h            |  32 +++-
 3 files changed, 278 insertions(+), 177 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 7021fd7f..98c8e32e 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -66,7 +66,7 @@ using namespace std;
 //extern Book::BookMoveSelector book;
 
 template <typename T>
-T operator += (std::atomic<T>& x, const T rhs)
+T operator +=(std::atomic<T>& x, const T rhs)
 {
     T old = x.load(std::memory_order_consume);
     // It is allowed that the value is rewritten from other thread at this timing.
@@ -84,8 +84,9 @@ namespace Learner
     static bool use_draw_games_in_training = false;
     static bool use_draw_games_in_validation = false;
     static bool skip_duplicated_positions_in_training = true;
-    // 1.0 / PawnValueEg / 4.0 * log(10.0)
-    static double winning_probability_coefficient = 0.00276753015984861260098316280611;
+
+    static double winning_probability_coefficient = 1.0 / PawnValueEg / 4.0 * std::log(10.0);
+
     // Score scale factors.  ex) If we set src_score_min_value = 0.0,
     // src_score_max_value = 1.0, dest_score_min_value = 0.0,
     // dest_score_max_value = 10000.0, [0.0, 1.0] will be scaled to [0, 10000].
@@ -93,6 +94,7 @@ namespace Learner
     static double src_score_max_value = 1.0;
     static double dest_score_min_value = 0.0;
     static double dest_score_max_value = 1.0;
+
     // Assume teacher signals are the scores of deep searches, and convert them into winning
     // probabilities in the trainer. Sometimes we want to use the winning probabilities in the training
     // data directly. In those cases, we set false to this variable.
@@ -102,7 +104,7 @@ namespace Learner
     // generation and training don't work well.
     // https://discordapp.com/channels/435943710472011776/733545871911813221/748524079761326192
     // This CANNOT be static since it's used elsewhere.
-    bool use_raw_nnue_eval = true;
+    bool use_raw_nnue_eval = false;
 
     // Using WDL with win rate model instead of sigmoid
     static bool use_wdl = false;
@@ -111,38 +113,37 @@ namespace Learner
     // command to learn from the generated game (learn)
     // -----------------------------------
 
-    // ordinary sigmoid function
-    double sigmoid(double x)
-    {
-        return 1.0 / (1.0 + std::exp(-x));
-    }
-
     // A function that converts the evaluation value to the winning rate [0,1]
     double winning_percentage(double value)
     {
         // 1/(1+10^(-Eval/4))
         // = 1/(1+e^(-Eval/4*ln(10))
         // = sigmoid(Eval/4*ln(10))
-        return sigmoid(value * winning_probability_coefficient);
+        return Math::sigmoid(value * winning_probability_coefficient);
     }
 
     // A function that converts the evaluation value to the winning rate [0,1]
     double winning_percentage_wdl(double value, int ply)
     {
+        constexpr double wdl_total = 1000.0;
+        constexpr double draw_score = 0.5;
+
         double wdl_w = UCI::win_rate_model_double(value, ply);
         double wdl_l = UCI::win_rate_model_double(-value, ply);
-        double wdl_d = 1000.0 - wdl_w - wdl_l;
+        double wdl_d = wdl_total - wdl_w - wdl_l;
 
-        return (wdl_w + wdl_d / 2.0) / 1000.0;
+        return (wdl_w + wdl_d * draw_score) / wdl_total;
     }
 
     // A function that converts the evaluation value to the winning rate [0,1]
     double winning_percentage(double value, int ply)
     {
-        if (use_wdl) {
+        if (use_wdl) 
+        {
             return winning_percentage_wdl(value, ply);
         }
-        else {
+        else 
+        {
             return winning_percentage(value);
         }
     }
@@ -151,7 +152,7 @@ namespace Learner
     {
         double p = deep_win_rate;
         double q = winning_percentage(shallow_eval, ply);
-        return -p * std::log(q) - (1 - p) * std::log(1 - q);
+        return -p * std::log(q) - (1.0 - p) * std::log(1.0 - q);
     }
 
     double calc_d_cross_entropy_of_winning_percentage(double deep_win_rate, double shallow_eval, int ply)
@@ -164,17 +165,6 @@ namespace Learner
         return ((y2 - y1) / epsilon) / winning_probability_coefficient;
     }
 
-    double dsigmoid(double x)
-    {
-        // Sigmoid function
-        // f(x) = 1/(1+exp(-x))
-        // the first derivative is
-        // f'(x) = df/dx = f(x)・{ 1-f(x)}
-        // becomes
-
-        return sigmoid(x) * (1.0 - sigmoid(x));
-    }
-
     // When the objective function is the sum of squares of the difference in winning percentage
 #if defined (LOSS_FUNCTION_IS_WINNING_PERCENTAGE)
 // function to calculate the gradient
@@ -202,7 +192,7 @@ namespace Learner
 
         double p = winning_percentage(deep);
         double q = winning_percentage(shallow);
-        return (q - p) * dsigmoid(double(shallow) / 600.0);
+        return (q - p) * Math::dsigmoid(double(shallow) / 600.0);
     }
 #endif
 
@@ -253,39 +243,75 @@ namespace Learner
     double ELMO_LAMBDA2 = 0.33;
     double ELMO_LAMBDA_LIMIT = 32000;
 
+    // Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
+    double get_scaled_signal(double signal)
+    {
+        double scaled_signal = signal;
+
+        // Normalize to [0.0, 1.0].
+        scaled_signal =
+            (scaled_signal - src_score_min_value)
+            / (src_score_max_value - src_score_min_value);
+
+        // Scale to [dest_score_min_value, dest_score_max_value].
+        scaled_signal =
+            scaled_signal * (dest_score_max_value - dest_score_min_value)
+            + dest_score_min_value;
+
+        return scaled_signal;
+    }
+
+    // Teacher winning probability.
+    double calculate_p(double teacher_signal, int ply)
+    {
+        const double scaled_teacher_signal = get_scaled_signal(teacher_signal);
+
+        // Teacher winning probability.
+        double p = scaled_teacher_signal;
+        if (convert_teacher_signal_to_winning_probability) 
+        {
+            p = winning_percentage(scaled_teacher_signal);
+        }
+    }
+
+    double calculate_lambda(double teacher_signal)
+    {
+        // If the evaluation value in deep search exceeds ELMO_LAMBDA_LIMIT, apply ELMO_LAMBDA2 instead of ELMO_LAMBDA.
+        const double lambda =
+            (std::abs(teacher_signal) >= ELMO_LAMBDA_LIMIT)
+            ? ELMO_LAMBDA2
+            : ELMO_LAMBDA;
+
+        return lambda;
+    }
+
+    double calculate_t(int game_result)
+    {
+        // Use 1 as the correction term if the expected win rate is 1, 0 if you lose, and 0.5 if you draw.
+        // game_result = 1,0,-1 so add 1 and divide by 2.
+        const double t = double(game_result + 1) * 0.5;
+
+        return t;
+    }
+
     double calc_grad(Value teacher_signal, Value shallow, const PackedSfenValue& psv)
     {
         // elmo (WCSC27) method
         // Correct with the actual game wins and losses.
-
-        // Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
-        double scaled_teacher_signal = teacher_signal;
-        // Normalize to [0.0, 1.0].
-        scaled_teacher_signal = (scaled_teacher_signal - src_score_min_value) / (src_score_max_value - src_score_min_value);
-        // Scale to [dest_score_min_value, dest_score_max_value].
-        scaled_teacher_signal = scaled_teacher_signal * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
-
         const double q = winning_percentage(shallow, psv.gamePly);
-        // Teacher winning probability.
-        double p = scaled_teacher_signal;
-        if (convert_teacher_signal_to_winning_probability) {
-            p = winning_percentage(scaled_teacher_signal, psv.gamePly);
-        }
-
-        // Use 1 as the correction term if the expected win rate is 1, 0 if you lose, and 0.5 if you draw.
-        // game_result = 1,0,-1 so add 1 and divide by 2.
-        const double t = double(psv.game_result + 1) / 2;
-
-        // If the evaluation value in deep search exceeds ELMO_LAMBDA_LIMIT, apply ELMO_LAMBDA2 instead of ELMO_LAMBDA.
-        const double lambda = (abs(teacher_signal) >= ELMO_LAMBDA_LIMIT) ? ELMO_LAMBDA2 : ELMO_LAMBDA;
+        const double p = calculate_p(teacher_signal, psv.gamePly);
+        const double t = calculate_t(psv.game_result);
+        const double lambda = calculate_lambda(teacher_signal);
 
         double grad;
-        if (use_wdl) {
-            double dce_p = calc_d_cross_entropy_of_winning_percentage(p, shallow, psv.gamePly);
-            double dce_t = calc_d_cross_entropy_of_winning_percentage(t, shallow, psv.gamePly);
+        if (use_wdl) 
+        {
+            const double dce_p = calc_d_cross_entropy_of_winning_percentage(p, shallow, psv.gamePly);
+            const double dce_t = calc_d_cross_entropy_of_winning_percentage(t, shallow, psv.gamePly);
             grad = lambda * dce_p + (1.0 - lambda) * dce_t;
         }
-        else {
+        else 
+        {
             // Use the actual win rate as a correction term.
             // This is the idea of ​​elmo (WCSC27), modern O-parts.
             grad = lambda * (q - p) + (1.0 - lambda) * (q - t);
@@ -296,30 +322,25 @@ namespace Learner
 
     // Calculate cross entropy during learning
     // The individual cross entropy of the win/loss term and win rate term of the elmo expression is returned to the arguments cross_entropy_eval and cross_entropy_win.
-    void calc_cross_entropy(Value teacher_signal, Value shallow, const PackedSfenValue& psv,
-        double& cross_entropy_eval, double& cross_entropy_win, double& cross_entropy,
-        double& entropy_eval, double& entropy_win, double& entropy)
+    void calc_cross_entropy(
+        Value teacher_signal, 
+        Value shallow, 
+        const PackedSfenValue& psv,
+        double& cross_entropy_eval, 
+        double& cross_entropy_win, 
+        double& cross_entropy,
+        double& entropy_eval, 
+        double& entropy_win, 
+        double& entropy)
     {
-        // Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
-        double scaled_teacher_signal = teacher_signal;
-        // Normalize to [0.0, 1.0].
-        scaled_teacher_signal = (scaled_teacher_signal - src_score_min_value) / (src_score_max_value - src_score_min_value);
-        // Scale to [dest_score_min_value, dest_score_max_value].
-        scaled_teacher_signal = scaled_teacher_signal * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
-
         // Teacher winning probability.
-        double p = scaled_teacher_signal;
-        if (convert_teacher_signal_to_winning_probability) {
-            p = winning_percentage(scaled_teacher_signal);
-        }
-        const double q /* eval_winrate    */ = winning_percentage(shallow);
-        const double t = double(psv.game_result + 1) / 2;
+        const double q = winning_percentage(shallow, psv.gamePly);
+        const double p = calculate_p(teacher_signal, psv.gamePly);
+        const double t = calculate_t(psv.game_result);
+        const double lambda = calculate_lambda(teacher_signal);
 
         constexpr double epsilon = 0.000001;
 
-        // If the evaluation value in deep search exceeds ELMO_LAMBDA_LIMIT, apply ELMO_LAMBDA2 instead of ELMO_LAMBDA.
-        const double lambda = (abs(teacher_signal) >= ELMO_LAMBDA_LIMIT) ? ELMO_LAMBDA2 : ELMO_LAMBDA;
-
         const double m = (1.0 - lambda) * t + lambda * p;
 
         cross_entropy_eval =
@@ -343,7 +364,8 @@ namespace Learner
     // Other variations may be prepared as the objective function..
 
 
-    double calc_grad(Value shallow, const PackedSfenValue& psv) {
+    double calc_grad(Value shallow, const PackedSfenValue& psv) 
+    {
         return calc_grad((Value)psv.score, shallow, psv);
     }
 
@@ -363,8 +385,14 @@ namespace Learner
         // SFEN_READ_SIZE is a multiple of THREAD_BUFFER_SIZE.
         static constexpr const size_t SFEN_READ_SIZE = LEARN_SFEN_READ_SIZE;
 
+        // hash to limit the reading of the same situation
+        // Is there too many 64 million phases? Or Not really..
+        // It must be 2**N because it will be used as the mask to calculate hash_index.
+        static constexpr uint64_t READ_SFEN_HASH_SIZE = 64 * 1024 * 1024;
+
         // Do not use std::random_device().  Because it always the same integers on MinGW.
-        SfenReader(int thread_num) : prng(std::chrono::system_clock::now().time_since_epoch().count())
+        SfenReader(int thread_num) : 
+            prng(std::chrono::system_clock::now().time_since_epoch().count())
         {
             packed_sfens.resize(thread_num);
             total_read = 0;
@@ -398,6 +426,7 @@ namespace Learner
                     cout << "Error! read packed sfen , failed." << endl;
                     break;
                 }
+
                 sfen_for_mse.push_back(ps);
 
                 // Get the hash key.
@@ -418,8 +447,10 @@ namespace Learner
                 {
                     if (eval_limit < abs(p.score))
                         continue;
+
                     if (!use_draw_games_in_validation && p.game_result == 0)
                         continue;
+
                     sfen_for_mse.push_back(p);
                 }
                 else
@@ -436,7 +467,7 @@ namespace Learner
             auto& thread_ps = packed_sfens[thread_id];
 
             // Fill the read buffer if there is no remaining buffer, but if it doesn't even exist, finish.
-            if ((thread_ps == nullptr || thread_ps->size() == 0) // If the buffer is empty, fill it.
+            if ((thread_ps == nullptr || thread_ps->empty()) // If the buffer is empty, fill it.
                 && !read_to_thread_buffer_impl(thread_id))
                 return false;
 
@@ -444,11 +475,11 @@ namespace Learner
             // Since the filling of the thread buffer with the phase has been completed successfully
             // thread_ps->rbegin() is alive.
 
-            ps = *(thread_ps->rbegin());
+            ps = thread_ps->back();
             thread_ps->pop_back();
 
             // If you've run out of buffers, call delete yourself to free this buffer.
-            if (thread_ps->size() == 0)
+            if (thread_ps->empty())
             {
                 thread_ps.reset();
             }
@@ -507,7 +538,7 @@ namespace Learner
                     return false;
 
                 // Get the next file name.
-                string filename = *filenames.rbegin();
+                string filename = filenames.back();
                 filenames.pop_back();
 
                 fs.open(filename, ios::in | ios::binary);
@@ -523,6 +554,7 @@ namespace Learner
                 // This size() is read only, so you don't need to lock it.
                 while (!stop_flag && packed_sfens_pool.size() >= SFEN_READ_SIZE / THREAD_BUFFER_SIZE)
                     sleep(100);
+
                 if (stop_flag)
                     return;
 
@@ -555,9 +587,7 @@ namespace Learner
 
                 if (!no_shuffle)
                 {
-                    auto size = sfens.size();
-                    for (size_t i = 0; i < size; ++i)
-                        swap(sfens[i], sfens[(size_t)(prng.rand((uint64_t)size - i) + i)]);
+                    Algo::shuffle(sfens, prng);
                 }
 
                 // Divide this by THREAD_BUFFER_SIZE. There should be size pieces.
@@ -591,6 +621,13 @@ namespace Learner
             }
         }
 
+        // Determine if it is a phase for calculating rmse.
+        // (The computational aspects of rmse should not be used for learning.)
+        bool is_for_rmse(Key key) const
+        {
+            return sfen_for_mse_hash.count(key) != 0;
+        }
+
         // sfen files
         vector<string> filenames;
 
@@ -613,17 +650,6 @@ namespace Learner
 
         bool stop_flag;
 
-        // Determine if it is a phase for calculating rmse.
-        // (The computational aspects of rmse should not be used for learning.)
-        bool is_for_rmse(Key key) const
-        {
-            return sfen_for_mse_hash.count(key) != 0;
-        }
-
-        // hash to limit the reading of the same situation
-        // Is there too many 64 million phases? Or Not really..
-        // It must be 2**N because it will be used as the mask to calculate hash_index.
-        static const uint64_t READ_SFEN_HASH_SIZE = 64 * 1024 * 1024;
         vector<Key> hash; // 64MB*8 = 512MB
 
         // test phase for mse calculation
@@ -663,7 +689,10 @@ namespace Learner
     // Class to generate sfen with multiple threads
     struct LearnerThink : public MultiThink
     {
-        LearnerThink(SfenReader& sr_) :sr(sr_), stop_flag(false), save_only_once(false)
+        LearnerThink(SfenReader& sr_) : 
+            sr(sr_), 
+            stop_flag(false), 
+            save_only_once(false)
         {
 #if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
             learn_sum_cross_entropy_eval = 0.0;
@@ -686,7 +715,12 @@ namespace Learner
         virtual void thread_worker(size_t thread_id);
 
         // Start a thread that loads the phase file in the background.
-        void start_file_read_worker() { sr.start_file_read_worker(); }
+        void start_file_read_worker() 
+        { 
+            sr.start_file_read_worker(); 
+        }
+
+        Value get_shallow_value(Position& task_pos);
 
         // save merit function parameters to a file
         bool save(bool is_final = false);
@@ -753,6 +787,33 @@ namespace Learner
         TaskDispatcher task_dispatcher;
     };
 
+    Value LearnerThink::get_shallow_value(Position& task_pos)
+    {
+        // Evaluation value for shallow search
+        // The value of evaluate() may be used, but when calculating loss, learn_cross_entropy and
+        // Use qsearch() because it is difficult to compare the values.
+        // EvalHash has been disabled in advance. (If not, the same value will be returned every time)
+        const auto [_, pv] = qsearch(task_pos);
+
+        std::vector<StateInfo, AlignedAllocator<StateInfo>> states(pv.size());
+        for (size_t i = 0; i < pv.size(); ++i)
+        {
+            task_pos.do_move(pv[i], states[i]);
+            Eval::NNUE::update_eval(task_pos);
+        }
+
+        const auto rootColor = task_pos.side_to_move();
+        const Value shallow_value =
+            (rootColor == task_pos.side_to_move())
+            ? Eval::evaluate(task_pos)
+            : -Eval::evaluate(task_pos);
+
+        for (auto it = pv.rbegin(); it != pv.rend(); ++it)
+            task_pos.undo_move(*it);
+
+        return shallow_value;
+    }
+
     void LearnerThink::calc_loss(size_t thread_id, uint64_t done)
     {
         // There is no point in hitting the replacement table, so at this timing the generation of the replacement table is updated.
@@ -800,8 +861,6 @@ namespace Learner
         pos.set(StartFEN, false, &si, th);
         std::cout << "hirate eval = " << Eval::evaluate(pos);
 
-        //Eval::print_eval_stat(pos);
-
         // It's better to parallelize here, but it's a bit troublesome because the search before slave has not finished.
         // I created a mechanism to call task, so I will use it.
 
@@ -818,6 +877,7 @@ namespace Learner
             // It is not possible to capture pos used in ↑, so specify the variables you want to capture one by one.
             auto task =
                 [
+                    this,
                     &ps,
                     &test_sum_cross_entropy_eval,
                     &test_sum_cross_entropy_win,
@@ -830,7 +890,6 @@ namespace Learner
                     &move_accord_count
                 ](size_t task_thread_id)
             {
-                // Does C++ properly capture a new ps instance for each loop?.
                 auto task_th = Threads[task_thread_id];
                 auto& task_pos = task_th->rootPos;
                 StateInfo task_si;
@@ -840,26 +899,7 @@ namespace Learner
                     cout << "Error! : illegal packed sfen " << task_pos.fen() << endl;
                 }
 
-                // Evaluation value for shallow search
-                // The value of evaluate() may be used, but when calculating loss, learn_cross_entropy and
-                // Use qsearch() because it is difficult to compare the values.
-                // EvalHash has been disabled in advance. (If not, the same value will be returned every time)
-                auto task_search_result = qsearch(task_pos);
-
-                auto shallow_value = task_search_result.first;
-                {
-                    const auto rootColor = task_pos.side_to_move();
-                    const auto pv = task_search_result.second;
-                    std::vector<StateInfo, AlignedAllocator<StateInfo>> states(pv.size());
-                    for (size_t i = 0; i < pv.size(); ++i)
-                    {
-                        task_pos.do_move(pv[i], states[i]);
-                        Eval::NNUE::update_eval(task_pos);
-                    }
-                    shallow_value = (rootColor == task_pos.side_to_move()) ? Eval::evaluate(task_pos) : -Eval::evaluate(task_pos);
-                    for (auto it = pv.rbegin(); it != pv.rend(); ++it)
-                        task_pos.undo_move(*it);
-                }
+                const Value shallow_value = get_shallow_value(task_pos);
 
                 // Evaluation value of deep search
                 auto deep_value = (Value)ps.score;
@@ -887,7 +927,17 @@ namespace Learner
 #if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
                 double test_cross_entropy_eval, test_cross_entropy_win, test_cross_entropy;
                 double test_entropy_eval, test_entropy_win, test_entropy;
-                calc_cross_entropy(deep_value, shallow_value, ps, test_cross_entropy_eval, test_cross_entropy_win, test_cross_entropy, test_entropy_eval, test_entropy_win, test_entropy);
+                calc_cross_entropy(
+                    deep_value, 
+                    shallow_value, 
+                    ps, 
+                    test_cross_entropy_eval, 
+                    test_cross_entropy_win, 
+                    test_cross_entropy, 
+                    test_entropy_eval, 
+                    test_entropy_win, 
+                    test_entropy);
+
                 // The total cross entropy need not be abs() by definition.
                 test_sum_cross_entropy_eval += test_cross_entropy_eval;
                 test_sum_cross_entropy_win += test_cross_entropy_win;
@@ -900,8 +950,8 @@ namespace Learner
 
                 // Determine if the teacher's move and the score of the shallow search match
                 {
-                    auto r = search(task_pos, 1);
-                    if ((uint16_t)r.second[0] == ps.move)
+                    const auto [value, pv] = search(task_pos, 1);
+                    if ((uint16_t)pv[0] == ps.move)
                         move_accord_count.fetch_add(1, std::memory_order_relaxed);
                 }
 
@@ -950,6 +1000,7 @@ namespace Learner
                 << " , test_entropy = " << test_sum_entropy / sr.sfen_for_mse.size()
                 << " , norm = " << sum_norm
                 << " , move accuracy = " << (move_accord_count * 100.0 / sr.sfen_for_mse.size()) << "%";
+
             if (done != static_cast<uint64_t>(-1))
             {
                 cout
@@ -962,7 +1013,8 @@ namespace Learner
             }
             cout << endl;
         }
-        else {
+        else 
+        {
             cout << "Error! : sr.sfen_for_mse.size() = " << sr.sfen_for_mse.size() << " ,  done = " << done << endl;
         }
 
@@ -978,7 +1030,6 @@ namespace Learner
 #endif
     }
 
-
     void LearnerThink::thread_worker(size_t thread_id)
     {
 #if defined(_OPENMP)
@@ -1092,7 +1143,9 @@ namespace Learner
             }
 
             PackedSfenValue ps;
-        RetryRead:;
+
+        RETRY_READ:;
+
             if (!sr.read_to_thread_buffer(thread_id, ps))
             {
                 // ran out of thread pool for my thread.
@@ -1106,16 +1159,14 @@ namespace Learner
             // The evaluation value exceeds the learning target value.
             // Ignore this aspect information.
             if (eval_limit < abs(ps.score))
-                goto RetryRead;
-
+                goto RETRY_READ;
 
             if (!use_draw_games_in_training && ps.game_result == 0)
-                goto RetryRead;
-
+                goto RETRY_READ;
 
             // Skip over the opening phase
             if (ps.gamePly < prng.rand(reduction_gameply))
-                goto RetryRead;
+                goto RETRY_READ;
 
 #if 0
             auto sfen = pos.sfen_unpack(ps.data);
@@ -1129,20 +1180,24 @@ namespace Learner
                 // I got a strange sfen. Should be debugged!
                 // Since it is an illegal sfen, it may not be displayed with pos.sfen(), but it is better than not.
                 cout << "Error! : illigal packed sfen = " << pos.fen() << endl;
-                goto RetryRead;
+                goto RETRY_READ;
             }
+
 #if !defined(EVAL_NNUE)
+            if (skip_duplicated_positions_in_training)
             {
-                auto key = pos.key();
+                const auto key = pos.key();
+
                 // Exclude the phase used for rmse calculation.
-                if (sr.is_for_rmse(key) && skip_duplicated_positions_in_training)
-                    goto RetryRead;
+                if (sr.is_for_rmse(key))
+                    goto RETRY_READ;
 
                 // Exclude the most recently used aspect.
-                auto hash_index = size_t(key & (sr.READ_SFEN_HASH_SIZE - 1));
-                auto key2 = sr.hash[hash_index];
-                if (key == key2 && skip_duplicated_positions_in_training)
-                    goto RetryRead;
+                const auto hash_index = size_t(key & (sr.READ_SFEN_HASH_SIZE - 1));
+                const auto key2 = sr.hash[hash_index];
+                if (key == key2)
+                    goto RETRY_READ;
+
                 sr.hash[hash_index] = key; // Replace with the current key.
             }
 #endif
@@ -1152,22 +1207,21 @@ namespace Learner
             // (shouldn't write out such teacher aspect itself, but may have written it out with an old generation routine)
         // Skip the position if there are no legal moves (=checkmated or stalemate).
             if (MoveList<LEGAL>(pos).size() == 0)
-                goto RetryRead;
+                goto RETRY_READ;
 
             // I can read it, so try displaying it.
             //      cout << pos << value << endl;
 
             // Evaluation value of shallow search (qsearch)
-            auto r = qsearch(pos);
-            auto pv = r.second;
+            const auto [shallow_value, pv] = qsearch(pos);
 
             // Evaluation value of deep search
-            auto deep_value = (Value)ps.score;
+            const auto deep_value = (Value)ps.score;
 
             // I feel that the mini batch has a better gradient.
             // Go to the leaf node as it is, add only to the gradient array, and later try AdaGrad at the time of rmse aggregation.
 
-            auto rootColor = pos.side_to_move();
+            const auto rootColor = pos.side_to_move();
 
             // If the initial PV is different, it is better not to use it for learning.
             // If it is the result of searching a completely different place, it may become noise.
@@ -1203,13 +1257,26 @@ namespace Learner
                 // I don't think this is a very desirable property, as the aspect that gives that gradient will be different.
                 // I have turned off the substitution table, but since the pv array has not been updated due to one stumbling block etc...
 
-                Value shallow_value = (rootColor == pos.side_to_move()) ? Eval::evaluate(pos) : -Eval::evaluate(pos);
+                const Value shallow_value = 
+                    (rootColor == pos.side_to_move()) 
+                    ? Eval::evaluate(pos) 
+                    : -Eval::evaluate(pos);
 
 #if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
                 // Calculate loss for training data
                 double learn_cross_entropy_eval, learn_cross_entropy_win, learn_cross_entropy;
                 double learn_entropy_eval, learn_entropy_win, learn_entropy;
-                calc_cross_entropy(deep_value, shallow_value, ps, learn_cross_entropy_eval, learn_cross_entropy_win, learn_cross_entropy, learn_entropy_eval, learn_entropy_win, learn_entropy);
+                calc_cross_entropy(
+                    deep_value, 
+                    shallow_value, 
+                    ps, 
+                    learn_cross_entropy_eval, 
+                    learn_cross_entropy_win, 
+                    learn_cross_entropy, 
+                    learn_entropy_eval, 
+                    learn_entropy_win, 
+                    learn_entropy);
+
                 learn_sum_cross_entropy_eval += learn_cross_entropy_eval;
                 learn_sum_cross_entropy_win += learn_cross_entropy_win;
                 learn_sum_cross_entropy += learn_cross_entropy;
@@ -1266,7 +1333,8 @@ namespace Learner
                 Eval::NNUE::update_eval(pos);
             }
 
-            if (illegal_move) {
+            if (illegal_move) 
+            {
                 sync_cout << "An illical move was detected... Excluded the position from the learning data..." << sync_endl;
                 continue;
             }
@@ -1284,7 +1352,6 @@ namespace Learner
             dj_dw = calc_grad(deep_value, shallow_value, ps);
             Eval::add_grad(pos, rootColor, dj_dw, without_kpp);
 #endif
-
         }
 
     }
@@ -1301,14 +1368,17 @@ namespace Learner
             // Do not dig a subfolder because I want to save it only once.
             Eval::save_eval("");
         }
-        else if (is_final) {
+        else if (is_final) 
+        {
             Eval::save_eval("final");
             return true;
         }
-        else {
+        else 
+        {
             static int dir_number = 0;
             const std::string dir_name = std::to_string(dir_number++);
             Eval::save_eval(dir_name);
+
 #if defined(EVAL_NNUE)
             if (newbob_decay != 1.0 && latest_loss_count > 0) {
                 static int trials = newbob_num_trials;
@@ -1316,22 +1386,28 @@ namespace Learner
                 latest_loss_sum = 0.0;
                 latest_loss_count = 0;
                 cout << "loss: " << latest_loss;
-                if (latest_loss < best_loss) {
+                if (latest_loss < best_loss) 
+                {
                     cout << " < best (" << best_loss << "), accepted" << endl;
                     best_loss = latest_loss;
                     best_nn_directory = Path::Combine((std::string)Options["EvalSaveDir"], dir_name);
                     trials = newbob_num_trials;
                 }
-                else {
+                else 
+                {
                     cout << " >= best (" << best_loss << "), rejected" << endl;
-                    if (best_nn_directory.empty()) {
+                    if (best_nn_directory.empty()) 
+                    {
                         cout << "WARNING: no improvement from initial model" << endl;
                     }
-                    else {
+                    else 
+                    {
                         cout << "restoring parameters from " << best_nn_directory << endl;
                         Eval::NNUE::RestoreParameters(best_nn_directory);
                     }
-                    if (--trials > 0 && !is_final) {
+
+                    if (--trials > 0 && !is_final) 
+                    {
                         cout << "reducing learning rate scale from " << newbob_scale
                             << " to " << (newbob_scale * newbob_decay)
                             << " (" << trials << " more trials)" << endl;
@@ -1339,7 +1415,9 @@ namespace Learner
                         Eval::NNUE::SetGlobalLearningRateScale(newbob_scale);
                     }
                 }
-                if (trials == 0) {
+                
+                if (trials == 0) 
+                {
                     cout << "converged" << endl;
                     return true;
                 }
@@ -1371,10 +1449,11 @@ namespace Learner
             // Output progress every 10M phase or when all writing is completed
             if (((write_sfen_count % buffer_size) == 0) ||
                 (write_sfen_count == total_sfen_count))
+            {
                 cout << write_sfen_count << " / " << total_sfen_count << endl;
+            }
         };
 
-
         cout << endl << "write : " << output_file_name << endl;
 
         fstream fs(output_file_name, ios::out | ios::binary);
@@ -1453,9 +1532,7 @@ namespace Learner
 
         auto write_buffer = [&](uint64_t size)
         {
-            // shuffle from buf[0] to buf[size-1]
-            for (uint64_t i = 0; i < size; ++i)
-                swap(buf[i], buf[(uint64_t)(prng.rand(size - i) + i)]);
+            Algo::shuffle(buf, prng);
 
             // write to a file
             fstream fs;
@@ -1533,13 +1610,8 @@ namespace Learner
             auto& fs = afs[i];
 
             fs.open(filename, ios::in | ios::binary);
-            fs.seekg(0, fstream::end);
-            uint64_t eofPos = (uint64_t)fs.tellg();
-            fs.clear(); // Otherwise, the next seek may fail.
-            fs.seekg(0, fstream::beg);
-            uint64_t begPos = (uint64_t)fs.tellg();
-            uint64_t file_size = eofPos - begPos;
-            uint64_t sfen_count = file_size / sizeof(PackedSfenValue);
+            const uint64_t file_size = get_file_size(fs);
+            const uint64_t sfen_count = file_size / sizeof(PackedSfenValue);
             a_count[i] = sfen_count;
 
             // Output the number of sfen stored in each file.
@@ -1578,8 +1650,8 @@ namespace Learner
         PRNG prng(std::chrono::system_clock::now().time_since_epoch().count());
         uint64_t size = (uint64_t)buf.size();
         std::cout << "shuffle buf.size() = " << size << std::endl;
-        for (uint64_t i = 0; i < size; ++i)
-            swap(buf[i], buf[(uint64_t)(prng.rand(size - i) + i)]);
+
+        Algo::shuffle(buf, prng);
 
         std::cout << "write : " << output_file_name << endl;
 
diff --git a/src/misc.cpp b/src/misc.cpp
index a23b1205..5ef5ecdc 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -627,18 +627,27 @@ void* aligned_malloc(size_t size, size_t align)
     return p;
 }
 
+std::uint64_t get_file_size(std::fstream& fs)
+{
+    auto pos = fs.tellg();
+
+    fs.seekg(0, fstream::end);
+    const uint64_t eofPos = (uint64_t)fs.tellg();
+    fs.clear(); // Otherwise, the next seek may fail.
+    fs.seekg(0, fstream::beg);
+    const uint64_t begPos = (uint64_t)fs.tellg();
+    fs.seekg(pos);
+
+    return eofPos - begPos;
+}
+
 int read_file_to_memory(std::string filename, std::function<void* (uint64_t)> callback_func)
 {
     fstream fs(filename, ios::in | ios::binary);
     if (fs.fail())
         return 1;
 
-    fs.seekg(0, fstream::end);
-    uint64_t eofPos = (uint64_t)fs.tellg();
-    fs.clear(); // Otherwise the next seek may fail.
-    fs.seekg(0, fstream::beg);
-    uint64_t begPos = (uint64_t)fs.tellg();
-    uint64_t file_size = eofPos - begPos;
+    const uint64_t file_size = get_file_size(fs);
     //std::cout << "filename = " << filename << " , file_size = " << file_size << endl;
 
     // I know the file size, so call callback_func to get a buffer for this,
diff --git a/src/misc.h b/src/misc.h
index c918a351..5add3b36 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -26,6 +26,8 @@
 #include <ostream>
 #include <string>
 #include <vector>
+#include <utility>
+#include <cmath>
 
 #include "types.h"
 
@@ -155,6 +157,7 @@ std::string now_string();
 // Also, if the buffer cannot be allocated in the callback function or if the file size is different from the expected file size,
 // Return nullptr. At this time, read_file_to_memory() interrupts reading and returns with an error.
 
+std::uint64_t get_file_size(std::fstream& fs);
 int read_file_to_memory(std::string filename, std::function<void* (uint64_t)> callback_func);
 int write_memory_to_file(std::string filename, void* ptr, uint64_t size);
 
@@ -199,20 +202,37 @@ inline std::ostream& operator<<(std::ostream& os, AsyncPRNG& prng)
 
 // Mathematical function used for progress calculation and learning
 namespace Math {
-	// Sigmoid function
-	// = 1.0 / (1.0 + std::exp(-x))
-	double sigmoid(double x);
+    inline double sigmoid(double x)
+    {
+        return 1.0 / (1.0 + std::exp(-x));
+    }
 
-	// Differentiation of sigmoid function
-	// = sigmoid(x) * (1.0-sigmoid(x))
-	double dsigmoid(double x);
+    inline double dsigmoid(double x)
+    {
+        // Sigmoid function
+        // f(x) = 1/(1+exp(-x))
+        // the first derivative is
+        // f'(x) = df/dx = f(x)・{ 1-f(x)}
+        // becomes
+
+        return sigmoid(x) * (1.0 - sigmoid(x));
+    }
 
 	// Clip v so that it fits between [lo,hi].
 	// * In Stockfish, this function is written in bitboard.h.
 	template<class T> constexpr const T& clamp(const T& v, const T& lo, const T& hi) {
 		return v < lo ? lo : v > hi ? hi : v;
 	}
+}
 
+namespace Algo {
+    template <typename Rng, typename T>
+    void shuffle(std::vector<T>& buf, Rng&& prng)
+    {
+        const auto size = buf.size();
+        for (uint64_t i = 0; i < size; ++i)
+            std::swap(buf[i], buf[prng.rand(size - i) + i]);
+    }
 }
 
 // --------------------

From 1482e5215afa1b457418d45805bb57a25f4529f4 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 7 Sep 2020 23:26:38 +0200
Subject: [PATCH 202/583] A second batch of code reorganization.

---
 src/Makefile              |   1 -
 src/learn/convert.cpp     |  10 +--
 src/learn/gensfen.cpp     |   8 +-
 src/learn/gensfen2019.cpp |   1 -
 src/learn/learn.h         |  56 ++++++-------
 src/learn/learner.cpp     | 170 +++++++++++++++-----------------------
 6 files changed, 96 insertions(+), 150 deletions(-)
 delete mode 100644 src/learn/gensfen2019.cpp

diff --git a/src/Makefile b/src/Makefile
index 9db13e44..ca851dba 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -56,7 +56,6 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
 	nnue/features/enpassant.cpp \
 	nnue/nnue_test_command.cpp \
 	extra/sfen_packer.cpp \
-	learn/gensfen2019.cpp \
 	learn/learner.cpp \
 	learn/gensfen.cpp \
 	learn/convert.cpp \
diff --git a/src/learn/convert.cpp b/src/learn/convert.cpp
index b84dc2f8..9bd9548d 100644
--- a/src/learn/convert.cpp
+++ b/src/learn/convert.cpp
@@ -25,20 +25,12 @@
 #include <chrono>
 #include <random>
 #include <regex>
+#include <filesystem>
 
 #if defined (_OPENMP)
 #include <omp.h>
 #endif
 
-#if defined(_MSC_VER)
-// The C++ filesystem cannot be used unless it is C++17 or later or MSVC.
-// I tried to use windows.h, but with g++ of msys2 I can not get the files in the folder well.
-// Use dirent.h because there is no help for it.
-#include <filesystem>
-#elif defined(__GNUC__)
-#include <dirent.h>
-#endif
-
 using namespace std;
 
 namespace Learner
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 4214233b..b049192e 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -28,18 +28,12 @@
 #include <memory>
 #include <limits>
 #include <optional>
+#include <filesystem>
 
 #if defined (_OPENMP)
 #include <omp.h>
 #endif
 
-#if defined(_MSC_VER)
-// std::filesystem doesn't work on GCC even though it claims to support C++17.
-#include <filesystem>
-#elif defined(__GNUC__)
-#include <dirent.h>
-#endif
-
 #if defined(EVAL_NNUE)
 #include "../nnue/evaluate_nnue_learner.h"
 #include <climits>
diff --git a/src/learn/gensfen2019.cpp b/src/learn/gensfen2019.cpp
deleted file mode 100644
index 01293b9c..00000000
--- a/src/learn/gensfen2019.cpp
+++ /dev/null
@@ -1 +0,0 @@
-// just a place holder
diff --git a/src/learn/learn.h b/src/learn/learn.h
index e29ed74a..1bc39cf9 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -27,30 +27,6 @@
 // SGD looking only at the sign of the gradient. It requires less memory, but the accuracy is...
 // #define SGD_UPDATE
 
-// ----------------------
-// Settings for learning
-// ----------------------
-
-// mini-batch size.
-// Calculate the gradient by combining this number of phases.
-// If you make it smaller, the number of update_weights() will increase and the convergence will be faster. The gradient is incorrect.
-// If you increase it, the number of update_weights() decreases, so the convergence will be slow. The slope will come out accurately.
-// I don't think you need to change this value in most cases.
-
-#define LEARN_MINI_BATCH_SIZE (1000 * 1000 * 1)
-
-// The number of phases to read from the file at one time. After reading this much, shuffle.
-// It is better to have a certain size, but this number x 40 bytes x 3 times as much memory is consumed. 400MB*3 is consumed in the 10M phase.
-// Must be a multiple of THREAD_BUFFER_SIZE(=10000).
-
-#define LEARN_SFEN_READ_SIZE (1000 * 1000 * 10)
-
-// Saving interval of evaluation function at learning. Save each time you learn this number of phases.
-// Needless to say, the longer the saving interval, the shorter the learning time.
-// Folder name is incremented for each save like 0/, 1/, 2/...
-// By default, once every 1 billion phases.
-#define LEARN_EVAL_SAVE_INTERVAL (1000000000ULL)
-
 
 // ----------------------
 // Select the objective function
@@ -79,10 +55,6 @@
 // debug settings for learning
 // ----------------------
 
-// Reduce the output of rmse during learning to 1 for this number of times.
-// rmse calculation is done in one thread, so it takes some time, so reducing the output is effective.
-#define LEARN_RMSE_OUTPUT_INTERVAL 1
-
 
 // ----------------------
 // learning from zero vector
@@ -205,6 +177,34 @@ typedef float LearnFloatType;
 
 namespace Learner
 {
+	// ----------------------
+	// Settings for learning
+	// ----------------------
+
+	// mini-batch size.
+	// Calculate the gradient by combining this number of phases.
+	// If you make it smaller, the number of update_weights() will increase and the convergence will be faster. The gradient is incorrect.
+	// If you increase it, the number of update_weights() decreases, so the convergence will be slow. The slope will come out accurately.
+	// I don't think you need to change this value in most cases.
+
+	constexpr std::size_t LEARN_MINI_BATCH_SIZE = 1000 * 1000 * 1;
+
+	// The number of phases to read from the file at one time. After reading this much, shuffle.
+	// It is better to have a certain size, but this number x 40 bytes x 3 times as much memory is consumed. 400MB*3 is consumed in the 10M phase.
+	// Must be a multiple of THREAD_BUFFER_SIZE(=10000).
+
+	constexpr std::size_t LEARN_SFEN_READ_SIZE = 1000 * 1000 * 10;
+
+	// Saving interval of evaluation function at learning. Save each time you learn this number of phases.
+	// Needless to say, the longer the saving interval, the shorter the learning time.
+	// Folder name is incremented for each save like 0/, 1/, 2/...
+	// By default, once every 1 billion phases.
+	constexpr std::size_t LEARN_EVAL_SAVE_INTERVAL = 1000000000ULL;
+
+	// Reduce the output of rmse during learning to 1 for this number of times.
+	// rmse calculation is done in one thread, so it takes some time, so reducing the output is effective.
+	constexpr std::size_t LEARN_RMSE_OUTPUT_INTERVAL = 1;
+
 	//Structure in which PackedSfen and evaluation value are integrated
 	// If you write different contents for each option, it will be a problem when reusing the teacher game
 	// For the time being, write all the following members regardless of the options.
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 98c8e32e..ddfaff5a 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -45,15 +45,6 @@
 #include <omp.h>
 #endif
 
-#if defined(_MSC_VER)
-// The C++ filesystem cannot be used unless it is C++17 or later or MSVC.
-// I tried to use windows.h, but with g++ of msys2 I can not get the files in the folder well.
-// Use dirent.h because there is no help for it.
-#include <filesystem>
-#elif defined(__GNUC__)
-#include <dirent.h>
-#endif
-
 #if defined(EVAL_NNUE)
 #include "../nnue/evaluate_nnue_learner.h"
 #include <climits>
@@ -62,8 +53,11 @@
 
 using namespace std;
 
-//// This is defined in the search section.
-//extern Book::BookMoveSelector book;
+
+#if defined(USE_BOOK)
+// This is defined in the search section.
+extern Book::BookMoveSelector book;
+#endif
 
 template <typename T>
 T operator +=(std::atomic<T>& x, const T rhs)
@@ -128,9 +122,9 @@ namespace Learner
         constexpr double wdl_total = 1000.0;
         constexpr double draw_score = 0.5;
 
-        double wdl_w = UCI::win_rate_model_double(value, ply);
-        double wdl_l = UCI::win_rate_model_double(-value, ply);
-        double wdl_d = wdl_total - wdl_w - wdl_l;
+        const double wdl_w = UCI::win_rate_model_double(value, ply);
+        const double wdl_l = UCI::win_rate_model_double(-value, ply);
+        const double wdl_d = wdl_total - wdl_w - wdl_l;
 
         return (wdl_w + wdl_d * draw_score) / wdl_total;
     }
@@ -150,16 +144,17 @@ namespace Learner
 
     double calc_cross_entropy_of_winning_percentage(double deep_win_rate, double shallow_eval, int ply)
     {
-        double p = deep_win_rate;
-        double q = winning_percentage(shallow_eval, ply);
+        const double p = deep_win_rate;
+        const double q = winning_percentage(shallow_eval, ply);
         return -p * std::log(q) - (1.0 - p) * std::log(1.0 - q);
     }
 
     double calc_d_cross_entropy_of_winning_percentage(double deep_win_rate, double shallow_eval, int ply)
     {
         constexpr double epsilon = 0.000001;
-        double y1 = calc_cross_entropy_of_winning_percentage(deep_win_rate, shallow_eval, ply);
-        double y2 = calc_cross_entropy_of_winning_percentage(deep_win_rate, shallow_eval + epsilon, ply);
+
+        const double y1 = calc_cross_entropy_of_winning_percentage(deep_win_rate, shallow_eval, ply);
+        const double y2 = calc_cross_entropy_of_winning_percentage(deep_win_rate, shallow_eval + epsilon, ply);
 
         // Divide by the winning_probability_coefficient to match scale with the sigmoidal win rate
         return ((y2 - y1) / epsilon) / winning_probability_coefficient;
@@ -190,8 +185,8 @@ namespace Learner
         // Also, the coefficient of 1/m is unnecessary if you use the update formula that has the automatic gradient adjustment function like Adam and AdaGrad.
         // Therefore, it is not necessary to save it in memory.
 
-        double p = winning_percentage(deep);
-        double q = winning_percentage(shallow);
+        const double p = winning_percentage(deep, psv.gamePly);
+        const double q = winning_percentage(shallow, psv.gamePly);
         return (q - p) * Math::dsigmoid(double(shallow) / 600.0);
     }
 #endif
@@ -216,8 +211,8 @@ namespace Learner
         // = ...
         // = q-p.
 
-        double p = winning_percentage(deep);
-        double q = winning_percentage(shallow);
+        const double p = winning_percentage(deep, psv.gamePly);
+        const double q = winning_percentage(shallow, psv.gamePly);
 
         return q - p;
     }
@@ -270,8 +265,10 @@ namespace Learner
         double p = scaled_teacher_signal;
         if (convert_teacher_signal_to_winning_probability) 
         {
-            p = winning_percentage(scaled_teacher_signal);
+            p = winning_percentage(scaled_teacher_signal, ply);
         }
+
+        return p;
     }
 
     double calculate_lambda(double teacher_signal)
@@ -534,7 +531,7 @@ namespace Learner
                     fs.close();
 
                 // no more
-                if (filenames.size() == 0)
+                if (filenames.empty())
                     return false;
 
                 // Get the next file name.
@@ -543,6 +540,7 @@ namespace Learner
 
                 fs.open(filename, ios::in | ios::binary);
                 cout << "open filename = " << filename << endl;
+
                 assert(fs);
 
                 return true;
@@ -569,16 +567,12 @@ namespace Learner
                     {
                         sfens.push_back(p);
                     }
-                    else
+                    else if(!open_next_file())
                     {
-                        // read failure
-                        if (!open_next_file())
-                        {
-                            // There was no next file. Abon.
-                            cout << "..end of files." << endl;
-                            end_of_files = true;
-                            return;
-                        }
+                        // There was no next file. Abon.
+                        cout << "..end of files." << endl;
+                        end_of_files = true;
+                        return;
                     }
                 }
 
@@ -702,6 +696,7 @@ namespace Learner
             learn_sum_entropy_win = 0.0;
             learn_sum_entropy = 0.0;
 #endif
+
 #if defined(EVAL_NNUE)
             newbob_scale = 1.0;
             newbob_decay = 1.0;
@@ -1213,7 +1208,7 @@ namespace Learner
             //      cout << pos << value << endl;
 
             // Evaluation value of shallow search (qsearch)
-            const auto [shallow_value, pv] = qsearch(pos);
+            const auto [_, pv] = qsearch(pos);
 
             // Evaluation value of deep search
             const auto deep_value = (Value)ps.score;
@@ -1408,9 +1403,11 @@ namespace Learner
 
                     if (--trials > 0 && !is_final) 
                     {
-                        cout << "reducing learning rate scale from " << newbob_scale
+                        cout
+                            << "reducing learning rate scale from " << newbob_scale
                             << " to " << (newbob_scale * newbob_decay)
                             << " (" << trials << " more trials)" << endl;
+
                         newbob_scale *= newbob_decay;
                         Eval::NNUE::SetGlobalLearningRateScale(newbob_scale);
                     }
@@ -1432,10 +1429,10 @@ namespace Learner
     // prng: random number
     // afs: fstream of each teacher phase file
     // a_count: The number of teacher positions inherent in each file.
-    void shuffle_write(const string& output_file_name, PRNG& prng, vector<fstream>& afs, vector<uint64_t>& a_count)
+    void shuffle_write(const string& output_file_name, PRNG& prng, vector<fstream>& sfen_file_streams, vector<uint64_t>& sfen_count_in_file)
     {
         uint64_t total_sfen_count = 0;
-        for (auto c : a_count)
+        for (auto c : sfen_count_in_file)
             total_sfen_count += c;
 
         // number of exported phases
@@ -1459,39 +1456,39 @@ namespace Learner
         fstream fs(output_file_name, ios::out | ios::binary);
 
         // total teacher positions
-        uint64_t sum = 0;
-        for (auto c : a_count)
-            sum += c;
+        uint64_t sfen_count_left = total_sfen_count;
 
-        while (sum != 0)
+        while (sfen_count_left != 0)
         {
-            auto r = prng.rand(sum);
+            auto r = prng.rand(sfen_count_left);
 
             // Aspects stored in fs[0] file ... Aspects stored in fs[1] file ...
             //Think of it as a series like, and determine in which file r is pointing.
             // The contents of the file are shuffled, so you can take the next element from that file.
             // Each file has a_count[x] phases, so this process can be written as follows.
 
-            uint64_t n = 0;
-            while (a_count[n] <= r)
-                r -= a_count[n++];
+            uint64_t i = 0;
+            while (sfen_count_in_file[i] <= r)
+                r -= sfen_count_in_file[i++];
 
             // This confirms n. Before you forget it, reduce the remaining number.
 
-            --a_count[n];
-            --sum;
+            --sfen_count_in_file[i];
+            --sfen_count_left;
 
             PackedSfenValue psv;
             // It's better to read and write all at once until the performance is not so good...
-            if (afs[n].read((char*)&psv, sizeof(PackedSfenValue)))
+            if (sfen_file_streams[i].read((char*)&psv, sizeof(PackedSfenValue)))
             {
                 fs.write((char*)&psv, sizeof(PackedSfenValue));
                 ++write_sfen_count;
                 print_status();
             }
         }
+
         print_status();
         fs.close();
+
         cout << "done!" << endl;
     }
 
@@ -1509,8 +1506,8 @@ namespace Learner
         // There should have been a limit of 512 per process on Windows, so you can open here as 500,
         // The current setting is 500 files x 20M = 10G = 10 billion phases.
 
-        PSVector buf;
-        buf.resize(buffer_size);
+        PSVector buf(buffer_size);
+
         // ↑ buffer, a marker that indicates how much you have used
         uint64_t buf_write_marker = 0;
 
@@ -1537,7 +1534,7 @@ namespace Learner
             // write to a file
             fstream fs;
             fs.open(make_filename(write_file_count++), ios::out | ios::binary);
-            fs.write((char*)&buf[0], size * sizeof(PackedSfenValue));
+            fs.write(reinterpret_cast<char*>(buf.data()), size * sizeof(PackedSfenValue));
             fs.close();
             a_count.push_back(size);
 
@@ -1552,14 +1549,13 @@ namespace Learner
         {
             fstream fs(filename, ios::in | ios::binary);
             cout << endl << "open file = " << filename;
-            while (fs.read((char*)&buf[buf_write_marker], sizeof(PackedSfenValue)))
+            while (fs.read(reinterpret_cast<char*>(&buf[buf_write_marker]), sizeof(PackedSfenValue)))
                 if (++buf_write_marker == buffer_size)
                     write_buffer(buffer_size);
 
             // Read in units of sizeof(PackedSfenValue),
             // Ignore the last remaining fraction. (Fails in fs.read, so exit while)
             // (The remaining fraction seems to be half-finished data that was created because it was stopped halfway during teacher generation.)
-
         }
 
         if (buf_write_marker != 0)
@@ -1599,20 +1595,20 @@ namespace Learner
         size_t file_count = filenames.size();
 
         // Number of teacher positions stored in each file in filenames
-        vector<uint64_t> a_count(file_count);
+        vector<uint64_t> sfen_count_in_file(file_count);
 
         // Count the number of teacher aspects in each file.
-        vector<fstream> afs(file_count);
+        vector<fstream> sfen_file_streams(file_count);
 
         for (size_t i = 0; i < file_count; ++i)
         {
             auto filename = filenames[i];
-            auto& fs = afs[i];
+            auto& fs = sfen_file_streams[i];
 
             fs.open(filename, ios::in | ios::binary);
             const uint64_t file_size = get_file_size(fs);
             const uint64_t sfen_count = file_size / sizeof(PackedSfenValue);
-            a_count[i] = sfen_count;
+            sfen_count_in_file[i] = sfen_count;
 
             // Output the number of sfen stored in each file.
             cout << filename << " = " << sfen_count << " sfens." << endl;
@@ -1624,7 +1620,7 @@ namespace Learner
         // Now you have shuffled.
 
         // Throw to the subcontract function and end.
-        shuffle_write(output_file_name, prng, afs, a_count);
+        shuffle_write(output_file_name, prng, sfen_file_streams, sfen_count_in_file);
     }
 
     // Subcontracting the teacher shuffle "learn shufflem" command.
@@ -1656,7 +1652,10 @@ namespace Learner
         std::cout << "write : " << output_file_name << endl;
 
         // If the file to be written exceeds 2GB, it cannot be written in one shot with fstream::write, so use wrapper.
-        write_memory_to_file(output_file_name, (void*)&buf[0], (uint64_t)sizeof(PackedSfenValue) * (uint64_t)buf.size());
+        write_memory_to_file(
+            output_file_name, 
+            (void*)&buf[0], 
+            sizeof(PackedSfenValue) * buf.size());
 
         std::cout << "..shuffle_on_memory done." << std::endl;
     }
@@ -1664,7 +1663,7 @@ namespace Learner
     // Learning from the generated game record
     void learn(Position&, istringstream& is)
     {
-        auto thread_num = (int)Options["Threads"];
+        const auto thread_num = (int)Options["Threads"];
         SfenReader sr(thread_num);
 
         LearnerThink learn_think(sr);
@@ -1889,13 +1888,6 @@ namespace Learner
         {
             string kif_base_dir = Path::Combine(base_dir, target_dir);
 
-            // Remove this folder. Keep it relative to base_dir.
-#if defined(_MSC_VER)
-        // If you use std::tr2, warning C4996 will appear, so suppress it.
-        // * std::tr2 issued a deprecation warning by default under std:c++14, and was deleted by default in /std:c++17.
-#pragma warning(push)
-#pragma warning(disable:4996)
-
             namespace sys = std::filesystem;
             sys::path p(kif_base_dir); // Origin of enumeration
             std::for_each(sys::directory_iterator(p), sys::directory_iterator(),
@@ -1903,36 +1895,6 @@ namespace Learner
                     if (sys::is_regular_file(p))
                         filenames.push_back(Path::Combine(target_dir, p.filename().generic_string()));
                 });
-#pragma warning(pop)
-
-#elif defined(__GNUC__)
-
-            auto ends_with = [](std::string const& value, std::string const& ending)
-            {
-                if (ending.size() > value.size()) return false;
-                return std::equal(ending.rbegin(), ending.rend(), value.rbegin());
-            };
-
-            // It can't be helped, so read it using dirent.h.
-            DIR* dp; // pointer to directory
-            dirent* entry; // entry point returned by readdir()
-
-            dp = opendir(kif_base_dir.c_str());
-            if (dp != NULL)
-            {
-                do {
-                    entry = readdir(dp);
-                    // Only list files ending with ".bin"
-                    // →I hate this restriction when generating files with serial numbers...
-                    if (entry != NULL && ends_with(entry->d_name, ".bin"))
-                    {
-                        //cout << entry->d_name << endl;
-                        filenames.push_back(Path::Combine(target_dir, entry->d_name));
-                    }
-                } while (entry != NULL);
-                closedir(dp);
-            }
-#endif
         }
 
         cout << "learn from ";
@@ -1990,6 +1952,7 @@ namespace Learner
                 dest_score_max_value,
                 check_invalid_fen,
                 check_illegal_move);
+
             return;
 
         }
@@ -1997,7 +1960,12 @@ namespace Learner
         {
             Eval::init_NNUE();
             cout << "convert_bin_from_pgn-extract.." << endl;
-            convert_bin_from_pgn_extract(filenames, output_file_name, pgn_eval_side_to_move, convert_no_eval_fens_as_score_zero);
+            convert_bin_from_pgn_extract(
+                filenames, 
+                output_file_name, 
+                pgn_eval_side_to_move, 
+                convert_no_eval_fens_as_score_zero);
+
             return;
         }
 
@@ -2154,12 +2122,6 @@ namespace Learner
 #endif
     }
 
-
 } // namespace Learner
 
-#if defined(GENSFEN2019)
-#include "gensfen2019.cpp"
-#endif
-
-
 #endif // EVAL_LEARN

From a0b2d6a01e39627e9ea87b234a18067e4e404faf Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 7 Sep 2020 23:33:32 +0200
Subject: [PATCH 203/583] Note a potential defect in sfen packer.

---
 src/extra/sfen_packer.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/extra/sfen_packer.cpp b/src/extra/sfen_packer.cpp
index ac789ce8..fd013fa2 100644
--- a/src/extra/sfen_packer.cpp
+++ b/src/extra/sfen_packer.cpp
@@ -218,7 +218,7 @@ struct SfenPacker
     PieceType pr = type_of(pc);
     auto c = huffman_table[pr];
     stream.write_n_bit(c.code, c.bits);
- 
+
     if (pc == NO_PIECE)
       return;
 
@@ -249,7 +249,7 @@ struct SfenPacker
 
     // first and second flag
     Color c = (Color)stream.read_one_bit();
-    
+
     return make_piece(c, pr);
   }
 };
@@ -266,7 +266,10 @@ int Position::set_from_packed_sfen(const PackedSfen& sfen , StateInfo * si, Thre
 {
 	SfenPacker packer;
 	auto& stream = packer.stream;
-	stream.set_data((uint8_t*)&sfen);
+
+  // TODO: separate streams for writing and reading. Here we actually have to
+  // const_cast which is not safe in the long run.
+	stream.set_data(const_cast<uint8_t*>(&sfen));
 
 	std::memset(this, 0, sizeof(Position));
 	std::memset(si, 0, sizeof(StateInfo));

From 0202218f58467dac447b73b7724158ebec4a221f Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 7 Sep 2020 23:34:13 +0200
Subject: [PATCH 204/583] fix cast

---
 src/extra/sfen_packer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/extra/sfen_packer.cpp b/src/extra/sfen_packer.cpp
index fd013fa2..1d82111d 100644
--- a/src/extra/sfen_packer.cpp
+++ b/src/extra/sfen_packer.cpp
@@ -269,7 +269,7 @@ int Position::set_from_packed_sfen(const PackedSfen& sfen , StateInfo * si, Thre
 
   // TODO: separate streams for writing and reading. Here we actually have to
   // const_cast which is not safe in the long run.
-	stream.set_data(const_cast<uint8_t*>(&sfen));
+	stream.set_data(const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(&sfen)));
 
 	std::memset(this, 0, sizeof(Position));
 	std::memset(si, 0, sizeof(StateInfo));

From 41b7674aee3920cb72554f8d22eb4e2cb6c57e09 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 7 Sep 2020 23:55:07 +0200
Subject: [PATCH 205/583] Improve comments, break long lines.

---
 src/learn/learner.cpp | 321 ++++++++++++++++++++++++++++--------------
 src/misc.h            |   1 +
 2 files changed, 213 insertions(+), 109 deletions(-)

diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index ddfaff5a..f9d188b8 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -1,18 +1,24 @@
-﻿// learning routines
+﻿// Learning routines:
 //
-// 1) Automatic generation of game records
+// 1) Automatic generation of game records in .bin format
 // → "gensfen" command
-// 2) Learning evaluation function parameters from the generated game record
+//
+// 2) Learning evaluation function parameters from the generated .bin files
 // → "learn" command
+//
 // → Shuffle in the teacher phase is also an extension of this command.
 // Example) "learn shuffle"
+//
 // 3) Automatic generation of fixed traces
 // → "makebook think" command
 // → implemented in extra/book/book.cpp
+//
 // 4) Post-station automatic review mode
 // → I will not be involved in the engine because it is a problem that the GUI should assist.
 // etc..
 
+#define EVAL_LEARN
+
 #if defined(EVAL_LEARN)
 
 #include "../eval/evaluate_common.h"
@@ -53,7 +59,6 @@
 
 using namespace std;
 
-
 #if defined(USE_BOOK)
 // This is defined in the search section.
 extern Book::BookMoveSelector book;
@@ -63,6 +68,7 @@ template <typename T>
 T operator +=(std::atomic<T>& x, const T rhs)
 {
     T old = x.load(std::memory_order_consume);
+
     // It is allowed that the value is rewritten from other thread at this timing.
     // The idea that the value is not destroyed is good.
     T desired = old + rhs;
@@ -81,7 +87,7 @@ namespace Learner
 
     static double winning_probability_coefficient = 1.0 / PawnValueEg / 4.0 * std::log(10.0);
 
-    // Score scale factors.  ex) If we set src_score_min_value = 0.0,
+    // Score scale factors. ex) If we set src_score_min_value = 0.0,
     // src_score_max_value = 1.0, dest_score_min_value = 0.0,
     // dest_score_max_value = 10000.0, [0.0, 1.0] will be scaled to [0, 10000].
     static double src_score_min_value = 0.0;
@@ -89,8 +95,9 @@ namespace Learner
     static double dest_score_min_value = 0.0;
     static double dest_score_max_value = 1.0;
 
-    // Assume teacher signals are the scores of deep searches, and convert them into winning
-    // probabilities in the trainer. Sometimes we want to use the winning probabilities in the training
+    // Assume teacher signals are the scores of deep searches, 
+    // and convert them into winning probabilities in the trainer. 
+    // Sometimes we want to use the winning probabilities in the training
     // data directly. In those cases, we set false to this variable.
     static bool convert_teacher_signal_to_winning_probability = true;
 
@@ -100,13 +107,9 @@ namespace Learner
     // This CANNOT be static since it's used elsewhere.
     bool use_raw_nnue_eval = false;
 
-    // Using WDL with win rate model instead of sigmoid
+    // Using stockfish's WDL with win rate model instead of sigmoid
     static bool use_wdl = false;
 
-    // -----------------------------------
-    // command to learn from the generated game (learn)
-    // -----------------------------------
-
     // A function that converts the evaluation value to the winning rate [0,1]
     double winning_percentage(double value)
     {
@@ -142,21 +145,31 @@ namespace Learner
         }
     }
 
-    double calc_cross_entropy_of_winning_percentage(double deep_win_rate, double shallow_eval, int ply)
+    double calc_cross_entropy_of_winning_percentage(
+        double deep_win_rate, 
+        double shallow_eval, 
+        int ply)
     {
         const double p = deep_win_rate;
         const double q = winning_percentage(shallow_eval, ply);
         return -p * std::log(q) - (1.0 - p) * std::log(1.0 - q);
     }
 
-    double calc_d_cross_entropy_of_winning_percentage(double deep_win_rate, double shallow_eval, int ply)
+    double calc_d_cross_entropy_of_winning_percentage(
+        double deep_win_rate, 
+        double shallow_eval, 
+        int ply)
     {
         constexpr double epsilon = 0.000001;
 
-        const double y1 = calc_cross_entropy_of_winning_percentage(deep_win_rate, shallow_eval, ply);
-        const double y2 = calc_cross_entropy_of_winning_percentage(deep_win_rate, shallow_eval + epsilon, ply);
+        const double y1 = calc_cross_entropy_of_winning_percentage(
+            deep_win_rate, shallow_eval, ply);
 
-        // Divide by the winning_probability_coefficient to match scale with the sigmoidal win rate
+        const double y2 = calc_cross_entropy_of_winning_percentage(
+            deep_win_rate, shallow_eval + epsilon, ply);
+
+        // Divide by the winning_probability_coefficient to 
+        // match scale with the sigmoidal win rate
         return ((y2 - y1) / epsilon) / winning_probability_coefficient;
     }
 
@@ -167,9 +180,12 @@ namespace Learner
     {
         // The square of the win rate difference minimizes it in the objective function.
         // Objective function J = 1/2m Σ (win_rate(shallow)-win_rate(deep) )^2
-        // However, σ is a sigmoid function that converts the evaluation value into the difference in the winning percentage.
-        // m is the number of samples. shallow is the evaluation value for a shallow search (qsearch()). deep is the evaluation value for deep search.
-        // If W is the feature vector (parameter of the evaluation function) and Xi and Yi are teachers
+        // However, σ is a sigmoid function that converts the 
+        // evaluation value into the difference in the winning percentage.
+        // m is the number of samples. shallow is the evaluation value 
+        // for a shallow search (qsearch()). deep is the evaluation value for deep search.
+        // If W is the feature vector (parameter of the evaluation function) 
+        // and Xi and Yi are teachers
         // shallow = W*Xi // * is the Hadamard product, transposing W and meaning X
         // f(Xi) = win_rate(W*Xi)
         // If σ(i th deep) = Yi,
@@ -179,10 +195,12 @@ namespace Learner
         // ∂J/∂Wj = ∂J/∂f ・∂f/∂W ・∂W/∂Wj
         // = 1/m Σ (f(Xi)-y) ・f'(Xi) ・ 1
 
-        // 1/m will be multiplied later, but the contents of Σ can be retained in the array as the value of the gradient.
+        // 1/m will be multiplied later, but the contents of Σ can 
+        // be retained in the array as the value of the gradient.
         // f'(Xi) = win_rate'(shallow) = sigmoid'(shallow/600) = dsigmoid(shallow / 600) / 600
         // This /600 at the end is adjusted by the learning rate, so do not write it..
-        // Also, the coefficient of 1/m is unnecessary if you use the update formula that has the automatic gradient adjustment function like Adam and AdaGrad.
+        // Also, the coefficient of 1/m is unnecessary if you use the update 
+        // formula that has the automatic gradient adjustment function like Adam and AdaGrad.
         // Therefore, it is not necessary to save it in memory.
 
         const double p = winning_percentage(deep, psv.gamePly);
@@ -202,7 +220,9 @@ namespace Learner
         // Refer to etc.
 
         // Objective function design)
-        // We want to make the distribution of p closer to the distribution of q → Think of it as the problem of minimizing the cross entropy between the probability distributions of p and q.
+        // We want to make the distribution of p closer to the distribution of q 
+        // → Think of it as the problem of minimizing the cross entropy 
+        // between the probability distributions of p and q.
         // J = H(p,q) =-Σ p(x) log(q(x)) = -p log q-(1-p) log(1-q)
         // x
 
@@ -222,7 +242,8 @@ namespace Learner
     double calc_grad(Value deep, Value shallow, const PackedSfenValue& psv)
     {
         // Version that does not pass the winning percentage function
-        // This, unless EVAL_LIMIT is set low, trying to match the evaluation value with the shape of the end stage
+        // This, unless EVAL_LIMIT is set low, trying to 
+        // match the evaluation value with the shape of the end stage
         // eval may exceed the range of eval.
         return shallow - deep;
     }
@@ -261,7 +282,6 @@ namespace Learner
     {
         const double scaled_teacher_signal = get_scaled_signal(teacher_signal);
 
-        // Teacher winning probability.
         double p = scaled_teacher_signal;
         if (convert_teacher_signal_to_winning_probability) 
         {
@@ -273,7 +293,8 @@ namespace Learner
 
     double calculate_lambda(double teacher_signal)
     {
-        // If the evaluation value in deep search exceeds ELMO_LAMBDA_LIMIT, apply ELMO_LAMBDA2 instead of ELMO_LAMBDA.
+        // If the evaluation value in deep search exceeds ELMO_LAMBDA_LIMIT
+        // then apply ELMO_LAMBDA2 instead of ELMO_LAMBDA.
         const double lambda =
             (std::abs(teacher_signal) >= ELMO_LAMBDA_LIMIT)
             ? ELMO_LAMBDA2
@@ -284,7 +305,8 @@ namespace Learner
 
     double calculate_t(int game_result)
     {
-        // Use 1 as the correction term if the expected win rate is 1, 0 if you lose, and 0.5 if you draw.
+        // Use 1 as the correction term if the expected win rate is 1, 
+        // 0 if you lose, and 0.5 if you draw.
         // game_result = 1,0,-1 so add 1 and divide by 2.
         const double t = double(game_result + 1) * 0.5;
 
@@ -318,7 +340,9 @@ namespace Learner
     }
 
     // Calculate cross entropy during learning
-    // The individual cross entropy of the win/loss term and win rate term of the elmo expression is returned to the arguments cross_entropy_eval and cross_entropy_win.
+    // The individual cross entropy of the win/loss term and win 
+    // rate term of the elmo expression is returned 
+    // to the arguments cross_entropy_eval and cross_entropy_win.
     void calc_cross_entropy(
         Value teacher_signal, 
         Value shallow, 
@@ -356,11 +380,7 @@ namespace Learner
     }
 
 #endif
-
-
-    // Other variations may be prepared as the objective function..
-
-
+    // Other objective functions may be considered in the future...
     double calc_grad(Value shallow, const PackedSfenValue& psv) 
     {
         return calc_grad((Value)psv.score, shallow, psv);
@@ -369,15 +389,17 @@ namespace Learner
     // Sfen reader
     struct SfenReader
     {
-        // number of phases used for calculation such as mse
+        // Number of phases used for calculation such as mse
         // mini-batch size = 1M is standard, so 0.2% of that should be negligible in terms of time.
-        //Since search() is performed with depth = 1 in calculation of move match rate, simple comparison is not possible...
+        // Since search() is performed with depth = 1 in calculation of 
+        // move match rate, simple comparison is not possible...
         static constexpr uint64_t sfen_for_mse_size = 2000;
 
         // Number of phases buffered by each thread 0.1M phases. 4M phase at 40HT
         static constexpr size_t THREAD_BUFFER_SIZE = 10 * 1000;
 
-        // Buffer for reading files (If this is made larger, the shuffle becomes larger and the phases may vary.
+        // Buffer for reading files (If this is made larger, 
+        // the shuffle becomes larger and the phases may vary.
         // If it is too large, the memory consumption will increase.
         // SFEN_READ_SIZE is a multiple of THREAD_BUFFER_SIZE.
         static constexpr const size_t SFEN_READ_SIZE = LEARN_SFEN_READ_SIZE;
@@ -387,7 +409,8 @@ namespace Learner
         // It must be 2**N because it will be used as the mask to calculate hash_index.
         static constexpr uint64_t READ_SFEN_HASH_SIZE = 64 * 1024 * 1024;
 
-        // Do not use std::random_device().  Because it always the same integers on MinGW.
+        // Do not use std::random_device().
+        // Because it always the same integers on MinGW.
         SfenReader(int thread_num) : 
             prng(std::chrono::system_clock::now().time_since_epoch().count())
         {
@@ -460,16 +483,20 @@ namespace Learner
         // [ASYNC] Thread returns one aspect. Otherwise returns false.
         bool read_to_thread_buffer(size_t thread_id, PackedSfenValue& ps)
         {
-            // If there are any positions left in the thread buffer, retrieve one and return it.
+            // If there are any positions left in the thread buffer
+            // then retrieve one and return it.
             auto& thread_ps = packed_sfens[thread_id];
 
-            // Fill the read buffer if there is no remaining buffer, but if it doesn't even exist, finish.
-            if ((thread_ps == nullptr || thread_ps->empty()) // If the buffer is empty, fill it.
+            // Fill the read buffer if there is no remaining buffer, 
+            // but if it doesn't even exist, finish.
+            // If the buffer is empty, fill it.
+            if ((thread_ps == nullptr || thread_ps->empty())
                 && !read_to_thread_buffer_impl(thread_id))
                 return false;
 
             // read_to_thread_buffer_impl() returned true,
-            // Since the filling of the thread buffer with the phase has been completed successfully
+            // Since the filling of the thread buffer with the 
+            // phase has been completed successfully
             // thread_ps->rbegin() is alive.
 
             ps = thread_ps->back();
@@ -511,6 +538,7 @@ namespace Learner
 
                 // Waiting for file worker to fill packed_sfens_pool.
                 // The mutex isn't locked, so it should fill up soon.
+                // Poor man's condition variable.
                 sleep(1);
             }
 
@@ -519,14 +547,14 @@ namespace Learner
         // Start a thread that loads the phase file in the background.
         void start_file_read_worker()
         {
-            file_worker_thread = std::thread([&] { this->file_read_worker(); });
+            file_worker_thread = std::thread([&] { 
+                this->file_read_worker(); 
+                });
         }
 
-        // for file read-only threads
         void file_read_worker()
         {
-            auto open_next_file = [&]()
-            {
+            auto open_next_file = [&]() {
                 if (fs.is_open())
                     fs.close();
 
@@ -569,7 +597,7 @@ namespace Learner
                     }
                     else if(!open_next_file())
                     {
-                        // There was no next file. Abon.
+                        // There was no next file. Abort.
                         cout << "..end of files." << endl;
                         end_of_files = true;
                         return;
@@ -577,8 +605,6 @@ namespace Learner
                 }
 
                 // Shuffle the read phase data.
-                // random shuffle by Fisher-Yates algorithm
-
                 if (!no_shuffle)
                 {
                     Algo::shuffle(sfens, prng);
@@ -597,17 +623,19 @@ namespace Learner
                     // Delete this pointer on the receiving side.
                     auto buf = std::make_unique<PSVector>();
                     buf->resize(THREAD_BUFFER_SIZE);
-                    memcpy(buf->data(), &sfens[i * THREAD_BUFFER_SIZE], sizeof(PackedSfenValue) * THREAD_BUFFER_SIZE);
+                    memcpy(
+                        buf->data(), 
+                        &sfens[i * THREAD_BUFFER_SIZE], 
+                        sizeof(PackedSfenValue) * THREAD_BUFFER_SIZE);
 
                     buffers.emplace_back(std::move(buf));
                 }
 
-                // Since sfens is ready, look at the occasion and copy
                 {
                     std::unique_lock<std::mutex> lk(mutex);
 
-                    // You can ignore this time because you just copy the pointer...
-                    // The mutex lock is required because the contents of packed_sfens_pool are changed.
+                    // The mutex lock is required because the 
+                    // contents of packed_sfens_pool are changed.
 
                     for (auto& buf : buffers)
                         packed_sfens_pool.emplace_back(std::move(buf));
@@ -644,7 +672,7 @@ namespace Learner
 
         bool stop_flag;
 
-        vector<Key> hash; // 64MB*8 = 512MB
+        vector<Key> hash;
 
         // test phase for mse calculation
         PSVector sfen_for_mse;
@@ -660,7 +688,6 @@ namespace Learner
         // Did you read the files and reached the end?
         atomic<bool> end_of_files;
 
-
         // handle of sfen file
         std::fstream fs;
 
@@ -727,7 +754,7 @@ namespace Learner
         uint64_t epoch = 0;
 
         // Mini batch size size. Be sure to set it on the side that uses this class.
-        uint64_t mini_batch_size = 1000 * 1000;
+        uint64_t mini_batch_size = LEARN_MINI_BATCH_SIZE;
 
         bool stop_flag;
 
@@ -740,7 +767,8 @@ namespace Learner
         // Option not to learn kk/kkp/kpp/kppp
         std::array<bool, 4> freeze;
 
-        // If the absolute value of the evaluation value of the deep search of the teacher phase exceeds this value, discard the teacher phase.
+        // If the absolute value of the evaluation value of the deep search 
+        // of the teacher phase exceeds this value, discard the teacher phase.
         int eval_limit;
 
         // Flag whether to dig a folder each time the evaluation function is saved.
@@ -811,7 +839,8 @@ namespace Learner
 
     void LearnerThink::calc_loss(size_t thread_id, uint64_t done)
     {
-        // There is no point in hitting the replacement table, so at this timing the generation of the replacement table is updated.
+        // There is no point in hitting the replacement table, 
+        // so at this timing the generation of the replacement table is updated.
         // It doesn't matter if you have disabled the substitution table.
         TT.new_search();
 
@@ -845,7 +874,8 @@ namespace Learner
         sum_norm = 0;
 #endif
 
-        // The number of times the pv first move of deep search matches the pv first move of search(1).
+        // The number of times the pv first move of deep 
+        // search matches the pv first move of search(1).
         atomic<int> move_accord_count;
         move_accord_count = 0;
 
@@ -856,7 +886,8 @@ namespace Learner
         pos.set(StartFEN, false, &si, th);
         std::cout << "hirate eval = " << Eval::evaluate(pos);
 
-        // It's better to parallelize here, but it's a bit troublesome because the search before slave has not finished.
+        // It's better to parallelize here, but it's a bit 
+        // troublesome because the search before slave has not finished.
         // I created a mechanism to call task, so I will use it.
 
         // The number of tasks to do.
@@ -869,7 +900,8 @@ namespace Learner
         {
             // Assign work to each thread using TaskDispatcher.
             // A task definition for that.
-            // It is not possible to capture pos used in ↑, so specify the variables you want to capture one by one.
+            // It is not possible to capture pos used in ↑, 
+            // so specify the variables you want to capture one by one.
             auto task =
                 [
                     this,
@@ -899,7 +931,8 @@ namespace Learner
                 // Evaluation value of deep search
                 auto deep_value = (Value)ps.score;
 
-                // Note) This code does not consider when eval_limit is specified in the learn command.
+                // Note) This code does not consider when 
+                //       eval_limit is specified in the learn command.
 
                 // --- error calculation
 
@@ -975,14 +1008,16 @@ namespace Learner
             << " , eval mae = " << eval_mae;
 #endif
 
-#if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
+#if defined(LOSS_FUNCTION_IS_ELMO_METHOD)
 #if defined(EVAL_NNUE)
         latest_loss_sum += test_sum_cross_entropy - test_sum_entropy;
         latest_loss_count += sr.sfen_for_mse.size();
 #endif
 
-        // learn_cross_entropy may be called train cross entropy in the world of machine learning,
-        // When omitting the acronym, it is nice to be able to distinguish it from test cross entropy(tce) by writing it as lce.
+        // learn_cross_entropy may be called train cross 
+        // entropy in the world of machine learning,
+        // When omitting the acronym, it is nice to be able to 
+        // distinguish it from test cross entropy(tce) by writing it as lce.
 
         if (sr.sfen_for_mse.size() && done)
         {
@@ -1074,7 +1109,9 @@ namespace Learner
                     // Output the current time. Output every time.
                     std::cout << sr.total_done << " sfens , at " << now_string() << std::endl;
 
-                    // Reflect the gradient in the weight array at this timing. The calculation of the gradient is just right for each 1M phase in terms of mini-batch.
+                    // Reflect the gradient in the weight array at this timing. 
+                    // The calculation of the gradient is just right for 
+                    // each 1M phase in terms of mini-batch.
                     Eval::update_weights(epoch, freeze);
 
                     // Display epoch and current eta for debugging.
@@ -1090,14 +1127,13 @@ namespace Learner
 #endif
                     ++epoch;
 
-                    // Save once every 1 billion phases.
-
                     // However, the elapsed time during update_weights() and calc_rmse() is ignored.
                     if (++sr.save_count * mini_batch_size >= eval_save_interval)
                     {
                         sr.save_count = 0;
 
-                        // During this time, as the gradient calculation proceeds, the value becomes too large and I feel annoyed, so stop other threads.
+                        // During this time, as the gradient calculation proceeds, 
+                        // the value becomes too large and I feel annoyed, so stop other threads.
                         const bool converged = save();
                         if (converged)
                         {
@@ -1109,7 +1145,6 @@ namespace Learner
 
                     // Calculate rmse. This is done for samples of 10,000 phases.
                     // If you do with 40 cores, update_weights every 1 million phases
-                    // I don't think it's so good to be tiring.
                     static uint64_t loss_output_count = 0;
                     if (++loss_output_count * mini_batch_size >= loss_output_interval)
                     {
@@ -1129,10 +1164,12 @@ namespace Learner
                         sr.last_done = sr.total_done;
                     }
 
-                    // Next time, I want you to do this series of processing again when you process only mini_batch_size.
+                    // Next time, I want you to do this series of 
+                    // processing again when you process only mini_batch_size.
                     sr.next_update_weights += mini_batch_size;
 
-                    // Since I was waiting for the update of this sr.next_update_weights except the main thread,
+                    // Since I was waiting for the update of this 
+                    // sr.next_update_weights except the main thread,
                     // Once this value is updated, it will start moving again.
                 }
             }
@@ -1173,7 +1210,8 @@ namespace Learner
             if (pos.set_from_packed_sfen(ps.sfen, &si, th, mirror) != 0)
             {
                 // I got a strange sfen. Should be debugged!
-                // Since it is an illegal sfen, it may not be displayed with pos.sfen(), but it is better than not.
+                // Since it is an illegal sfen, it may not be 
+                // displayed with pos.sfen(), but it is better than not.
                 cout << "Error! : illigal packed sfen = " << pos.fen() << endl;
                 goto RETRY_READ;
             }
@@ -1198,9 +1236,11 @@ namespace Learner
 #endif
 
             // There is a possibility that all the pieces are blocked and stuck.
-            // Also, the declaration win phase is excluded from learning because you cannot go to leaf with PV moves.
-            // (shouldn't write out such teacher aspect itself, but may have written it out with an old generation routine)
-        // Skip the position if there are no legal moves (=checkmated or stalemate).
+            // Also, the declaration win phase is excluded from 
+            // learning because you cannot go to leaf with PV moves.
+            // (shouldn't write out such teacher aspect itself, 
+            // but may have written it out with an old generation routine)
+            // Skip the position if there are no legal moves (=checkmated or stalemate).
             if (MoveList<LEGAL>(pos).size() == 0)
                 goto RETRY_READ;
 
@@ -1214,7 +1254,8 @@ namespace Learner
             const auto deep_value = (Value)ps.score;
 
             // I feel that the mini batch has a better gradient.
-            // Go to the leaf node as it is, add only to the gradient array, and later try AdaGrad at the time of rmse aggregation.
+            // Go to the leaf node as it is, add only to the gradient array, 
+            // and later try AdaGrad at the time of rmse aggregation.
 
             const auto rootColor = pos.side_to_move();
 
@@ -1223,23 +1264,25 @@ namespace Learner
             // It may be better not to study where the difference in evaluation values ​​is too large.
 
 #if 0
-        // If you do this, about 13% of the phases will be excluded from the learning target. Good and bad are subtle.
+            // If you do this, about 13% of the phases will be excluded 
+            // from the learning target. Good and bad are subtle.
             if (pv.size() >= 1 && (uint16_t)pv[0] != ps.move)
             {
-                // dbg_hit_on(false);
+                //dbg_hit_on(false);
                 continue;
             }
 #endif
 
 #if 0
             // It may be better not to study where the difference in evaluation values ​​is too large.
-            // → It's okay because it passes the win rate function... About 30% of the phases are out of the scope of learning...
+            // → It's okay because it passes the win rate function... 
+            // About 30% of the phases are out of the scope of learning...
             if (abs((int16_t)r.first - ps.score) >= Eval::PawnValue * 4)
             {
-                //          dbg_hit_on(false);
+                //dbg_hit_on(false);
                 continue;
             }
-            //      dbg_hit_on(true);
+            //dbg_hit_on(true);
 #endif
 
             int ply = 0;
@@ -1248,9 +1291,12 @@ namespace Learner
             auto pos_add_grad = [&]() {
                 // Use the value of evaluate in leaf as shallow_value.
                 // Using the return value of qsearch() as shallow_value,
-                // If PV is interrupted in the middle, the phase where evaluate() is called to calculate the gradient, and
-                // I don't think this is a very desirable property, as the aspect that gives that gradient will be different.
-                // I have turned off the substitution table, but since the pv array has not been updated due to one stumbling block etc...
+                // If PV is interrupted in the middle, the phase where 
+                // evaluate() is called to calculate the gradient, 
+                // and I don't think this is a very desirable property, 
+                // as the aspect that gives that gradient will be different.
+                // I have turned off the substitution table, but since 
+                // the pv array has not been updated due to one stumbling block etc...
 
                 const Value shallow_value = 
                     (rootColor == pos.side_to_move()) 
@@ -1284,7 +1330,8 @@ namespace Learner
                 // Slope
                 double dj_dw = calc_grad(deep_value, shallow_value, ps);
 
-                // Add jd_dw as the gradient (∂J/∂Wj) for the feature vector currently appearing in the leaf node.
+                // Add jd_dw as the gradient (∂J/∂Wj) for the 
+                // feature vector currently appearing in the leaf node.
 
                 // If it is not PV termination, apply a discount rate.
                 if (discount_rate != 0 && ply != (int)pv.size())
@@ -1330,7 +1377,7 @@ namespace Learner
 
             if (illegal_move) 
             {
-                sync_cout << "An illical move was detected... Excluded the position from the learning data..." << sync_endl;
+                sync_cout << "An illegal move was detected... Excluded the position from the learning data..." << sync_endl;
                 continue;
             }
 
@@ -1343,7 +1390,11 @@ namespace Learner
 
 #if 0
             // When adding the gradient to the root phase
-            shallow_value = (rootColor == pos.side_to_move()) ? Eval::evaluate(pos) : -Eval::evaluate(pos);
+            shallow_value = 
+                (rootColor == pos.side_to_move()) 
+                ? Eval::evaluate(pos) 
+                : -Eval::evaluate(pos);
+
             dj_dw = calc_grad(deep_value, shallow_value, ps);
             Eval::add_grad(pos, rootColor, dj_dw, without_kpp);
 #endif
@@ -1426,10 +1477,14 @@ namespace Learner
 
     // Shuffle_files(), shuffle_files_quick() subcontracting, writing part.
     // output_file_name: Name of the file to write
-    // prng: random number
-    // afs: fstream of each teacher phase file
-    // a_count: The number of teacher positions inherent in each file.
-    void shuffle_write(const string& output_file_name, PRNG& prng, vector<fstream>& sfen_file_streams, vector<uint64_t>& sfen_count_in_file)
+    // prng: random number generator
+    // sfen_file_streams: fstream of each teacher phase file
+    // sfen_count_in_file: The number of teacher positions present in each file.
+    void shuffle_write(
+        const string& output_file_name, 
+        PRNG& prng, 
+        vector<fstream>& sfen_file_streams, 
+        vector<uint64_t>& sfen_count_in_file)
     {
         uint64_t total_sfen_count = 0;
         for (auto c : sfen_count_in_file)
@@ -1502,7 +1557,8 @@ namespace Learner
         // Temporary file is written to tmp/ folder for each buffer_size phase.
         // For example, if buffer_size = 20M, you need a buffer of 20M*40bytes = 800MB.
         // In a PC with a small memory, it would be better to reduce this.
-        // However, if the number of files increases too much, it will not be possible to open at the same time due to OS restrictions.
+        // However, if the number of files increases too much, 
+        // it will not be possible to open at the same time due to OS restrictions.
         // There should have been a limit of 512 per process on Windows, so you can open here as 500,
         // The current setting is 500 files x 20M = 10G = 10 billion phases.
 
@@ -1555,19 +1611,23 @@ namespace Learner
 
             // Read in units of sizeof(PackedSfenValue),
             // Ignore the last remaining fraction. (Fails in fs.read, so exit while)
-            // (The remaining fraction seems to be half-finished data that was created because it was stopped halfway during teacher generation.)
+            // (The remaining fraction seems to be half-finished data 
+            // that was created because it was stopped halfway during teacher generation.)
         }
 
         if (buf_write_marker != 0)
             write_buffer(buf_write_marker);
 
         // Only shuffled files have been written write_file_count.
-        // As a second pass, if you open all of them at the same time, select one at random and load one phase at a time
+        // As a second pass, if you open all of them at the same time, 
+        // select one at random and load one phase at a time
         // Now you have shuffled.
 
-        // Original file for shirt full + tmp file + file to write requires 3 times the storage capacity of the original file.
+        // Original file for shirt full + tmp file + file to write 
+        // requires 3 times the storage capacity of the original file.
         // 1 billion SSD is not enough for shuffling because it is 400GB for 10 billion phases.
-        // If you want to delete (or delete by hand) the original file at this point after writing to tmp,
+        // If you want to delete (or delete by hand) the 
+        // original file at this point after writing to tmp,
         // The storage capacity is about twice that of the original file.
         // So, maybe we should have an option to delete the original file.
 
@@ -1592,7 +1652,7 @@ namespace Learner
         PRNG prng(std::chrono::system_clock::now().time_since_epoch().count());
 
         // number of files
-        size_t file_count = filenames.size();
+        const size_t file_count = filenames.size();
 
         // Number of teacher positions stored in each file in filenames
         vector<uint64_t> sfen_count_in_file(file_count);
@@ -1651,7 +1711,8 @@ namespace Learner
 
         std::cout << "write : " << output_file_name << endl;
 
-        // If the file to be written exceeds 2GB, it cannot be written in one shot with fstream::write, so use wrapper.
+        // If the file to be written exceeds 2GB, it cannot be 
+        // written in one shot with fstream::write, so use wrapper.
         write_memory_to_file(
             output_file_name, 
             (void*)&buf[0], 
@@ -1703,9 +1764,11 @@ namespace Learner
         uint64_t buffer_size = 20000000;
         // fast shuffling assuming each file is shuffled
         bool shuffle_quick = false;
-        // A function to read the entire file in memory and shuffle it. (Requires file size memory)
+        // A function to read the entire file in memory and shuffle it. 
+        // (Requires file size memory)
         bool shuffle_on_memory = false;
-        // Conversion of packed sfen. In plain, it consists of sfen(string), evaluation value (integer), move (eg 7g7f, string), result (loss-1, win 1, draw 0)
+        // Conversion of packed sfen. In plain, it consists of sfen(string), 
+        // evaluation value (integer), move (eg 7g7f, string), result (loss-1, win 1, draw 0)
         bool use_convert_plain = false;
         // convert plain format teacher to Yaneura King's bin
         bool use_convert_bin = false;
@@ -1721,13 +1784,16 @@ namespace Learner
         // File name to write in those cases (default is "shuffled_sfen.bin")
         string output_file_name = "shuffled_sfen.bin";
 
-        // If the absolute value of the evaluation value in the deep search of the teacher phase exceeds this value, that phase is discarded.
+        // If the absolute value of the evaluation value 
+        // in the deep search of the teacher phase exceeds this value, 
+        // that phase is discarded.
         int eval_limit = 32000;
 
         // Flag to save the evaluation function file only once near the end.
         bool save_only_once = false;
 
-        // Shuffle about what you are pre-reading on the teacher aspect. (Shuffle of about 10 million phases)
+        // Shuffle about what you are pre-reading on the teacher aspect. 
+        // (Shuffle of about 10 million phases)
         // Turn on if you want to pass a pre-shuffled file.
         bool no_shuffle = false;
 
@@ -1738,7 +1804,9 @@ namespace Learner
         ELMO_LAMBDA_LIMIT = 32000;
 #endif
 
-        // Discount rate. If this is set to a value other than 0, the slope will be added even at other than the PV termination. (At that time, apply this discount rate)
+        // Discount rate. If this is set to a value other than 0, 
+        // the slope will be added even at other than the PV termination. 
+        // (At that time, apply this discount rate)
         double discount_rate = 0;
 
         // if (gamePly <rand(reduction_gameply)) continue;
@@ -1797,15 +1865,27 @@ namespace Learner
             else if (option == "eta3")       is >> eta3;
             else if (option == "eta1_epoch") is >> eta1_epoch;
             else if (option == "eta2_epoch") is >> eta2_epoch;
+
             // Accept also the old option name.
-            else if (option == "use_draw_in_training" || option == "use_draw_games_in_training") is >> use_draw_games_in_training;
+            else if (option == "use_draw_in_training" 
+                  || option == "use_draw_games_in_training") 
+                is >> use_draw_games_in_training;
+
             // Accept also the old option name.
-            else if (option == "use_draw_in_validation" || option == "use_draw_games_in_validation") is >> use_draw_games_in_validation;
+            else if (option == "use_draw_in_validation" 
+                  || option == "use_draw_games_in_validation") 
+                is >> use_draw_games_in_validation;
+
             // Accept also the old option name.
-            else if (option == "use_hash_in_training" || option == "skip_duplicated_positions_in_training") is >> skip_duplicated_positions_in_training;
+            else if (option == "use_hash_in_training" 
+                  || option == "skip_duplicated_positions_in_training") 
+                is >> skip_duplicated_positions_in_training;
+
             else if (option == "winning_probability_coefficient") is >> winning_probability_coefficient;
+
             // Discount rate
             else if (option == "discount_rate") is >> discount_rate;
+
             // Using WDL with win rate model instead of sigmoid
             else if (option == "use_wdl") is >> use_wdl;
 
@@ -1873,8 +1953,11 @@ namespace Learner
             else
                 filenames.push_back(option);
         }
+
         if (loss_output_interval == 0)
+        {
             loss_output_interval = LEARN_RMSE_OUTPUT_INTERVAL * mini_batch_size;
+        }
 
         cout << "learn command , ";
 
@@ -1900,6 +1983,7 @@ namespace Learner
         cout << "learn from ";
         for (auto s : filenames)
             cout << s << " , ";
+
         cout << endl;
         if (!validation_set_file_name.empty())
         {
@@ -1917,18 +2001,21 @@ namespace Learner
             shuffle_files(filenames, output_file_name, buffer_size);
             return;
         }
+
         if (shuffle_quick)
         {
             cout << "quick shuffle mode.." << endl;
             shuffle_files_quick(filenames, output_file_name);
             return;
         }
+
         if (shuffle_on_memory)
         {
             cout << "shuffle on memory.." << endl;
             shuffle_files_on_memory(filenames, output_file_name);
             return;
         }
+
         if (use_convert_plain)
         {
             Eval::init_NNUE();
@@ -1936,6 +2023,7 @@ namespace Learner
             convert_plain(filenames, output_file_name);
             return;
         }
+
         if (use_convert_bin)
         {
             Eval::init_NNUE();
@@ -1956,6 +2044,7 @@ namespace Learner
             return;
 
         }
+
         if (use_convert_bin_from_pgn_extract)
         {
             Eval::init_NNUE();
@@ -1976,15 +2065,21 @@ namespace Learner
 
         // Insert the file name for the number of loops.
         for (int i = 0; i < loop; ++i)
-            // sfen reader, I'll read it in reverse order so I'll reverse it here. I'm sorry.
+        {
+            // sfen reader, I'll read it in reverse 
+            // order so I'll reverse it here. I'm sorry.
             for (auto it = filenames.rbegin(); it != filenames.rend(); ++it)
+            {
                 sr.filenames.push_back(Path::Combine(base_dir, *it));
+            }
+        }
 
 #if !defined(EVAL_NNUE)
         cout << "Gradient Method   : " << LEARN_UPDATE << endl;
 #endif
         cout << "Loss Function     : " << LOSS_FUNCTION << endl;
         cout << "mini-batch size   : " << mini_batch_size << endl;
+
 #if defined(EVAL_NNUE)
         cout << "nn_batch_size     : " << nn_batch_size << endl;
         cout << "nn_options        : " << nn_options << endl;
@@ -1994,6 +2089,7 @@ namespace Learner
         cout << "use_draw_games_in_training : " << use_draw_games_in_training << endl;
         cout << "use_draw_games_in_validation : " << use_draw_games_in_validation << endl;
         cout << "skip_duplicated_positions_in_training : " << skip_duplicated_positions_in_training << endl;
+
 #if defined(EVAL_NNUE)
         if (newbob_decay != 1.0) {
             cout << "scheduling        : newbob with decay = " << newbob_decay
@@ -2003,6 +2099,7 @@ namespace Learner
             cout << "scheduling        : default" << endl;
         }
 #endif
+
         cout << "discount rate     : " << discount_rate << endl;
 
         // If reduction_gameply is set to 0, rand(0) will be divided by 0, so correct it to 1.
@@ -2014,6 +2111,7 @@ namespace Learner
         cout << "LAMBDA2           : " << ELMO_LAMBDA2 << endl;
         cout << "LAMBDA_LIMIT      : " << ELMO_LAMBDA_LIMIT << endl;
 #endif
+
         cout << "mirror_percentage : " << mirror_percentage << endl;
         cout << "eval_save_interval  : " << eval_save_interval << " sfens" << endl;
         cout << "loss_output_interval: " << loss_output_interval << " sfens" << endl;
@@ -2071,11 +2169,13 @@ namespace Learner
         learn_think.sr.no_shuffle = no_shuffle;
         learn_think.freeze = freeze;
         learn_think.reduction_gameply = reduction_gameply;
+
 #if defined(EVAL_NNUE)
         learn_think.newbob_scale = 1.0;
         learn_think.newbob_decay = newbob_decay;
         learn_think.newbob_num_trials = newbob_num_trials;
 #endif
+
         learn_think.eval_save_interval = eval_save_interval;
         learn_think.loss_output_interval = loss_output_interval;
         learn_think.mirror_percentage = mirror_percentage;
@@ -2086,16 +2186,19 @@ namespace Learner
 
         learn_think.mini_batch_size = mini_batch_size;
 
-        if (validation_set_file_name.empty()) {
+        if (validation_set_file_name.empty()) 
+        {
             // Get about 10,000 data for mse calculation.
             sr.read_for_mse();
         }
-        else {
+        else 
+        {
             sr.read_validation_set(validation_set_file_name, eval_limit);
         }
 
         // Calculate rmse once at this point (timing of 0 sfen)
         // sr.calc_rmse();
+
 #if defined(EVAL_NNUE)
         if (newbob_decay != 1.0) {
             learn_think.calc_loss(0, -1);
diff --git a/src/misc.h b/src/misc.h
index 5add3b36..4c04d3f0 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -226,6 +226,7 @@ namespace Math {
 }
 
 namespace Algo {
+    // Fisher-Yates
     template <typename Rng, typename T>
     void shuffle(std::vector<T>& buf, Rng&& prng)
     {

From d21424c8d3af0f63e6317ebd0a727114442248e0 Mon Sep 17 00:00:00 2001
From: noobpwnftw <guo.bojun@gmail.com>
Date: Tue, 8 Sep 2020 09:35:53 +0800
Subject: [PATCH 206/583] test

---
 README.md                              |   5 +-
 src/Makefile                           |   3 +-
 src/evaluate.cpp                       |  52 ++++----
 src/learn/gensfen.cpp                  | 170 +++----------------------
 src/learn/gensfen2019.cpp              |   1 -
 src/learn/learner.cpp                  |  25 ----
 src/nnue/features/enpassant.cpp        |   2 +-
 src/nnue/features/half_kp.cpp          |   4 +-
 src/nnue/features/half_relative_kp.cpp |   4 +-
 src/nnue/features/k.cpp                |   4 +-
 src/nnue/features/p.cpp                |   4 +-
 src/nnue/nnue_common.h                 |   2 +-
 src/search.cpp                         |  17 +--
 src/tt.cpp                             |   4 +-
 src/ucioption.cpp                      |   2 +-
 15 files changed, 61 insertions(+), 238 deletions(-)
 delete mode 100644 src/learn/gensfen2019.cpp

diff --git a/README.md b/README.md
index 6d28a998..0dcce0a6 100644
--- a/README.md
+++ b/README.md
@@ -17,12 +17,10 @@ setoption name Threads value x
 setoption name Hash value y
 setoption name SyzygyPath value path
 isready
-gensfen depth a loop b use_draw_in_training_data_generation 1 eval_limit 32000 use_raw_nnue_eval 0
+gensfen depth a loop b use_draw_in_training_data_generation 1 eval_limit 32000
 ```
 Specify how many threads and how much memory you would like to use with the x and y values. The option SyzygyPath is not necessary, but if you would like to use it, you must first have Syzygy endgame tablebases on your computer, which you can find [here](http://oics.olympuschess.com/tracker/index.php). You will need to have a torrent client to download these tablebases, as that is probably the fastest way to obtain them. The path is the path to the folder containing those tablebases. It does not have to be surrounded in quotes.
 
-use_raw_nnue_eval controls if the training data generator or trainer uses raw NNUE eval values.  Don't forget to set use_raw_nnue_eval 0 when initial training data are generated.  Otherwise, the gensfen command will crash.
-
 This will save a file named "generated_kifu.bin" in the same folder as the binary. Once generation is done, rename the file to something like "1billiondepth12.bin" to remember the depth and quantity of the positions and move it to a folder named "trainingdata" in the same directory as the binaries.
 #### Generation Parameters
 - Depth is the searched depth per move, or how far the engine looks forward. This value is an integer.
@@ -34,6 +32,7 @@ Use the "learn" binary. Create an empty folder named "evalsave" in the same dire
 ```
 uci
 setoption name SkipLoadingEval value true
+setoption name Training value true
 setoption name Use NNUE value true
 setoption name Threads value x
 isready
diff --git a/src/Makefile b/src/Makefile
index 9db13e44..4f8801ee 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -56,7 +56,6 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
 	nnue/features/enpassant.cpp \
 	nnue/nnue_test_command.cpp \
 	extra/sfen_packer.cpp \
-	learn/gensfen2019.cpp \
 	learn/learner.cpp \
 	learn/gensfen.cpp \
 	learn/convert.cpp \
@@ -908,7 +907,7 @@ learn: config-sanity
 	EXTRALDFLAGS=' $(BLASLDFLAGS) -fopenmp  ' \
 	all
 
-profile-learn: net config-sanity objclean profileclean
+profile-learn: config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 1/4. Building instrumented executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) \
diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 8edc9bb8..9dd83e1f 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -32,13 +32,6 @@
 #include "thread.h"
 #include "uci.h"
 
-#ifdef EVAL_LEARN
-namespace Learner
-{
-    extern bool use_raw_nnue_eval;
-}
-#endif
-
 namespace Eval {
 
   bool useNNUE;
@@ -947,27 +940,32 @@ make_v:
 /// evaluation of the position from the point of view of the side to move.
 
 Value Eval::evaluate(const Position& pos) {
-#ifdef EVAL_LEARN
-  if (Learner::use_raw_nnue_eval) {
-      return NNUE::evaluate(pos);
+  if (Options["Training"]) {
+    Value v = NNUE::evaluate(pos);
+    // Damp down the evaluation linearly when shuffling
+    v = v * (100 - pos.rule50_count()) / 100;
+
+    // Guarantee evaluation does not hit the tablebase range
+    v = std::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1);
+
+    return v;
+  } else {
+    bool classical = !Eval::useNNUE
+                  ||  abs(eg_value(pos.psq_score())) * 16 > NNUEThreshold1 * (16 + pos.rule50_count());
+    Value v = classical ? Evaluation<NO_TRACE>(pos).value()
+                        : NNUE::evaluate(pos) * 5 / 4 + Tempo;
+
+    if (classical && Eval::useNNUE && abs(v) * 16 < NNUEThreshold2 * (16 + pos.rule50_count()))
+        v = NNUE::evaluate(pos) * 5 / 4 + Tempo;
+
+    // Damp down the evaluation linearly when shuffling
+    v = v * (100 - pos.rule50_count()) / 100;
+
+    // Guarantee evaluation does not hit the tablebase range
+    v = std::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1);
+
+    return v;
   }
-#endif
-
-  bool classical = !Eval::useNNUE
-                ||  abs(eg_value(pos.psq_score())) * 16 > NNUEThreshold1 * (16 + pos.rule50_count());
-  Value v = classical ? Evaluation<NO_TRACE>(pos).value()
-                      : NNUE::evaluate(pos) * 5 / 4 + Tempo;
-
-  if (classical && Eval::useNNUE && abs(v) * 16 < NNUEThreshold2 * (16 + pos.rule50_count()))
-      v = NNUE::evaluate(pos) * 5 / 4 + Tempo;
-
-  // Damp down the evaluation linearly when shuffling
-  v = v * (100 - pos.rule50_count()) / 100;
-
-  // Guarantee evaluation does not hit the tablebase range
-  v = std::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1);
-
-  return v;
 }
 
 /// trace() is like evaluate(), but instead of returning a value, it returns
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 6c8c455e..8526bc40 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -11,10 +11,6 @@
 #include "../uci.h"
 #include "../syzygy/tbprobe.h"
 
-#if defined(USE_BOOK)
-#include "../extra/book/book.h"
-#endif
-
 #include <chrono>
 #include <random>
 #include <regex>
@@ -54,11 +50,7 @@ namespace Learner
     static bool detect_draw_by_consecutive_low_score = false;
     static bool detect_draw_by_insufficient_mating_material = false;
 
-    // Use raw NNUE eval value in the Eval::evaluate().
-    // If hybrid eval is enabled, training data
-    // generation and training don't work well.
-    // https://discordapp.com/channels/435943710472011776/733545871911813221/748524079761326192
-    static bool use_raw_nnue_eval = true;
+    static std::vector<std::string> bookStart;
 
     // Helper class for exporting Sfen
     struct SfenWriter
@@ -313,13 +305,6 @@ namespace Learner
             int ply,
             int& random_move_c);
 
-        Value evaluate_leaf(
-            Position& pos,
-            std::vector<StateInfo, AlignedAllocator<StateInfo>>& states,
-            int ply,
-            int depth,
-            vector<Move>& pv);
-
         // Min and max depths for search during gensfen
         int search_depth_min;
         int search_depth_max;
@@ -674,69 +659,6 @@ namespace Learner
         return random_move_flag;
     }
 
-    Value MultiThinkGenSfen::evaluate_leaf(
-        Position& pos,
-        std::vector<StateInfo, AlignedAllocator<StateInfo>>& states,
-        int ply,
-        int depth,
-        vector<Move>& pv)
-    {
-        auto rootColor = pos.side_to_move();
-
-        for (auto m : pv)
-        {
-#if 1
-            // There should be no illegal move. This is as a debugging precaution.
-            if (!pos.pseudo_legal(m) || !pos.legal(m))
-            {
-                cout << "Error! : " << pos.fen() << m << endl;
-            }
-#endif
-            pos.do_move(m, states[ply++]);
-
-            // Because the difference calculation of evaluate() cannot be
-            // performed unless each node evaluate() is called!
-            // If the depth is 8 or more, it seems
-            // faster not to calculate this difference.
-#if defined(EVAL_NNUE)
-            if (depth < 8)
-            {
-                Eval::NNUE::update_eval(pos);
-            }
-#endif  // defined(EVAL_NNUE)
-        }
-
-        // Reach leaf
-        Value v;
-        if (pos.checkers())
-        {
-            // Sometime a king is checked.  An example is a case that a checkmate is
-            // found in the search.  If Eval::evaluate() is called whne a king is
-            // checked, classic eval crashes by an assertion. To avoid crashes, return
-            // VALUE_NONE and let the caller assign a value to the position.
-            v = VALUE_NONE;
-        }
-        else
-        {
-            v = Eval::evaluate(pos);
-
-            // evaluate() returns the evaluation value on the turn side, so
-            // If it's a turn different from root_color, you must invert v and return it.
-            if (rootColor != pos.side_to_move())
-            {
-                v = -v;
-            }
-        }
-
-        // Rewind the pv moves.
-        for (auto it = pv.rbegin(); it != pv.rend(); ++it)
-        {
-            pos.undo_move(*it);
-        }
-
-        return v;
-    }
-
     // thread_id = 0..Threads.size()-1
     void MultiThinkGenSfen::thread_worker(size_t thread_id)
     {
@@ -760,12 +682,7 @@ namespace Learner
             auto th = Threads[thread_id];
 
             auto& pos = th->rootPos;
-            pos.set(StartFEN, false, &si, th);
-
-#if defined(USE_BOOK)
-            // Refer to the members of BookMoveSelector defined in the search section.
-            auto& book = ::book;
-#endif
+            pos.set(bookStart[prng.rand(bookStart.size())], false, &si, th);
 
             // Vector for holding the sfens in the current simulated game.
             PSVector a_psv;
@@ -800,35 +717,6 @@ namespace Learner
                     flush_psv(result.value());
                     break;
                 }
-#if defined(USE_BOOK)
-                if ((next_move = book.probe(pos)) != MOVE_NONE)
-                {
-                    // Hit the constant track.
-                    // The move was stored in next_move.
-
-                    // Do not use the fixed phase for learning.
-                    sfens.clear();
-
-                    if (random_move_minply != -1)
-                    {
-                        // Random move is performed with a certain
-                        // probability even in the constant phase.
-                        goto RANDOM_MOVE;
-                    }
-                    else
-                    {
-                        // When -1 is specified as random_move_minply,
-                        // it points according to the standard until
-                        // it goes out of the standard.
-                        // Prepare an innumerable number of situations
-                        // that have left the constant as
-                        // ConsiderationBookMoveCount true using a huge constant
-                        // Used for purposes such as performing
-                        // a random move 5 times from there.
-                        goto DO_MOVE;
-                    }
-                }
-#endif
                 {
                     auto [search_value, search_pv] = search(pos, depth, 1, nodes);
 
@@ -916,18 +804,7 @@ namespace Learner
 
                         // Get the value of evaluate() as seen from the
                         // root color on the leaf node of the PV line.
-                        // I don't know the goodness and badness of using the
-                        // return value of search() as it is.
-                        // TODO: Consider using search value instead of evaluate_leaf.
-                        //       Maybe give it as an option.
-
-                        // Use PV moves to reach the leaf node and use the value
-                        // that evaluated() is called on that leaf node.
-                        const auto leaf_value = evaluate_leaf(pos, states, ply, depth, search_pv);
-
-                        // If for some reason the leaf node couldn't yield an eval
-                        // we fallback to search value.
-                        psv.score = leaf_value == VALUE_NONE ? search_value : leaf_value;
+                        psv.score = search_value;
 
                         psv.gamePly = ply;
 
@@ -948,9 +825,6 @@ namespace Learner
                     // Update the next move according to best search result.
                     next_move = search_pv[0];
                 }
-
-            RANDOM_MOVE:;
-
                 auto random_move = choose_random_move(pos, random_move_flag, ply, actual_random_move_count);
                 if (random_move.has_value())
                 {
@@ -962,13 +836,7 @@ namespace Learner
                     {
                         break;
                     }
-
-                    // Clear the sfens that were written before the random move.
-                    // (???) why?
-                    a_psv.clear();
                 }
-
-            DO_MOVE:;
                 pos.do_move(next_move, states[ply]);
 
                 // Call node evaluate() for each difference calculation.
@@ -1095,18 +963,10 @@ namespace Learner
                 is >> detect_draw_by_consecutive_low_score;
             else if (token == "detect_draw_by_insufficient_mating_material")
                 is >> detect_draw_by_insufficient_mating_material;
-            else if (token == "use_raw_nnue_eval")
-                is >> use_raw_nnue_eval;
             else
                 cout << "Error! : Illegal token " << token << endl;
         }
 
-#if defined(USE_GLOBAL_OPTIONS)
-        // Save it for later restore.
-        auto oldGlobalOptions = GlobalOptions;
-        GlobalOptions.use_eval_hash = use_eval_hash;
-#endif
-
         // If search depth2 is not set, leave it the same as search depth.
         if (search_depth_max == INT_MIN)
             search_depth_max = search_depth_min;
@@ -1130,15 +990,26 @@ namespace Learner
             output_file_name = output_file_name + "_" + to_hex(r.rand<uint64_t>()) + to_hex(r.rand<uint64_t>());
         }
 
+        bookStart.clear();
+        {
+          std::string line;
+          std::ifstream myfile ("3moves_v2.epd");
+          if (myfile.is_open())
+          {
+            while (getline(myfile,line))
+            {
+                bookStart.push_back(line);
+            }
+            myfile.close();
+          }
+        }
         std::cout << "gensfen : " << endl
             << "  search_depth_min = " << search_depth_min << " to " << search_depth_max << endl
             << "  nodes = " << nodes << endl
             << "  loop_max = " << loop_max << endl
             << "  eval_limit = " << eval_limit << endl
-            << "  thread_num (set by USI setoption) = " << thread_num << endl
-#if defined(USE_BOOK)
-            << "  book_moves (set by USI setoption) = " << Options["BookMoves"] << endl
-#endif
+            << "  thread_num             = " << thread_num << endl
+            << "  bookStart              = " << bookStart.size() << endl
             << "  random_move_minply     = " << random_move_minply << endl
             << "  random_move_maxply     = " << random_move_maxply << endl
             << "  random_move_count      = " << random_move_count << endl
@@ -1188,11 +1059,6 @@ namespace Learner
 
         std::cout << "gensfen finished." << endl;
 
-#if defined(USE_GLOBAL_OPTIONS)
-        // Restore Global Options.
-        GlobalOptions = oldGlobalOptions;
-#endif
-
     }
 }
 #endif
diff --git a/src/learn/gensfen2019.cpp b/src/learn/gensfen2019.cpp
deleted file mode 100644
index 01293b9c..00000000
--- a/src/learn/gensfen2019.cpp
+++ /dev/null
@@ -1 +0,0 @@
-// just a place holder
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 7021fd7f..a8724892 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -98,12 +98,6 @@ namespace Learner
     // data directly. In those cases, we set false to this variable.
     static bool convert_teacher_signal_to_winning_probability = true;
 
-    // Use raw NNUE eval value in the Eval::evaluate(). If hybrid eval is enabled, training data
-    // generation and training don't work well.
-    // https://discordapp.com/channels/435943710472011776/733545871911813221/748524079761326192
-    // This CANNOT be static since it's used elsewhere.
-    bool use_raw_nnue_eval = true;
-
     // Using WDL with win rate model instead of sigmoid
     static bool use_wdl = false;
 
@@ -1616,15 +1610,6 @@ namespace Learner
         uint64_t eta1_epoch = 0; // eta2 is not applied by default
         uint64_t eta2_epoch = 0; // eta3 is not applied by default
 
-#if defined(USE_GLOBAL_OPTIONS)
-    // Save it for later restore.
-        auto oldGlobalOptions = GlobalOptions;
-        // If you hit the eval hash, you can not calculate rmse etc. so turn it off.
-        GlobalOptions.use_eval_hash = false;
-        // If you hit the replacement table, pruning may occur at the previous evaluation value, so turn it off.
-        GlobalOptions.use_hash_probe = false;
-#endif
-
         // --- Function that only shuffles the teacher aspect
 
         // normal shuffle
@@ -1796,7 +1781,6 @@ namespace Learner
             else if (option == "dest_score_min_value") is >> dest_score_min_value;
             else if (option == "dest_score_max_value") is >> dest_score_max_value;
             else if (option == "convert_teacher_signal_to_winning_probability") is >> convert_teacher_signal_to_winning_probability;
-            else if (option == "use_raw_nnue_eval") is >> use_raw_nnue_eval;
 
             // Otherwise, it's a filename.
             else
@@ -2076,18 +2060,9 @@ namespace Learner
         // Save once at the end.
         learn_think.save(true);
 
-#if defined(USE_GLOBAL_OPTIONS)
-        // Restore Global Options.
-        GlobalOptions = oldGlobalOptions;
-#endif
     }
 
 
 } // namespace Learner
 
-#if defined(GENSFEN2019)
-#include "gensfen2019.cpp"
-#endif
-
-
 #endif // EVAL_LEARN
diff --git a/src/nnue/features/enpassant.cpp b/src/nnue/features/enpassant.cpp
index ea70529a..ed877322 100644
--- a/src/nnue/features/enpassant.cpp
+++ b/src/nnue/features/enpassant.cpp
@@ -23,7 +23,7 @@ namespace Eval {
         }
 
         if (perspective == BLACK) {
-          epSquare = rotate180(epSquare);
+          epSquare = flip_rank(epSquare);
         }
 
         auto file = file_of(epSquare);
diff --git a/src/nnue/features/half_kp.cpp b/src/nnue/features/half_kp.cpp
index 88e384a3..ff20a00a 100644
--- a/src/nnue/features/half_kp.cpp
+++ b/src/nnue/features/half_kp.cpp
@@ -23,9 +23,9 @@
 
 namespace Eval::NNUE::Features {
 
-  // Orient a square according to perspective (rotates by 180 for black)
+  // Orient a square according to perspective (flip rank for black)
   inline Square orient(Color perspective, Square s) {
-    return Square(int(s) ^ (bool(perspective) * 63));
+    return Square(int(s) ^ (bool(perspective) * SQ_A8));
   }
 
   // Find the index of the feature quantity from the king position and PieceSquare
diff --git a/src/nnue/features/half_relative_kp.cpp b/src/nnue/features/half_relative_kp.cpp
index 015ecb73..efe85035 100644
--- a/src/nnue/features/half_relative_kp.cpp
+++ b/src/nnue/features/half_relative_kp.cpp
@@ -11,9 +11,9 @@ namespace NNUE {
 
 namespace Features {
 
-// Orient a square according to perspective (rotates by 180 for black)
+// Orient a square according to perspective (flip rank for black)
 inline Square orient(Color perspective, Square s) {
-  return Square(int(s) ^ (bool(perspective) * 63));
+  return Square(int(s) ^ (bool(perspective) * SQ_A8));
 }
 
 // Find the index of the feature quantity from the ball position and PieceSquare
diff --git a/src/nnue/features/k.cpp b/src/nnue/features/k.cpp
index 314b1338..1bb28c53 100644
--- a/src/nnue/features/k.cpp
+++ b/src/nnue/features/k.cpp
@@ -11,9 +11,9 @@ namespace NNUE {
 
 namespace Features {
 
-// Orient a square according to perspective (rotates by 180 for black)
+// Orient a square according to perspective (flip rank for black)
 inline Square orient(Color perspective, Square s) {
-  return Square(int(s) ^ (bool(perspective) * 63));
+  return Square(int(s) ^ (bool(perspective) * SQ_A8));
 }
 
 // Index of a feature for a given king position.
diff --git a/src/nnue/features/p.cpp b/src/nnue/features/p.cpp
index b4a6faf9..7e008fdc 100644
--- a/src/nnue/features/p.cpp
+++ b/src/nnue/features/p.cpp
@@ -11,9 +11,9 @@ namespace NNUE {
 
 namespace Features {
 
-// Orient a square according to perspective (rotates by 180 for black)
+// Orient a square according to perspective (flip rank for black)
 inline Square orient(Color perspective, Square s) {
-  return Square(int(s) ^ (bool(perspective) * 63));
+  return Square(int(s) ^ (bool(perspective) * SQ_A8));
 }
 
 // Find the index of the feature quantity from the king position and PieceSquare
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index d7ffa21a..cc54378b 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -69,7 +69,7 @@
 namespace Eval::NNUE {
 
   // Version of the evaluation file
-  constexpr std::uint32_t kVersion = 0x7AF32F16u;
+  constexpr std::uint32_t kVersion = 0x7AF32F17u;
 
   // Constant used in evaluation value calculation
   constexpr int FV_SCALE = 16;
diff --git a/src/search.cpp b/src/search.cpp
index 8f258ae4..c01247bd 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -68,8 +68,6 @@ namespace {
     return Value(223 * (d - improving));
   }
 
-  bool training;
-
   // Reductions lookup table, initialized at startup
   int Reductions[MAX_MOVES]; // [depth or moveNumber]
 
@@ -195,8 +193,6 @@ void Search::init() {
 
   for (int i = 1; i < MAX_MOVES; ++i)
       Reductions[i] = int((22.0 + std::log(Threads.size())) * std::log(i));
-
-  training = Options["Training"];
 }
 
 
@@ -1011,7 +1007,7 @@ moves_loop: // When in check, search starts from here
 
       // Step 12. Pruning at shallow depth (~200 Elo)
       if (  !rootNode
-          && !(training && PvNode)
+          && !(Options["Training"] && PvNode)
           && pos.non_pawn_material(us)
           && bestValue > VALUE_TB_LOSS_IN_MAX_PLY)
       {
@@ -2070,17 +2066,6 @@ namespace Learner
         rootMoves.push_back(Search::RootMove(m));
 
       assert(!rootMoves.empty());
-
-      //#if defined(USE_GLOBAL_OPTIONS)
-      // Since the generation of the substitution table for each search thread should be managed,
-      // Increase the generation of the substitution table for this thread because it is a new search.
-            //TT.new_search(th->thread_id());
-
-            // ª If you call new_search here, it may be a loss because you can't use the previous search result.
-            // Do not do this here, but caller should do TT.new_search(th->thread_id()) for each station ...
-
-            // ¨Because we want to avoid reaching the same final diagram, use the substitution table commonly for all threads when generating teachers.
-      //#endif
     }
   }
 
diff --git a/src/tt.cpp b/src/tt.cpp
index 60a3a5f1..5e1f53d2 100644
--- a/src/tt.cpp
+++ b/src/tt.cpp
@@ -115,7 +115,9 @@ void TranspositionTable::clear() {
 /// TTEntry t2 if its replace value is greater than that of t2.
 
 TTEntry* TranspositionTable::probe(const Key key, bool& found) const {
-
+  if (Options["Training"]) {
+    return found = false, first_entry(0);
+  }
   TTEntry* const tte = first_entry(key);
   const uint16_t key16 = (uint16_t)key;  // Use the low 16 bits as key inside the cluster
 
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 0007b559..1517326e 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -82,7 +82,7 @@ void init(OptionsMap& o) {
   o["Use NNUE"]              << Option(true, on_use_NNUE);
   // The default must follow the format nn-[SHA256 first 12 digits].nnue
   // for the build process (profile-build and fishtest) to work.
-  o["EvalFile"]              << Option("nn-82215d0fd0df.nnue", on_eval_file);
+  o["EvalFile"]              << Option("nn.bin", on_eval_file);
 #ifdef EVAL_NNUE
   // When the evaluation function is loaded at the ucinewgame timing, it is necessary to convert the new evaluation function.
   // I want to hit the test eval convert command, but there is no new evaluation function

From a6013557f2cb5d13c21a2d406a02d504a643c885 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Tue, 8 Sep 2020 22:45:29 +0900
Subject: [PATCH 207/583] Removed EVAL_NNUE macro.

---
 src/Makefile                                  |   6 +-
 src/eval/evaluate_common.h                    |   3 -
 src/learn/gensfen.cpp                         |  40 +++---
 src/learn/learner.cpp                         | 133 +++---------------
 src/nnue/evaluate_nnue.cpp                    |   2 -
 src/nnue/evaluate_nnue_learner.cpp            |   4 +-
 src/nnue/evaluate_nnue_learner.h              |   4 +-
 src/nnue/features/castling_right.cpp          |   4 -
 src/nnue/features/castling_right.h            |   4 -
 src/nnue/features/enpassant.cpp               |   4 -
 src/nnue/features/enpassant.h                 |   4 -
 src/nnue/features/half_relative_kp.cpp        |   4 -
 src/nnue/features/half_relative_kp.h          |   4 -
 src/nnue/features/k.cpp                       |   4 -
 src/nnue/features/k.h                         |   4 -
 src/nnue/features/p.cpp                       |   4 -
 src/nnue/features/p.h                         |   4 -
 src/nnue/layers/sum.h                         |   4 -
 src/nnue/nnue_test_command.cpp                |   4 +-
 src/nnue/nnue_test_command.h                  |   4 +-
 src/nnue/trainer/features/factorizer.h        |   4 -
 .../trainer/features/factorizer_feature_set.h |   4 -
 .../trainer/features/factorizer_half_kp.h     |   4 -
 src/nnue/trainer/trainer.h                    |   4 +-
 src/nnue/trainer/trainer_affine_transform.h   |   4 +-
 src/nnue/trainer/trainer_clipped_relu.h       |   4 +-
 .../trainer/trainer_feature_transformer.h     |   4 +-
 src/nnue/trainer/trainer_input_slice.h        |   4 +-
 src/nnue/trainer/trainer_sum.h                |   4 +-
 src/uci.cpp                                   |   6 +-
 src/ucioption.cpp                             |   2 -
 31 files changed, 65 insertions(+), 223 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index ca851dba..a07e1251 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -903,7 +903,7 @@ icc-profile-use:
 
 learn: config-sanity
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
-	EXTRACXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	EXTRACXXFLAGS=' -DEVAL_LEARN -DENABLE_TEST_CMD -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
 	EXTRALDFLAGS=' $(BLASLDFLAGS) -fopenmp  ' \
 	all
 
@@ -911,7 +911,7 @@ profile-learn: net config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 1/4. Building instrumented executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) \
-	LEARNCXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	LEARNCXXFLAGS=' -DEVAL_LEARN -DENABLE_TEST_CMD -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
 	LEARNLDFLAGS='  $(BLASLDFLAGS) -fopenmp '
 	@echo ""
 	@echo "Step 2/4. Running benchmark for pgo-build ..."
@@ -920,7 +920,7 @@ profile-learn: net config-sanity objclean profileclean
 	@echo "Step 3/4. Building optimized executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) objclean
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_use) \
-	LEARNCXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	LEARNCXXFLAGS=' -DEVAL_LEARN -DENABLE_TEST_CMD -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
 	LEARNLDFLAGS=' $(BLASLDFLAGS) -fopenmp '
 	@echo ""
 	@echo "Step 4/4. Deleting profile data ..."
diff --git a/src/eval/evaluate_common.h b/src/eval/evaluate_common.h
index dacbd2ba..3fb161ab 100644
--- a/src/eval/evaluate_common.h
+++ b/src/eval/evaluate_common.h
@@ -3,7 +3,6 @@
 
 // A common header-like function for modern evaluation functions (EVAL_KPPT and EVAL_KPP_KKPT).
 
-#if defined(EVAL_NNUE) || defined(EVAL_LEARN)
 #include <functional>
 
 // KK file name
@@ -79,6 +78,4 @@ namespace Eval
 
 }
 
-#endif  // defined(EVAL_NNUE) || defined(EVAL_LEARN)
-
 #endif // _EVALUATE_KPPT_COMMON_H_
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index b049192e..9ae83174 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -1,45 +1,41 @@
 ﻿#if defined(EVAL_LEARN)
 
 #include "../eval/evaluate_common.h"
-
-#include "learn.h"
-#include "multi_think.h"
 #include "../misc.h"
-#include "../thread.h"
+#include "../nnue/evaluate_nnue_learner.h"
 #include "../position.h"
+#include "../syzygy/tbprobe.h"
+#include "../thread.h"
 #include "../tt.h"
 #include "../uci.h"
-#include "../syzygy/tbprobe.h"
+#include "learn.h"
+#include "multi_think.h"
 
 #if defined(USE_BOOK)
 #include "../extra/book/book.h"
 #endif
 
 #include <chrono>
-#include <random>
-#include <regex>
-#include <sstream>
-#include <fstream>
-#include <unordered_set>
-#include <iomanip>
-#include <list>
+#include <climits>
 #include <cmath>
 #include <cstring>
-#include <memory>
-#include <limits>
-#include <optional>
 #include <filesystem>
+#include <fstream>
+#include <iomanip>
+#include <limits>
+#include <list>
+#include <memory>
+#include <optional>
+#include <random>
+#include <regex>
+#include <shared_mutex>
+#include <sstream>
+#include <unordered_set>
 
 #if defined (_OPENMP)
 #include <omp.h>
 #endif
 
-#if defined(EVAL_NNUE)
-#include "../nnue/evaluate_nnue_learner.h"
-#include <climits>
-#include <shared_mutex>
-#endif
-
 using namespace std;
 
 namespace Learner
@@ -692,12 +688,10 @@ namespace Learner
             // performed unless each node evaluate() is called!
             // If the depth is 8 or more, it seems
             // faster not to calculate this difference.
-#if defined(EVAL_NNUE)
             if (depth < 8)
             {
                 Eval::NNUE::update_eval(pos);
             }
-#endif  // defined(EVAL_NNUE)
         }
 
         // Reach leaf
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index f9d188b8..358848ec 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -17,45 +17,40 @@
 // → I will not be involved in the engine because it is a problem that the GUI should assist.
 // etc..
 
-#define EVAL_LEARN
-
 #if defined(EVAL_LEARN)
 
 #include "../eval/evaluate_common.h"
-
+#include "../misc.h"
+#include "../nnue/evaluate_nnue_learner.h"
+#include "../position.h"
+#include "../syzygy/tbprobe.h"
+#include "../thread.h"
+#include "../tt.h"
+#include "../uci.h"
 #include "learn.h"
 #include "multi_think.h"
-#include "../uci.h"
-#include "../syzygy/tbprobe.h"
-#include "../misc.h"
-#include "../thread.h"
-#include "../position.h"
-#include "../tt.h"
 
 #include <chrono>
-#include <random>
-#include <regex>
-#include <sstream>
-#include <fstream>
-#include <unordered_set>
-#include <iomanip>
-#include <list>
+#include <climits>
 #include <cmath>    // std::exp(),std::pow(),std::log()
 #include <cstring>  // memcpy()
-#include <memory>
-#include <limits>
-#include <optional>
 #include <filesystem>
+#include <fstream>
+#include <iomanip>
+#include <limits>
+#include <list>
+#include <memory>
+#include <optional>
+#include <random>
+#include <regex>
+#include <shared_mutex>
+#include <sstream>
+#include <unordered_set>
 
 #if defined (_OPENMP)
 #include <omp.h>
 #endif
 
-#if defined(EVAL_NNUE)
-#include "../nnue/evaluate_nnue_learner.h"
-#include <climits>
-#include <shared_mutex>
-#endif
 
 using namespace std;
 
@@ -724,14 +719,12 @@ namespace Learner
             learn_sum_entropy = 0.0;
 #endif
 
-#if defined(EVAL_NNUE)
             newbob_scale = 1.0;
             newbob_decay = 1.0;
             newbob_num_trials = 2;
             best_loss = std::numeric_limits<double>::infinity();
             latest_loss_sum = 0.0;
             latest_loss_count = 0;
-#endif
         }
 
         virtual void thread_worker(size_t thread_id);
@@ -787,7 +780,6 @@ namespace Learner
         atomic<double> learn_sum_entropy;
 #endif
 
-#if defined(EVAL_NNUE)
         shared_timed_mutex nn_mutex;
         double newbob_scale;
         double newbob_decay;
@@ -796,7 +788,6 @@ namespace Learner
         double latest_loss_sum;
         uint64_t latest_loss_count;
         std::string best_nn_directory;
-#endif
 
         uint64_t eval_save_interval;
         uint64_t loss_output_interval;
@@ -844,13 +835,10 @@ namespace Learner
         // It doesn't matter if you have disabled the substitution table.
         TT.new_search();
 
-
-#if defined(EVAL_NNUE)
         std::cout << "PROGRESS: " << now_string() << ", ";
         std::cout << sr.total_done << " sfens";
         std::cout << ", iteration " << epoch;
         std::cout << ", eta = " << Eval::get_eta() << ", ";
-#endif
 
 #if !defined(LOSS_FUNCTION_IS_ELMO_METHOD)
         double sum_error = 0;
@@ -1009,10 +997,8 @@ namespace Learner
 #endif
 
 #if defined(LOSS_FUNCTION_IS_ELMO_METHOD)
-#if defined(EVAL_NNUE)
         latest_loss_sum += test_sum_cross_entropy - test_sum_entropy;
         latest_loss_count += sr.sfen_for_mse.size();
-#endif
 
         // learn_cross_entropy may be called train cross 
         // entropy in the world of machine learning,
@@ -1074,14 +1060,10 @@ namespace Learner
             // display mse (this is sometimes done only for thread 0)
             // Immediately after being read from the file...
 
-#if defined(EVAL_NNUE)
         // Lock the evaluation function so that it is not used during updating.
             shared_lock<shared_timed_mutex> read_lock(nn_mutex, defer_lock);
             if (sr.next_update_weights <= sr.total_done ||
                 (thread_id != 0 && !read_lock.try_lock()))
-#else
-            if (sr.next_update_weights <= sr.total_done)
-#endif
             {
                 if (thread_id != 0)
                 {
@@ -1105,18 +1087,6 @@ namespace Learner
                         continue;
                     }
 
-#if !defined(EVAL_NNUE)
-                    // Output the current time. Output every time.
-                    std::cout << sr.total_done << " sfens , at " << now_string() << std::endl;
-
-                    // Reflect the gradient in the weight array at this timing. 
-                    // The calculation of the gradient is just right for 
-                    // each 1M phase in terms of mini-batch.
-                    Eval::update_weights(epoch, freeze);
-
-                    // Display epoch and current eta for debugging.
-                    std::cout << "epoch = " << epoch << " , eta = " << Eval::get_eta() << std::endl;
-#else
                     {
                         // update parameters
 
@@ -1124,7 +1094,7 @@ namespace Learner
                         lock_guard<shared_timed_mutex> write_lock(nn_mutex);
                         Eval::NNUE::UpdateParameters(epoch);
                     }
-#endif
+
                     ++epoch;
 
                     // However, the elapsed time during update_weights() and calc_rmse() is ignored.
@@ -1156,9 +1126,7 @@ namespace Learner
                         // loss calculation
                         calc_loss(thread_id, done);
 
-#if defined(EVAL_NNUE)
                         Eval::NNUE::CheckHealth();
-#endif
 
                         // Make a note of how far you have totaled.
                         sr.last_done = sr.total_done;
@@ -1216,25 +1184,6 @@ namespace Learner
                 goto RETRY_READ;
             }
 
-#if !defined(EVAL_NNUE)
-            if (skip_duplicated_positions_in_training)
-            {
-                const auto key = pos.key();
-
-                // Exclude the phase used for rmse calculation.
-                if (sr.is_for_rmse(key))
-                    goto RETRY_READ;
-
-                // Exclude the most recently used aspect.
-                const auto hash_index = size_t(key & (sr.READ_SFEN_HASH_SIZE - 1));
-                const auto key2 = sr.hash[hash_index];
-                if (key == key2)
-                    goto RETRY_READ;
-
-                sr.hash[hash_index] = key; // Replace with the current key.
-            }
-#endif
-
             // There is a possibility that all the pieces are blocked and stuck.
             // Also, the declaration win phase is excluded from 
             // learning because you cannot go to leaf with PV moves.
@@ -1326,25 +1275,9 @@ namespace Learner
                 learn_sum_entropy += learn_entropy;
 #endif
 
-#if !defined(EVAL_NNUE)
-                // Slope
-                double dj_dw = calc_grad(deep_value, shallow_value, ps);
-
-                // Add jd_dw as the gradient (∂J/∂Wj) for the 
-                // feature vector currently appearing in the leaf node.
-
-                // If it is not PV termination, apply a discount rate.
-                if (discount_rate != 0 && ply != (int)pv.size())
-                    dj_dw *= discount_rate;
-
-                // Since we have reached leaf, add the gradient to the features that appear in this phase.
-                // Update based on gradient later.
-                Eval::add_grad(pos, rootColor, dj_dw, freeze);
-#else
                 const double example_weight =
                     (discount_rate != 0 && ply != (int)pv.size()) ? discount_rate : 1.0;
                 Eval::NNUE::AddExample(pos, rootColor, ps, example_weight);
-#endif
 
                 // Since the processing is completed, the counter of the processed number is incremented
                 sr.total_done++;
@@ -1425,7 +1358,6 @@ namespace Learner
             const std::string dir_name = std::to_string(dir_number++);
             Eval::save_eval(dir_name);
 
-#if defined(EVAL_NNUE)
             if (newbob_decay != 1.0 && latest_loss_count > 0) {
                 static int trials = newbob_num_trials;
                 const double latest_loss = latest_loss_sum / latest_loss_count;
@@ -1470,7 +1402,6 @@ namespace Learner
                     return true;
                 }
             }
-#endif
         }
         return false;
     }
@@ -1817,12 +1748,10 @@ namespace Learner
         // Optional item that does not let you learn KK/KKP/KPP/KPPP
         array<bool, 4> freeze = {};
 
-#if defined(EVAL_NNUE)
         uint64_t nn_batch_size = 1000;
         double newbob_decay = 1.0;
         int newbob_num_trials = 2;
         string nn_options;
-#endif
 
         uint64_t eval_save_interval = LEARN_EVAL_SAVE_INTERVAL;
         uint64_t loss_output_interval = 0;
@@ -1922,12 +1851,11 @@ namespace Learner
             else if (option == "save_only_once") save_only_once = true;
             else if (option == "no_shuffle") no_shuffle = true;
 
-#if defined(EVAL_NNUE)
             else if (option == "nn_batch_size") is >> nn_batch_size;
             else if (option == "newbob_decay") is >> newbob_decay;
             else if (option == "newbob_num_trials") is >> newbob_num_trials;
             else if (option == "nn_options") is >> nn_options;
-#endif
+
             else if (option == "eval_save_interval") is >> eval_save_interval;
             else if (option == "loss_output_interval") is >> loss_output_interval;
             else if (option == "mirror_percentage") is >> mirror_percentage;
@@ -2074,23 +2002,18 @@ namespace Learner
             }
         }
 
-#if !defined(EVAL_NNUE)
-        cout << "Gradient Method   : " << LEARN_UPDATE << endl;
-#endif
         cout << "Loss Function     : " << LOSS_FUNCTION << endl;
         cout << "mini-batch size   : " << mini_batch_size << endl;
 
-#if defined(EVAL_NNUE)
         cout << "nn_batch_size     : " << nn_batch_size << endl;
         cout << "nn_options        : " << nn_options << endl;
-#endif
+
         cout << "learning rate     : " << eta1 << " , " << eta2 << " , " << eta3 << endl;
         cout << "eta_epoch         : " << eta1_epoch << " , " << eta2_epoch << endl;
         cout << "use_draw_games_in_training : " << use_draw_games_in_training << endl;
         cout << "use_draw_games_in_validation : " << use_draw_games_in_validation << endl;
         cout << "skip_duplicated_positions_in_training : " << skip_duplicated_positions_in_training << endl;
 
-#if defined(EVAL_NNUE)
         if (newbob_decay != 1.0) {
             cout << "scheduling        : newbob with decay = " << newbob_decay
                 << ", " << newbob_num_trials << " trials" << endl;
@@ -2098,7 +2021,6 @@ namespace Learner
         else {
             cout << "scheduling        : default" << endl;
         }
-#endif
 
         cout << "discount rate     : " << discount_rate << endl;
 
@@ -2133,12 +2055,6 @@ namespace Learner
         // Read evaluation function parameters
         Eval::init_NNUE();
 
-#if !defined(EVAL_NNUE)
-        cout << "init_grad.." << endl;
-
-        // Initialize gradient array of merit function parameters
-        Eval::init_grad(eta1, eta1_epoch, eta2, eta2_epoch, eta3);
-#else
         cout << "init_training.." << endl;
         Eval::NNUE::InitializeTraining(eta1, eta1_epoch, eta2, eta2_epoch, eta3);
         Eval::NNUE::SetBatchSize(nn_batch_size);
@@ -2146,7 +2062,6 @@ namespace Learner
         if (newbob_decay != 1.0 && !Options["SkipLoadingEval"]) {
             learn_think.best_nn_directory = std::string(Options["EvalDir"]);
         }
-#endif
 
 #if 0
         // A test to give a gradient of 1.0 to the initial stage of Hirate.
@@ -2170,11 +2085,9 @@ namespace Learner
         learn_think.freeze = freeze;
         learn_think.reduction_gameply = reduction_gameply;
 
-#if defined(EVAL_NNUE)
         learn_think.newbob_scale = 1.0;
         learn_think.newbob_decay = newbob_decay;
         learn_think.newbob_num_trials = newbob_num_trials;
-#endif
 
         learn_think.eval_save_interval = eval_save_interval;
         learn_think.loss_output_interval = loss_output_interval;
@@ -2199,7 +2112,6 @@ namespace Learner
         // Calculate rmse once at this point (timing of 0 sfen)
         // sr.calc_rmse();
 
-#if defined(EVAL_NNUE)
         if (newbob_decay != 1.0) {
             learn_think.calc_loss(0, -1);
             learn_think.best_loss = learn_think.latest_loss_sum / learn_think.latest_loss_count;
@@ -2207,7 +2119,6 @@ namespace Learner
             learn_think.latest_loss_count = 0;
             cout << "initial loss: " << learn_think.best_loss << endl;
         }
-#endif
 
         // -----------------------------------
         // start learning evaluation function parameters
diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index 5c8cee71..a2845c96 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -184,13 +184,11 @@ namespace Eval::NNUE {
 
     Initialize();
 
-#if defined(EVAL_NNUE)
     if (Options["SkipLoadingEval"])
     {
       std::cout << "info string SkipLoadingEval set to true, Net not loaded!" << std::endl;
       return true;
     }
-#endif
 
     fileName = evalFile;
 
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 13d9d578..7be06832 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -1,6 +1,6 @@
 ﻿// Code for learning NNUE evaluation function
 
-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#if defined(EVAL_LEARN)
 
 #include <random>
 #include <fstream>
@@ -229,4 +229,4 @@ double get_eta() {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#endif  // defined(EVAL_LEARN)
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index 1e4a463e..0e5fbcd2 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -3,7 +3,7 @@
 #ifndef _EVALUATE_NNUE_LEARNER_H_
 #define _EVALUATE_NNUE_LEARNER_H_
 
-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#if defined(EVAL_LEARN)
 
 #include "../learn/learn.h"
 
@@ -41,6 +41,6 @@ void CheckHealth();
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#endif  // defined(EVAL_LEARN)
 
 #endif
diff --git a/src/nnue/features/castling_right.cpp b/src/nnue/features/castling_right.cpp
index ee7b6576..47fbd986 100644
--- a/src/nnue/features/castling_right.cpp
+++ b/src/nnue/features/castling_right.cpp
@@ -1,7 +1,5 @@
 //Definition of input feature quantity K of NNUE evaluation function
 
-#if defined(EVAL_NNUE)
-
 #include "castling_right.h"
 #include "index_list.h"
 
@@ -69,5 +67,3 @@ namespace Eval {
   }  // namespace NNUE
 
 }  // namespace Eval
-
-#endif  // defined(EVAL_NNUE)
diff --git a/src/nnue/features/castling_right.h b/src/nnue/features/castling_right.h
index 3af5b074..27074080 100644
--- a/src/nnue/features/castling_right.h
+++ b/src/nnue/features/castling_right.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_FEATURES_CASTLING_RIGHT_H_
 #define _NNUE_FEATURES_CASTLING_RIGHT_H_
 
-#if defined(EVAL_NNUE)
-
 #include "../../evaluate.h"
 #include "features_common.h"
 
@@ -43,6 +41,4 @@ namespace Eval {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_NNUE)
-
 #endif
diff --git a/src/nnue/features/enpassant.cpp b/src/nnue/features/enpassant.cpp
index ea70529a..77bc936e 100644
--- a/src/nnue/features/enpassant.cpp
+++ b/src/nnue/features/enpassant.cpp
@@ -1,7 +1,5 @@
 //Definition of input feature quantity K of NNUE evaluation function
 
-#if defined(EVAL_NNUE)
-
 #include "enpassant.h"
 #include "index_list.h"
 
@@ -43,5 +41,3 @@ namespace Eval {
   }  // namespace NNUE
 
 }  // namespace Eval
-
-#endif  // defined(EVAL_NNUE)
diff --git a/src/nnue/features/enpassant.h b/src/nnue/features/enpassant.h
index f77f9c4f..70a8eb5a 100644
--- a/src/nnue/features/enpassant.h
+++ b/src/nnue/features/enpassant.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_FEATURES_ENPASSANT_H_
 #define _NNUE_FEATURES_ENPASSANT_H_
 
-#if defined(EVAL_NNUE)
-
 #include "../../evaluate.h"
 #include "features_common.h"
 
@@ -43,6 +41,4 @@ namespace Eval {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_NNUE)
-
 #endif
diff --git a/src/nnue/features/half_relative_kp.cpp b/src/nnue/features/half_relative_kp.cpp
index 015ecb73..597d65fb 100644
--- a/src/nnue/features/half_relative_kp.cpp
+++ b/src/nnue/features/half_relative_kp.cpp
@@ -1,7 +1,5 @@
 ﻿//Definition of input features HalfRelativeKP of NNUE evaluation function
 
-#if defined(EVAL_NNUE)
-
 #include "half_relative_kp.h"
 #include "index_list.h"
 
@@ -74,5 +72,3 @@ template class HalfRelativeKP<Side::kEnemy>;
 }  // namespace NNUE
 
 }  // namespace Eval
-
-#endif  // defined(EVAL_NNUE)
diff --git a/src/nnue/features/half_relative_kp.h b/src/nnue/features/half_relative_kp.h
index 2d4182e4..1b384c14 100644
--- a/src/nnue/features/half_relative_kp.h
+++ b/src/nnue/features/half_relative_kp.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_FEATURES_HALF_RELATIVE_KP_H_
 #define _NNUE_FEATURES_HALF_RELATIVE_KP_H_
 
-#if defined(EVAL_NNUE)
-
 #include "../../evaluate.h"
 #include "features_common.h"
 
@@ -60,6 +58,4 @@ class HalfRelativeKP {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_NNUE)
-
 #endif
diff --git a/src/nnue/features/k.cpp b/src/nnue/features/k.cpp
index 314b1338..38ec9997 100644
--- a/src/nnue/features/k.cpp
+++ b/src/nnue/features/k.cpp
@@ -1,7 +1,5 @@
 ﻿//Definition of input feature quantity K of NNUE evaluation function
 
-#if defined(EVAL_NNUE)
-
 #include "k.h"
 #include "index_list.h"
 
@@ -54,5 +52,3 @@ void K::AppendChangedIndices(
 }  // namespace NNUE
 
 }  // namespace Eval
-
-#endif  // defined(EVAL_NNUE)
diff --git a/src/nnue/features/k.h b/src/nnue/features/k.h
index 0c394f4e..9a0be4bb 100644
--- a/src/nnue/features/k.h
+++ b/src/nnue/features/k.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_FEATURES_K_H_
 #define _NNUE_FEATURES_K_H_
 
-#if defined(EVAL_NNUE)
-
 #include "../../evaluate.h"
 #include "features_common.h"
 
@@ -47,6 +45,4 @@ private:
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_NNUE)
-
 #endif
diff --git a/src/nnue/features/p.cpp b/src/nnue/features/p.cpp
index b4a6faf9..0c1b7d50 100644
--- a/src/nnue/features/p.cpp
+++ b/src/nnue/features/p.cpp
@@ -1,7 +1,5 @@
 ﻿//Definition of input feature P of NNUE evaluation function
 
-#if defined(EVAL_NNUE)
-
 #include "p.h"
 #include "index_list.h"
 
@@ -52,5 +50,3 @@ void P::AppendChangedIndices(
 }  // namespace NNUE
 
 }  // namespace Eval
-
-#endif  // defined(EVAL_NNUE)
diff --git a/src/nnue/features/p.h b/src/nnue/features/p.h
index b3d4191e..07d88952 100644
--- a/src/nnue/features/p.h
+++ b/src/nnue/features/p.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_FEATURES_P_H_
 #define _NNUE_FEATURES_P_H_
 
-#if defined(EVAL_NNUE)
-
 #include "../../evaluate.h"
 #include "features_common.h"
 
@@ -47,6 +45,4 @@ class P {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_NNUE)
-
 #endif
diff --git a/src/nnue/layers/sum.h b/src/nnue/layers/sum.h
index d8c7bf93..419ced89 100644
--- a/src/nnue/layers/sum.h
+++ b/src/nnue/layers/sum.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_LAYERS_SUM_H_
 #define _NNUE_LAYERS_SUM_H_
 
-#if defined(EVAL_NNUE)
-
 #include "../nnue_common.h"
 
 namespace Eval {
@@ -158,6 +156,4 @@ class Sum<PreviousLayer> {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_NNUE)
-
 #endif
diff --git a/src/nnue/nnue_test_command.cpp b/src/nnue/nnue_test_command.cpp
index 311c5ded..b8346693 100644
--- a/src/nnue/nnue_test_command.cpp
+++ b/src/nnue/nnue_test_command.cpp
@@ -1,6 +1,6 @@
 ﻿// USI extended command for NNUE evaluation function
 
-#if defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
+#if defined(ENABLE_TEST_CMD)
 
 #include "../thread.h"
 #include "../uci.h"
@@ -198,4 +198,4 @@ void TestCommand(Position& pos, std::istream& stream) {
 
 }  // namespace Eval
 
-#endif  // defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
+#endif  // defined(ENABLE_TEST_CMD)
diff --git a/src/nnue/nnue_test_command.h b/src/nnue/nnue_test_command.h
index 570ef01b..30854fd2 100644
--- a/src/nnue/nnue_test_command.h
+++ b/src/nnue/nnue_test_command.h
@@ -3,7 +3,7 @@
 #ifndef _NNUE_TEST_COMMAND_H_
 #define _NNUE_TEST_COMMAND_H_
 
-#if defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
+#if defined(ENABLE_TEST_CMD)
 
 namespace Eval {
 
@@ -16,6 +16,6 @@ void TestCommand(Position& pos, std::istream& stream);
 
 }  // namespace Eval
 
-#endif  // defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
+#endif  // defined(ENABLE_TEST_CMD)
 
 #endif
diff --git a/src/nnue/trainer/features/factorizer.h b/src/nnue/trainer/features/factorizer.h
index 148ee8ec..43950de2 100644
--- a/src/nnue/trainer/features/factorizer.h
+++ b/src/nnue/trainer/features/factorizer.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_H_
 #define _NNUE_TRAINER_FEATURES_FACTORIZER_H_
 
-#if defined(EVAL_NNUE)
-
 #include "../../nnue_common.h"
 #include "../trainer.h"
 
@@ -105,6 +103,4 @@ constexpr std::size_t GetArrayLength(const T (&/*array*/)[N]) {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_NNUE)
-
 #endif
diff --git a/src/nnue/trainer/features/factorizer_feature_set.h b/src/nnue/trainer/features/factorizer_feature_set.h
index af524719..caf6608b 100644
--- a/src/nnue/trainer/features/factorizer_feature_set.h
+++ b/src/nnue/trainer/features/factorizer_feature_set.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
 #define _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
 
-#if defined(EVAL_NNUE)
-
 #include "../../features/feature_set.h"
 #include "factorizer.h"
 
@@ -99,6 +97,4 @@ public:
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_NNUE)
-
 #endif
diff --git a/src/nnue/trainer/features/factorizer_half_kp.h b/src/nnue/trainer/features/factorizer_half_kp.h
index 955894e8..70a6acca 100644
--- a/src/nnue/trainer/features/factorizer_half_kp.h
+++ b/src/nnue/trainer/features/factorizer_half_kp.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
 #define _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
 
-#if defined(EVAL_NNUE)
-
 #include "../../features/half_kp.h"
 #include "../../features/p.h"
 #include "../../features/half_relative_kp.h"
@@ -98,6 +96,4 @@ constexpr FeatureProperties Factorizer<HalfKP<AssociatedKing>>::kProperties[];
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_NNUE)
-
 #endif
diff --git a/src/nnue/trainer/trainer.h b/src/nnue/trainer/trainer.h
index 4b467041..d526557a 100644
--- a/src/nnue/trainer/trainer.h
+++ b/src/nnue/trainer/trainer.h
@@ -3,7 +3,7 @@
 #ifndef _NNUE_TRAINER_H_
 #define _NNUE_TRAINER_H_
 
-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#if defined(EVAL_LEARN)
 
 #include "../nnue_common.h"
 #include "../features/index_list.h"
@@ -120,6 +120,6 @@ std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments) {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#endif  // defined(EVAL_LEARN)
 
 #endif
diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index da11ca29..4b5ddee6 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -3,7 +3,7 @@
 #ifndef _NNUE_TRAINER_AFFINE_TRANSFORM_H_
 #define _NNUE_TRAINER_AFFINE_TRANSFORM_H_
 
-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#if defined(EVAL_LEARN)
 
 #include "../../learn/learn.h"
 #include "../layers/affine_transform.h"
@@ -296,6 +296,6 @@ class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#endif  // defined(EVAL_LEARN)
 
 #endif
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index bd59a02d..72575bf8 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -3,7 +3,7 @@
 #ifndef _NNUE_TRAINER_CLIPPED_RELU_H_
 #define _NNUE_TRAINER_CLIPPED_RELU_H_
 
-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#if defined(EVAL_LEARN)
 
 #include "../../learn/learn.h"
 #include "../layers/clipped_relu.h"
@@ -137,6 +137,6 @@ class Trainer<Layers::ClippedReLU<PreviousLayer>> {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#endif  // defined(EVAL_LEARN)
 
 #endif
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 97dbeff4..6b94d952 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -3,7 +3,7 @@
 #ifndef _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
 #define _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
 
-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#if defined(EVAL_LEARN)
 
 #include "../../learn/learn.h"
 #include "../nnue_feature_transformer.h"
@@ -372,6 +372,6 @@ class Trainer<FeatureTransformer> {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#endif  // defined(EVAL_LEARN)
 
 #endif
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index 7d9e76c3..b6d6635b 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -3,7 +3,7 @@
 #ifndef _NNUE_TRAINER_INPUT_SLICE_H_
 #define _NNUE_TRAINER_INPUT_SLICE_H_
 
-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#if defined(EVAL_LEARN)
 
 #include "../../learn/learn.h"
 #include "../layers/input_slice.h"
@@ -246,6 +246,6 @@ class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#endif  // defined(EVAL_LEARN)
 
 #endif
diff --git a/src/nnue/trainer/trainer_sum.h b/src/nnue/trainer/trainer_sum.h
index f7bf3b3d..0b7abe36 100644
--- a/src/nnue/trainer/trainer_sum.h
+++ b/src/nnue/trainer/trainer_sum.h
@@ -3,7 +3,7 @@
 #ifndef _NNUE_TRAINER_SUM_H_
 #define _NNUE_TRAINER_SUM_H_
 
-#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#if defined(EVAL_LEARN)
 
 #include "../../learn/learn.h"
 #include "../layers/sum.h"
@@ -185,6 +185,6 @@ class Trainer<Layers::Sum<PreviousLayer>> {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+#endif  // defined(EVAL_LEARN)
 
 #endif
diff --git a/src/uci.cpp b/src/uci.cpp
index d6745d19..5be2afbb 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -32,7 +32,7 @@
 #include "uci.h"
 #include "syzygy/tbprobe.h"
 
-#if defined(EVAL_NNUE) && defined(ENABLE_TEST_CMD)
+#if defined(ENABLE_TEST_CMD)
 #include "nnue/nnue_test_command.h"
 #endif
 
@@ -67,7 +67,7 @@ namespace Learner
 }
 #endif
 
-#if defined(EVAL_NNUE) && defined(ENABLE_TEST_CMD)
+#if defined(ENABLE_TEST_CMD)
 void test_cmd(Position& pos, istringstream& is)
 {
     // Initialize as it may be searched.
@@ -373,7 +373,7 @@ void UCI::loop(int argc, char* argv[]) {
 
 #endif
 
-#if defined(EVAL_NNUE) && defined(ENABLE_TEST_CMD)
+#if defined(ENABLE_TEST_CMD)
       // test command
       else if (token == "test") test_cmd(pos, is);
 #endif
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 0007b559..4f9fab5e 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -83,7 +83,6 @@ void init(OptionsMap& o) {
   // The default must follow the format nn-[SHA256 first 12 digits].nnue
   // for the build process (profile-build and fishtest) to work.
   o["EvalFile"]              << Option("nn-82215d0fd0df.nnue", on_eval_file);
-#ifdef EVAL_NNUE
   // When the evaluation function is loaded at the ucinewgame timing, it is necessary to convert the new evaluation function.
   // I want to hit the test eval convert command, but there is no new evaluation function
   // It ends abnormally before executing this command.
@@ -92,7 +91,6 @@ void init(OptionsMap& o) {
   o["SkipLoadingEval"]       << Option(false);
   // how many moves to use a fixed move
   // o["BookMoves"] << Option(16, 0, 10000);
-#endif
 #if defined(EVAL_LEARN)
   // When learning the evaluation function, you can change the folder to save the evaluation function.
   // Evalsave by default. This folder shall be prepared in advance.

From e6a6ba52213290d0996913ec6367a8364c5199ec Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Tue, 8 Sep 2020 22:49:55 +0900
Subject: [PATCH 208/583] Removed USE_BOOK macro.

---
 src/learn/gensfen.cpp | 40 ----------------------------------------
 src/learn/learner.cpp |  5 -----
 2 files changed, 45 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 9ae83174..589d9559 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -11,10 +11,6 @@
 #include "learn.h"
 #include "multi_think.h"
 
-#if defined(USE_BOOK)
-#include "../extra/book/book.h"
-#endif
-
 #include <chrono>
 #include <climits>
 #include <cmath>
@@ -750,11 +746,6 @@ namespace Learner
             auto& pos = th->rootPos;
             pos.set(StartFEN, false, &si, th);
 
-#if defined(USE_BOOK)
-            // Refer to the members of BookMoveSelector defined in the search section.
-            auto& book = ::book;
-#endif
-
             // Vector for holding the sfens in the current simulated game.
             PSVector a_psv;
             a_psv.reserve(write_maxply + MAX_PLY);
@@ -788,35 +779,7 @@ namespace Learner
                     flush_psv(result.value());
                     break;
                 }
-#if defined(USE_BOOK)
-                if ((next_move = book.probe(pos)) != MOVE_NONE)
-                {
-                    // Hit the constant track.
-                    // The move was stored in next_move.
 
-                    // Do not use the fixed phase for learning.
-                    sfens.clear();
-
-                    if (random_move_minply != -1)
-                    {
-                        // Random move is performed with a certain
-                        // probability even in the constant phase.
-                        goto RANDOM_MOVE;
-                    }
-                    else
-                    {
-                        // When -1 is specified as random_move_minply,
-                        // it points according to the standard until
-                        // it goes out of the standard.
-                        // Prepare an innumerable number of situations
-                        // that have left the constant as
-                        // ConsiderationBookMoveCount true using a huge constant
-                        // Used for purposes such as performing
-                        // a random move 5 times from there.
-                        goto DO_MOVE;
-                    }
-                }
-#endif
                 {
                     auto [search_value, search_pv] = search(pos, depth, 1, nodes);
 
@@ -1124,9 +1087,6 @@ namespace Learner
             << "  loop_max = " << loop_max << endl
             << "  eval_limit = " << eval_limit << endl
             << "  thread_num (set by USI setoption) = " << thread_num << endl
-#if defined(USE_BOOK)
-            << "  book_moves (set by USI setoption) = " << Options["BookMoves"] << endl
-#endif
             << "  random_move_minply     = " << random_move_minply << endl
             << "  random_move_maxply     = " << random_move_maxply << endl
             << "  random_move_count      = " << random_move_count << endl
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 358848ec..e7f021fe 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -54,11 +54,6 @@
 
 using namespace std;
 
-#if defined(USE_BOOK)
-// This is defined in the search section.
-extern Book::BookMoveSelector book;
-#endif
-
 template <typename T>
 T operator +=(std::atomic<T>& x, const T rhs)
 {

From 21cfead52c2a77abc4e9eed21739ccc3df9826c0 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Tue, 8 Sep 2020 22:52:46 +0900
Subject: [PATCH 209/583] Removed unused OMP_ macro.

---
 src/learn/convert.cpp        | 4 ----
 src/learn/gensfen.cpp        | 4 ----
 src/learn/learning_tools.cpp | 3 ---
 3 files changed, 11 deletions(-)

diff --git a/src/learn/convert.cpp b/src/learn/convert.cpp
index 9bd9548d..d07fc00c 100644
--- a/src/learn/convert.cpp
+++ b/src/learn/convert.cpp
@@ -27,10 +27,6 @@
 #include <regex>
 #include <filesystem>
 
-#if defined (_OPENMP)
-#include <omp.h>
-#endif
-
 using namespace std;
 
 namespace Learner
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 589d9559..65e64177 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -28,10 +28,6 @@
 #include <sstream>
 #include <unordered_set>
 
-#if defined (_OPENMP)
-#include <omp.h>
-#endif
-
 using namespace std;
 
 namespace Learner
diff --git a/src/learn/learning_tools.cpp b/src/learn/learning_tools.cpp
index de6da9c5..eca11c47 100644
--- a/src/learn/learning_tools.cpp
+++ b/src/learn/learning_tools.cpp
@@ -2,9 +2,6 @@
 
 #if defined (EVAL_LEARN)
 
-#if defined(_OPENMP)
-#include <omp.h>
-#endif
 #include "../misc.h"
 
 using namespace Eval;

From 1d00d002412e11505430a9da32297b81e11b6801 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Tue, 8 Sep 2020 22:59:57 +0900
Subject: [PATCH 210/583] Removed ENABLE_TEST_CMD macro.

---
 src/Makefile                   |  6 +++---
 src/nnue/nnue_test_command.cpp |  4 ----
 src/nnue/nnue_test_command.h   |  4 ----
 src/uci.cpp                    | 11 ++---------
 4 files changed, 5 insertions(+), 20 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index a07e1251..49c6c1b3 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -903,7 +903,7 @@ icc-profile-use:
 
 learn: config-sanity
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
-	EXTRACXXFLAGS=' -DEVAL_LEARN -DENABLE_TEST_CMD -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	EXTRACXXFLAGS=' -DEVAL_LEARN -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
 	EXTRALDFLAGS=' $(BLASLDFLAGS) -fopenmp  ' \
 	all
 
@@ -911,7 +911,7 @@ profile-learn: net config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 1/4. Building instrumented executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) \
-	LEARNCXXFLAGS=' -DEVAL_LEARN -DENABLE_TEST_CMD -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	LEARNCXXFLAGS=' -DEVAL_LEARN -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
 	LEARNLDFLAGS='  $(BLASLDFLAGS) -fopenmp '
 	@echo ""
 	@echo "Step 2/4. Running benchmark for pgo-build ..."
@@ -920,7 +920,7 @@ profile-learn: net config-sanity objclean profileclean
 	@echo "Step 3/4. Building optimized executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) objclean
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_use) \
-	LEARNCXXFLAGS=' -DEVAL_LEARN -DENABLE_TEST_CMD -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	LEARNCXXFLAGS=' -DEVAL_LEARN -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
 	LEARNLDFLAGS=' $(BLASLDFLAGS) -fopenmp '
 	@echo ""
 	@echo "Step 4/4. Deleting profile data ..."
diff --git a/src/nnue/nnue_test_command.cpp b/src/nnue/nnue_test_command.cpp
index b8346693..c3a53c7d 100644
--- a/src/nnue/nnue_test_command.cpp
+++ b/src/nnue/nnue_test_command.cpp
@@ -1,7 +1,5 @@
 ﻿// USI extended command for NNUE evaluation function
 
-#if defined(ENABLE_TEST_CMD)
-
 #include "../thread.h"
 #include "../uci.h"
 #include "evaluate_nnue.h"
@@ -197,5 +195,3 @@ void TestCommand(Position& pos, std::istream& stream) {
 }  // namespace NNUE
 
 }  // namespace Eval
-
-#endif  // defined(ENABLE_TEST_CMD)
diff --git a/src/nnue/nnue_test_command.h b/src/nnue/nnue_test_command.h
index 30854fd2..75d33e82 100644
--- a/src/nnue/nnue_test_command.h
+++ b/src/nnue/nnue_test_command.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TEST_COMMAND_H_
 #define _NNUE_TEST_COMMAND_H_
 
-#if defined(ENABLE_TEST_CMD)
-
 namespace Eval {
 
 namespace NNUE {
@@ -16,6 +14,4 @@ void TestCommand(Position& pos, std::istream& stream);
 
 }  // namespace Eval
 
-#endif  // defined(ENABLE_TEST_CMD)
-
 #endif
diff --git a/src/uci.cpp b/src/uci.cpp
index 5be2afbb..1454e4e0 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -24,17 +24,14 @@
 
 #include "evaluate.h"
 #include "movegen.h"
+#include "nnue/nnue_test_command.h"
 #include "position.h"
 #include "search.h"
+#include "syzygy/tbprobe.h"
 #include "thread.h"
 #include "timeman.h"
 #include "tt.h"
 #include "uci.h"
-#include "syzygy/tbprobe.h"
-
-#if defined(ENABLE_TEST_CMD)
-#include "nnue/nnue_test_command.h"
-#endif
 
 using namespace std;
 
@@ -67,7 +64,6 @@ namespace Learner
 }
 #endif
 
-#if defined(ENABLE_TEST_CMD)
 void test_cmd(Position& pos, istringstream& is)
 {
     // Initialize as it may be searched.
@@ -78,7 +74,6 @@ void test_cmd(Position& pos, istringstream& is)
 
     if (param == "nnue") Eval::NNUE::TestCommand(pos, is);
 }
-#endif
 
 namespace {
 
@@ -373,10 +368,8 @@ void UCI::loop(int argc, char* argv[]) {
 
 #endif
 
-#if defined(ENABLE_TEST_CMD)
       // test command
       else if (token == "test") test_cmd(pos, is);
-#endif
       else
           sync_cout << "Unknown command: " << cmd << sync_endl;
 

From 458771a18199d4f64f4190521bea4aa91460c462 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Tue, 8 Sep 2020 23:02:31 +0900
Subject: [PATCH 211/583] Removed GENSFEN2019 macro.

---
 src/uci.cpp | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/src/uci.cpp b/src/uci.cpp
index 1454e4e0..6675f2e0 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -50,11 +50,6 @@ namespace Learner
   // Learning from the generated game record
   void learn(Position& pos, istringstream& is);
 
-#if defined(GENSFEN2019)
-  // Automatic generation command of teacher phase under development
-  void gen_sfen2019(Position& pos, istringstream& is);
-#endif
-
   // A pair of reader and evaluation value. Returned by Learner::search(),Learner::qsearch().
   typedef std::pair<Value, std::vector<Move> > ValueAndPV;
 
@@ -358,10 +353,6 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "gensfen") Learner::gen_sfen(pos, is);
       else if (token == "learn") Learner::learn(pos, is);
 
-#if defined (GENSFEN2019)
-	  // Command to generate teacher phase under development
-      else if (token == "gensfen2019") Learner::gen_sfen2019(pos, is);
-#endif
       // Command to call qsearch(),search() directly for testing
       else if (token == "qsearch") qsearch_cmd(pos);
       else if (token == "search") search_cmd(pos, is);

From 04a9a951b8611d6f176d49c9edd24d22ec5ba457 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Tue, 8 Sep 2020 23:08:39 +0900
Subject: [PATCH 212/583] Removed "#if 0" and "#if 1".

---
 src/learn/gensfen.cpp | 24 +++------------------
 src/learn/learner.cpp | 50 -------------------------------------------
 2 files changed, 3 insertions(+), 71 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 65e64177..ec3de570 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -166,7 +166,7 @@ namespace Learner
                         output_file_stream.write(reinterpret_cast<const char*>(buf->data()), sizeof(PackedSfenValue) * buf->size());
 
                         sfen_write_count += buf->size();
-#if 1
+
                         // Add the processed number here, and if it exceeds save_every,
                         // change the file name and reset this counter.
                         sfen_write_count_current_file += buf->size();
@@ -186,7 +186,7 @@ namespace Learner
                             output_file_stream.open(new_filename, ios::out | ios::binary | ios::app);
                             cout << endl << "output sfen file = " << new_filename << endl;
                         }
-#endif
+
                         // Output '.' every time when writing a game record.
                         std::cout << ".";
 
@@ -519,10 +519,6 @@ namespace Learner
         {
             // Write out one sfen.
             sfen_writer.write(thread_id, *it);
-#if 0
-            pos.set_from_packed_sfen(it->sfen);
-            cout << pos << "Win : " << it->is_win << " , " << it->score << endl;
-#endif
         }
 
         return quit;
@@ -667,13 +663,12 @@ namespace Learner
 
         for (auto m : pv)
         {
-#if 1
             // There should be no illegal move. This is as a debugging precaution.
             if (!pos.pseudo_legal(m) || !pos.legal(m))
             {
                 cout << "Error! : " << pos.fen() << m << endl;
             }
-#endif
+
             pos.do_move(m, states[ply++]);
 
             // Because the difference calculation of evaluate() cannot be
@@ -803,19 +798,6 @@ namespace Learner
                     // Save the move score for adjudication.
                     move_hist_scores.push_back(search_value);
 
-#if 0
-                    dbg_hit_on(search_value == leaf_value);
-                    // gensfen depth 3 eval_limit 32000
-                    // Total 217749 Hits 203579 hit rate (%) 93.490
-                    // gensfen depth 6 eval_limit 32000
-                    // Total 78407 Hits 69190 hit rate (%) 88.245
-                    // gensfen depth 6 eval_limit 3000
-                    // Total 53879 Hits 43713 hit rate (%) 81.132
-
-                    // Problems such as pruning with moves in the substitution table.
-                    // This is a little uncomfortable as a teacher...
-#endif
-
                     // If depth 0, pv is not obtained, so search again at depth 2.
                     if (search_depth_min <= 0)
                     {
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index e7f021fe..2f1d27b2 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -1163,11 +1163,6 @@ namespace Learner
             if (ps.gamePly < prng.rand(reduction_gameply))
                 goto RETRY_READ;
 
-#if 0
-            auto sfen = pos.sfen_unpack(ps.data);
-            pos.set(sfen);
-#endif
-            // ↑ Since it is slow when passing through sfen, I made a dedicated function.
             StateInfo si;
             const bool mirror = prng.rand(100) < mirror_percentage;
             if (pos.set_from_packed_sfen(ps.sfen, &si, th, mirror) != 0)
@@ -1207,28 +1202,6 @@ namespace Learner
             // If it is the result of searching a completely different place, it may become noise.
             // It may be better not to study where the difference in evaluation values ​​is too large.
 
-#if 0
-            // If you do this, about 13% of the phases will be excluded 
-            // from the learning target. Good and bad are subtle.
-            if (pv.size() >= 1 && (uint16_t)pv[0] != ps.move)
-            {
-                //dbg_hit_on(false);
-                continue;
-            }
-#endif
-
-#if 0
-            // It may be better not to study where the difference in evaluation values ​​is too large.
-            // → It's okay because it passes the win rate function... 
-            // About 30% of the phases are out of the scope of learning...
-            if (abs((int16_t)r.first - ps.score) >= Eval::PawnValue * 4)
-            {
-                //dbg_hit_on(false);
-                continue;
-            }
-            //dbg_hit_on(true);
-#endif
-
             int ply = 0;
 
             // A helper function that adds the gradient to the current phase.
@@ -1315,17 +1288,6 @@ namespace Learner
             // rewind the phase
             for (auto it = pv.rbegin(); it != pv.rend(); ++it)
                 pos.undo_move(*it);
-
-#if 0
-            // When adding the gradient to the root phase
-            shallow_value = 
-                (rootColor == pos.side_to_move()) 
-                ? Eval::evaluate(pos) 
-                : -Eval::evaluate(pos);
-
-            dj_dw = calc_grad(deep_value, shallow_value, ps);
-            Eval::add_grad(pos, rootColor, dj_dw, without_kpp);
-#endif
         }
 
     }
@@ -2058,18 +2020,6 @@ namespace Learner
             learn_think.best_nn_directory = std::string(Options["EvalDir"]);
         }
 
-#if 0
-        // A test to give a gradient of 1.0 to the initial stage of Hirate.
-        pos.set_hirate();
-        cout << Eval::evaluate(pos) << endl;
-        //Eval::print_eval_stat(pos);
-        Eval::add_grad(pos, BLACK, 32.0, false);
-        Eval::update_weights(1);
-        pos.state()->sum.p[2][0] = VALUE_NOT_EVALUATED;
-        cout << Eval::evaluate(pos) << endl;
-        //Eval::print_eval_stat(pos);
-#endif
-
         cout << "init done." << endl;
 
         // Reflect other option settings.

From ec96409176fa8f2cdc2e8a003150fcabf037f85c Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Tue, 8 Sep 2020 23:30:57 +0900
Subject: [PATCH 213/583] Replaced DNDEBUG macro to _DEBUG macro.

---
 src/learn/gensfen.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index ec3de570..0232e5d4 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -70,7 +70,7 @@ namespace Learner
             file_worker_thread.join();
             output_file_stream.close();
 
-#if !defined(DNDEBUG)
+#if defined(_DEBUG)
             {
                 // All buffers should be empty since file_worker_thread
                 // should have written everything before exiting.

From aa2452caf39446fded3c0ee79c18c3ecb43369b3 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Tue, 8 Sep 2020 23:45:19 +0900
Subject: [PATCH 214/583] Removed #if for USE_EVAL_HASH.

---
 src/eval/evaluate_common.h |  6 ------
 src/learn/gensfen.cpp      | 10 ----------
 src/learn/learner.cpp      |  2 --
 3 files changed, 18 deletions(-)

diff --git a/src/eval/evaluate_common.h b/src/eval/evaluate_common.h
index 3fb161ab..927783cd 100644
--- a/src/eval/evaluate_common.h
+++ b/src/eval/evaluate_common.h
@@ -18,12 +18,6 @@
 
 namespace Eval
 {
-
-#if defined(USE_EVAL_HASH)
-	// prefetch function
-	void prefetch_evalhash(const Key key);
-#endif
-
 	// An operator that applies the function f to each parameter of the evaluation function.
 	// Used for parameter analysis etc.
 	// type indicates the survey target.
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 0232e5d4..4050d983 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -956,12 +956,6 @@ namespace Learner
 
         string token;
 
-        // When hit to eval hash, as a evaluation value near the initial stage, if a hash collision occurs and a large value is written
-        // When eval_limit is set small, eval_limit will be exceeded every time in the initial phase, and phase generation will not proceed.
-        // Therefore, eval hash needs to be disabled.
-        // After that, when the hash of the eval hash collides, the evaluation value of a strange value is used, and it may be unpleasant to use it for the teacher.
-        bool use_eval_hash = false;
-
         // Save to file in this unit.
         // File names are serialized like file_1.bin, file_2.bin.
         uint64_t save_every = UINT64_MAX;
@@ -1010,8 +1004,6 @@ namespace Learner
                 is >> write_minply;
             else if (token == "write_maxply")
                 is >> write_maxply;
-            else if (token == "use_eval_hash")
-                is >> use_eval_hash;
             else if (token == "save_every")
                 is >> save_every;
             else if (token == "random_file_name")
@@ -1033,7 +1025,6 @@ namespace Learner
 #if defined(USE_GLOBAL_OPTIONS)
         // Save it for later restore.
         auto oldGlobalOptions = GlobalOptions;
-        GlobalOptions.use_eval_hash = use_eval_hash;
 #endif
 
         // If search depth2 is not set, leave it the same as search depth.
@@ -1075,7 +1066,6 @@ namespace Learner
             << "  write_minply           = " << write_minply << endl
             << "  write_maxply           = " << write_maxply << endl
             << "  output_file_name       = " << output_file_name << endl
-            << "  use_eval_hash          = " << use_eval_hash << endl
             << "  save_every             = " << save_every << endl
             << "  random_file_name       = " << random_file_name << endl
             << "  write_out_draw_game_in_training_data_generation = " << write_out_draw_game_in_training_data_generation << endl
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 2f1d27b2..9e6f10cb 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -1639,8 +1639,6 @@ namespace Learner
 #if defined(USE_GLOBAL_OPTIONS)
     // Save it for later restore.
         auto oldGlobalOptions = GlobalOptions;
-        // If you hit the eval hash, you can not calculate rmse etc. so turn it off.
-        GlobalOptions.use_eval_hash = false;
         // If you hit the replacement table, pruning may occur at the previous evaluation value, so turn it off.
         GlobalOptions.use_hash_probe = false;
 #endif

From 82dc68ba9ffe1d5fe849eef1f0fcc565ef810512 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Tue, 8 Sep 2020 23:47:04 +0900
Subject: [PATCH 215/583] Removed #if for USE_GLOBAL_OPTIONS.

---
 src/learn/gensfen.cpp | 11 -----------
 src/learn/learner.cpp | 12 ------------
 src/search.cpp        | 11 -----------
 3 files changed, 34 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 4050d983..3d015acf 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -1022,11 +1022,6 @@ namespace Learner
                 cout << "Error! : Illegal token " << token << endl;
         }
 
-#if defined(USE_GLOBAL_OPTIONS)
-        // Save it for later restore.
-        auto oldGlobalOptions = GlobalOptions;
-#endif
-
         // If search depth2 is not set, leave it the same as search depth.
         if (search_depth_max == INT_MIN)
             search_depth_max = search_depth_min;
@@ -1103,12 +1098,6 @@ namespace Learner
         }
 
         std::cout << "gensfen finished." << endl;
-
-#if defined(USE_GLOBAL_OPTIONS)
-        // Restore Global Options.
-        GlobalOptions = oldGlobalOptions;
-#endif
-
     }
 }
 #endif
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 9e6f10cb..daea9594 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -1636,13 +1636,6 @@ namespace Learner
         uint64_t eta1_epoch = 0; // eta2 is not applied by default
         uint64_t eta2_epoch = 0; // eta3 is not applied by default
 
-#if defined(USE_GLOBAL_OPTIONS)
-    // Save it for later restore.
-        auto oldGlobalOptions = GlobalOptions;
-        // If you hit the replacement table, pruning may occur at the previous evaluation value, so turn it off.
-        GlobalOptions.use_hash_probe = false;
-#endif
-
         // --- Function that only shuffles the teacher aspect
 
         // normal shuffle
@@ -2072,11 +2065,6 @@ namespace Learner
 
         // Save once at the end.
         learn_think.save(true);
-
-#if defined(USE_GLOBAL_OPTIONS)
-        // Restore Global Options.
-        GlobalOptions = oldGlobalOptions;
-#endif
     }
 
 } // namespace Learner
diff --git a/src/search.cpp b/src/search.cpp
index 8f258ae4..67348a2b 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -2070,17 +2070,6 @@ namespace Learner
         rootMoves.push_back(Search::RootMove(m));
 
       assert(!rootMoves.empty());
-
-      //#if defined(USE_GLOBAL_OPTIONS)
-      // Since the generation of the substitution table for each search thread should be managed,
-      // Increase the generation of the substitution table for this thread because it is a new search.
-            //TT.new_search(th->thread_id());
-
-            // ª If you call new_search here, it may be a loss because you can't use the previous search result.
-            // Do not do this here, but caller should do TT.new_search(th->thread_id()) for each station ...
-
-            // ¨Because we want to avoid reaching the same final diagram, use the substitution table commonly for all threads when generating teachers.
-      //#endif
     }
   }
 

From 05d26499b42878447a21b6d721f4868151357665 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Tue, 8 Sep 2020 23:57:51 +0900
Subject: [PATCH 216/583] Removed LEARN_ELMO_METHOD macro.

---
 src/learn/learn.h | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/src/learn/learn.h b/src/learn/learn.h
index 1bc39cf9..7285f61a 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -5,18 +5,6 @@
 
 #include <vector>
 
-// =====================
-// Settings for learning
-// =====================
-
-// If you select one of the following, the details after that will be automatically selected.
-// If you don't select any of them, you need to set the subsequent details one by one.
-
-// Learning setting by elmo method. This is the default setting.
-// To make a standard squeeze diaphragm, specify "lambda 1" with the learn command.
-#define LEARN_ELMO_METHOD
-
-
 // ----------------------
 // update formula
 // ----------------------
@@ -147,10 +135,8 @@ typedef float LearnFloatType;
 // Learning with the method of elmo (WCSC27)
 // ----------------------
 
-#if defined( LEARN_ELMO_METHOD )
 #define LOSS_FUNCTION_IS_ELMO_METHOD
 #define ADA_GRAD_UPDATE
-#endif
 
 // Character string according to update formula. (Output for debugging.)
 // Implemented various update expressions, but concluded that AdaGrad is the best in terms of speed and memory.

From 0271d707759117af6557beb93319aa51c07280aa Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 00:01:53 +0900
Subject: [PATCH 217/583] Removed ADA_GRAD_UPDATE macro.

---
 src/learn/learn.h          | 10 +------
 src/learn/learning_tools.h | 54 +-------------------------------------
 2 files changed, 2 insertions(+), 62 deletions(-)

diff --git a/src/learn/learn.h b/src/learn/learn.h
index 7285f61a..8fb6217f 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -9,9 +9,6 @@
 // update formula
 // ----------------------
 
-// Ada Grad. Recommended because it is stable.
-// #define ADA_GRAD_UPDATE
-
 // SGD looking only at the sign of the gradient. It requires less memory, but the accuracy is...
 // #define SGD_UPDATE
 
@@ -136,13 +133,8 @@ typedef float LearnFloatType;
 // ----------------------
 
 #define LOSS_FUNCTION_IS_ELMO_METHOD
-#define ADA_GRAD_UPDATE
 
-// Character string according to update formula. (Output for debugging.)
-// Implemented various update expressions, but concluded that AdaGrad is the best in terms of speed and memory.
-#if defined(ADA_GRAD_UPDATE)
-#define LEARN_UPDATE "AdaGrad"
-#elif defined(SGD_UPDATE)
+#if defined(SGD_UPDATE)
 #define LEARN_UPDATE "SGD"
 #endif
 
diff --git a/src/learn/learning_tools.h b/src/learn/learning_tools.h
index 3c4be08a..854133e4 100644
--- a/src/learn/learning_tools.h
+++ b/src/learn/learning_tools.h
@@ -76,59 +76,7 @@ namespace EvalLearningTools
 
 		template <typename T> void updateFV(T& v) { updateFV(v, 1.0); }
 
-#if defined (ADA_GRAD_UPDATE)
-
-		// Since the maximum value that can be accurately calculated with float is INT16_MAX*256-1
-		// Keep the small value as a marker.
-		const LearnFloatType V0_NOT_INIT = (INT16_MAX * 128);
-
-		// What holds v internally. The previous implementation kept a fixed decimal with only a fractional part to save memory,
-		// Since it is doubtful in accuracy and the visibility is bad, it was abolished.
-		LearnFloatType v0 = LearnFloatType(V0_NOT_INIT);
-
-		// AdaGrad g2
-		LearnFloatType g2 = LearnFloatType(0);
-
-		// update with AdaGrad
-		// When executing this function, the value of g and the member do not change
-		// Guaranteed by the caller. It does not have to be an atomic operation.
-		// k is a coefficient for eta. 1.0 is usually sufficient. If you want to lower eta for your turn item, set this to 1/8.0 etc.
-		template <typename T>
-		void updateFV(T& v,double k)
-		{
-			// AdaGrad update formula
-			// Gradient vector is g, vector to be updated is v, η(eta) is a constant,
-			//     g2 = g2 + g^2
-			//     v = v - ηg/sqrt(g2)
-
-			constexpr double epsilon = 0.000001;
-
-			if (g == LearnFloatType(0))
-				return;
-
-			g2 += g * g;
-
-			// If v0 is V0_NOT_INIT, it means that the value is not initialized with the value of KK/KKP/KPP array,
-			// In this case, read the value of v from the one passed in the argument.
-			double V = (v0 == V0_NOT_INIT) ? v : v0;
-
-			V -= k * eta * (double)g / sqrt((double)g2 + epsilon);
-
-			// Limit the value of V to be within the range of types.
-			// By the way, windows.h defines the min and max macros, so to avoid it,
-			// Here, it is enclosed in parentheses so that it is not treated as a function-like macro.
-			V = (std::min)((double)(std::numeric_limits<T>::max)() , V);
-			V = (std::max)((double)(std::numeric_limits<T>::min)() , V);
-
-			v0 = (LearnFloatType)V;
-			v = (T)round(V);
-
-			// Clear g because one update of mini-batch for this element is over
-			// g[i] = 0;
-			// → There is a problem of dimension reduction, so this will be done by the caller.
-		}
-
-#elif defined(SGD_UPDATE)
+#if defined(SGD_UPDATE)
 
 		// See only the sign of the gradient Update with SGD
 		// When executing this function, the value of g and the member do not change

From f3a158725d573753cf4b81fc5866c0f3bbdb1e88 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 00:07:09 +0900
Subject: [PATCH 218/583] Removed SGD_UPDATE macro.

---
 src/learn/learn.h          | 12 ---------
 src/learn/learning_tools.h | 51 ++------------------------------------
 2 files changed, 2 insertions(+), 61 deletions(-)

diff --git a/src/learn/learn.h b/src/learn/learn.h
index 8fb6217f..91b40213 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -5,14 +5,6 @@
 
 #include <vector>
 
-// ----------------------
-// update formula
-// ----------------------
-
-// SGD looking only at the sign of the gradient. It requires less memory, but the accuracy is...
-// #define SGD_UPDATE
-
-
 // ----------------------
 // Select the objective function
 // ----------------------
@@ -134,10 +126,6 @@ typedef float LearnFloatType;
 
 #define LOSS_FUNCTION_IS_ELMO_METHOD
 
-#if defined(SGD_UPDATE)
-#define LEARN_UPDATE "SGD"
-#endif
-
 #if defined(LOSS_FUNCTION_IS_WINNING_PERCENTAGE)
 #define LOSS_FUNCTION "WINNING_PERCENTAGE"
 #elif defined(LOSS_FUNCTION_IS_CROSS_ENTOROPY)
diff --git a/src/learn/learning_tools.h b/src/learn/learning_tools.h
index 854133e4..348105b6 100644
--- a/src/learn/learning_tools.h
+++ b/src/learn/learning_tools.h
@@ -4,13 +4,12 @@
 // A set of machine learning tools related to the weight array used for machine learning of evaluation functions
 
 #include "learn.h"
+
 #if defined (EVAL_LEARN)
-#include <array>
 
-#if defined(SGD_UPDATE) || defined(USE_KPPP_MIRROR_WRITE)
 #include "../misc.h"  // PRNG , my_insertion_sort
-#endif
 
+#include <array>
 #include <cmath>	// std::sqrt()
 
 namespace EvalLearningTools
@@ -29,14 +28,6 @@ namespace EvalLearningTools
 		// cumulative value of one mini-batch gradient
 		LearnFloatType g = LearnFloatType(0);
 
-		// When ADA_GRAD_UPDATE. LearnFloatType == float,
-		// total 4*2 + 4*2 + 1*2 = 18 bytes
-		// It suffices to secure a Weight array that is 4.5 times the size of the evaluation function parameter of 1GB.
-		// However, sizeof(Weight)==20 code is generated if the structure alignment is in 4-byte units, so
-		// Specify pragma pack(2).
-
-		// For SGD_UPDATE, this structure is reduced by 10 bytes to 8 bytes.
-
 		// Learning rate η(eta) such as AdaGrad.
 		// It is assumed that eta1,2,3,eta1_epoch,eta2_epoch have been set by the time updateFV() is called.
 		// The epoch of update_weights() gradually changes from eta1 to eta2 until eta1_epoch.
@@ -76,44 +67,6 @@ namespace EvalLearningTools
 
 		template <typename T> void updateFV(T& v) { updateFV(v, 1.0); }
 
-#if defined(SGD_UPDATE)
-
-		// See only the sign of the gradient Update with SGD
-		// When executing this function, the value of g and the member do not change
-		// Guaranteed by the caller. It does not have to be an atomic operation.
-		template <typename T>
-		void updateFV(T & v , double k)
-		{
-			if (g == 0)
-				return;
-
-			// See only the sign of g and update.
-			// If g <0, add v a little.
-			// If g> 0, subtract v slightly.
-
-			// Since we only add integers, no decimal part is required.
-
-			// It's a good idea to move around 0-5.
-			// It is better to have a Gaussian distribution, so generate a 5-bit random number (each bit has a 1/2 probability of 1),
-			// Pop_count() it. At this time, it has a binomial distribution.
-			//int16_t diff = (int16_t)POPCNT32((u32)prng.rand(31));
-			// → If I do this with 80 threads, this AsyncPRNG::rand() locks, so I slowed down. This implementation is not good.
-			int16_t diff = 1;
-
-			double V = v;
-			if (g > 0.0)
-				V-= diff;
-			else
-				V+= diff;
-
-			V = (std::min)((double)(std::numeric_limits<T>::max)(), V);
-			V = (std::max)((double)(std::numeric_limits<T>::min)(), V);
-
-			v = (T)V;
-		}
-
-#endif
-
 		// grad setting
 		template <typename T> void set_grad(const T& g_) { g = g_; }
 

From d37eb63581ce2de8fd1a8406a9bc06b6377d2176 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 00:08:56 +0900
Subject: [PATCH 219/583] Removed LOSS_FUNCTION_IS_WINNING_PERCENTAGE macro.

---
 src/learn/learn.h     |  9 +--------
 src/learn/learner.cpp | 36 ------------------------------------
 2 files changed, 1 insertion(+), 44 deletions(-)

diff --git a/src/learn/learn.h b/src/learn/learn.h
index 91b40213..9d783986 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -9,11 +9,6 @@
 // Select the objective function
 // ----------------------
 
-// The objective function is the sum of squares of the difference in winning percentage
-// See learner.cpp for more information.
-
-//#define LOSS_FUNCTION_IS_WINNING_PERCENTAGE
-
 // Objective function is cross entropy
 // See learner.cpp for more information.
 // So-called ordinary "rag cloth squeezer"
@@ -126,9 +121,7 @@ typedef float LearnFloatType;
 
 #define LOSS_FUNCTION_IS_ELMO_METHOD
 
-#if defined(LOSS_FUNCTION_IS_WINNING_PERCENTAGE)
-#define LOSS_FUNCTION "WINNING_PERCENTAGE"
-#elif defined(LOSS_FUNCTION_IS_CROSS_ENTOROPY)
+#if defined(LOSS_FUNCTION_IS_CROSS_ENTOROPY)
 #define LOSS_FUNCTION "CROSS_ENTOROPY"
 #elif defined(LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE)
 #define LOSS_FUNCTION "CROSS_ENTOROPY_FOR_VALUE"
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index daea9594..e9658da6 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -163,42 +163,6 @@ namespace Learner
         return ((y2 - y1) / epsilon) / winning_probability_coefficient;
     }
 
-    // When the objective function is the sum of squares of the difference in winning percentage
-#if defined (LOSS_FUNCTION_IS_WINNING_PERCENTAGE)
-// function to calculate the gradient
-    double calc_grad(Value deep, Value shallow, PackedSfenValue& psv)
-    {
-        // The square of the win rate difference minimizes it in the objective function.
-        // Objective function J = 1/2m Σ (win_rate(shallow)-win_rate(deep) )^2
-        // However, σ is a sigmoid function that converts the 
-        // evaluation value into the difference in the winning percentage.
-        // m is the number of samples. shallow is the evaluation value 
-        // for a shallow search (qsearch()). deep is the evaluation value for deep search.
-        // If W is the feature vector (parameter of the evaluation function) 
-        // and Xi and Yi are teachers
-        // shallow = W*Xi // * is the Hadamard product, transposing W and meaning X
-        // f(Xi) = win_rate(W*Xi)
-        // If σ(i th deep) = Yi,
-        // J = m/2 Σ (f(Xi)-Yi )^2
-        // becomes a common expression.
-        // W is a vector, and if we write the jth element as Wj, from the chain rule
-        // ∂J/∂Wj = ∂J/∂f ・∂f/∂W ・∂W/∂Wj
-        // = 1/m Σ (f(Xi)-y) ・f'(Xi) ・ 1
-
-        // 1/m will be multiplied later, but the contents of Σ can 
-        // be retained in the array as the value of the gradient.
-        // f'(Xi) = win_rate'(shallow) = sigmoid'(shallow/600) = dsigmoid(shallow / 600) / 600
-        // This /600 at the end is adjusted by the learning rate, so do not write it..
-        // Also, the coefficient of 1/m is unnecessary if you use the update 
-        // formula that has the automatic gradient adjustment function like Adam and AdaGrad.
-        // Therefore, it is not necessary to save it in memory.
-
-        const double p = winning_percentage(deep, psv.gamePly);
-        const double q = winning_percentage(shallow, psv.gamePly);
-        return (q - p) * Math::dsigmoid(double(shallow) / 600.0);
-    }
-#endif
-
 #if defined (LOSS_FUNCTION_IS_CROSS_ENTOROPY)
     double calc_grad(Value deep, Value shallow, const PackedSfenValue& psv)
     {

From f52fbf8006174023fa137feda1d7db67a884ac2e Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 00:10:04 +0900
Subject: [PATCH 220/583] Removed LOSS_FUNCTION_IS_CROSS_ENTOROPY macro.

---
 src/learn/learn.h     |  9 +--------
 src/learn/learner.cpp | 29 -----------------------------
 2 files changed, 1 insertion(+), 37 deletions(-)

diff --git a/src/learn/learn.h b/src/learn/learn.h
index 9d783986..da542d67 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -9,11 +9,6 @@
 // Select the objective function
 // ----------------------
 
-// Objective function is cross entropy
-// See learner.cpp for more information.
-// So-called ordinary "rag cloth squeezer"
-//#define LOSS_FUNCTION_IS_CROSS_ENTOROPY
-
 // A version in which the objective function is cross entropy, but the win rate function is not passed
 // #define LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE
 
@@ -121,9 +116,7 @@ typedef float LearnFloatType;
 
 #define LOSS_FUNCTION_IS_ELMO_METHOD
 
-#if defined(LOSS_FUNCTION_IS_CROSS_ENTOROPY)
-#define LOSS_FUNCTION "CROSS_ENTOROPY"
-#elif defined(LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE)
+#if defined(LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE)
 #define LOSS_FUNCTION "CROSS_ENTOROPY_FOR_VALUE"
 #elif defined(LOSS_FUNCTION_IS_ELMO_METHOD)
 #define LOSS_FUNCTION "ELMO_METHOD(WCSC27)"
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index e9658da6..66835ce5 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -163,35 +163,6 @@ namespace Learner
         return ((y2 - y1) / epsilon) / winning_probability_coefficient;
     }
 
-#if defined (LOSS_FUNCTION_IS_CROSS_ENTOROPY)
-    double calc_grad(Value deep, Value shallow, const PackedSfenValue& psv)
-    {
-        // Objective function with cross entropy
-
-        // For the concept and nature of cross entropy,
-        // http://nnadl-ja.github.io/nnadl_site_ja/chap3.html#the_cross-entropy_cost_function
-        // http://postd.cc/visual-information-theory-3/
-        // Refer to etc.
-
-        // Objective function design)
-        // We want to make the distribution of p closer to the distribution of q 
-        // → Think of it as the problem of minimizing the cross entropy 
-        // between the probability distributions of p and q.
-        // J = H(p,q) =-Σ p(x) log(q(x)) = -p log q-(1-p) log(1-q)
-        // x
-
-        // p is a constant and q is a Wi function (q = σ(W・Xi) ).
-        // ∂J/∂Wi = -p・q'/q-(1-p)(1-q)'/(1-q)
-        // = ...
-        // = q-p.
-
-        const double p = winning_percentage(deep, psv.gamePly);
-        const double q = winning_percentage(shallow, psv.gamePly);
-
-        return q - p;
-    }
-#endif
-
 #if defined ( LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE )
     double calc_grad(Value deep, Value shallow, const PackedSfenValue& psv)
     {

From ef1601218db703b42e31b34d8c324f0ec3001f83 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 00:11:11 +0900
Subject: [PATCH 221/583] Removed LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE
 macro.

---
 src/learn/learn.h     |  7 +------
 src/learn/learner.cpp | 11 -----------
 2 files changed, 1 insertion(+), 17 deletions(-)

diff --git a/src/learn/learn.h b/src/learn/learn.h
index da542d67..d2477277 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -9,9 +9,6 @@
 // Select the objective function
 // ----------------------
 
-// A version in which the objective function is cross entropy, but the win rate function is not passed
-// #define LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE
-
 // elmo (WCSC27) method
 // #define LOSS_FUNCTION_IS_ELMO_METHOD
 
@@ -116,9 +113,7 @@ typedef float LearnFloatType;
 
 #define LOSS_FUNCTION_IS_ELMO_METHOD
 
-#if defined(LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE)
-#define LOSS_FUNCTION "CROSS_ENTOROPY_FOR_VALUE"
-#elif defined(LOSS_FUNCTION_IS_ELMO_METHOD)
+#if defined(LOSS_FUNCTION_IS_ELMO_METHOD)
 #define LOSS_FUNCTION "ELMO_METHOD(WCSC27)"
 #endif
 
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 66835ce5..82bcfa09 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -163,17 +163,6 @@ namespace Learner
         return ((y2 - y1) / epsilon) / winning_probability_coefficient;
     }
 
-#if defined ( LOSS_FUNCTION_IS_CROSS_ENTOROPY_FOR_VALUE )
-    double calc_grad(Value deep, Value shallow, const PackedSfenValue& psv)
-    {
-        // Version that does not pass the winning percentage function
-        // This, unless EVAL_LIMIT is set low, trying to 
-        // match the evaluation value with the shape of the end stage
-        // eval may exceed the range of eval.
-        return shallow - deep;
-    }
-#endif
-
 #if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
 
     // A constant used in elmo (WCSC27). Adjustment required.

From dbad9d96e0fc2923edfdbef37162ecd5b0645d50 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 00:17:02 +0900
Subject: [PATCH 222/583] Removed LOSS_FUNCTION_IS_ELMO_METHOD macro.

---
 src/learn/learn.h     | 19 ---------------
 src/learn/learner.cpp | 54 +------------------------------------------
 2 files changed, 1 insertion(+), 72 deletions(-)

diff --git a/src/learn/learn.h b/src/learn/learn.h
index d2477277..2ee2f8d6 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -5,21 +5,6 @@
 
 #include <vector>
 
-// ----------------------
-// Select the objective function
-// ----------------------
-
-// elmo (WCSC27) method
-// #define LOSS_FUNCTION_IS_ELMO_METHOD
-
-// ※ Other things may be added.
-
-
-// ----------------------
-// debug settings for learning
-// ----------------------
-
-
 // ----------------------
 // learning from zero vector
 // ----------------------
@@ -111,11 +96,7 @@ typedef float LearnFloatType;
 // Learning with the method of elmo (WCSC27)
 // ----------------------
 
-#define LOSS_FUNCTION_IS_ELMO_METHOD
-
-#if defined(LOSS_FUNCTION_IS_ELMO_METHOD)
 #define LOSS_FUNCTION "ELMO_METHOD(WCSC27)"
-#endif
 
 // ----------------------
 // Definition of struct used in Learner
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 82bcfa09..84cade5c 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -163,8 +163,6 @@ namespace Learner
         return ((y2 - y1) / epsilon) / winning_probability_coefficient;
     }
 
-#if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
-
     // A constant used in elmo (WCSC27). Adjustment required.
     // Since elmo does not internally divide the expression, the value is different.
     // You can set this value with the learn command.
@@ -293,7 +291,6 @@ namespace Learner
             (-m * std::log(m + epsilon) - (1.0 - m) * std::log(1.0 - m + epsilon));
     }
 
-#endif
     // Other objective functions may be considered in the future...
     double calc_grad(Value shallow, const PackedSfenValue& psv) 
     {
@@ -629,14 +626,12 @@ namespace Learner
             stop_flag(false), 
             save_only_once(false)
         {
-#if defined ( LOSS_FUNCTION_IS_ELMO_METHOD )
             learn_sum_cross_entropy_eval = 0.0;
             learn_sum_cross_entropy_win = 0.0;
             learn_sum_cross_entropy = 0.0;
             learn_sum_entropy_eval = 0.0;
             learn_sum_entropy_win = 0.0;
             learn_sum_entropy = 0.0;
-#endif
 
             newbob_scale = 1.0;
             newbob_decay = 1.0;
@@ -689,15 +684,13 @@ namespace Learner
 
         // --- loss calculation
 
-#if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
-    // For calculation of learning data loss
+        // For calculation of learning data loss
         atomic<double> learn_sum_cross_entropy_eval;
         atomic<double> learn_sum_cross_entropy_win;
         atomic<double> learn_sum_cross_entropy;
         atomic<double> learn_sum_entropy_eval;
         atomic<double> learn_sum_entropy_win;
         atomic<double> learn_sum_entropy;
-#endif
 
         shared_timed_mutex nn_mutex;
         double newbob_scale;
@@ -759,13 +752,6 @@ namespace Learner
         std::cout << ", iteration " << epoch;
         std::cout << ", eta = " << Eval::get_eta() << ", ";
 
-#if !defined(LOSS_FUNCTION_IS_ELMO_METHOD)
-        double sum_error = 0;
-        double sum_error2 = 0;
-        double sum_error3 = 0;
-#endif
-
-#if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
         // For calculation of verification data loss
         atomic<double> test_sum_cross_entropy_eval, test_sum_cross_entropy_win, test_sum_cross_entropy;
         atomic<double> test_sum_entropy_eval, test_sum_entropy_win, test_sum_entropy;
@@ -779,7 +765,6 @@ namespace Learner
         // norm for learning
         atomic<double> sum_norm;
         sum_norm = 0;
-#endif
 
         // The number of times the pv first move of deep 
         // search matches the pv first move of search(1).
@@ -841,25 +826,11 @@ namespace Learner
                 // Note) This code does not consider when 
                 //       eval_limit is specified in the learn command.
 
-                // --- error calculation
-
-#if !defined(LOSS_FUNCTION_IS_ELMO_METHOD)
-                auto grad = calc_grad(deep_value, shallow_value, ps);
-
-                // something like rmse
-                sum_error += grad * grad;
-                // Add the absolute value of the gradient
-                sum_error2 += abs(grad);
-                // Add the absolute value of the difference between the evaluation values
-                sum_error3 += abs(shallow_value - deep_value);
-#endif
-
                 // --- calculation of cross entropy
 
                 // For the time being, regarding the win rate and loss terms only in the elmo method
                 // Calculate and display the cross entropy.
 
-#if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
                 double test_cross_entropy_eval, test_cross_entropy_win, test_cross_entropy;
                 double test_entropy_eval, test_entropy_win, test_entropy;
                 calc_cross_entropy(
@@ -881,7 +852,6 @@ namespace Learner
                 test_sum_entropy_win += test_entropy_win;
                 test_sum_entropy += test_entropy;
                 sum_norm += (double)abs(shallow_value);
-#endif
 
                 // Determine if the teacher's move and the score of the shallow search match
                 {
@@ -905,17 +875,6 @@ namespace Learner
         while (task_count)
             sleep(1);
 
-#if !defined(LOSS_FUNCTION_IS_ELMO_METHOD)
-        // rmse = root mean square error: mean square error
-        // mae = mean absolute error: mean absolute error
-        auto dsig_rmse = std::sqrt(sum_error / (sfen_for_mse.size() + epsilon));
-        auto dsig_mae = sum_error2 / (sfen_for_mse.size() + epsilon);
-        auto eval_mae = sum_error3 / (sfen_for_mse.size() + epsilon);
-        cout << " , dsig rmse = " << dsig_rmse << " , dsig mae = " << dsig_mae
-            << " , eval mae = " << eval_mae;
-#endif
-
-#if defined(LOSS_FUNCTION_IS_ELMO_METHOD)
         latest_loss_sum += test_sum_cross_entropy - test_sum_entropy;
         latest_loss_count += sr.sfen_for_mse.size();
 
@@ -960,9 +919,6 @@ namespace Learner
         learn_sum_entropy_eval = 0.0;
         learn_sum_entropy_win = 0.0;
         learn_sum_entropy = 0.0;
-#else
-        << endl;
-#endif
     }
 
     void LearnerThink::thread_worker(size_t thread_id)
@@ -1144,7 +1100,6 @@ namespace Learner
                     ? Eval::evaluate(pos) 
                     : -Eval::evaluate(pos);
 
-#if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
                 // Calculate loss for training data
                 double learn_cross_entropy_eval, learn_cross_entropy_win, learn_cross_entropy;
                 double learn_entropy_eval, learn_entropy_win, learn_entropy;
@@ -1165,7 +1120,6 @@ namespace Learner
                 learn_sum_entropy_eval += learn_entropy_eval;
                 learn_sum_entropy_win += learn_entropy_win;
                 learn_sum_entropy += learn_entropy;
-#endif
 
                 const double example_weight =
                     (discount_rate != 0 && ply != (int)pv.size()) ? discount_rate : 1.0;
@@ -1600,12 +1554,10 @@ namespace Learner
         // Turn on if you want to pass a pre-shuffled file.
         bool no_shuffle = false;
 
-#if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
         // elmo lambda
         ELMO_LAMBDA = 0.33;
         ELMO_LAMBDA2 = 0.33;
         ELMO_LAMBDA_LIMIT = 32000;
-#endif
 
         // Discount rate. If this is set to a value other than 0, 
         // the slope will be added even at other than the PV termination. 
@@ -1703,13 +1655,11 @@ namespace Learner
             else if (option == "freeze_kkpp")  is >> freeze[3];
 #endif
 
-#if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
             // LAMBDA
             else if (option == "lambda")       is >> ELMO_LAMBDA;
             else if (option == "lambda2")      is >> ELMO_LAMBDA2;
             else if (option == "lambda_limit") is >> ELMO_LAMBDA_LIMIT;
 
-#endif
             else if (option == "reduction_gameply") is >> reduction_gameply;
 
             // shuffle related
@@ -1900,11 +1850,9 @@ namespace Learner
         reduction_gameply = max(reduction_gameply, 1);
         cout << "reduction_gameply : " << reduction_gameply << endl;
 
-#if defined (LOSS_FUNCTION_IS_ELMO_METHOD)
         cout << "LAMBDA            : " << ELMO_LAMBDA << endl;
         cout << "LAMBDA2           : " << ELMO_LAMBDA2 << endl;
         cout << "LAMBDA_LIMIT      : " << ELMO_LAMBDA_LIMIT << endl;
-#endif
 
         cout << "mirror_percentage : " << mirror_percentage << endl;
         cout << "eval_save_interval  : " << eval_save_interval << " sfens" << endl;

From f52165e1d3b8bebdd702e089eb9fdd7761d45076 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 00:19:14 +0900
Subject: [PATCH 223/583] Removed RESET_TO_ZERO_VECTOR macro.

---
 src/learn/learn.h | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/src/learn/learn.h b/src/learn/learn.h
index 2ee2f8d6..6056e8c6 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -5,18 +5,6 @@
 
 #include <vector>
 
-// ----------------------
-// learning from zero vector
-// ----------------------
-
-// Start learning the evaluation function parameters from the zero vector.
-// Initialize to zero, generate a game, learn from zero vector,
-// Game generation → If you repeat learning, you will get parameters that do not depend on the professional game. (maybe)
-// (very time consuming)
-
-//#define RESET_TO_ZERO_VECTOR
-
-
 // ----------------------
 // Floating point for learning
 // ----------------------

From 5e2570267228653a11bf42c14d77d1baf26b99ac Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 00:19:53 +0900
Subject: [PATCH 224/583] Removed USE_TRIANGLE_WEIGHT_ARRAY macro.

---
 src/learn/learn.h | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/src/learn/learn.h b/src/learn/learn.h
index 6056e8c6..ea622bce 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -23,15 +23,6 @@ typedef float LearnFloatType;
 //#include "half_float.h"
 //typedef HalfFloat::float16 LearnFloatType;
 
-// ----------------------
-// save memory
-// ----------------------
-
-// Use a triangular array for the Weight array (of which is KPP) to save memory.
-// If this is used, the weight array for learning will be about 3 times as large as the evaluation function file.
-
-#define USE_TRIANGLE_WEIGHT_ARRAY
-
 // ----------------------
 // dimension down
 // ----------------------

From eafa5693658a91e97612a04b2c620ec5a545e3a0 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 00:22:02 +0900
Subject: [PATCH 225/583] Removed macros for KPP factorization.

---
 src/learn/learn.h | 31 -------------------------------
 1 file changed, 31 deletions(-)

diff --git a/src/learn/learn.h b/src/learn/learn.h
index ea622bce..0df71c7a 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -23,37 +23,6 @@ typedef float LearnFloatType;
 //#include "half_float.h"
 //typedef HalfFloat::float16 LearnFloatType;
 
-// ----------------------
-// dimension down
-// ----------------------
-
-// Dimension reduction for mirrors (left/right symmetry) and inverse (forward/backward symmetry).
-// All on by default.
-
-// Dimension reduction using mirror and inverse for KK. (Unclear effect)
-// USE_KK_MIRROR_WRITE must be on when USE_KK_INVERSE_WRITE is on.
-#define USE_KK_MIRROR_WRITE
-#define USE_KK_INVERSE_WRITE
-
-// Dimension reduction using Mirror and Inverse for KKP. (Inverse is not so effective)
-// When USE_KKP_INVERSE_WRITE is turned on, USE_KKP_MIRROR_WRITE must also be turned on.
-#define USE_KKP_MIRROR_WRITE
-#define USE_KKP_INVERSE_WRITE
-
-// Perform dimension reduction using a mirror for KPP. (Turning this off requires double the teacher position)
-// KPP has no inverse. (Because there is only K on the front side)
-#define USE_KPP_MIRROR_WRITE
-
-// Perform a dimension reduction using a mirror for KPPP. (Turning this off requires double the teacher position)
-// KPPP has no inverse. (Because there is only K on the front side)
-#define USE_KPPP_MIRROR_WRITE
-
-// Reduce the dimension by KPP for learning the KKPP component.
-// Learning is very slow.
-// Do not use as it is not debugged.
-//#define USE_KKPP_LOWER_DIM
-
-
 // ======================
 // Settings for creating teacher phases
 // ======================

From 8d763fb503fed49e4b7fa2be115e0fa6eb0e74d7 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 00:22:38 +0900
Subject: [PATCH 226/583] Removed LEARN_GENSFEN_USE_DRAW_RESULT macro.

---
 src/learn/learn.h | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/src/learn/learn.h b/src/learn/learn.h
index 0df71c7a..b7ca18e8 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -23,19 +23,6 @@ typedef float LearnFloatType;
 //#include "half_float.h"
 //typedef HalfFloat::float16 LearnFloatType;
 
-// ======================
-// Settings for creating teacher phases
-// ======================
-
-// ----------------------
-// write out the draw
-// ----------------------
-
-// When you reach a draw, write it out as a teacher position
-// It's subtle whether it's better to do this.
-// #define LEARN_GENSFEN_USE_DRAW_RESULT
-
-
 // ======================
 // configure
 // ======================

From cea17c92f9ad91d0dd2d73db272e6ce6712ba048 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 08:53:57 +0900
Subject: [PATCH 227/583] Simplified evaluate_common.h.

---
 src/eval/evaluate_common.h | 59 ++++----------------------------------
 1 file changed, 5 insertions(+), 54 deletions(-)

diff --git a/src/eval/evaluate_common.h b/src/eval/evaluate_common.h
index 927783cd..989169b3 100644
--- a/src/eval/evaluate_common.h
+++ b/src/eval/evaluate_common.h
@@ -1,75 +1,26 @@
 ﻿#ifndef _EVALUATE_COMMON_H_
 #define _EVALUATE_COMMON_H_
 
+#if defined(EVAL_LEARN)
+
 // A common header-like function for modern evaluation functions (EVAL_KPPT and EVAL_KPP_KKPT).
 
-#include <functional>
-
-// KK file name
-#define KK_BIN "KK_synthesized.bin"
-
-// KKP file name
-#define KKP_BIN "KKP_synthesized.bin"
-
-// KPP file name
-#define KPP_BIN "KPP_synthesized.bin"
-
-#include "../position.h"
+#include <string>
 
 namespace Eval
 {
-	// An operator that applies the function f to each parameter of the evaluation function.
-	// Used for parameter analysis etc.
-	// type indicates the survey target.
-	// type = -1 :KK,KKP,KPP all
-	// type = 0: KK only
-	// type = 1: KKP only
-	// type = 2: KPP only
-	void foreach_eval_param(std::function<void(int32_t, int32_t)>f, int type = -1);
-
 	// --------------------------
 	// for learning
 	// --------------------------
 
-#if defined(EVAL_LEARN)
-	// Initialize the gradient array during learning
-	// Pass the learning rate as an argument. If 0.0, the default value is used.
-	// The epoch of update_weights() gradually changes from eta to eta2 until eta_epoch.
-	// After eta2_epoch, gradually change from eta2 to eta3.
-	void init_grad(double eta1, uint64_t eta_epoch, double eta2, uint64_t eta2_epoch, double eta3);
-
-	// Add the gradient difference value to the gradient array for all features that appear in the current phase.
-	// freeze[0]: Flag that kk does not learn
-	// freeze[1]: Flag that kkp does not learn
-	// freeze[2]: Flag that kpp does not learn
-	// freeze[3]: Flag that kppp does not learn
-	void add_grad(Position& pos, Color rootColor, double delt_grad, const std::array<bool, 4>& freeze);
-
-	// Do SGD or AdaGrad or something based on the current gradient.
-	// epoch: Generation counter (starting from 0)
-	// freeze[0]: Flag that kk does not learn
-	// freeze[1]: Flag that kkp does not learn
-	// freeze[2]: Flag that kpp does not learn
-	// freeze[3]: Flag that kppp does not learn
-	void update_weights(uint64_t epoch, const std::array<bool, 4>& freeze);
-
 	// Save the evaluation function parameters to a file.
 	// You can specify the extension added to the end of the file.
 	void save_eval(std::string suffix);
 
 	// Get the current eta.
 	double get_eta();
-
-	// --learning related commands
-
-	// A function that normalizes KK. Note that it is not completely equivalent to the original evaluation function.
-	// By making the values ​​of kkp and kpp as close to zero as possible, the value of the feature factor (which is zero) that did not appear during learning
-	// The idea of ​​ensuring it is valid.
-	void regularize_kk();
-
-#endif
-
-
 }
 
+#endif // defined(EVAL_LEARN)
+
 #endif // _EVALUATE_KPPT_COMMON_H_

From 2583f689729f7644cb5a5ac6d0369c0c726c3141 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 08:58:10 +0900
Subject: [PATCH 228/583] Removed macros for KPP evaluate functions.

---
 src/eval/evaluate_common.h |  2 +-
 src/learn/learner.cpp      | 16 ----------------
 2 files changed, 1 insertion(+), 17 deletions(-)

diff --git a/src/eval/evaluate_common.h b/src/eval/evaluate_common.h
index 989169b3..7799fe79 100644
--- a/src/eval/evaluate_common.h
+++ b/src/eval/evaluate_common.h
@@ -3,7 +3,7 @@
 
 #if defined(EVAL_LEARN)
 
-// A common header-like function for modern evaluation functions (EVAL_KPPT and EVAL_KPP_KKPT).
+// A common header-like function for modern evaluation functions.
 
 #include <string>
 
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 84cade5c..5d9b242f 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -1647,14 +1647,6 @@ namespace Learner
             else if (option == "freeze_kkp")   is >> freeze[1];
             else if (option == "freeze_kpp")   is >> freeze[2];
 
-#if defined(EVAL_KPPT) || defined(EVAL_KPP_KKPT) || defined(EVAL_KPP_KKPT_FV_VAR) || defined(EVAL_NABLA)
-
-#elif defined(EVAL_KPPPT) || defined(EVAL_KPPP_KKPT) || defined(EVAL_HELICES)
-            else if (option == "freeze_kppp")  is >> freeze[3];
-#elif defined(EVAL_KKPP_KKPT) || defined(EVAL_KKPPT)
-            else if (option == "freeze_kkpp")  is >> freeze[3];
-#endif
-
             // LAMBDA
             else if (option == "lambda")       is >> ELMO_LAMBDA;
             else if (option == "lambda2")      is >> ELMO_LAMBDA2;
@@ -1858,14 +1850,6 @@ namespace Learner
         cout << "eval_save_interval  : " << eval_save_interval << " sfens" << endl;
         cout << "loss_output_interval: " << loss_output_interval << " sfens" << endl;
 
-#if defined(EVAL_KPPT) || defined(EVAL_KPP_KKPT) || defined(EVAL_KPP_KKPT_FV_VAR) || defined(EVAL_NABLA)
-        cout << "freeze_kk/kkp/kpp      : " << freeze[0] << " , " << freeze[1] << " , " << freeze[2] << endl;
-#elif defined(EVAL_KPPPT) || defined(EVAL_KPPP_KKPT) || defined(EVAL_HELICES)
-        cout << "freeze_kk/kkp/kpp/kppp : " << freeze[0] << " , " << freeze[1] << " , " << freeze[2] << " , " << freeze[3] << endl;
-#elif defined(EVAL_KKPP_KKPT) || defined(EVAL_KKPPT)
-        cout << "freeze_kk/kkp/kpp/kkpp : " << freeze[0] << " , " << freeze[1] << " , " << freeze[2] << " , " << freeze[3] << endl;
-#endif
-
         // -----------------------------------
         // various initialization
         // -----------------------------------

From 18648458117a35acb2617e9fe04192acca6ba2ae Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 09:26:42 +0900
Subject: [PATCH 229/583] Commented out unused parameters.

---
 src/nnue/features/castling_right.cpp | 6 +++---
 src/nnue/features/enpassant.cpp      | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/nnue/features/castling_right.cpp b/src/nnue/features/castling_right.cpp
index 47fbd986..86fe06fe 100644
--- a/src/nnue/features/castling_right.cpp
+++ b/src/nnue/features/castling_right.cpp
@@ -26,7 +26,7 @@ namespace Eval {
             & ((castling_rights >> 2) & 3);
         }
 
-        for (int i = 0; i <kDimensions; ++i) {
+        for (Eval::NNUE::IndexType i = 0; i < kDimensions; ++i) {
           if (relative_castling_rights & (i << 1)) {
             active->push_back(i);
           }
@@ -36,7 +36,7 @@ namespace Eval {
       // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
       void CastlingRight::AppendChangedIndices(
         const Position& pos, Color perspective,
-        IndexList* removed, IndexList* added) {
+        IndexList* removed, IndexList* /* added */) {
 
         int previous_castling_rights = pos.state()->previous->castlingRights;
         int current_castling_rights = pos.state()->castlingRights;
@@ -54,7 +54,7 @@ namespace Eval {
             & ((current_castling_rights >> 2) & 3);
         }
 
-        for (int i = 0; i < kDimensions; ++i) {
+        for (Eval::NNUE::IndexType i = 0; i < kDimensions; ++i) {
           if ((relative_previous_castling_rights & (i << 1)) &&
             (relative_current_castling_rights & (i << 1)) == 0) {
             removed->push_back(i);
diff --git a/src/nnue/features/enpassant.cpp b/src/nnue/features/enpassant.cpp
index 77bc936e..386bd907 100644
--- a/src/nnue/features/enpassant.cpp
+++ b/src/nnue/features/enpassant.cpp
@@ -30,8 +30,8 @@ namespace Eval {
 
       // Get a list of indices whose values ??have changed from the previous one in the feature quantity
       void EnPassant::AppendChangedIndices(
-        const Position& pos, Color perspective,
-        IndexList* removed, IndexList* added) {
+        const Position& /* pos */, Color /* perspective */,
+        IndexList* /* removed */, IndexList* /* added */) {
         // Not implemented.
         assert(false);
       }

From 4206a1edd069600da29b8ee5a99a486b7aa1603f Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 09:46:05 +0900
Subject: [PATCH 230/583] Renamed parameters to avoid shadowing other
 parameters.

---
 src/nnue/nnue_test_command.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/nnue/nnue_test_command.cpp b/src/nnue/nnue_test_command.cpp
index c3a53c7d..5f0776ef 100644
--- a/src/nnue/nnue_test_command.cpp
+++ b/src/nnue/nnue_test_command.cpp
@@ -34,12 +34,12 @@ void TestFeatures(Position& pos) {
   std::vector<std::uint64_t> num_resets(kRefreshTriggers.size());
   constexpr IndexType kUnknown = -1;
   std::vector<IndexType> trigger_map(RawFeatures::kDimensions, kUnknown);
-  auto make_index_sets = [&](const Position& pos) {
+  auto make_index_sets = [&](const Position& position) {
     std::vector<std::vector<std::set<IndexType>>> index_sets(
         kRefreshTriggers.size(), std::vector<std::set<IndexType>>(2));
     for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
       Features::IndexList active_indices[2];
-      RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
+      RawFeatures::AppendActiveIndices(position, kRefreshTriggers[i],
                                        active_indices);
       for (const auto perspective : Colors) {
         for (const auto index : active_indices[perspective]) {
@@ -53,11 +53,11 @@ void TestFeatures(Position& pos) {
     }
     return index_sets;
   };
-  auto update_index_sets = [&](const Position& pos, auto* index_sets) {
+  auto update_index_sets = [&](const Position& position, auto* index_sets) {
     for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
       Features::IndexList removed_indices[2], added_indices[2];
       bool reset[2];
-      RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
+      RawFeatures::AppendChangedIndices(position, kRefreshTriggers[i],
                                         removed_indices, added_indices, reset);
       for (const auto perspective : Colors) {
         if (reset[perspective]) {

From 17d42e023ed13665ed200491a299a177c7954c74 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Tue, 8 Sep 2020 15:10:58 +0200
Subject: [PATCH 231/583] add more CI, instrumented runs

---
 .travis.yml                 |   7 ++
 src/learn/learner.cpp       |   6 +-
 tests/instrumented_learn.sh | 126 ++++++++++++++++++++++++++++++++++++
 3 files changed, 136 insertions(+), 3 deletions(-)
 create mode 100755 tests/instrumented_learn.sh

diff --git a/.travis.yml b/.travis.yml
index 5859f97b..eb3ad741 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -107,3 +107,10 @@ script:
   #
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-undefined; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64-modern sanitize=thread    optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-thread; fi
+
+  #
+  # NNUE testing / TODO should work with debug=yes as well
+  #
+  - export CXXFLAGS="-O1 -fno-inline"
+  - if [ -x "$(command -v valgrind )" ]; then make clean && make -j2 ARCH=x86-64-modern debug=no optimize=no learn > /dev/null && ../tests/instrumented_learn.sh --valgrind; fi
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=no learn > /dev/null && ../tests/instrumented_learn.sh --sanitizer-undefined; fi
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 5d9b242f..15f0825d 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -1716,9 +1716,9 @@ namespace Learner
             namespace sys = std::filesystem;
             sys::path p(kif_base_dir); // Origin of enumeration
             std::for_each(sys::directory_iterator(p), sys::directory_iterator(),
-                [&](const sys::path& p) {
-                    if (sys::is_regular_file(p))
-                        filenames.push_back(Path::Combine(target_dir, p.filename().generic_string()));
+                [&](const sys::path& path) {
+                    if (sys::is_regular_file(path))
+                        filenames.push_back(Path::Combine(target_dir, path.filename().generic_string()));
                 });
         }
 
diff --git a/tests/instrumented_learn.sh b/tests/instrumented_learn.sh
new file mode 100755
index 00000000..756569e6
--- /dev/null
+++ b/tests/instrumented_learn.sh
@@ -0,0 +1,126 @@
+#!/bin/bash
+# check for errors under valgrind or sanitizers.
+
+error()
+{
+  echo "instrumented testing failed on line $1"
+  exit 1
+}
+trap 'error ${LINENO}' ERR
+
+# define suitable post and prefixes for testing options
+case $1 in
+  --valgrind)
+    echo "valgrind testing started"
+    prefix=''
+    exeprefix='valgrind --error-exitcode=42'
+    postfix='1>/dev/null'
+    threads="1"
+  ;;
+  --valgrind-thread)
+    echo "valgrind-thread testing started"
+    prefix=''
+    exeprefix='valgrind --error-exitcode=42'
+    postfix='1>/dev/null'
+    threads="2"
+  ;;
+  --sanitizer-undefined)
+    echo "sanitizer-undefined testing started"
+    prefix='!'
+    exeprefix=''
+    postfix='2>&1 | grep -A50 "runtime error:"'
+    threads="1"
+  ;;
+  --sanitizer-thread)
+    echo "sanitizer-thread testing started"
+    prefix='!'
+    exeprefix=''
+    postfix='2>&1 | grep -A50 "WARNING: ThreadSanitizer:"'
+    threads="2"
+
+cat << EOF > tsan.supp
+race:TTEntry::move
+race:TTEntry::depth
+race:TTEntry::bound
+race:TTEntry::save
+race:TTEntry::value
+race:TTEntry::eval
+race:TTEntry::is_pv
+
+race:TranspositionTable::probe
+race:TranspositionTable::hashfull
+
+EOF
+
+    export TSAN_OPTIONS="suppressions=./tsan.supp"
+
+  ;;
+  *)
+    echo "unknown testing started"
+    prefix=''
+    exeprefix=''
+    postfix=''
+    threads="1"
+  ;;
+esac
+
+mkdir -p training_data_01
+mkdir -p training_data_02
+
+# gensfen testing 01
+cat << EOF > gensfen01.exp
+ set timeout 240
+ spawn $exeprefix ./stockfish
+
+ send "uci\n"
+ expect "uciok"
+
+ send "setoption name Threads value $threads\n"
+ send "setoption name Use NNUE value false\n"
+ send "isready\n"
+ send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data_01/training_data.bin use_raw_nnue_eval 0\n"
+ expect "gensfen finished."
+
+ send "quit\n"
+ expect eof
+
+ # return error code of the spawned program, useful for valgrind
+ lassign [wait] pid spawnid os_error_flag value
+ exit \$value
+EOF
+
+# gensfen testing 02
+cat << EOF > gensfen02.exp
+ set timeout 240
+ spawn $exeprefix ./stockfish
+
+ send "uci\n"
+ expect "uciok"
+
+ send "setoption name Threads value $threads\n"
+ send "setoption name Use NNUE value true\n"
+ send "isready\n"
+ send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data_02/training_data.bin use_raw_nnue_eval 0\n"
+ expect "gensfen finished."
+
+ send "quit\n"
+ expect eof
+
+ # return error code of the spawned program, useful for valgrind
+ lassign [wait] pid spawnid os_error_flag value
+ exit \$value
+EOF
+
+for exp in gensfen01.exp gensfen02.exp
+do
+
+  echo "$prefix expect $exp $postfix"
+  eval "$prefix expect $exp $postfix"
+
+  rm $exp
+
+done
+
+rm -f tsan.supp
+
+echo "instrumented learn testing OK"

From 8fcf8b97f1806313fd01d383cb1ffdfd2dcc4e47 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Wed, 9 Sep 2020 09:22:48 +0200
Subject: [PATCH 232/583] Add -lstdc++fs

---
 .travis.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index eb3ad741..438bf4d0 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -112,5 +112,5 @@ script:
   # NNUE testing / TODO should work with debug=yes as well
   #
   - export CXXFLAGS="-O1 -fno-inline"
-  - if [ -x "$(command -v valgrind )" ]; then make clean && make -j2 ARCH=x86-64-modern debug=no optimize=no learn > /dev/null && ../tests/instrumented_learn.sh --valgrind; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=no learn > /dev/null && ../tests/instrumented_learn.sh --sanitizer-undefined; fi
+  - if [ -x "$(command -v valgrind )" ]; then make clean && LDFLAGS="-lstdc++fs"  make -j2 ARCH=x86-64-modern debug=no optimize=no learn > /dev/null && ../tests/instrumented_learn.sh --valgrind; fi
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && LDFLAGS="-lstdc++fs"  make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=no learn > /dev/null && ../tests/instrumented_learn.sh --sanitizer-undefined; fi

From 158399da4b368c2118e0d418f09f6dd142608760 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 20:16:09 +0900
Subject: [PATCH 233/583] Remove compile warnings.

---
 .travis.yml                            |  3 +--
 src/learn/gensfen.cpp                  |  5 ++---
 src/nnue/evaluate_nnue_learner.cpp     | 14 ++++++++++++--
 src/nnue/trainer/trainer.h             |  4 ++--
 src/nnue/trainer/trainer_input_slice.h |  2 +-
 5 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 438bf4d0..503d678a 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -77,8 +77,7 @@ script:
   - if [[ "$TRAVIS_OS_NAME" != "linux" || "$COMP" == "gcc" ]]; then make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref; fi
 
   # start some basic learner CI
-  #TODO enable -Werror
-  - export CXXFLAGS=""
+  - export CXXFLAGS="-Werror"
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && LDFLAGS="-lstdc++fs" make -j2 ARCH=x86-64-modern learn; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && LDFLAGS="-lstdc++fs" make -j2 ARCH=x86-64-modern profile-learn; fi
 
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 3d015acf..84feabb0 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -878,8 +878,7 @@ namespace Learner
                     next_move = search_pv[0];
                 }
 
-            RANDOM_MOVE:;
-
+                // Random move.
                 auto random_move = choose_random_move(pos, random_move_flag, ply, actual_random_move_count);
                 if (random_move.has_value())
                 {
@@ -897,7 +896,7 @@ namespace Learner
                     a_psv.clear();
                 }
 
-            DO_MOVE:;
+                // Do move.
                 pos.do_move(next_move, states[ply]);
 
                 // Call node evaluate() for each difference calculation.
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 7be06832..8b0413e5 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -113,8 +113,13 @@ void SetOptions(const std::string& options) {
 void RestoreParameters(const std::string& dir_name) {
   const std::string file_name = Path::Combine(dir_name, NNUE::savedfileName);
   std::ifstream stream(file_name, std::ios::binary);
-  bool result = ReadParameters(stream);
+#ifndef NDEBUG
+  bool result =
+#endif
+  ReadParameters(stream);
+#ifndef NDEBUG
   assert(result);
+#endif
 
   SendMessages({{"reset"}});
 }
@@ -216,8 +221,13 @@ void save_eval(std::string dir_name) {
 
   const std::string file_name = Path::Combine(eval_dir, NNUE::savedfileName);
   std::ofstream stream(file_name, std::ios::binary);
-  const bool result = NNUE::WriteParameters(stream);
+#ifndef NDEBUG
+  const bool result =
+#endif
+  NNUE::WriteParameters(stream);
+#ifndef NDEBUG
   assert(result);
+#endif
 
   std::cout << "save_eval() finished. folder = " << eval_dir << std::endl;
 }
diff --git a/src/nnue/trainer/trainer.h b/src/nnue/trainer/trainer.h
index d526557a..94553c07 100644
--- a/src/nnue/trainer/trainer.h
+++ b/src/nnue/trainer/trainer.h
@@ -70,8 +70,8 @@ struct Example {
 
 // Message used for setting hyperparameters
 struct Message {
-  Message(const std::string& name, const std::string& value = ""):
-      name(name), value(value), num_peekers(0), num_receivers(0) {}
+  Message(const std::string& message_name, const std::string& message_value = ""):
+      name(message_name), value(message_value), num_peekers(0), num_receivers(0) {}
   const std::string name;
   const std::string value;
   std::uint32_t num_peekers;
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index b6d6635b..6b0adc9f 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -206,7 +206,7 @@ class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
       const IndexType input_offset = kInputDimensions * b;
       const IndexType output_offset = kOutputDimensions * b;
       for (IndexType i = 0; i < kInputDimensions; ++i) {
-        if (i < Offset || i >= Offset + kOutputDimensions) {
+        if ((int)i < (int)Offset || i >= Offset + kOutputDimensions) {
           gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
         } else {
           gradients_[input_offset + i] = gradients[output_offset + i - Offset];

From d993bd36d0a984b47b7f2f0e14a91bbcec5f948e Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 21:21:10 +0900
Subject: [PATCH 234/583] Removed compile warnings.

---
 src/learn/learning_tools.h | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/learn/learning_tools.h b/src/learn/learning_tools.h
index 348105b6..1f9bdf96 100644
--- a/src/learn/learning_tools.h
+++ b/src/learn/learning_tools.h
@@ -40,13 +40,14 @@ namespace EvalLearningTools
 		static uint64_t eta2_epoch;
 
 		// Batch initialization of eta. If 0 is passed, the default value will be set.
-		static void init_eta(double eta1, double eta2, double eta3, uint64_t eta1_epoch, uint64_t eta2_epoch)
+		static void init_eta(double new_eta1, double new_eta2, double new_eta3,
+			uint64_t new_eta1_epoch, uint64_t new_eta2_epoch)
 		{
-			Weight::eta1 = (eta1 != 0) ? eta1 : 30.0;
-			Weight::eta2 = (eta2 != 0) ? eta2 : 30.0;
-			Weight::eta3 = (eta3 != 0) ? eta3 : 30.0;
-			Weight::eta1_epoch = (eta1_epoch != 0) ? eta1_epoch : 0;
-			Weight::eta2_epoch = (eta2_epoch != 0) ? eta2_epoch : 0;
+			Weight::eta1 = (new_eta1 != 0) ? new_eta1 : 30.0;
+			Weight::eta2 = (new_eta2 != 0) ? new_eta2 : 30.0;
+			Weight::eta3 = (new_eta3 != 0) ? new_eta3 : 30.0;
+			Weight::eta1_epoch = (new_eta1_epoch != 0) ? new_eta1_epoch : 0;
+			Weight::eta2_epoch = (new_eta2_epoch != 0) ? new_eta2_epoch : 0;
 		}
 
 		// Set eta according to epoch.

From 7bd4688747c37764778853f0d0ff1977bd7e663d Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 20:16:09 +0900
Subject: [PATCH 235/583] Remove compile warnings.

---
 .travis.yml                            |  3 +--
 src/learn/gensfen.cpp                  |  5 ++---
 src/nnue/evaluate_nnue_learner.cpp     | 14 ++++++++++++--
 src/nnue/trainer/trainer.h             |  4 ++--
 src/nnue/trainer/trainer_input_slice.h |  2 +-
 5 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 438bf4d0..503d678a 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -77,8 +77,7 @@ script:
   - if [[ "$TRAVIS_OS_NAME" != "linux" || "$COMP" == "gcc" ]]; then make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref; fi
 
   # start some basic learner CI
-  #TODO enable -Werror
-  - export CXXFLAGS=""
+  - export CXXFLAGS="-Werror"
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && LDFLAGS="-lstdc++fs" make -j2 ARCH=x86-64-modern learn; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && LDFLAGS="-lstdc++fs" make -j2 ARCH=x86-64-modern profile-learn; fi
 
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 3d015acf..84feabb0 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -878,8 +878,7 @@ namespace Learner
                     next_move = search_pv[0];
                 }
 
-            RANDOM_MOVE:;
-
+                // Random move.
                 auto random_move = choose_random_move(pos, random_move_flag, ply, actual_random_move_count);
                 if (random_move.has_value())
                 {
@@ -897,7 +896,7 @@ namespace Learner
                     a_psv.clear();
                 }
 
-            DO_MOVE:;
+                // Do move.
                 pos.do_move(next_move, states[ply]);
 
                 // Call node evaluate() for each difference calculation.
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 7be06832..8b0413e5 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -113,8 +113,13 @@ void SetOptions(const std::string& options) {
 void RestoreParameters(const std::string& dir_name) {
   const std::string file_name = Path::Combine(dir_name, NNUE::savedfileName);
   std::ifstream stream(file_name, std::ios::binary);
-  bool result = ReadParameters(stream);
+#ifndef NDEBUG
+  bool result =
+#endif
+  ReadParameters(stream);
+#ifndef NDEBUG
   assert(result);
+#endif
 
   SendMessages({{"reset"}});
 }
@@ -216,8 +221,13 @@ void save_eval(std::string dir_name) {
 
   const std::string file_name = Path::Combine(eval_dir, NNUE::savedfileName);
   std::ofstream stream(file_name, std::ios::binary);
-  const bool result = NNUE::WriteParameters(stream);
+#ifndef NDEBUG
+  const bool result =
+#endif
+  NNUE::WriteParameters(stream);
+#ifndef NDEBUG
   assert(result);
+#endif
 
   std::cout << "save_eval() finished. folder = " << eval_dir << std::endl;
 }
diff --git a/src/nnue/trainer/trainer.h b/src/nnue/trainer/trainer.h
index d526557a..94553c07 100644
--- a/src/nnue/trainer/trainer.h
+++ b/src/nnue/trainer/trainer.h
@@ -70,8 +70,8 @@ struct Example {
 
 // Message used for setting hyperparameters
 struct Message {
-  Message(const std::string& name, const std::string& value = ""):
-      name(name), value(value), num_peekers(0), num_receivers(0) {}
+  Message(const std::string& message_name, const std::string& message_value = ""):
+      name(message_name), value(message_value), num_peekers(0), num_receivers(0) {}
   const std::string name;
   const std::string value;
   std::uint32_t num_peekers;
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index b6d6635b..6b0adc9f 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -206,7 +206,7 @@ class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
       const IndexType input_offset = kInputDimensions * b;
       const IndexType output_offset = kOutputDimensions * b;
       for (IndexType i = 0; i < kInputDimensions; ++i) {
-        if (i < Offset || i >= Offset + kOutputDimensions) {
+        if ((int)i < (int)Offset || i >= Offset + kOutputDimensions) {
           gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
         } else {
           gradients_[input_offset + i] = gradients[output_offset + i - Offset];

From 9dcadfa642524553dbba9ea7d89516fc87ccb583 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 21:21:10 +0900
Subject: [PATCH 236/583] Removed compile warnings.

---
 src/learn/learning_tools.h | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/learn/learning_tools.h b/src/learn/learning_tools.h
index 348105b6..1f9bdf96 100644
--- a/src/learn/learning_tools.h
+++ b/src/learn/learning_tools.h
@@ -40,13 +40,14 @@ namespace EvalLearningTools
 		static uint64_t eta2_epoch;
 
 		// Batch initialization of eta. If 0 is passed, the default value will be set.
-		static void init_eta(double eta1, double eta2, double eta3, uint64_t eta1_epoch, uint64_t eta2_epoch)
+		static void init_eta(double new_eta1, double new_eta2, double new_eta3,
+			uint64_t new_eta1_epoch, uint64_t new_eta2_epoch)
 		{
-			Weight::eta1 = (eta1 != 0) ? eta1 : 30.0;
-			Weight::eta2 = (eta2 != 0) ? eta2 : 30.0;
-			Weight::eta3 = (eta3 != 0) ? eta3 : 30.0;
-			Weight::eta1_epoch = (eta1_epoch != 0) ? eta1_epoch : 0;
-			Weight::eta2_epoch = (eta2_epoch != 0) ? eta2_epoch : 0;
+			Weight::eta1 = (new_eta1 != 0) ? new_eta1 : 30.0;
+			Weight::eta2 = (new_eta2 != 0) ? new_eta2 : 30.0;
+			Weight::eta3 = (new_eta3 != 0) ? new_eta3 : 30.0;
+			Weight::eta1_epoch = (new_eta1_epoch != 0) ? new_eta1_epoch : 0;
+			Weight::eta2_epoch = (new_eta2_epoch != 0) ? new_eta2_epoch : 0;
 		}
 
 		// Set eta according to epoch.

From 005009f4e531561618d44780025ccf638532912c Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 23:38:00 +0900
Subject: [PATCH 237/583] Changed a option name more descriptive, "Training" ->
 "PruneAtShallowDepthOnPvNode".  The default value was changed but the default
 behavior is not changed. Changed to set a global option
 prune_at_shallow_depth_on_pv_node on a callback function.

---
 src/search.cpp    | 12 +++++++-----
 src/search.h      |  4 ++++
 src/ucioption.cpp |  8 +++++++-
 3 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index 67348a2b..6fbfdedf 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -54,6 +54,10 @@ using std::string;
 using Eval::evaluate;
 using namespace Search;
 
+#if defined(EVAL_LEARN)
+bool Search::prune_at_shallow_depth_on_pv_node = false;
+#endif
+
 namespace {
 
   // Different node types, used as a template parameter
@@ -68,8 +72,6 @@ namespace {
     return Value(223 * (d - improving));
   }
 
-  bool training;
-
   // Reductions lookup table, initialized at startup
   int Reductions[MAX_MOVES]; // [depth or moveNumber]
 
@@ -195,8 +197,6 @@ void Search::init() {
 
   for (int i = 1; i < MAX_MOVES; ++i)
       Reductions[i] = int((22.0 + std::log(Threads.size())) * std::log(i));
-
-  training = Options["Training"];
 }
 
 
@@ -1011,7 +1011,9 @@ moves_loop: // When in check, search starts from here
 
       // Step 12. Pruning at shallow depth (~200 Elo)
       if (  !rootNode
-          && !(training && PvNode)
+#ifdef EVAL_LEARN
+          && !(!prune_at_shallow_depth_on_pv_node && PvNode)
+#endif
           && pos.non_pawn_material(us)
           && bestValue > VALUE_TB_LOSS_IN_MAX_PLY)
       {
diff --git a/src/search.h b/src/search.h
index 01d8a4c1..9d5ce279 100644
--- a/src/search.h
+++ b/src/search.h
@@ -33,6 +33,10 @@ namespace Search {
 constexpr int CounterMovePruneThreshold = 0;
 
 
+#if defined(EVAL_LEARN)
+extern bool prune_at_shallow_depth_on_pv_node;
+#endif
+
 /// Stack struct keeps track of the information we need to remember from nodes
 /// shallower and deeper in the tree during the search. Each search thread has
 /// its own array of Stack objects, indexed by the current ply.
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 4f9fab5e..0e561416 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -42,6 +42,11 @@ void on_threads(const Option& o) { Threads.set(size_t(o)); }
 void on_tb_path(const Option& o) { Tablebases::init(o); }
 void on_use_NNUE(const Option& ) { Eval::init_NNUE(); }
 void on_eval_file(const Option& ) { Eval::init_NNUE(); }
+#ifdef EVAL_LEARN
+void on_prune_at_shallow_depth_on_pv_node(const Option& o) {
+  Search::prune_at_shallow_depth_on_pv_node = o;
+}
+#endif
 
 /// Our case insensitive less() function as required by UCI protocol
 bool CaseInsensitiveLess::operator() (const string& s1, const string& s2) const {
@@ -69,7 +74,6 @@ void init(OptionsMap& o) {
   o["Move Overhead"]         << Option(10, 0, 5000);
   o["Slow Mover"]            << Option(100, 10, 1000);
   o["nodestime"]             << Option(0, 0, 10000);
-  o["Training"]              << Option(false);
   o["UCI_Chess960"]          << Option(false);
   o["UCI_AnalyseMode"]       << Option(false);
   o["UCI_LimitStrength"]     << Option(false);
@@ -96,6 +100,8 @@ void init(OptionsMap& o) {
   // Evalsave by default. This folder shall be prepared in advance.
   // Automatically create a folder under this folder like "0/", "1/", ... and save the evaluation function file there.
   o["EvalSaveDir"] << Option("evalsave");
+  // Prune at shallow depth on PV nodes. Setting this value to true gains elo in shallow search.
+  o["PruneAtShallowDepthOnPvNode"] << Option(false, on_prune_at_shallow_depth_on_pv_node);
 #endif
 }
 

From e0a98607085655167cc01aed50db83976dbb3ec5 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Wed, 9 Sep 2020 19:08:56 +0200
Subject: [PATCH 238/583] Upgrade CI distro, remove special cases, fix one more
 warning

---
 .travis.yml                    | 35 ++++++++++++++++------------------
 src/nnue/features/index_list.h |  2 +-
 2 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 503d678a..608d22c1 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,5 +1,5 @@
 language: cpp
-dist: bionic
+dist: focal
 
 matrix:
   include:
@@ -7,9 +7,9 @@ matrix:
       compiler: gcc
       addons:
         apt:
-          packages: ['g++-8', 'g++-8-multilib', 'g++-multilib', 'valgrind', 'expect', 'curl', 'libopenblas-dev']
+          packages: ['g++-multilib', 'valgrind', 'expect', 'curl', 'libopenblas-dev']
       env:
-        - COMPILER=g++-8
+        - COMPILER=g++
         - COMP=gcc
 
 #    - os: linux
@@ -68,18 +68,17 @@ script:
   # TODO avoid _mm_malloc
   # - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-64 build && ../tests/signature.sh $benchref; fi
   # - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 optimize=no debug=yes build && ../tests/signature.sh $benchref; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-sse41-popcnt build && ../tests/signature.sh $benchref; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-sse2 build && ../tests/signature.sh $benchref; fi
+  - make clean && make -j2 ARCH=x86-32-sse41-popcnt build && ../tests/signature.sh $benchref
+  - make clean && make -j2 ARCH=x86-32-sse2 build && ../tests/signature.sh $benchref
   # TODO avoid _mm_malloc
   # - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 build && ../tests/signature.sh $benchref; fi
   # - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-32 build && ../tests/signature.sh $benchref; fi
-  # workaround: exclude a custom version of llvm+clang, which doesn't find llvm-profdata on ubuntu
-  - if [[ "$TRAVIS_OS_NAME" != "linux" || "$COMP" == "gcc" ]]; then make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref; fi
+  - make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref
 
   # start some basic learner CI
-  - export CXXFLAGS="-Werror"
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && LDFLAGS="-lstdc++fs" make -j2 ARCH=x86-64-modern learn; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && LDFLAGS="-lstdc++fs" make -j2 ARCH=x86-64-modern profile-learn; fi
+  - make clean && make -j2 ARCH=x86-64-modern learn
+  - make clean && make -j2 ARCH=x86-64-modern profile-learn
+  - make clean && make -j2 ARCH=x86-64-modern debug=yes optimize=no learn
 
   # compile only for some more advanced architectures (might not run in travis)
   - make clean && make -j2 ARCH=x86-64-avx2 build
@@ -98,18 +97,16 @@ script:
   # Valgrind
   #
   - export CXXFLAGS="-O1 -fno-inline"
-  - if [ -x "$(command -v valgrind )" ]; then make clean && make -j2 ARCH=x86-64-modern debug=yes optimize=no build > /dev/null && ../tests/instrumented.sh --valgrind; fi
-  - if [ -x "$(command -v valgrind )" ]; then ../tests/instrumented.sh --valgrind-thread; fi
+  - make clean && make -j2 ARCH=x86-64-modern debug=yes optimize=no build > /dev/null && ../tests/instrumented.sh --valgrind
+  - ../tests/instrumented.sh --valgrind-thread
 
   #
   # Sanitizer
   #
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-undefined; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64-modern sanitize=thread    optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-thread; fi
+  - make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-undefined
+  - make clean && make -j2 ARCH=x86-64-modern sanitize=thread    optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-thread
 
-  #
-  # NNUE testing / TODO should work with debug=yes as well
-  #
+  # NNUE testing
   - export CXXFLAGS="-O1 -fno-inline"
-  - if [ -x "$(command -v valgrind )" ]; then make clean && LDFLAGS="-lstdc++fs"  make -j2 ARCH=x86-64-modern debug=no optimize=no learn > /dev/null && ../tests/instrumented_learn.sh --valgrind; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && LDFLAGS="-lstdc++fs"  make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=no learn > /dev/null && ../tests/instrumented_learn.sh --sanitizer-undefined; fi
+  - make clean && make -j2 ARCH=x86-64-modern debug=no optimize=no learn > /dev/null && ../tests/instrumented_learn.sh --valgrind
+  - make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=no learn > /dev/null && ../tests/instrumented_learn.sh --sanitizer-undefined
diff --git a/src/nnue/features/index_list.h b/src/nnue/features/index_list.h
index d9ad680a..dd055fb3 100644
--- a/src/nnue/features/index_list.h
+++ b/src/nnue/features/index_list.h
@@ -50,7 +50,7 @@ namespace Eval::NNUE::Features {
     }
 
    private:
-    T values_[MaxSize];
+    T values_[MaxSize] = {};
     std::size_t size_ = 0;
   };
 

From 69563aeed9726af36b6543be3572fbd825698f31 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 20:16:09 +0900
Subject: [PATCH 239/583] Remove compile warnings.


From 073d43738442657c30f1ddedd411b01a782f9d1b Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 21:21:10 +0900
Subject: [PATCH 240/583] Removed compile warnings.


From e63b6088ba8066844fdf47a5843355196e0e2ad1 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 9 Sep 2020 23:38:00 +0900
Subject: [PATCH 241/583] Changed a option name more descriptive, "Training" ->
 "PruneAtShallowDepthOnPvNode".  The default value was changed but the default
 behavior is not changed. Changed to set a global option
 prune_at_shallow_depth_on_pv_node on a callback function.

---
 src/search.cpp    | 12 +++++++-----
 src/search.h      |  4 ++++
 src/ucioption.cpp |  8 +++++++-
 3 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index 67348a2b..6fbfdedf 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -54,6 +54,10 @@ using std::string;
 using Eval::evaluate;
 using namespace Search;
 
+#if defined(EVAL_LEARN)
+bool Search::prune_at_shallow_depth_on_pv_node = false;
+#endif
+
 namespace {
 
   // Different node types, used as a template parameter
@@ -68,8 +72,6 @@ namespace {
     return Value(223 * (d - improving));
   }
 
-  bool training;
-
   // Reductions lookup table, initialized at startup
   int Reductions[MAX_MOVES]; // [depth or moveNumber]
 
@@ -195,8 +197,6 @@ void Search::init() {
 
   for (int i = 1; i < MAX_MOVES; ++i)
       Reductions[i] = int((22.0 + std::log(Threads.size())) * std::log(i));
-
-  training = Options["Training"];
 }
 
 
@@ -1011,7 +1011,9 @@ moves_loop: // When in check, search starts from here
 
       // Step 12. Pruning at shallow depth (~200 Elo)
       if (  !rootNode
-          && !(training && PvNode)
+#ifdef EVAL_LEARN
+          && !(!prune_at_shallow_depth_on_pv_node && PvNode)
+#endif
           && pos.non_pawn_material(us)
           && bestValue > VALUE_TB_LOSS_IN_MAX_PLY)
       {
diff --git a/src/search.h b/src/search.h
index 01d8a4c1..9d5ce279 100644
--- a/src/search.h
+++ b/src/search.h
@@ -33,6 +33,10 @@ namespace Search {
 constexpr int CounterMovePruneThreshold = 0;
 
 
+#if defined(EVAL_LEARN)
+extern bool prune_at_shallow_depth_on_pv_node;
+#endif
+
 /// Stack struct keeps track of the information we need to remember from nodes
 /// shallower and deeper in the tree during the search. Each search thread has
 /// its own array of Stack objects, indexed by the current ply.
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 4f9fab5e..0e561416 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -42,6 +42,11 @@ void on_threads(const Option& o) { Threads.set(size_t(o)); }
 void on_tb_path(const Option& o) { Tablebases::init(o); }
 void on_use_NNUE(const Option& ) { Eval::init_NNUE(); }
 void on_eval_file(const Option& ) { Eval::init_NNUE(); }
+#ifdef EVAL_LEARN
+void on_prune_at_shallow_depth_on_pv_node(const Option& o) {
+  Search::prune_at_shallow_depth_on_pv_node = o;
+}
+#endif
 
 /// Our case insensitive less() function as required by UCI protocol
 bool CaseInsensitiveLess::operator() (const string& s1, const string& s2) const {
@@ -69,7 +74,6 @@ void init(OptionsMap& o) {
   o["Move Overhead"]         << Option(10, 0, 5000);
   o["Slow Mover"]            << Option(100, 10, 1000);
   o["nodestime"]             << Option(0, 0, 10000);
-  o["Training"]              << Option(false);
   o["UCI_Chess960"]          << Option(false);
   o["UCI_AnalyseMode"]       << Option(false);
   o["UCI_LimitStrength"]     << Option(false);
@@ -96,6 +100,8 @@ void init(OptionsMap& o) {
   // Evalsave by default. This folder shall be prepared in advance.
   // Automatically create a folder under this folder like "0/", "1/", ... and save the evaluation function file there.
   o["EvalSaveDir"] << Option("evalsave");
+  // Prune at shallow depth on PV nodes. Setting this value to true gains elo in shallow search.
+  o["PruneAtShallowDepthOnPvNode"] << Option(false, on_prune_at_shallow_depth_on_pv_node);
 #endif
 }
 

From 94f3cae760f0ed6ab464cf8febd79ebe9925b53a Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Thu, 10 Sep 2020 08:23:21 +0900
Subject: [PATCH 242/583] Changed a sentence.

---
 src/search.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/search.cpp b/src/search.cpp
index 6fbfdedf..b92ea7c8 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1012,7 +1012,7 @@ moves_loop: // When in check, search starts from here
       // Step 12. Pruning at shallow depth (~200 Elo)
       if (  !rootNode
 #ifdef EVAL_LEARN
-          && !(!prune_at_shallow_depth_on_pv_node && PvNode)
+          && (PvNode ? prune_at_shallow_depth_on_pv_node : true)
 #endif
           && pos.non_pawn_material(us)
           && bestValue > VALUE_TB_LOSS_IN_MAX_PLY)

From 020e66d2e63acdbd5449de5f39e99c7e2bcb2551 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 9 Sep 2020 22:36:40 +0200
Subject: [PATCH 243/583] Add "sfen_format" option in gensfen. Valid values are
 "bin" and "binpack". It determines the output format of the sfens. Binpack is
 a highly compressed formats for consecutive sfens. Extension is now
 determined by the used format, output_file_name should contain just the stem.

---
 src/extra/nnue_data_binpack_format.h | 7469 ++++++++++++++++++++++++++
 src/learn/gensfen.cpp                |  129 +-
 2 files changed, 7587 insertions(+), 11 deletions(-)
 create mode 100644 src/extra/nnue_data_binpack_format.h

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
new file mode 100644
index 00000000..9f810a3b
--- /dev/null
+++ b/src/extra/nnue_data_binpack_format.h
@@ -0,0 +1,7469 @@
+#pragma once
+
+#include <cstdio>
+#include <cassert>
+#include <string>
+#include <string_view>
+#include <vector>
+#include <memory>
+#include <fstream>
+#include <cstring>
+#include <iostream>
+#include <set>
+#include <cstdio>
+#include <cassert>
+#include <array>
+#include <immintrin.h>
+#include <intrin.h>
+#include <nmmintrin.h>
+#include <limits>
+
+
+namespace chess
+{
+    #if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__)
+
+    #define FORCEINLINE __attribute__((always_inline))
+
+    #elif defined(_MSC_VER)
+
+    // NOTE: for some reason it breaks the profiler a little
+    //       keep it on only when not profiling.
+    //#define FORCEINLINE __forceinline
+    #define FORCEINLINE
+
+    #else
+
+    #define FORCEINLINE inline
+
+    #endif
+
+    #if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__)
+
+    #define NOINLINE __attribute__((noinline))
+
+    #elif defined(_MSC_VER)
+
+    #define NOINLINE __declspec(noinline)
+
+    #else
+
+    #define NOINLINE
+
+    #endif
+
+    namespace intrin
+    {
+        [[nodiscard]] constexpr int popcount_constexpr(std::uint64_t value)
+        {
+            int r = 0;
+            while (value)
+            {
+                value &= value - 1;
+                ++r;
+            }
+            return r;
+        }
+
+        [[nodiscard]] constexpr int lsb_constexpr(std::uint64_t value)
+        {
+            int c = 0;
+            value &= ~value + 1; // leave only the lsb
+            if ((value & 0x00000000FFFFFFFFull) == 0) c += 32;
+            if ((value & 0x0000FFFF0000FFFFull) == 0) c += 16;
+            if ((value & 0x00FF00FF00FF00FFull) == 0) c += 8;
+            if ((value & 0x0F0F0F0F0F0F0F0Full) == 0) c += 4;
+            if ((value & 0x3333333333333333ull) == 0) c += 2;
+            if ((value & 0x5555555555555555ull) == 0) c += 1;
+            return c;
+        }
+
+        [[nodiscard]] constexpr int msb_constexpr(std::uint64_t value)
+        {
+            int c = 63;
+            if ((value & 0xFFFFFFFF00000000ull) == 0) { c -= 32; value <<= 32; }
+            if ((value & 0xFFFF000000000000ull) == 0) { c -= 16; value <<= 16; }
+            if ((value & 0xFF00000000000000ull) == 0) { c -= 8; value <<= 8; }
+            if ((value & 0xF000000000000000ull) == 0) { c -= 4; value <<= 4; }
+            if ((value & 0xC000000000000000ull) == 0) { c -= 2; value <<= 2; }
+            if ((value & 0x8000000000000000ull) == 0) { c -= 1; }
+            return c;
+        }
+    }
+
+    namespace intrin
+    {
+        [[nodiscard]] inline int popcount(std::uint64_t b)
+        {
+    #if (defined(_MSC_VER) || defined(__INTEL_COMPILER)) && !defined(__clang__)
+
+            return static_cast<int>(_mm_popcnt_u64(b));
+
+    #else
+
+            return static_cast<int>(__builtin_popcountll(b));
+
+    #endif
+        }
+
+    #if defined(_MSC_VER) && !defined(__clang__)
+
+        [[nodiscard]] inline int lsb(std::uint64_t value)
+        {
+            assert(value != 0);
+
+            unsigned long idx;
+            _BitScanForward64(&idx, value);
+            return static_cast<int>(idx);
+        }
+
+        [[nodiscard]] inline int msb(std::uint64_t value)
+        {
+            assert(value != 0);
+
+            unsigned long idx;
+            _BitScanReverse64(&idx, value);
+            return static_cast<int>(idx);
+        }
+
+    #else
+
+        [[nodiscard]] inline int lsb(std::uint64_t value)
+        {
+            assert(value != 0);
+
+            return __builtin_ctzll(value);
+        }
+
+        [[nodiscard]] inline int msb(std::uint64_t value)
+        {
+            assert(value != 0);
+
+            return 63 ^ __builtin_clzll(value);
+        }
+
+    #endif
+    }
+
+
+    template <typename IntT>
+    [[nodiscard]] constexpr IntT mulSaturate(IntT lhs, IntT rhs)
+    {
+        static_assert(std::is_unsigned_v<IntT>); // currently no support for signed
+
+    #if defined (_MSC_VER)
+
+        if (lhs == 0) return 0;
+
+        const IntT result = lhs * rhs;
+        return result / lhs == rhs ? result : std::numeric_limits<IntT>::max();
+
+    #elif defined (__GNUC__)
+
+        IntT result{};
+        return __builtin_mul_overflow(lhs, rhs, &result) ? std::numeric_limits<IntT>::max() : result;
+
+    #endif
+    }
+
+    template <typename IntT>
+    [[nodiscard]] constexpr IntT addSaturate(IntT lhs, IntT rhs)
+    {
+        static_assert(std::is_unsigned_v<IntT>); // currently no support for signed
+
+    #if defined (_MSC_VER)
+
+        const IntT result = lhs + rhs;
+        return result >= lhs ? result : std::numeric_limits<IntT>::max();
+
+    #elif defined (__GNUC__)
+
+        IntT result{};
+        return __builtin_add_overflow(lhs, rhs, &result) ? std::numeric_limits<IntT>::max() : result;
+
+    #endif
+    }
+
+    template <typename IntT>
+    [[nodiscard]] constexpr bool addOverflows(IntT lhs, IntT rhs)
+    {
+    #if defined (_MSC_VER)
+
+        return static_cast<IntT>(lhs + rhs) < lhs;
+
+    #elif defined (__GNUC__)
+
+        IntT result{};
+        __builtin_add_overflow(lhs, rhs, &result);
+        return result;
+
+    #endif
+    }
+
+    template <typename IntT>
+    [[nodiscard]] constexpr IntT floorLog2(IntT value)
+    {
+        return intrin::msb_constexpr(value);
+    }
+
+    template <typename IntT>
+    constexpr std::size_t maxFibonacciNumberIndexForType()
+    {
+        static_assert(std::is_unsigned_v<IntT>);
+
+        switch (sizeof(IntT))
+        {
+        case 8:
+            return 93;
+        case 4:
+            return 47;
+        case 2:
+            return 24;
+        case 1:
+            return 13;
+        }
+
+        return 0;
+    }
+
+    template <typename IntT>
+    constexpr auto computeMasks()
+    {
+        static_assert(std::is_unsigned_v<IntT>);
+
+        constexpr std::size_t numBits = sizeof(IntT) * CHAR_BIT;
+        std::array<IntT, numBits + 1u> nbitmasks{};
+
+        for (std::size_t i = 0; i < numBits; ++i)
+        {
+            nbitmasks[i] = (static_cast<IntT>(1u) << i) - 1u;
+        }
+        nbitmasks[numBits] = ~static_cast<IntT>(0u);
+
+        return nbitmasks;
+    }
+
+    template <typename IntT>
+    constexpr auto nbitmask = computeMasks<IntT>();
+
+    template <typename IntT>
+    constexpr auto computeFibonacciNumbers()
+    {
+        constexpr std::size_t size = maxFibonacciNumberIndexForType<IntT>() + 1;
+        std::array<IntT, size> numbers{};
+        numbers[0] = 0;
+        numbers[1] = 1;
+
+        for (std::size_t i = 2; i < size; ++i)
+        {
+            numbers[i] = numbers[i - 1] + numbers[i - 2];
+        }
+
+        return numbers;
+    }
+
+    // F(0) = 0, F(1) = 1
+    template <typename IntT>
+    constexpr auto fibonacciNumbers = computeFibonacciNumbers<IntT>();
+
+    template <std::size_t N, typename FromT, typename ToT = std::make_signed_t<FromT>>
+    inline ToT signExtend(FromT value)
+    {
+        static_assert(std::is_signed_v<ToT>);
+        static_assert(std::is_unsigned_v<FromT>);
+        static_assert(sizeof(ToT) == sizeof(FromT));
+
+        constexpr std::size_t totalBits = sizeof(FromT) * CHAR_BIT;
+
+        static_assert(N > 0 && N <= totalBits);
+
+        constexpr std::size_t unusedBits = totalBits - N;
+        if constexpr (ToT(~FromT(0)) >> 1 == ToT(~FromT(0)))
+        {
+            return ToT(value << unusedBits) >> ToT(unusedBits);
+        }
+        else
+        {
+            constexpr FromT mask = (~FromT(0)) >> unusedBits;
+            value &= mask;
+            if (value & (FromT(1) << (N - 1)))
+            {
+                value |= ~mask;
+            }
+            return static_cast<ToT>(value);
+        }
+    }
+
+    namespace lookup
+    {
+        constexpr int nthSetBitIndexNaive(std::uint64_t value, int n)
+        {
+            for (int i = 0; i < n; ++i)
+            {
+                value &= value - 1;
+            }
+            return intrin::lsb_constexpr(value);
+        }
+
+        constexpr std::array<std::array<std::uint8_t, 8>, 256> nthSetBitIndex = []()
+        {
+            std::array<std::array<std::uint8_t, 8>, 256> t{};
+
+            for (int i = 0; i < 256; ++i)
+            {
+                for (int j = 0; j < 8; ++j)
+                {
+                    t[i][j] = nthSetBitIndexNaive(i, j);
+                }
+            }
+
+            return t;
+        }();
+    }
+
+    inline int nthSetBitIndex(std::uint64_t v, std::uint64_t n)
+    {
+        std::uint64_t shift = 0;
+
+        std::uint64_t p = intrin::popcount(v & 0xFFFFFFFFull);
+        std::uint64_t pmask = static_cast<std::uint64_t>(p > n) - 1ull;
+        v >>= 32 & pmask;
+        shift += 32 & pmask;
+        n -= p & pmask;
+
+        p = intrin::popcount(v & 0xFFFFull);
+        pmask = static_cast<std::uint64_t>(p > n) - 1ull;
+        v >>= 16 & pmask;
+        shift += 16 & pmask;
+        n -= p & pmask;
+
+        p = intrin::popcount(v & 0xFFull);
+        pmask = static_cast<std::uint64_t>(p > n) - 1ull;
+        shift += 8 & pmask;
+        v >>= 8 & pmask;
+        n -= p & pmask;
+
+        return static_cast<int>(lookup::nthSetBitIndex[v & 0xFFull][n] + shift);
+    }
+
+    namespace util
+    {
+        inline std::size_t usedBits(std::size_t value)
+        {
+            if (value == 0) return 0;
+            return intrin::msb(value) + 1;
+        }
+    }
+
+    template <typename EnumT>
+    struct EnumTraits;
+
+    template <typename EnumT>
+    [[nodiscard]] constexpr auto hasEnumTraits() -> decltype(EnumTraits<EnumT>::cardinaliy, bool{})
+    {
+        return true;
+    }
+
+    template <typename EnumT>
+    [[nodiscard]] constexpr bool hasEnumTraits(...)
+    {
+        return false;
+    }
+
+    template <typename EnumT>
+    [[nodiscard]] constexpr bool isNaturalIndex() noexcept
+    {
+        return EnumTraits<EnumT>::isNaturalIndex;
+    }
+
+    template <typename EnumT>
+    [[nodiscard]] constexpr int cardinality() noexcept
+    {
+        return EnumTraits<EnumT>::cardinality;
+    }
+
+    template <typename EnumT>
+    [[nodiscard]] constexpr const std::array<EnumT, cardinality<EnumT>()>& values() noexcept
+    {
+        return EnumTraits<EnumT>::values;
+    }
+
+    template <typename EnumT>
+    [[nodiscard]] constexpr EnumT fromOrdinal(int id) noexcept
+    {
+        assert(!EnumTraits<EnumT>::isNaturalIndex || (id >= 0 && id < EnumTraits<EnumT>::cardinality));
+
+        return EnumTraits<EnumT>::fromOrdinal(id);
+    }
+
+    template <typename EnumT>
+    [[nodiscard]] constexpr typename EnumTraits<EnumT>::IdType ordinal(EnumT v) noexcept
+    {
+        return EnumTraits<EnumT>::ordinal(v);
+    }
+
+    template <typename EnumT, typename... ArgsTs, typename SFINAE = std::enable_if_t<hasEnumTraits<EnumT>()>>
+    [[nodiscard]] constexpr decltype(auto) toString(EnumT v, ArgsTs&&... args)
+    {
+        return EnumTraits<EnumT>::toString(v, std::forward<ArgsTs>(args)...);
+    }
+
+    template <typename EnumT>
+    [[nodiscard]] constexpr decltype(auto) toString(EnumT v)
+    {
+        return EnumTraits<EnumT>::toString(v);
+    }
+
+    template <typename EnumT, typename FormatT, typename SFINAE = std::enable_if_t<!hasEnumTraits<FormatT>()>>
+    [[nodiscard]] constexpr decltype(auto) toString(FormatT&& f, EnumT v)
+    {
+        return EnumTraits<EnumT>::toString(std::forward<FormatT>(f), v);
+    }
+
+    template <typename EnumT>
+    [[nodiscard]] constexpr decltype(auto) toChar(EnumT v)
+    {
+        return EnumTraits<EnumT>::toChar(v);
+    }
+
+    template <typename EnumT, typename FormatT>
+    [[nodiscard]] constexpr decltype(auto) toChar(FormatT&& f, EnumT v)
+    {
+        return EnumTraits<EnumT>::toChar(std::forward<FormatT>(f), v);
+    }
+
+    template <typename EnumT, typename... ArgsTs>
+    [[nodiscard]] constexpr decltype(auto) fromString(ArgsTs&& ... args)
+    {
+        return EnumTraits<EnumT>::fromString(std::forward<ArgsTs>(args)...);
+    }
+
+    template <typename EnumT, typename... ArgsTs>
+    [[nodiscard]] constexpr decltype(auto) fromChar(ArgsTs&& ... args)
+    {
+        return EnumTraits<EnumT>::fromChar(std::forward<ArgsTs>(args)...);
+    }
+
+    template <>
+    struct EnumTraits<bool>
+    {
+        using IdType = int;
+        using EnumType = bool;
+
+        static constexpr int cardinality = 2;
+        static constexpr bool isNaturalIndex = true;
+
+        static constexpr std::array<EnumType, cardinality> values{
+            false,
+            true
+        };
+
+        [[nodiscard]] static constexpr int ordinal(EnumType c) noexcept
+        {
+            return static_cast<IdType>(c);
+        }
+
+        [[nodiscard]] static constexpr EnumType fromOrdinal(IdType id) noexcept
+        {
+            return static_cast<EnumType>(id);
+        }
+    };
+
+    template <typename EnumT, typename ValueT, std::size_t SizeV = cardinality<EnumT>()>
+    struct EnumArray
+    {
+        static_assert(isNaturalIndex<EnumT>(), "Enum must start with 0 and end with cardinality-1.");
+
+        using value_type      = ValueT;
+        using size_type       = std::size_t;
+        using difference_type = std::ptrdiff_t;
+        using pointer         = ValueT *;
+        using const_pointer   = const ValueT*;
+        using reference       = ValueT &;
+        using const_reference = const ValueT &;
+
+        using iterator       = pointer;
+        using const_iterator = const_pointer;
+
+        using reverse_iterator       = std::reverse_iterator<iterator>;
+        using const_reverse_iterator = std::reverse_iterator<const_iterator>;
+
+        using KeyType = EnumT;
+        using ValueType = ValueT;
+
+        constexpr void fill(const ValueType& init)
+        {
+            for (auto& v : elements)
+            {
+                v = init;
+            }
+        }
+
+        [[nodiscard]] constexpr ValueType& operator[](const KeyType& dir)
+        {
+            assert(ordinal(dir) < SizeV);
+
+            return elements[ordinal(dir)];
+        }
+
+        [[nodiscard]] constexpr const ValueType& operator[](const KeyType& dir) const
+        {
+            assert(ordinal(dir) < SizeV);
+
+            return elements[ordinal(dir)];
+        }
+
+        [[nodiscard]] constexpr ValueType& front()
+        {
+            return elements[0];
+        }
+
+        [[nodiscard]] constexpr const ValueType& front() const
+        {
+            return elements[0];
+        }
+
+        [[nodiscard]] constexpr ValueType& back()
+        {
+            return elements[SizeV - 1];
+        }
+
+        [[nodiscard]] constexpr const ValueType& back() const
+        {
+            return elements[SizeV - 1];
+        }
+
+        [[nodiscard]] constexpr pointer data()
+        {
+            return elements;
+        }
+
+        [[nodiscard]] constexpr const_pointer data() const
+        {
+            return elements;
+        }
+
+        [[nodiscard]] constexpr iterator begin() noexcept
+        {
+            return elements;
+        }
+
+        [[nodiscard]] constexpr const_iterator begin() const noexcept
+        {
+            return elements;
+        }
+
+        [[nodiscard]] constexpr iterator end() noexcept
+        {
+            return elements + SizeV;
+        }
+
+        [[nodiscard]] constexpr const_iterator end() const noexcept
+        {
+            return elements + SizeV;
+        }
+
+        [[nodiscard]] constexpr reverse_iterator rbegin() noexcept
+        {
+            return reverse_iterator(end());
+        }
+
+        [[nodiscard]] constexpr const_reverse_iterator rbegin() const noexcept
+        {
+            return const_reverse_iterator(end());
+        }
+
+        [[nodiscard]] constexpr reverse_iterator rend() noexcept
+        {
+            return reverse_iterator(begin());
+        }
+
+        [[nodiscard]] constexpr const_reverse_iterator rend() const noexcept
+        {
+            return const_reverse_iterator(begin());
+        }
+
+        [[nodiscard]] constexpr const_iterator cbegin() const noexcept
+        {
+            return begin();
+        }
+
+        [[nodiscard]] constexpr const_iterator cend() const noexcept
+        {
+            return end();
+        }
+
+        [[nodiscard]] constexpr const_reverse_iterator crbegin() const noexcept
+        {
+            return rbegin();
+        }
+
+        [[nodiscard]] constexpr const_reverse_iterator crend() const noexcept
+        {
+            return rend();
+        }
+
+        [[nodiscard]] constexpr size_type size() const noexcept
+        {
+            return SizeV;
+        }
+
+        ValueT elements[SizeV];
+    };
+
+    template <typename Enum1T, typename Enum2T, typename ValueT, std::size_t Size1V = cardinality<Enum1T>(), std::size_t Size2V = cardinality<Enum2T>()>
+    using EnumArray2 = EnumArray<Enum1T, EnumArray<Enum2T, ValueT, Size2V>, Size1V>;
+
+    enum struct Color : std::uint8_t
+    {
+        White,
+        Black
+    };
+
+    template <>
+    struct EnumTraits<Color>
+    {
+        using IdType = int;
+        using EnumType = Color;
+
+        static constexpr int cardinality = 2;
+        static constexpr bool isNaturalIndex = true;
+
+        static constexpr std::array<EnumType, cardinality> values{
+            Color::White,
+            Color::Black
+        };
+
+        [[nodiscard]] static constexpr int ordinal(EnumType c) noexcept
+        {
+            return static_cast<IdType>(c);
+        }
+
+        [[nodiscard]] static constexpr EnumType fromOrdinal(IdType id) noexcept
+        {
+            assert(id >= 0 && id < cardinality);
+
+            return static_cast<EnumType>(id);
+        }
+
+        [[nodiscard]] static constexpr std::string_view toString(EnumType c) noexcept
+        {
+            return std::string_view("wb" + ordinal(c), 1);
+        }
+
+        [[nodiscard]] static constexpr char toChar(EnumType c) noexcept
+        {
+            return "wb"[ordinal(c)];
+        }
+
+        [[nodiscard]] static constexpr std::optional<Color> fromChar(char c) noexcept
+        {
+            if (c == 'w') return Color::White;
+            if (c == 'b') return Color::Black;
+
+            return {};
+        }
+
+        [[nodiscard]] static constexpr std::optional<Color> fromString(std::string_view sv) noexcept
+        {
+            if (sv.size() != 1) return {};
+
+            return fromChar(sv[0]);
+        }
+    };
+
+    constexpr Color operator!(Color c)
+    {
+        return fromOrdinal<Color>(ordinal(c) ^ 1);
+    }
+
+    enum struct PieceType : std::uint8_t
+    {
+        Pawn,
+        Knight,
+        Bishop,
+        Rook,
+        Queen,
+        King,
+
+        None
+    };
+
+    template <>
+    struct EnumTraits<PieceType>
+    {
+        using IdType = int;
+        using EnumType = PieceType;
+
+        static constexpr int cardinality = 7;
+        static constexpr bool isNaturalIndex = true;
+
+        static constexpr std::array<EnumType, cardinality> values{
+            PieceType::Pawn,
+            PieceType::Knight,
+            PieceType::Bishop,
+            PieceType::Rook,
+            PieceType::Queen,
+            PieceType::King,
+            PieceType::None
+        };
+
+        [[nodiscard]] static constexpr int ordinal(EnumType c) noexcept
+        {
+            return static_cast<IdType>(c);
+        }
+
+        [[nodiscard]] static constexpr EnumType fromOrdinal(IdType id) noexcept
+        {
+            assert(id >= 0 && id < cardinality);
+
+            return static_cast<EnumType>(id);
+        }
+
+        [[nodiscard]] static constexpr std::string_view toString(EnumType p, Color c) noexcept
+        {
+            return std::string_view("PpNnBbRrQqKk " + (chess::ordinal(p) * 2 + chess::ordinal(c)), 1);
+        }
+
+        [[nodiscard]] static constexpr char toChar(EnumType p, Color c) noexcept
+        {
+            return "PpNnBbRrQqKk "[chess::ordinal(p) * 2 + chess::ordinal(c)];
+        }
+
+        [[nodiscard]] static constexpr std::optional<PieceType> fromChar(char c) noexcept
+        {
+            auto it = std::string_view("PpNnBbRrQqKk ").find(c);
+            if (it == std::string::npos) return {};
+            else return static_cast<PieceType>(it/2);
+        }
+
+        [[nodiscard]] static constexpr std::optional<PieceType> fromString(std::string_view sv) noexcept
+        {
+            if (sv.size() != 1) return {};
+
+            return fromChar(sv[0]);
+        }
+    };
+
+    struct Piece
+    {
+        [[nodiscard]] static constexpr Piece fromId(int id)
+        {
+            return Piece(id);
+        }
+
+        [[nodiscard]] static constexpr Piece none()
+        {
+            return Piece(PieceType::None, Color::White);
+        }
+
+        constexpr Piece() noexcept :
+            Piece(PieceType::None, Color::White)
+        {
+
+        }
+
+        constexpr Piece(PieceType type, Color color) noexcept :
+            m_id((ordinal(type) << 1) | ordinal(color))
+        {
+            assert(type != PieceType::None || color == Color::White);
+        }
+
+        constexpr Piece& operator=(const Piece& other) = default;
+
+        [[nodiscard]] constexpr friend bool operator==(Piece lhs, Piece rhs) noexcept
+        {
+            return lhs.m_id == rhs.m_id;
+        }
+
+        [[nodiscard]] constexpr friend bool operator!=(Piece lhs, Piece rhs) noexcept
+        {
+            return !(lhs == rhs);
+        }
+
+        [[nodiscard]] constexpr PieceType type() const
+        {
+            return fromOrdinal<PieceType>(m_id >> 1);
+        }
+
+        [[nodiscard]] constexpr Color color() const
+        {
+            return fromOrdinal<Color>(m_id & 1);
+        }
+
+        [[nodiscard]] constexpr std::pair<PieceType, Color> parts() const
+        {
+            return std::make_pair(type(), color());
+        }
+
+        [[nodiscard]] constexpr explicit operator int() const
+        {
+            return static_cast<int>(m_id);
+        }
+
+    private:
+        constexpr Piece(int id) :
+            m_id(id)
+        {
+        }
+
+        std::uint8_t m_id; // lowest bit is a color, 7 highest bits are a piece type
+    };
+
+    [[nodiscard]] constexpr Piece operator|(PieceType type, Color color) noexcept
+    {
+        return Piece(type, color);
+    }
+
+    [[nodiscard]] constexpr Piece operator|(Color color, PieceType type) noexcept
+    {
+        return Piece(type, color);
+    }
+
+    constexpr Piece whitePawn = Piece(PieceType::Pawn, Color::White);
+    constexpr Piece whiteKnight = Piece(PieceType::Knight, Color::White);
+    constexpr Piece whiteBishop = Piece(PieceType::Bishop, Color::White);
+    constexpr Piece whiteRook = Piece(PieceType::Rook, Color::White);
+    constexpr Piece whiteQueen = Piece(PieceType::Queen, Color::White);
+    constexpr Piece whiteKing = Piece(PieceType::King, Color::White);
+
+    constexpr Piece blackPawn = Piece(PieceType::Pawn, Color::Black);
+    constexpr Piece blackKnight = Piece(PieceType::Knight, Color::Black);
+    constexpr Piece blackBishop = Piece(PieceType::Bishop, Color::Black);
+    constexpr Piece blackRook = Piece(PieceType::Rook, Color::Black);
+    constexpr Piece blackQueen = Piece(PieceType::Queen, Color::Black);
+    constexpr Piece blackKing = Piece(PieceType::King, Color::Black);
+
+    static_assert(Piece::none().type() == PieceType::None);
+
+    template <>
+    struct EnumTraits<Piece>
+    {
+        using IdType = int;
+        using EnumType = Piece;
+
+        static constexpr int cardinality = 13;
+        static constexpr bool isNaturalIndex = true;
+
+        static constexpr std::array<EnumType, cardinality> values{
+            whitePawn,
+            blackPawn,
+            whiteKnight,
+            blackKnight,
+            whiteBishop,
+            blackBishop,
+            whiteRook,
+            blackRook,
+            whiteQueen,
+            blackQueen,
+            whiteKing,
+            blackKing,
+            Piece::none()
+        };
+
+        [[nodiscard]] static constexpr int ordinal(EnumType c) noexcept
+        {
+            return static_cast<IdType>(c);
+        }
+
+        [[nodiscard]] static constexpr EnumType fromOrdinal(int id) noexcept
+        {
+            assert(id >= 0 && id < cardinality);
+
+            return Piece::fromId(id);
+        }
+
+        [[nodiscard]] static constexpr std::string_view toString(EnumType p) noexcept
+        {
+            return std::string_view("PpNnBbRrQqKk " + ordinal(p), 1);
+        }
+
+        [[nodiscard]] static constexpr char toChar(EnumType p) noexcept
+        {
+            return "PpNnBbRrQqKk "[ordinal(p)];
+        }
+
+        [[nodiscard]] static constexpr std::optional<Piece> fromChar(char c) noexcept
+        {
+            auto it = std::string_view("PpNnBbRrQqKk ").find(c);
+            if (it == std::string::npos) return {};
+            else return Piece::fromId(static_cast<int>(it));
+        }
+
+        [[nodiscard]] static constexpr std::optional<Piece> fromString(std::string_view sv) noexcept
+        {
+            if (sv.size() != 1) return {};
+
+            return fromChar(sv[0]);
+        }
+    };
+
+    template <typename TagT>
+    struct Coord
+    {
+        constexpr Coord() noexcept :
+            m_i(0)
+        {
+        }
+
+        constexpr explicit Coord(int i) noexcept :
+            m_i(i)
+        {
+        }
+
+        [[nodiscard]] constexpr explicit operator int() const
+        {
+            return static_cast<int>(m_i);
+        }
+
+        constexpr friend Coord& operator++(Coord& c)
+        {
+            ++c.m_i;
+            return c;
+        }
+
+        constexpr friend Coord& operator--(Coord& c)
+        {
+            --c.m_i;
+            return c;
+        }
+
+        constexpr friend Coord& operator+=(Coord& c, int d)
+        {
+            c.m_i += d;
+            return c;
+        }
+
+        constexpr friend Coord& operator-=(Coord& c, int d)
+        {
+            c.m_i -= d;
+            return c;
+        }
+
+        constexpr friend Coord operator+(const Coord& c, int d)
+        {
+            Coord cpy(c);
+            cpy += d;
+            return cpy;
+        }
+
+        constexpr friend Coord operator-(const Coord& c, int d)
+        {
+            Coord cpy(c);
+            cpy -= d;
+            return cpy;
+        }
+
+        constexpr friend int operator-(const Coord& c1, const Coord& c2)
+        {
+            return c1.m_i - c2.m_i;
+        }
+
+        [[nodiscard]] constexpr friend bool operator==(const Coord& c1, const Coord& c2) noexcept
+        {
+            return c1.m_i == c2.m_i;
+        }
+
+        [[nodiscard]] constexpr friend bool operator!=(const Coord& c1, const Coord& c2) noexcept
+        {
+            return c1.m_i != c2.m_i;
+        }
+
+        [[nodiscard]] constexpr friend bool operator<(const Coord& c1, const Coord& c2) noexcept
+        {
+            return c1.m_i < c2.m_i;
+        }
+
+        [[nodiscard]] constexpr friend bool operator<=(const Coord& c1, const Coord& c2) noexcept
+        {
+            return c1.m_i <= c2.m_i;
+        }
+
+        [[nodiscard]] constexpr friend bool operator>(const Coord& c1, const Coord& c2) noexcept
+        {
+            return c1.m_i > c2.m_i;
+        }
+
+        [[nodiscard]] constexpr friend bool operator>=(const Coord& c1, const Coord& c2) noexcept
+        {
+            return c1.m_i >= c2.m_i;
+        }
+
+    private:
+        std::int8_t m_i;
+    };
+
+    struct FileTag;
+    struct RankTag;
+    using File = Coord<FileTag>;
+    using Rank = Coord<RankTag>;
+
+    constexpr File fileA = File(0);
+    constexpr File fileB = File(1);
+    constexpr File fileC = File(2);
+    constexpr File fileD = File(3);
+    constexpr File fileE = File(4);
+    constexpr File fileF = File(5);
+    constexpr File fileG = File(6);
+    constexpr File fileH = File(7);
+
+    constexpr Rank rank1 = Rank(0);
+    constexpr Rank rank2 = Rank(1);
+    constexpr Rank rank3 = Rank(2);
+    constexpr Rank rank4 = Rank(3);
+    constexpr Rank rank5 = Rank(4);
+    constexpr Rank rank6 = Rank(5);
+    constexpr Rank rank7 = Rank(6);
+    constexpr Rank rank8 = Rank(7);
+
+    template <>
+    struct EnumTraits<File>
+    {
+        using IdType = int;
+        using EnumType = File;
+
+        static constexpr int cardinality = 8;
+        static constexpr bool isNaturalIndex = true;
+
+        [[nodiscard]] static constexpr int ordinal(EnumType c) noexcept
+        {
+            return static_cast<IdType>(c);
+        }
+
+        [[nodiscard]] static constexpr EnumType fromOrdinal(IdType id) noexcept
+        {
+            assert(id >= 0 && id < cardinality);
+
+            return static_cast<EnumType>(id);
+        }
+
+        [[nodiscard]] static constexpr std::string_view toString(EnumType c) noexcept
+        {
+            assert(ordinal(c) >= 0 && ordinal(c) < 8);
+
+            return std::string_view("abcdefgh" + ordinal(c), 1);
+        }
+
+        [[nodiscard]] static constexpr std::optional<File> fromChar(char c) noexcept
+        {
+            if (c < 'a' || c > 'h') return {};
+            return static_cast<File>(c - 'a');
+        }
+
+        [[nodiscard]] static constexpr std::optional<File> fromString(std::string_view sv) noexcept
+        {
+            if (sv.size() != 1) return {};
+
+            return fromChar(sv[0]);
+        }
+    };
+
+    template <>
+    struct EnumTraits<Rank>
+    {
+        using IdType = int;
+        using EnumType = Rank;
+
+        static constexpr int cardinality = 8;
+        static constexpr bool isNaturalIndex = true;
+
+        [[nodiscard]] static constexpr int ordinal(EnumType c) noexcept
+        {
+            return static_cast<IdType>(c);
+        }
+
+        [[nodiscard]] static constexpr EnumType fromOrdinal(IdType id) noexcept
+        {
+            assert(id >= 0 && id < cardinality);
+
+            return static_cast<EnumType>(id);
+        }
+
+        [[nodiscard]] static constexpr std::string_view toString(EnumType c) noexcept
+        {
+            assert(ordinal(c) >= 0 && ordinal(c) < 8);
+
+            return std::string_view("12345678" + ordinal(c), 1);
+        }
+
+        [[nodiscard]] static constexpr std::optional<Rank> fromChar(char c) noexcept
+        {
+            if (c < '1' || c > '8') return {};
+            return static_cast<Rank>(c - '1');
+        }
+
+        [[nodiscard]] static constexpr std::optional<Rank> fromString(std::string_view sv) noexcept
+        {
+            if (sv.size() != 1) return {};
+
+            return fromChar(sv[0]);
+        }
+    };
+
+    // files east
+    // ranks north
+    struct FlatSquareOffset
+    {
+        std::int8_t value;
+
+        constexpr FlatSquareOffset() noexcept :
+            value(0)
+        {
+        }
+
+        constexpr FlatSquareOffset(int files, int ranks) noexcept :
+            value(files + ranks * cardinality<File>())
+        {
+            assert(files + ranks * cardinality<File>() >= std::numeric_limits<std::int8_t>::min());
+            assert(files + ranks * cardinality<File>() <= std::numeric_limits<std::int8_t>::max());
+        }
+
+        constexpr FlatSquareOffset operator-() const noexcept
+        {
+            return FlatSquareOffset(-value);
+        }
+
+    private:
+        constexpr FlatSquareOffset(int v) noexcept :
+            value(v)
+        {
+        }
+    };
+
+    struct Offset
+    {
+        std::int8_t files;
+        std::int8_t ranks;
+
+        constexpr Offset() :
+            files(0),
+            ranks(0)
+        {
+        }
+
+        constexpr Offset(int files, int ranks) :
+            files(files),
+            ranks(ranks)
+        {
+        }
+
+        [[nodiscard]] constexpr FlatSquareOffset flat() const
+        {
+            return { files, ranks };
+        }
+
+        [[nodiscard]] constexpr Offset operator-() const
+        {
+            return { -files, -ranks };
+        }
+    };
+
+    struct SquareCoords
+    {
+        File file;
+        Rank rank;
+
+        constexpr SquareCoords() noexcept :
+            file{},
+            rank{}
+        {
+        }
+
+        constexpr SquareCoords(File f, Rank r) noexcept :
+            file(f),
+            rank(r)
+        {
+        }
+
+        constexpr friend SquareCoords& operator+=(SquareCoords& c, Offset offset)
+        {
+            c.file += offset.files;
+            c.rank += offset.ranks;
+            return c;
+        }
+
+        [[nodiscard]] constexpr friend SquareCoords operator+(const SquareCoords& c, Offset offset)
+        {
+            SquareCoords cpy(c);
+            cpy.file += offset.files;
+            cpy.rank += offset.ranks;
+            return cpy;
+        }
+
+        [[nodiscard]] constexpr bool isOk() const
+        {
+            return file >= fileA && file <= fileH && rank >= rank1 && rank <= rank8;
+        }
+    };
+
+    struct Square
+    {
+    private:
+        static constexpr std::int8_t m_noneId = cardinality<Rank>() * cardinality<File>();
+
+        static constexpr std::uint8_t fileMask = 0b111;
+        static constexpr std::uint8_t rankMask = 0b111000;
+        static constexpr std::uint8_t rankShift = 3;
+
+    public:
+        [[nodiscard]] static constexpr Square none()
+        {
+            return Square(m_noneId);
+        }
+
+        constexpr Square() noexcept :
+            m_id(0)
+        {
+        }
+
+        constexpr explicit Square(int idx) noexcept :
+            m_id(idx)
+        {
+            assert(isOk() || m_id == m_noneId);
+        }
+
+        constexpr Square(File file, Rank rank) noexcept :
+            m_id(ordinal(file) + ordinal(rank) * cardinality<File>())
+        {
+            assert(isOk());
+        }
+
+        constexpr explicit Square(SquareCoords coords) noexcept :
+            Square(coords.file, coords.rank)
+        {
+        }
+
+        [[nodiscard]] constexpr friend bool operator<(Square lhs, Square rhs) noexcept
+        {
+            return lhs.m_id < rhs.m_id;
+        }
+
+        [[nodiscard]] constexpr friend bool operator>(Square lhs, Square rhs) noexcept
+        {
+            return lhs.m_id > rhs.m_id;
+        }
+
+        [[nodiscard]] constexpr friend bool operator<=(Square lhs, Square rhs) noexcept
+        {
+            return lhs.m_id <= rhs.m_id;
+        }
+
+        [[nodiscard]] constexpr friend bool operator>=(Square lhs, Square rhs) noexcept
+        {
+            return lhs.m_id >= rhs.m_id;
+        }
+
+        [[nodiscard]] constexpr friend bool operator==(Square lhs, Square rhs) noexcept
+        {
+            return lhs.m_id == rhs.m_id;
+        }
+
+        [[nodiscard]] constexpr friend bool operator!=(Square lhs, Square rhs) noexcept
+        {
+            return !(lhs == rhs);
+        }
+
+        constexpr friend Square& operator++(Square& sq)
+        {
+            ++sq.m_id;
+            return sq;
+        }
+
+        constexpr friend Square& operator--(Square& sq)
+        {
+            --sq.m_id;
+            return sq;
+        }
+
+        [[nodiscard]] constexpr friend Square operator+(Square sq, FlatSquareOffset offset)
+        {
+            Square sqCpy = sq;
+            sqCpy += offset;
+            return sqCpy;
+        }
+
+        constexpr friend Square& operator+=(Square& sq, FlatSquareOffset offset)
+        {
+            assert(sq.m_id + offset.value >= 0 && sq.m_id + offset.value < Square::m_noneId);
+            sq.m_id += offset.value;
+            return sq;
+        }
+
+        [[nodiscard]] constexpr friend Square operator+(Square sq, Offset offset)
+        {
+            assert(sq.file() + offset.files >= fileA);
+            assert(sq.file() + offset.files <= fileH);
+            assert(sq.rank() + offset.ranks >= rank1);
+            assert(sq.rank() + offset.ranks <= rank8);
+            return operator+(sq, offset.flat());
+        }
+
+        constexpr friend Square& operator+=(Square& sq, Offset offset)
+        {
+            return operator+=(sq, offset.flat());
+        }
+
+        [[nodiscard]] constexpr explicit operator int() const
+        {
+            return m_id;
+        }
+
+        [[nodiscard]] constexpr File file() const
+        {
+            assert(isOk());
+            return File(static_cast<unsigned>(m_id) & fileMask);
+        }
+
+        [[nodiscard]] constexpr Rank rank() const
+        {
+            assert(isOk());
+            return Rank(static_cast<unsigned>(m_id) >> rankShift);
+        }
+
+        [[nodiscard]] constexpr SquareCoords coords() const
+        {
+            return { file(), rank() };
+        }
+
+        [[nodiscard]] constexpr Color color() const
+        {
+            assert(isOk());
+            return !fromOrdinal<Color>(ordinal(rank()) + ordinal(file()) & 1);
+        }
+
+        constexpr void flipVertically()
+        {
+            m_id ^= rankMask;
+        }
+
+        constexpr void flipHorizontally()
+        {
+            m_id ^= fileMask;
+        }
+
+        constexpr Square flippedVertically() const
+        {
+            return Square(m_id ^ rankMask);
+        }
+
+        constexpr Square flippedHorizontally() const
+        {
+            return Square(m_id ^ fileMask);
+        }
+
+        [[nodiscard]] constexpr bool isOk() const
+        {
+            return m_id >= 0 && m_id < m_noneId;
+        }
+
+    private:
+        std::int8_t m_id;
+    };
+
+    constexpr Square a1(fileA, rank1);
+    constexpr Square a2(fileA, rank2);
+    constexpr Square a3(fileA, rank3);
+    constexpr Square a4(fileA, rank4);
+    constexpr Square a5(fileA, rank5);
+    constexpr Square a6(fileA, rank6);
+    constexpr Square a7(fileA, rank7);
+    constexpr Square a8(fileA, rank8);
+
+    constexpr Square b1(fileB, rank1);
+    constexpr Square b2(fileB, rank2);
+    constexpr Square b3(fileB, rank3);
+    constexpr Square b4(fileB, rank4);
+    constexpr Square b5(fileB, rank5);
+    constexpr Square b6(fileB, rank6);
+    constexpr Square b7(fileB, rank7);
+    constexpr Square b8(fileB, rank8);
+
+    constexpr Square c1(fileC, rank1);
+    constexpr Square c2(fileC, rank2);
+    constexpr Square c3(fileC, rank3);
+    constexpr Square c4(fileC, rank4);
+    constexpr Square c5(fileC, rank5);
+    constexpr Square c6(fileC, rank6);
+    constexpr Square c7(fileC, rank7);
+    constexpr Square c8(fileC, rank8);
+
+    constexpr Square d1(fileD, rank1);
+    constexpr Square d2(fileD, rank2);
+    constexpr Square d3(fileD, rank3);
+    constexpr Square d4(fileD, rank4);
+    constexpr Square d5(fileD, rank5);
+    constexpr Square d6(fileD, rank6);
+    constexpr Square d7(fileD, rank7);
+    constexpr Square d8(fileD, rank8);
+
+    constexpr Square e1(fileE, rank1);
+    constexpr Square e2(fileE, rank2);
+    constexpr Square e3(fileE, rank3);
+    constexpr Square e4(fileE, rank4);
+    constexpr Square e5(fileE, rank5);
+    constexpr Square e6(fileE, rank6);
+    constexpr Square e7(fileE, rank7);
+    constexpr Square e8(fileE, rank8);
+
+    constexpr Square f1(fileF, rank1);
+    constexpr Square f2(fileF, rank2);
+    constexpr Square f3(fileF, rank3);
+    constexpr Square f4(fileF, rank4);
+    constexpr Square f5(fileF, rank5);
+    constexpr Square f6(fileF, rank6);
+    constexpr Square f7(fileF, rank7);
+    constexpr Square f8(fileF, rank8);
+
+    constexpr Square g1(fileG, rank1);
+    constexpr Square g2(fileG, rank2);
+    constexpr Square g3(fileG, rank3);
+    constexpr Square g4(fileG, rank4);
+    constexpr Square g5(fileG, rank5);
+    constexpr Square g6(fileG, rank6);
+    constexpr Square g7(fileG, rank7);
+    constexpr Square g8(fileG, rank8);
+
+    constexpr Square h1(fileH, rank1);
+    constexpr Square h2(fileH, rank2);
+    constexpr Square h3(fileH, rank3);
+    constexpr Square h4(fileH, rank4);
+    constexpr Square h5(fileH, rank5);
+    constexpr Square h6(fileH, rank6);
+    constexpr Square h7(fileH, rank7);
+    constexpr Square h8(fileH, rank8);
+
+    static_assert(e1.color() == Color::Black);
+    static_assert(e8.color() == Color::White);
+
+    static_assert(e1.file() == fileE);
+    static_assert(e1.rank() == rank1);
+
+    static_assert(e1.flippedHorizontally() == d1);
+    static_assert(e1.flippedVertically() == e8);
+
+    template <>
+    struct EnumTraits<Square>
+    {
+        using IdType = int;
+        using EnumType = Square;
+
+        static constexpr int cardinality = chess::cardinality<Rank>() * chess::cardinality<File>();
+        static constexpr bool isNaturalIndex = true;
+
+        static constexpr std::array<EnumType, cardinality> values{
+            a1, b1, c1, d1, e1, f1, g1, h1,
+            a2, b2, c2, d2, e2, f2, g2, h2,
+            a3, b3, c3, d3, e3, f3, g3, h3,
+            a4, b4, c4, d4, e4, f4, g4, h4,
+            a5, b5, c5, d5, e5, f5, g5, h5,
+            a6, b6, c6, d6, e6, f6, g6, h6,
+            a7, b7, c7, d7, e7, f7, g7, h7,
+            a8, b8, c8, d8, e8, f8, g8, h8
+        };
+
+        [[nodiscard]] static constexpr int ordinal(EnumType c) noexcept
+        {
+            return static_cast<IdType>(c);
+        }
+
+        [[nodiscard]] static constexpr EnumType fromOrdinal(IdType id) noexcept
+        {
+            assert(id >= 0 && id < cardinality + 1);
+
+            return static_cast<EnumType>(id);
+        }
+
+        [[nodiscard]] static constexpr std::string_view toString(Square sq)
+        {
+            assert(sq.isOk());
+
+            return
+                std::string_view(
+                    "a1b1c1d1e1f1g1h1"
+                    "a2b2c2d2e2f2g2h2"
+                    "a3b3c3d3e3f3g3h3"
+                    "a4b4c4d4e4f4g4h4"
+                    "a5b5c5d5e5f5g5h5"
+                    "a6b6c6d6e6f6g6h6"
+                    "a7b7c7d7e7f7g7h7"
+                    "a8b8c8d8e8f8g8h8"
+                    + (ordinal(sq) * 2),
+                    2
+                );
+        }
+
+        [[nodiscard]] static constexpr std::optional<Square> fromString(std::string_view sv) noexcept
+        {
+            if (sv.size() != 2) return {};
+
+            const char f = sv[0];
+            const char r = sv[1];
+            if (f < 'a' || f > 'h') return {};
+            if (r < '1' || r > '8') return {};
+
+            return Square(static_cast<File>(f - 'a'), static_cast<Rank>(r - '1'));
+        }
+    };
+
+    static_assert(toString(d1) == std::string_view("d1"));
+    static_assert(values<Square>()[29] == f4);
+
+    enum struct MoveType : std::uint8_t
+    {
+        Normal,
+        Promotion,
+        Castle,
+        EnPassant
+    };
+
+    template <>
+    struct EnumTraits<MoveType>
+    {
+        using IdType = int;
+        using EnumType = MoveType;
+
+        static constexpr int cardinality = 4;
+        static constexpr bool isNaturalIndex = true;
+
+        static constexpr std::array<EnumType, cardinality> values{
+            MoveType::Normal,
+            MoveType::Promotion,
+            MoveType::Castle,
+            MoveType::EnPassant
+        };
+
+        [[nodiscard]] static constexpr int ordinal(EnumType c) noexcept
+        {
+            return static_cast<IdType>(c);
+        }
+
+        [[nodiscard]] static constexpr EnumType fromOrdinal(IdType id) noexcept
+        {
+            assert(id >= 0 && id < cardinality);
+
+            return static_cast<EnumType>(id);
+        }
+    };
+
+    enum struct CastleType : std::uint8_t
+    {
+        Short,
+        Long
+    };
+
+    [[nodiscard]] constexpr CastleType operator!(CastleType ct)
+    {
+        return static_cast<CastleType>(static_cast<std::uint8_t>(ct) ^ 1);
+    }
+
+    template <>
+    struct EnumTraits<CastleType>
+    {
+        using IdType = int;
+        using EnumType = CastleType;
+
+        static constexpr int cardinality = 2;
+        static constexpr bool isNaturalIndex = true;
+
+        static constexpr std::array<EnumType, cardinality> values{
+            CastleType::Short,
+            CastleType::Long
+        };
+
+        [[nodiscard]] static constexpr int ordinal(EnumType c) noexcept
+        {
+            return static_cast<IdType>(c);
+        }
+
+        [[nodiscard]] static constexpr EnumType fromOrdinal(IdType id) noexcept
+        {
+            assert(id >= 0 && id < cardinality);
+
+            return static_cast<EnumType>(id);
+        }
+    };
+
+    struct CompressedMove;
+
+    // castling is encoded as a king capturing rook
+    // ep is encoded as a normal pawn capture (move.to is empty on the board)
+    struct Move
+    {
+        Square from;
+        Square to;
+        MoveType type = MoveType::Normal;
+        Piece promotedPiece = Piece::none();
+
+        [[nodiscard]] constexpr friend bool operator==(const Move& lhs, const Move& rhs) noexcept
+        {
+            return lhs.from == rhs.from
+                && lhs.to == rhs.to
+                && lhs.type == rhs.type
+                && lhs.promotedPiece == rhs.promotedPiece;
+        }
+
+        [[nodiscard]] constexpr friend bool operator!=(const Move& lhs, const Move& rhs) noexcept
+        {
+            return !(lhs == rhs);
+        }
+
+        [[nodiscard]] constexpr CompressedMove compress() const noexcept;
+
+        [[nodiscard]] constexpr static Move null()
+        {
+            return Move{ Square::none(), Square::none() };
+        }
+
+        [[nodiscard]] constexpr static Move castle(CastleType ct, Color c);
+
+        [[nodiscard]] constexpr static Move normal(Square from, Square to)
+        {
+            return Move{ from, to, MoveType::Normal, Piece::none() };
+        }
+
+        [[nodiscard]] constexpr static Move enPassant(Square from, Square to)
+        {
+            return Move{ from, to, MoveType::EnPassant, Piece::none() };
+        }
+
+        [[nodiscard]] constexpr static Move promotion(Square from, Square to, Piece piece)
+        {
+            return Move{ from, to, MoveType::Promotion, piece };
+        }
+    };
+
+    namespace detail::castle
+    {
+        constexpr EnumArray2<CastleType, Color, Move> moves = { {
+            {{ { e1, h1, MoveType::Castle }, { e8, h8, MoveType::Castle } }},
+            {{ { e1, a1, MoveType::Castle }, { e8, a8, MoveType::Castle } }}
+        } };
+    }
+
+    [[nodiscard]] constexpr Move Move::castle(CastleType ct, Color c)
+    {
+        return detail::castle::moves[ct][c];
+    }
+
+    static_assert(sizeof(Move) == 4);
+
+    struct CompressedMove
+    {
+    private:
+        // from most significant bits
+        // 2 bits for move type
+        // 6 bits for from square
+        // 6 bits for to square
+        // 2 bits for promoted piece type
+        //    0 if not a promotion
+        static constexpr std::uint16_t squareMask = 0b111111u;
+        static constexpr std::uint16_t promotedPieceTypeMask = 0b11u;
+        static constexpr std::uint16_t moveTypeMask = 0b11u;
+
+    public:
+        [[nodiscard]] constexpr static CompressedMove readFromBigEndian(const unsigned char* data)
+        {
+            CompressedMove move{};
+            move.m_packed = (data[0] << 8) | data[1];
+            return move;
+        }
+
+        constexpr CompressedMove() noexcept :
+            m_packed(0)
+        {
+        }
+
+        // move must be either valid or a null move
+        constexpr CompressedMove(Move move) noexcept :
+            m_packed(0)
+        {
+            // else null move
+            if (move.from != move.to)
+            {
+                assert(move.from != Square::none());
+                assert(move.to != Square::none());
+
+                m_packed =
+                    (static_cast<std::uint16_t>(ordinal(move.type)) << (16 - 2))
+                    | (static_cast<std::uint16_t>(ordinal(move.from)) << (16 - 2 - 6))
+                    | (static_cast<std::uint16_t>(ordinal(move.to)) << (16 - 2 - 6 - 6));
+
+                if (move.type == MoveType::Promotion)
+                {
+                    assert(move.promotedPiece != Piece::none());
+
+                    m_packed |= ordinal(move.promotedPiece.type()) - ordinal(PieceType::Knight);
+                }
+                else
+                {
+                    assert(move.promotedPiece == Piece::none());
+                }
+            }
+        }
+
+        void writeToBigEndian(unsigned char* data) const
+        {
+            *data++ = m_packed >> 8;
+            *data++ = m_packed & 0xFF;
+        }
+
+        [[nodiscard]] constexpr std::uint16_t packed() const
+        {
+            return m_packed;
+        }
+
+        [[nodiscard]] constexpr MoveType type() const
+        {
+            return fromOrdinal<MoveType>(m_packed >> (16 - 2));
+        }
+
+        [[nodiscard]] constexpr Square from() const
+        {
+            return fromOrdinal<Square>((m_packed >> (16 - 2 - 6)) & squareMask);
+        }
+
+        [[nodiscard]] constexpr Square to() const
+        {
+            return fromOrdinal<Square>((m_packed >> (16 - 2 - 6 - 6)) & squareMask);
+        }
+
+        [[nodiscard]] constexpr Piece promotedPiece() const
+        {
+            if (type() == MoveType::Promotion)
+            {
+                const Color color =
+                    (to().rank() == rank1)
+                    ? Color::Black
+                    : Color::White;
+
+                const PieceType pt = fromOrdinal<PieceType>((m_packed & promotedPieceTypeMask) + ordinal(PieceType::Knight));
+                return color | pt;
+            }
+            else
+            {
+                return Piece::none();
+            }
+        }
+
+        [[nodiscard]] constexpr Move decompress() const noexcept
+        {
+            if (m_packed == 0)
+            {
+                return Move::null();
+            }
+            else
+            {
+                const MoveType type = fromOrdinal<MoveType>(m_packed >> (16 - 2));
+                const Square from = fromOrdinal<Square>((m_packed >> (16 - 2 - 6)) & squareMask);
+                const Square to = fromOrdinal<Square>((m_packed >> (16 - 2 - 6 - 6)) & squareMask);
+                const Piece promotedPiece = [&]() {
+                    if (type == MoveType::Promotion)
+                    {
+                        const Color color =
+                            (to.rank() == rank1)
+                            ? Color::Black
+                            : Color::White;
+
+                        const PieceType pt = fromOrdinal<PieceType>((m_packed & promotedPieceTypeMask) + ordinal(PieceType::Knight));
+                        return color | pt;
+                    }
+                    else
+                    {
+                        return Piece::none();
+                    }
+                }();
+
+                return Move{ from, to, type, promotedPiece };
+            }
+        }
+
+    private:
+        std::uint16_t m_packed;
+    };
+
+    static_assert(sizeof(CompressedMove) == 2);
+
+    [[nodiscard]] constexpr CompressedMove Move::compress() const noexcept
+    {
+        return CompressedMove(*this);
+    }
+
+    static_assert(a4 + Offset{ 0, 1 } == a5);
+    static_assert(a4 + Offset{ 0, 2 } == a6);
+    static_assert(a4 + Offset{ 0, -2 } == a2);
+    static_assert(a4 + Offset{ 0, -1 } == a3);
+
+    static_assert(e4 + Offset{ 1, 0 } == f4);
+    static_assert(e4 + Offset{ 2, 0 } == g4);
+    static_assert(e4 + Offset{ -1, 0 } == d4);
+    static_assert(e4 + Offset{ -2, 0 } == c4);
+
+    enum struct CastlingRights : std::uint8_t
+    {
+        None = 0x0,
+        WhiteKingSide = 0x1,
+        WhiteQueenSide = 0x2,
+        BlackKingSide = 0x4,
+        BlackQueenSide = 0x8,
+        White = WhiteKingSide | WhiteQueenSide,
+        Black = BlackKingSide | BlackQueenSide,
+        All = WhiteKingSide | WhiteQueenSide | BlackKingSide | BlackQueenSide
+    };
+
+    [[nodiscard]] constexpr CastlingRights operator|(CastlingRights lhs, CastlingRights rhs)
+    {
+        return static_cast<CastlingRights>(static_cast<std::uint8_t>(lhs) | static_cast<std::uint8_t>(rhs));
+    }
+
+    [[nodiscard]] constexpr CastlingRights operator&(CastlingRights lhs, CastlingRights rhs)
+    {
+        return static_cast<CastlingRights>(static_cast<std::uint8_t>(lhs) & static_cast<std::uint8_t>(rhs));
+    }
+
+    [[nodiscard]] constexpr CastlingRights operator~(CastlingRights lhs)
+    {
+        return static_cast<CastlingRights>(~static_cast<std::uint8_t>(lhs) & static_cast<std::uint8_t>(CastlingRights::All));
+    }
+
+    constexpr CastlingRights& operator|=(CastlingRights& lhs, CastlingRights rhs)
+    {
+        lhs = static_cast<CastlingRights>(static_cast<std::uint8_t>(lhs) | static_cast<std::uint8_t>(rhs));
+        return lhs;
+    }
+
+    constexpr CastlingRights& operator&=(CastlingRights& lhs, CastlingRights rhs)
+    {
+        lhs = static_cast<CastlingRights>(static_cast<std::uint8_t>(lhs) & static_cast<std::uint8_t>(rhs));
+        return lhs;
+    }
+    // checks whether lhs contains rhs
+    [[nodiscard]] constexpr bool contains(CastlingRights lhs, CastlingRights rhs)
+    {
+        return (lhs & rhs) == rhs;
+    }
+
+    template <>
+    struct EnumTraits<CastlingRights>
+    {
+        using IdType = int;
+        using EnumType = CastlingRights;
+
+        static constexpr int cardinality = 4;
+        static constexpr bool isNaturalIndex = false;
+
+        static constexpr std::array<EnumType, cardinality> values{
+            CastlingRights::WhiteKingSide,
+            CastlingRights::WhiteQueenSide,
+            CastlingRights::BlackKingSide,
+            CastlingRights::BlackQueenSide
+        };
+
+        [[nodiscard]] static constexpr int ordinal(EnumType c) noexcept
+        {
+            return static_cast<IdType>(c);
+        }
+
+        [[nodiscard]] static constexpr EnumType fromOrdinal(IdType id) noexcept
+        {
+            return static_cast<EnumType>(id);
+        }
+    };
+
+    struct CompressedReverseMove;
+
+    struct ReverseMove
+    {
+        Move move;
+        Piece capturedPiece;
+        Square oldEpSquare;
+        CastlingRights oldCastlingRights;
+
+        // We need a well defined case for the starting position.
+        constexpr ReverseMove() :
+            move(Move::null()),
+            capturedPiece(Piece::none()),
+            oldEpSquare(Square::none()),
+            oldCastlingRights(CastlingRights::All)
+        {
+        }
+
+        constexpr ReverseMove(const Move& move, Piece capturedPiece, Square oldEpSquare, CastlingRights oldCastlingRights) :
+            move(move),
+            capturedPiece(capturedPiece),
+            oldEpSquare(oldEpSquare),
+            oldCastlingRights(oldCastlingRights)
+        {
+        }
+
+        constexpr bool isNull() const
+        {
+            return move.from == move.to;
+        }
+
+        [[nodiscard]] constexpr CompressedReverseMove compress() const noexcept;
+
+        [[nodiscard]] constexpr friend bool operator==(const ReverseMove& lhs, const ReverseMove& rhs) noexcept
+        {
+            return lhs.move == rhs.move
+                && lhs.capturedPiece == rhs.capturedPiece
+                && lhs.oldEpSquare == rhs.oldEpSquare
+                && lhs.oldCastlingRights == rhs.oldCastlingRights;
+        }
+
+        [[nodiscard]] constexpr friend bool operator!=(const ReverseMove& lhs, const ReverseMove& rhs) noexcept
+        {
+            return !(lhs == rhs);
+        }
+    };
+
+    static_assert(sizeof(ReverseMove) == 7);
+
+    struct CompressedReverseMove
+    {
+    private:
+        // we use 7 bits because it can be Square::none()
+        static constexpr std::uint32_t squareMask = 0b1111111u;
+        static constexpr std::uint32_t pieceMask = 0b1111u;
+        static constexpr std::uint32_t castlingRightsMask = 0b1111;
+    public:
+
+        constexpr CompressedReverseMove() noexcept :
+            m_move{},
+            m_oldState{}
+        {
+        }
+
+        constexpr CompressedReverseMove(const ReverseMove& rm) noexcept :
+            m_move(rm.move.compress()),
+            m_oldState{ static_cast<uint16_t>(
+                ((ordinal(rm.capturedPiece) & pieceMask) << 11)
+                | ((ordinal(rm.oldCastlingRights) & castlingRightsMask) << 7)
+                | (ordinal(rm.oldEpSquare) & squareMask)
+                )
+            }
+        {
+        }
+
+        [[nodiscard]] constexpr Move move() const
+        {
+            return m_move.decompress();
+        }
+
+        [[nodiscard]] const CompressedMove& compressedMove() const
+        {
+            return m_move;
+        }
+
+        [[nodiscard]] constexpr Piece capturedPiece() const
+        {
+            return fromOrdinal<Piece>(m_oldState >> 11);
+        }
+
+        [[nodiscard]] constexpr CastlingRights oldCastlingRights() const
+        {
+            return fromOrdinal<CastlingRights>((m_oldState >> 7) & castlingRightsMask);
+        }
+
+        [[nodiscard]] constexpr Square oldEpSquare() const
+        {
+            return fromOrdinal<Square>(m_oldState & squareMask);
+        }
+
+        [[nodiscard]] constexpr ReverseMove decompress() const noexcept
+        {
+            const Piece capturedPiece = fromOrdinal<Piece>(m_oldState >> 11);
+            const CastlingRights castlingRights = fromOrdinal<CastlingRights>((m_oldState >> 7) & castlingRightsMask);
+            // We could pack the ep square more, but don't have to, because
+            // can't save another byte anyway.
+            const Square epSquare = fromOrdinal<Square>(m_oldState & squareMask);
+
+            return ReverseMove(m_move.decompress(), capturedPiece, epSquare, castlingRights);
+        }
+
+    private:
+        CompressedMove m_move;
+        std::uint16_t m_oldState;
+    };
+
+    static_assert(sizeof(CompressedReverseMove) == 4);
+
+    [[nodiscard]] constexpr CompressedReverseMove ReverseMove::compress() const noexcept
+    {
+        return CompressedReverseMove(*this);
+    }
+
+    // This can be regarded as a perfect hash. Going back is hard.
+    struct PackedReverseMove
+    {
+        static constexpr std::uint32_t mask = 0x7FFFFFFu;
+        static constexpr std::size_t numBits = 27;
+
+    private:
+        static constexpr std::uint32_t squareMask = 0b111111u;
+        static constexpr std::uint32_t pieceMask = 0b1111u;
+        static constexpr std::uint32_t pieceTypeMask = 0b111u;
+        static constexpr std::uint32_t castlingRightsMask = 0b1111;
+        static constexpr std::uint32_t fileMask = 0b111;
+
+    public:
+        constexpr PackedReverseMove(const std::uint32_t packed) :
+            m_packed(packed)
+        {
+
+        }
+
+        constexpr PackedReverseMove(const ReverseMove& reverseMove) :
+            m_packed(
+                0u
+                // The only move when square is none() is null move and
+                // then both squares are none(). No other move is like that
+                // so we don't lose any information by storing only
+                // the 6 bits of each square.
+                | ((ordinal(reverseMove.move.from) & squareMask) << 21)
+                | ((ordinal(reverseMove.move.to) & squareMask) << 15)
+                // Other masks are just for code clarity, they should
+                // never change the values.
+                | ((ordinal(reverseMove.capturedPiece) & pieceMask) << 11)
+                | ((ordinal(reverseMove.oldCastlingRights) & castlingRightsMask) << 7)
+                | ((ordinal(reverseMove.move.promotedPiece.type()) & pieceTypeMask) << 4)
+                | (((reverseMove.oldEpSquare != Square::none()) & 1) << 3)
+                // We probably could omit the squareMask here but for clarity it's left.
+                | (ordinal(Square(ordinal(reverseMove.oldEpSquare) & squareMask).file()) & fileMask)
+            )
+        {
+        }
+
+        constexpr std::uint32_t packed() const
+        {
+            return m_packed;
+        }
+
+        constexpr ReverseMove unpack(Color sideThatMoved) const
+        {
+            ReverseMove rmove{};
+
+            rmove.move.from = fromOrdinal<Square>((m_packed >> 21) & squareMask);
+            rmove.move.to = fromOrdinal<Square>((m_packed >> 15) & squareMask);
+            rmove.capturedPiece = fromOrdinal<Piece>((m_packed >> 11) & pieceMask);
+            rmove.oldCastlingRights = fromOrdinal<CastlingRights>((m_packed >> 7) & castlingRightsMask);
+            const PieceType promotedPieceType = fromOrdinal<PieceType>((m_packed >> 4) & pieceTypeMask);
+            if (promotedPieceType != PieceType::None)
+            {
+                rmove.move.promotedPiece = Piece(promotedPieceType, sideThatMoved);
+                rmove.move.type = MoveType::Promotion;
+            }
+            const bool hasEpSquare = static_cast<bool>((m_packed >> 3) & 1);
+            if (hasEpSquare)
+            {
+                // ep square is always where the opponent moved
+                const Rank rank =
+                    sideThatMoved == Color::White
+                    ? rank6
+                    : rank3;
+                const File file = fromOrdinal<File>(m_packed & fileMask);
+                rmove.oldEpSquare = Square(file, rank);
+                if (rmove.oldEpSquare == rmove.move.to)
+                {
+                    rmove.move.type = MoveType::EnPassant;
+                }
+            }
+            else
+            {
+                rmove.oldEpSquare = Square::none();
+            }
+
+            if (rmove.move.type == MoveType::Normal && rmove.oldCastlingRights != CastlingRights::None)
+            {
+                // If castling was possible then we know it was the king that moved from e1/e8.
+                if (rmove.move.from == e1)
+                {
+                    if (rmove.move.to == h1 || rmove.move.to == a1)
+                    {
+                        rmove.move.type = MoveType::Castle;
+                    }
+                }
+                else if (rmove.move.from == e8)
+                {
+                    if (rmove.move.to == h8 || rmove.move.to == a8)
+                    {
+                        rmove.move.type = MoveType::Castle;
+                    }
+                }
+            }
+
+            return rmove;
+        }
+
+    private:
+        // Uses only 27 lowest bits.
+        // Bit meaning from highest to lowest.
+        // - 6 bits from
+        // - 6 bits to
+        // - 4 bits for the captured piece
+        // - 4 bits for prev castling rights
+        // - 3 bits promoted piece type
+        // - 1 bit  to specify if the ep square was valid (false if none())
+        // - 3 bits for prev ep square file
+        std::uint32_t m_packed;
+    };
+
+    struct MoveCompareLess
+    {
+        [[nodiscard]] bool operator()(const Move& lhs, const Move& rhs) const noexcept
+        {
+            if (ordinal(lhs.from) < ordinal(rhs.from)) return true;
+            if (ordinal(lhs.from) > ordinal(rhs.from)) return false;
+
+            if (ordinal(lhs.to) < ordinal(rhs.to)) return true;
+            if (ordinal(lhs.to) > ordinal(rhs.to)) return false;
+
+            if (ordinal(lhs.type) < ordinal(rhs.type)) return true;
+            if (ordinal(lhs.type) > ordinal(rhs.type)) return false;
+
+            if (ordinal(lhs.promotedPiece) < ordinal(rhs.promotedPiece)) return true;
+
+            return false;
+        }
+    };
+
+    struct ReverseMoveCompareLess
+    {
+        [[nodiscard]] bool operator()(const ReverseMove& lhs, const ReverseMove& rhs) const noexcept
+        {
+            if (MoveCompareLess{}(lhs.move, rhs.move)) return true;
+            if (MoveCompareLess{}(rhs.move, lhs.move)) return false;
+
+            if (ordinal(lhs.capturedPiece) < ordinal(rhs.capturedPiece)) return true;
+            if (ordinal(lhs.capturedPiece) > ordinal(rhs.capturedPiece)) return false;
+
+            if (static_cast<unsigned>(lhs.oldCastlingRights) < static_cast<unsigned>(rhs.oldCastlingRights)) return true;
+            if (static_cast<unsigned>(lhs.oldCastlingRights) > static_cast<unsigned>(rhs.oldCastlingRights)) return false;
+
+            if (ordinal(lhs.oldEpSquare) < ordinal(rhs.oldEpSquare)) return true;
+            if (ordinal(lhs.oldEpSquare) > ordinal(rhs.oldEpSquare)) return false;
+
+            return false;
+        }
+    };
+
+    struct BitboardIterator
+    {
+        using value_type = Square;
+        using difference_type = std::ptrdiff_t;
+        using reference = Square;
+        using iterator_category = std::input_iterator_tag;
+        using pointer = const Square*;
+
+        constexpr BitboardIterator() noexcept :
+            m_squares(0)
+        {
+        }
+
+        constexpr BitboardIterator(std::uint64_t v) noexcept :
+            m_squares(v)
+        {
+        }
+
+        constexpr BitboardIterator(const BitboardIterator&) = default;
+        constexpr BitboardIterator(BitboardIterator&&) = default;
+        constexpr BitboardIterator& operator=(const BitboardIterator&) = default;
+        constexpr BitboardIterator& operator=(BitboardIterator&&) = default;
+
+        [[nodiscard]] constexpr bool friend operator==(BitboardIterator lhs, BitboardIterator rhs) noexcept
+        {
+            return lhs.m_squares == rhs.m_squares;
+        }
+
+        [[nodiscard]] constexpr bool friend operator!=(BitboardIterator lhs, BitboardIterator rhs) noexcept
+        {
+            return lhs.m_squares != rhs.m_squares;
+        }
+
+        [[nodiscard]] inline Square operator*() const
+        {
+            return first();
+        }
+
+        constexpr BitboardIterator& operator++() noexcept
+        {
+            popFirst();
+            return *this;
+        }
+
+    private:
+        std::uint64_t m_squares;
+
+        constexpr void popFirst() noexcept
+        {
+            m_squares &= m_squares - 1;
+        }
+
+        [[nodiscard]] inline Square first() const
+        {
+            assert(m_squares != 0);
+
+            return fromOrdinal<Square>(intrin::lsb(m_squares));
+        }
+    };
+
+    struct Bitboard
+    {
+        // bits counted from the LSB
+        // order is A1 B2 ... G8 H8
+        // just like in Square
+
+    public:
+        constexpr Bitboard() noexcept :
+            m_squares(0)
+        {
+        }
+
+    private:
+        constexpr explicit Bitboard(Square sq) noexcept :
+            m_squares(static_cast<std::uint64_t>(1ULL) << ordinal(sq))
+        {
+            assert(sq.isOk());
+        }
+
+        constexpr explicit Bitboard(Rank r) noexcept :
+            m_squares(static_cast<std::uint64_t>(0xFFULL) << (ordinal(r) * 8))
+        {
+        }
+
+        constexpr explicit Bitboard(File f) noexcept :
+            m_squares(static_cast<std::uint64_t>(0x0101010101010101ULL) << ordinal(f))
+        {
+        }
+
+        constexpr explicit Bitboard(Color c) noexcept :
+            m_squares(c == Color::White ? 0xAA55AA55AA55AA55ULL : ~0xAA55AA55AA55AA55ULL)
+        {
+        }
+
+        constexpr explicit Bitboard(std::uint64_t bb) noexcept :
+            m_squares(bb)
+        {
+        }
+
+        // files A..file inclusive
+        static constexpr EnumArray<File, std::uint64_t> m_filesUpToBB{
+            0x0101010101010101ULL,
+            0x0303030303030303ULL,
+            0x0707070707070707ULL,
+            0x0F0F0F0F0F0F0F0FULL,
+            0x1F1F1F1F1F1F1F1FULL,
+            0x3F3F3F3F3F3F3F3FULL,
+            0x7F7F7F7F7F7F7F7FULL,
+            0xFFFFFFFFFFFFFFFFULL
+        };
+
+    public:
+
+        [[nodiscard]] static constexpr Bitboard none()
+        {
+            return Bitboard{};
+        }
+
+        [[nodiscard]] static constexpr Bitboard all()
+        {
+            return ~none();
+        }
+
+        [[nodiscard]] static constexpr Bitboard square(Square sq)
+        {
+            return Bitboard(sq);
+        }
+
+        [[nodiscard]] static constexpr Bitboard file(File f)
+        {
+            return Bitboard(f);
+        }
+
+        [[nodiscard]] static constexpr Bitboard rank(Rank r)
+        {
+            return Bitboard(r);
+        }
+
+        [[nodiscard]] static constexpr Bitboard color(Color c)
+        {
+            return Bitboard(c);
+        }
+
+        [[nodiscard]] static constexpr Bitboard fromBits(std::uint64_t bits)
+        {
+            return Bitboard(bits);
+        }
+
+        // inclusive
+        [[nodiscard]] static constexpr Bitboard betweenFiles(File left, File right)
+        {
+            assert(left <= right);
+
+            if (left == fileA)
+            {
+                return Bitboard::fromBits(m_filesUpToBB[right]);
+            }
+            else
+            {
+                return Bitboard::fromBits(m_filesUpToBB[right] ^ m_filesUpToBB[left - 1]);
+            }
+        }
+
+        [[nodiscard]] constexpr bool isEmpty() const
+        {
+            return m_squares == 0;
+        }
+
+        [[nodiscard]] constexpr bool isSet(Square sq) const
+        {
+            return !!((m_squares >> ordinal(sq)) & 1ull);
+        }
+
+        constexpr void set(Square sq)
+        {
+            *this |= Bitboard(sq);
+        }
+
+        constexpr void unset(Square sq)
+        {
+            *this &= ~(Bitboard(sq));
+        }
+
+        constexpr void toggle(Square sq)
+        {
+            *this ^= Bitboard(sq);
+        }
+
+        [[nodiscard]] constexpr BitboardIterator begin() const
+        {
+            return BitboardIterator(m_squares);
+        }
+
+        [[nodiscard]] constexpr BitboardIterator end() const
+        {
+            return BitboardIterator{};
+        }
+
+        [[nodiscard]] constexpr BitboardIterator cbegin() const
+        {
+            return BitboardIterator(m_squares);
+        }
+
+        [[nodiscard]] constexpr BitboardIterator cend() const
+        {
+            return BitboardIterator{};
+        }
+
+        [[nodiscard]] constexpr bool friend operator==(Bitboard lhs, Bitboard rhs) noexcept
+        {
+            return lhs.m_squares == rhs.m_squares;
+        }
+
+        [[nodiscard]] constexpr bool friend operator!=(Bitboard lhs, Bitboard rhs) noexcept
+        {
+            return lhs.m_squares != rhs.m_squares;
+        }
+
+        constexpr Bitboard shiftedVertically(int ranks) const
+        {
+            if (ranks >= 0)
+            {
+                return fromBits(m_squares << 8 * ranks);
+            }
+            else
+            {
+                return fromBits(m_squares >> -8 * ranks);
+            }
+        }
+
+        template <int files, int ranks>
+        constexpr void shift()
+        {
+            static_assert(files >= -7);
+            static_assert(ranks >= -7);
+            static_assert(files <= 7);
+            static_assert(ranks <= 7);
+
+            if constexpr (files != 0)
+            {
+                constexpr Bitboard mask =
+                    files > 0
+                    ? Bitboard::betweenFiles(fileA, fileH - files)
+                    : Bitboard::betweenFiles(fileA - files, fileH);
+
+                m_squares &= mask.m_squares;
+            }
+
+            constexpr int shift = files + ranks * 8;
+            if constexpr (shift == 0)
+            {
+                return;
+            }
+
+            if constexpr (shift < 0)
+            {
+                m_squares >>= -shift;
+            }
+            else
+            {
+                m_squares <<= shift;
+            }
+        }
+
+        template <int files, int ranks>
+        constexpr Bitboard shifted() const
+        {
+            Bitboard bbCpy(*this);
+            bbCpy.shift<files, ranks>();
+            return bbCpy;
+        }
+
+        constexpr void shift(Offset offset)
+        {
+            assert(offset.files >= -7);
+            assert(offset.ranks >= -7);
+            assert(offset.files <= 7);
+            assert(offset.ranks <= 7);
+
+            if (offset.files != 0)
+            {
+                const Bitboard mask =
+                    offset.files > 0
+                    ? Bitboard::betweenFiles(fileA, fileH - offset.files)
+                    : Bitboard::betweenFiles(fileA - offset.files, fileH);
+
+                m_squares &= mask.m_squares;
+            }
+
+            const int shift = offset.files + offset.ranks * 8;
+            if (shift < 0)
+            {
+                m_squares >>= -shift;
+            }
+            else
+            {
+                m_squares <<= shift;
+            }
+        }
+
+        [[nodiscard]] constexpr Bitboard shifted(Offset offset) const
+        {
+            Bitboard bbCpy(*this);
+            bbCpy.shift(offset);
+            return bbCpy;
+        }
+
+        [[nodiscard]] constexpr Bitboard operator~() const
+        {
+            Bitboard bb = *this;
+            bb.m_squares = ~m_squares;
+            return bb;
+        }
+
+        constexpr Bitboard& operator^=(Color c)
+        {
+            m_squares ^= Bitboard(c).m_squares;
+            return *this;
+        }
+
+        constexpr Bitboard& operator&=(Color c)
+        {
+            m_squares &= Bitboard(c).m_squares;
+            return *this;
+        }
+
+        constexpr Bitboard& operator|=(Color c)
+        {
+            m_squares |= Bitboard(c).m_squares;
+            return *this;
+        }
+
+        [[nodiscard]] constexpr Bitboard operator^(Color c) const
+        {
+            Bitboard bb = *this;
+            bb ^= c;
+            return bb;
+        }
+
+        [[nodiscard]] constexpr Bitboard operator&(Color c) const
+        {
+            Bitboard bb = *this;
+            bb &= c;
+            return bb;
+        }
+
+        [[nodiscard]] constexpr Bitboard operator|(Color c) const
+        {
+            Bitboard bb = *this;
+            bb |= c;
+            return bb;
+        }
+
+        constexpr Bitboard& operator^=(Square sq)
+        {
+            m_squares ^= Bitboard(sq).m_squares;
+            return *this;
+        }
+
+        constexpr Bitboard& operator&=(Square sq)
+        {
+            m_squares &= Bitboard(sq).m_squares;
+            return *this;
+        }
+
+        constexpr Bitboard& operator|=(Square sq)
+        {
+            m_squares |= Bitboard(sq).m_squares;
+            return *this;
+        }
+
+        [[nodiscard]] constexpr Bitboard operator^(Square sq) const
+        {
+            Bitboard bb = *this;
+            bb ^= sq;
+            return bb;
+        }
+
+        [[nodiscard]] constexpr Bitboard operator&(Square sq) const
+        {
+            Bitboard bb = *this;
+            bb &= sq;
+            return bb;
+        }
+
+        [[nodiscard]] constexpr Bitboard operator|(Square sq) const
+        {
+            Bitboard bb = *this;
+            bb |= sq;
+            return bb;
+        }
+
+        [[nodiscard]] constexpr friend Bitboard operator^(Square sq, Bitboard bb)
+        {
+            return bb ^ sq;
+        }
+
+        [[nodiscard]] constexpr friend Bitboard operator&(Square sq, Bitboard bb)
+        {
+            return bb & sq;
+        }
+
+        [[nodiscard]] constexpr friend Bitboard operator|(Square sq, Bitboard bb)
+        {
+            return bb | sq;
+        }
+
+        constexpr Bitboard& operator^=(Bitboard rhs)
+        {
+            m_squares ^= rhs.m_squares;
+            return *this;
+        }
+
+        constexpr Bitboard& operator&=(Bitboard rhs)
+        {
+            m_squares &= rhs.m_squares;
+            return *this;
+        }
+
+        constexpr Bitboard& operator|=(Bitboard rhs)
+        {
+            m_squares |= rhs.m_squares;
+            return *this;
+        }
+
+        [[nodiscard]] constexpr Bitboard operator^(Bitboard sq) const
+        {
+            Bitboard bb = *this;
+            bb ^= sq;
+            return bb;
+        }
+
+        [[nodiscard]] constexpr Bitboard operator&(Bitboard sq) const
+        {
+            Bitboard bb = *this;
+            bb &= sq;
+            return bb;
+        }
+
+        [[nodiscard]] constexpr Bitboard operator|(Bitboard sq) const
+        {
+            Bitboard bb = *this;
+            bb |= sq;
+            return bb;
+        }
+
+        [[nodiscard]] inline int count() const
+        {
+            return static_cast<int>(intrin::popcount(m_squares));
+        }
+
+        [[nodiscard]] constexpr bool moreThanOne() const
+        {
+            return !!(m_squares & (m_squares - 1));
+        }
+
+        [[nodiscard]] constexpr bool exactlyOne() const
+        {
+            return m_squares != 0 && !moreThanOne();
+        }
+
+        [[nodiscard]] constexpr bool any() const
+        {
+            return !!m_squares;
+        }
+
+        [[nodiscard]] inline Square first() const
+        {
+            assert(m_squares != 0);
+
+            return fromOrdinal<Square>(intrin::lsb(m_squares));
+        }
+
+        [[nodiscard]] inline Square nth(int n) const
+        {
+            assert(count() > n);
+
+            Bitboard cpy = *this;
+            while (n--) cpy.popFirst();
+            return cpy.first();
+        }
+
+        [[nodiscard]] inline Square last() const
+        {
+            assert(m_squares != 0);
+
+            return fromOrdinal<Square>(intrin::msb(m_squares));
+        }
+
+        [[nodiscard]] constexpr std::uint64_t bits() const
+        {
+            return m_squares;
+        }
+
+        constexpr void popFirst()
+        {
+            assert(m_squares != 0);
+
+            m_squares &= m_squares - 1;
+        }
+
+        constexpr Bitboard& operator=(const Bitboard& other) = default;
+
+    private:
+        std::uint64_t m_squares;
+    };
+
+    [[nodiscard]] constexpr Bitboard operator^(Square sq0, Square sq1)
+    {
+        return Bitboard::square(sq0) ^ sq1;
+    }
+
+    [[nodiscard]] constexpr Bitboard operator&(Square sq0, Square sq1)
+    {
+        return Bitboard::square(sq0) & sq1;
+    }
+
+    [[nodiscard]] constexpr Bitboard operator|(Square sq0, Square sq1)
+    {
+        return Bitboard::square(sq0) | sq1;
+    }
+
+    [[nodiscard]] constexpr Bitboard operator""_bb(std::uint64_t bits)
+    {
+        return Bitboard::fromBits(bits);
+    }
+
+    namespace bb
+    {
+        namespace fancy_magics
+        {
+            // Implementation based on https://github.com/syzygy1/Cfish
+
+            alignas(64) constexpr EnumArray<Square, std::uint64_t> g_rookMagics{ {
+                0x0A80004000801220ull,
+                0x8040004010002008ull,
+                0x2080200010008008ull,
+                0x1100100008210004ull,
+                0xC200209084020008ull,
+                0x2100010004000208ull,
+                0x0400081000822421ull,
+                0x0200010422048844ull,
+                0x0800800080400024ull,
+                0x0001402000401000ull,
+                0x3000801000802001ull,
+                0x4400800800100083ull,
+                0x0904802402480080ull,
+                0x4040800400020080ull,
+                0x0018808042000100ull,
+                0x4040800080004100ull,
+                0x0040048001458024ull,
+                0x00A0004000205000ull,
+                0x3100808010002000ull,
+                0x4825010010000820ull,
+                0x5004808008000401ull,
+                0x2024818004000A00ull,
+                0x0005808002000100ull,
+                0x2100060004806104ull,
+                0x0080400880008421ull,
+                0x4062220600410280ull,
+                0x010A004A00108022ull,
+                0x0000100080080080ull,
+                0x0021000500080010ull,
+                0x0044000202001008ull,
+                0x0000100400080102ull,
+                0xC020128200040545ull,
+                0x0080002000400040ull,
+                0x0000804000802004ull,
+                0x0000120022004080ull,
+                0x010A386103001001ull,
+                0x9010080080800400ull,
+                0x8440020080800400ull,
+                0x0004228824001001ull,
+                0x000000490A000084ull,
+                0x0080002000504000ull,
+                0x200020005000C000ull,
+                0x0012088020420010ull,
+                0x0010010080080800ull,
+                0x0085001008010004ull,
+                0x0002000204008080ull,
+                0x0040413002040008ull,
+                0x0000304081020004ull,
+                0x0080204000800080ull,
+                0x3008804000290100ull,
+                0x1010100080200080ull,
+                0x2008100208028080ull,
+                0x5000850800910100ull,
+                0x8402019004680200ull,
+                0x0120911028020400ull,
+                0x0000008044010200ull,
+                0x0020850200244012ull,
+                0x0020850200244012ull,
+                0x0000102001040841ull,
+                0x140900040A100021ull,
+                0x000200282410A102ull,
+                0x000200282410A102ull,
+                0x000200282410A102ull,
+                0x4048240043802106ull
+                    } };
+            alignas(64) extern EnumArray<Square, Bitboard> g_rookMasks;
+            alignas(64) extern EnumArray<Square, std::uint8_t> g_rookShifts;
+            alignas(64) extern EnumArray<Square, const Bitboard*> g_rookAttacks;
+
+            alignas(64) constexpr EnumArray<Square, std::uint64_t> g_bishopMagics{ {
+                0x40106000A1160020ull,
+                0x0020010250810120ull,
+                0x2010010220280081ull,
+                0x002806004050C040ull,
+                0x0002021018000000ull,
+                0x2001112010000400ull,
+                0x0881010120218080ull,
+                0x1030820110010500ull,
+                0x0000120222042400ull,
+                0x2000020404040044ull,
+                0x8000480094208000ull,
+                0x0003422A02000001ull,
+                0x000A220210100040ull,
+                0x8004820202226000ull,
+                0x0018234854100800ull,
+                0x0100004042101040ull,
+                0x0004001004082820ull,
+                0x0010000810010048ull,
+                0x1014004208081300ull,
+                0x2080818802044202ull,
+                0x0040880C00A00100ull,
+                0x0080400200522010ull,
+                0x0001000188180B04ull,
+                0x0080249202020204ull,
+                0x1004400004100410ull,
+                0x00013100A0022206ull,
+                0x2148500001040080ull,
+                0x4241080011004300ull,
+                0x4020848004002000ull,
+                0x10101380D1004100ull,
+                0x0008004422020284ull,
+                0x01010A1041008080ull,
+                0x0808080400082121ull,
+                0x0808080400082121ull,
+                0x0091128200100C00ull,
+                0x0202200802010104ull,
+                0x8C0A020200440085ull,
+                0x01A0008080B10040ull,
+                0x0889520080122800ull,
+                0x100902022202010Aull,
+                0x04081A0816002000ull,
+                0x0000681208005000ull,
+                0x8170840041008802ull,
+                0x0A00004200810805ull,
+                0x0830404408210100ull,
+                0x2602208106006102ull,
+                0x1048300680802628ull,
+                0x2602208106006102ull,
+                0x0602010120110040ull,
+                0x0941010801043000ull,
+                0x000040440A210428ull,
+                0x0008240020880021ull,
+                0x0400002012048200ull,
+                0x00AC102001210220ull,
+                0x0220021002009900ull,
+                0x84440C080A013080ull,
+                0x0001008044200440ull,
+                0x0004C04410841000ull,
+                0x2000500104011130ull,
+                0x1A0C010011C20229ull,
+                0x0044800112202200ull,
+                0x0434804908100424ull,
+                0x0300404822C08200ull,
+                0x48081010008A2A80ull
+            } };
+            alignas(64) extern EnumArray<Square, Bitboard> g_bishopMasks;
+            alignas(64) extern EnumArray<Square, std::uint8_t> g_bishopShifts;
+            alignas(64) extern EnumArray<Square, const Bitboard*> g_bishopAttacks;
+
+            inline Bitboard bishopAttacks(Square s, Bitboard occupied)
+            {
+                const std::size_t idx =
+                    (occupied & fancy_magics::g_bishopMasks[s]).bits()
+                    * fancy_magics::g_bishopMagics[s]
+                    >> fancy_magics::g_bishopShifts[s];
+
+                return fancy_magics::g_bishopAttacks[s][idx];
+            }
+
+            inline Bitboard rookAttacks(Square s, Bitboard occupied)
+            {
+                const std::size_t idx =
+                    (occupied & fancy_magics::g_rookMasks[s]).bits()
+                    * fancy_magics::g_rookMagics[s]
+                    >> fancy_magics::g_rookShifts[s];
+
+                return fancy_magics::g_rookAttacks[s][idx];
+            }
+        }
+
+        [[nodiscard]] constexpr Bitboard square(Square sq)
+        {
+            return Bitboard::square(sq);
+        }
+
+        [[nodiscard]] constexpr Bitboard rank(Rank rank)
+        {
+            return Bitboard::rank(rank);
+        }
+
+        [[nodiscard]] constexpr Bitboard file(File file)
+        {
+            return Bitboard::file(file);
+        }
+
+        [[nodiscard]] constexpr Bitboard color(Color c)
+        {
+            return Bitboard::color(c);
+        }
+
+        [[nodiscard]] constexpr Bitboard before(Square sq)
+        {
+            return Bitboard::fromBits(nbitmask<std::uint64_t>[ordinal(sq)]);
+        }
+
+        constexpr Bitboard lightSquares = bb::color(Color::White);
+        constexpr Bitboard darkSquares = bb::color(Color::Black);
+
+        constexpr Bitboard fileA = bb::file(chess::fileA);
+        constexpr Bitboard fileB = bb::file(chess::fileB);
+        constexpr Bitboard fileC = bb::file(chess::fileC);
+        constexpr Bitboard fileD = bb::file(chess::fileD);
+        constexpr Bitboard fileE = bb::file(chess::fileE);
+        constexpr Bitboard fileF = bb::file(chess::fileF);
+        constexpr Bitboard fileG = bb::file(chess::fileG);
+        constexpr Bitboard fileH = bb::file(chess::fileH);
+
+        constexpr Bitboard rank1 = bb::rank(chess::rank1);
+        constexpr Bitboard rank2 = bb::rank(chess::rank2);
+        constexpr Bitboard rank3 = bb::rank(chess::rank3);
+        constexpr Bitboard rank4 = bb::rank(chess::rank4);
+        constexpr Bitboard rank5 = bb::rank(chess::rank5);
+        constexpr Bitboard rank6 = bb::rank(chess::rank6);
+        constexpr Bitboard rank7 = bb::rank(chess::rank7);
+        constexpr Bitboard rank8 = bb::rank(chess::rank8);
+
+        constexpr Bitboard a1 = bb::square(chess::a1);
+        constexpr Bitboard a2 = bb::square(chess::a2);
+        constexpr Bitboard a3 = bb::square(chess::a3);
+        constexpr Bitboard a4 = bb::square(chess::a4);
+        constexpr Bitboard a5 = bb::square(chess::a5);
+        constexpr Bitboard a6 = bb::square(chess::a6);
+        constexpr Bitboard a7 = bb::square(chess::a7);
+        constexpr Bitboard a8 = bb::square(chess::a8);
+
+        constexpr Bitboard b1 = bb::square(chess::b1);
+        constexpr Bitboard b2 = bb::square(chess::b2);
+        constexpr Bitboard b3 = bb::square(chess::b3);
+        constexpr Bitboard b4 = bb::square(chess::b4);
+        constexpr Bitboard b5 = bb::square(chess::b5);
+        constexpr Bitboard b6 = bb::square(chess::b6);
+        constexpr Bitboard b7 = bb::square(chess::b7);
+        constexpr Bitboard b8 = bb::square(chess::b8);
+
+        constexpr Bitboard c1 = bb::square(chess::c1);
+        constexpr Bitboard c2 = bb::square(chess::c2);
+        constexpr Bitboard c3 = bb::square(chess::c3);
+        constexpr Bitboard c4 = bb::square(chess::c4);
+        constexpr Bitboard c5 = bb::square(chess::c5);
+        constexpr Bitboard c6 = bb::square(chess::c6);
+        constexpr Bitboard c7 = bb::square(chess::c7);
+        constexpr Bitboard c8 = bb::square(chess::c8);
+
+        constexpr Bitboard d1 = bb::square(chess::d1);
+        constexpr Bitboard d2 = bb::square(chess::d2);
+        constexpr Bitboard d3 = bb::square(chess::d3);
+        constexpr Bitboard d4 = bb::square(chess::d4);
+        constexpr Bitboard d5 = bb::square(chess::d5);
+        constexpr Bitboard d6 = bb::square(chess::d6);
+        constexpr Bitboard d7 = bb::square(chess::d7);
+        constexpr Bitboard d8 = bb::square(chess::d8);
+
+        constexpr Bitboard e1 = bb::square(chess::e1);
+        constexpr Bitboard e2 = bb::square(chess::e2);
+        constexpr Bitboard e3 = bb::square(chess::e3);
+        constexpr Bitboard e4 = bb::square(chess::e4);
+        constexpr Bitboard e5 = bb::square(chess::e5);
+        constexpr Bitboard e6 = bb::square(chess::e6);
+        constexpr Bitboard e7 = bb::square(chess::e7);
+        constexpr Bitboard e8 = bb::square(chess::e8);
+
+        constexpr Bitboard f1 = bb::square(chess::f1);
+        constexpr Bitboard f2 = bb::square(chess::f2);
+        constexpr Bitboard f3 = bb::square(chess::f3);
+        constexpr Bitboard f4 = bb::square(chess::f4);
+        constexpr Bitboard f5 = bb::square(chess::f5);
+        constexpr Bitboard f6 = bb::square(chess::f6);
+        constexpr Bitboard f7 = bb::square(chess::f7);
+        constexpr Bitboard f8 = bb::square(chess::f8);
+
+        constexpr Bitboard g1 = bb::square(chess::g1);
+        constexpr Bitboard g2 = bb::square(chess::g2);
+        constexpr Bitboard g3 = bb::square(chess::g3);
+        constexpr Bitboard g4 = bb::square(chess::g4);
+        constexpr Bitboard g5 = bb::square(chess::g5);
+        constexpr Bitboard g6 = bb::square(chess::g6);
+        constexpr Bitboard g7 = bb::square(chess::g7);
+        constexpr Bitboard g8 = bb::square(chess::g8);
+
+        constexpr Bitboard h1 = bb::square(chess::h1);
+        constexpr Bitboard h2 = bb::square(chess::h2);
+        constexpr Bitboard h3 = bb::square(chess::h3);
+        constexpr Bitboard h4 = bb::square(chess::h4);
+        constexpr Bitboard h5 = bb::square(chess::h5);
+        constexpr Bitboard h6 = bb::square(chess::h6);
+        constexpr Bitboard h7 = bb::square(chess::h7);
+        constexpr Bitboard h8 = bb::square(chess::h8);
+
+        [[nodiscard]] Bitboard between(Square s1, Square s2);
+
+        [[nodiscard]] Bitboard line(Square s1, Square s2);
+
+        template <PieceType PieceTypeV>
+        [[nodiscard]] Bitboard pseudoAttacks(Square sq);
+
+        [[nodiscard]] Bitboard pseudoAttacks(PieceType pt, Square sq);
+
+        template <PieceType PieceTypeV>
+        Bitboard attacks(Square sq, Bitboard occupied)
+        {
+            static_assert(PieceTypeV != PieceType::None && PieceTypeV != PieceType::Pawn);
+
+            assert(sq.isOk());
+
+            if constexpr (PieceTypeV == PieceType::Bishop)
+            {
+                return fancy_magics::bishopAttacks(sq, occupied);
+            }
+            else if constexpr (PieceTypeV == PieceType::Rook)
+            {
+                return fancy_magics::rookAttacks(sq, occupied);
+            }
+            else if constexpr (PieceTypeV == PieceType::Queen)
+            {
+                return
+                    fancy_magics::bishopAttacks(sq, occupied)
+                    | fancy_magics::rookAttacks(sq, occupied);
+            }
+            else
+            {
+                return pseudoAttacks<PieceTypeV>(sq);
+            }
+        }
+
+        [[nodiscard]] inline Bitboard attacks(PieceType pt, Square sq, Bitboard occupied)
+        {
+            assert(sq.isOk());
+
+            switch (pt)
+            {
+            case PieceType::Bishop:
+                return attacks<PieceType::Bishop>(sq, occupied);
+            case PieceType::Rook:
+                return attacks<PieceType::Rook>(sq, occupied);
+            case PieceType::Queen:
+                return attacks<PieceType::Queen>(sq, occupied);
+            default:
+                return pseudoAttacks(pt, sq);
+            }
+        }
+
+        [[nodiscard]] inline Bitboard pawnAttacks(Bitboard pawns, Color color);
+
+        [[nodiscard]] inline Bitboard westPawnAttacks(Bitboard pawns, Color color);
+
+        [[nodiscard]] inline Bitboard eastPawnAttacks(Bitboard pawns, Color color);
+
+        [[nodiscard]] inline bool isAttackedBySlider(
+            Square sq,
+            Bitboard bishops,
+            Bitboard rooks,
+            Bitboard queens,
+            Bitboard occupied
+        );
+
+        namespace detail
+        {
+            static constexpr std::array<Offset, 8> knightOffsets{ { {-1, -2}, {-1, 2}, {1, -2}, {1, 2}, {-2, -1}, {-2, 1}, {2, -1}, {2, 1} } };
+            static constexpr std::array<Offset, 8> kingOffsets{ { {-1, -1}, {-1, 0}, {-1, 1}, {0, -1}, {0, 1}, {1, -1}, {1, 0}, {1, 1} } };
+
+            enum Direction
+            {
+                North = 0,
+                NorthEast,
+                East,
+                SouthEast,
+                South,
+                SouthWest,
+                West,
+                NorthWest
+            };
+
+            constexpr std::array<Offset, 8> offsets = { {
+                { 0, 1 },
+                { 1, 1 },
+                { 1, 0 },
+                { 1, -1 },
+                { 0, -1 },
+                { -1, -1 },
+                { -1, 0 },
+                { -1, 1 }
+            } };
+
+            static constexpr std::array<Offset, 4> bishopOffsets{
+                offsets[NorthEast],
+                offsets[SouthEast],
+                offsets[SouthWest],
+                offsets[NorthWest]
+            };
+            static constexpr std::array<Offset, 4> rookOffsets{
+                offsets[North],
+                offsets[East],
+                offsets[South],
+                offsets[West]
+            };
+
+            [[nodiscard]] static EnumArray<Square, Bitboard> generatePseudoAttacks_Pawn()
+            {
+                // pseudo attacks don't make sense for pawns
+                return {};
+            }
+
+            [[nodiscard]] static EnumArray<Square, Bitboard> generatePseudoAttacks_Knight()
+            {
+                EnumArray<Square, Bitboard> bbs{};
+
+                for (Square fromSq = chess::a1; fromSq != Square::none(); ++fromSq)
+                {
+                    Bitboard bb{};
+
+                    for (auto&& offset : knightOffsets)
+                    {
+                        const SquareCoords toSq = fromSq.coords() + offset;
+                        if (toSq.isOk())
+                        {
+                            bb |= Square(toSq);
+                        }
+                    }
+
+                    bbs[fromSq] = bb;
+                }
+
+                return bbs;
+            }
+
+            [[nodiscard]] static Bitboard generateSliderPseudoAttacks(const std::array<Offset, 4> & offsets, Square fromSq)
+            {
+                assert(fromSq.isOk());
+
+                Bitboard bb{};
+
+                for (auto&& offset : offsets)
+                {
+                    SquareCoords fromSqC = fromSq.coords();
+
+                    for (;;)
+                    {
+                        fromSqC += offset;
+
+                        if (!fromSqC.isOk())
+                        {
+                            break;
+                        }
+
+                        bb |= Square(fromSqC);
+                    }
+                }
+
+                return bb;
+            }
+
+            [[nodiscard]] static EnumArray<Square, Bitboard> generatePseudoAttacks_Bishop()
+            {
+                EnumArray<Square, Bitboard> bbs{};
+
+                for (Square fromSq = chess::a1; fromSq != Square::none(); ++fromSq)
+                {
+                    bbs[fromSq] = generateSliderPseudoAttacks(bishopOffsets, fromSq);
+                }
+
+                return bbs;
+            }
+
+            [[nodiscard]] static EnumArray<Square, Bitboard> generatePseudoAttacks_Rook()
+            {
+                EnumArray<Square, Bitboard> bbs{};
+
+                for (Square fromSq = chess::a1; fromSq != Square::none(); ++fromSq)
+                {
+                    bbs[fromSq] = generateSliderPseudoAttacks(rookOffsets, fromSq);
+                }
+
+                return bbs;
+            }
+
+            [[nodiscard]] static EnumArray<Square, Bitboard> generatePseudoAttacks_Queen()
+            {
+                EnumArray<Square, Bitboard> bbs{};
+
+                for (Square fromSq = chess::a1; fromSq != Square::none(); ++fromSq)
+                {
+                    bbs[fromSq] =
+                        generateSliderPseudoAttacks(bishopOffsets, fromSq)
+                        | generateSliderPseudoAttacks(rookOffsets, fromSq);
+                }
+
+                return bbs;
+            }
+
+            [[nodiscard]] static EnumArray<Square, Bitboard> generatePseudoAttacks_King()
+            {
+                EnumArray<Square, Bitboard> bbs{};
+
+                for (Square fromSq = chess::a1; fromSq != Square::none(); ++fromSq)
+                {
+                    Bitboard bb{};
+
+                    for (auto&& offset : kingOffsets)
+                    {
+                        const SquareCoords toSq = fromSq.coords() + offset;
+                        if (toSq.isOk())
+                        {
+                            bb |= Square(toSq);
+                        }
+                    }
+
+                    bbs[fromSq] = bb;
+                }
+
+                return bbs;
+            }
+
+            [[nodiscard]] static EnumArray2<PieceType, Square, Bitboard> generatePseudoAttacks()
+            {
+                return EnumArray2<PieceType, Square, Bitboard>{
+                    generatePseudoAttacks_Pawn(),
+                        generatePseudoAttacks_Knight(),
+                        generatePseudoAttacks_Bishop(),
+                        generatePseudoAttacks_Rook(),
+                        generatePseudoAttacks_Queen(),
+                        generatePseudoAttacks_King()
+                };
+            }
+
+            static const EnumArray2<PieceType, Square, Bitboard> pseudoAttacks = generatePseudoAttacks();
+
+            [[nodiscard]] static Bitboard generatePositiveRayAttacks(Direction dir, Square fromSq)
+            {
+                assert(fromSq.isOk());
+
+                Bitboard bb{};
+
+                const auto offset = offsets[dir];
+                SquareCoords fromSqC = fromSq.coords();
+                for (;;)
+                {
+                    fromSqC += offset;
+
+                    if (!fromSqC.isOk())
+                    {
+                        break;
+                    }
+
+                    bb |= Square(fromSqC);
+                }
+
+                return bb;
+            }
+
+            // classical slider move generation approach https://www.chessprogramming.org/Classical_Approach
+
+            [[nodiscard]] static EnumArray<Square, Bitboard> generatePositiveRayAttacks(Direction dir)
+            {
+                EnumArray<Square, Bitboard> bbs{};
+
+                for (Square fromSq = chess::a1; fromSq != Square::none(); ++fromSq)
+                {
+                    bbs[fromSq] = generatePositiveRayAttacks(dir, fromSq);
+                }
+
+                return bbs;
+            }
+
+            [[nodiscard]] static std::array<EnumArray<Square, Bitboard>, 8> generatePositiveRayAttacks()
+            {
+                std::array<EnumArray<Square, Bitboard>, 8> bbs{};
+
+                bbs[North] = generatePositiveRayAttacks(North);
+                bbs[NorthEast] = generatePositiveRayAttacks(NorthEast);
+                bbs[East] = generatePositiveRayAttacks(East);
+                bbs[SouthEast] = generatePositiveRayAttacks(SouthEast);
+                bbs[South] = generatePositiveRayAttacks(South);
+                bbs[SouthWest] = generatePositiveRayAttacks(SouthWest);
+                bbs[West] = generatePositiveRayAttacks(West);
+                bbs[NorthWest] = generatePositiveRayAttacks(NorthWest);
+
+                return bbs;
+            }
+
+            static const std::array<EnumArray<Square, Bitboard>, 8> positiveRayAttacks = generatePositiveRayAttacks();
+
+            template <Direction DirV>
+            [[nodiscard]] static Bitboard slidingAttacks(Square sq, Bitboard occupied)
+            {
+                assert(sq.isOk());
+
+                Bitboard attacks = positiveRayAttacks[DirV][sq];
+
+                if constexpr (DirV == NorthWest || DirV == North || DirV == NorthEast || DirV == East)
+                {
+                    Bitboard blocker = (attacks & occupied) | h8; // set highest bit (H8) so msb never fails
+                    return attacks ^ positiveRayAttacks[DirV][blocker.first()];
+                }
+                else
+                {
+                    Bitboard blocker = (attacks & occupied) | a1;
+                    return attacks ^ positiveRayAttacks[DirV][blocker.last()];
+                }
+            }
+
+            template Bitboard slidingAttacks<Direction::North>(Square, Bitboard);
+            template Bitboard slidingAttacks<Direction::NorthEast>(Square, Bitboard);
+            template Bitboard slidingAttacks<Direction::East>(Square, Bitboard);
+            template Bitboard slidingAttacks<Direction::SouthEast>(Square, Bitboard);
+            template Bitboard slidingAttacks<Direction::South>(Square, Bitboard);
+            template Bitboard slidingAttacks<Direction::SouthWest>(Square, Bitboard);
+            template Bitboard slidingAttacks<Direction::West>(Square, Bitboard);
+            template Bitboard slidingAttacks<Direction::NorthWest>(Square, Bitboard);
+
+            template <PieceType PieceTypeV>
+            [[nodiscard]] inline Bitboard pieceSlidingAttacks(Square sq, Bitboard occupied)
+            {
+                static_assert(
+                    PieceTypeV == PieceType::Rook
+                    || PieceTypeV == PieceType::Bishop
+                    || PieceTypeV == PieceType::Queen);
+
+                assert(sq.isOk());
+
+                if constexpr (PieceTypeV == PieceType::Bishop)
+                {
+                    return
+                        detail::slidingAttacks<detail::NorthEast>(sq, occupied)
+                        | detail::slidingAttacks<detail::SouthEast>(sq, occupied)
+                        | detail::slidingAttacks<detail::SouthWest>(sq, occupied)
+                        | detail::slidingAttacks<detail::NorthWest>(sq, occupied);
+                }
+                else if constexpr (PieceTypeV == PieceType::Rook)
+                {
+                    return
+                        detail::slidingAttacks<detail::North>(sq, occupied)
+                        | detail::slidingAttacks<detail::East>(sq, occupied)
+                        | detail::slidingAttacks<detail::South>(sq, occupied)
+                        | detail::slidingAttacks<detail::West>(sq, occupied);
+                }
+                else // if constexpr (PieceTypeV == PieceType::Queen)
+                {
+                    return
+                        detail::slidingAttacks<detail::North>(sq, occupied)
+                        | detail::slidingAttacks<detail::NorthEast>(sq, occupied)
+                        | detail::slidingAttacks<detail::East>(sq, occupied)
+                        | detail::slidingAttacks<detail::SouthEast>(sq, occupied)
+                        | detail::slidingAttacks<detail::South>(sq, occupied)
+                        | detail::slidingAttacks<detail::SouthWest>(sq, occupied)
+                        | detail::slidingAttacks<detail::West>(sq, occupied)
+                        | detail::slidingAttacks<detail::NorthWest>(sq, occupied);
+                }
+            }
+
+            static Bitboard generateBetween(Square s1, Square s2)
+            {
+                Bitboard bb = Bitboard::none();
+
+                if (s1 == s2)
+                {
+                    return bb;
+                }
+
+                const int fd = s2.file() - s1.file();
+                const int rd = s2.rank() - s1.rank();
+
+                if (fd == 0 || rd == 0 || fd == rd || fd == -rd)
+                {
+                    // s1 and s2 lie on a line.
+                    const int fileStep = (fd > 0) - (fd < 0);
+                    const int rankStep = (rd > 0) - (rd < 0);
+                    const auto step = FlatSquareOffset(fileStep, rankStep);
+                    s1 += step; // omit s1
+                    while(s1 != s2) // omit s2
+                    {
+                        bb |= s1;
+                        s1 += step;
+                    }
+                }
+
+                return bb;
+            }
+
+            static Bitboard generateLine(Square s1, Square s2)
+            {
+                for (PieceType pt : { PieceType::Bishop, PieceType::Rook })
+                {
+                    const Bitboard s1Attacks = pseudoAttacks[pt][s1];
+                    if (s1Attacks.isSet(s2))
+                    {
+                        const Bitboard s2Attacks = pseudoAttacks[pt][s2];
+                        return (s1Attacks & s2Attacks) | s1 | s2;
+                    }
+                }
+
+                return Bitboard::none();
+            }
+
+            static const EnumArray2<Square, Square, Bitboard> between = []()
+            {
+                EnumArray2<Square, Square, Bitboard> between;
+
+                for (Square s1 : values<Square>())
+                {
+                    for (Square s2 : values<Square>())
+                    {
+                        between[s1][s2] = generateBetween(s1, s2);
+                    }
+                }
+
+                return between;
+            }();
+
+            static const EnumArray2<Square, Square, Bitboard> line = []()
+            {
+                EnumArray2<Square, Square, Bitboard> line;
+
+                for (Square s1 : values<Square>())
+                {
+                    for (Square s2 : values<Square>())
+                    {
+                        line[s1][s2] = generateLine(s1, s2);
+                    }
+                }
+
+                return line;
+            }();
+        }
+
+        namespace fancy_magics
+        {
+            enum struct MagicsType
+            {
+                Rook,
+                Bishop
+            };
+
+            alignas(64) EnumArray<Square, Bitboard> g_rookMasks;
+            alignas(64) EnumArray<Square, std::uint8_t> g_rookShifts;
+            alignas(64) EnumArray<Square, const Bitboard*> g_rookAttacks;
+
+            alignas(64) EnumArray<Square, Bitboard> g_bishopMasks;
+            alignas(64) EnumArray<Square, std::uint8_t> g_bishopShifts;
+            alignas(64) EnumArray<Square, const Bitboard*> g_bishopAttacks;
+
+            alignas(64) static std::array<Bitboard, 102400> g_allRookAttacks;
+            alignas(64) static std::array<Bitboard, 5248> g_allBishopAttacks;
+
+            template <MagicsType TypeV>
+            [[nodiscard]] inline Bitboard slidingAttacks(Square sq, Bitboard occupied)
+            {
+                if (TypeV == MagicsType::Rook)
+                {
+                    return chess::bb::detail::pieceSlidingAttacks<PieceType::Rook>(sq, occupied);
+                }
+
+                if (TypeV == MagicsType::Bishop)
+                {
+                    return chess::bb::detail::pieceSlidingAttacks<PieceType::Bishop>(sq, occupied);
+                }
+
+                return Bitboard::none();
+            }
+
+            template <MagicsType TypeV, std::size_t SizeV>
+            [[nodiscard]] inline bool initMagics(
+                const EnumArray<Square, std::uint64_t>& magics,
+                std::array<Bitboard, SizeV>& table,
+                EnumArray<Square, Bitboard>& masks,
+                EnumArray<Square, std::uint8_t>& shifts,
+                EnumArray<Square, const Bitboard*>& attacks
+            )
+            {
+                std::size_t size = 0;
+                for (Square sq : values<Square>())
+                {
+                    const Bitboard edges =
+                        ((bb::rank1 | bb::rank8) & ~Bitboard::rank(sq.rank()))
+                        | ((bb::fileA | bb::fileH) & ~Bitboard::file(sq.file()));
+
+                    Bitboard* currentAttacks = table.data() + size;
+
+                    attacks[sq] = currentAttacks;
+                    masks[sq] = slidingAttacks<TypeV>(sq, Bitboard::none()) & ~edges;
+                    shifts[sq] = 64 - masks[sq].count();
+
+                    Bitboard occupied = Bitboard::none();
+                    do
+                    {
+                        const std::size_t idx =
+                            (occupied & masks[sq]).bits()
+                            * magics[sq]
+                            >> shifts[sq];
+
+                        currentAttacks[idx] = slidingAttacks<TypeV>(sq, occupied);
+
+                        ++size;
+                        occupied = Bitboard::fromBits(occupied.bits() - masks[sq].bits()) & masks[sq];
+                    } while (occupied.any());
+                }
+
+                return true;
+            }
+
+            static bool g_isRookMagicsInitialized =
+                initMagics<MagicsType::Rook>(g_rookMagics, g_allRookAttacks, g_rookMasks, g_rookShifts, g_rookAttacks);
+
+            static bool g_isBishopMagicsInitialized =
+                initMagics<MagicsType::Bishop>(g_bishopMagics, g_allBishopAttacks, g_bishopMasks, g_bishopShifts, g_bishopAttacks);
+        }
+
+        [[nodiscard]] inline Bitboard between(Square s1, Square s2)
+        {
+            return detail::between[s1][s2];
+        }
+
+        [[nodiscard]] inline Bitboard line(Square s1, Square s2)
+        {
+            return detail::line[s1][s2];
+        }
+
+        template <PieceType PieceTypeV>
+        [[nodiscard]] inline Bitboard pseudoAttacks(Square sq)
+        {
+            static_assert(PieceTypeV != PieceType::None && PieceTypeV != PieceType::Pawn);
+
+            assert(sq.isOk());
+
+            return detail::pseudoAttacks[PieceTypeV][sq];
+        }
+
+        [[nodiscard]] inline Bitboard pseudoAttacks(PieceType pt, Square sq)
+        {
+            assert(sq.isOk());
+
+            return detail::pseudoAttacks[pt][sq];
+        }
+
+        [[nodiscard]] inline Bitboard pawnAttacks(Bitboard pawns, Color color)
+        {
+            if (color == Color::White)
+            {
+                return pawns.shifted<1, 1>() | pawns.shifted<-1, 1>();
+            }
+            else
+            {
+                return pawns.shifted<1, -1>() | pawns.shifted<-1, -1>();
+            }
+        }
+
+        [[nodiscard]] inline Bitboard westPawnAttacks(Bitboard pawns, Color color)
+        {
+            if (color == Color::White)
+            {
+                return pawns.shifted<-1, 1>();
+            }
+            else
+            {
+                return pawns.shifted<-1, -1>();
+            }
+        }
+
+        [[nodiscard]] inline Bitboard eastPawnAttacks(Bitboard pawns, Color color)
+        {
+            if (color == Color::White)
+            {
+                return pawns.shifted<1, 1>();
+            }
+            else
+            {
+                return pawns.shifted<1, -1>();
+            }
+        }
+
+        [[nodiscard]] inline bool isAttackedBySlider(
+            Square sq,
+            Bitboard bishops,
+            Bitboard rooks,
+            Bitboard queens,
+            Bitboard occupied
+        )
+        {
+            const Bitboard opponentBishopLikePieces = (bishops | queens);
+            const Bitboard bishopAttacks = bb::attacks<PieceType::Bishop>(sq, occupied);
+            if ((bishopAttacks & opponentBishopLikePieces).any())
+            {
+                return true;
+            }
+
+            const Bitboard opponentRookLikePieces = (rooks | queens);
+            const Bitboard rookAttacks = bb::attacks<PieceType::Rook>(sq, occupied);
+            return (rookAttacks & opponentRookLikePieces).any();
+        }
+    }
+
+    struct CastlingTraits
+    {
+        static constexpr EnumArray2<Color, CastleType, Square> rookDestination = { { {{ f1, d1 }}, {{ f8, d8 }} } };
+        static constexpr EnumArray2<Color, CastleType, Square> kingDestination = { { {{ g1, c1 }}, {{ g8, c8 }} } };
+
+        static constexpr EnumArray2<Color, CastleType, Square> rookStart = { { {{ h1, a1 }}, {{ h8, a8 }} } };
+
+        static constexpr EnumArray<Color, Square> kingStart = { { e1, e8 } };
+
+        static constexpr EnumArray2<Color, CastleType, Bitboard> castlingPath = {
+            {
+                {{ Bitboard::square(f1) | g1, Bitboard::square(b1) | c1 | d1 }},
+                {{ Bitboard::square(f8) | g8, Bitboard::square(b8) | c8 | d8 }}
+            }
+        };
+
+        static constexpr EnumArray2<Color, CastleType, Square> squarePassedByKing = {
+            {
+                {{ f1, d1 }},
+                {{ f8, d8 }}
+            }
+        };
+
+        static constexpr EnumArray2<Color, CastleType, CastlingRights> castlingRights = {
+            {
+                {{ CastlingRights::WhiteKingSide, CastlingRights::WhiteQueenSide }},
+                {{ CastlingRights::BlackKingSide, CastlingRights::BlackQueenSide }}
+            }
+        };
+
+        // Move has to be a legal castling move.
+        static constexpr CastleType moveCastlingType(const Move& move)
+        {
+            return (move.to.file() == fileH) ? CastleType::Short : CastleType::Long;
+        }
+
+        // Move must be a legal castling move.
+        static constexpr CastlingRights moveCastlingRight(Move move)
+        {
+            if (move.to == h1) return CastlingRights::WhiteKingSide;
+            if (move.to == a1) return CastlingRights::WhiteQueenSide;
+            if (move.to == h8) return CastlingRights::WhiteKingSide;
+            if (move.to == a8) return CastlingRights::WhiteQueenSide;
+            return CastlingRights::None;
+        }
+    };
+
+    namespace parser_bits
+    {
+        [[nodiscard]] constexpr bool isFile(char c)
+        {
+            return c >= 'a' && c <= 'h';
+        }
+
+        [[nodiscard]] constexpr bool isRank(char c)
+        {
+            return c >= '1' && c <= '8';
+        }
+
+        [[nodiscard]] constexpr Rank parseRank(char c)
+        {
+            assert(isRank(c));
+
+            return fromOrdinal<Rank>(c - '1');
+        }
+
+        [[nodiscard]] constexpr File parseFile(char c)
+        {
+            assert(isFile(c));
+
+            return fromOrdinal<File>(c - 'a');
+        }
+
+        [[nodiscard]] constexpr bool isSquare(const char* s)
+        {
+            return isFile(s[0]) && isRank(s[1]);
+        }
+
+        [[nodiscard]] constexpr Square parseSquare(const char* s)
+        {
+            const File file = parseFile(s[0]);
+            const Rank rank = parseRank(s[1]);
+            return Square(file, rank);
+        }
+
+        [[nodiscard]] constexpr std::optional<Square> tryParseSquare(std::string_view s)
+        {
+            if (s.size() != 2) return {};
+            if (!isSquare(s.data())) return {};
+            return parseSquare(s.data());
+        }
+
+        [[nodiscard]] constexpr std::optional<Square> tryParseEpSquare(std::string_view s)
+        {
+            if (s == std::string_view("-")) return Square::none();
+            return tryParseSquare(s);
+        }
+
+        [[nodiscard]] constexpr std::optional<CastlingRights> tryParseCastlingRights(std::string_view s)
+        {
+            if (s == std::string_view("-")) return CastlingRights::None;
+
+            CastlingRights rights = CastlingRights::None;
+
+            for (auto& c : s)
+            {
+                CastlingRights toAdd = CastlingRights::None;
+                switch (c)
+                {
+                case 'K':
+                    toAdd = CastlingRights::WhiteKingSide;
+                    break;
+                case 'Q':
+                    toAdd = CastlingRights::WhiteQueenSide;
+                    break;
+                case 'k':
+                    toAdd = CastlingRights::BlackKingSide;
+                    break;
+                case 'q':
+                    toAdd = CastlingRights::BlackQueenSide;
+                    break;
+                }
+
+                // If there are duplicated castling rights specification we bail.
+                // If there is an invalid character we bail.
+                // (It always contains None)
+                if (contains(rights, toAdd)) return {};
+                else rights |= toAdd;
+            }
+
+            return rights;
+        }
+
+        [[nodiscard]] constexpr CastlingRights readCastlingRights(const char*& s)
+        {
+            CastlingRights rights = CastlingRights::None;
+
+            while (*s != ' ')
+            {
+                switch (*s)
+                {
+                case 'K':
+                    rights |= CastlingRights::WhiteKingSide;
+                    break;
+                case 'Q':
+                    rights |= CastlingRights::WhiteQueenSide;
+                    break;
+                case 'k':
+                    rights |= CastlingRights::BlackKingSide;
+                    break;
+                case 'q':
+                    rights |= CastlingRights::BlackQueenSide;
+                    break;
+                }
+
+                ++s;
+            }
+
+            return rights;
+        }
+
+        FORCEINLINE inline void appendCastlingRightsToString(CastlingRights rights, std::string& str)
+        {
+            if (rights == CastlingRights::None)
+            {
+                str += '-';
+            }
+            else
+            {
+                if (contains(rights, CastlingRights::WhiteKingSide)) str += 'K';
+                if (contains(rights, CastlingRights::WhiteQueenSide)) str += 'Q';
+                if (contains(rights, CastlingRights::BlackKingSide)) str += 'k';
+                if (contains(rights, CastlingRights::BlackQueenSide)) str += 'q';
+            }
+        }
+
+        FORCEINLINE inline void appendSquareToString(Square sq, std::string& str)
+        {
+            str += static_cast<char>('a' + ordinal(sq.file()));
+            str += static_cast<char>('1' + ordinal(sq.rank()));
+        }
+
+        FORCEINLINE inline void appendEpSquareToString(Square sq, std::string& str)
+        {
+            if (sq == Square::none())
+            {
+                str += '-';
+            }
+            else
+            {
+                appendSquareToString(sq, str);
+            }
+        }
+
+        FORCEINLINE inline void appendRankToString(Rank r, std::string& str)
+        {
+            str += static_cast<char>('1' + ordinal(r));
+        }
+
+        FORCEINLINE inline void appendFileToString(File f, std::string& str)
+        {
+            str += static_cast<char>('a' + ordinal(f));
+        }
+
+        [[nodiscard]] FORCEINLINE inline bool isDigit(char c)
+        {
+            return c >= '0' && c <= '9';
+        }
+
+        [[nodiscard]] inline std::uint16_t parseUInt16(std::string_view sv)
+        {
+            assert(sv.size() > 0);
+            assert(sv.size() <= 5);
+
+            std::uint16_t v = 0;
+
+            std::size_t idx = 0;
+            switch (sv.size())
+            {
+            case 5:
+                v += (sv[idx++] - '0') * 10000;
+            case 4:
+                v += (sv[idx++] - '0') * 1000;
+            case 3:
+                v += (sv[idx++] - '0') * 100;
+            case 2:
+                v += (sv[idx++] - '0') * 10;
+            case 1:
+                v += sv[idx] - '0';
+                break;
+
+            default:
+                assert(false);
+            }
+
+            return v;
+        }
+
+        [[nodiscard]] inline std::optional<std::uint16_t> tryParseUInt16(std::string_view sv)
+        {
+            if (sv.size() == 0 || sv.size() > 5) return std::nullopt;
+
+            std::uint32_t v = 0;
+
+            std::size_t idx = 0;
+            switch (sv.size())
+            {
+            case 5:
+                v += (sv[idx++] - '0') * 10000;
+            case 4:
+                v += (sv[idx++] - '0') * 1000;
+            case 3:
+                v += (sv[idx++] - '0') * 100;
+            case 2:
+                v += (sv[idx++] - '0') * 10;
+            case 1:
+                v += sv[idx] - '0';
+                break;
+
+            default:
+                assert(false);
+            }
+
+            if (v > std::numeric_limits<std::uint16_t>::max())
+            {
+                return std::nullopt;
+            }
+
+            return static_cast<std::uint16_t>(v);
+        }
+    }
+
+
+    struct Board
+    {
+        constexpr Board() noexcept :
+            m_pieces{},
+            m_pieceBB{},
+            m_piecesByColorBB{},
+            m_pieceCount{}
+        {
+            m_pieces.fill(Piece::none());
+            m_pieceBB.fill(Bitboard::none());
+            m_pieceBB[Piece::none()] = Bitboard::all();
+            m_piecesByColorBB.fill(Bitboard::none());
+            m_pieceCount.fill(0);
+            m_pieceCount[Piece::none()] = 64;
+        }
+
+        [[nodiscard]] inline bool isValid() const
+        {
+            if (piecesBB(whiteKing).count() != 1) return false;
+            if (piecesBB(blackKing).count() != 1) return false;
+            if (((piecesBB(whitePawn) | piecesBB(blackPawn)) & (bb::rank(rank1) | bb::rank(rank8))).any()) return false;
+            return true;
+        }
+
+        [[nodiscard]] std::string fen() const;
+
+        [[nodiscard]] inline bool trySet(std::string_view boardState)
+        {
+            File f = fileA;
+            Rank r = rank8;
+            bool lastWasSkip = false;
+            for (auto c : boardState)
+            {
+                Piece piece = Piece::none();
+                switch (c)
+                {
+                case 'r':
+                    piece = Piece(PieceType::Rook, Color::Black);
+                    break;
+                case 'n':
+                    piece = Piece(PieceType::Knight, Color::Black);
+                    break;
+                case 'b':
+                    piece = Piece(PieceType::Bishop, Color::Black);
+                    break;
+                case 'q':
+                    piece = Piece(PieceType::Queen, Color::Black);
+                    break;
+                case 'k':
+                    piece = Piece(PieceType::King, Color::Black);
+                    break;
+                case 'p':
+                    piece = Piece(PieceType::Pawn, Color::Black);
+                    break;
+
+                case 'R':
+                    piece = Piece(PieceType::Rook, Color::White);
+                    break;
+                case 'N':
+                    piece = Piece(PieceType::Knight, Color::White);
+                    break;
+                case 'B':
+                    piece = Piece(PieceType::Bishop, Color::White);
+                    break;
+                case 'Q':
+                    piece = Piece(PieceType::Queen, Color::White);
+                    break;
+                case 'K':
+                    piece = Piece(PieceType::King, Color::White);
+                    break;
+                case 'P':
+                    piece = Piece(PieceType::Pawn, Color::White);
+                    break;
+
+                case '1':
+                case '2':
+                case '3':
+                case '4':
+                case '5':
+                case '6':
+                case '7':
+                case '8':
+                {
+                    if (lastWasSkip) return false;
+                    lastWasSkip = true;
+
+                    const int skip = c - '0';
+                    f += skip;
+                    if (f > fileH + 1) return false;
+                    break;
+                }
+
+                case '/':
+                    lastWasSkip = false;
+                    if (f != fileH + 1) return false;
+                    f = fileA;
+                    --r;
+                    break;
+
+                default:
+                    return false;
+                }
+
+                if (piece != Piece::none())
+                {
+                    lastWasSkip = false;
+
+                    const Square sq(f, r);
+                    if (!sq.isOk()) return false;
+
+                    place(piece, sq);
+                    ++f;
+                }
+            }
+
+            if (f != fileH + 1) return false;
+            if (r != rank1) return false;
+
+            return isValid();
+        }
+
+        // returns side to move
+        [[nodiscard]] constexpr const char* set(const char* fen)
+        {
+            assert(fen != nullptr);
+
+            File f = fileA;
+            Rank r = rank8;
+            auto current = fen;
+            bool done = false;
+            while (*current != '\0')
+            {
+                Piece piece = Piece::none();
+                switch (*current)
+                {
+                case 'r':
+                    piece = Piece(PieceType::Rook, Color::Black);
+                    break;
+                case 'n':
+                    piece = Piece(PieceType::Knight, Color::Black);
+                    break;
+                case 'b':
+                    piece = Piece(PieceType::Bishop, Color::Black);
+                    break;
+                case 'q':
+                    piece = Piece(PieceType::Queen, Color::Black);
+                    break;
+                case 'k':
+                    piece = Piece(PieceType::King, Color::Black);
+                    break;
+                case 'p':
+                    piece = Piece(PieceType::Pawn, Color::Black);
+                    break;
+
+                case 'R':
+                    piece = Piece(PieceType::Rook, Color::White);
+                    break;
+                case 'N':
+                    piece = Piece(PieceType::Knight, Color::White);
+                    break;
+                case 'B':
+                    piece = Piece(PieceType::Bishop, Color::White);
+                    break;
+                case 'Q':
+                    piece = Piece(PieceType::Queen, Color::White);
+                    break;
+                case 'K':
+                    piece = Piece(PieceType::King, Color::White);
+                    break;
+                case 'P':
+                    piece = Piece(PieceType::Pawn, Color::White);
+                    break;
+
+                case ' ':
+                    done = true;
+                    break;
+
+                case '1':
+                case '2':
+                case '3':
+                case '4':
+                case '5':
+                case '6':
+                case '7':
+                case '8':
+                {
+                    const int skip = (*current) - '0';
+                    f += skip;
+                    break;
+                }
+
+                case '/':
+                    f = fileA;
+                    --r;
+                    break;
+
+                default:
+                    break;
+                }
+
+                if (done)
+                {
+                    break;
+                }
+
+                if (piece != Piece::none())
+                {
+                    place(piece, Square(f, r));
+                    ++f;
+                }
+
+                ++current;
+            }
+
+            return current;
+        }
+
+        static constexpr Board fromFen(const char* fen)
+        {
+            Board board;
+            (void)board.set(fen);
+            return board;
+        }
+
+        [[nodiscard]] constexpr friend bool operator==(const Board& lhs, const Board& rhs) noexcept
+        {
+            bool equal = true;
+            for (Square sq = a1; sq <= h8; ++sq)
+            {
+                if (lhs.m_pieces[sq] != rhs.m_pieces[sq])
+                {
+                    equal = false;
+                    break;
+                }
+            }
+
+            assert(bbsEqual(lhs, rhs) == equal);
+
+            return equal;
+        }
+
+        constexpr void place(Piece piece, Square sq)
+        {
+            assert(sq.isOk());
+
+            auto oldPiece = m_pieces[sq];
+            m_pieceBB[oldPiece] ^= sq;
+            if (oldPiece != Piece::none())
+            {
+                m_piecesByColorBB[oldPiece.color()] ^= sq;
+            }
+            m_pieces[sq] = piece;
+            m_pieceBB[piece] |= sq;
+            m_piecesByColorBB[piece.color()] |= sq;
+            --m_pieceCount[oldPiece];
+            ++m_pieceCount[piece];
+        }
+
+        // returns captured piece
+        // doesn't check validity
+        FORCEINLINE constexpr Piece doMove(Move move)
+        {
+            if (move.type == MoveType::Normal)
+            {
+                const Piece capturedPiece = m_pieces[move.to];
+                const Piece piece = m_pieces[move.from];
+
+                const Bitboard frombb = Bitboard::square(move.from);
+                const Bitboard tobb = Bitboard::square(move.to);
+                const Bitboard xormove = frombb ^ tobb;
+
+                m_pieces[move.to] = piece;
+                m_pieces[move.from] = Piece::none();
+
+                m_pieceBB[piece] ^= xormove;
+
+                m_piecesByColorBB[piece.color()] ^= xormove;
+
+                if (capturedPiece == Piece::none())
+                {
+                    m_pieceBB[Piece::none()] ^= xormove;
+                }
+                else
+                {
+                    m_pieceBB[capturedPiece] ^= tobb;
+                    m_pieceBB[Piece::none()] ^= frombb;
+
+                    m_piecesByColorBB[capturedPiece.color()] ^= tobb;
+
+                    --m_pieceCount[capturedPiece];
+                    ++m_pieceCount[Piece::none()];
+                }
+
+                return capturedPiece;
+            }
+
+            return doMoveColdPath(move);
+        }
+
+        NOINLINE constexpr Piece doMoveColdPath(Move move)
+        {
+            if (move.type == MoveType::Promotion)
+            {
+                // We split it even though it's similar just because
+                // the normal case is much more common.
+                const Piece capturedPiece = m_pieces[move.to];
+                const Piece fromPiece = m_pieces[move.from];
+                const Piece toPiece = move.promotedPiece;
+
+                m_pieces[move.to] = toPiece;
+                m_pieces[move.from] = Piece::none();
+
+                m_pieceBB[fromPiece] ^= move.from;
+                m_pieceBB[toPiece] ^= move.to;
+
+                m_pieceBB[capturedPiece] ^= move.to;
+                m_pieceBB[Piece::none()] ^= move.from;
+
+                m_piecesByColorBB[fromPiece.color()] ^= move.to;
+                m_piecesByColorBB[fromPiece.color()] ^= move.from;
+                if (capturedPiece != Piece::none())
+                {
+                    m_piecesByColorBB[capturedPiece.color()] ^= move.to;
+                    --m_pieceCount[capturedPiece];
+                    ++m_pieceCount[Piece::none()];
+                }
+
+                --m_pieceCount[fromPiece];
+                ++m_pieceCount[toPiece];
+
+                return capturedPiece;
+            }
+            else if (move.type == MoveType::EnPassant)
+            {
+                const Piece movedPiece = m_pieces[move.from];
+                const Piece capturedPiece(PieceType::Pawn, !movedPiece.color());
+                const Square capturedPieceSq(move.to.file(), move.from.rank());
+
+                // on ep move there are 3 squares involved
+                m_pieces[move.to] = movedPiece;
+                m_pieces[move.from] = Piece::none();
+                m_pieces[capturedPieceSq] = Piece::none();
+
+                m_pieceBB[movedPiece] ^= move.from;
+                m_pieceBB[movedPiece] ^= move.to;
+
+                m_pieceBB[Piece::none()] ^= move.from;
+                m_pieceBB[Piece::none()] ^= move.to;
+
+                m_pieceBB[capturedPiece] ^= capturedPieceSq;
+                m_pieceBB[Piece::none()] ^= capturedPieceSq;
+
+                m_piecesByColorBB[movedPiece.color()] ^= move.to;
+                m_piecesByColorBB[movedPiece.color()] ^= move.from;
+                m_piecesByColorBB[capturedPiece.color()] ^= capturedPieceSq;
+
+                --m_pieceCount[capturedPiece];
+                ++m_pieceCount[Piece::none()];
+
+                return capturedPiece;
+            }
+            else // if (move.type == MoveType::Castle)
+            {
+                const Square rookFromSq = move.to;
+                const Square kingFromSq = move.from;
+
+                const Piece rook = m_pieces[rookFromSq];
+                const Piece king = m_pieces[kingFromSq];
+                const Color color = king.color();
+
+                const CastleType castleType = CastlingTraits::moveCastlingType(move);
+                const Square rookToSq = CastlingTraits::rookDestination[color][castleType];
+                const Square kingToSq = CastlingTraits::kingDestination[color][castleType];
+
+                // 4 squares are involved
+                m_pieces[rookFromSq] = Piece::none();
+                m_pieces[kingFromSq] = Piece::none();
+                m_pieces[rookToSq] = rook;
+                m_pieces[kingToSq] = king;
+
+                m_pieceBB[rook] ^= rookFromSq;
+                m_pieceBB[rook] ^= rookToSq;
+
+                m_pieceBB[king] ^= kingFromSq;
+                m_pieceBB[king] ^= kingToSq;
+
+                m_pieceBB[Piece::none()] ^= rookFromSq;
+                m_pieceBB[Piece::none()] ^= rookToSq;
+
+                m_pieceBB[Piece::none()] ^= kingFromSq;
+                m_pieceBB[Piece::none()] ^= kingToSq;
+
+                m_piecesByColorBB[color] ^= rookFromSq;
+                m_piecesByColorBB[color] ^= rookToSq;
+                m_piecesByColorBB[color] ^= kingFromSq;
+                m_piecesByColorBB[color] ^= kingToSq;
+
+                return Piece::none();
+            }
+        }
+
+        constexpr void undoMove(Move move, Piece capturedPiece)
+        {
+            if (move.type == MoveType::Normal || move.type == MoveType::Promotion)
+            {
+                const Piece toPiece = m_pieces[move.to];
+                const Piece fromPiece = move.promotedPiece == Piece::none() ? toPiece : Piece(PieceType::Pawn, toPiece.color());
+
+                m_pieces[move.from] = fromPiece;
+                m_pieces[move.to] = capturedPiece;
+
+                m_pieceBB[fromPiece] ^= move.from;
+                m_pieceBB[toPiece] ^= move.to;
+
+                m_pieceBB[capturedPiece] ^= move.to;
+                m_pieceBB[Piece::none()] ^= move.from;
+
+                m_piecesByColorBB[fromPiece.color()] ^= move.to;
+                m_piecesByColorBB[fromPiece.color()] ^= move.from;
+                if (capturedPiece != Piece::none())
+                {
+                    m_piecesByColorBB[capturedPiece.color()] ^= move.to;
+                    ++m_pieceCount[capturedPiece];
+                    --m_pieceCount[Piece::none()];
+                }
+
+                if (move.type == MoveType::Promotion)
+                {
+                    --m_pieceCount[toPiece];
+                    ++m_pieceCount[fromPiece];
+                }
+            }
+            else if (move.type == MoveType::EnPassant)
+            {
+                const Piece movedPiece = m_pieces[move.to];
+                const Piece capturedPiece(PieceType::Pawn, !movedPiece.color());
+                const Square capturedPieceSq(move.to.file(), move.from.rank());
+
+                m_pieces[move.to] = Piece::none();
+                m_pieces[move.from] = movedPiece;
+                m_pieces[capturedPieceSq] = capturedPiece;
+
+                m_pieceBB[movedPiece] ^= move.from;
+                m_pieceBB[movedPiece] ^= move.to;
+
+                m_pieceBB[Piece::none()] ^= move.from;
+                m_pieceBB[Piece::none()] ^= move.to;
+
+                // on ep move there are 3 squares involved
+                m_pieceBB[capturedPiece] ^= capturedPieceSq;
+                m_pieceBB[Piece::none()] ^= capturedPieceSq;
+
+                m_piecesByColorBB[movedPiece.color()] ^= move.to;
+                m_piecesByColorBB[movedPiece.color()] ^= move.from;
+                m_piecesByColorBB[capturedPiece.color()] ^= capturedPieceSq;
+
+                ++m_pieceCount[capturedPiece];
+                --m_pieceCount[Piece::none()];
+            }
+            else // if (move.type == MoveType::Castle)
+            {
+                const Square rookFromSq = move.to;
+                const Square kingFromSq = move.from;
+
+                const Color color = move.to.rank() == rank1 ? Color::White : Color::Black;
+
+                const CastleType castleType = CastlingTraits::moveCastlingType(move);
+                const Square rookToSq = CastlingTraits::rookDestination[color][castleType];
+                const Square kingToSq = CastlingTraits::kingDestination[color][castleType];
+
+                const Piece rook = m_pieces[rookToSq];
+                const Piece king = m_pieces[kingToSq];
+
+                // 4 squares are involved
+                m_pieces[rookFromSq] = rook;
+                m_pieces[kingFromSq] = king;
+                m_pieces[rookToSq] = Piece::none();
+                m_pieces[kingToSq] = Piece::none();
+
+                m_pieceBB[rook] ^= rookFromSq;
+                m_pieceBB[rook] ^= rookToSq;
+
+                m_pieceBB[king] ^= kingFromSq;
+                m_pieceBB[king] ^= kingToSq;
+
+                m_pieceBB[Piece::none()] ^= rookFromSq;
+                m_pieceBB[Piece::none()] ^= rookToSq;
+
+                m_pieceBB[Piece::none()] ^= kingFromSq;
+                m_pieceBB[Piece::none()] ^= kingToSq;
+
+                m_piecesByColorBB[color] ^= rookFromSq;
+                m_piecesByColorBB[color] ^= rookToSq;
+                m_piecesByColorBB[color] ^= kingFromSq;
+                m_piecesByColorBB[color] ^= kingToSq;
+            }
+        }
+
+        // Returns whether a given square is attacked by any piece
+        // of `attackerColor` side.
+        [[nodiscard]] bool isSquareAttacked(Square sq, Color attackerColor) const;
+
+        // Returns whether a given square is attacked by any piece
+        // of `attackerColor` side after `move` is made.
+        // Move must be pseudo legal.
+        [[nodiscard]] bool isSquareAttackedAfterMove(Move move, Square sq, Color attackerColor) const;
+
+        // Move must be pseudo legal.
+        // Must not be a king move.
+        [[nodiscard]] bool createsDiscoveredAttackOnOwnKing(Move move) const;
+
+        // Returns whether a piece on a given square is attacked
+        // by any enemy piece. False if square is empty.
+        [[nodiscard]] bool isPieceAttacked(Square sq) const;
+
+        // Returns whether a piece on a given square is attacked
+        // by any enemy piece after `move` is made. False if square is empty.
+        // Move must be pseudo legal.
+        [[nodiscard]] bool isPieceAttackedAfterMove(Move move, Square sq) const;
+
+        // Returns whether the king of the moving side is attacked
+        // by any enemy piece after a move is made.
+        // Move must be pseudo legal.
+        [[nodiscard]] bool isOwnKingAttackedAfterMove(Move move) const;
+
+        // Return a bitboard with all (pseudo legal) attacks by the piece on
+        // the given square. Empty if no piece on the square.
+        [[nodiscard]] Bitboard attacks(Square sq) const;
+
+        // Returns a bitboard with all squared that have pieces
+        // that attack a given square (pseudo legally)
+        [[nodiscard]] Bitboard attackers(Square sq, Color attackerColor) const;
+
+        [[nodiscard]] constexpr Piece pieceAt(Square sq) const
+        {
+            assert(sq.isOk());
+
+            return m_pieces[sq];
+        }
+
+        [[nodiscard]] constexpr Bitboard piecesBB(Color c) const
+        {
+            return m_piecesByColorBB[c];
+        }
+
+        [[nodiscard]] inline Square kingSquare(Color c) const
+        {
+            return piecesBB(Piece(PieceType::King, c)).first();
+        }
+
+        [[nodiscard]] constexpr Bitboard piecesBB(Piece pc) const
+        {
+            return m_pieceBB[pc];
+        }
+
+        [[nodiscard]] constexpr Bitboard piecesBB() const
+        {
+            Bitboard bb{};
+
+            // don't collect from null piece
+            return piecesBB(Color::White) | piecesBB(Color::Black);
+
+            return bb;
+        }
+
+        [[nodiscard]] constexpr std::uint8_t pieceCount(Piece pt) const
+        {
+            return m_pieceCount[pt];
+        }
+
+        [[nodiscard]] constexpr bool isPromotion(Square from, Square to) const
+        {
+            assert(from.isOk() && to.isOk());
+
+            return m_pieces[from].type() == PieceType::Pawn && (to.rank() == rank1 || to.rank() == rank8);
+        }
+
+        const Piece* piecesRaw() const;
+
+    private:
+        EnumArray<Square, Piece> m_pieces;
+        EnumArray<Piece, Bitboard> m_pieceBB;
+        EnumArray<Color, Bitboard> m_piecesByColorBB;
+        EnumArray<Piece, uint8_t> m_pieceCount;
+
+        // NOTE: currently we don't track it because it's not
+        // required to perform ep if we don't need to check validity
+        // Square m_epSquare = Square::none();
+
+        [[nodiscard]] static constexpr bool bbsEqual(const Board& lhs, const Board& rhs) noexcept
+        {
+            for (Piece pc : values<Piece>())
+            {
+                if (lhs.m_pieceBB[pc] != rhs.m_pieceBB[pc])
+                {
+                    return false;
+                }
+            }
+
+            return true;
+        }
+    };
+
+    struct Position;
+
+    struct MoveLegalityChecker
+    {
+        MoveLegalityChecker(const Position& position);
+
+        [[nodiscard]] bool isPseudoLegalMoveLegal(const Move& move) const;
+
+    private:
+        const Position* m_position;
+        Bitboard m_checkers;
+        Bitboard m_ourBlockersForKing;
+        Bitboard m_potentialCheckRemovals;
+        Square m_ksq;
+    };
+
+    struct CompressedPosition;
+
+    struct PositionHash128
+    {
+        std::uint64_t high;
+        std::uint64_t low;
+    };
+
+    struct Position : public Board
+    {
+        using BaseType = Board;
+
+        constexpr Position() noexcept :
+            Board(),
+            m_sideToMove(Color::White),
+            m_epSquare(Square::none()),
+            m_castlingRights(CastlingRights::All),
+            m_rule50Counter(0),
+            m_ply(0)
+        {
+        }
+
+        constexpr Position(const Board& board, Color sideToMove, Square epSquare, CastlingRights castlingRights) :
+            Board(board),
+            m_sideToMove(sideToMove),
+            m_epSquare(epSquare),
+            m_castlingRights(castlingRights),
+            m_rule50Counter(0),
+            m_ply(0)
+        {
+        }
+
+        void set(std::string_view fen);
+
+        // Returns false if the fen was not valid
+        // If the returned value was false the position
+        // is in unspecified state.
+        [[nodiscard]] bool trySet(std::string_view fen);
+
+        [[nodiscard]] static Position fromFen(std::string_view fen);
+
+        [[nodiscard]] static std::optional<Position> tryFromFen(std::string_view fen);
+
+        [[nodiscard]] static Position startPosition();
+
+        [[nodiscard]] std::string fen() const;
+
+        constexpr void setEpSquareUnchecked(Square sq)
+        {
+            m_epSquare = sq;
+        }
+
+        void setEpSquare(Square sq)
+        {
+            m_epSquare = sq;
+            nullifyEpSquareIfNotPossible();
+        }
+
+        constexpr void setSideToMove(Color color)
+        {
+            m_sideToMove = color;
+        }
+
+        constexpr void addCastlingRights(CastlingRights rights)
+        {
+            m_castlingRights |= rights;
+        }
+
+        constexpr void setCastlingRights(CastlingRights rights)
+        {
+            m_castlingRights = rights;
+        }
+
+        constexpr void setRule50Counter(std::uint8_t v)
+        {
+            m_rule50Counter = v;
+        }
+
+        constexpr void setPly(std::uint16_t ply)
+        {
+            m_ply = ply;
+        }
+
+        ReverseMove doMove(const Move& move);
+
+        constexpr void undoMove(const ReverseMove& reverseMove)
+        {
+            const Move& move = reverseMove.move;
+            BaseType::undoMove(move, reverseMove.capturedPiece);
+
+            m_epSquare = reverseMove.oldEpSquare;
+            m_castlingRights = reverseMove.oldCastlingRights;
+
+            m_sideToMove = !m_sideToMove;
+
+            --m_ply;
+            if (m_rule50Counter > 0)
+            {
+                m_rule50Counter -= 1;
+            }
+        }
+
+        [[nodiscard]] constexpr Color sideToMove() const
+        {
+            return m_sideToMove;
+        }
+
+        [[nodiscard]] std::uint8_t rule50Counter() const
+        {
+            return m_rule50Counter;
+        }
+
+        [[nodiscard]] std::uint16_t ply() const
+        {
+            return m_ply;
+        }
+
+        [[nodiscard]] std::uint16_t halfMove() const
+        {
+            return (m_ply + 1) / 2;
+        }
+
+        void setHalfMove(std::uint16_t hm)
+        {
+            m_ply = 2 * hm - 1 + (m_sideToMove == Color::Black);
+        }
+
+        [[nodiscard]] bool isCheck() const;
+
+        [[nodiscard]] Bitboard checkers() const;
+
+        [[nodiscard]] bool isCheckAfterMove(Move move) const;
+
+        // Checks whether ANY `move` is legal.
+        [[nodiscard]] bool isMoveLegal(Move move) const;
+
+        [[nodiscard]] bool isPseudoLegalMoveLegal(Move move) const;
+
+        [[nodiscard]] bool isMovePseudoLegal(Move move) const;
+
+        // Returns all pieces that block a slider
+        // from attacking our king. When two or more
+        // pieces block a single slider then none
+        // of these pieces are included.
+        [[nodiscard]] Bitboard blockersForKing(Color color) const;
+
+        [[nodiscard]] MoveLegalityChecker moveLegalityChecker() const
+        {
+            return { *this };
+        }
+
+        [[nodiscard]] constexpr Square epSquare() const
+        {
+            return m_epSquare;
+        }
+
+        [[nodiscard]] constexpr CastlingRights castlingRights() const
+        {
+            return m_castlingRights;
+        }
+
+        [[nodiscard]] constexpr bool friend operator==(const Position& lhs, const Position& rhs) noexcept
+        {
+            return
+                lhs.m_sideToMove == rhs.m_sideToMove
+                && lhs.m_epSquare == rhs.m_epSquare
+                && lhs.m_castlingRights == rhs.m_castlingRights
+                && static_cast<const Board&>(lhs) == static_cast<const Board&>(rhs);
+        }
+
+        [[nodiscard]] constexpr bool friend operator!=(const Position& lhs, const Position& rhs) noexcept
+        {
+            return !(lhs == rhs);
+        }
+
+        // these are supposed to be used only for testing
+        // that's why there's this assert in afterMove
+
+        [[nodiscard]] constexpr Position beforeMove(const ReverseMove& reverseMove) const
+        {
+            Position cpy(*this);
+            cpy.undoMove(reverseMove);
+            return cpy;
+        }
+
+        [[nodiscard]] Position afterMove(Move move) const;
+
+        [[nodiscard]] constexpr bool isEpPossible() const
+        {
+            return m_epSquare != Square::none();
+        }
+
+        [[nodiscard]] inline CompressedPosition compress() const;
+
+    protected:
+        Color m_sideToMove;
+        Square m_epSquare;
+        CastlingRights m_castlingRights;
+        std::uint8_t m_rule50Counter;
+        std::uint16_t m_ply;
+
+        static_assert(sizeof(Color) + sizeof(Square) + sizeof(CastlingRights) + sizeof(std::uint8_t) == 4);
+
+        [[nodiscard]] FORCEINLINE bool isEpPossible(Square epSquare, Color sideToMove) const;
+
+        [[nodiscard]] NOINLINE bool isEpPossibleColdPath(Square epSquare, Bitboard pawnsAttackingEpSquare, Color sideToMove) const;
+
+        void nullifyEpSquareIfNotPossible();
+    };
+
+    struct CompressedPosition
+    {
+        friend struct Position;
+
+        // Occupied bitboard has bits set for
+        // each square with a piece on it.
+        // Each packedState byte holds 2 values (nibbles).
+        // First one at low bits, second one at high bits.
+        // Values correspond to consecutive squares
+        // in bitboard iteration order.
+        // Nibble values:
+        // these are the same as for Piece
+        // knights, bishops, queens can just be copied
+        //  0 : white pawn
+        //  1 : black pawn
+        //  2 : white knight
+        //  3 : black knight
+        //  4 : white bishop
+        //  5 : black bishop
+        //  6 : white rook
+        //  7 : black rook
+        //  8 : white queen
+        //  9 : black queen
+        // 10 : white king
+        // 11 : black king
+        //
+        // these are special
+        // 12 : pawn with ep square behind (white or black, depending on rank)
+        // 13 : white rook with coresponding castling rights
+        // 14 : black rook with coresponding castling rights
+        // 15 : black king and black is side to move
+        //
+        // Let N be the number of bits set in occupied bitboard.
+        // Only N nibbles are present. (N+1)/2 bytes are initialized.
+
+        static CompressedPosition readFromBigEndian(const unsigned char* data)
+        {
+            CompressedPosition pos{};
+            pos.m_occupied = Bitboard::fromBits(
+                (std::uint64_t)data[0] << 56
+                | (std::uint64_t)data[1] << 48
+                | (std::uint64_t)data[2] << 40
+                | (std::uint64_t)data[3] << 32
+                | (std::uint64_t)data[4] << 24
+                | (std::uint64_t)data[5] << 16
+                | (std::uint64_t)data[6] << 8
+                | (std::uint64_t)data[7]
+                );
+            std::memcpy(pos.m_packedState, data + 8, 16);
+            return pos;
+        }
+
+        constexpr CompressedPosition() :
+            m_occupied{},
+            m_packedState{}
+        {
+        }
+
+        [[nodiscard]] friend bool operator<(const CompressedPosition& lhs, const CompressedPosition& rhs)
+        {
+            if (lhs.m_occupied.bits() < rhs.m_occupied.bits()) return true;
+            if (lhs.m_occupied.bits() > rhs.m_occupied.bits()) return false;
+
+            return std::strcmp(reinterpret_cast<const char*>(lhs.m_packedState), reinterpret_cast<const char*>(rhs.m_packedState)) < 0;
+        }
+
+        [[nodiscard]] friend bool operator==(const CompressedPosition& lhs, const CompressedPosition& rhs)
+        {
+            return lhs.m_occupied == rhs.m_occupied
+                && std::strcmp(reinterpret_cast<const char*>(lhs.m_packedState), reinterpret_cast<const char*>(rhs.m_packedState)) == 0;
+        }
+
+        [[nodiscard]] inline Position decompress() const;
+
+        [[nodiscard]] constexpr Bitboard pieceBB() const
+        {
+            return m_occupied;
+        }
+
+        void writeToBigEndian(unsigned char* data)
+        {
+            const auto occupied = m_occupied.bits();
+            *data++ = occupied >> 56;
+            *data++ = (occupied >> 48) & 0xFF;
+            *data++ = (occupied >> 40) & 0xFF;
+            *data++ = (occupied >> 32) & 0xFF;
+            *data++ = (occupied >> 24) & 0xFF;
+            *data++ = (occupied >> 16) & 0xFF;
+            *data++ = (occupied >> 8) & 0xFF;
+            *data++ = occupied & 0xFF;
+            std::memcpy(data, m_packedState, 16);
+        }
+
+    private:
+        Bitboard m_occupied;
+        std::uint8_t m_packedState[16];
+    };
+
+    static_assert(sizeof(CompressedPosition) == 24);
+    static_assert(std::is_trivially_copyable_v<CompressedPosition>);
+
+    namespace detail
+    {
+        [[nodiscard]] FORCEINLINE constexpr std::uint8_t compressOrdinaryPiece(const Position&, Square, Piece piece)
+        {
+            return static_cast<std::uint8_t>(ordinal(piece));
+        }
+
+        [[nodiscard]] FORCEINLINE constexpr std::uint8_t compressPawn(const Position& position, Square sq, Piece piece)
+        {
+            const Square epSquare = position.epSquare();
+            if (epSquare == Square::none())
+            {
+                return static_cast<std::uint8_t>(ordinal(piece));
+            }
+            else
+            {
+                const Color sideToMove = position.sideToMove();
+                const Rank rank = sq.rank();
+                const File file = sq.file();
+                // use bitwise operators, there is a lot of unpredictable branches but in
+                // total the result is quite predictable
+                if (
+                    (file == epSquare.file())
+                    && (
+                    ((rank == rank4) & (sideToMove == Color::Black))
+                        | ((rank == rank5) & (sideToMove == Color::White))
+                        )
+                    )
+                {
+                    return 12;
+                }
+                else
+                {
+                    return static_cast<std::uint8_t>(ordinal(piece));
+                }
+            }
+        }
+
+        [[nodiscard]] FORCEINLINE constexpr std::uint8_t compressRook(const Position& position, Square sq, Piece piece)
+        {
+            const CastlingRights castlingRights = position.castlingRights();
+            const Color color = piece.color();
+
+            if (color == Color::White
+                && (
+                (sq == a1 && contains(castlingRights, CastlingRights::WhiteQueenSide))
+                    || (sq == h1 && contains(castlingRights, CastlingRights::WhiteKingSide))
+                    )
+                )
+            {
+                return 13;
+            }
+            else if (
+                color == Color::Black
+                && (
+                (sq == a8 && contains(castlingRights, CastlingRights::BlackQueenSide))
+                    || (sq == h8 && contains(castlingRights, CastlingRights::BlackKingSide))
+                    )
+                )
+            {
+                return 14;
+            }
+            else
+            {
+                return static_cast<std::uint8_t>(ordinal(piece));
+            }
+        }
+
+        [[nodiscard]] FORCEINLINE constexpr std::uint8_t compressKing(const Position& position, Square sq, Piece piece)
+        {
+            const Color color = piece.color();
+            const Color sideToMove = position.sideToMove();
+
+            if (color == Color::White)
+            {
+                return 10;
+            }
+            else if (sideToMove == Color::White)
+            {
+                return 11;
+            }
+            else
+            {
+                return 15;
+            }
+        }
+    }
+
+    namespace detail::lookup
+    {
+        static constexpr EnumArray<PieceType, std::uint8_t(*)(const Position&, Square, Piece)> pieceCompressorFunc = []() {
+            EnumArray<PieceType, std::uint8_t(*)(const Position&, Square, Piece)> pieceCompressorFunc{};
+
+            pieceCompressorFunc[PieceType::Knight] = detail::compressOrdinaryPiece;
+            pieceCompressorFunc[PieceType::Bishop] = detail::compressOrdinaryPiece;
+            pieceCompressorFunc[PieceType::Queen] = detail::compressOrdinaryPiece;
+
+            pieceCompressorFunc[PieceType::Pawn] = detail::compressPawn;
+            pieceCompressorFunc[PieceType::Rook] = detail::compressRook;
+            pieceCompressorFunc[PieceType::King] = detail::compressKing;
+
+            pieceCompressorFunc[PieceType::None] = [](const Position&, Square, Piece) -> std::uint8_t { /* should never happen */ return 0; };
+
+            return pieceCompressorFunc;
+        }();
+    }
+
+    [[nodiscard]] inline CompressedPosition Position::compress() const
+    {
+        auto compressPiece = [this](Square sq, Piece piece) -> std::uint8_t {
+            if (piece.type() == PieceType::Pawn) // it's likely to be a pawn
+            {
+                return detail::compressPawn(*this, sq, piece);
+            }
+            else
+            {
+                return detail::lookup::pieceCompressorFunc[piece.type()](*this, sq, piece);
+            }
+        };
+
+        const Bitboard occ = piecesBB();
+
+        CompressedPosition compressed;
+        compressed.m_occupied = occ;
+
+        auto it = occ.begin();
+        auto end = occ.end();
+        for (int i = 0;; ++i)
+        {
+            if (it == end) break;
+            compressed.m_packedState[i] = compressPiece(*it, pieceAt(*it));
+            ++it;
+
+            if (it == end) break;
+            compressed.m_packedState[i] |= compressPiece(*it, pieceAt(*it)) << 4;
+            ++it;
+        }
+
+        return compressed;
+    }
+
+    [[nodiscard]] inline Position CompressedPosition::decompress() const
+    {
+        Position pos;
+        pos.setCastlingRights(CastlingRights::None);
+
+        auto decompressPiece = [&pos](Square sq, std::uint8_t nibble) {
+            switch (nibble)
+            {
+            case 0:
+            case 1:
+            case 2:
+            case 3:
+            case 4:
+            case 5:
+            case 6:
+            case 7:
+            case 8:
+            case 9:
+            case 10:
+            case 11:
+            {
+                pos.place(fromOrdinal<Piece>(nibble), sq);
+                return;
+            }
+
+            case 12:
+            {
+                const Rank rank = sq.rank();
+                if (rank == rank4)
+                {
+                    pos.place(whitePawn, sq);
+                    pos.setEpSquareUnchecked(sq + Offset{ 0, -1 });
+                }
+                else // (rank == rank5)
+                {
+                    pos.place(blackPawn, sq);
+                    pos.setEpSquareUnchecked(sq + Offset{ 0, 1 });
+                }
+                return;
+            }
+
+            case 13:
+            {
+                pos.place(whiteRook, sq);
+                if (sq == a1)
+                {
+                    pos.addCastlingRights(CastlingRights::WhiteQueenSide);
+                }
+                else // (sq == H1)
+                {
+                    pos.addCastlingRights(CastlingRights::WhiteKingSide);
+                }
+                return;
+            }
+
+            case 14:
+            {
+                pos.place(blackRook, sq);
+                if (sq == a8)
+                {
+                    pos.addCastlingRights(CastlingRights::BlackQueenSide);
+                }
+                else // (sq == H8)
+                {
+                    pos.addCastlingRights(CastlingRights::BlackKingSide);
+                }
+                return;
+            }
+
+            case 15:
+            {
+                pos.place(blackKing, sq);
+                pos.setSideToMove(Color::Black);
+                return;
+            }
+
+            }
+
+            return;
+        };
+
+        const Bitboard occ = m_occupied;
+
+        auto it = occ.begin();
+        auto end = occ.end();
+        for (int i = 0;; ++i)
+        {
+            if (it == end) break;
+            decompressPiece(*it, m_packedState[i] & 0xF);
+            ++it;
+
+            if (it == end) break;
+            decompressPiece(*it, m_packedState[i] >> 4);
+            ++it;
+        }
+
+        return pos;
+    }
+
+
+    [[nodiscard]] bool Board::isSquareAttacked(Square sq, Color attackerColor) const
+    {
+        assert(sq.isOk());
+
+        const Bitboard occupied = piecesBB();
+        const Bitboard bishops = piecesBB(Piece(PieceType::Bishop, attackerColor));
+        const Bitboard rooks = piecesBB(Piece(PieceType::Rook, attackerColor));
+        const Bitboard queens = piecesBB(Piece(PieceType::Queen, attackerColor));
+
+        const Bitboard allSliders = (bishops | rooks | queens);
+        if ((bb::pseudoAttacks<PieceType::Queen>(sq) & allSliders).any())
+        {
+            if (bb::isAttackedBySlider(
+                sq,
+                bishops,
+                rooks,
+                queens,
+                occupied
+            ))
+            {
+                return true;
+            }
+        }
+
+        const Bitboard king = piecesBB(Piece(PieceType::King, attackerColor));
+        if ((bb::pseudoAttacks<PieceType::King>(sq) & king).any())
+        {
+            return true;
+        }
+
+        const Bitboard knights = piecesBB(Piece(PieceType::Knight, attackerColor));
+        if ((bb::pseudoAttacks<PieceType::Knight>(sq) & knights).any())
+        {
+            return true;
+        }
+
+        const Bitboard pawns = piecesBB(Piece(PieceType::Pawn, attackerColor));
+        const Bitboard pawnAttacks = bb::pawnAttacks(pawns, attackerColor);
+
+        return pawnAttacks.isSet(sq);
+    }
+
+    [[nodiscard]] bool Board::isSquareAttackedAfterMove(Move move, Square sq, Color attackerColor) const
+    {
+        const Bitboard occupiedChange = Bitboard::square(move.from) | move.to;
+
+        Bitboard occupied = (piecesBB() ^ move.from) | move.to;
+
+        Bitboard bishops = piecesBB(Piece(PieceType::Bishop, attackerColor));
+        Bitboard rooks = piecesBB(Piece(PieceType::Rook, attackerColor));
+        Bitboard queens = piecesBB(Piece(PieceType::Queen, attackerColor));
+        Bitboard king = piecesBB(Piece(PieceType::King, attackerColor));
+        Bitboard knights = piecesBB(Piece(PieceType::Knight, attackerColor));
+        Bitboard pawns = piecesBB(Piece(PieceType::Pawn, attackerColor));
+
+        if (move.type == MoveType::EnPassant)
+        {
+            const Square capturedPawnSq(move.to.file(), move.from.rank());
+            occupied ^= capturedPawnSq;
+            pawns ^= capturedPawnSq;
+        }
+        else if (pieceAt(move.to) != Piece::none())
+        {
+            const Bitboard notCaptured = ~Bitboard::square(move.to);
+            bishops &= notCaptured;
+            rooks &= notCaptured;
+            queens &= notCaptured;
+            knights &= notCaptured;
+            pawns &= notCaptured;
+        }
+
+        // Potential attackers may have moved.
+        const Piece movedPiece = pieceAt(move.from);
+        if (movedPiece.color() == attackerColor)
+        {
+            switch (movedPiece.type())
+            {
+            case PieceType::Pawn:
+                pawns ^= occupiedChange;
+                break;
+            case PieceType::Knight:
+                knights ^= occupiedChange;
+                break;
+            case PieceType::Bishop:
+                bishops ^= occupiedChange;
+                break;
+            case PieceType::Rook:
+                rooks ^= occupiedChange;
+                break;
+            case PieceType::Queen:
+                queens ^= occupiedChange;
+                break;
+            case PieceType::King:
+            {
+                if (move.type == MoveType::Castle)
+                {
+                    const CastleType castleType = CastlingTraits::moveCastlingType(move);
+
+                    king ^= move.from;
+                    king ^= CastlingTraits::kingDestination[attackerColor][castleType];
+                    rooks ^= move.to;
+                    rooks ^= CastlingTraits::rookDestination[attackerColor][castleType];
+
+                    break;
+                }
+                else
+                {
+                    king ^= occupiedChange;
+                }
+            }
+            }
+        }
+
+        // If it's a castling move then the change in square occupation
+        // cannot have an effect because otherwise there would be
+        // a slider attacker attacking the castling king.
+        // (It could have an effect in chess960 if the slider
+        // attacker was behind the rook involved in castling,
+        // but we don't care about chess960.)
+
+        const Bitboard allSliders = (bishops | rooks | queens);
+        if ((bb::pseudoAttacks<PieceType::Queen>(sq) & allSliders).any())
+        {
+            if (bb::isAttackedBySlider(
+                sq,
+                bishops,
+                rooks,
+                queens,
+                occupied
+            ))
+            {
+                return true;
+            }
+        }
+
+        if ((bb::pseudoAttacks<PieceType::King>(sq) & king).any())
+        {
+            return true;
+        }
+
+        if ((bb::pseudoAttacks<PieceType::Knight>(sq) & knights).any())
+        {
+            return true;
+        }
+
+        const Bitboard pawnAttacks = bb::pawnAttacks(pawns, attackerColor);
+
+        return pawnAttacks.isSet(sq);
+    }
+
+    [[nodiscard]] bool Board::createsDiscoveredAttackOnOwnKing(Move move) const
+    {
+        Bitboard occupied = (piecesBB() ^ move.from) | move.to;
+
+        const Piece movedPiece = pieceAt(move.from);
+        const Color kingColor = movedPiece.color();
+        const Color attackerColor = !kingColor;
+        const Square ksq = kingSquare(kingColor);
+
+        Bitboard bishops = piecesBB(Piece(PieceType::Bishop, attackerColor));
+        Bitboard rooks = piecesBB(Piece(PieceType::Rook, attackerColor));
+        Bitboard queens = piecesBB(Piece(PieceType::Queen, attackerColor));
+
+        if (move.type == MoveType::EnPassant)
+        {
+            const Square capturedPawnSq(move.to.file(), move.from.rank());
+            occupied ^= capturedPawnSq;
+        }
+        else if (pieceAt(move.to) != Piece::none())
+        {
+            const Bitboard notCaptured = ~Bitboard::square(move.to);
+            bishops &= notCaptured;
+            rooks &= notCaptured;
+            queens &= notCaptured;
+        }
+
+        const Bitboard allSliders = (bishops | rooks | queens);
+        if ((bb::pseudoAttacks<PieceType::Queen>(ksq) & allSliders).any())
+        {
+            if (bb::isAttackedBySlider(
+                ksq,
+                bishops,
+                rooks,
+                queens,
+                occupied
+            ))
+            {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    [[nodiscard]] bool Board::isPieceAttacked(Square sq) const
+    {
+        const Piece piece = pieceAt(sq);
+
+        if (piece == Piece::none())
+        {
+            return false;
+        }
+
+        return isSquareAttacked(sq, !piece.color());
+    }
+
+    [[nodiscard]] bool Board::isPieceAttackedAfterMove(Move move, Square sq) const
+    {
+        const Piece piece = pieceAt(sq);
+
+        if (piece == Piece::none())
+        {
+            return false;
+        }
+
+        if (sq == move.from)
+        {
+            // We moved the piece we're interested in.
+            // For every move the piece ends up on the move.to except
+            // for the case of castling moves.
+            // But we know pseudo legal castling moves
+            // are already legal, so the king cannot be in check after.
+            if (move.type == MoveType::Castle)
+            {
+                return false;
+            }
+
+            // So update the square we're interested in.
+            sq = move.to;
+        }
+
+        return isSquareAttackedAfterMove(move, sq, !piece.color());
+    }
+
+    [[nodiscard]] bool Board::isOwnKingAttackedAfterMove(Move move) const
+    {
+        if (move.type == MoveType::Castle)
+        {
+            // Pseudo legal castling moves are already legal.
+            // This is ensured by the move generator.
+            return false;
+        }
+
+        const Piece movedPiece = pieceAt(move.from);
+
+        return isPieceAttackedAfterMove(move, kingSquare(movedPiece.color()));
+    }
+
+    [[nodiscard]] Bitboard Board::attacks(Square sq) const
+    {
+        const Piece piece = pieceAt(sq);
+        if (piece == Piece::none())
+        {
+            return Bitboard::none();
+        }
+
+        if (piece.type() == PieceType::Pawn)
+        {
+            return bb::pawnAttacks(Bitboard::square(sq), piece.color());
+        }
+        else
+        {
+            return bb::attacks(piece.type(), sq, piecesBB());
+        }
+    }
+
+    [[nodiscard]] Bitboard Board::attackers(Square sq, Color attackerColor) const
+    {
+        // En-passant square is not included.
+
+        Bitboard allAttackers = Bitboard::none();
+
+        const Bitboard occupied = piecesBB();
+
+        const Bitboard bishops = piecesBB(Piece(PieceType::Bishop, attackerColor));
+        const Bitboard rooks = piecesBB(Piece(PieceType::Rook, attackerColor));
+        const Bitboard queens = piecesBB(Piece(PieceType::Queen, attackerColor));
+
+        const Bitboard bishopLikePieces = (bishops | queens);
+        const Bitboard bishopAttacks = bb::attacks<PieceType::Bishop>(sq, occupied);
+        allAttackers |= bishopAttacks & bishopLikePieces;
+
+        const Bitboard rookLikePieces = (rooks | queens);
+        const Bitboard rookAttacks = bb::attacks<PieceType::Rook>(sq, occupied);
+        allAttackers |= rookAttacks & rookLikePieces;
+
+        const Bitboard king = piecesBB(Piece(PieceType::King, attackerColor));
+        allAttackers |= bb::pseudoAttacks<PieceType::King>(sq) & king;
+
+        const Bitboard knights = piecesBB(Piece(PieceType::Knight, attackerColor));
+        allAttackers |= bb::pseudoAttacks<PieceType::Knight>(sq) & knights;
+
+        const Bitboard pawns = piecesBB(Piece(PieceType::Pawn, attackerColor));
+        allAttackers |= bb::pawnAttacks(Bitboard::square(sq), !attackerColor) & pawns;
+
+        return allAttackers;
+    }
+
+    const Piece* Board::piecesRaw() const
+    {
+        return m_pieces.data();
+    }
+
+    namespace detail::lookup
+    {
+        static constexpr EnumArray<Piece, char> fenPiece = []() {
+            EnumArray<Piece, char> fenPiece{};
+
+            fenPiece[whitePawn] = 'P';
+            fenPiece[blackPawn] = 'p';
+            fenPiece[whiteKnight] = 'N';
+            fenPiece[blackKnight] = 'n';
+            fenPiece[whiteBishop] = 'B';
+            fenPiece[blackBishop] = 'b';
+            fenPiece[whiteRook] = 'R';
+            fenPiece[blackRook] = 'r';
+            fenPiece[whiteQueen] = 'Q';
+            fenPiece[blackQueen] = 'q';
+            fenPiece[whiteKing] = 'K';
+            fenPiece[blackKing] = 'k';
+            fenPiece[Piece::none()] = 'X';
+
+            return fenPiece;
+        }();
+    }
+
+    [[nodiscard]] std::string Board::fen() const
+    {
+        std::string fen;
+        fen.reserve(96); // longest fen is probably in range of around 88
+
+        Rank rank = rank8;
+        File file = fileA;
+        std::uint8_t emptyCounter = 0;
+
+        for (;;)
+        {
+            const Square sq(file, rank);
+            const Piece piece = m_pieces[sq];
+
+            if (piece == Piece::none())
+            {
+                ++emptyCounter;
+            }
+            else
+            {
+                if (emptyCounter != 0)
+                {
+                    fen.push_back(static_cast<char>(emptyCounter) + '0');
+                    emptyCounter = 0;
+                }
+
+                fen.push_back(detail::lookup::fenPiece[piece]);
+            }
+
+            ++file;
+            if (file > fileH)
+            {
+                file = fileA;
+                --rank;
+
+                if (emptyCounter != 0)
+                {
+                    fen.push_back(static_cast<char>(emptyCounter) + '0');
+                    emptyCounter = 0;
+                }
+
+                if (rank < rank1)
+                {
+                    break;
+                }
+                fen.push_back('/');
+            }
+        }
+
+        return fen;
+    }
+
+    MoveLegalityChecker::MoveLegalityChecker(const Position& position) :
+        m_position(&position),
+        m_checkers(position.checkers()),
+        m_ourBlockersForKing(
+            position.blockersForKing(position.sideToMove())
+            & position.piecesBB(position.sideToMove())
+        ),
+        m_ksq(position.kingSquare(position.sideToMove()))
+    {
+        if (m_checkers.exactlyOne())
+        {
+            const Bitboard knightCheckers = m_checkers & bb::pseudoAttacks<PieceType::Knight>(m_ksq);
+            if (knightCheckers.any())
+            {
+                // We're checked by a knight, we have to remove it or move the king.
+                m_potentialCheckRemovals = knightCheckers;
+            }
+            else
+            {
+                // If we're not checked by a knight we can block it.
+                m_potentialCheckRemovals = bb::between(m_ksq, m_checkers.first()) | m_checkers;
+            }
+        }
+        else
+        {
+            // Double check, king has to move.
+            m_potentialCheckRemovals = Bitboard::none();
+        }
+    }
+
+    [[nodiscard]] bool MoveLegalityChecker::isPseudoLegalMoveLegal(const Move& move) const
+    {
+        const Piece movedPiece = m_position->pieceAt(move.from);
+
+        if (m_checkers.any())
+        {
+            if (move.from == m_ksq || move.type == MoveType::EnPassant)
+            {
+                return m_position->isPseudoLegalMoveLegal(move);
+            }
+            else
+            {
+                // This means there's only one check and we either
+                // blocked it or removed the piece that attacked
+                // our king. So the only threat is if it's a discovered check.
+                return
+                    m_potentialCheckRemovals.isSet(move.to)
+                    && !m_ourBlockersForKing.isSet(move.from);
+            }
+        }
+        else
+        {
+            if (move.from == m_ksq)
+            {
+                return m_position->isPseudoLegalMoveLegal(move);
+            }
+            else if (move.type == MoveType::EnPassant)
+            {
+                return !m_position->createsDiscoveredAttackOnOwnKing(move);
+            }
+            else if (m_ourBlockersForKing.isSet(move.from))
+            {
+                // If it was a blocker it may have only moved in line with our king.
+                // Otherwise it's a discovered check.
+                return bb::line(m_ksq, move.from).isSet(move.to);
+            }
+            else
+            {
+                return true;
+            }
+        }
+    }
+
+    void Position::set(std::string_view fen)
+    {
+        (void)trySet(fen);
+    }
+
+    // Returns false if the fen was not valid
+    // If the returned value was false the position
+    // is in unspecified state.
+    [[nodiscard]] bool Position::trySet(std::string_view fen)
+    {
+        // Lazily splits by ' '. Returns empty string views if at the end.
+        auto nextPart = [fen, start = std::size_t{ 0 }]() mutable {
+            std::size_t end = fen.find(' ', start);
+            if (end == std::string::npos)
+            {
+                std::string_view substr = fen.substr(start);
+                start = fen.size();
+                return substr;
+            }
+            else
+            {
+                std::string_view substr = fen.substr(start, end - start);
+                start = end + 1; // to skip whitespace
+                return substr;
+            }
+        };
+
+        if (!BaseType::trySet(nextPart())) return false;
+
+        {
+            const auto side = nextPart();
+            if (side == std::string_view("w")) m_sideToMove = Color::White;
+            else if (side == std::string_view("b")) m_sideToMove = Color::Black;
+            else return false;
+
+            if (isSquareAttacked(kingSquare(!m_sideToMove), m_sideToMove)) return false;
+        }
+
+        {
+            const auto castlingRights = nextPart();
+            auto castlingRightsOpt = parser_bits::tryParseCastlingRights(castlingRights);
+            if (!castlingRightsOpt.has_value())
+            {
+                return false;
+            }
+            else
+            {
+                m_castlingRights = *castlingRightsOpt;
+            }
+        }
+
+        {
+            const auto epSquare = nextPart();
+            auto epSquareOpt = parser_bits::tryParseEpSquare(epSquare);
+            if (!epSquareOpt.has_value())
+            {
+                return false;
+            }
+            else
+            {
+                m_epSquare = *epSquareOpt;
+            }
+        }
+
+        {
+            const auto rule50 = nextPart();
+            if (!rule50.empty())
+            {
+                m_rule50Counter = std::stoi(rule50.data());
+            }
+            else
+            {
+                m_rule50Counter = 0;
+            }
+        }
+
+        {
+            const auto halfMove = nextPart();
+            if (!halfMove.empty())
+            {
+                m_ply = std::stoi(halfMove.data()) * 2 - (m_sideToMove == Color::White);
+            }
+            else
+            {
+                m_ply = 0;
+            }
+        }
+
+        nullifyEpSquareIfNotPossible();
+
+        return true;
+    }
+
+    [[nodiscard]] Position Position::fromFen(std::string_view fen)
+    {
+        Position pos{};
+        pos.set(fen);
+        return pos;
+    }
+
+    [[nodiscard]] std::optional<Position> Position::tryFromFen(std::string_view fen)
+    {
+        Position pos{};
+        if (pos.trySet(fen)) return pos;
+        else return {};
+    }
+
+    [[nodiscard]] Position Position::startPosition()
+    {
+        static const Position pos = fromFen("rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1");
+        return pos;
+    }
+
+    [[nodiscard]] std::string Position::fen() const
+    {
+        std::string fen = Board::fen();
+
+        fen += ' ';
+        fen += m_sideToMove == Color::White ? 'w' : 'b';
+
+        fen += ' ';
+        parser_bits::appendCastlingRightsToString(m_castlingRights, fen);
+
+        fen += ' ';
+        parser_bits::appendEpSquareToString(m_epSquare, fen);
+
+        fen += ' ';
+        fen += std::to_string(m_rule50Counter);
+
+        fen += ' ';
+        fen += std::to_string(halfMove());
+
+        return fen;
+    }
+
+    namespace detail::lookup
+    {
+        static constexpr EnumArray<Square, CastlingRights> preservedCastlingRights = []() {
+            EnumArray<Square, CastlingRights> preservedCastlingRights{};
+            for (CastlingRights& rights : preservedCastlingRights)
+            {
+                rights = ~CastlingRights::None;
+            }
+
+            preservedCastlingRights[e1] = ~CastlingRights::White;
+            preservedCastlingRights[e8] = ~CastlingRights::Black;
+
+            preservedCastlingRights[h1] = ~CastlingRights::WhiteKingSide;
+            preservedCastlingRights[a1] = ~CastlingRights::WhiteQueenSide;
+            preservedCastlingRights[h8] = ~CastlingRights::BlackKingSide;
+            preservedCastlingRights[a8] = ~CastlingRights::BlackQueenSide;
+
+            return preservedCastlingRights;
+        }();
+    }
+
+    ReverseMove Position::doMove(const Move& move)
+    {
+        assert(move.from.isOk() && move.to.isOk());
+
+        const PieceType movedPiece = pieceAt(move.from).type();
+
+        m_ply += 1;
+        m_rule50Counter += 1;
+
+        if (move.type != MoveType::Castle && (movedPiece == PieceType::Pawn || pieceAt(move.to) != Piece::none()))
+        {
+            m_rule50Counter = 0;
+        }
+
+        const Square oldEpSquare = m_epSquare;
+        const CastlingRights oldCastlingRights = m_castlingRights;
+        m_castlingRights &= detail::lookup::preservedCastlingRights[move.from];
+        m_castlingRights &= detail::lookup::preservedCastlingRights[move.to];
+
+        m_epSquare = Square::none();
+        // for double pushes move index differs by 16 or -16;
+        if((movedPiece == PieceType::Pawn) & ((ordinal(move.to) ^ ordinal(move.from)) == 16))
+        {
+            const Square potentialEpSquare = fromOrdinal<Square>((ordinal(move.to) + ordinal(move.from)) >> 1);
+            // Even though the move has not yet been made we can safely call
+            // this function and get the right result because the position of the
+            // pawn to be captured is not really relevant.
+            if (isEpPossible(potentialEpSquare, !m_sideToMove))
+            {
+                m_epSquare = potentialEpSquare;
+            }
+        }
+
+        const Piece captured = BaseType::doMove(move);
+        m_sideToMove = !m_sideToMove;
+        return { move, captured, oldEpSquare, oldCastlingRights };
+    }
+
+    [[nodiscard]] bool Position::isCheck() const
+    {
+        return BaseType::isSquareAttacked(kingSquare(m_sideToMove), !m_sideToMove);
+    }
+
+    [[nodiscard]] Bitboard Position::checkers() const
+    {
+        return BaseType::attackers(kingSquare(m_sideToMove), !m_sideToMove);
+    }
+
+    [[nodiscard]] bool Position::isCheckAfterMove(Move move) const
+    {
+        return BaseType::isSquareAttackedAfterMove(move, kingSquare(!m_sideToMove), m_sideToMove);
+    }
+
+    [[nodiscard]] Bitboard Position::blockersForKing(Color color) const
+    {
+        const Color attackerColor = !color;
+
+        const Bitboard occupied = piecesBB();
+
+        const Bitboard bishops = piecesBB(Piece(PieceType::Bishop, attackerColor));
+        const Bitboard rooks = piecesBB(Piece(PieceType::Rook, attackerColor));
+        const Bitboard queens = piecesBB(Piece(PieceType::Queen, attackerColor));
+
+        const Square ksq = kingSquare(color);
+
+        const Bitboard opponentBishopLikePieces = (bishops | queens);
+        const Bitboard bishopPseudoAttacks = bb::pseudoAttacks<PieceType::Bishop>(ksq);
+
+        const Bitboard opponentRookLikePieces = (rooks | queens);
+        const Bitboard rookPseudoAttacks = bb::pseudoAttacks<PieceType::Rook>(ksq);
+
+        const Bitboard xrayers =
+            (bishopPseudoAttacks & opponentBishopLikePieces)
+            | (rookPseudoAttacks & opponentRookLikePieces);
+
+        Bitboard allBlockers = Bitboard::none();
+
+        for (Square xrayer : xrayers)
+        {
+            const Bitboard blockers = bb::between(xrayer, ksq) & occupied;
+            if (blockers.exactlyOne())
+            {
+                allBlockers |= blockers;
+            }
+        }
+
+        return allBlockers;
+    }
+
+    [[nodiscard]] Position Position::afterMove(Move move) const
+    {
+        Position cpy(*this);
+        auto pc = cpy.doMove(move);
+
+        (void)pc;
+        //assert(cpy.beforeMove(move, pc) == *this); // this assert would result in infinite recursion
+
+        return cpy;
+    }
+
+    [[nodiscard]] FORCEINLINE bool Position::isEpPossible(Square epSquare, Color sideToMove) const
+    {
+        const Bitboard pawnsAttackingEpSquare =
+            bb::pawnAttacks(Bitboard::square(epSquare), !sideToMove)
+            & piecesBB(Piece(PieceType::Pawn, sideToMove));
+
+        if (!pawnsAttackingEpSquare.any())
+        {
+            return false;
+        }
+
+        return isEpPossibleColdPath(epSquare, pawnsAttackingEpSquare, sideToMove);
+    }
+
+    [[nodiscard]] NOINLINE bool Position::isEpPossibleColdPath(Square epSquare, Bitboard pawnsAttackingEpSquare, Color sideToMove) const
+    {
+        // only set m_epSquare when it matters, ie. when
+        // the opposite side can actually capture
+        for (Square sq : pawnsAttackingEpSquare)
+        {
+            // If we're here the previous move by other side
+            // was a double pawn move so our king is either not in check
+            // or is attacked only by the moved pawn - in which
+            // case it can be captured by our pawn if it doesn't
+            // create a discovered check on our king.
+            // So overall we only have to check whether our king
+            // ends up being uncovered to a slider attack.
+
+            const Square ksq = kingSquare(sideToMove);
+
+            const Bitboard bishops = piecesBB(Piece(PieceType::Bishop, !sideToMove));
+            const Bitboard rooks = piecesBB(Piece(PieceType::Rook, !sideToMove));
+            const Bitboard queens = piecesBB(Piece(PieceType::Queen, !sideToMove));
+
+            const Bitboard relevantAttackers = bishops | rooks | queens;
+            const Bitboard pseudoSliderAttacksFromKing = bb::pseudoAttacks<PieceType::Queen>(ksq);
+            if ((relevantAttackers & pseudoSliderAttacksFromKing).isEmpty())
+            {
+                // It's enough that one pawn can capture.
+                return true;
+            }
+
+            const Square capturedPawnSq(epSquare.file(), sq.rank());
+            const Bitboard occupied = ((piecesBB() ^ sq) | epSquare) ^ capturedPawnSq;
+
+            if (!bb::isAttackedBySlider(
+                ksq,
+                bishops,
+                rooks,
+                queens,
+                occupied
+            ))
+            {
+                // It's enough that one pawn can capture.
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    void Position::nullifyEpSquareIfNotPossible()
+    {
+        if (m_epSquare != Square::none() && !isEpPossible(m_epSquare, m_sideToMove))
+        {
+            m_epSquare = Square::none();
+        }
+    }
+
+    namespace uci
+    {
+        [[nodiscard]] std::string moveToUci(const Position& pos, const Move& move);
+        [[nodiscard]] Move uciToMove(const Position& pos, std::string_view sv);
+
+        [[nodiscard]] std::optional<Move> tryUciToMove(const Position& pos, std::string_view sv);
+
+        [[nodiscard]] std::string moveToUci(const Position& pos, const Move& move)
+        {
+            std::string s;
+
+            parser_bits::appendSquareToString(move.from, s);
+
+            if (move.type == MoveType::Castle)
+            {
+                const CastleType castleType = CastlingTraits::moveCastlingType(move);
+
+                const Square kingDestination = CastlingTraits::kingDestination[pos.sideToMove()][castleType];
+                parser_bits::appendSquareToString(kingDestination, s);
+            }
+            else
+            {
+                parser_bits::appendSquareToString(move.to, s);
+
+                if (move.type == MoveType::Promotion)
+                {
+                    // lowercase piece symbol
+                    s += EnumTraits<PieceType>::toChar(move.promotedPiece.type(), Color::Black);
+                }
+            }
+
+            return s;
+        }
+
+        [[nodiscard]] Move uciToMove(const Position& pos, std::string_view sv)
+        {
+            const Square from = parser_bits::parseSquare(sv.data());
+            const Square to = parser_bits::parseSquare(sv.data() + 2);
+
+            if (sv.size() == 5)
+            {
+                const PieceType promotedPieceType = *fromChar<PieceType>(sv[4]);
+                return Move::promotion(from, to, Piece(promotedPieceType, pos.sideToMove()));
+            }
+            else
+            {
+                if (
+                    pos.pieceAt(from).type() == PieceType::King
+                    && std::abs(from.file() - to.file()) > 1
+                    )
+                {
+                    // uci king destinations are on files C or G.
+                    const CastleType castleType =
+                        (to.file() == fileG)
+                        ? CastleType::Short
+                        : CastleType::Long;
+
+                    return Move::castle(castleType, pos.sideToMove());
+                }
+                else if (pos.epSquare() == to)
+                {
+                    return Move::enPassant(from, to);
+                }
+                else
+                {
+                    return Move::normal(from, to);
+                }
+            }
+        }
+
+        [[nodiscard]] std::optional<Move> tryUciToMove(const Position& pos, std::string_view sv)
+        {
+            if (sv.size() < 4 || sv.size() > 5)
+            {
+                return std::nullopt;
+            }
+
+            const auto from = parser_bits::tryParseSquare(sv.substr(0, 2));
+            const auto to = parser_bits::tryParseSquare(sv.substr(2, 2));
+
+            Move move{};
+
+            if (!from.has_value() || !to.has_value())
+            {
+                return std::nullopt;
+            }
+
+            if (sv.size() == 5)
+            {
+                const auto promotedPieceType = fromChar<PieceType>(sv[4]);
+                if (!promotedPieceType.has_value())
+                {
+                    return std::nullopt;
+                }
+
+                if (
+                    *promotedPieceType != PieceType::Knight
+                    && *promotedPieceType != PieceType::Bishop
+                    && *promotedPieceType != PieceType::Rook
+                    && *promotedPieceType != PieceType::Queen
+                    )
+                {
+                    return std::nullopt;
+                }
+
+                move = Move::promotion(*from, *to, Piece(*promotedPieceType, pos.sideToMove()));
+            }
+            else // sv.size() == 4
+            {
+
+                if (
+                    pos.pieceAt(*from).type() == PieceType::King
+                    && std::abs(from->file() - to->file()) > 1
+                    )
+                {
+                    // uci king destinations are on files C or G.
+
+                    if (pos.sideToMove() == Color::White)
+                    {
+                        if (*from != e1)
+                        {
+                            return std::nullopt;
+                        }
+
+                        if (*to != c1 && *to != g1)
+                        {
+                            return std::nullopt;
+                        }
+                    }
+                    else
+                    {
+                        if (*from != e8)
+                        {
+                            return std::nullopt;
+                        }
+
+                        if (*to != c8 && *to != g8)
+                        {
+                            return std::nullopt;
+                        }
+                    }
+
+                    const CastleType castleType =
+                        (to->file() == fileG)
+                        ? CastleType::Short
+                        : CastleType::Long;
+
+                    move = Move::castle(castleType, pos.sideToMove());
+                }
+                else if (to == pos.epSquare())
+                {
+                    move = Move::enPassant(*from, *to);
+                }
+                else
+                {
+                    move = Move::normal(*from, *to);
+                }
+            }
+
+            if (!pos.isMoveLegal(move))
+            {
+                return std::nullopt;
+            }
+
+            return move;
+        }
+    }
+}
+
+namespace binpack
+{
+    constexpr std::size_t KiB = 1024;
+    constexpr std::size_t MiB = (1024*KiB);
+    constexpr std::size_t GiB = (1024*MiB);
+
+    constexpr std::size_t suggestedChunkSize = MiB;
+    constexpr std::size_t maxMovelistSize = 10*KiB; // a safe upper bound
+    constexpr std::size_t maxChunkSize = 100*MiB; // to prevent malformed files from causing huge allocations
+
+    using namespace std::literals;
+
+    namespace nodchip
+    {
+        // This namespace contains modified code from https://github.com/nodchip/Stockfish
+        // which is released under GPL v3 license https://www.gnu.org/licenses/gpl-3.0.html
+
+        using namespace std;
+
+        struct StockfishMove
+        {
+            [[nodiscard]] static StockfishMove fromMove(chess::Move move)
+            {
+                StockfishMove sfm;
+
+                sfm.m_raw = 0;
+
+                unsigned moveFlag = 0;
+                if (move.type == chess::MoveType::Promotion) moveFlag = 1;
+                else if (move.type == chess::MoveType::EnPassant) moveFlag = 2;
+                else if (move.type == chess::MoveType::Castle) moveFlag = 3;
+
+                unsigned promotionIndex = 0;
+                if (move.type == chess::MoveType::Promotion)
+                {
+                    promotionIndex = static_cast<int>(move.promotedPiece.type()) - static_cast<int>(chess::PieceType::Knight);
+                }
+
+                sfm.m_raw |= static_cast<std::uint16_t>(moveFlag);
+                sfm.m_raw <<= 2;
+                sfm.m_raw |= static_cast<std::uint16_t>(promotionIndex);
+                sfm.m_raw <<= 6;
+                sfm.m_raw |= static_cast<int>(move.from);
+                sfm.m_raw <<= 6;
+                sfm.m_raw |= static_cast<int>(move.to);
+
+                return sfm;
+            }
+
+            [[nodiscard]] chess::Move toMove() const
+            {
+                const chess::Square to = static_cast<chess::Square>((m_raw & (0b111111 << 0) >> 0));
+                const chess::Square from = static_cast<chess::Square>((m_raw & (0b111111 << 6)) >> 6);
+
+                const unsigned promotionIndex = (m_raw & (0b11 << 12)) >> 12;
+                const chess::PieceType promotionType = static_cast<chess::PieceType>(static_cast<int>(chess::PieceType::Knight) + promotionIndex);
+
+                const unsigned moveFlag = (m_raw & (0b11 << 14)) >> 14;
+                chess::MoveType type = chess::MoveType::Normal;
+                if (moveFlag == 1) type = chess::MoveType::Promotion;
+                else if (moveFlag == 2) type = chess::MoveType::EnPassant;
+                else if (moveFlag == 3) type = chess::MoveType::Castle;
+
+                if (type == chess::MoveType::Promotion)
+                {
+                    const chess::Color stm = to.rank() == chess::rank8 ? chess::Color::White : chess::Color::Black;
+                    return chess::Move{from, to, type, chess::Piece(promotionType, stm)};
+                }
+
+                return chess::Move{from, to, type};
+            }
+
+        private:
+            std::uint16_t m_raw;
+        };
+        static_assert(sizeof(StockfishMove) == sizeof(std::uint16_t));
+
+        struct PackedSfen
+        {
+            uint8_t data[32];
+        };
+
+        struct PackedSfenValue
+        {
+            // phase
+            PackedSfen sfen;
+
+            // Evaluation value returned from Learner::search()
+            int16_t score;
+
+            // PV first move
+            // Used when finding the match rate with the teacher
+            StockfishMove move;
+
+            // Trouble of the phase from the initial phase.
+            uint16_t gamePly;
+
+            // 1 if the player on this side ultimately wins the game. -1 if you are losing.
+            // 0 if a draw is reached.
+            // The draw is in the teacher position generation command gensfen,
+            // Only write if LEARN_GENSFEN_DRAW_RESULT is enabled.
+            int8_t game_result;
+
+            // When exchanging the file that wrote the teacher aspect with other people
+            //Because this structure size is not fixed, pad it so that it is 40 bytes in any environment.
+            uint8_t padding;
+
+            // 32 + 2 + 2 + 2 + 1 + 1 = 40bytes
+        };
+        static_assert(sizeof(PackedSfenValue) == 40);
+        // Class that handles bitstream
+
+        // useful when doing aspect encoding
+        struct BitStream
+        {
+            // Set the memory to store the data in advance.
+            // Assume that memory is cleared to 0.
+            void  set_data(uint8_t* data_) { data = data_; reset(); }
+
+            // Get the pointer passed in set_data().
+            uint8_t* get_data() const { return data; }
+
+            // Get the cursor.
+            int get_cursor() const { return bit_cursor; }
+
+            // reset the cursor
+            void reset() { bit_cursor = 0; }
+
+            // Write 1bit to the stream.
+            // If b is non-zero, write out 1. If 0, write 0.
+            void write_one_bit(int b)
+            {
+                if (b)
+                    data[bit_cursor / 8] |= 1 << (bit_cursor & 7);
+
+                ++bit_cursor;
+            }
+
+            // Get 1 bit from the stream.
+            int read_one_bit()
+            {
+                int b = (data[bit_cursor / 8] >> (bit_cursor & 7)) & 1;
+                ++bit_cursor;
+
+                return b;
+            }
+
+            // write n bits of data
+            // Data shall be written out from the lower order of d.
+            void write_n_bit(int d, int n)
+            {
+                for (int i = 0; i <n; ++i)
+                    write_one_bit(d & (1 << i));
+            }
+
+            // read n bits of data
+            // Reverse conversion of write_n_bit().
+            int read_n_bit(int n)
+            {
+                int result = 0;
+                for (int i = 0; i < n; ++i)
+                    result |= read_one_bit() ? (1 << i) : 0;
+
+                return result;
+            }
+
+        private:
+            // Next bit position to read/write.
+            int bit_cursor;
+
+            // data entity
+            uint8_t* data;
+        };
+
+
+        // Huffman coding
+        // * is simplified from mini encoding to make conversion easier.
+        //
+        // 1 box on the board (other than NO_PIECE) = 2 to 6 bits (+ 1-bit flag + 1-bit forward and backward)
+        // 1 piece of hand piece = 1-5bit (+ 1-bit flag + 1bit ahead and behind)
+        //
+        // empty xxxxx0 + 0 (none)
+        // step xxxx01 + 2 xxxx0 + 2
+        // incense xx0011 + 2 xx001 + 2
+        // Katsura xx1011 + 2 xx101 + 2
+        // silver xx0111 + 2 xx011 + 2
+        // Gold x01111 + 1 x0111 + 1 // Gold is valid and has no flags.
+        // corner 011111 + 2 01111 + 2
+        // Fly 111111 + 2 11111 + 2
+        //
+        // Assuming all pieces are on the board,
+        // Sky 81-40 pieces = 41 boxes = 41bit
+        // Walk 4bit*18 pieces = 72bit
+        // Incense 6bit*4 pieces = 24bit
+        // Katsura 6bit*4 pieces = 24bit
+        // Silver 6bit*4 pieces = 24bit
+        // Gold 6bit* 4 pieces = 24bit
+        // corner 8bit* 2 pieces = 16bit
+        // Fly 8bit* 2 pieces = 16bit
+        // -------
+        // 241bit + 1bit (turn) + 7bit × 2 (King's position after) = 256bit
+        //
+        // When the piece on the board moves to the hand piece, the piece on the board becomes empty, so the box on the board can be expressed with 1 bit,
+        // Since the hand piece can be expressed by 1 bit less than the piece on the board, the total number of bits does not change in the end.
+        // Therefore, in this expression, any aspect can be expressed by this bit number.
+        // It is a hand piece and no flag is required, but if you include this, the bit number of the piece on the board will be -1
+        // Since the total number of bits can be fixed, we will include this as well.
+
+        // Huffman Encoding
+        //
+        // Empty  xxxxxxx0
+        // Pawn   xxxxx001 + 1 bit (Side to move)
+        // Knight xxxxx011 + 1 bit (Side to move)
+        // Bishop xxxxx101 + 1 bit (Side to move)
+        // Rook   xxxxx111 + 1 bit (Side to move)
+
+        struct HuffmanedPiece
+        {
+            int code; // how it will be coded
+            int bits; // How many bits do you have
+        };
+
+        // NOTE: Order adjusted for this library because originally NO_PIECE had index 0
+        constexpr HuffmanedPiece huffman_table[] =
+        {
+            {0b0001,4}, // PAWN     1
+            {0b0011,4}, // KNIGHT   3
+            {0b0101,4}, // BISHOP   5
+            {0b0111,4}, // ROOK     7
+            {0b1001,4}, // QUEEN    9
+            {-1,-1},    // KING - unused
+            {0b0000,1}, // NO_PIECE 0
+        };
+
+        // Class for compressing/decompressing sfen
+        // sfen can be packed to 256bit (32bytes) by Huffman coding.
+        // This is proven by mini. The above is Huffman coding.
+        //
+        // Internal format = 1-bit turn + 7-bit king position *2 + piece on board (Huffman coding) + hand piece (Huffman coding)
+        // Side to move (White = 0, Black = 1) (1bit)
+        // White King Position (6 bits)
+        // Black King Position (6 bits)
+        // Huffman Encoding of the board
+        // Castling availability (1 bit x 4)
+        // En passant square (1 or 1 + 6 bits)
+        // Rule 50 (6 bits)
+        // Game play (8 bits)
+        //
+        // TODO(someone): Rename SFEN to FEN.
+        //
+        struct SfenPacker
+        {
+            // Pack sfen and store in data[32].
+            void pack(const chess::Position& pos)
+            {
+                memset(data, 0, 32 /* 256bit */);
+                stream.set_data(data);
+
+                // turn
+                // Side to move.
+                stream.write_one_bit((int)(pos.sideToMove()));
+
+                // 7-bit positions for leading and trailing balls
+                // White king and black king, 6 bits for each.
+                stream.write_n_bit(static_cast<int>(pos.kingSquare(chess::Color::White)), 6);
+                stream.write_n_bit(static_cast<int>(pos.kingSquare(chess::Color::Black)), 6);
+
+                // Write the pieces on the board other than the kings.
+                for (chess::Rank r = chess::rank8; r >= chess::rank1; --r)
+                {
+                    for (chess::File f = chess::fileA; f <= chess::fileH; ++f)
+                    {
+                        chess::Piece pc = pos.pieceAt(chess::Square(f, r));
+                        if (pc.type() == chess::PieceType::King)
+                            continue;
+                        write_board_piece_to_stream(pc);
+                    }
+                }
+
+                // TODO(someone): Support chess960.
+                auto cr = pos.castlingRights();
+                stream.write_one_bit(contains(cr, chess::CastlingRights::WhiteKingSide));
+                stream.write_one_bit(contains(cr, chess::CastlingRights::WhiteQueenSide));
+                stream.write_one_bit(contains(cr, chess::CastlingRights::BlackKingSide));
+                stream.write_one_bit(contains(cr, chess::CastlingRights::BlackQueenSide));
+
+                if (pos.epSquare() == chess::Square::none()) {
+                    stream.write_one_bit(0);
+                }
+                else {
+                    stream.write_one_bit(1);
+                    stream.write_n_bit(static_cast<int>(pos.epSquare()), 6);
+                }
+
+                stream.write_n_bit(pos.rule50Counter(), 6);
+
+                stream.write_n_bit(pos.halfMove(), 8);
+
+                assert(stream.get_cursor() <= 256);
+            }
+
+            // sfen packed by pack() (256bit = 32bytes)
+            // Or sfen to decode with unpack()
+            uint8_t *data; // uint8_t[32];
+
+            BitStream stream;
+
+            // Output the board pieces to stream.
+            void write_board_piece_to_stream(chess::Piece pc)
+            {
+                // piece type
+                chess::PieceType pr = pc.type();
+                auto c = huffman_table[static_cast<int>(pr)];
+                stream.write_n_bit(c.code, c.bits);
+
+                if (pc == chess::Piece::none())
+                    return;
+
+                // first and second flag
+                stream.write_one_bit(static_cast<int>(pc.color()));
+            }
+
+            // Read one board piece from stream
+            [[nodiscard]] chess::Piece read_board_piece_from_stream()
+            {
+                int pr = static_cast<int>(chess::PieceType::None);
+                int code = 0, bits = 0;
+                while (true)
+                {
+                    code |= stream.read_one_bit() << bits;
+                    ++bits;
+
+                    assert(bits <= 6);
+
+                    for (pr = static_cast<int>(chess::PieceType::Pawn); pr <= static_cast<int>(chess::PieceType::None); ++pr)
+                        if (huffman_table[pr].code == code
+                            && huffman_table[pr].bits == bits)
+                            goto Found;
+                }
+            Found:;
+                if (pr == static_cast<int>(chess::PieceType::None))
+                    return chess::Piece::none();
+
+                // first and second flag
+                chess::Color c = (chess::Color)stream.read_one_bit();
+
+                return chess::Piece(static_cast<chess::PieceType>(pr), c);
+            }
+        };
+
+
+        [[nodiscard]] chess::Position pos_from_packed_sfen(const PackedSfen& sfen)
+        {
+            SfenPacker packer;
+            auto& stream = packer.stream;
+            stream.set_data((uint8_t*)&sfen);
+
+            chess::Position pos{};
+
+            // Active color
+            pos.setSideToMove((chess::Color)stream.read_one_bit());
+
+            // First the position of the ball
+            pos.place(chess::Piece(chess::PieceType::King, chess::Color::White), static_cast<chess::Square>(stream.read_n_bit(6)));
+            pos.place(chess::Piece(chess::PieceType::King, chess::Color::Black), static_cast<chess::Square>(stream.read_n_bit(6)));
+
+            // Piece placement
+            for (chess::Rank r = chess::rank8; r >= chess::rank1; --r)
+            {
+                for (chess::File f = chess::fileA; f <= chess::fileH; ++f)
+                {
+                    auto sq = chess::Square(f, r);
+
+                    // it seems there are already balls
+                    chess::Piece pc;
+                    if (pos.pieceAt(sq).type() != chess::PieceType::King)
+                    {
+                        assert(pos.pieceAt(sq) == chess::Piece::none());
+                        pc = packer.read_board_piece_from_stream();
+                    }
+                    else
+                    {
+                        pc = pos.pieceAt(sq);
+                    }
+
+                    // There may be no pieces, so skip in that case.
+                    if (pc == chess::Piece::none())
+                        continue;
+
+                    if (pc.type() != chess::PieceType::King)
+                    {
+                        pos.place(pc, sq);
+                    }
+
+                    assert(stream.get_cursor() <= 256);
+                }
+            }
+
+            // Castling availability.
+            chess::CastlingRights cr = chess::CastlingRights::None;
+            if (stream.read_one_bit()) {
+                cr |= chess::CastlingRights::WhiteKingSide;
+            }
+            if (stream.read_one_bit()) {
+                cr |= chess::CastlingRights::WhiteQueenSide;
+            }
+            if (stream.read_one_bit()) {
+                cr |= chess::CastlingRights::BlackKingSide;
+            }
+            if (stream.read_one_bit()) {
+                cr |= chess::CastlingRights::BlackQueenSide;
+            }
+            pos.setCastlingRights(cr);
+
+            // En passant square. Ignore if no pawn capture is possible
+            if (stream.read_one_bit()) {
+                chess::Square ep_square = static_cast<chess::Square>(stream.read_n_bit(6));
+                pos.setEpSquare(ep_square);
+            }
+
+            // Halfmove clock
+            pos.setRule50Counter(stream.read_n_bit(6));
+
+            // Fullmove number
+            pos.setHalfMove(stream.read_n_bit(8));
+
+            assert(stream.get_cursor() <= 256);
+
+            return pos;
+        }
+    }
+
+    struct CompressedTrainingDataFile
+    {
+        struct Header
+        {
+            std::uint32_t chunkSize;
+        };
+
+        CompressedTrainingDataFile(std::string path, std::ios_base::openmode om = std::ios_base::app) :
+            m_path(std::move(path)),
+            m_file(m_path, std::ios_base::binary | std::ios_base::in | std::ios_base::out | om)
+        {
+        }
+
+        void append(const char* data, std::uint32_t size)
+        {
+            writeChunkHeader({size});
+            m_file.write(data, size);
+        }
+
+        [[nodiscard]] bool hasNextChunk()
+        {
+            m_file.peek();
+            return !m_file.eof();
+        }
+
+        [[nodiscard]] std::vector<unsigned char> readNextChunk()
+        {
+            auto size = readChunkHeader().chunkSize;
+            std::vector<unsigned char> data(size);
+            m_file.read(reinterpret_cast<char*>(data.data()), size);
+            return data;
+        }
+
+    private:
+        std::string m_path;
+        std::fstream m_file;
+
+        void writeChunkHeader(Header h)
+        {
+            unsigned char header[8];
+            header[0] = 'B';
+            header[1] = 'I';
+            header[2] = 'N';
+            header[3] = 'P';
+            header[4] = h.chunkSize;
+            header[5] = h.chunkSize >> 8;
+            header[6] = h.chunkSize >> 16;
+            header[7] = h.chunkSize >> 24;
+            m_file.write(reinterpret_cast<const char*>(header), 8);
+        }
+
+        [[nodiscard]] Header readChunkHeader()
+        {
+            unsigned char header[8];
+            m_file.read(reinterpret_cast<char*>(header), 8);
+            if (header[0] != 'B' || header[1] != 'I' || header[2] != 'N' || header[3] != 'P')
+            {
+                assert(false);
+                // throw std::runtime_error("Invalid binpack file or chunk.");
+            }
+
+            const std::uint32_t size =
+                header[4]
+                | (header[5] << 8)
+                | (header[6] << 16)
+                | (header[7] << 24);
+
+            if (size > maxChunkSize)
+            {
+                assert(false);
+                // throw std::runtime_error("Chunks size larger than supported. Malformed file?");
+            }
+
+            return { size };
+        }
+    };
+
+    [[nodiscard]] inline std::uint16_t signedToUnsigned(std::int16_t a)
+    {
+        std::uint16_t r;
+        std::memcpy(&r, &a, sizeof(std::uint16_t));
+        if (r & 0x8000)
+        {
+            r ^= 0x7FFF;
+        }
+        r = (r << 1) | (r >> 15);
+        return r;
+    }
+
+    [[nodiscard]] inline std::int16_t unsignedToSigned(std::uint16_t r)
+    {
+        std::int16_t a;
+        r = (r << 15) | (r >> 1);
+        if (r & 0x8000)
+        {
+            r ^= 0x7FFF;
+        }
+        std::memcpy(&a, &r, sizeof(std::uint16_t));
+        return a;
+    }
+
+    struct TrainingDataEntry
+    {
+        chess::Position pos;
+        chess::Move move;
+        std::int16_t score;
+        std::uint16_t ply;
+        std::int16_t result;
+    };
+
+    [[nodiscard]] inline TrainingDataEntry packedSfenValueToTrainingDataEntry(const nodchip::PackedSfenValue& psv)
+    {
+        TrainingDataEntry ret;
+
+        ret.pos = nodchip::pos_from_packed_sfen(psv.sfen);
+        ret.move = psv.move.toMove();
+        ret.score = psv.score;
+        ret.ply = psv.gamePly;
+        ret.result = psv.game_result;
+
+        return ret;
+    }
+
+    [[nodiscard]] inline nodchip::PackedSfenValue trainingDataEntryToPackedSfenValue(const TrainingDataEntry& plain)
+    {
+        nodchip::PackedSfenValue ret;
+
+        nodchip::SfenPacker sp;
+        sp.data = reinterpret_cast<uint8_t*>(&ret.sfen);
+        sp.pack(plain.pos);
+
+        ret.score = plain.score;
+        ret.move = nodchip::StockfishMove::fromMove(plain.move);
+        ret.gamePly = plain.ply;
+        ret.game_result = plain.result;
+        ret.padding = 0xff; // for consistency with the .bin format.
+
+        return ret;
+    }
+
+    [[nodiscard]] inline bool isContinuation(const TrainingDataEntry& lhs, const TrainingDataEntry& rhs)
+    {
+        return
+            lhs.result == -rhs.result
+            && lhs.ply + 1 == rhs.ply
+            && lhs.pos.afterMove(lhs.move) == rhs.pos;
+    }
+
+    struct PackedTrainingDataEntry
+    {
+        unsigned char bytes[32];
+    };
+
+    [[nodiscard]] inline std::size_t usedBitsSafe(std::size_t value)
+    {
+        if (value == 0) return 0;
+        return chess::util::usedBits(value - 1);
+    }
+
+    static constexpr std::size_t scoreVleBlockSize = 4;
+
+    struct PackedMoveScoreListReader
+    {
+        TrainingDataEntry entry;
+        std::uint16_t numPlies;
+        unsigned char* movetext;
+
+        PackedMoveScoreListReader(const TrainingDataEntry& entry, unsigned char* movetext, std::uint16_t numPlies) :
+            entry(entry),
+            movetext(movetext),
+            numPlies(numPlies),
+            m_lastScore(-entry.score)
+        {
+
+        }
+
+        [[nodiscard]] std::uint8_t extractBitsLE8(std::size_t count)
+        {
+            if (count == 0) return 0;
+
+            if (m_readBitsLeft == 0)
+            {
+                m_readOffset += 1;
+                m_readBitsLeft = 8;
+            }
+
+            const std::uint8_t byte = movetext[m_readOffset] << (8 - m_readBitsLeft);
+            std::uint8_t bits = byte >> (8 - count);
+
+            if (count > m_readBitsLeft)
+            {
+                const auto spillCount = count - m_readBitsLeft;
+                bits |= movetext[m_readOffset + 1] >> (8 - spillCount);
+
+                m_readBitsLeft += 8;
+                m_readOffset += 1;
+            }
+
+            m_readBitsLeft -= count;
+
+            return bits;
+        }
+
+        [[nodiscard]] std::uint16_t extractVle16(std::size_t blockSize)
+        {
+            auto mask = (1 << blockSize) - 1;
+            std::uint16_t v = 0;
+            std::size_t offset = 0;
+            for(;;)
+            {
+                std::uint16_t block = extractBitsLE8(blockSize + 1);
+                v |= ((block & mask) << offset);
+                if (!(block >> blockSize))
+                {
+                    break;
+                }
+
+                offset += blockSize;
+            }
+            return v;
+        }
+
+        [[nodiscard]] TrainingDataEntry nextEntry()
+        {
+            entry.pos.doMove(entry.move);
+            auto [move, score] = nextMoveScore(entry.pos);
+            entry.move = move;
+            entry.score = score;
+            entry.ply += 1;
+            entry.result = -entry.result;
+            return entry;
+        }
+
+        [[nodiscard]] bool hasNext() const
+        {
+            return m_numReadPlies < numPlies;
+        }
+
+        [[nodiscard]] std::pair<chess::Move, std::int16_t> nextMoveScore(const chess::Position& pos)
+        {
+            chess::Move move;
+            std::int16_t score;
+
+            const chess::Color sideToMove = pos.sideToMove();
+            const chess::Bitboard ourPieces = pos.piecesBB(sideToMove);
+            const chess::Bitboard theirPieces = pos.piecesBB(!sideToMove);
+            const chess::Bitboard occupied = ourPieces | theirPieces;
+
+            const auto pieceId = extractBitsLE8(usedBitsSafe(ourPieces.count()));
+            const auto from = chess::Square(chess::nthSetBitIndex(ourPieces.bits(), pieceId));
+
+            const auto pt = pos.pieceAt(from).type();
+            switch (pt)
+            {
+            case chess::PieceType::Pawn:
+            {
+                const chess::Rank promotionRank = pos.sideToMove() == chess::Color::White ? chess::rank7 : chess::rank2;
+                const chess::Rank startRank = pos.sideToMove() == chess::Color::White ? chess::rank2 : chess::rank7;
+                const auto forward = sideToMove == chess::Color::White ? chess::FlatSquareOffset(0, 1) : chess::FlatSquareOffset(0, -1);
+
+                const chess::Square epSquare = pos.epSquare();
+
+                chess::Bitboard attackTargets = theirPieces;
+                if (epSquare != chess::Square::none())
+                {
+                    attackTargets |= epSquare;
+                }
+
+                chess::Bitboard destinations = chess::bb::pawnAttacks(chess::Bitboard::square(from), sideToMove) & attackTargets;
+
+                const chess::Square sqForward = from + forward;
+                if (!occupied.isSet(sqForward))
+                {
+                    destinations |= sqForward;
+
+                    const chess::Square sqForward2 = sqForward + forward;
+                    if (
+                        from.rank() == startRank
+                        && !occupied.isSet(sqForward2)
+                        )
+                    {
+                        destinations |= sqForward2;
+                    }
+                }
+
+                const auto destinationsCount = destinations.count();
+                if (from.rank() == promotionRank)
+                {
+                    const auto moveId = extractBitsLE8(usedBitsSafe(destinationsCount * 4ull));
+                    const chess::Piece promotedPiece = chess::Piece(
+                        chess::fromOrdinal<chess::PieceType>(ordinal(chess::PieceType::Knight) + (moveId % 4ull)),
+                        sideToMove
+                    );
+                    const auto to = chess::Square(chess::nthSetBitIndex(destinations.bits(), moveId / 4ull));
+
+                    move = chess::Move::promotion(from, to, promotedPiece);
+                    break;
+                }
+                else
+                {
+                    auto moveId = extractBitsLE8(usedBitsSafe(destinationsCount));
+                    const auto to = chess::Square(chess::nthSetBitIndex(destinations.bits(), moveId));
+                    if (to == epSquare)
+                    {
+                        move = chess::Move::enPassant(from, to);
+                        break;
+                    }
+                    else
+                    {
+                        move = chess::Move::normal(from, to);
+                        break;
+                    }
+                }
+            }
+            case chess::PieceType::King:
+            {
+                const chess::CastlingRights ourCastlingRightsMask =
+                    sideToMove == chess::Color::White
+                    ? chess::CastlingRights::White
+                    : chess::CastlingRights::Black;
+
+                const chess::CastlingRights castlingRights = pos.castlingRights();
+
+                const chess::Bitboard attacks = chess::bb::pseudoAttacks<chess::PieceType::King>(from) & ~ourPieces;
+                const std::size_t attacksSize = attacks.count();
+                const std::size_t numCastlings = chess::intrin::popcount(ordinal(castlingRights & ourCastlingRightsMask));
+
+                const auto moveId = extractBitsLE8(usedBitsSafe(attacksSize + numCastlings));
+
+                if (moveId >= attacksSize)
+                {
+                    const std::size_t idx = moveId - attacksSize;
+
+                    const chess::CastleType castleType =
+                        idx == 0
+                        && chess::contains(castlingRights, chess::CastlingTraits::castlingRights[sideToMove][chess::CastleType::Long])
+                        ? chess::CastleType::Long
+                        : chess::CastleType::Short;
+
+                    move = chess::Move::castle(castleType, sideToMove);
+                    break;
+                }
+                else
+                {
+                    auto to = chess::Square(chess::nthSetBitIndex(attacks.bits(), moveId));
+                    move = chess::Move::normal(from, to);
+                    break;
+                }
+                break;
+            }
+            default:
+            {
+                const chess::Bitboard attacks = chess::bb::attacks(pt, from, occupied) & ~ourPieces;
+                const auto moveId = extractBitsLE8(usedBitsSafe(attacks.count()));
+                auto to = chess::Square(chess::nthSetBitIndex(attacks.bits(), moveId));
+                move = chess::Move::normal(from, to);
+                break;
+            }
+            }
+
+            score = m_lastScore + unsignedToSigned(extractVle16(scoreVleBlockSize));
+            m_lastScore = -score;
+
+            ++m_numReadPlies;
+
+            return {move, score};
+        }
+
+        [[nodiscard]] std::size_t numReadBytes()
+        {
+            return m_readOffset + (m_readBitsLeft != 8);
+        }
+
+    private:
+        std::size_t m_readBitsLeft = 8;
+        std::size_t m_readOffset = 0;
+        std::int16_t m_lastScore = 0;
+        std::uint16_t m_numReadPlies = 0;
+    };
+
+    struct PackedMoveScoreList
+    {
+        std::uint16_t numPlies = 0;
+        std::vector<unsigned char> movetext;
+
+        void clear(const TrainingDataEntry& e)
+        {
+            numPlies = 0;
+            movetext.clear();
+            m_bitsLeft = 0;
+            m_lastScore = -e.score;
+        }
+
+        void addBitsLE8(std::uint8_t bits, std::size_t count)
+        {
+            if (count == 0) return;
+
+            if (m_bitsLeft == 0)
+            {
+                movetext.emplace_back(bits << (8 - count));
+                m_bitsLeft = 8;
+            }
+            else if (count <= m_bitsLeft)
+            {
+                movetext.back() |= bits << (m_bitsLeft - count);
+            }
+            else
+            {
+                const auto spillCount = count - m_bitsLeft;
+                movetext.back() |= bits >> spillCount;
+                movetext.emplace_back(bits << (8 - spillCount));
+                m_bitsLeft += 8;
+            }
+
+            m_bitsLeft -= count;
+        }
+
+        void addBitsVle16(std::uint16_t v, std::size_t blockSize)
+        {
+            auto mask = (1 << blockSize) - 1;
+            for(;;)
+            {
+                std::uint8_t block = (v & mask) | ((v > mask) << blockSize);
+                addBitsLE8(block, blockSize + 1);
+                v >>= blockSize;
+                if (v == 0) break;
+            }
+        }
+
+
+        void addMoveScore(const chess::Position& pos, chess::Move move, std::int16_t score)
+        {
+            const chess::Color sideToMove = pos.sideToMove();
+            const chess::Bitboard ourPieces = pos.piecesBB(sideToMove);
+            const chess::Bitboard theirPieces = pos.piecesBB(!sideToMove);
+            const chess::Bitboard occupied = ourPieces | theirPieces;
+
+            const std::uint8_t pieceId = (pos.piecesBB(sideToMove) & chess::bb::before(move.from)).count();
+            std::size_t numMoves = 0;
+            int moveId = 0;
+            const auto pt = pos.pieceAt(move.from).type();
+            switch (pt)
+            {
+            case chess::PieceType::Pawn:
+            {
+                const chess::Rank secondToLastRank = pos.sideToMove() == chess::Color::White ? chess::rank7 : chess::rank2;
+                const chess::Rank startRank = pos.sideToMove() == chess::Color::White ? chess::rank2 : chess::rank7;
+                const auto forward = sideToMove == chess::Color::White ? chess::FlatSquareOffset(0, 1) : chess::FlatSquareOffset(0, -1);
+
+                const chess::Square epSquare = pos.epSquare();
+
+                chess::Bitboard attackTargets = theirPieces;
+                if (epSquare != chess::Square::none())
+                {
+                    attackTargets |= epSquare;
+                }
+
+                chess::Bitboard destinations = chess::bb::pawnAttacks(chess::Bitboard::square(move.from), sideToMove) & attackTargets;
+
+                const chess::Square sqForward = move.from + forward;
+                if (!occupied.isSet(sqForward))
+                {
+                    destinations |= sqForward;
+
+                    const chess::Square sqForward2 = sqForward + forward;
+                    if (
+                        move.from.rank() == startRank
+                        && !occupied.isSet(sqForward2)
+                        )
+                    {
+                        destinations |= sqForward2;
+                    }
+                }
+
+                moveId = (destinations & chess::bb::before(move.to)).count();
+                numMoves = destinations.count();
+                if (move.from.rank() == secondToLastRank)
+                {
+                    const auto promotionIndex = (ordinal(move.promotedPiece.type()) - ordinal(chess::PieceType::Knight));
+                    moveId = moveId * 4 + promotionIndex;
+                    numMoves *= 4;
+                }
+
+                break;
+            }
+            case chess::PieceType::King:
+            {
+                const chess::CastlingRights ourCastlingRightsMask =
+                    sideToMove == chess::Color::White
+                    ? chess::CastlingRights::White
+                    : chess::CastlingRights::Black;
+
+                const chess::CastlingRights castlingRights = pos.castlingRights();
+
+                const chess::Bitboard attacks = chess::bb::pseudoAttacks<chess::PieceType::King>(move.from) & ~ourPieces;
+                const auto attacksSize = attacks.count();
+                const auto numCastlingRights = chess::intrin::popcount(ordinal(castlingRights & ourCastlingRightsMask));
+
+                numMoves += attacksSize;
+                numMoves += numCastlingRights;
+
+                if (move.type == chess::MoveType::Castle)
+                {
+                    const auto longCastlingRights = chess::CastlingTraits::castlingRights[sideToMove][chess::CastleType::Long];
+
+                    moveId = attacksSize - 1;
+
+                    if (chess::contains(castlingRights, longCastlingRights))
+                    {
+                        // We have to add one no matter if it's the used one or not.
+                        moveId += 1;
+                    }
+
+                    if (chess::CastlingTraits::moveCastlingType(move) == chess::CastleType::Short)
+                    {
+                        moveId += 1;
+                    }
+                }
+                else
+                {
+                    moveId = (attacks & chess::bb::before(move.to)).count();
+                }
+                break;
+            }
+            default:
+            {
+                const chess::Bitboard attacks = chess::bb::attacks(pt, move.from, occupied) & ~ourPieces;
+
+                moveId = (attacks & chess::bb::before(move.to)).count();
+                numMoves = attacks.count();
+            }
+            }
+
+            const std::size_t numPieces = ourPieces.count();
+            addBitsLE8(pieceId, usedBitsSafe(numPieces));
+            addBitsLE8(moveId, usedBitsSafe(numMoves));
+
+            std::uint16_t scoreDelta = signedToUnsigned(score - m_lastScore);
+            addBitsVle16(scoreDelta, scoreVleBlockSize);
+            m_lastScore = -score;
+
+            ++numPlies;
+        }
+
+    private:
+        std::size_t m_bitsLeft = 0;
+        std::int16_t m_lastScore = 0;
+    };
+
+
+    [[nodiscard]] inline PackedTrainingDataEntry packEntry(const TrainingDataEntry& plain)
+    {
+        PackedTrainingDataEntry packed;
+
+        auto compressedPos = plain.pos.compress();
+        auto compressedMove = plain.move.compress();
+
+        static_assert(sizeof(compressedPos) + sizeof(compressedMove) + 6 == sizeof(PackedTrainingDataEntry));
+
+        std::size_t offset = 0;
+        compressedPos.writeToBigEndian(packed.bytes);
+        offset += sizeof(compressedPos);
+        compressedMove.writeToBigEndian(packed.bytes + offset);
+        offset += sizeof(compressedMove);
+        std::uint16_t pr = plain.ply | (signedToUnsigned(plain.result) << 14);
+        packed.bytes[offset++] = signedToUnsigned(plain.score) >> 8;
+        packed.bytes[offset++] = signedToUnsigned(plain.score);
+        packed.bytes[offset++] = pr >> 8;
+        packed.bytes[offset++] = pr;
+        packed.bytes[offset++] = plain.pos.rule50Counter() >> 8;
+        packed.bytes[offset++] = plain.pos.rule50Counter();
+
+        return packed;
+    }
+
+    [[nodiscard]] inline TrainingDataEntry unpackEntry(const PackedTrainingDataEntry& packed)
+    {
+        TrainingDataEntry plain;
+
+        std::size_t offset = 0;
+        auto compressedPos = chess::CompressedPosition::readFromBigEndian(packed.bytes);
+        plain.pos = compressedPos.decompress();
+        offset += sizeof(compressedPos);
+        auto compressedMove = chess::CompressedMove::readFromBigEndian(packed.bytes + offset);
+        plain.move = compressedMove.decompress();
+        offset += sizeof(compressedMove);
+        plain.score = unsignedToSigned((packed.bytes[offset] << 8) | packed.bytes[offset+1]);
+        offset += 2;
+        std::uint16_t pr = (packed.bytes[offset] << 8) | packed.bytes[offset+1];
+        plain.ply = pr & 0x3FFF;
+        plain.pos.setPly(plain.ply);
+        plain.result = unsignedToSigned(pr >> 14);
+        offset += 2;
+        plain.pos.setRule50Counter((packed.bytes[offset] << 8) | packed.bytes[offset+1]);
+
+        return plain;
+    }
+
+    struct CompressedTrainingDataEntryWriter
+    {
+        static constexpr std::size_t chunkSize = suggestedChunkSize;
+
+        CompressedTrainingDataEntryWriter(std::string path, std::ios_base::openmode om = std::ios_base::app) :
+            m_outputFile(path, om),
+            m_lastEntry{},
+            m_movelist{},
+            m_packedSize(0),
+            m_packedEntries(chunkSize + maxMovelistSize),
+            m_isFirst(true)
+        {
+            m_lastEntry.ply = 0xFFFF; // so it's never a continuation
+            m_lastEntry.result = 0x7FFF;
+        }
+
+        void addTrainingDataEntry(const TrainingDataEntry& e)
+        {
+            bool isCont = isContinuation(m_lastEntry, e);
+            if (isCont)
+            {
+                // add to movelist
+                m_movelist.addMoveScore(e.pos, e.move, e.score);
+            }
+            else
+            {
+                if (!m_isFirst)
+                {
+                    writeMovelist();
+                }
+
+                if (m_packedSize >= chunkSize)
+                {
+                    m_outputFile.append(m_packedEntries.data(), m_packedSize);
+                    m_packedSize = 0;
+                }
+
+                auto packed = packEntry(e);
+                std::memcpy(m_packedEntries.data() + m_packedSize, &packed, sizeof(PackedTrainingDataEntry));
+                m_packedSize += sizeof(PackedTrainingDataEntry);
+
+                m_movelist.clear(e);
+
+                m_isFirst = false;
+            }
+
+            m_lastEntry = e;
+        }
+
+        ~CompressedTrainingDataEntryWriter()
+        {
+            if (m_packedSize > 0)
+            {
+                if (!m_isFirst)
+                {
+                    writeMovelist();
+                }
+
+                m_outputFile.append(m_packedEntries.data(), m_packedSize);
+                m_packedSize = 0;
+            }
+        }
+
+    private:
+        CompressedTrainingDataFile m_outputFile;
+        TrainingDataEntry m_lastEntry;
+        PackedMoveScoreList m_movelist;
+        std::size_t m_packedSize;
+        std::vector<char> m_packedEntries;
+        bool m_isFirst;
+
+        void writeMovelist()
+        {
+            m_packedEntries[m_packedSize++] = m_movelist.numPlies >> 8;
+            m_packedEntries[m_packedSize++] = m_movelist.numPlies;
+            if (m_movelist.numPlies > 0)
+            {
+                std::memcpy(m_packedEntries.data() + m_packedSize, m_movelist.movetext.data(), m_movelist.movetext.size());
+                m_packedSize += m_movelist.movetext.size();
+            }
+        };
+    };
+
+    struct CompressedTrainingDataEntryReader
+    {
+        static constexpr std::size_t chunkSize = suggestedChunkSize;
+
+        CompressedTrainingDataEntryReader(std::string path, std::ios_base::openmode om = std::ios_base::app) :
+            m_inputFile(path, om),
+            m_chunk(),
+            m_movelistReader(std::nullopt),
+            m_offset(0),
+            m_isEnd(false)
+        {
+            if (!m_inputFile.hasNextChunk())
+            {
+                m_isEnd = true;
+            }
+            else
+            {
+                m_chunk = m_inputFile.readNextChunk();
+            }
+        }
+
+        [[nodiscard]] bool hasNext()
+        {
+            return !m_isEnd;
+        }
+
+        [[nodiscard]] TrainingDataEntry next()
+        {
+            if (m_movelistReader.has_value())
+            {
+                const auto e = m_movelistReader->nextEntry();
+
+                if (!m_movelistReader->hasNext())
+                {
+                    m_offset += m_movelistReader->numReadBytes();
+                    m_movelistReader.reset();
+
+                    fetchNextChunkIfNeeded();
+                }
+
+                return e;
+            }
+
+            PackedTrainingDataEntry packed;
+            std::memcpy(&packed, m_chunk.data() + m_offset, sizeof(PackedTrainingDataEntry));
+            m_offset += sizeof(PackedTrainingDataEntry);
+
+            const std::uint16_t numPlies = (m_chunk[m_offset] << 8) | m_chunk[m_offset + 1];
+            m_offset += 2;
+
+            const auto e = unpackEntry(packed);
+
+            if (numPlies > 0)
+            {
+                m_movelistReader.emplace(e, reinterpret_cast<unsigned char*>(m_chunk.data()) + m_offset, numPlies);
+            }
+            else
+            {
+                fetchNextChunkIfNeeded();
+            }
+
+            return e;
+        }
+
+    private:
+        CompressedTrainingDataFile m_inputFile;
+        std::vector<unsigned char> m_chunk;
+        std::optional<PackedMoveScoreListReader> m_movelistReader;
+        std::size_t m_offset;
+        bool m_isEnd;
+
+        void fetchNextChunkIfNeeded()
+        {
+            if (m_offset + sizeof(PackedTrainingDataEntry) + 2 > m_chunk.size())
+            {
+                if (m_inputFile.hasNextChunk())
+                {
+                    m_chunk = m_inputFile.readNextChunk();
+                    m_offset = 0;
+                }
+                else
+                {
+                    m_isEnd = true;
+                }
+            }
+        }
+    };
+
+    inline void emitPlainEntry(std::string& buffer, const TrainingDataEntry& plain)
+    {
+        buffer += "fen ";
+        buffer += plain.pos.fen();
+        buffer += '\n';
+
+        buffer += "move ";
+        buffer += chess::uci::moveToUci(plain.pos, plain.move);
+        buffer += '\n';
+
+        buffer += "score ";
+        buffer += std::to_string(plain.score);
+        buffer += '\n';
+
+        buffer += "ply ";
+        buffer += std::to_string(plain.ply);
+        buffer += '\n';
+
+        buffer += "result ";
+        buffer += std::to_string(plain.result);
+        buffer += "\ne\n";
+    }
+
+    inline void emitBinEntry(std::vector<char>& buffer, const TrainingDataEntry& plain)
+    {
+        auto psv = trainingDataEntryToPackedSfenValue(plain);
+        const char* data = reinterpret_cast<const char*>(&psv);
+        buffer.insert(buffer.end(), data, data+sizeof(psv));
+    }
+
+    inline void convertPlainToBinpack(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
+    {
+        constexpr std::size_t reportEveryNPositions = 100'000;
+
+        std::cout << "Compressing " << inputPath << " to " << outputPath << '\n';
+
+        CompressedTrainingDataEntryWriter writer(outputPath, om);
+        TrainingDataEntry e;
+
+        std::string key;
+        std::string value;
+        std::string move;
+
+        std::ifstream inputFile(inputPath);
+        const auto base = inputFile.tellg();
+        std::size_t numProcessedPositions = 0;
+
+        for(;;)
+        {
+            inputFile >> key;
+            if (!inputFile)
+            {
+                break;
+            }
+
+            if (key == "e"sv)
+            {
+                e.move = chess::uci::uciToMove(e.pos, move);
+
+                writer.addTrainingDataEntry(e);
+
+                ++numProcessedPositions;
+                const auto cur = inputFile.tellg();
+                if (numProcessedPositions % reportEveryNPositions == 0)
+                {
+                    std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+                }
+
+                continue;
+            }
+
+            inputFile >> std::ws;
+            std::getline(inputFile, value, '\n');
+
+            if (key == "fen"sv) e.pos = chess::Position::fromFen(value.c_str());
+            if (key == "move"sv) move = value;
+            if (key == "score"sv) e.score = std::stoi(value);
+            if (key == "ply"sv) e.ply = std::stoi(value);
+            if (key == "result"sv) e.result = std::stoi(value);
+        }
+    }
+
+    inline void convertBinpackToPlain(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
+    {
+        constexpr std::size_t bufferSize = MiB;
+
+        std::cout << "Decompressing " << inputPath << " to " << outputPath << '\n';
+
+        CompressedTrainingDataEntryReader reader(inputPath);
+        std::ofstream outputFile(outputPath, om);
+        const auto base = outputFile.tellp();
+        std::size_t numProcessedPositions = 0;
+        std::string buffer;
+        buffer.reserve(bufferSize * 2);
+
+        while(reader.hasNext())
+        {
+            emitPlainEntry(buffer, reader.next());
+
+            ++numProcessedPositions;
+
+            if (buffer.size() > bufferSize)
+            {
+                outputFile << buffer;
+                buffer.clear();
+
+                const auto cur = outputFile.tellp();
+                std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+            }
+        }
+
+        if (!buffer.empty())
+        {
+            outputFile << buffer;
+
+            const auto cur = outputFile.tellp();
+            std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+        }
+    }
+
+
+    inline void convertBinToBinpack(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
+    {
+        constexpr std::size_t reportEveryNPositions = 100'000;
+
+        std::cout << "Compressing " << inputPath << " to " << outputPath << '\n';
+
+        CompressedTrainingDataEntryWriter writer(outputPath, om);
+        TrainingDataEntry e;
+
+        std::string key;
+        std::string value;
+        std::string move;
+
+        std::ifstream inputFile(inputPath, std::ios_base::binary);
+        const auto base = inputFile.tellg();
+        std::size_t numProcessedPositions = 0;
+
+        nodchip::PackedSfenValue psv;
+        for(;;)
+        {
+            inputFile.read(reinterpret_cast<char*>(&psv), sizeof(psv));
+            if (inputFile.gcount() != 40)
+            {
+                break;
+            }
+
+            writer.addTrainingDataEntry(packedSfenValueToTrainingDataEntry(psv));
+
+            ++numProcessedPositions;
+            const auto cur = inputFile.tellg();
+            if (numProcessedPositions % reportEveryNPositions == 0)
+            {
+                std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+            }
+        }
+    }
+
+    inline void convertBinpackToBin(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
+    {
+        constexpr std::size_t bufferSize = MiB;
+
+        std::cout << "Decompressing " << inputPath << " to " << outputPath << '\n';
+
+        CompressedTrainingDataEntryReader reader(inputPath);
+        std::ofstream outputFile(outputPath, std::ios_base::binary | om);
+        const auto base = outputFile.tellp();
+        std::size_t numProcessedPositions = 0;
+        std::vector<char> buffer;
+        buffer.reserve(bufferSize * 2);
+
+        while(reader.hasNext())
+        {
+            emitBinEntry(buffer, reader.next());
+
+            ++numProcessedPositions;
+
+            if (buffer.size() > bufferSize)
+            {
+                outputFile.write(buffer.data(), buffer.size());
+                buffer.clear();
+
+                const auto cur = outputFile.tellp();
+                std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+            }
+        }
+
+        if (!buffer.empty())
+        {
+            outputFile.write(buffer.data(), buffer.size());
+
+            const auto cur = outputFile.tellp();
+            std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+        }
+    }
+
+    inline void convertBinToPlain(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
+    {
+        constexpr std::size_t reportEveryNPositions = 100'000;
+        constexpr std::size_t bufferSize = MiB;
+
+        std::cout << "Converting " << inputPath << " to " << outputPath << '\n';
+
+        TrainingDataEntry e;
+
+        std::string key;
+        std::string value;
+        std::string move;
+
+        std::ifstream inputFile(inputPath, std::ios_base::binary);
+        const auto base = inputFile.tellg();
+        std::size_t numProcessedPositions = 0;
+
+        std::ofstream outputFile(outputPath, om);
+        std::string buffer;
+        buffer.reserve(bufferSize * 2);
+
+        nodchip::PackedSfenValue psv;
+        for(;;)
+        {
+            inputFile.read(reinterpret_cast<char*>(&psv), sizeof(psv));
+            if (inputFile.gcount() != 40)
+            {
+                break;
+            }
+
+            emitPlainEntry(buffer, packedSfenValueToTrainingDataEntry(psv));
+
+            ++numProcessedPositions;
+
+            if (buffer.size() > bufferSize)
+            {
+                outputFile << buffer;
+                buffer.clear();
+
+                const auto cur = outputFile.tellp();
+                std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+            }
+        }
+
+        if (!buffer.empty())
+        {
+            outputFile << buffer;
+
+            const auto cur = outputFile.tellp();
+            std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+        }
+    }
+
+    inline void convertPlainToBin(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
+    {
+        constexpr std::size_t reportEveryNPositions = 100'000;
+        constexpr std::size_t bufferSize = MiB;
+
+        std::cout << "Compressing " << inputPath << " to " << outputPath << '\n';
+
+        std::ofstream outputFile(outputPath, std::ios_base::binary | om);
+        std::vector<char> buffer;
+        buffer.reserve(bufferSize * 2);
+
+        TrainingDataEntry e;
+
+        std::string key;
+        std::string value;
+        std::string move;
+
+        std::ifstream inputFile(inputPath);
+        const auto base = inputFile.tellg();
+        std::size_t numProcessedPositions = 0;
+
+        for(;;)
+        {
+            inputFile >> key;
+            if (!inputFile)
+            {
+                break;
+            }
+
+            if (key == "e"sv)
+            {
+                e.move = chess::uci::uciToMove(e.pos, move);
+
+                emitBinEntry(buffer, e);
+
+                ++numProcessedPositions;
+
+                if (buffer.size() > bufferSize)
+                {
+                    outputFile.write(buffer.data(), buffer.size());
+                    buffer.clear();
+
+                    const auto cur = outputFile.tellp();
+                    std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+                }
+
+                continue;
+            }
+
+            inputFile >> std::ws;
+            std::getline(inputFile, value, '\n');
+
+            if (key == "fen"sv) e.pos = chess::Position::fromFen(value.c_str());
+            if (key == "move"sv) move = value;
+            if (key == "score"sv) e.score = std::stoi(value);
+            if (key == "ply"sv) e.ply = std::stoi(value);
+            if (key == "result"sv) e.result = std::stoi(value);
+        }
+
+        if (!buffer.empty())
+        {
+            outputFile.write(buffer.data(), buffer.size());
+
+            const auto cur = outputFile.tellp();
+            std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 84feabb0..530c660b 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -11,6 +11,8 @@
 #include "learn.h"
 #include "multi_think.h"
 
+#include "../extra/nnue_data_binpack_format.h"
+
 #include <chrono>
 #include <climits>
 #include <cmath>
@@ -32,6 +34,12 @@ using namespace std;
 
 namespace Learner
 {
+    enum struct SfenOutputType
+    {
+        Bin,
+        Binpack
+    };
+
     static bool write_out_draw_game_in_training_data_generation = false;
     static bool detect_draw_by_consecutive_low_score = false;
     static bool detect_draw_by_insufficient_mating_material = false;
@@ -42,6 +50,94 @@ namespace Learner
     // https://discordapp.com/channels/435943710472011776/733545871911813221/748524079761326192
     extern bool use_raw_nnue_eval;
 
+    static SfenOutputType sfen_output_type = SfenOutputType::Bin;
+
+    static bool ends_with(const std::string& lhs, const std::string& end)
+    {
+        if (end.size() > lhs.size()) return false;
+
+        return std::equal(end.rbegin(), end.rend(), lhs.rbegin());
+    }
+
+    static std::string filename_with_extension(const std::string& filename, const std::string& ext)
+    {
+        if (ends_with(filename, ext))
+        {
+            return filename;
+        }
+        else
+        {
+            return filename + "." + ext;
+        }
+    }
+
+    struct BasicSfenOutputStream
+    {
+        virtual void write(const PSVector& sfens) = 0;
+        virtual ~BasicSfenOutputStream() {}
+    };
+
+    struct BinSfenOutputStream : BasicSfenOutputStream
+    {
+        static constexpr auto openmode = ios::out | ios::binary | ios::app;
+        static inline const std::string extension = "bin";
+
+        BinSfenOutputStream(std::string filename) :
+            m_stream(filename_with_extension(filename, extension), openmode)
+        {
+        }
+
+        void write(const PSVector& sfens) override
+        {
+            m_stream.write(reinterpret_cast<const char*>(sfens.data()), sizeof(PackedSfenValue) * sfens.size());
+        }
+
+        ~BinSfenOutputStream() override {}
+
+    private:
+        fstream m_stream;
+    };
+
+    struct BinpackSfenOutputStream : BasicSfenOutputStream
+    {
+        static constexpr auto openmode = ios::out | ios::binary | ios::app;
+        static inline const std::string extension = "binpack";
+
+        BinpackSfenOutputStream(std::string filename) :
+            m_stream(filename_with_extension(filename, extension), openmode)
+        {
+        }
+
+        void write(const PSVector& sfens) override
+        {
+            static_assert(sizeof(binpack::nodchip::PackedSfenValue) == sizeof(PackedSfenValue));
+
+            for(auto& sfen : sfens)
+            {
+                // The library uses a type that's different but layout-compatibile.
+                binpack::nodchip::PackedSfenValue e;
+                std::memcpy(&e, &sfen, sizeof(binpack::nodchip::PackedSfenValue));
+                m_stream.addTrainingDataEntry(binpack::packedSfenValueToTrainingDataEntry(e));
+            }
+        }
+
+        ~BinpackSfenOutputStream() override {}
+
+    private:
+        binpack::CompressedTrainingDataEntryWriter m_stream;
+    };
+
+    static std::unique_ptr<BasicSfenOutputStream> create_new_sfen_output(const std::string& filename)
+    {
+        switch(sfen_output_type)
+        {
+            case SfenOutputType::Bin:
+                return std::make_unique<BinSfenOutputStream>(filename);
+            default:
+                return std::make_unique<BinpackSfenOutputStream>(filename);
+        }
+    }
+
     // Helper class for exporting Sfen
     struct SfenWriter
     {
@@ -58,7 +154,7 @@ namespace Learner
             sfen_buffers_pool.reserve((size_t)thread_num * 10);
             sfen_buffers.resize(thread_num);
 
-            output_file_stream.open(filename_, ios::out | ios::binary | ios::app);
+            output_file_stream = create_new_sfen_output(filename_);
             filename = filename_;
 
             finished = false;
@@ -68,7 +164,7 @@ namespace Learner
         {
             finished = true;
             file_worker_thread.join();
-            output_file_stream.close();
+            output_file_stream.reset();
 
 #if defined(_DEBUG)
             {
@@ -137,9 +233,6 @@ namespace Learner
             {
                 // Also output the current time to console.
                 sync_cout << endl << sfen_write_count << " sfens , at " << now_string() << sync_endl;
-
-                // This is enough for flush().
-                output_file_stream.flush();
             };
 
             while (!finished || sfen_buffers_pool.size())
@@ -163,7 +256,7 @@ namespace Learner
                 {
                     for (auto& buf : buffers)
                     {
-                        output_file_stream.write(reinterpret_cast<const char*>(buf->data()), sizeof(PackedSfenValue) * buf->size());
+                        output_file_stream->write(*buf);
 
                         sfen_write_count += buf->size();
 
@@ -174,8 +267,6 @@ namespace Learner
                         {
                             sfen_write_count_current_file = 0;
 
-                            output_file_stream.close();
-
                             // Sequential number attached to the file
                             int n = (int)(sfen_write_count / save_every);
 
@@ -183,7 +274,7 @@ namespace Learner
                             // Add ios::app in consideration of overwriting.
                             // (Depending on the operation, it may not be necessary.)
                             string new_filename = filename + "_" + std::to_string(n);
-                            output_file_stream.open(new_filename, ios::out | ios::binary | ios::app);
+                            output_file_stream = create_new_sfen_output(new_filename);
                             cout << endl << "output sfen file = " << new_filename << endl;
                         }
 
@@ -217,7 +308,7 @@ namespace Learner
 
     private:
 
-        fstream output_file_stream;
+        std::unique_ptr<BasicSfenOutputStream> output_file_stream;
 
         // A new net is saved after every save_every sfens are processed.
         uint64_t save_every = std::numeric_limits<uint64_t>::max();
@@ -951,7 +1042,7 @@ namespace Learner
         int write_maxply = 400;
 
         // File name to write
-        string output_file_name = "generated_kifu.bin";
+        string output_file_name = "generated_kifu";
 
         string token;
 
@@ -962,6 +1053,8 @@ namespace Learner
         // Add a random number to the end of the file name.
         bool random_file_name = false;
 
+        std::string sfen_format;
+
         while (true)
         {
             token = "";
@@ -1017,10 +1110,24 @@ namespace Learner
                 is >> detect_draw_by_insufficient_mating_material;
             else if (token == "use_raw_nnue_eval")
                 is >> use_raw_nnue_eval;
+            else if (token == "sfen_format")
+                is >> sfen_format;
             else
                 cout << "Error! : Illegal token " << token << endl;
         }
 
+        if (!sfen_format.empty())
+        {
+            if (sfen_format == "bin")
+                sfen_output_type = SfenOutputType::Bin;
+            else if (sfen_format == "binpack")
+                sfen_output_type = SfenOutputType::Binpack;
+            else
+            {
+                cout << "Unknown sfen format `" << sfen_format << "`. Using bin\n";
+            }
+        }
+
         // If search depth2 is not set, leave it the same as search depth.
         if (search_depth_max == INT_MIN)
             search_depth_max = search_depth_min;

From 6b76ebc2ca3b66003424d73f8561fb4906657fde Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 10 Sep 2020 00:31:38 +0200
Subject: [PATCH 244/583] Support for binpack format in sfenreader in learner.
 Automatically detect file extension and choose the correct reader (bin or
 binpack)

---
 src/extra/nnue_data_binpack_format.h | 231 +++++-----------
 src/learn/gensfen.cpp                |   5 +-
 src/learn/learner.cpp                | 396 ++++++++++++++++++---------
 3 files changed, 328 insertions(+), 304 deletions(-)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index 9f810a3b..bec0e9ad 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -2745,9 +2745,6 @@ namespace chess
                 0x000200282410A102ull,
                 0x4048240043802106ull
                     } };
-            alignas(64) extern EnumArray<Square, Bitboard> g_rookMasks;
-            alignas(64) extern EnumArray<Square, std::uint8_t> g_rookShifts;
-            alignas(64) extern EnumArray<Square, const Bitboard*> g_rookAttacks;
 
             alignas(64) constexpr EnumArray<Square, std::uint64_t> g_bishopMagics{ {
                 0x40106000A1160020ull,
@@ -2815,9 +2812,17 @@ namespace chess
                 0x0300404822C08200ull,
                 0x48081010008A2A80ull
             } };
-            alignas(64) extern EnumArray<Square, Bitboard> g_bishopMasks;
-            alignas(64) extern EnumArray<Square, std::uint8_t> g_bishopShifts;
-            alignas(64) extern EnumArray<Square, const Bitboard*> g_bishopAttacks;
+
+            alignas(64) static EnumArray<Square, Bitboard> g_rookMasks;
+            alignas(64) static EnumArray<Square, std::uint8_t> g_rookShifts;
+            alignas(64) static EnumArray<Square, const Bitboard*> g_rookAttacks;
+
+            alignas(64) static EnumArray<Square, Bitboard> g_bishopMasks;
+            alignas(64) static EnumArray<Square, std::uint8_t> g_bishopShifts;
+            alignas(64) static EnumArray<Square, const Bitboard*> g_bishopAttacks;
+
+            alignas(64) static std::array<Bitboard, 102400> g_allRookAttacks;
+            alignas(64) static std::array<Bitboard, 5248> g_allBishopAttacks;
 
             inline Bitboard bishopAttacks(Square s, Bitboard occupied)
             {
@@ -3402,17 +3407,6 @@ namespace chess
                 Bishop
             };
 
-            alignas(64) EnumArray<Square, Bitboard> g_rookMasks;
-            alignas(64) EnumArray<Square, std::uint8_t> g_rookShifts;
-            alignas(64) EnumArray<Square, const Bitboard*> g_rookAttacks;
-
-            alignas(64) EnumArray<Square, Bitboard> g_bishopMasks;
-            alignas(64) EnumArray<Square, std::uint8_t> g_bishopShifts;
-            alignas(64) EnumArray<Square, const Bitboard*> g_bishopAttacks;
-
-            alignas(64) static std::array<Bitboard, 102400> g_allRookAttacks;
-            alignas(64) static std::array<Bitboard, 5248> g_allBishopAttacks;
-
             template <MagicsType TypeV>
             [[nodiscard]] inline Bitboard slidingAttacks(Square sq, Bitboard occupied)
             {
@@ -3857,7 +3851,7 @@ namespace chess
             return true;
         }
 
-        [[nodiscard]] std::string fen() const;
+        [[nodiscard]] inline std::string fen() const;
 
         [[nodiscard]] inline bool trySet(std::string_view boardState)
         {
@@ -4093,7 +4087,7 @@ namespace chess
 
         // returns captured piece
         // doesn't check validity
-        FORCEINLINE constexpr Piece doMove(Move move)
+        inline constexpr Piece doMove(Move move)
         {
             if (move.type == MoveType::Normal)
             {
@@ -4132,7 +4126,7 @@ namespace chess
             return doMoveColdPath(move);
         }
 
-        NOINLINE constexpr Piece doMoveColdPath(Move move)
+        inline constexpr Piece doMoveColdPath(Move move)
         {
             if (move.type == MoveType::Promotion)
             {
@@ -4333,38 +4327,38 @@ namespace chess
 
         // Returns whether a given square is attacked by any piece
         // of `attackerColor` side.
-        [[nodiscard]] bool isSquareAttacked(Square sq, Color attackerColor) const;
+        [[nodiscard]] inline bool isSquareAttacked(Square sq, Color attackerColor) const;
 
         // Returns whether a given square is attacked by any piece
         // of `attackerColor` side after `move` is made.
         // Move must be pseudo legal.
-        [[nodiscard]] bool isSquareAttackedAfterMove(Move move, Square sq, Color attackerColor) const;
+        [[nodiscard]] inline bool isSquareAttackedAfterMove(Move move, Square sq, Color attackerColor) const;
 
         // Move must be pseudo legal.
         // Must not be a king move.
-        [[nodiscard]] bool createsDiscoveredAttackOnOwnKing(Move move) const;
+        [[nodiscard]] inline bool createsDiscoveredAttackOnOwnKing(Move move) const;
 
         // Returns whether a piece on a given square is attacked
         // by any enemy piece. False if square is empty.
-        [[nodiscard]] bool isPieceAttacked(Square sq) const;
+        [[nodiscard]] inline bool isPieceAttacked(Square sq) const;
 
         // Returns whether a piece on a given square is attacked
         // by any enemy piece after `move` is made. False if square is empty.
         // Move must be pseudo legal.
-        [[nodiscard]] bool isPieceAttackedAfterMove(Move move, Square sq) const;
+        [[nodiscard]] inline bool isPieceAttackedAfterMove(Move move, Square sq) const;
 
         // Returns whether the king of the moving side is attacked
         // by any enemy piece after a move is made.
         // Move must be pseudo legal.
-        [[nodiscard]] bool isOwnKingAttackedAfterMove(Move move) const;
+        [[nodiscard]] inline bool isOwnKingAttackedAfterMove(Move move) const;
 
         // Return a bitboard with all (pseudo legal) attacks by the piece on
         // the given square. Empty if no piece on the square.
-        [[nodiscard]] Bitboard attacks(Square sq) const;
+        [[nodiscard]] inline Bitboard attacks(Square sq) const;
 
         // Returns a bitboard with all squared that have pieces
         // that attack a given square (pseudo legally)
-        [[nodiscard]] Bitboard attackers(Square sq, Color attackerColor) const;
+        [[nodiscard]] inline Bitboard attackers(Square sq, Color attackerColor) const;
 
         [[nodiscard]] constexpr Piece pieceAt(Square sq) const
         {
@@ -4438,20 +4432,6 @@ namespace chess
 
     struct Position;
 
-    struct MoveLegalityChecker
-    {
-        MoveLegalityChecker(const Position& position);
-
-        [[nodiscard]] bool isPseudoLegalMoveLegal(const Move& move) const;
-
-    private:
-        const Position* m_position;
-        Bitboard m_checkers;
-        Bitboard m_ourBlockersForKing;
-        Bitboard m_potentialCheckRemovals;
-        Square m_ksq;
-    };
-
     struct CompressedPosition;
 
     struct PositionHash128
@@ -4484,20 +4464,20 @@ namespace chess
         {
         }
 
-        void set(std::string_view fen);
+        inline void set(std::string_view fen);
 
         // Returns false if the fen was not valid
         // If the returned value was false the position
         // is in unspecified state.
-        [[nodiscard]] bool trySet(std::string_view fen);
+        [[nodiscard]] inline bool trySet(std::string_view fen);
 
-        [[nodiscard]] static Position fromFen(std::string_view fen);
+        [[nodiscard]] static inline Position fromFen(std::string_view fen);
 
-        [[nodiscard]] static std::optional<Position> tryFromFen(std::string_view fen);
+        [[nodiscard]] static inline std::optional<Position> tryFromFen(std::string_view fen);
 
-        [[nodiscard]] static Position startPosition();
+        [[nodiscard]] static inline Position startPosition();
 
-        [[nodiscard]] std::string fen() const;
+        [[nodiscard]] inline std::string fen() const;
 
         constexpr void setEpSquareUnchecked(Square sq)
         {
@@ -4535,7 +4515,7 @@ namespace chess
             m_ply = ply;
         }
 
-        ReverseMove doMove(const Move& move);
+        inline ReverseMove doMove(const Move& move);
 
         constexpr void undoMove(const ReverseMove& reverseMove)
         {
@@ -4559,49 +4539,44 @@ namespace chess
             return m_sideToMove;
         }
 
-        [[nodiscard]] std::uint8_t rule50Counter() const
+        [[nodiscard]] inline std::uint8_t rule50Counter() const
         {
             return m_rule50Counter;
         }
 
-        [[nodiscard]] std::uint16_t ply() const
+        [[nodiscard]] inline std::uint16_t ply() const
         {
             return m_ply;
         }
 
-        [[nodiscard]] std::uint16_t halfMove() const
+        [[nodiscard]] inline std::uint16_t halfMove() const
         {
             return (m_ply + 1) / 2;
         }
 
-        void setHalfMove(std::uint16_t hm)
+        inline void setHalfMove(std::uint16_t hm)
         {
             m_ply = 2 * hm - 1 + (m_sideToMove == Color::Black);
         }
 
-        [[nodiscard]] bool isCheck() const;
+        [[nodiscard]] inline bool isCheck() const;
 
-        [[nodiscard]] Bitboard checkers() const;
+        [[nodiscard]] inline Bitboard checkers() const;
 
-        [[nodiscard]] bool isCheckAfterMove(Move move) const;
+        [[nodiscard]] inline bool isCheckAfterMove(Move move) const;
 
         // Checks whether ANY `move` is legal.
-        [[nodiscard]] bool isMoveLegal(Move move) const;
+        [[nodiscard]] inline bool isMoveLegal(Move move) const;
 
-        [[nodiscard]] bool isPseudoLegalMoveLegal(Move move) const;
+        [[nodiscard]] inline bool isPseudoLegalMoveLegal(Move move) const;
 
-        [[nodiscard]] bool isMovePseudoLegal(Move move) const;
+        [[nodiscard]] inline bool isMovePseudoLegal(Move move) const;
 
         // Returns all pieces that block a slider
         // from attacking our king. When two or more
         // pieces block a single slider then none
         // of these pieces are included.
-        [[nodiscard]] Bitboard blockersForKing(Color color) const;
-
-        [[nodiscard]] MoveLegalityChecker moveLegalityChecker() const
-        {
-            return { *this };
-        }
+        [[nodiscard]] inline Bitboard blockersForKing(Color color) const;
 
         [[nodiscard]] constexpr Square epSquare() const
         {
@@ -4637,7 +4612,7 @@ namespace chess
             return cpy;
         }
 
-        [[nodiscard]] Position afterMove(Move move) const;
+        [[nodiscard]] inline Position afterMove(Move move) const;
 
         [[nodiscard]] constexpr bool isEpPossible() const
         {
@@ -4655,11 +4630,11 @@ namespace chess
 
         static_assert(sizeof(Color) + sizeof(Square) + sizeof(CastlingRights) + sizeof(std::uint8_t) == 4);
 
-        [[nodiscard]] FORCEINLINE bool isEpPossible(Square epSquare, Color sideToMove) const;
+        [[nodiscard]] inline bool isEpPossible(Square epSquare, Color sideToMove) const;
 
-        [[nodiscard]] NOINLINE bool isEpPossibleColdPath(Square epSquare, Bitboard pawnsAttackingEpSquare, Color sideToMove) const;
+        [[nodiscard]] inline bool isEpPossibleColdPath(Square epSquare, Bitboard pawnsAttackingEpSquare, Color sideToMove) const;
 
-        void nullifyEpSquareIfNotPossible();
+        inline void nullifyEpSquareIfNotPossible();
     };
 
     struct CompressedPosition
@@ -5302,7 +5277,7 @@ namespace chess
         return allAttackers;
     }
 
-    const Piece* Board::piecesRaw() const
+    inline const Piece* Board::piecesRaw() const
     {
         return m_pieces.data();
     }
@@ -5330,7 +5305,7 @@ namespace chess
         }();
     }
 
-    [[nodiscard]] std::string Board::fen() const
+    [[nodiscard]] inline std::string Board::fen() const
     {
         std::string fen;
         fen.reserve(96); // longest fen is probably in range of around 88
@@ -5382,79 +5357,6 @@ namespace chess
         return fen;
     }
 
-    MoveLegalityChecker::MoveLegalityChecker(const Position& position) :
-        m_position(&position),
-        m_checkers(position.checkers()),
-        m_ourBlockersForKing(
-            position.blockersForKing(position.sideToMove())
-            & position.piecesBB(position.sideToMove())
-        ),
-        m_ksq(position.kingSquare(position.sideToMove()))
-    {
-        if (m_checkers.exactlyOne())
-        {
-            const Bitboard knightCheckers = m_checkers & bb::pseudoAttacks<PieceType::Knight>(m_ksq);
-            if (knightCheckers.any())
-            {
-                // We're checked by a knight, we have to remove it or move the king.
-                m_potentialCheckRemovals = knightCheckers;
-            }
-            else
-            {
-                // If we're not checked by a knight we can block it.
-                m_potentialCheckRemovals = bb::between(m_ksq, m_checkers.first()) | m_checkers;
-            }
-        }
-        else
-        {
-            // Double check, king has to move.
-            m_potentialCheckRemovals = Bitboard::none();
-        }
-    }
-
-    [[nodiscard]] bool MoveLegalityChecker::isPseudoLegalMoveLegal(const Move& move) const
-    {
-        const Piece movedPiece = m_position->pieceAt(move.from);
-
-        if (m_checkers.any())
-        {
-            if (move.from == m_ksq || move.type == MoveType::EnPassant)
-            {
-                return m_position->isPseudoLegalMoveLegal(move);
-            }
-            else
-            {
-                // This means there's only one check and we either
-                // blocked it or removed the piece that attacked
-                // our king. So the only threat is if it's a discovered check.
-                return
-                    m_potentialCheckRemovals.isSet(move.to)
-                    && !m_ourBlockersForKing.isSet(move.from);
-            }
-        }
-        else
-        {
-            if (move.from == m_ksq)
-            {
-                return m_position->isPseudoLegalMoveLegal(move);
-            }
-            else if (move.type == MoveType::EnPassant)
-            {
-                return !m_position->createsDiscoveredAttackOnOwnKing(move);
-            }
-            else if (m_ourBlockersForKing.isSet(move.from))
-            {
-                // If it was a blocker it may have only moved in line with our king.
-                // Otherwise it's a discovered check.
-                return bb::line(m_ksq, move.from).isSet(move.to);
-            }
-            else
-            {
-                return true;
-            }
-        }
-    }
-
     void Position::set(std::string_view fen)
     {
         (void)trySet(fen);
@@ -5611,7 +5513,7 @@ namespace chess
         }();
     }
 
-    ReverseMove Position::doMove(const Move& move)
+    inline ReverseMove Position::doMove(const Move& move)
     {
         assert(move.from.isOk() && move.to.isOk());
 
@@ -5649,12 +5551,12 @@ namespace chess
         return { move, captured, oldEpSquare, oldCastlingRights };
     }
 
-    [[nodiscard]] bool Position::isCheck() const
+    [[nodiscard]] inline bool Position::isCheck() const
     {
         return BaseType::isSquareAttacked(kingSquare(m_sideToMove), !m_sideToMove);
     }
 
-    [[nodiscard]] Bitboard Position::checkers() const
+    [[nodiscard]] inline Bitboard Position::checkers() const
     {
         return BaseType::attackers(kingSquare(m_sideToMove), !m_sideToMove);
     }
@@ -5664,7 +5566,7 @@ namespace chess
         return BaseType::isSquareAttackedAfterMove(move, kingSquare(!m_sideToMove), m_sideToMove);
     }
 
-    [[nodiscard]] Bitboard Position::blockersForKing(Color color) const
+    [[nodiscard]] inline Bitboard Position::blockersForKing(Color color) const
     {
         const Color attackerColor = !color;
 
@@ -5700,7 +5602,7 @@ namespace chess
         return allBlockers;
     }
 
-    [[nodiscard]] Position Position::afterMove(Move move) const
+    [[nodiscard]] inline Position Position::afterMove(Move move) const
     {
         Position cpy(*this);
         auto pc = cpy.doMove(move);
@@ -5711,7 +5613,7 @@ namespace chess
         return cpy;
     }
 
-    [[nodiscard]] FORCEINLINE bool Position::isEpPossible(Square epSquare, Color sideToMove) const
+    [[nodiscard]] inline bool Position::isEpPossible(Square epSquare, Color sideToMove) const
     {
         const Bitboard pawnsAttackingEpSquare =
             bb::pawnAttacks(Bitboard::square(epSquare), !sideToMove)
@@ -5725,7 +5627,7 @@ namespace chess
         return isEpPossibleColdPath(epSquare, pawnsAttackingEpSquare, sideToMove);
     }
 
-    [[nodiscard]] NOINLINE bool Position::isEpPossibleColdPath(Square epSquare, Bitboard pawnsAttackingEpSquare, Color sideToMove) const
+    [[nodiscard]] inline bool Position::isEpPossibleColdPath(Square epSquare, Bitboard pawnsAttackingEpSquare, Color sideToMove) const
     {
         // only set m_epSquare when it matters, ie. when
         // the opposite side can actually capture
@@ -5772,7 +5674,7 @@ namespace chess
         return false;
     }
 
-    void Position::nullifyEpSquareIfNotPossible()
+    inline void Position::nullifyEpSquareIfNotPossible()
     {
         if (m_epSquare != Square::none() && !isEpPossible(m_epSquare, m_sideToMove))
         {
@@ -5782,12 +5684,12 @@ namespace chess
 
     namespace uci
     {
-        [[nodiscard]] std::string moveToUci(const Position& pos, const Move& move);
-        [[nodiscard]] Move uciToMove(const Position& pos, std::string_view sv);
+        [[nodiscard]] inline std::string moveToUci(const Position& pos, const Move& move);
+        [[nodiscard]] inline Move uciToMove(const Position& pos, std::string_view sv);
 
-        [[nodiscard]] std::optional<Move> tryUciToMove(const Position& pos, std::string_view sv);
+        [[nodiscard]] inline std::optional<Move> tryUciToMove(const Position& pos, std::string_view sv);
 
-        [[nodiscard]] std::string moveToUci(const Position& pos, const Move& move)
+        [[nodiscard]] inline std::string moveToUci(const Position& pos, const Move& move)
         {
             std::string s;
 
@@ -5814,7 +5716,7 @@ namespace chess
             return s;
         }
 
-        [[nodiscard]] Move uciToMove(const Position& pos, std::string_view sv)
+        [[nodiscard]] inline Move uciToMove(const Position& pos, std::string_view sv)
         {
             const Square from = parser_bits::parseSquare(sv.data());
             const Square to = parser_bits::parseSquare(sv.data() + 2);
@@ -5850,7 +5752,7 @@ namespace chess
             }
         }
 
-        [[nodiscard]] std::optional<Move> tryUciToMove(const Position& pos, std::string_view sv)
+        [[nodiscard]] inline std::optional<Move> tryUciToMove(const Position& pos, std::string_view sv)
         {
             if (sv.size() < 4 || sv.size() > 5)
             {
@@ -6300,7 +6202,7 @@ namespace binpack
         };
 
 
-        [[nodiscard]] chess::Position pos_from_packed_sfen(const PackedSfen& sfen)
+        [[nodiscard]] inline chess::Position pos_from_packed_sfen(const PackedSfen& sfen)
         {
             SfenPacker packer;
             auto& stream = packer.stream;
@@ -6655,14 +6557,12 @@ namespace binpack
                 if (!occupied.isSet(sqForward))
                 {
                     destinations |= sqForward;
-
-                    const chess::Square sqForward2 = sqForward + forward;
                     if (
                         from.rank() == startRank
-                        && !occupied.isSet(sqForward2)
+                        && !occupied.isSet(sqForward + forward)
                         )
                     {
-                        destinations |= sqForward2;
+                        destinations |= sqForward + forward;
                     }
                 }
 
@@ -6845,13 +6745,12 @@ namespace binpack
                 {
                     destinations |= sqForward;
 
-                    const chess::Square sqForward2 = sqForward + forward;
                     if (
                         move.from.rank() == startRank
-                        && !occupied.isSet(sqForward2)
+                        && !occupied.isSet(sqForward + forward)
                         )
                     {
-                        destinations |= sqForward2;
+                        destinations |= sqForward + forward;
                     }
                 }
 
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 530c660b..99a783bb 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -133,9 +133,12 @@ namespace Learner
         {
             case SfenOutputType::Bin:
                 return std::make_unique<BinSfenOutputStream>(filename);
-            default:
+            case SfenOutputType::Binpack:
                 return std::make_unique<BinpackSfenOutputStream>(filename);
         }
+
+        assert(false);
+        return nullptr;
     }
 
     // Helper class for exporting Sfen
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 15f0825d..7cc04406 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -30,6 +30,8 @@
 #include "learn.h"
 #include "multi_think.h"
 
+#include "../extra/nnue_data_binpack_format.h"
+
 #include <chrono>
 #include <climits>
 #include <cmath>    // std::exp(),std::pow(),std::log()
@@ -85,8 +87,8 @@ namespace Learner
     static double dest_score_min_value = 0.0;
     static double dest_score_max_value = 1.0;
 
-    // Assume teacher signals are the scores of deep searches, 
-    // and convert them into winning probabilities in the trainer. 
+    // Assume teacher signals are the scores of deep searches,
+    // and convert them into winning probabilities in the trainer.
     // Sometimes we want to use the winning probabilities in the training
     // data directly. In those cases, we set false to this variable.
     static bool convert_teacher_signal_to_winning_probability = true;
@@ -125,19 +127,19 @@ namespace Learner
     // A function that converts the evaluation value to the winning rate [0,1]
     double winning_percentage(double value, int ply)
     {
-        if (use_wdl) 
+        if (use_wdl)
         {
             return winning_percentage_wdl(value, ply);
         }
-        else 
+        else
         {
             return winning_percentage(value);
         }
     }
 
     double calc_cross_entropy_of_winning_percentage(
-        double deep_win_rate, 
-        double shallow_eval, 
+        double deep_win_rate,
+        double shallow_eval,
         int ply)
     {
         const double p = deep_win_rate;
@@ -146,8 +148,8 @@ namespace Learner
     }
 
     double calc_d_cross_entropy_of_winning_percentage(
-        double deep_win_rate, 
-        double shallow_eval, 
+        double deep_win_rate,
+        double shallow_eval,
         int ply)
     {
         constexpr double epsilon = 0.000001;
@@ -158,7 +160,7 @@ namespace Learner
         const double y2 = calc_cross_entropy_of_winning_percentage(
             deep_win_rate, shallow_eval + epsilon, ply);
 
-        // Divide by the winning_probability_coefficient to 
+        // Divide by the winning_probability_coefficient to
         // match scale with the sigmoidal win rate
         return ((y2 - y1) / epsilon) / winning_probability_coefficient;
     }
@@ -195,7 +197,7 @@ namespace Learner
         const double scaled_teacher_signal = get_scaled_signal(teacher_signal);
 
         double p = scaled_teacher_signal;
-        if (convert_teacher_signal_to_winning_probability) 
+        if (convert_teacher_signal_to_winning_probability)
         {
             p = winning_percentage(scaled_teacher_signal, ply);
         }
@@ -217,7 +219,7 @@ namespace Learner
 
     double calculate_t(int game_result)
     {
-        // Use 1 as the correction term if the expected win rate is 1, 
+        // Use 1 as the correction term if the expected win rate is 1,
         // 0 if you lose, and 0.5 if you draw.
         // game_result = 1,0,-1 so add 1 and divide by 2.
         const double t = double(game_result + 1) * 0.5;
@@ -235,13 +237,13 @@ namespace Learner
         const double lambda = calculate_lambda(teacher_signal);
 
         double grad;
-        if (use_wdl) 
+        if (use_wdl)
         {
             const double dce_p = calc_d_cross_entropy_of_winning_percentage(p, shallow, psv.gamePly);
             const double dce_t = calc_d_cross_entropy_of_winning_percentage(t, shallow, psv.gamePly);
             grad = lambda * dce_p + (1.0 - lambda) * dce_t;
         }
-        else 
+        else
         {
             // Use the actual win rate as a correction term.
             // This is the idea of ​​elmo (WCSC27), modern O-parts.
@@ -252,18 +254,18 @@ namespace Learner
     }
 
     // Calculate cross entropy during learning
-    // The individual cross entropy of the win/loss term and win 
-    // rate term of the elmo expression is returned 
+    // The individual cross entropy of the win/loss term and win
+    // rate term of the elmo expression is returned
     // to the arguments cross_entropy_eval and cross_entropy_win.
     void calc_cross_entropy(
-        Value teacher_signal, 
-        Value shallow, 
+        Value teacher_signal,
+        Value shallow,
         const PackedSfenValue& psv,
-        double& cross_entropy_eval, 
-        double& cross_entropy_win, 
+        double& cross_entropy_eval,
+        double& cross_entropy_win,
         double& cross_entropy,
-        double& entropy_eval, 
-        double& entropy_win, 
+        double& entropy_eval,
+        double& entropy_win,
         double& entropy)
     {
         // Teacher winning probability.
@@ -292,24 +294,133 @@ namespace Learner
     }
 
     // Other objective functions may be considered in the future...
-    double calc_grad(Value shallow, const PackedSfenValue& psv) 
+    double calc_grad(Value shallow, const PackedSfenValue& psv)
     {
         return calc_grad((Value)psv.score, shallow, psv);
     }
 
+    struct BasicSfenInputStream
+    {
+        virtual std::optional<PackedSfenValue> next() = 0;
+        virtual bool eof() const = 0;
+        virtual ~BasicSfenInputStream() {}
+    };
+
+    struct BinSfenInputStream : BasicSfenInputStream
+    {
+        static constexpr auto openmode = ios::in | ios::binary;
+        static inline const std::string extension = "bin";
+
+        BinSfenInputStream(std::string filename) :
+            m_stream(filename, openmode),
+            m_eof(!m_stream)
+        {
+        }
+
+        std::optional<PackedSfenValue> next() override
+        {
+            PackedSfenValue e;
+            if(m_stream.read(reinterpret_cast<char*>(&e), sizeof(PackedSfenValue)))
+            {
+                return e;
+            }
+            else
+            {
+                m_eof = true;
+                return std::nullopt;
+            }
+        }
+
+        bool eof() const override
+        {
+            return m_eof;
+        }
+
+        ~BinSfenInputStream() override {}
+
+    private:
+        fstream m_stream;
+        bool m_eof;
+    };
+
+    struct BinpackSfenInputStream : BasicSfenInputStream
+    {
+        static constexpr auto openmode = ios::in | ios::binary;
+        static inline const std::string extension = "binpack";
+
+        BinpackSfenInputStream(std::string filename) :
+            m_stream(filename, openmode),
+            m_eof(!m_stream.hasNext())
+        {
+        }
+
+        std::optional<PackedSfenValue> next() override
+        {
+            static_assert(sizeof(binpack::nodchip::PackedSfenValue) == sizeof(PackedSfenValue));
+
+            if (!m_stream.hasNext())
+            {
+                m_eof = true;
+                return std::nullopt;
+            }
+
+            auto training_data_entry = m_stream.next();
+            auto v = binpack::trainingDataEntryToPackedSfenValue(training_data_entry);
+            PackedSfenValue psv;
+            // same layout, different types. One is from generic library.
+            std::memcpy(&psv, &v, sizeof(PackedSfenValue));
+
+            return psv;
+        }
+
+        bool eof() const override
+        {
+            return m_eof;
+        }
+
+        ~BinpackSfenInputStream() override {}
+
+    private:
+        binpack::CompressedTrainingDataEntryReader m_stream;
+        bool m_eof;
+    };
+
+    static bool ends_with(const std::string& lhs, const std::string& end)
+    {
+        if (end.size() > lhs.size()) return false;
+
+        return std::equal(end.rbegin(), end.rend(), lhs.rbegin());
+    }
+
+    static bool has_extension(const std::string& filename, const std::string& extension)
+    {
+        return ends_with(filename, "." + extension);
+    }
+
+    static std::unique_ptr<BasicSfenInputStream> open_sfen_input_file(const std::string& filename)
+    {
+        if (has_extension(filename, BinSfenInputStream::extension))
+            return std::make_unique<BinSfenInputStream>(filename);
+        else if (has_extension(filename, BinpackSfenInputStream::extension))
+            return std::make_unique<BinpackSfenInputStream>(filename);
+
+        assert(false);
+        return nullptr;
+    }
+
     // Sfen reader
     struct SfenReader
     {
         // Number of phases used for calculation such as mse
         // mini-batch size = 1M is standard, so 0.2% of that should be negligible in terms of time.
-        // Since search() is performed with depth = 1 in calculation of 
+        // Since search() is performed with depth = 1 in calculation of
         // move match rate, simple comparison is not possible...
         static constexpr uint64_t sfen_for_mse_size = 2000;
 
         // Number of phases buffered by each thread 0.1M phases. 4M phase at 40HT
         static constexpr size_t THREAD_BUFFER_SIZE = 10 * 1000;
 
-        // Buffer for reading files (If this is made larger, 
+        // Buffer for reading files (If this is made larger,
         // the shuffle becomes larger and the phases may vary.
         // If it is too large, the memory consumption will increase.
         // SFEN_READ_SIZE is a multiple of THREAD_BUFFER_SIZE.
@@ -322,7 +433,7 @@ namespace Learner
 
         // Do not use std::random_device().
         // Because it always the same integers on MinGW.
-        SfenReader(int thread_num) : 
+        SfenReader(int thread_num) :
             prng(std::chrono::system_clock::now().time_since_epoch().count())
         {
             packed_sfens.resize(thread_num);
@@ -369,13 +480,15 @@ namespace Learner
 
         void read_validation_set(const string& file_name, int eval_limit)
         {
-            ifstream input(file_name, ios::binary);
+            auto input = open_sfen_input_file(file_name);
 
-            while (input)
+            while(!input->eof())
             {
-                PackedSfenValue p;
-                if (input.read(reinterpret_cast<char*>(&p), sizeof(PackedSfenValue)))
+                std::optional<PackedSfenValue> p_opt = input->next();
+                if (p_opt.has_value())
                 {
+                    auto& p = *p_opt;
+
                     if (eval_limit < abs(p.score))
                         continue;
 
@@ -398,7 +511,7 @@ namespace Learner
             // then retrieve one and return it.
             auto& thread_ps = packed_sfens[thread_id];
 
-            // Fill the read buffer if there is no remaining buffer, 
+            // Fill the read buffer if there is no remaining buffer,
             // but if it doesn't even exist, finish.
             // If the buffer is empty, fill it.
             if ((thread_ps == nullptr || thread_ps->empty())
@@ -406,7 +519,7 @@ namespace Learner
                 return false;
 
             // read_to_thread_buffer_impl() returned true,
-            // Since the filling of the thread buffer with the 
+            // Since the filling of the thread buffer with the
             // phase has been completed successfully
             // thread_ps->rbegin() is alive.
 
@@ -458,33 +571,42 @@ namespace Learner
         // Start a thread that loads the phase file in the background.
         void start_file_read_worker()
         {
-            file_worker_thread = std::thread([&] { 
-                this->file_read_worker(); 
+            file_worker_thread = std::thread([&] {
+                this->file_read_worker();
                 });
         }
 
         void file_read_worker()
         {
             auto open_next_file = [&]() {
-                if (fs.is_open())
-                    fs.close();
-
                 // no more
-                if (filenames.empty())
-                    return false;
+                for(;;)
+                {
+                    sfen_input_stream.reset();
 
-                // Get the next file name.
-                string filename = filenames.back();
-                filenames.pop_back();
+                    if (filenames.empty())
+                        return false;
 
-                fs.open(filename, ios::in | ios::binary);
-                cout << "open filename = " << filename << endl;
+                    // Get the next file name.
+                    string filename = filenames.back();
+                    filenames.pop_back();
 
-                assert(fs);
+                    sfen_input_stream = open_sfen_input_file(filename);
+                    cout << "open filename = " << filename << endl;
 
-                return true;
+                    // in case the file is empty or was deleted.
+                    if (!sfen_input_stream->eof())
+                        return true;
+                }
             };
 
+            if (sfen_input_stream == nullptr && !open_next_file())
+            {
+                cout << "..end of files." << endl;
+                end_of_files = true;
+                return;
+            }
+
             while (true)
             {
                 // Wait for the buffer to run out.
@@ -501,10 +623,10 @@ namespace Learner
                 // Read from the file into the file buffer.
                 while (sfens.size() < SFEN_READ_SIZE)
                 {
-                    PackedSfenValue p;
-                    if (fs.read(reinterpret_cast<char*>(&p), sizeof(PackedSfenValue)))
+                    std::optional<PackedSfenValue> p = sfen_input_stream->next();
+                    if (p.has_value())
                     {
-                        sfens.push_back(p);
+                        sfens.push_back(*p);
                     }
                     else if(!open_next_file())
                     {
@@ -535,8 +657,8 @@ namespace Learner
                     auto buf = std::make_unique<PSVector>();
                     buf->resize(THREAD_BUFFER_SIZE);
                     memcpy(
-                        buf->data(), 
-                        &sfens[i * THREAD_BUFFER_SIZE], 
+                        buf->data(),
+                        &sfens[i * THREAD_BUFFER_SIZE],
                         sizeof(PackedSfenValue) * THREAD_BUFFER_SIZE);
 
                     buffers.emplace_back(std::move(buf));
@@ -545,7 +667,7 @@ namespace Learner
                 {
                     std::unique_lock<std::mutex> lk(mutex);
 
-                    // The mutex lock is required because the 
+                    // The mutex lock is required because the%
                     // contents of packed_sfens_pool are changed.
 
                     for (auto& buf : buffers)
@@ -600,7 +722,7 @@ namespace Learner
         atomic<bool> end_of_files;
 
         // handle of sfen file
-        std::fstream fs;
+        std::unique_ptr<BasicSfenInputStream> sfen_input_stream;
 
         // sfen for each thread
         // (When the thread is used up, the thread should call delete to release it.)
@@ -621,9 +743,9 @@ namespace Learner
     // Class to generate sfen with multiple threads
     struct LearnerThink : public MultiThink
     {
-        LearnerThink(SfenReader& sr_) : 
-            sr(sr_), 
-            stop_flag(false), 
+        LearnerThink(SfenReader& sr_) :
+            sr(sr_),
+            stop_flag(false),
             save_only_once(false)
         {
             learn_sum_cross_entropy_eval = 0.0;
@@ -644,9 +766,9 @@ namespace Learner
         virtual void thread_worker(size_t thread_id);
 
         // Start a thread that loads the phase file in the background.
-        void start_file_read_worker() 
-        { 
-            sr.start_file_read_worker(); 
+        void start_file_read_worker()
+        {
+            sr.start_file_read_worker();
         }
 
         Value get_shallow_value(Position& task_pos);
@@ -674,7 +796,7 @@ namespace Learner
         // Option not to learn kk/kkp/kpp/kppp
         std::array<bool, 4> freeze;
 
-        // If the absolute value of the evaluation value of the deep search 
+        // If the absolute value of the evaluation value of the deep search
         // of the teacher phase exceeds this value, discard the teacher phase.
         int eval_limit;
 
@@ -742,7 +864,7 @@ namespace Learner
 
     void LearnerThink::calc_loss(size_t thread_id, uint64_t done)
     {
-        // There is no point in hitting the replacement table, 
+        // There is no point in hitting the replacement table,
         // so at this timing the generation of the replacement table is updated.
         // It doesn't matter if you have disabled the substitution table.
         TT.new_search();
@@ -766,7 +888,7 @@ namespace Learner
         atomic<double> sum_norm;
         sum_norm = 0;
 
-        // The number of times the pv first move of deep 
+        // The number of times the pv first move of deep
         // search matches the pv first move of search(1).
         atomic<int> move_accord_count;
         move_accord_count = 0;
@@ -778,7 +900,7 @@ namespace Learner
         pos.set(StartFEN, false, &si, th);
         std::cout << "hirate eval = " << Eval::evaluate(pos);
 
-        // It's better to parallelize here, but it's a bit 
+        // It's better to parallelize here, but it's a bit
         // troublesome because the search before slave has not finished.
         // I created a mechanism to call task, so I will use it.
 
@@ -792,7 +914,7 @@ namespace Learner
         {
             // Assign work to each thread using TaskDispatcher.
             // A task definition for that.
-            // It is not possible to capture pos used in ↑, 
+            // It is not possible to capture pos used in ↑,
             // so specify the variables you want to capture one by one.
             auto task =
                 [
@@ -823,7 +945,7 @@ namespace Learner
                 // Evaluation value of deep search
                 auto deep_value = (Value)ps.score;
 
-                // Note) This code does not consider when 
+                // Note) This code does not consider when
                 //       eval_limit is specified in the learn command.
 
                 // --- calculation of cross entropy
@@ -834,14 +956,14 @@ namespace Learner
                 double test_cross_entropy_eval, test_cross_entropy_win, test_cross_entropy;
                 double test_entropy_eval, test_entropy_win, test_entropy;
                 calc_cross_entropy(
-                    deep_value, 
-                    shallow_value, 
-                    ps, 
-                    test_cross_entropy_eval, 
-                    test_cross_entropy_win, 
-                    test_cross_entropy, 
-                    test_entropy_eval, 
-                    test_entropy_win, 
+                    deep_value,
+                    shallow_value,
+                    ps,
+                    test_cross_entropy_eval,
+                    test_cross_entropy_win,
+                    test_cross_entropy,
+                    test_entropy_eval,
+                    test_entropy_win,
                     test_entropy);
 
                 // The total cross entropy need not be abs() by definition.
@@ -878,9 +1000,9 @@ namespace Learner
         latest_loss_sum += test_sum_cross_entropy - test_sum_entropy;
         latest_loss_count += sr.sfen_for_mse.size();
 
-        // learn_cross_entropy may be called train cross 
+        // learn_cross_entropy may be called train cross
         // entropy in the world of machine learning,
-        // When omitting the acronym, it is nice to be able to 
+        // When omitting the acronym, it is nice to be able to
         // distinguish it from test cross entropy(tce) by writing it as lce.
 
         if (sr.sfen_for_mse.size() && done)
@@ -907,7 +1029,7 @@ namespace Learner
             }
             cout << endl;
         }
-        else 
+        else
         {
             cout << "Error! : sr.sfen_for_mse.size() = " << sr.sfen_for_mse.size() << " ,  done = " << done << endl;
         }
@@ -977,7 +1099,7 @@ namespace Learner
                     {
                         sr.save_count = 0;
 
-                        // During this time, as the gradient calculation proceeds, 
+                        // During this time, as the gradient calculation proceeds,
                         // the value becomes too large and I feel annoyed, so stop other threads.
                         const bool converged = save();
                         if (converged)
@@ -1007,11 +1129,11 @@ namespace Learner
                         sr.last_done = sr.total_done;
                     }
 
-                    // Next time, I want you to do this series of 
+                    // Next time, I want you to do this series of
                     // processing again when you process only mini_batch_size.
                     sr.next_update_weights += mini_batch_size;
 
-                    // Since I was waiting for the update of this 
+                    // Since I was waiting for the update of this
                     // sr.next_update_weights except the main thread,
                     // Once this value is updated, it will start moving again.
                 }
@@ -1048,16 +1170,16 @@ namespace Learner
             if (pos.set_from_packed_sfen(ps.sfen, &si, th, mirror) != 0)
             {
                 // I got a strange sfen. Should be debugged!
-                // Since it is an illegal sfen, it may not be 
+                // Since it is an illegal sfen, it may not be
                 // displayed with pos.sfen(), but it is better than not.
                 cout << "Error! : illigal packed sfen = " << pos.fen() << endl;
                 goto RETRY_READ;
             }
 
             // There is a possibility that all the pieces are blocked and stuck.
-            // Also, the declaration win phase is excluded from 
+            // Also, the declaration win phase is excluded from
             // learning because you cannot go to leaf with PV moves.
-            // (shouldn't write out such teacher aspect itself, 
+            // (shouldn't write out such teacher aspect itself,
             // but may have written it out with an old generation routine)
             // Skip the position if there are no legal moves (=checkmated or stalemate).
             if (MoveList<LEGAL>(pos).size() == 0)
@@ -1073,7 +1195,7 @@ namespace Learner
             const auto deep_value = (Value)ps.score;
 
             // I feel that the mini batch has a better gradient.
-            // Go to the leaf node as it is, add only to the gradient array, 
+            // Go to the leaf node as it is, add only to the gradient array,
             // and later try AdaGrad at the time of rmse aggregation.
 
             const auto rootColor = pos.side_to_move();
@@ -1088,30 +1210,30 @@ namespace Learner
             auto pos_add_grad = [&]() {
                 // Use the value of evaluate in leaf as shallow_value.
                 // Using the return value of qsearch() as shallow_value,
-                // If PV is interrupted in the middle, the phase where 
-                // evaluate() is called to calculate the gradient, 
-                // and I don't think this is a very desirable property, 
+                // If PV is interrupted in the middle, the phase where
+                // evaluate() is called to calculate the gradient,
+                // and I don't think this is a very desirable property,
                 // as the aspect that gives that gradient will be different.
-                // I have turned off the substitution table, but since 
+                // I have turned off the substitution table, but since
                 // the pv array has not been updated due to one stumbling block etc...
 
-                const Value shallow_value = 
-                    (rootColor == pos.side_to_move()) 
-                    ? Eval::evaluate(pos) 
+                const Value shallow_value =
+                    (rootColor == pos.side_to_move())
+                    ? Eval::evaluate(pos)
                     : -Eval::evaluate(pos);
 
                 // Calculate loss for training data
                 double learn_cross_entropy_eval, learn_cross_entropy_win, learn_cross_entropy;
                 double learn_entropy_eval, learn_entropy_win, learn_entropy;
                 calc_cross_entropy(
-                    deep_value, 
-                    shallow_value, 
-                    ps, 
-                    learn_cross_entropy_eval, 
-                    learn_cross_entropy_win, 
-                    learn_cross_entropy, 
-                    learn_entropy_eval, 
-                    learn_entropy_win, 
+                    deep_value,
+                    shallow_value,
+                    ps,
+                    learn_cross_entropy_eval,
+                    learn_cross_entropy_win,
+                    learn_cross_entropy,
+                    learn_entropy_eval,
+                    learn_entropy_win,
                     learn_entropy);
 
                 learn_sum_cross_entropy_eval += learn_cross_entropy_eval;
@@ -1154,7 +1276,7 @@ namespace Learner
                 Eval::NNUE::update_eval(pos);
             }
 
-            if (illegal_move) 
+            if (illegal_move)
             {
                 sync_cout << "An illegal move was detected... Excluded the position from the learning data..." << sync_endl;
                 continue;
@@ -1182,12 +1304,12 @@ namespace Learner
             // Do not dig a subfolder because I want to save it only once.
             Eval::save_eval("");
         }
-        else if (is_final) 
+        else if (is_final)
         {
             Eval::save_eval("final");
             return true;
         }
-        else 
+        else
         {
             static int dir_number = 0;
             const std::string dir_name = std::to_string(dir_number++);
@@ -1199,27 +1321,27 @@ namespace Learner
                 latest_loss_sum = 0.0;
                 latest_loss_count = 0;
                 cout << "loss: " << latest_loss;
-                if (latest_loss < best_loss) 
+                if (latest_loss < best_loss)
                 {
                     cout << " < best (" << best_loss << "), accepted" << endl;
                     best_loss = latest_loss;
                     best_nn_directory = Path::Combine((std::string)Options["EvalSaveDir"], dir_name);
                     trials = newbob_num_trials;
                 }
-                else 
+                else
                 {
                     cout << " >= best (" << best_loss << "), rejected" << endl;
-                    if (best_nn_directory.empty()) 
+                    if (best_nn_directory.empty())
                     {
                         cout << "WARNING: no improvement from initial model" << endl;
                     }
-                    else 
+                    else
                     {
                         cout << "restoring parameters from " << best_nn_directory << endl;
                         Eval::NNUE::RestoreParameters(best_nn_directory);
                     }
 
-                    if (--trials > 0 && !is_final) 
+                    if (--trials > 0 && !is_final)
                     {
                         cout
                             << "reducing learning rate scale from " << newbob_scale
@@ -1230,8 +1352,8 @@ namespace Learner
                         Eval::NNUE::SetGlobalLearningRateScale(newbob_scale);
                     }
                 }
-                
-                if (trials == 0) 
+
+                if (trials == 0)
                 {
                     cout << "converged" << endl;
                     return true;
@@ -1247,9 +1369,9 @@ namespace Learner
     // sfen_file_streams: fstream of each teacher phase file
     // sfen_count_in_file: The number of teacher positions present in each file.
     void shuffle_write(
-        const string& output_file_name, 
-        PRNG& prng, 
-        vector<fstream>& sfen_file_streams, 
+        const string& output_file_name,
+        PRNG& prng,
+        vector<fstream>& sfen_file_streams,
         vector<uint64_t>& sfen_count_in_file)
     {
         uint64_t total_sfen_count = 0;
@@ -1323,7 +1445,7 @@ namespace Learner
         // Temporary file is written to tmp/ folder for each buffer_size phase.
         // For example, if buffer_size = 20M, you need a buffer of 20M*40bytes = 800MB.
         // In a PC with a small memory, it would be better to reduce this.
-        // However, if the number of files increases too much, 
+        // However, if the number of files increases too much,
         // it will not be possible to open at the same time due to OS restrictions.
         // There should have been a limit of 512 per process on Windows, so you can open here as 500,
         // The current setting is 500 files x 20M = 10G = 10 billion phases.
@@ -1377,7 +1499,7 @@ namespace Learner
 
             // Read in units of sizeof(PackedSfenValue),
             // Ignore the last remaining fraction. (Fails in fs.read, so exit while)
-            // (The remaining fraction seems to be half-finished data 
+            // (The remaining fraction seems to be half-finished data
             // that was created because it was stopped halfway during teacher generation.)
         }
 
@@ -1385,14 +1507,14 @@ namespace Learner
             write_buffer(buf_write_marker);
 
         // Only shuffled files have been written write_file_count.
-        // As a second pass, if you open all of them at the same time, 
+        // As a second pass, if you open all of them at the same time,
         // select one at random and load one phase at a time
         // Now you have shuffled.
 
-        // Original file for shirt full + tmp file + file to write 
+        // Original file for shirt full + tmp file + file to write
         // requires 3 times the storage capacity of the original file.
         // 1 billion SSD is not enough for shuffling because it is 400GB for 10 billion phases.
-        // If you want to delete (or delete by hand) the 
+        // If you want to delete (or delete by hand) the
         // original file at this point after writing to tmp,
         // The storage capacity is about twice that of the original file.
         // So, maybe we should have an option to delete the original file.
@@ -1477,11 +1599,11 @@ namespace Learner
 
         std::cout << "write : " << output_file_name << endl;
 
-        // If the file to be written exceeds 2GB, it cannot be 
+        // If the file to be written exceeds 2GB, it cannot be
         // written in one shot with fstream::write, so use wrapper.
         write_memory_to_file(
-            output_file_name, 
-            (void*)&buf[0], 
+            output_file_name,
+            (void*)&buf[0],
             sizeof(PackedSfenValue) * buf.size());
 
         std::cout << "..shuffle_on_memory done." << std::endl;
@@ -1521,10 +1643,10 @@ namespace Learner
         uint64_t buffer_size = 20000000;
         // fast shuffling assuming each file is shuffled
         bool shuffle_quick = false;
-        // A function to read the entire file in memory and shuffle it. 
+        // A function to read the entire file in memory and shuffle it.
         // (Requires file size memory)
         bool shuffle_on_memory = false;
-        // Conversion of packed sfen. In plain, it consists of sfen(string), 
+        // Conversion of packed sfen. In plain, it consists of sfen(string),
         // evaluation value (integer), move (eg 7g7f, string), result (loss-1, win 1, draw 0)
         bool use_convert_plain = false;
         // convert plain format teacher to Yaneura King's bin
@@ -1541,15 +1663,15 @@ namespace Learner
         // File name to write in those cases (default is "shuffled_sfen.bin")
         string output_file_name = "shuffled_sfen.bin";
 
-        // If the absolute value of the evaluation value 
-        // in the deep search of the teacher phase exceeds this value, 
+        // If the absolute value of the evaluation value
+        // in the deep search of the teacher phase exceeds this value,
         // that phase is discarded.
         int eval_limit = 32000;
 
         // Flag to save the evaluation function file only once near the end.
         bool save_only_once = false;
 
-        // Shuffle about what you are pre-reading on the teacher aspect. 
+        // Shuffle about what you are pre-reading on the teacher aspect.
         // (Shuffle of about 10 million phases)
         // Turn on if you want to pass a pre-shuffled file.
         bool no_shuffle = false;
@@ -1559,8 +1681,8 @@ namespace Learner
         ELMO_LAMBDA2 = 0.33;
         ELMO_LAMBDA_LIMIT = 32000;
 
-        // Discount rate. If this is set to a value other than 0, 
-        // the slope will be added even at other than the PV termination. 
+        // Discount rate. If this is set to a value other than 0,
+        // the slope will be added even at other than the PV termination.
         // (At that time, apply this discount rate)
         double discount_rate = 0;
 
@@ -1620,18 +1742,18 @@ namespace Learner
             else if (option == "eta2_epoch") is >> eta2_epoch;
 
             // Accept also the old option name.
-            else if (option == "use_draw_in_training" 
-                  || option == "use_draw_games_in_training") 
+            else if (option == "use_draw_in_training"
+                  || option == "use_draw_games_in_training")
                 is >> use_draw_games_in_training;
 
             // Accept also the old option name.
-            else if (option == "use_draw_in_validation" 
-                  || option == "use_draw_games_in_validation") 
+            else if (option == "use_draw_in_validation"
+                  || option == "use_draw_games_in_validation")
                 is >> use_draw_games_in_validation;
 
             // Accept also the old option name.
-            else if (option == "use_hash_in_training" 
-                  || option == "skip_duplicated_positions_in_training") 
+            else if (option == "use_hash_in_training"
+                  || option == "skip_duplicated_positions_in_training")
                 is >> skip_duplicated_positions_in_training;
 
             else if (option == "winning_probability_coefficient") is >> winning_probability_coefficient;
@@ -1792,9 +1914,9 @@ namespace Learner
             Eval::init_NNUE();
             cout << "convert_bin_from_pgn-extract.." << endl;
             convert_bin_from_pgn_extract(
-                filenames, 
-                output_file_name, 
-                pgn_eval_side_to_move, 
+                filenames,
+                output_file_name,
+                pgn_eval_side_to_move,
                 convert_no_eval_fens_as_score_zero);
 
             return;
@@ -1808,7 +1930,7 @@ namespace Learner
         // Insert the file name for the number of loops.
         for (int i = 0; i < loop; ++i)
         {
-            // sfen reader, I'll read it in reverse 
+            // sfen reader, I'll read it in reverse
             // order so I'll reverse it here. I'm sorry.
             for (auto it = filenames.rbegin(); it != filenames.rend(); ++it)
             {
@@ -1891,12 +2013,12 @@ namespace Learner
 
         learn_think.mini_batch_size = mini_batch_size;
 
-        if (validation_set_file_name.empty()) 
+        if (validation_set_file_name.empty())
         {
             // Get about 10,000 data for mse calculation.
             sr.read_for_mse();
         }
-        else 
+        else
         {
             sr.read_validation_set(validation_set_file_name, eval_limit);
         }

From 585a5351bf1dee8c3fb56f74a53f3d035781189f Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 10 Sep 2020 00:42:24 +0200
Subject: [PATCH 245/583] Fix warnings.

---
 src/extra/nnue_data_binpack_format.h | 234 ++++++++-------------------
 1 file changed, 66 insertions(+), 168 deletions(-)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index bec0e9ad..9b7a868e 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -501,14 +501,14 @@ namespace chess
 
         [[nodiscard]] constexpr ValueType& operator[](const KeyType& dir)
         {
-            assert(ordinal(dir) < SizeV);
+            assert(static_cast<int>(ordinal(dir)) < static_cast<int>(SizeV));
 
             return elements[ordinal(dir)];
         }
 
         [[nodiscard]] constexpr const ValueType& operator[](const KeyType& dir) const
         {
-            assert(ordinal(dir) < SizeV);
+            assert(static_cast<int>(ordinal(dir)) < static_cast<int>(SizeV));
 
             return elements[ordinal(dir)];
         }
@@ -1141,9 +1141,9 @@ namespace chess
         {
         }
 
-        constexpr Offset(int files, int ranks) :
-            files(files),
-            ranks(ranks)
+        constexpr Offset(int files_, int ranks_) :
+            files(files_),
+            ranks(ranks_)
         {
         }
 
@@ -1328,7 +1328,7 @@ namespace chess
         [[nodiscard]] constexpr Color color() const
         {
             assert(isOk());
-            return !fromOrdinal<Color>(ordinal(rank()) + ordinal(file()) & 1);
+            return !fromOrdinal<Color>((ordinal(rank()) + ordinal(file())) & 1);
         }
 
         constexpr void flipVertically()
@@ -1887,11 +1887,11 @@ namespace chess
         {
         }
 
-        constexpr ReverseMove(const Move& move, Piece capturedPiece, Square oldEpSquare, CastlingRights oldCastlingRights) :
-            move(move),
-            capturedPiece(capturedPiece),
-            oldEpSquare(oldEpSquare),
-            oldCastlingRights(oldCastlingRights)
+        constexpr ReverseMove(const Move& move_, Piece capturedPiece_, Square oldEpSquare_, CastlingRights oldCastlingRights_) :
+            move(move_),
+            capturedPiece(capturedPiece_),
+            oldEpSquare(oldEpSquare_),
+            oldCastlingRights(oldCastlingRights_)
         {
         }
 
@@ -3100,13 +3100,13 @@ namespace chess
                 return bbs;
             }
 
-            [[nodiscard]] static Bitboard generateSliderPseudoAttacks(const std::array<Offset, 4> & offsets, Square fromSq)
+            [[nodiscard]] static Bitboard generateSliderPseudoAttacks(const std::array<Offset, 4> & offsets_, Square fromSq)
             {
                 assert(fromSq.isOk());
 
                 Bitboard bb{};
 
-                for (auto&& offset : offsets)
+                for (auto&& offset : offsets_)
                 {
                     SquareCoords fromSqC = fromSq.coords();
 
@@ -3370,32 +3370,32 @@ namespace chess
 
             static const EnumArray2<Square, Square, Bitboard> between = []()
             {
-                EnumArray2<Square, Square, Bitboard> between;
+                EnumArray2<Square, Square, Bitboard> between_;
 
                 for (Square s1 : values<Square>())
                 {
                     for (Square s2 : values<Square>())
                     {
-                        between[s1][s2] = generateBetween(s1, s2);
+                        between_[s1][s2] = generateBetween(s1, s2);
                     }
                 }
 
-                return between;
+                return between_;
             }();
 
             static const EnumArray2<Square, Square, Bitboard> line = []()
             {
-                EnumArray2<Square, Square, Bitboard> line;
+                EnumArray2<Square, Square, Bitboard> line_;
 
                 for (Square s1 : values<Square>())
                 {
                     for (Square s2 : values<Square>())
                     {
-                        line[s1][s2] = generateLine(s1, s2);
+                        line_[s1][s2] = generateLine(s1, s2);
                     }
                 }
 
-                return line;
+                return line_;
             }();
         }
 
@@ -4262,12 +4262,12 @@ namespace chess
             else if (move.type == MoveType::EnPassant)
             {
                 const Piece movedPiece = m_pieces[move.to];
-                const Piece capturedPiece(PieceType::Pawn, !movedPiece.color());
+                const Piece capturedPiece_(PieceType::Pawn, !movedPiece.color());
                 const Square capturedPieceSq(move.to.file(), move.from.rank());
 
                 m_pieces[move.to] = Piece::none();
                 m_pieces[move.from] = movedPiece;
-                m_pieces[capturedPieceSq] = capturedPiece;
+                m_pieces[capturedPieceSq] = capturedPiece_;
 
                 m_pieceBB[movedPiece] ^= move.from;
                 m_pieceBB[movedPiece] ^= move.to;
@@ -4276,14 +4276,14 @@ namespace chess
                 m_pieceBB[Piece::none()] ^= move.to;
 
                 // on ep move there are 3 squares involved
-                m_pieceBB[capturedPiece] ^= capturedPieceSq;
+                m_pieceBB[capturedPiece_] ^= capturedPieceSq;
                 m_pieceBB[Piece::none()] ^= capturedPieceSq;
 
                 m_piecesByColorBB[movedPiece.color()] ^= move.to;
                 m_piecesByColorBB[movedPiece.color()] ^= move.from;
-                m_piecesByColorBB[capturedPiece.color()] ^= capturedPieceSq;
+                m_piecesByColorBB[capturedPiece_.color()] ^= capturedPieceSq;
 
-                ++m_pieceCount[capturedPiece];
+                ++m_pieceCount[capturedPiece_];
                 --m_pieceCount[Piece::none()];
             }
             else // if (move.type == MoveType::Castle)
@@ -4565,9 +4565,6 @@ namespace chess
 
         [[nodiscard]] inline bool isCheckAfterMove(Move move) const;
 
-        // Checks whether ANY `move` is legal.
-        [[nodiscard]] inline bool isMoveLegal(Move move) const;
-
         [[nodiscard]] inline bool isPseudoLegalMoveLegal(Move move) const;
 
         [[nodiscard]] inline bool isMovePseudoLegal(Move move) const;
@@ -4806,7 +4803,7 @@ namespace chess
             }
         }
 
-        [[nodiscard]] FORCEINLINE constexpr std::uint8_t compressKing(const Position& position, Square sq, Piece piece)
+        [[nodiscard]] FORCEINLINE constexpr std::uint8_t compressKing(const Position& position, Square /* sq */, Piece piece)
         {
             const Color color = piece.color();
             const Color sideToMove = position.sideToMove();
@@ -4829,19 +4826,19 @@ namespace chess
     namespace detail::lookup
     {
         static constexpr EnumArray<PieceType, std::uint8_t(*)(const Position&, Square, Piece)> pieceCompressorFunc = []() {
-            EnumArray<PieceType, std::uint8_t(*)(const Position&, Square, Piece)> pieceCompressorFunc{};
+            EnumArray<PieceType, std::uint8_t(*)(const Position&, Square, Piece)> pieceCompressorFunc_{};
 
-            pieceCompressorFunc[PieceType::Knight] = detail::compressOrdinaryPiece;
-            pieceCompressorFunc[PieceType::Bishop] = detail::compressOrdinaryPiece;
-            pieceCompressorFunc[PieceType::Queen] = detail::compressOrdinaryPiece;
+            pieceCompressorFunc_[PieceType::Knight] = detail::compressOrdinaryPiece;
+            pieceCompressorFunc_[PieceType::Bishop] = detail::compressOrdinaryPiece;
+            pieceCompressorFunc_[PieceType::Queen] = detail::compressOrdinaryPiece;
 
-            pieceCompressorFunc[PieceType::Pawn] = detail::compressPawn;
-            pieceCompressorFunc[PieceType::Rook] = detail::compressRook;
-            pieceCompressorFunc[PieceType::King] = detail::compressKing;
+            pieceCompressorFunc_[PieceType::Pawn] = detail::compressPawn;
+            pieceCompressorFunc_[PieceType::Rook] = detail::compressRook;
+            pieceCompressorFunc_[PieceType::King] = detail::compressKing;
 
-            pieceCompressorFunc[PieceType::None] = [](const Position&, Square, Piece) -> std::uint8_t { /* should never happen */ return 0; };
+            pieceCompressorFunc_[PieceType::None] = [](const Position&, Square, Piece) -> std::uint8_t { /* should never happen */ return 0; };
 
-            return pieceCompressorFunc;
+            return pieceCompressorFunc_;
         }();
     }
 
@@ -5089,6 +5086,8 @@ namespace chess
                     king ^= occupiedChange;
                 }
             }
+            case PieceType::None:
+                assert(false);
             }
         }
 
@@ -5285,23 +5284,23 @@ namespace chess
     namespace detail::lookup
     {
         static constexpr EnumArray<Piece, char> fenPiece = []() {
-            EnumArray<Piece, char> fenPiece{};
+            EnumArray<Piece, char> fenPiece_{};
 
-            fenPiece[whitePawn] = 'P';
-            fenPiece[blackPawn] = 'p';
-            fenPiece[whiteKnight] = 'N';
-            fenPiece[blackKnight] = 'n';
-            fenPiece[whiteBishop] = 'B';
-            fenPiece[blackBishop] = 'b';
-            fenPiece[whiteRook] = 'R';
-            fenPiece[blackRook] = 'r';
-            fenPiece[whiteQueen] = 'Q';
-            fenPiece[blackQueen] = 'q';
-            fenPiece[whiteKing] = 'K';
-            fenPiece[blackKing] = 'k';
-            fenPiece[Piece::none()] = 'X';
+            fenPiece_[whitePawn] = 'P';
+            fenPiece_[blackPawn] = 'p';
+            fenPiece_[whiteKnight] = 'N';
+            fenPiece_[blackKnight] = 'n';
+            fenPiece_[whiteBishop] = 'B';
+            fenPiece_[blackBishop] = 'b';
+            fenPiece_[whiteRook] = 'R';
+            fenPiece_[blackRook] = 'r';
+            fenPiece_[whiteQueen] = 'Q';
+            fenPiece_[blackQueen] = 'q';
+            fenPiece_[whiteKing] = 'K';
+            fenPiece_[blackKing] = 'k';
+            fenPiece_[Piece::none()] = 'X';
 
-            return fenPiece;
+            return fenPiece_;
         }();
     }
 
@@ -5495,21 +5494,21 @@ namespace chess
     namespace detail::lookup
     {
         static constexpr EnumArray<Square, CastlingRights> preservedCastlingRights = []() {
-            EnumArray<Square, CastlingRights> preservedCastlingRights{};
-            for (CastlingRights& rights : preservedCastlingRights)
+            EnumArray<Square, CastlingRights> preservedCastlingRights_{};
+            for (CastlingRights& rights : preservedCastlingRights_)
             {
                 rights = ~CastlingRights::None;
             }
 
-            preservedCastlingRights[e1] = ~CastlingRights::White;
-            preservedCastlingRights[e8] = ~CastlingRights::Black;
+            preservedCastlingRights_[e1] = ~CastlingRights::White;
+            preservedCastlingRights_[e8] = ~CastlingRights::Black;
 
-            preservedCastlingRights[h1] = ~CastlingRights::WhiteKingSide;
-            preservedCastlingRights[a1] = ~CastlingRights::WhiteQueenSide;
-            preservedCastlingRights[h8] = ~CastlingRights::BlackKingSide;
-            preservedCastlingRights[a8] = ~CastlingRights::BlackQueenSide;
+            preservedCastlingRights_[h1] = ~CastlingRights::WhiteKingSide;
+            preservedCastlingRights_[a1] = ~CastlingRights::WhiteQueenSide;
+            preservedCastlingRights_[h8] = ~CastlingRights::BlackKingSide;
+            preservedCastlingRights_[a8] = ~CastlingRights::BlackQueenSide;
 
-            return preservedCastlingRights;
+            return preservedCastlingRights_;
         }();
     }
 
@@ -5687,8 +5686,6 @@ namespace chess
         [[nodiscard]] inline std::string moveToUci(const Position& pos, const Move& move);
         [[nodiscard]] inline Move uciToMove(const Position& pos, std::string_view sv);
 
-        [[nodiscard]] inline std::optional<Move> tryUciToMove(const Position& pos, std::string_view sv);
-
         [[nodiscard]] inline std::string moveToUci(const Position& pos, const Move& move)
         {
             std::string s;
@@ -5751,103 +5748,6 @@ namespace chess
                 }
             }
         }
-
-        [[nodiscard]] inline std::optional<Move> tryUciToMove(const Position& pos, std::string_view sv)
-        {
-            if (sv.size() < 4 || sv.size() > 5)
-            {
-                return std::nullopt;
-            }
-
-            const auto from = parser_bits::tryParseSquare(sv.substr(0, 2));
-            const auto to = parser_bits::tryParseSquare(sv.substr(2, 2));
-
-            Move move{};
-
-            if (!from.has_value() || !to.has_value())
-            {
-                return std::nullopt;
-            }
-
-            if (sv.size() == 5)
-            {
-                const auto promotedPieceType = fromChar<PieceType>(sv[4]);
-                if (!promotedPieceType.has_value())
-                {
-                    return std::nullopt;
-                }
-
-                if (
-                    *promotedPieceType != PieceType::Knight
-                    && *promotedPieceType != PieceType::Bishop
-                    && *promotedPieceType != PieceType::Rook
-                    && *promotedPieceType != PieceType::Queen
-                    )
-                {
-                    return std::nullopt;
-                }
-
-                move = Move::promotion(*from, *to, Piece(*promotedPieceType, pos.sideToMove()));
-            }
-            else // sv.size() == 4
-            {
-
-                if (
-                    pos.pieceAt(*from).type() == PieceType::King
-                    && std::abs(from->file() - to->file()) > 1
-                    )
-                {
-                    // uci king destinations are on files C or G.
-
-                    if (pos.sideToMove() == Color::White)
-                    {
-                        if (*from != e1)
-                        {
-                            return std::nullopt;
-                        }
-
-                        if (*to != c1 && *to != g1)
-                        {
-                            return std::nullopt;
-                        }
-                    }
-                    else
-                    {
-                        if (*from != e8)
-                        {
-                            return std::nullopt;
-                        }
-
-                        if (*to != c8 && *to != g8)
-                        {
-                            return std::nullopt;
-                        }
-                    }
-
-                    const CastleType castleType =
-                        (to->file() == fileG)
-                        ? CastleType::Short
-                        : CastleType::Long;
-
-                    move = Move::castle(castleType, pos.sideToMove());
-                }
-                else if (to == pos.epSquare())
-                {
-                    move = Move::enPassant(*from, *to);
-                }
-                else
-                {
-                    move = Move::normal(*from, *to);
-                }
-            }
-
-            if (!pos.isMoveLegal(move))
-            {
-                return std::nullopt;
-            }
-
-            return move;
-        }
     }
 }
 
@@ -6206,7 +6106,7 @@ namespace binpack
         {
             SfenPacker packer;
             auto& stream = packer.stream;
-            stream.set_data((uint8_t*)&sfen);
+            stream.set_data(const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(&sfen)));
 
             chess::Position pos{};
 
@@ -6450,11 +6350,11 @@ namespace binpack
         std::uint16_t numPlies;
         unsigned char* movetext;
 
-        PackedMoveScoreListReader(const TrainingDataEntry& entry, unsigned char* movetext, std::uint16_t numPlies) :
-            entry(entry),
-            movetext(movetext),
-            numPlies(numPlies),
-            m_lastScore(-entry.score)
+        PackedMoveScoreListReader(const TrainingDataEntry& entry_, unsigned char* movetext_, std::uint16_t numPlies_) :
+            entry(entry_),
+            numPlies(numPlies_),
+            movetext(movetext_),
+            m_lastScore(-entry_.score)
         {
 
         }
@@ -7247,7 +7147,6 @@ namespace binpack
 
     inline void convertBinToPlain(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
     {
-        constexpr std::size_t reportEveryNPositions = 100'000;
         constexpr std::size_t bufferSize = MiB;
 
         std::cout << "Converting " << inputPath << " to " << outputPath << '\n';
@@ -7300,7 +7199,6 @@ namespace binpack
 
     inline void convertPlainToBin(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
     {
-        constexpr std::size_t reportEveryNPositions = 100'000;
         constexpr std::size_t bufferSize = MiB;
 
         std::cout << "Compressing " << inputPath << " to " << outputPath << '\n';

From a7ca8265937f8ead43355284c608893cf68ffbb5 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 10 Sep 2020 01:02:56 +0200
Subject: [PATCH 246/583] MIT license/copyright notice in the library file.

---
 src/extra/nnue_data_binpack_format.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index 9b7a868e..5dd5819c 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -1,3 +1,29 @@
+/*
+
+Copyright 2020 Tomasz Sobczyk
+
+Permission is hereby granted, free of charge,
+to any person obtaining a copy of this software
+and associated documentation files (the "Software"),
+to deal in the Software without restriction,
+including without limitation the rights to use, copy,
+modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall
+be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH
+THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+*/
+
 #pragma once
 
 #include <cstdio>

From 53ad4d8b5613ff005d737d42b5ef25fcd88f38f9 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 10 Sep 2020 01:48:48 +0200
Subject: [PATCH 247/583] A speculative build fix for linux.

---
 src/extra/nnue_data_binpack_format.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index 5dd5819c..c86a55c2 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -40,7 +40,13 @@ THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include <cassert>
 #include <array>
 #include <immintrin.h>
+
+#ifdef linux
+#include <x86intrin.h>
+#else
 #include <intrin.h>
+#endif
+
 #include <nmmintrin.h>
 #include <limits>
 

From 7e6901af27effddcf75ac293377e7879eb2d517f Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 10 Sep 2020 12:36:39 +0200
Subject: [PATCH 248/583] Remove unused immintring. Include intrin.h only on
 some platforms, otherwise builtins are used.

---
 src/extra/nnue_data_binpack_format.h | 109 +--------------------------
 1 file changed, 4 insertions(+), 105 deletions(-)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index c86a55c2..3204b4b4 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -39,17 +39,11 @@ THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include <cstdio>
 #include <cassert>
 #include <array>
-#include <immintrin.h>
-
-#ifdef linux
-#include <x86intrin.h>
-#else
-#include <intrin.h>
-#endif
-
-#include <nmmintrin.h>
 #include <limits>
 
+#if (defined(_MSC_VER) || defined(__INTEL_COMPILER)) && !defined(__clang__)
+#include <intrin.h>
+#endif
 
 namespace chess
 {
@@ -177,87 +171,12 @@ namespace chess
     #endif
     }
 
-
-    template <typename IntT>
-    [[nodiscard]] constexpr IntT mulSaturate(IntT lhs, IntT rhs)
-    {
-        static_assert(std::is_unsigned_v<IntT>); // currently no support for signed
-
-    #if defined (_MSC_VER)
-
-        if (lhs == 0) return 0;
-
-        const IntT result = lhs * rhs;
-        return result / lhs == rhs ? result : std::numeric_limits<IntT>::max();
-
-    #elif defined (__GNUC__)
-
-        IntT result{};
-        return __builtin_mul_overflow(lhs, rhs, &result) ? std::numeric_limits<IntT>::max() : result;
-
-    #endif
-    }
-
-    template <typename IntT>
-    [[nodiscard]] constexpr IntT addSaturate(IntT lhs, IntT rhs)
-    {
-        static_assert(std::is_unsigned_v<IntT>); // currently no support for signed
-
-    #if defined (_MSC_VER)
-
-        const IntT result = lhs + rhs;
-        return result >= lhs ? result : std::numeric_limits<IntT>::max();
-
-    #elif defined (__GNUC__)
-
-        IntT result{};
-        return __builtin_add_overflow(lhs, rhs, &result) ? std::numeric_limits<IntT>::max() : result;
-
-    #endif
-    }
-
-    template <typename IntT>
-    [[nodiscard]] constexpr bool addOverflows(IntT lhs, IntT rhs)
-    {
-    #if defined (_MSC_VER)
-
-        return static_cast<IntT>(lhs + rhs) < lhs;
-
-    #elif defined (__GNUC__)
-
-        IntT result{};
-        __builtin_add_overflow(lhs, rhs, &result);
-        return result;
-
-    #endif
-    }
-
     template <typename IntT>
     [[nodiscard]] constexpr IntT floorLog2(IntT value)
     {
         return intrin::msb_constexpr(value);
     }
 
-    template <typename IntT>
-    constexpr std::size_t maxFibonacciNumberIndexForType()
-    {
-        static_assert(std::is_unsigned_v<IntT>);
-
-        switch (sizeof(IntT))
-        {
-        case 8:
-            return 93;
-        case 4:
-            return 47;
-        case 2:
-            return 24;
-        case 1:
-            return 13;
-        }
-
-        return 0;
-    }
-
     template <typename IntT>
     constexpr auto computeMasks()
     {
@@ -278,26 +197,6 @@ namespace chess
     template <typename IntT>
     constexpr auto nbitmask = computeMasks<IntT>();
 
-    template <typename IntT>
-    constexpr auto computeFibonacciNumbers()
-    {
-        constexpr std::size_t size = maxFibonacciNumberIndexForType<IntT>() + 1;
-        std::array<IntT, size> numbers{};
-        numbers[0] = 0;
-        numbers[1] = 1;
-
-        for (std::size_t i = 2; i < size; ++i)
-        {
-            numbers[i] = numbers[i - 1] + numbers[i - 2];
-        }
-
-        return numbers;
-    }
-
-    // F(0) = 0, F(1) = 1
-    template <typename IntT>
-    constexpr auto fibonacciNumbers = computeFibonacciNumbers<IntT>();
-
     template <std::size_t N, typename FromT, typename ToT = std::make_signed_t<FromT>>
     inline ToT signExtend(FromT value)
     {
@@ -2700,7 +2599,7 @@ namespace chess
         return Bitboard::square(sq0) | sq1;
     }
 
-    [[nodiscard]] constexpr Bitboard operator""_bb(std::uint64_t bits)
+    [[nodiscard]] constexpr Bitboard operator""_bb(unsigned long long bits)
     {
         return Bitboard::fromBits(bits);
     }

From 59402d4a6de1fc27a0253c8f7d3c2d604a5236fb Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 10 Sep 2020 13:02:45 +0200
Subject: [PATCH 249/583] Include <climits> for CHAR_BIT. Test both formats in
 instrumented learn.

---
 src/extra/nnue_data_binpack_format.h | 1 +
 tests/instrumented_learn.sh          | 8 ++++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index 3204b4b4..839fc17c 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -40,6 +40,7 @@ THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include <cassert>
 #include <array>
 #include <limits>
+#include <climits>
 
 #if (defined(_MSC_VER) || defined(__INTEL_COMPILER)) && !defined(__clang__)
 #include <intrin.h>
diff --git a/tests/instrumented_learn.sh b/tests/instrumented_learn.sh
index 756569e6..147c0c97 100755
--- a/tests/instrumented_learn.sh
+++ b/tests/instrumented_learn.sh
@@ -78,7 +78,9 @@ cat << EOF > gensfen01.exp
  send "setoption name Threads value $threads\n"
  send "setoption name Use NNUE value false\n"
  send "isready\n"
- send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data_01/training_data.bin use_raw_nnue_eval 0\n"
+ send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data_01/training_data.bin use_raw_nnue_eval 0 sfen_format bin\n"
+ expect "gensfen finished."
+ send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data_01/training_data.binpack use_raw_nnue_eval 0 sfen_format binpack\n"
  expect "gensfen finished."
 
  send "quit\n"
@@ -100,7 +102,9 @@ cat << EOF > gensfen02.exp
  send "setoption name Threads value $threads\n"
  send "setoption name Use NNUE value true\n"
  send "isready\n"
- send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data_02/training_data.bin use_raw_nnue_eval 0\n"
+ send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data_01/training_data.bin use_raw_nnue_eval 0 sfen_format bin\n"
+ expect "gensfen finished."
+ send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data_01/training_data.binpack use_raw_nnue_eval 0 sfen_format binpack\n"
  expect "gensfen finished."
 
  send "quit\n"

From ac6e6f73f281458e6c5488debf4d96d7a50c8bf4 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Thu, 10 Sep 2020 20:54:47 +0900
Subject: [PATCH 250/583] Added EnableTranspositionTable UCI option to
 enable/disable transposition table.

---
 src/tt.cpp        | 11 +++++++++++
 src/tt.h          |  4 ++++
 src/ucioption.cpp |  7 ++++++-
 3 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/src/tt.cpp b/src/tt.cpp
index 60a3a5f1..fc8ab3b1 100644
--- a/src/tt.cpp
+++ b/src/tt.cpp
@@ -28,6 +28,10 @@
 
 TranspositionTable TT; // Our global transposition table
 
+#ifdef EVAL_LEARN
+bool TranspositionTable::enable_transposition_table = true;
+#endif
+
 /// TTEntry::save() populates the TTEntry with a new node's data, possibly
 /// overwriting an old position. Update is not atomic and can be racy.
 
@@ -116,6 +120,13 @@ void TranspositionTable::clear() {
 
 TTEntry* TranspositionTable::probe(const Key key, bool& found) const {
 
+#ifdef EVAL_LEARN
+  if (!enable_transposition_table) {
+      found = false;
+      return first_entry(0);
+  }
+#endif
+
   TTEntry* const tte = first_entry(key);
   const uint16_t key16 = (uint16_t)key;  // Use the low 16 bits as key inside the cluster
 
diff --git a/src/tt.h b/src/tt.h
index fdfd6769..e83b6f3c 100644
--- a/src/tt.h
+++ b/src/tt.h
@@ -84,6 +84,10 @@ public:
     return &table[mul_hi64(key, clusterCount)].entry[0];
   }
 
+#ifdef EVAL_LEARN
+  static bool enable_transposition_table;
+#endif
+
 private:
   friend struct TTEntry;
 
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 0e561416..b24d8d78 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -44,7 +44,10 @@ void on_use_NNUE(const Option& ) { Eval::init_NNUE(); }
 void on_eval_file(const Option& ) { Eval::init_NNUE(); }
 #ifdef EVAL_LEARN
 void on_prune_at_shallow_depth_on_pv_node(const Option& o) {
-  Search::prune_at_shallow_depth_on_pv_node = o;
+    Search::prune_at_shallow_depth_on_pv_node = o;
+}
+void on_enable_transposition_table(const Option& o) {
+    TranspositionTable::enable_transposition_table = o;
 }
 #endif
 
@@ -102,6 +105,8 @@ void init(OptionsMap& o) {
   o["EvalSaveDir"] << Option("evalsave");
   // Prune at shallow depth on PV nodes. Setting this value to true gains elo in shallow search.
   o["PruneAtShallowDepthOnPvNode"] << Option(false, on_prune_at_shallow_depth_on_pv_node);
+  // Enable transposition table.
+  o["EnableTranspositionTable"] << Option(true, on_enable_transposition_table);
 #endif
 }
 

From c76bb34a96ed36511e360a60bd9f33e364617139 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 10 Sep 2020 15:18:47 +0200
Subject: [PATCH 251/583] Add convert UCI function that allows conversion of
 files between any of plain, bin, and binpack. Usage: convert infile outfile
 [append].

---
 src/extra/nnue_data_binpack_format.h |  33 +++++----
 src/learn/convert.cpp                | 105 +++++++++++++++++++++++++++
 src/uci.cpp                          |   3 +
 3 files changed, 125 insertions(+), 16 deletions(-)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index 839fc17c..2c555939 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -6915,7 +6915,7 @@ namespace binpack
     {
         constexpr std::size_t reportEveryNPositions = 100'000;
 
-        std::cout << "Compressing " << inputPath << " to " << outputPath << '\n';
+        std::cout << "Converting " << inputPath << " to " << outputPath << '\n';
 
         CompressedTrainingDataEntryWriter writer(outputPath, om);
         TrainingDataEntry e;
@@ -6961,13 +6961,15 @@ namespace binpack
             if (key == "ply"sv) e.ply = std::stoi(value);
             if (key == "result"sv) e.result = std::stoi(value);
         }
+
+        std::cout << "Finished. Converted " << numProcessedPositions << " positions.\n";
     }
 
     inline void convertBinpackToPlain(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
     {
         constexpr std::size_t bufferSize = MiB;
 
-        std::cout << "Decompressing " << inputPath << " to " << outputPath << '\n';
+        std::cout << "Converting " << inputPath << " to " << outputPath << '\n';
 
         CompressedTrainingDataEntryReader reader(inputPath);
         std::ofstream outputFile(outputPath, om);
@@ -6999,6 +7001,8 @@ namespace binpack
             const auto cur = outputFile.tellp();
             std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
         }
+
+        std::cout << "Finished. Converted " << numProcessedPositions << " positions.\n";
     }
 
 
@@ -7006,14 +7010,9 @@ namespace binpack
     {
         constexpr std::size_t reportEveryNPositions = 100'000;
 
-        std::cout << "Compressing " << inputPath << " to " << outputPath << '\n';
+        std::cout << "Converting " << inputPath << " to " << outputPath << '\n';
 
         CompressedTrainingDataEntryWriter writer(outputPath, om);
-        TrainingDataEntry e;
-
-        std::string key;
-        std::string value;
-        std::string move;
 
         std::ifstream inputFile(inputPath, std::ios_base::binary);
         const auto base = inputFile.tellg();
@@ -7037,13 +7036,15 @@ namespace binpack
                 std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
             }
         }
+
+        std::cout << "Finished. Converted " << numProcessedPositions << " positions.\n";
     }
 
     inline void convertBinpackToBin(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
     {
         constexpr std::size_t bufferSize = MiB;
 
-        std::cout << "Decompressing " << inputPath << " to " << outputPath << '\n';
+        std::cout << "Converting " << inputPath << " to " << outputPath << '\n';
 
         CompressedTrainingDataEntryReader reader(inputPath);
         std::ofstream outputFile(outputPath, std::ios_base::binary | om);
@@ -7075,6 +7076,8 @@ namespace binpack
             const auto cur = outputFile.tellp();
             std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
         }
+
+        std::cout << "Finished. Converted " << numProcessedPositions << " positions.\n";
     }
 
     inline void convertBinToPlain(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
@@ -7083,12 +7086,6 @@ namespace binpack
 
         std::cout << "Converting " << inputPath << " to " << outputPath << '\n';
 
-        TrainingDataEntry e;
-
-        std::string key;
-        std::string value;
-        std::string move;
-
         std::ifstream inputFile(inputPath, std::ios_base::binary);
         const auto base = inputFile.tellg();
         std::size_t numProcessedPositions = 0;
@@ -7127,13 +7124,15 @@ namespace binpack
             const auto cur = outputFile.tellp();
             std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
         }
+
+        std::cout << "Finished. Converted " << numProcessedPositions << " positions.\n";
     }
 
     inline void convertPlainToBin(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
     {
         constexpr std::size_t bufferSize = MiB;
 
-        std::cout << "Compressing " << inputPath << " to " << outputPath << '\n';
+        std::cout << "Converting " << inputPath << " to " << outputPath << '\n';
 
         std::ofstream outputFile(outputPath, std::ios_base::binary | om);
         std::vector<char> buffer;
@@ -7194,5 +7193,7 @@ namespace binpack
             const auto cur = outputFile.tellp();
             std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
         }
+
+        std::cout << "Finished. Converted " << numProcessedPositions << " positions.\n";
     }
 }
\ No newline at end of file
diff --git a/src/learn/convert.cpp b/src/learn/convert.cpp
index d07fc00c..364ad3dd 100644
--- a/src/learn/convert.cpp
+++ b/src/learn/convert.cpp
@@ -12,6 +12,8 @@
 #include "../position.h"
 #include "../tt.h"
 
+#include "../extra/nnue_data_binpack_format.h"
+
 #include <sstream>
 #include <fstream>
 #include <unordered_set>
@@ -497,5 +499,108 @@ namespace Learner
         ofs.close();
         std::cout << "all done" << std::endl;
     }
+
+    static inline const std::string plain_extension = ".plain";
+    static inline const std::string bin_extension = ".bin";
+    static inline const std::string binpack_extension = ".binpack";
+
+    static bool file_exists(const std::string& name)
+    {
+        std::ifstream f(name);
+        return f.good();
+    }
+
+    static bool ends_with(const std::string& lhs, const std::string& end)
+    {
+        if (end.size() > lhs.size()) return false;
+
+        return std::equal(end.rbegin(), end.rend(), lhs.rbegin());
+    }
+
+    static bool is_convert_of_type(
+        const std::string& input_path,
+        const std::string& output_path,
+        const std::string& expected_input_extension,
+        const std::string& expected_output_extension)
+    {
+        return ends_with(input_path, expected_input_extension)
+            && ends_with(output_path, expected_output_extension);
+    }
+
+    using ConvertFunctionType = void(std::string inputPath, std::string outputPath, std::ios_base::openmode om);
+
+    static ConvertFunctionType* get_convert_function(const std::string& input_path, const std::string& output_path)
+    {
+        if (is_convert_of_type(input_path, output_path, plain_extension, bin_extension))
+            return binpack::convertPlainToBin;
+        if (is_convert_of_type(input_path, output_path, plain_extension, binpack_extension))
+            return binpack::convertPlainToBinpack;
+
+        if (is_convert_of_type(input_path, output_path, bin_extension, plain_extension))
+            return binpack::convertBinToPlain;
+        if (is_convert_of_type(input_path, output_path, bin_extension, binpack_extension))
+            return binpack::convertBinToBinpack;
+
+        if (is_convert_of_type(input_path, output_path, binpack_extension, plain_extension))
+            return binpack::convertBinpackToPlain;
+        if (is_convert_of_type(input_path, output_path, binpack_extension, bin_extension))
+            return binpack::convertBinpackToBin;
+
+        return nullptr;
+    }
+
+    static void convert(const std::string& input_path, const std::string& output_path, std::ios_base::openmode om)
+    {
+        if(!file_exists(input_path))
+        {
+            std::cerr << "Input file does not exist.\n";
+            return;
+        }
+
+        auto func = get_convert_function(input_path, output_path);
+        if (func != nullptr)
+        {
+            func(input_path, output_path, om);
+        }
+        else
+        {
+            std::cerr << "Conversion between files of these types is not supported.\n";
+        }
+    }
+
+    static void convert(const std::vector<std::string>& args)
+    {
+        if (args.size() < 2 || args.size() > 3)
+        {
+            std::cerr << "Invalid arguments.\n";
+            std::cerr << "Usage: convert from_path to_path [append]\n";
+            return;
+        }
+
+        const bool append = (args.size() == 3) && (args[2] == "append");
+        const std::ios_base::openmode openmode =
+            append
+            ? std::ios_base::app
+            : std::ios_base::trunc;
+
+        convert(args[0], args[1], openmode);
+    }
+
+    void convert(istringstream& is)
+    {
+        std::vector<std::string> args;
+
+        while (true)
+        {
+            std::string token = "";
+            is >> token;
+            if (token == "")
+                break;
+
+            args.push_back(token);
+        }
+
+        convert(args);
+    }
 }
 #endif
diff --git a/src/uci.cpp b/src/uci.cpp
index 6675f2e0..96adf927 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -50,6 +50,8 @@ namespace Learner
   // Learning from the generated game record
   void learn(Position& pos, istringstream& is);
 
+  void convert(istringstream& is);
+
   // A pair of reader and evaluation value. Returned by Learner::search(),Learner::qsearch().
   typedef std::pair<Value, std::vector<Move> > ValueAndPV;
 
@@ -352,6 +354,7 @@ void UCI::loop(int argc, char* argv[]) {
 #if defined (EVAL_LEARN)
       else if (token == "gensfen") Learner::gen_sfen(pos, is);
       else if (token == "learn") Learner::learn(pos, is);
+      else if (token == "convert") Learner::convert(is);
 
       // Command to call qsearch(),search() directly for testing
       else if (token == "qsearch") qsearch_cmd(pos);

From c6f5f6a082592a2402f14908224fd33f9ad6fc0e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 11 Sep 2020 11:02:00 +0200
Subject: [PATCH 252/583] Replace "use_raw_nnue_eval" with an uci option "Use
 NNUE pure"

---
 src/evaluate.cpp      | 44 +++++++++++++++++++++++++------------------
 src/evaluate.h        | 11 ++++++++++-
 src/learn/gensfen.cpp |  8 --------
 src/learn/learner.cpp |  7 -------
 src/position.cpp      | 10 +++++-----
 src/ucioption.cpp     |  6 +++++-
 6 files changed, 46 insertions(+), 40 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 8edc9bb8..94581998 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -32,23 +32,32 @@
 #include "thread.h"
 #include "uci.h"
 
-#ifdef EVAL_LEARN
-namespace Learner
-{
-    extern bool use_raw_nnue_eval;
-}
-#endif
-
 namespace Eval {
 
-  bool useNNUE;
+  UseNNUEMode useNNUE;
   std::string eval_file_loaded="None";
 
+  static UseNNUEMode nnue_mode_from_option(const std::string& mode)
+  {
+    if (mode == "false")
+      return UseNNUEMode::False;
+    else if (mode == "true")
+      return UseNNUEMode::True;
+
+#ifdef EVAL_LEARN
+    else if (mode == "pure")
+      return UseNNUEMode::Pure;
+#endif
+
+    return UseNNUEMode::False;
+  }
+
   void init_NNUE() {
 
-    useNNUE = Options["Use NNUE"];
+    useNNUE = nnue_mode_from_option(Options["Use NNUE"]);
+
     std::string eval_file = std::string(Options["EvalFile"]);
-    if (useNNUE && eval_file_loaded != eval_file)
+    if (useNNUE != UseNNUEMode::False && eval_file_loaded != eval_file)
         if (Eval::NNUE::load_eval_file(eval_file))
             eval_file_loaded = eval_file;
   }
@@ -56,8 +65,7 @@ namespace Eval {
   void verify_NNUE() {
 
     std::string eval_file = std::string(Options["EvalFile"]);
-    if (useNNUE && eval_file_loaded != eval_file)
-    {
+    if (useNNUE != UseNNUEMode::False && eval_file_loaded != eval_file)    {
         UCI::OptionsMap defaults;
         UCI::init(defaults);
 
@@ -69,7 +77,7 @@ namespace Eval {
         std::exit(EXIT_FAILURE);
     }
 
-    if (useNNUE)
+    if (useNNUE != UseNNUEMode::False)
         sync_cout << "info string NNUE evaluation using " << eval_file << " enabled." << sync_endl;
     else
         sync_cout << "info string classical evaluation enabled." << sync_endl;
@@ -948,17 +956,17 @@ make_v:
 
 Value Eval::evaluate(const Position& pos) {
 #ifdef EVAL_LEARN
-  if (Learner::use_raw_nnue_eval) {
+  if (useNNUE == UseNNUEMode::Pure) {
       return NNUE::evaluate(pos);
   }
 #endif
 
-  bool classical = !Eval::useNNUE
-                ||  abs(eg_value(pos.psq_score())) * 16 > NNUEThreshold1 * (16 + pos.rule50_count());
+  bool classical = useNNUE == UseNNUEMode::False
+                || abs(eg_value(pos.psq_score())) * 16 > NNUEThreshold1 * (16 + pos.rule50_count());
   Value v = classical ? Evaluation<NO_TRACE>(pos).value()
                       : NNUE::evaluate(pos) * 5 / 4 + Tempo;
 
-  if (classical && Eval::useNNUE && abs(v) * 16 < NNUEThreshold2 * (16 + pos.rule50_count()))
+  if (classical && useNNUE != UseNNUEMode::False && abs(v) * 16 < NNUEThreshold2 * (16 + pos.rule50_count()))
       v = NNUE::evaluate(pos) * 5 / 4 + Tempo;
 
   // Damp down the evaluation linearly when shuffling
@@ -1015,7 +1023,7 @@ std::string Eval::trace(const Position& pos) {
 
   ss << "\nClassical evaluation: " << to_cp(v) << " (white side)\n";
 
-  if (Eval::useNNUE)
+  if (useNNUE != UseNNUEMode::False)
   {
       v = NNUE::evaluate(pos);
       v = pos.side_to_move() == WHITE ? v : -v;
diff --git a/src/evaluate.h b/src/evaluate.h
index e808068d..61052e90 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -26,11 +26,20 @@
 class Position;
 
 namespace Eval {
+  enum struct UseNNUEMode
+  {
+    False,
+    True
+
+#ifdef EVAL_LEARN
+    ,Pure
+#endif
+  };
 
   std::string trace(const Position& pos);
   Value evaluate(const Position& pos);
 
-  extern bool useNNUE;
+  extern UseNNUEMode useNNUE;
   extern std::string eval_file_loaded;
   void init_NNUE();
   void verify_NNUE();
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 99a783bb..9088fd81 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -44,12 +44,6 @@ namespace Learner
     static bool detect_draw_by_consecutive_low_score = false;
     static bool detect_draw_by_insufficient_mating_material = false;
 
-    // Use raw NNUE eval value in the Eval::evaluate().
-    // If hybrid eval is enabled, training data
-    // generation and training don't work well.
-    // https://discordapp.com/channels/435943710472011776/733545871911813221/748524079761326192
-    extern bool use_raw_nnue_eval;
-
     static SfenOutputType sfen_output_type = SfenOutputType::Bin;
 
     static bool ends_with(const std::string& lhs, const std::string& end)
@@ -1111,8 +1105,6 @@ namespace Learner
                 is >> detect_draw_by_consecutive_low_score;
             else if (token == "detect_draw_by_insufficient_mating_material")
                 is >> detect_draw_by_insufficient_mating_material;
-            else if (token == "use_raw_nnue_eval")
-                is >> use_raw_nnue_eval;
             else if (token == "sfen_format")
                 is >> sfen_format;
             else
diff --git a/src/learn/learner.cpp b/src/learn/learner.cpp
index 7cc04406..da093192 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learner.cpp
@@ -93,12 +93,6 @@ namespace Learner
     // data directly. In those cases, we set false to this variable.
     static bool convert_teacher_signal_to_winning_probability = true;
 
-    // Use raw NNUE eval value in the Eval::evaluate(). If hybrid eval is enabled, training data
-    // generation and training don't work well.
-    // https://discordapp.com/channels/435943710472011776/733545871911813221/748524079761326192
-    // This CANNOT be static since it's used elsewhere.
-    bool use_raw_nnue_eval = false;
-
     // Using stockfish's WDL with win rate model instead of sigmoid
     static bool use_wdl = false;
 
@@ -1811,7 +1805,6 @@ namespace Learner
             else if (option == "dest_score_min_value") is >> dest_score_min_value;
             else if (option == "dest_score_max_value") is >> dest_score_max_value;
             else if (option == "convert_teacher_signal_to_winning_probability") is >> convert_teacher_signal_to_winning_probability;
-            else if (option == "use_raw_nnue_eval") is >> use_raw_nnue_eval;
 
             // Otherwise, it's a filename.
             else
diff --git a/src/position.cpp b/src/position.cpp
index fe89b753..5ac461bc 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -755,7 +755,7 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
       else
           st->nonPawnMaterial[them] -= PieceValue[MG][captured];
 
-      if (Eval::useNNUE)
+      if (Eval::useNNUE != Eval::UseNNUEMode::False)
       {
           dp.dirty_num = 2;  // 1 piece moved, 1 piece captured
           dp.piece[1] = captured;
@@ -799,7 +799,7 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
   // Move the piece. The tricky Chess960 castling is handled earlier
   if (type_of(m) != CASTLING)
   {
-      if (Eval::useNNUE)
+      if (Eval::useNNUE != Eval::UseNNUEMode::False)
       {
           dp.piece[0] = pc;
           dp.from[0] = from;
@@ -830,7 +830,7 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
           remove_piece(to);
           put_piece(promotion, to);
 
-          if (Eval::useNNUE)
+          if (Eval::useNNUE != Eval::UseNNUEMode::False)
           {
               // Promoting pawn to SQ_NONE, promoted piece from SQ_NONE
               dp.to[0] = SQ_NONE;
@@ -968,7 +968,7 @@ void Position::do_castling(Color us, Square from, Square& to, Square& rfrom, Squ
   rto = relative_square(us, kingSide ? SQ_F1 : SQ_D1);
   to = relative_square(us, kingSide ? SQ_G1 : SQ_C1);
 
-  if (Do && Eval::useNNUE)
+  if (Do && Eval::useNNUE != Eval::UseNNUEMode::False)
   {
       auto& dp = st->dirtyPiece;
       dp.piece[0] = make_piece(us, KING);
@@ -997,7 +997,7 @@ void Position::do_null_move(StateInfo& newSt) {
   assert(!checkers());
   assert(&newSt != st);
 
-  if (Eval::useNNUE)
+  if (Eval::useNNUE != Eval::UseNNUEMode::False)
   {
       std::memcpy(&newSt, st, sizeof(StateInfo));
       st->accumulator.computed_score = false;
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index b24d8d78..61e47539 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -86,7 +86,11 @@ void init(OptionsMap& o) {
   o["SyzygyProbeDepth"]      << Option(1, 1, 100);
   o["Syzygy50MoveRule"]      << Option(true);
   o["SyzygyProbeLimit"]      << Option(7, 0, 7);
-  o["Use NNUE"]              << Option(true, on_use_NNUE);
+#ifdef EVAL_LEARN
+  o["Use NNUE"]              << Option("true var true var false var pure", "true", on_use_NNUE);
+#else
+  o["Use NNUE"]              << Option("true var true var false", "true", on_use_NNUE);
+#endif
   // The default must follow the format nn-[SHA256 first 12 digits].nnue
   // for the build process (profile-build and fishtest) to work.
   o["EvalFile"]              << Option("nn-82215d0fd0df.nnue", on_eval_file);

From 683c6146ce7217df8693ba83ff9a27a941915aaf Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 11 Sep 2020 12:05:46 +0200
Subject: [PATCH 253/583] Move declarations around and split them.

---
 src/Makefile                         |   2 +-
 src/extra/sfen_packer.cpp            | 587 +++++++++++++--------------
 src/extra/sfen_packer.h              |  23 ++
 src/learn/convert.cpp                |   3 +-
 src/learn/convert.h                  |  37 ++
 src/learn/gensfen.cpp                |   4 +-
 src/learn/gensfen.h                  |  16 +
 src/learn/{learner.cpp => learn.cpp} |   5 +-
 src/learn/learn.h                    | 118 ++----
 src/learn/packed_sfen.h              |  49 +++
 src/position.cpp                     |  41 ++
 src/position.h                       |  14 +-
 src/search.h                         |  11 +
 src/uci.cpp                          |  25 +-
 14 files changed, 511 insertions(+), 424 deletions(-)
 create mode 100644 src/extra/sfen_packer.h
 create mode 100644 src/learn/convert.h
 create mode 100644 src/learn/gensfen.h
 rename src/learn/{learner.cpp => learn.cpp} (99%)
 create mode 100644 src/learn/packed_sfen.h

diff --git a/src/Makefile b/src/Makefile
index 49c6c1b3..88d759d2 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -56,7 +56,7 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
 	nnue/features/enpassant.cpp \
 	nnue/nnue_test_command.cpp \
 	extra/sfen_packer.cpp \
-	learn/learner.cpp \
+	learn/learn.cpp \
 	learn/gensfen.cpp \
 	learn/convert.cpp \
 	learn/learning_tools.cpp \
diff --git a/src/extra/sfen_packer.cpp b/src/extra/sfen_packer.cpp
index 1d82111d..b58ad5dd 100644
--- a/src/extra/sfen_packer.cpp
+++ b/src/extra/sfen_packer.cpp
@@ -1,5 +1,9 @@
 ﻿#if defined (EVAL_LEARN)
 
+#include "sfen_packer.h"
+
+#include "../learn/packed_sfen.h"
+
 #include "../misc.h"
 #include "../position.h"
 
@@ -9,153 +13,166 @@
 
 using namespace std;
 
-// -----------------------------------
-// stage compression/decompression
-// -----------------------------------
+namespace Learner {
 
-// Class that handles bitstream
-// useful when doing aspect encoding
-struct BitStream
-{
-  // Set the memory to store the data in advance.
-  // Assume that memory is cleared to 0.
-  void  set_data(uint8_t* data_) { data = data_; reset(); }
-
-  // Get the pointer passed in set_data().
-  uint8_t* get_data() const { return data; }
-
-  // Get the cursor.
-  int get_cursor() const { return bit_cursor; }
-
-  // reset the cursor
-  void reset() { bit_cursor = 0; }
-
-  // Write 1bit to the stream.
-  // If b is non-zero, write out 1. If 0, write 0.
-  void write_one_bit(int b)
+  // Class that handles bitstream
+  // useful when doing aspect encoding
+  struct BitStream
   {
-    if (b)
-      data[bit_cursor / 8] |= 1 << (bit_cursor & 7);
+    // Set the memory to store the data in advance.
+    // Assume that memory is cleared to 0.
+    void set_data(std::uint8_t* data_) { data = data_; reset(); }
 
-    ++bit_cursor;
-  }
+    // Get the pointer passed in set_data().
+    uint8_t* get_data() const { return data; }
 
-  // Get 1 bit from the stream.
-  int read_one_bit()
+    // Get the cursor.
+    int get_cursor() const { return bit_cursor; }
+
+    // reset the cursor
+    void reset() { bit_cursor = 0; }
+
+    // Write 1bit to the stream.
+    // If b is non-zero, write out 1. If 0, write 0.
+    void write_one_bit(int b)
+    {
+      if (b)
+        data[bit_cursor / 8] |= 1 << (bit_cursor & 7);
+
+      ++bit_cursor;
+    }
+
+    // Get 1 bit from the stream.
+    int read_one_bit()
+    {
+      int b = (data[bit_cursor / 8] >> (bit_cursor & 7)) & 1;
+      ++bit_cursor;
+
+      return b;
+    }
+
+    // write n bits of data
+    // Data shall be written out from the lower order of d.
+    void write_n_bit(int d, int n)
+    {
+      for (int i = 0; i <n; ++i)
+        write_one_bit(d & (1 << i));
+    }
+
+    // read n bits of data
+    // Reverse conversion of write_n_bit().
+    int read_n_bit(int n)
+    {
+      int result = 0;
+      for (int i = 0; i < n; ++i)
+        result |= read_one_bit() ? (1 << i) : 0;
+
+      return result;
+    }
+
+  private:
+    // Next bit position to read/write.
+    int bit_cursor;
+
+    // data entity
+    std::uint8_t* data;
+  };
+
+  // Class for compressing/decompressing sfen
+  // sfen can be packed to 256bit (32bytes) by Huffman coding.
+  // This is proven by mini. The above is Huffman coding.
+  //
+  // Internal format = 1-bit turn + 7-bit king position *2 + piece on board (Huffman coding) + hand piece (Huffman coding)
+  // Side to move (White = 0, Black = 1) (1bit)
+  // White King Position (6 bits)
+  // Black King Position (6 bits)
+  // Huffman Encoding of the board
+  // Castling availability (1 bit x 4)
+  // En passant square (1 or 1 + 6 bits)
+  // Rule 50 (6 bits)
+  // Game play (8 bits)
+  //
+  // TODO(someone): Rename SFEN to FEN.
+  //
+  struct SfenPacker
   {
-    int b = (data[bit_cursor / 8] >> (bit_cursor & 7)) & 1;
-    ++bit_cursor;
+    void pack(const Position& pos);
 
-    return b;
-  }
+    // sfen packed by pack() (256bit = 32bytes)
+    // Or sfen to decode with unpack()
+    uint8_t *data; // uint8_t[32];
 
-  // write n bits of data
-  // Data shall be written out from the lower order of d.
-  void write_n_bit(int d, int n)
+    BitStream stream;
+
+    // Output the board pieces to stream.
+    void write_board_piece_to_stream(Piece pc);
+
+    // Read one board piece from stream
+    Piece read_board_piece_from_stream();
+  };
+
+
+  // Huffman coding
+  // * is simplified from mini encoding to make conversion easier.
+  //
+  // 1 box on the board (other than NO_PIECE) = 2 to 6 bits (+ 1-bit flag + 1-bit forward and backward)
+  // 1 piece of hand piece = 1-5bit (+ 1-bit flag + 1bit ahead and behind)
+  //
+  // empty xxxxx0 + 0 (none)
+  // step xxxx01 + 2 xxxx0 + 2
+  // incense xx0011 + 2 xx001 + 2
+  // Katsura xx1011 + 2 xx101 + 2
+  // silver xx0111 + 2 xx011 + 2
+  // Gold x01111 + 1 x0111 + 1 // Gold is valid and has no flags.
+  // corner 011111 + 2 01111 + 2
+  // Fly 111111 + 2 11111 + 2
+  //
+  // Assuming all pieces are on the board,
+  // Sky 81-40 pieces = 41 boxes = 41bit
+  // Walk 4bit*18 pieces = 72bit
+  // Incense 6bit*4 pieces = 24bit
+  // Katsura 6bit*4 pieces = 24bit
+  // Silver 6bit*4 pieces = 24bit
+  // Gold 6bit* 4 pieces = 24bit
+  // corner 8bit* 2 pieces = 16bit
+  // Fly 8bit* 2 pieces = 16bit
+  // -------
+  // 241bit + 1bit (turn) + 7bit × 2 (King's position after) = 256bit
+  //
+  // When the piece on the board moves to the hand piece, the piece on the board becomes empty, so the box on the board can be expressed with 1 bit,
+  // Since the hand piece can be expressed by 1 bit less than the piece on the board, the total number of bits does not change in the end.
+  // Therefore, in this expression, any aspect can be expressed by this bit number.
+  // It is a hand piece and no flag is required, but if you include this, the bit number of the piece on the board will be -1
+  // Since the total number of bits can be fixed, we will include this as well.
+
+  // Huffman Encoding
+  //
+  // Empty  xxxxxxx0
+  // Pawn   xxxxx001 + 1 bit (Side to move)
+  // Knight xxxxx011 + 1 bit (Side to move)
+  // Bishop xxxxx101 + 1 bit (Side to move)
+  // Rook   xxxxx111 + 1 bit (Side to move)
+
+  struct HuffmanedPiece
   {
-    for (int i = 0; i <n; ++i)
-      write_one_bit(d & (1 << i));
-  }
+    int code; // how it will be coded
+    int bits; // How many bits do you have
+  };
 
-  // read n bits of data
-  // Reverse conversion of write_n_bit().
-  int read_n_bit(int n)
+  constexpr HuffmanedPiece huffman_table[] =
   {
-    int result = 0;
-    for (int i = 0; i < n; ++i)
-      result |= read_one_bit() ? (1 << i) : 0;
+    {0b0000,1}, // NO_PIECE
+    {0b0001,4}, // PAWN
+    {0b0011,4}, // KNIGHT
+    {0b0101,4}, // BISHOP
+    {0b0111,4}, // ROOK
+    {0b1001,4}, // QUEEN
+  };
 
-    return result;
-  }
-
-private:
-  // Next bit position to read/write.
-  int bit_cursor;
-
-  // data entity
-  uint8_t* data;
-};
-
-
-// Huffman coding
-// * is simplified from mini encoding to make conversion easier.
-//
-// 1 box on the board (other than NO_PIECE) = 2 to 6 bits (+ 1-bit flag + 1-bit forward and backward)
-// 1 piece of hand piece = 1-5bit (+ 1-bit flag + 1bit ahead and behind)
-//
-// empty xxxxx0 + 0 (none)
-// step xxxx01 + 2 xxxx0 + 2
-// incense xx0011 + 2 xx001 + 2
-// Katsura xx1011 + 2 xx101 + 2
-// silver xx0111 + 2 xx011 + 2
-// Gold x01111 + 1 x0111 + 1 // Gold is valid and has no flags.
-// corner 011111 + 2 01111 + 2
-// Fly 111111 + 2 11111 + 2
-//
-// Assuming all pieces are on the board,
-// Sky 81-40 pieces = 41 boxes = 41bit
-// Walk 4bit*18 pieces = 72bit
-// Incense 6bit*4 pieces = 24bit
-// Katsura 6bit*4 pieces = 24bit
-// Silver 6bit*4 pieces = 24bit
-// Gold 6bit* 4 pieces = 24bit
-// corner 8bit* 2 pieces = 16bit
-// Fly 8bit* 2 pieces = 16bit
-// -------
-// 241bit + 1bit (turn) + 7bit × 2 (King's position after) = 256bit
-//
-// When the piece on the board moves to the hand piece, the piece on the board becomes empty, so the box on the board can be expressed with 1 bit,
-// Since the hand piece can be expressed by 1 bit less than the piece on the board, the total number of bits does not change in the end.
-// Therefore, in this expression, any aspect can be expressed by this bit number.
-// It is a hand piece and no flag is required, but if you include this, the bit number of the piece on the board will be -1
-// Since the total number of bits can be fixed, we will include this as well.
-
-// Huffman Encoding
-//
-// Empty  xxxxxxx0
-// Pawn   xxxxx001 + 1 bit (Side to move)
-// Knight xxxxx011 + 1 bit (Side to move)
-// Bishop xxxxx101 + 1 bit (Side to move)
-// Rook   xxxxx111 + 1 bit (Side to move)
-
-struct HuffmanedPiece
-{
-  int code; // how it will be coded
-  int bits; // How many bits do you have
-};
-
-HuffmanedPiece huffman_table[] =
-{
-  {0b0000,1}, // NO_PIECE
-  {0b0001,4}, // PAWN
-  {0b0011,4}, // KNIGHT
-  {0b0101,4}, // BISHOP
-  {0b0111,4}, // ROOK
-  {0b1001,4}, // QUEEN
-};
-
-// Class for compressing/decompressing sfen
-// sfen can be packed to 256bit (32bytes) by Huffman coding.
-// This is proven by mini. The above is Huffman coding.
-//
-// Internal format = 1-bit turn + 7-bit king position *2 + piece on board (Huffman coding) + hand piece (Huffman coding)
-// Side to move (White = 0, Black = 1) (1bit)
-// White King Position (6 bits)
-// Black King Position (6 bits)
-// Huffman Encoding of the board
-// Castling availability (1 bit x 4)
-// En passant square (1 or 1 + 6 bits)
-// Rule 50 (6 bits)
-// Game play (8 bits)
-//
-// TODO(someone): Rename SFEN to FEN.
-//
-struct SfenPacker
-{
   // Pack sfen and store in data[32].
-  void pack(const Position& pos)
+  void SfenPacker::pack(const Position& pos)
   {
-// cout << pos;
+  // cout << pos;
 
     memset(data, 0, 32 /* 256bit */);
     stream.set_data(data);
@@ -202,17 +219,8 @@ struct SfenPacker
     assert(stream.get_cursor() <= 256);
   }
 
-  // sfen packed by pack() (256bit = 32bytes)
-  // Or sfen to decode with unpack()
-  uint8_t *data; // uint8_t[32];
-
-//private:
-  // Position::set_from_packed_sfen(uint8_t data[32]) I want to use these functions, so the line is bad, but I want to keep it public.
-
-  BitStream stream;
-
   // Output the board pieces to stream.
-  void write_board_piece_to_stream(Piece pc)
+  void SfenPacker::write_board_piece_to_stream(Piece pc)
   {
     // piece type
     PieceType pr = type_of(pc);
@@ -227,7 +235,7 @@ struct SfenPacker
   }
 
   // Read one board piece from stream
-  Piece read_board_piece_from_stream()
+  Piece SfenPacker::read_board_piece_from_stream()
   {
     PieceType pr = NO_PIECE_TYPE;
     int code = 0, bits = 0;
@@ -252,181 +260,148 @@ struct SfenPacker
 
     return make_piece(c, pr);
   }
-};
 
-
-// -----------------------------------
-// Add to Position class
-// -----------------------------------
-
-// Add a function that directly unpacks for speed. It's pretty tough.
-// Write it by combining packer::unpack() and Position::set().
-// If there is a problem with the passed phase and there is an error, non-zero is returned.
-int Position::set_from_packed_sfen(const PackedSfen& sfen , StateInfo * si, Thread* th, bool mirror)
-{
-	SfenPacker packer;
-	auto& stream = packer.stream;
-
-  // TODO: separate streams for writing and reading. Here we actually have to
-  // const_cast which is not safe in the long run.
-	stream.set_data(const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(&sfen)));
-
-	std::memset(this, 0, sizeof(Position));
-	std::memset(si, 0, sizeof(StateInfo));
-  std::fill_n(&pieceList[0][0], sizeof(pieceList) / sizeof(Square), SQ_NONE);
-  st = si;
-
-	// Active color
-	sideToMove = (Color)stream.read_one_bit();
-
-  pieceList[W_KING][0] = SQUARE_NB;
-  pieceList[B_KING][0] = SQUARE_NB;
-
-	// First the position of the ball
-	if (mirror)
-	{
-		for (auto c : Colors)
-			board[flip_file((Square)stream.read_n_bit(6))] = make_piece(c, KING);
-	}
-	else
-	{
-		for (auto c : Colors)
-			board[stream.read_n_bit(6)] = make_piece(c, KING);
-	}
-
-  // Piece placement
-  for (Rank r = RANK_8; r >= RANK_1; --r)
+  int set_from_packed_sfen(Position& pos, const PackedSfen& sfen, StateInfo* si, Thread* th, bool mirror)
   {
-    for (File f = FILE_A; f <= FILE_H; ++f)
+    SfenPacker packer;
+    auto& stream = packer.stream;
+
+    // TODO: separate streams for writing and reading. Here we actually have to
+    // const_cast which is not safe in the long run.
+    stream.set_data(const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(&sfen)));
+
+    std::memset(&pos, 0, sizeof(Position));
+    std::memset(si, 0, sizeof(StateInfo));
+    std::fill_n(&pos.pieceList[0][0], sizeof(pos.pieceList) / sizeof(Square), SQ_NONE);
+    pos.st = si;
+
+    // Active color
+    pos.sideToMove = (Color)stream.read_one_bit();
+
+    pos.pieceList[W_KING][0] = SQUARE_NB;
+    pos.pieceList[B_KING][0] = SQUARE_NB;
+
+    // First the position of the ball
+    if (mirror)
     {
-      auto sq = make_square(f, r);
+      for (auto c : Colors)
+        pos.board[flip_file((Square)stream.read_n_bit(6))] = make_piece(c, KING);
+    }
+    else
+    {
+      for (auto c : Colors)
+        pos.board[stream.read_n_bit(6)] = make_piece(c, KING);
+    }
+
+    // Piece placement
+    for (Rank r = RANK_8; r >= RANK_1; --r)
+    {
+      for (File f = FILE_A; f <= FILE_H; ++f)
+      {
+        auto sq = make_square(f, r);
+        if (mirror) {
+          sq = flip_file(sq);
+        }
+
+        // it seems there are already balls
+        Piece pc;
+        if (type_of(pos.board[sq]) != KING)
+        {
+          assert(pos.board[sq] == NO_PIECE);
+          pc = packer.read_board_piece_from_stream();
+        }
+        else
+        {
+          pc = pos.board[sq];
+          // put_piece() will catch ASSERT unless you remove it all.
+          pos.board[sq] = NO_PIECE;
+        }
+
+        // There may be no pieces, so skip in that case.
+        if (pc == NO_PIECE)
+          continue;
+
+        pos.put_piece(Piece(pc), sq);
+
+        if (stream.get_cursor()> 256)
+          return 1;
+
+        //assert(stream.get_cursor() <= 256);
+      }
+    }
+
+    // Castling availability.
+    // TODO(someone): Support chess960.
+    pos.st->castlingRights = 0;
+    if (stream.read_one_bit()) {
+      Square rsq;
+      for (rsq = relative_square(WHITE, SQ_H1); pos.piece_on(rsq) != W_ROOK; --rsq) {}
+      pos.set_castling_right(WHITE, rsq);
+    }
+    if (stream.read_one_bit()) {
+      Square rsq;
+      for (rsq = relative_square(WHITE, SQ_A1); pos.piece_on(rsq) != W_ROOK; ++rsq) {}
+      pos.set_castling_right(WHITE, rsq);
+    }
+    if (stream.read_one_bit()) {
+      Square rsq;
+      for (rsq = relative_square(BLACK, SQ_H1); pos.piece_on(rsq) != B_ROOK; --rsq) {}
+      pos.set_castling_right(BLACK, rsq);
+    }
+    if (stream.read_one_bit()) {
+      Square rsq;
+      for (rsq = relative_square(BLACK, SQ_A1); pos.piece_on(rsq) != B_ROOK; ++rsq) {}
+      pos.set_castling_right(BLACK, rsq);
+    }
+
+    // En passant square. Ignore if no pawn capture is possible
+    if (stream.read_one_bit()) {
+      Square ep_square = static_cast<Square>(stream.read_n_bit(6));
       if (mirror) {
-        sq = flip_file(sq);
+        ep_square = flip_file(ep_square);
       }
+      pos.st->epSquare = ep_square;
 
-      // it seems there are already balls
-      Piece pc;
-      if (type_of(board[sq]) != KING)
-      {
-        assert(board[sq] == NO_PIECE);
-        pc = packer.read_board_piece_from_stream();
-      }
-      else
-      {
-        pc = board[sq];
-        board[sq] = NO_PIECE; // put_piece() will catch ASSERT unless you remove it all.
-      }
-
-      // There may be no pieces, so skip in that case.
-      if (pc == NO_PIECE)
-        continue;
-
-      put_piece(Piece(pc), sq);
-
-      //cout << sq << ' ' << board[sq] << ' ' << stream.get_cursor() << endl;
-
-      if (stream.get_cursor()> 256)
-        return 1;
-      //assert(stream.get_cursor() <= 256);
-
+      if (!(pos.attackers_to(pos.st->epSquare) & pos.pieces(pos.sideToMove, PAWN))
+        || !(pos.pieces(~pos.sideToMove, PAWN) & (pos.st->epSquare + pawn_push(~pos.sideToMove))))
+        pos.st->epSquare = SQ_NONE;
     }
-  }
-
-  // Castling availability.
-  // TODO(someone): Support chess960.
-  st->castlingRights = 0;
-  if (stream.read_one_bit()) {
-    Square rsq;
-    for (rsq = relative_square(WHITE, SQ_H1); piece_on(rsq) != W_ROOK; --rsq) {}
-    set_castling_right(WHITE, rsq);
-  }
-  if (stream.read_one_bit()) {
-    Square rsq;
-    for (rsq = relative_square(WHITE, SQ_A1); piece_on(rsq) != W_ROOK; ++rsq) {}
-    set_castling_right(WHITE, rsq);
-  }
-  if (stream.read_one_bit()) {
-    Square rsq;
-    for (rsq = relative_square(BLACK, SQ_H1); piece_on(rsq) != B_ROOK; --rsq) {}
-    set_castling_right(BLACK, rsq);
-  }
-  if (stream.read_one_bit()) {
-    Square rsq;
-    for (rsq = relative_square(BLACK, SQ_A1); piece_on(rsq) != B_ROOK; ++rsq) {}
-    set_castling_right(BLACK, rsq);
-  }
-
-  // En passant square. Ignore if no pawn capture is possible
-  if (stream.read_one_bit()) {
-    Square ep_square = static_cast<Square>(stream.read_n_bit(6));
-    if (mirror) {
-      ep_square = flip_file(ep_square);
+    else {
+      pos.st->epSquare = SQ_NONE;
     }
-    st->epSquare = ep_square;
 
-    if (!(attackers_to(st->epSquare) & pieces(sideToMove, PAWN))
-      || !(pieces(~sideToMove, PAWN) & (st->epSquare + pawn_push(~sideToMove))))
-      st->epSquare = SQ_NONE;
-  }
-  else {
-    st->epSquare = SQ_NONE;
+    // Halfmove clock
+    pos.st->rule50 = static_cast<Square>(stream.read_n_bit(6));
+
+    // Fullmove number
+    pos.gamePly = static_cast<Square>(stream.read_n_bit(8));
+
+    // Convert from fullmove starting from 1 to gamePly starting from 0,
+    // handle also common incorrect FEN with fullmove = 0.
+    pos.gamePly = std::max(2 * (pos.gamePly - 1), 0) + (pos.sideToMove == BLACK);
+
+    assert(stream.get_cursor() <= 256);
+
+    pos.chess960 = false;
+    pos.thisThread = th;
+    pos.set_state(pos.st);
+
+    assert(pos_is_ok());
+
+    return 0;
   }
 
-  // Halfmove clock
-  st->rule50 = static_cast<Square>(stream.read_n_bit(6));
+  PackedSfen sfen_pack(Position& pos)
+  {
+    PackedSfen sfen;
 
-  // Fullmove number
-  gamePly = static_cast<Square>(stream.read_n_bit(8));
-  // Convert from fullmove starting from 1 to gamePly starting from 0,
-  // handle also common incorrect FEN with fullmove = 0.
-  gamePly = std::max(2 * (gamePly - 1), 0) + (sideToMove == BLACK);
+    SfenPacker sp;
+    sp.data = (uint8_t*)&sfen;
+    sp.pack(pos);
 
-  assert(stream.get_cursor() <= 256);
-
-  chess960 = false;
-  thisThread = th;
-set_state(st);
-
-  //std::cout << *this << std::endl;
-
-  assert(pos_is_ok());
-
-	return 0;
+    return sfen;
+  }
 }
 
-// Give the board, hand piece, and turn, and return the sfen.
-//std::string Position::sfen_from_rawdata(Piece board[81], Hand hands[2], Color turn, int gamePly_)
-//{
-// // Copy it to an internal structure and call sfen() if the conversion process depends only on it
-// // Maybe it will be converted normally...
-//  Position pos;
-//
-//  memcpy(pos.board, board, sizeof(Piece) * 81);
-//  memcpy(pos.hand, hands, sizeof(Hand) * 2);
-//  pos.sideToMove = turn;
-//  pos.gamePly = gamePly_;
-//
-//  return pos.sfen();
-//
-// // Implementation of ↑ is beautiful, but slow.
-// // This is a bottleneck when learning a large amount of game records, so write a function to unpack directly.
-//}
-
-// Get the packed sfen. Returns to the buffer specified in the argument.
-void Position::sfen_pack(PackedSfen& sfen)
-{
-  SfenPacker sp;
-  sp.data = (uint8_t*)&sfen;
-  sp.pack(*this);
-}
-
-//// Unpack the packed sfen. Returns an sfen string.
-//std::string Position::sfen_unpack(const PackedSfen& sfen)
-//{
-// SfenPacker sp;
-// sp.data = (uint8_t*)&sfen;
-// return sp.unpack();
-//}
-
 
 #endif // USE_SFEN_PACKER
diff --git a/src/extra/sfen_packer.h b/src/extra/sfen_packer.h
new file mode 100644
index 00000000..c3832db2
--- /dev/null
+++ b/src/extra/sfen_packer.h
@@ -0,0 +1,23 @@
+#ifndef _SFEN_PACKER_H_
+#define _SFEN_PACKER_H_
+
+#if defined(EVAL_LEARN)
+
+#include <cstdint>
+
+#include "../types.h"
+
+#include "../learn/packed_sfen.h"
+class Position;
+struct StateInfo;
+class Thread;
+
+namespace Learner {
+
+    int set_from_packed_sfen(Position& pos, const PackedSfen& sfen, StateInfo* si, Thread* th, bool mirror);
+    PackedSfen sfen_pack(Position& pos);
+}
+
+#endif
+
+#endif
\ No newline at end of file
diff --git a/src/learn/convert.cpp b/src/learn/convert.cpp
index 364ad3dd..d50233eb 100644
--- a/src/learn/convert.cpp
+++ b/src/learn/convert.cpp
@@ -1,9 +1,10 @@
 #if defined(EVAL_LEARN)
 
+#include "convert.h"
+
 // evaluate header for learning
 #include "../eval/evaluate_common.h"
 
-#include "learn.h"
 #include "multi_think.h"
 #include "../uci.h"
 #include "../syzygy/tbprobe.h"
diff --git a/src/learn/convert.h b/src/learn/convert.h
new file mode 100644
index 00000000..a79820a3
--- /dev/null
+++ b/src/learn/convert.h
@@ -0,0 +1,37 @@
+#ifndef _CONVERT_H_
+#define _CONVERT_H_
+
+#include <vector>
+#include <string>
+#include <sstream>
+
+#if defined(EVAL_LEARN)
+namespace Learner {
+    void convert_bin_from_pgn_extract(
+        const std::vector<std::string>& filenames,
+        const std::string& output_file_name,
+        const bool pgn_eval_side_to_move,
+        const bool convert_no_eval_fens_as_score_zero);
+
+    void convert_bin(
+        const std::vector<std::string>& filenames,
+        const std::string& output_file_name,
+        const int ply_minimum,
+        const int ply_maximum,
+        const int interpolate_eval,
+        const int src_score_min_value,
+        const int src_score_max_value,
+        const int dest_score_min_value,
+        const int dest_score_max_value,
+        const bool check_invalid_fen,
+        const bool check_illegal_move);
+
+    void convert_plain(
+        const std::vector<std::string>& filenames,
+        const std::string& output_file_name);
+
+    void convert(std::istringstream& is);
+}
+#endif
+
+#endif
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 9088fd81..9f53e983 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -1,5 +1,8 @@
 ﻿#if defined(EVAL_LEARN)
 
+#include "gensfen.h"
+#include "packed_sfen.h"
+
 #include "../eval/evaluate_common.h"
 #include "../misc.h"
 #include "../nnue/evaluate_nnue_learner.h"
@@ -8,7 +11,6 @@
 #include "../thread.h"
 #include "../tt.h"
 #include "../uci.h"
-#include "learn.h"
 #include "multi_think.h"
 
 #include "../extra/nnue_data_binpack_format.h"
diff --git a/src/learn/gensfen.h b/src/learn/gensfen.h
new file mode 100644
index 00000000..dd0f71fb
--- /dev/null
+++ b/src/learn/gensfen.h
@@ -0,0 +1,16 @@
+#ifndef _GENSFEN_H_
+#define _GENSFEN_H_
+
+#include <sstream>
+
+#include "../position.h"
+
+#if defined(EVAL_LEARN)
+namespace Learner {
+
+    // Automatic generation of teacher position
+    void gen_sfen(Position& pos, std::istringstream& is);
+}
+#endif
+
+#endif
\ No newline at end of file
diff --git a/src/learn/learner.cpp b/src/learn/learn.cpp
similarity index 99%
rename from src/learn/learner.cpp
rename to src/learn/learn.cpp
index da093192..f4f7b409 100644
--- a/src/learn/learner.cpp
+++ b/src/learn/learn.cpp
@@ -19,6 +19,9 @@
 
 #if defined(EVAL_LEARN)
 
+#include "learn.h"
+#include "convert.h"
+
 #include "../eval/evaluate_common.h"
 #include "../misc.h"
 #include "../nnue/evaluate_nnue_learner.h"
@@ -27,7 +30,7 @@
 #include "../thread.h"
 #include "../tt.h"
 #include "../uci.h"
-#include "learn.h"
+#include "../search.h"
 #include "multi_think.h"
 
 #include "../extra/nnue_data_binpack_format.h"
diff --git a/src/learn/learn.h b/src/learn/learn.h
index b7ca18e8..b8acc2df 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -14,7 +14,7 @@
 // Even if it is a double type, there is almost no difference in the way of convergence, so fix it to float.
 
 // when using float
-typedef float LearnFloatType;
+using LearnFloatType = float;
 
 // when using double
 //typedef double LearnFloatType;
@@ -36,105 +36,47 @@ typedef float LearnFloatType;
 // ----------------------
 // Definition of struct used in Learner
 // ----------------------
+
+#include "packed_sfen.h"
+
 #include "../position.h"
 
+#include <sstream>
+
 namespace Learner
 {
-	// ----------------------
-	// Settings for learning
-	// ----------------------
+    // ----------------------
+    // Settings for learning
+    // ----------------------
 
-	// mini-batch size.
-	// Calculate the gradient by combining this number of phases.
-	// If you make it smaller, the number of update_weights() will increase and the convergence will be faster. The gradient is incorrect.
-	// If you increase it, the number of update_weights() decreases, so the convergence will be slow. The slope will come out accurately.
-	// I don't think you need to change this value in most cases.
+    // mini-batch size.
+    // Calculate the gradient by combining this number of phases.
+    // If you make it smaller, the number of update_weights() will increase and the convergence will be faster. The gradient is incorrect.
+    // If you increase it, the number of update_weights() decreases, so the convergence will be slow. The slope will come out accurately.
+    // I don't think you need to change this value in most cases.
 
-	constexpr std::size_t LEARN_MINI_BATCH_SIZE = 1000 * 1000 * 1;
+    constexpr std::size_t LEARN_MINI_BATCH_SIZE = 1000 * 1000 * 1;
 
-	// The number of phases to read from the file at one time. After reading this much, shuffle.
-	// It is better to have a certain size, but this number x 40 bytes x 3 times as much memory is consumed. 400MB*3 is consumed in the 10M phase.
-	// Must be a multiple of THREAD_BUFFER_SIZE(=10000).
+    // The number of phases to read from the file at one time. After reading this much, shuffle.
+    // It is better to have a certain size, but this number x 40 bytes x 3 times as much memory is consumed. 400MB*3 is consumed in the 10M phase.
+    // Must be a multiple of THREAD_BUFFER_SIZE(=10000).
 
-	constexpr std::size_t LEARN_SFEN_READ_SIZE = 1000 * 1000 * 10;
+    constexpr std::size_t LEARN_SFEN_READ_SIZE = 1000 * 1000 * 10;
 
-	// Saving interval of evaluation function at learning. Save each time you learn this number of phases.
-	// Needless to say, the longer the saving interval, the shorter the learning time.
-	// Folder name is incremented for each save like 0/, 1/, 2/...
-	// By default, once every 1 billion phases.
-	constexpr std::size_t LEARN_EVAL_SAVE_INTERVAL = 1000000000ULL;
+    // Saving interval of evaluation function at learning. Save each time you learn this number of phases.
+    // Needless to say, the longer the saving interval, the shorter the learning time.
+    // Folder name is incremented for each save like 0/, 1/, 2/...
+    // By default, once every 1 billion phases.
+    constexpr std::size_t LEARN_EVAL_SAVE_INTERVAL = 1000000000ULL;
 
-	// Reduce the output of rmse during learning to 1 for this number of times.
-	// rmse calculation is done in one thread, so it takes some time, so reducing the output is effective.
-	constexpr std::size_t LEARN_RMSE_OUTPUT_INTERVAL = 1;
+    // Reduce the output of rmse during learning to 1 for this number of times.
+    // rmse calculation is done in one thread, so it takes some time, so reducing the output is effective.
+    constexpr std::size_t LEARN_RMSE_OUTPUT_INTERVAL = 1;
 
-	//Structure in which PackedSfen and evaluation value are integrated
-	// If you write different contents for each option, it will be a problem when reusing the teacher game
-	// For the time being, write all the following members regardless of the options.
-	struct PackedSfenValue
-	{
-		// phase
-		PackedSfen sfen;
+    double calc_grad(Value shallow, const PackedSfenValue& psv);
 
-		// Evaluation value returned from Learner::search()
-		int16_t score;
-
-		// PV first move
-		// Used when finding the match rate with the teacher
-		uint16_t move;
-
-		// Trouble of the phase from the initial phase.
-		uint16_t gamePly;
-
-		// 1 if the player on this side ultimately wins the game. -1 if you are losing.
-		// 0 if a draw is reached.
-		// The draw is in the teacher position generation command gensfen,
-		// Only write if LEARN_GENSFEN_DRAW_RESULT is enabled.
-		int8_t game_result;
-
-		// When exchanging the file that wrote the teacher aspect with other people
-		//Because this structure size is not fixed, pad it so that it is 40 bytes in any environment.
-		uint8_t padding;
-
-		// 32 + 2 + 2 + 2 + 1 + 1 = 40bytes
-	};
-
-	// Type that returns the reading line and the evaluation value at that time
-	// Used in Learner::search(), Learner::qsearch().
-	typedef std::pair<Value, std::vector<Move> > ValueAndPV;
-
-	// Phase array: PSVector stands for packed sfen vector.
-	typedef std::vector<PackedSfenValue> PSVector;
-
-	// So far, only Yaneura King 2018 Otafuku has this stub
-	// This stub is required if EVAL_LEARN is defined.
-	extern Learner::ValueAndPV  search(Position& pos, int depth , size_t multiPV = 1 , uint64_t NodesLimit = 0);
-	extern Learner::ValueAndPV qsearch(Position& pos);
-
-	double calc_grad(Value shallow, const PackedSfenValue& psv);
-	
-	void convert_bin_from_pgn_extract(
-		const std::vector<std::string>& filenames,
-		const std::string& output_file_name,
-		const bool pgn_eval_side_to_move,
-		const bool convert_no_eval_fens_as_score_zero);
-	
-	void convert_bin(
-		const std::vector<std::string>& filenames,
-		const std::string& output_file_name,
-		const int ply_minimum,
-		const int ply_maximum,
-		const int interpolate_eval,
-		const int src_score_min_value,
-		const int src_score_max_value,
-		const int dest_score_min_value,
-		const int dest_score_max_value,
-		const bool check_invalid_fen,
-		const bool check_illegal_move);
-
-	void convert_plain(
-		const std::vector<std::string>& filenames,
-		const std::string& output_file_name);
+    // Learning from the generated game record
+    void learn(Position& pos, std::istringstream& is);
 }
 
 #endif
diff --git a/src/learn/packed_sfen.h b/src/learn/packed_sfen.h
new file mode 100644
index 00000000..101e5e34
--- /dev/null
+++ b/src/learn/packed_sfen.h
@@ -0,0 +1,49 @@
+#ifndef _PACKED_SFEN_H_
+#define _PACKED_SFEN_H_
+
+#include <vector>
+#include <cstdint>
+
+#if defined(EVAL_LEARN)
+namespace Learner {
+
+    // packed sfen
+    struct PackedSfen { std::uint8_t data[32]; };
+
+    // Structure in which PackedSfen and evaluation value are integrated
+    // If you write different contents for each option, it will be a problem when reusing the teacher game
+    // For the time being, write all the following members regardless of the options.
+    struct PackedSfenValue
+    {
+        // phase
+        PackedSfen sfen;
+
+        // Evaluation value returned from Learner::search()
+        std::int16_t score;
+
+        // PV first move
+        // Used when finding the match rate with the teacher
+        std::uint16_t move;
+
+        // Trouble of the phase from the initial phase.
+        std::uint16_t gamePly;
+
+        // 1 if the player on this side ultimately wins the game. -1 if you are losing.
+        // 0 if a draw is reached.
+        // The draw is in the teacher position generation command gensfen,
+        // Only write if LEARN_GENSFEN_DRAW_RESULT is enabled.
+        std::int8_t game_result;
+
+        // When exchanging the file that wrote the teacher aspect with other people
+        //Because this structure size is not fixed, pad it so that it is 40 bytes in any environment.
+        std::uint8_t padding;
+
+        // 32 + 2 + 2 + 2 + 1 + 1 = 40bytes
+    };
+
+    // Phase array: PSVector stands for packed sfen vector.
+    using PSVector = std::vector<PackedSfenValue>;
+}
+#endif
+
+#endif
diff --git a/src/position.cpp b/src/position.cpp
index 5ac461bc..a9fc8272 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -32,6 +32,11 @@
 #include "uci.h"
 #include "syzygy/tbprobe.h"
 
+#if defined(EVAL_LEARN)
+#include "learn/packed_sfen.h"
+#include "extra/sfen_packer.h"
+#endif
+
 using std::string;
 
 namespace Zobrist {
@@ -1346,3 +1351,39 @@ bool Position::pos_is_ok() const {
 
   return true;
 }
+
+#if defined(EVAL_LEARN)
+
+// Add a function that directly unpacks for speed. It's pretty tough.
+// Write it by combining packer::unpack() and Position::set().
+// If there is a problem with the passed phase and there is an error, non-zero is returned.
+int Position::set_from_packed_sfen(const Learner::PackedSfen& sfen , StateInfo* si, Thread* th, bool mirror)
+{
+  return Learner::set_from_packed_sfen(*this, sfen, si, th, mirror);
+}
+
+// Give the board, hand piece, and turn, and return the sfen.
+//std::string Position::sfen_from_rawdata(Piece board[81], Hand hands[2], Color turn, int gamePly_)
+//{
+// // Copy it to an internal structure and call sfen() if the conversion process depends only on it
+// // Maybe it will be converted normally...
+//  Position pos;
+//
+//  memcpy(pos.board, board, sizeof(Piece) * 81);
+//  memcpy(pos.hand, hands, sizeof(Hand) * 2);
+//  pos.sideToMove = turn;
+//  pos.gamePly = gamePly_;
+//
+//  return pos.sfen();
+//
+// // Implementation of ↑ is beautiful, but slow.
+// // This is a bottleneck when learning a large amount of game records, so write a function to unpack directly.
+//}
+
+// Get the packed sfen. Returns to the buffer specified in the argument.
+void Position::sfen_pack(Learner::PackedSfen& sfen)
+{
+  sfen = Learner::sfen_pack(*this);
+}
+
+#endif
\ No newline at end of file
diff --git a/src/position.h b/src/position.h
index e3f758e0..382748af 100644
--- a/src/position.h
+++ b/src/position.h
@@ -30,6 +30,11 @@
 
 #include "nnue/nnue_accumulator.h"
 
+#if defined(EVAL_LEARN)
+#include "learn/packed_sfen.h"
+#include "extra/sfen_packer.h"
+#endif
+
 
 /// StateInfo struct stores information needed to restore a Position object to
 /// its previous state when we retract a move. Whenever a move is made on the
@@ -75,9 +80,6 @@ typedef std::unique_ptr<std::deque<StateInfo>> StateListPtr;
 /// traversing the search tree.
 class Thread;
 
-// packed sfen
-struct PackedSfen { uint8_t data[32]; }; 
-
 class Position {
 public:
   static void init();
@@ -178,15 +180,17 @@ public:
 #if defined(EVAL_LEARN)
   // --sfenization helper
 
+  friend int Learner::set_from_packed_sfen(Position& pos, const Learner::PackedSfen& sfen, StateInfo* si, Thread* th, bool mirror);
+
   // Get the packed sfen. Returns to the buffer specified in the argument.
   // Do not include gamePly in pack.
-  void sfen_pack(PackedSfen& sfen);
+  void sfen_pack(Learner::PackedSfen& sfen);
 
   // It is slow to go through sfen, so I made a function to set packed sfen directly.
   // Equivalent to pos.set(sfen_unpack(data),si,th);.
   // If there is a problem with the passed phase and there is an error, non-zero is returned.
   // PackedSfen does not include gamePly so it cannot be restored. If you want to set it, specify it with an argument.
-  int set_from_packed_sfen(const PackedSfen& sfen, StateInfo* si, Thread* th, bool mirror = false);
+  int set_from_packed_sfen(const Learner::PackedSfen& sfen, StateInfo* si, Thread* th, bool mirror = false);
 
   // Give the board, hand piece, and turn, and return the sfen.
   //static std::string sfen_from_rawdata(Piece board[81], Hand hands[2], Color turn, int gamePly);
diff --git a/src/search.h b/src/search.h
index 9d5ce279..5e092273 100644
--- a/src/search.h
+++ b/src/search.h
@@ -117,4 +117,15 @@ void clear();
 
 } // namespace Search
 
+#if defined(EVAL_LEARN)
+namespace Learner {
+
+  // A pair of reader and evaluation value. Returned by Learner::search(),Learner::qsearch().
+  using ValueAndPV = std::pair<Value, std::vector<Move>>;
+
+  ValueAndPV qsearch(Position& pos);
+  ValueAndPV search(Position& pos, int depth_, size_t multiPV = 1, uint64_t nodesLimit = 0);
+}
+#endif
+
 #endif // #ifndef SEARCH_H_INCLUDED
diff --git a/src/uci.cpp b/src/uci.cpp
index 96adf927..0a28fc1f 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -33,6 +33,10 @@
 #include "tt.h"
 #include "uci.h"
 
+#include "learn/gensfen.h"
+#include "learn/learn.h"
+#include "learn/convert.h"
+
 using namespace std;
 
 extern vector<string> setup_bench(const Position&, istream&);
@@ -40,27 +44,6 @@ extern vector<string> setup_bench(const Position&, istream&);
 // FEN string of the initial position, normal chess
 const char* StartFEN = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1";
 
-// Command to automatically generate a game record
-#if defined (EVAL_LEARN)
-namespace Learner
-{
-  // Automatic generation of teacher position
-  void gen_sfen(Position& pos, istringstream& is);
-
-  // Learning from the generated game record
-  void learn(Position& pos, istringstream& is);
-
-  void convert(istringstream& is);
-
-  // A pair of reader and evaluation value. Returned by Learner::search(),Learner::qsearch().
-  typedef std::pair<Value, std::vector<Move> > ValueAndPV;
-
-  ValueAndPV qsearch(Position& pos);
-  ValueAndPV search(Position& pos, int depth_, size_t multiPV = 1, uint64_t nodesLimit = 0);
-
-}
-#endif
-
 void test_cmd(Position& pos, istringstream& is)
 {
     // Initialize as it may be searched.

From a059fa86c4aa00e8a2ed96aacdf01684ec50a0b4 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 11 Sep 2020 12:08:26 +0200
Subject: [PATCH 254/583] Move sfen_packer to learn.

---
 src/Makefile              |   2 +-
 src/extra/sfen_packer.cpp | 407 --------------------------------------
 src/extra/sfen_packer.h   |  23 ---
 src/position.cpp          |   2 +-
 src/position.h            |   2 +-
 5 files changed, 3 insertions(+), 433 deletions(-)
 delete mode 100644 src/extra/sfen_packer.cpp
 delete mode 100644 src/extra/sfen_packer.h

diff --git a/src/Makefile b/src/Makefile
index 88d759d2..aa13603a 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -55,7 +55,7 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
 	nnue/features/castling_right.cpp \
 	nnue/features/enpassant.cpp \
 	nnue/nnue_test_command.cpp \
-	extra/sfen_packer.cpp \
+	learn/sfen_packer.cpp \
 	learn/learn.cpp \
 	learn/gensfen.cpp \
 	learn/convert.cpp \
diff --git a/src/extra/sfen_packer.cpp b/src/extra/sfen_packer.cpp
deleted file mode 100644
index b58ad5dd..00000000
--- a/src/extra/sfen_packer.cpp
+++ /dev/null
@@ -1,407 +0,0 @@
-﻿#if defined (EVAL_LEARN)
-
-#include "sfen_packer.h"
-
-#include "../learn/packed_sfen.h"
-
-#include "../misc.h"
-#include "../position.h"
-
-#include <sstream>
-#include <fstream>
-#include <cstring> // std::memset()
-
-using namespace std;
-
-namespace Learner {
-
-  // Class that handles bitstream
-  // useful when doing aspect encoding
-  struct BitStream
-  {
-    // Set the memory to store the data in advance.
-    // Assume that memory is cleared to 0.
-    void set_data(std::uint8_t* data_) { data = data_; reset(); }
-
-    // Get the pointer passed in set_data().
-    uint8_t* get_data() const { return data; }
-
-    // Get the cursor.
-    int get_cursor() const { return bit_cursor; }
-
-    // reset the cursor
-    void reset() { bit_cursor = 0; }
-
-    // Write 1bit to the stream.
-    // If b is non-zero, write out 1. If 0, write 0.
-    void write_one_bit(int b)
-    {
-      if (b)
-        data[bit_cursor / 8] |= 1 << (bit_cursor & 7);
-
-      ++bit_cursor;
-    }
-
-    // Get 1 bit from the stream.
-    int read_one_bit()
-    {
-      int b = (data[bit_cursor / 8] >> (bit_cursor & 7)) & 1;
-      ++bit_cursor;
-
-      return b;
-    }
-
-    // write n bits of data
-    // Data shall be written out from the lower order of d.
-    void write_n_bit(int d, int n)
-    {
-      for (int i = 0; i <n; ++i)
-        write_one_bit(d & (1 << i));
-    }
-
-    // read n bits of data
-    // Reverse conversion of write_n_bit().
-    int read_n_bit(int n)
-    {
-      int result = 0;
-      for (int i = 0; i < n; ++i)
-        result |= read_one_bit() ? (1 << i) : 0;
-
-      return result;
-    }
-
-  private:
-    // Next bit position to read/write.
-    int bit_cursor;
-
-    // data entity
-    std::uint8_t* data;
-  };
-
-  // Class for compressing/decompressing sfen
-  // sfen can be packed to 256bit (32bytes) by Huffman coding.
-  // This is proven by mini. The above is Huffman coding.
-  //
-  // Internal format = 1-bit turn + 7-bit king position *2 + piece on board (Huffman coding) + hand piece (Huffman coding)
-  // Side to move (White = 0, Black = 1) (1bit)
-  // White King Position (6 bits)
-  // Black King Position (6 bits)
-  // Huffman Encoding of the board
-  // Castling availability (1 bit x 4)
-  // En passant square (1 or 1 + 6 bits)
-  // Rule 50 (6 bits)
-  // Game play (8 bits)
-  //
-  // TODO(someone): Rename SFEN to FEN.
-  //
-  struct SfenPacker
-  {
-    void pack(const Position& pos);
-
-    // sfen packed by pack() (256bit = 32bytes)
-    // Or sfen to decode with unpack()
-    uint8_t *data; // uint8_t[32];
-
-    BitStream stream;
-
-    // Output the board pieces to stream.
-    void write_board_piece_to_stream(Piece pc);
-
-    // Read one board piece from stream
-    Piece read_board_piece_from_stream();
-  };
-
-
-  // Huffman coding
-  // * is simplified from mini encoding to make conversion easier.
-  //
-  // 1 box on the board (other than NO_PIECE) = 2 to 6 bits (+ 1-bit flag + 1-bit forward and backward)
-  // 1 piece of hand piece = 1-5bit (+ 1-bit flag + 1bit ahead and behind)
-  //
-  // empty xxxxx0 + 0 (none)
-  // step xxxx01 + 2 xxxx0 + 2
-  // incense xx0011 + 2 xx001 + 2
-  // Katsura xx1011 + 2 xx101 + 2
-  // silver xx0111 + 2 xx011 + 2
-  // Gold x01111 + 1 x0111 + 1 // Gold is valid and has no flags.
-  // corner 011111 + 2 01111 + 2
-  // Fly 111111 + 2 11111 + 2
-  //
-  // Assuming all pieces are on the board,
-  // Sky 81-40 pieces = 41 boxes = 41bit
-  // Walk 4bit*18 pieces = 72bit
-  // Incense 6bit*4 pieces = 24bit
-  // Katsura 6bit*4 pieces = 24bit
-  // Silver 6bit*4 pieces = 24bit
-  // Gold 6bit* 4 pieces = 24bit
-  // corner 8bit* 2 pieces = 16bit
-  // Fly 8bit* 2 pieces = 16bit
-  // -------
-  // 241bit + 1bit (turn) + 7bit × 2 (King's position after) = 256bit
-  //
-  // When the piece on the board moves to the hand piece, the piece on the board becomes empty, so the box on the board can be expressed with 1 bit,
-  // Since the hand piece can be expressed by 1 bit less than the piece on the board, the total number of bits does not change in the end.
-  // Therefore, in this expression, any aspect can be expressed by this bit number.
-  // It is a hand piece and no flag is required, but if you include this, the bit number of the piece on the board will be -1
-  // Since the total number of bits can be fixed, we will include this as well.
-
-  // Huffman Encoding
-  //
-  // Empty  xxxxxxx0
-  // Pawn   xxxxx001 + 1 bit (Side to move)
-  // Knight xxxxx011 + 1 bit (Side to move)
-  // Bishop xxxxx101 + 1 bit (Side to move)
-  // Rook   xxxxx111 + 1 bit (Side to move)
-
-  struct HuffmanedPiece
-  {
-    int code; // how it will be coded
-    int bits; // How many bits do you have
-  };
-
-  constexpr HuffmanedPiece huffman_table[] =
-  {
-    {0b0000,1}, // NO_PIECE
-    {0b0001,4}, // PAWN
-    {0b0011,4}, // KNIGHT
-    {0b0101,4}, // BISHOP
-    {0b0111,4}, // ROOK
-    {0b1001,4}, // QUEEN
-  };
-
-  // Pack sfen and store in data[32].
-  void SfenPacker::pack(const Position& pos)
-  {
-  // cout << pos;
-
-    memset(data, 0, 32 /* 256bit */);
-    stream.set_data(data);
-
-    // turn
-    // Side to move.
-    stream.write_one_bit((int)(pos.side_to_move()));
-
-    // 7-bit positions for leading and trailing balls
-    // White king and black king, 6 bits for each.
-    for(auto c: Colors)
-      stream.write_n_bit(pos.king_square(c), 6);
-
-    // Write the pieces on the board other than the kings.
-    for (Rank r = RANK_8; r >= RANK_1; --r)
-    {
-      for (File f = FILE_A; f <= FILE_H; ++f)
-      {
-        Piece pc = pos.piece_on(make_square(f, r));
-        if (type_of(pc) == KING)
-          continue;
-        write_board_piece_to_stream(pc);
-      }
-    }
-
-    // TODO(someone): Support chess960.
-    stream.write_one_bit(pos.can_castle(WHITE_OO));
-    stream.write_one_bit(pos.can_castle(WHITE_OOO));
-    stream.write_one_bit(pos.can_castle(BLACK_OO));
-    stream.write_one_bit(pos.can_castle(BLACK_OOO));
-
-    if (pos.ep_square() == SQ_NONE) {
-      stream.write_one_bit(0);
-    }
-    else {
-      stream.write_one_bit(1);
-      stream.write_n_bit(static_cast<int>(pos.ep_square()), 6);
-    }
-
-    stream.write_n_bit(pos.state()->rule50, 6);
-
-    stream.write_n_bit(1 + (pos.game_ply()-(pos.side_to_move() == BLACK)) / 2, 8);
-
-    assert(stream.get_cursor() <= 256);
-  }
-
-  // Output the board pieces to stream.
-  void SfenPacker::write_board_piece_to_stream(Piece pc)
-  {
-    // piece type
-    PieceType pr = type_of(pc);
-    auto c = huffman_table[pr];
-    stream.write_n_bit(c.code, c.bits);
-
-    if (pc == NO_PIECE)
-      return;
-
-    // first and second flag
-    stream.write_one_bit(color_of(pc));
-  }
-
-  // Read one board piece from stream
-  Piece SfenPacker::read_board_piece_from_stream()
-  {
-    PieceType pr = NO_PIECE_TYPE;
-    int code = 0, bits = 0;
-    while (true)
-    {
-      code |= stream.read_one_bit() << bits;
-      ++bits;
-
-      assert(bits <= 6);
-
-      for (pr = NO_PIECE_TYPE; pr <KING; ++pr)
-        if (huffman_table[pr].code == code
-          && huffman_table[pr].bits == bits)
-          goto Found;
-    }
-  Found:;
-    if (pr == NO_PIECE_TYPE)
-      return NO_PIECE;
-
-    // first and second flag
-    Color c = (Color)stream.read_one_bit();
-
-    return make_piece(c, pr);
-  }
-
-  int set_from_packed_sfen(Position& pos, const PackedSfen& sfen, StateInfo* si, Thread* th, bool mirror)
-  {
-    SfenPacker packer;
-    auto& stream = packer.stream;
-
-    // TODO: separate streams for writing and reading. Here we actually have to
-    // const_cast which is not safe in the long run.
-    stream.set_data(const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(&sfen)));
-
-    std::memset(&pos, 0, sizeof(Position));
-    std::memset(si, 0, sizeof(StateInfo));
-    std::fill_n(&pos.pieceList[0][0], sizeof(pos.pieceList) / sizeof(Square), SQ_NONE);
-    pos.st = si;
-
-    // Active color
-    pos.sideToMove = (Color)stream.read_one_bit();
-
-    pos.pieceList[W_KING][0] = SQUARE_NB;
-    pos.pieceList[B_KING][0] = SQUARE_NB;
-
-    // First the position of the ball
-    if (mirror)
-    {
-      for (auto c : Colors)
-        pos.board[flip_file((Square)stream.read_n_bit(6))] = make_piece(c, KING);
-    }
-    else
-    {
-      for (auto c : Colors)
-        pos.board[stream.read_n_bit(6)] = make_piece(c, KING);
-    }
-
-    // Piece placement
-    for (Rank r = RANK_8; r >= RANK_1; --r)
-    {
-      for (File f = FILE_A; f <= FILE_H; ++f)
-      {
-        auto sq = make_square(f, r);
-        if (mirror) {
-          sq = flip_file(sq);
-        }
-
-        // it seems there are already balls
-        Piece pc;
-        if (type_of(pos.board[sq]) != KING)
-        {
-          assert(pos.board[sq] == NO_PIECE);
-          pc = packer.read_board_piece_from_stream();
-        }
-        else
-        {
-          pc = pos.board[sq];
-          // put_piece() will catch ASSERT unless you remove it all.
-          pos.board[sq] = NO_PIECE;
-        }
-
-        // There may be no pieces, so skip in that case.
-        if (pc == NO_PIECE)
-          continue;
-
-        pos.put_piece(Piece(pc), sq);
-
-        if (stream.get_cursor()> 256)
-          return 1;
-
-        //assert(stream.get_cursor() <= 256);
-      }
-    }
-
-    // Castling availability.
-    // TODO(someone): Support chess960.
-    pos.st->castlingRights = 0;
-    if (stream.read_one_bit()) {
-      Square rsq;
-      for (rsq = relative_square(WHITE, SQ_H1); pos.piece_on(rsq) != W_ROOK; --rsq) {}
-      pos.set_castling_right(WHITE, rsq);
-    }
-    if (stream.read_one_bit()) {
-      Square rsq;
-      for (rsq = relative_square(WHITE, SQ_A1); pos.piece_on(rsq) != W_ROOK; ++rsq) {}
-      pos.set_castling_right(WHITE, rsq);
-    }
-    if (stream.read_one_bit()) {
-      Square rsq;
-      for (rsq = relative_square(BLACK, SQ_H1); pos.piece_on(rsq) != B_ROOK; --rsq) {}
-      pos.set_castling_right(BLACK, rsq);
-    }
-    if (stream.read_one_bit()) {
-      Square rsq;
-      for (rsq = relative_square(BLACK, SQ_A1); pos.piece_on(rsq) != B_ROOK; ++rsq) {}
-      pos.set_castling_right(BLACK, rsq);
-    }
-
-    // En passant square. Ignore if no pawn capture is possible
-    if (stream.read_one_bit()) {
-      Square ep_square = static_cast<Square>(stream.read_n_bit(6));
-      if (mirror) {
-        ep_square = flip_file(ep_square);
-      }
-      pos.st->epSquare = ep_square;
-
-      if (!(pos.attackers_to(pos.st->epSquare) & pos.pieces(pos.sideToMove, PAWN))
-        || !(pos.pieces(~pos.sideToMove, PAWN) & (pos.st->epSquare + pawn_push(~pos.sideToMove))))
-        pos.st->epSquare = SQ_NONE;
-    }
-    else {
-      pos.st->epSquare = SQ_NONE;
-    }
-
-    // Halfmove clock
-    pos.st->rule50 = static_cast<Square>(stream.read_n_bit(6));
-
-    // Fullmove number
-    pos.gamePly = static_cast<Square>(stream.read_n_bit(8));
-
-    // Convert from fullmove starting from 1 to gamePly starting from 0,
-    // handle also common incorrect FEN with fullmove = 0.
-    pos.gamePly = std::max(2 * (pos.gamePly - 1), 0) + (pos.sideToMove == BLACK);
-
-    assert(stream.get_cursor() <= 256);
-
-    pos.chess960 = false;
-    pos.thisThread = th;
-    pos.set_state(pos.st);
-
-    assert(pos_is_ok());
-
-    return 0;
-  }
-
-  PackedSfen sfen_pack(Position& pos)
-  {
-    PackedSfen sfen;
-
-    SfenPacker sp;
-    sp.data = (uint8_t*)&sfen;
-    sp.pack(pos);
-
-    return sfen;
-  }
-}
-
-
-#endif // USE_SFEN_PACKER
diff --git a/src/extra/sfen_packer.h b/src/extra/sfen_packer.h
deleted file mode 100644
index c3832db2..00000000
--- a/src/extra/sfen_packer.h
+++ /dev/null
@@ -1,23 +0,0 @@
-#ifndef _SFEN_PACKER_H_
-#define _SFEN_PACKER_H_
-
-#if defined(EVAL_LEARN)
-
-#include <cstdint>
-
-#include "../types.h"
-
-#include "../learn/packed_sfen.h"
-class Position;
-struct StateInfo;
-class Thread;
-
-namespace Learner {
-
-    int set_from_packed_sfen(Position& pos, const PackedSfen& sfen, StateInfo* si, Thread* th, bool mirror);
-    PackedSfen sfen_pack(Position& pos);
-}
-
-#endif
-
-#endif
\ No newline at end of file
diff --git a/src/position.cpp b/src/position.cpp
index a9fc8272..9465afbc 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -34,7 +34,7 @@
 
 #if defined(EVAL_LEARN)
 #include "learn/packed_sfen.h"
-#include "extra/sfen_packer.h"
+#include "learn/sfen_packer.h"
 #endif
 
 using std::string;
diff --git a/src/position.h b/src/position.h
index 382748af..10cf45ba 100644
--- a/src/position.h
+++ b/src/position.h
@@ -32,7 +32,7 @@
 
 #if defined(EVAL_LEARN)
 #include "learn/packed_sfen.h"
-#include "extra/sfen_packer.h"
+#include "learn/sfen_packer.h"
 #endif
 
 
From 96fa8fa8dce77a840190d7ec4bf61adc6ffd5cc7 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 11 Sep 2020 12:21:41 +0200
Subject: [PATCH 255/583] Add missing files.

---
 src/learn/sfen_packer.cpp | 407 ++++++++++++++++++++++++++++++++++++++
 src/learn/sfen_packer.h   |  24 +++
 2 files changed, 431 insertions(+)
 create mode 100644 src/learn/sfen_packer.cpp
 create mode 100644 src/learn/sfen_packer.h

diff --git a/src/learn/sfen_packer.cpp b/src/learn/sfen_packer.cpp
new file mode 100644
index 00000000..236c875f
--- /dev/null
+++ b/src/learn/sfen_packer.cpp
@@ -0,0 +1,407 @@
+﻿#if defined (EVAL_LEARN)
+
+#include "sfen_packer.h"
+
+#include "packed_sfen.h"
+
+#include "misc.h"
+#include "position.h"
+
+#include <sstream>
+#include <fstream>
+#include <cstring> // std::memset()
+
+using namespace std;
+
+namespace Learner {
+
+  // Class that handles bitstream
+  // useful when doing aspect encoding
+  struct BitStream
+  {
+    // Set the memory to store the data in advance.
+    // Assume that memory is cleared to 0.
+    void set_data(std::uint8_t* data_) { data = data_; reset(); }
+
+    // Get the pointer passed in set_data().
+    uint8_t* get_data() const { return data; }
+
+    // Get the cursor.
+    int get_cursor() const { return bit_cursor; }
+
+    // reset the cursor
+    void reset() { bit_cursor = 0; }
+
+    // Write 1bit to the stream.
+    // If b is non-zero, write out 1. If 0, write 0.
+    void write_one_bit(int b)
+    {
+      if (b)
+        data[bit_cursor / 8] |= 1 << (bit_cursor & 7);
+
+      ++bit_cursor;
+    }
+
+    // Get 1 bit from the stream.
+    int read_one_bit()
+    {
+      int b = (data[bit_cursor / 8] >> (bit_cursor & 7)) & 1;
+      ++bit_cursor;
+
+      return b;
+    }
+
+    // write n bits of data
+    // Data shall be written out from the lower order of d.
+    void write_n_bit(int d, int n)
+    {
+      for (int i = 0; i <n; ++i)
+        write_one_bit(d & (1 << i));
+    }
+
+    // read n bits of data
+    // Reverse conversion of write_n_bit().
+    int read_n_bit(int n)
+    {
+      int result = 0;
+      for (int i = 0; i < n; ++i)
+        result |= read_one_bit() ? (1 << i) : 0;
+
+      return result;
+    }
+
+  private:
+    // Next bit position to read/write.
+    int bit_cursor;
+
+    // data entity
+    std::uint8_t* data;
+  };
+
+  // Class for compressing/decompressing sfen
+  // sfen can be packed to 256bit (32bytes) by Huffman coding.
+  // This is proven by mini. The above is Huffman coding.
+  //
+  // Internal format = 1-bit turn + 7-bit king position *2 + piece on board (Huffman coding) + hand piece (Huffman coding)
+  // Side to move (White = 0, Black = 1) (1bit)
+  // White King Position (6 bits)
+  // Black King Position (6 bits)
+  // Huffman Encoding of the board
+  // Castling availability (1 bit x 4)
+  // En passant square (1 or 1 + 6 bits)
+  // Rule 50 (6 bits)
+  // Game play (8 bits)
+  //
+  // TODO(someone): Rename SFEN to FEN.
+  //
+  struct SfenPacker
+  {
+    void pack(const Position& pos);
+
+    // sfen packed by pack() (256bit = 32bytes)
+    // Or sfen to decode with unpack()
+    uint8_t *data; // uint8_t[32];
+
+    BitStream stream;
+
+    // Output the board pieces to stream.
+    void write_board_piece_to_stream(Piece pc);
+
+    // Read one board piece from stream
+    Piece read_board_piece_from_stream();
+  };
+
+
+  // Huffman coding
+  // * is simplified from mini encoding to make conversion easier.
+  //
+  // 1 box on the board (other than NO_PIECE) = 2 to 6 bits (+ 1-bit flag + 1-bit forward and backward)
+  // 1 piece of hand piece = 1-5bit (+ 1-bit flag + 1bit ahead and behind)
+  //
+  // empty xxxxx0 + 0 (none)
+  // step xxxx01 + 2 xxxx0 + 2
+  // incense xx0011 + 2 xx001 + 2
+  // Katsura xx1011 + 2 xx101 + 2
+  // silver xx0111 + 2 xx011 + 2
+  // Gold x01111 + 1 x0111 + 1 // Gold is valid and has no flags.
+  // corner 011111 + 2 01111 + 2
+  // Fly 111111 + 2 11111 + 2
+  //
+  // Assuming all pieces are on the board,
+  // Sky 81-40 pieces = 41 boxes = 41bit
+  // Walk 4bit*18 pieces = 72bit
+  // Incense 6bit*4 pieces = 24bit
+  // Katsura 6bit*4 pieces = 24bit
+  // Silver 6bit*4 pieces = 24bit
+  // Gold 6bit* 4 pieces = 24bit
+  // corner 8bit* 2 pieces = 16bit
+  // Fly 8bit* 2 pieces = 16bit
+  // -------
+  // 241bit + 1bit (turn) + 7bit × 2 (King's position after) = 256bit
+  //
+  // When the piece on the board moves to the hand piece, the piece on the board becomes empty, so the box on the board can be expressed with 1 bit,
+  // Since the hand piece can be expressed by 1 bit less than the piece on the board, the total number of bits does not change in the end.
+  // Therefore, in this expression, any aspect can be expressed by this bit number.
+  // It is a hand piece and no flag is required, but if you include this, the bit number of the piece on the board will be -1
+  // Since the total number of bits can be fixed, we will include this as well.
+
+  // Huffman Encoding
+  //
+  // Empty  xxxxxxx0
+  // Pawn   xxxxx001 + 1 bit (Side to move)
+  // Knight xxxxx011 + 1 bit (Side to move)
+  // Bishop xxxxx101 + 1 bit (Side to move)
+  // Rook   xxxxx111 + 1 bit (Side to move)
+
+  struct HuffmanedPiece
+  {
+    int code; // how it will be coded
+    int bits; // How many bits do you have
+  };
+
+  constexpr HuffmanedPiece huffman_table[] =
+  {
+    {0b0000,1}, // NO_PIECE
+    {0b0001,4}, // PAWN
+    {0b0011,4}, // KNIGHT
+    {0b0101,4}, // BISHOP
+    {0b0111,4}, // ROOK
+    {0b1001,4}, // QUEEN
+  };
+
+  // Pack sfen and store in data[32].
+  void SfenPacker::pack(const Position& pos)
+  {
+  // cout << pos;
+
+    memset(data, 0, 32 /* 256bit */);
+    stream.set_data(data);
+
+    // turn
+    // Side to move.
+    stream.write_one_bit((int)(pos.side_to_move()));
+
+    // 7-bit positions for leading and trailing balls
+    // White king and black king, 6 bits for each.
+    for(auto c: Colors)
+      stream.write_n_bit(pos.king_square(c), 6);
+
+    // Write the pieces on the board other than the kings.
+    for (Rank r = RANK_8; r >= RANK_1; --r)
+    {
+      for (File f = FILE_A; f <= FILE_H; ++f)
+      {
+        Piece pc = pos.piece_on(make_square(f, r));
+        if (type_of(pc) == KING)
+          continue;
+        write_board_piece_to_stream(pc);
+      }
+    }
+
+    // TODO(someone): Support chess960.
+    stream.write_one_bit(pos.can_castle(WHITE_OO));
+    stream.write_one_bit(pos.can_castle(WHITE_OOO));
+    stream.write_one_bit(pos.can_castle(BLACK_OO));
+    stream.write_one_bit(pos.can_castle(BLACK_OOO));
+
+    if (pos.ep_square() == SQ_NONE) {
+      stream.write_one_bit(0);
+    }
+    else {
+      stream.write_one_bit(1);
+      stream.write_n_bit(static_cast<int>(pos.ep_square()), 6);
+    }
+
+    stream.write_n_bit(pos.state()->rule50, 6);
+
+    stream.write_n_bit(1 + (pos.game_ply()-(pos.side_to_move() == BLACK)) / 2, 8);
+
+    assert(stream.get_cursor() <= 256);
+  }
+
+  // Output the board pieces to stream.
+  void SfenPacker::write_board_piece_to_stream(Piece pc)
+  {
+    // piece type
+    PieceType pr = type_of(pc);
+    auto c = huffman_table[pr];
+    stream.write_n_bit(c.code, c.bits);
+
+    if (pc == NO_PIECE)
+      return;
+
+    // first and second flag
+    stream.write_one_bit(color_of(pc));
+  }
+
+  // Read one board piece from stream
+  Piece SfenPacker::read_board_piece_from_stream()
+  {
+    PieceType pr = NO_PIECE_TYPE;
+    int code = 0, bits = 0;
+    while (true)
+    {
+      code |= stream.read_one_bit() << bits;
+      ++bits;
+
+      assert(bits <= 6);
+
+      for (pr = NO_PIECE_TYPE; pr <KING; ++pr)
+        if (huffman_table[pr].code == code
+          && huffman_table[pr].bits == bits)
+          goto Found;
+    }
+  Found:;
+    if (pr == NO_PIECE_TYPE)
+      return NO_PIECE;
+
+    // first and second flag
+    Color c = (Color)stream.read_one_bit();
+
+    return make_piece(c, pr);
+  }
+
+  int set_from_packed_sfen(Position& pos, const PackedSfen& sfen, StateInfo* si, Thread* th, bool mirror)
+  {
+    SfenPacker packer;
+    auto& stream = packer.stream;
+
+    // TODO: separate streams for writing and reading. Here we actually have to
+    // const_cast which is not safe in the long run.
+    stream.set_data(const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(&sfen)));
+
+    pos.clear();
+    std::memset(si, 0, sizeof(StateInfo));
+    std::fill_n(&pos.pieceList[0][0], sizeof(pos.pieceList) / sizeof(Square), SQ_NONE);
+    pos.st = si;
+
+    // Active color
+    pos.sideToMove = (Color)stream.read_one_bit();
+
+    pos.pieceList[W_KING][0] = SQUARE_NB;
+    pos.pieceList[B_KING][0] = SQUARE_NB;
+
+    // First the position of the ball
+    if (mirror)
+    {
+      for (auto c : Colors)
+        pos.board[flip_file((Square)stream.read_n_bit(6))] = make_piece(c, KING);
+    }
+    else
+    {
+      for (auto c : Colors)
+        pos.board[stream.read_n_bit(6)] = make_piece(c, KING);
+    }
+
+    // Piece placement
+    for (Rank r = RANK_8; r >= RANK_1; --r)
+    {
+      for (File f = FILE_A; f <= FILE_H; ++f)
+      {
+        auto sq = make_square(f, r);
+        if (mirror) {
+          sq = flip_file(sq);
+        }
+
+        // it seems there are already balls
+        Piece pc;
+        if (type_of(pos.board[sq]) != KING)
+        {
+          assert(pos.board[sq] == NO_PIECE);
+          pc = packer.read_board_piece_from_stream();
+        }
+        else
+        {
+          pc = pos.board[sq];
+          // put_piece() will catch ASSERT unless you remove it all.
+          pos.board[sq] = NO_PIECE;
+        }
+
+        // There may be no pieces, so skip in that case.
+        if (pc == NO_PIECE)
+          continue;
+
+        pos.put_piece(Piece(pc), sq);
+
+        if (stream.get_cursor()> 256)
+          return 1;
+
+        //assert(stream.get_cursor() <= 256);
+      }
+    }
+
+    // Castling availability.
+    // TODO(someone): Support chess960.
+    pos.st->castlingRights = 0;
+    if (stream.read_one_bit()) {
+      Square rsq;
+      for (rsq = relative_square(WHITE, SQ_H1); pos.piece_on(rsq) != W_ROOK; --rsq) {}
+      pos.set_castling_right(WHITE, rsq);
+    }
+    if (stream.read_one_bit()) {
+      Square rsq;
+      for (rsq = relative_square(WHITE, SQ_A1); pos.piece_on(rsq) != W_ROOK; ++rsq) {}
+      pos.set_castling_right(WHITE, rsq);
+    }
+    if (stream.read_one_bit()) {
+      Square rsq;
+      for (rsq = relative_square(BLACK, SQ_H1); pos.piece_on(rsq) != B_ROOK; --rsq) {}
+      pos.set_castling_right(BLACK, rsq);
+    }
+    if (stream.read_one_bit()) {
+      Square rsq;
+      for (rsq = relative_square(BLACK, SQ_A1); pos.piece_on(rsq) != B_ROOK; ++rsq) {}
+      pos.set_castling_right(BLACK, rsq);
+    }
+
+    // En passant square. Ignore if no pawn capture is possible
+    if (stream.read_one_bit()) {
+      Square ep_square = static_cast<Square>(stream.read_n_bit(6));
+      if (mirror) {
+        ep_square = flip_file(ep_square);
+      }
+      pos.st->epSquare = ep_square;
+
+      if (!(pos.attackers_to(pos.st->epSquare) & pos.pieces(pos.sideToMove, PAWN))
+        || !(pos.pieces(~pos.sideToMove, PAWN) & (pos.st->epSquare + pawn_push(~pos.sideToMove))))
+        pos.st->epSquare = SQ_NONE;
+    }
+    else {
+      pos.st->epSquare = SQ_NONE;
+    }
+
+    // Halfmove clock
+    pos.st->rule50 = static_cast<Square>(stream.read_n_bit(6));
+
+    // Fullmove number
+    pos.gamePly = static_cast<Square>(stream.read_n_bit(8));
+
+    // Convert from fullmove starting from 1 to gamePly starting from 0,
+    // handle also common incorrect FEN with fullmove = 0.
+    pos.gamePly = std::max(2 * (pos.gamePly - 1), 0) + (pos.sideToMove == BLACK);
+
+    assert(stream.get_cursor() <= 256);
+
+    pos.chess960 = false;
+    pos.thisThread = th;
+    pos.set_state(pos.st);
+
+    assert(pos_is_ok());
+
+    return 0;
+  }
+
+  PackedSfen sfen_pack(Position& pos)
+  {
+    PackedSfen sfen;
+
+    SfenPacker sp;
+    sp.data = (uint8_t*)&sfen;
+    sp.pack(pos);
+
+    return sfen;
+  }
+}
+
+
+#endif // USE_SFEN_PACKER
diff --git a/src/learn/sfen_packer.h b/src/learn/sfen_packer.h
new file mode 100644
index 00000000..af900902
--- /dev/null
+++ b/src/learn/sfen_packer.h
@@ -0,0 +1,24 @@
+#ifndef _SFEN_PACKER_H_
+#define _SFEN_PACKER_H_
+
+#if defined(EVAL_LEARN)
+
+#include "types.h"
+
+#include "learn/packed_sfen.h"
+
+#include <cstdint>
+
+class Position;
+struct StateInfo;
+class Thread;
+
+namespace Learner {
+
+    int set_from_packed_sfen(Position& pos, const PackedSfen& sfen, StateInfo* si, Thread* th, bool mirror);
+    PackedSfen sfen_pack(Position& pos);
+}
+
+#endif
+
+#endif
\ No newline at end of file

From 3c87d4fa9b8ec9951c69141ebf426ea4495a8bbd Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 11 Sep 2020 12:22:03 +0200
Subject: [PATCH 256/583] "Fix" warning when memsetting Position

---
 src/position.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/position.h b/src/position.h
index 10cf45ba..aa2d34e7 100644
--- a/src/position.h
+++ b/src/position.h
@@ -192,6 +192,8 @@ public:
   // PackedSfen does not include gamePly so it cannot be restored. If you want to set it, specify it with an argument.
   int set_from_packed_sfen(const Learner::PackedSfen& sfen, StateInfo* si, Thread* th, bool mirror = false);
 
+  void clear() { std::memset(this, 0, sizeof(Position)); }
+
   // Give the board, hand piece, and turn, and return the sfen.
   //static std::string sfen_from_rawdata(Piece board[81], Hand hands[2], Color turn, int gamePly);
 

From 98f24570abe9605df21f786921a41f34fdfaf2fc Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 11 Sep 2020 12:23:29 +0200
Subject: [PATCH 257/583] Add src to include paths, remove non-standard ".." in
 includes in learn directory.

---
 src/Makefile                 |  2 +-
 src/learn/convert.cpp        | 22 ++++++++++++----------
 src/learn/gensfen.cpp        | 24 ++++++++++++++----------
 src/learn/gensfen.h          |  4 ++--
 src/learn/half_float.h       |  2 +-
 src/learn/learn.cpp          | 26 +++++++++++++++-----------
 src/learn/learn.h            |  5 ++---
 src/learn/learning_tools.cpp |  2 +-
 src/learn/learning_tools.h   |  6 +++---
 src/learn/multi_think.cpp    | 14 +++++++-------
 src/learn/multi_think.h      | 11 ++++++-----
 11 files changed, 64 insertions(+), 54 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index aa13603a..ac0b7338 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -321,7 +321,7 @@ endif
 ### ==========================================================================
 
 ### 3.1 Selecting compiler (default = gcc)
-CXXFLAGS += -g -Wall -Wcast-qual -fno-exceptions -std=c++17 $(EXTRACXXFLAGS) $(LEARNCXXFLAGS)
+CXXFLAGS += -g -Wall -Wcast-qual -fno-exceptions -std=c++17 -I. $(EXTRACXXFLAGS) $(LEARNCXXFLAGS)
 DEPENDFLAGS += -std=c++17
 LDFLAGS += $(EXTRALDFLAGS) $(LEARNLDFLAGS)
 
diff --git a/src/learn/convert.cpp b/src/learn/convert.cpp
index d50233eb..e9dcb10b 100644
--- a/src/learn/convert.cpp
+++ b/src/learn/convert.cpp
@@ -2,18 +2,20 @@
 
 #include "convert.h"
 
-// evaluate header for learning
-#include "../eval/evaluate_common.h"
-
 #include "multi_think.h"
-#include "../uci.h"
-#include "../syzygy/tbprobe.h"
-#include "../misc.h"
-#include "../thread.h"
-#include "../position.h"
-#include "../tt.h"
 
-#include "../extra/nnue_data_binpack_format.h"
+#include "uci.h"
+#include "misc.h"
+#include "thread.h"
+#include "position.h"
+#include "tt.h"
+
+// evaluate header for learning
+#include "eval/evaluate_common.h"
+
+#include "extra/nnue_data_binpack_format.h"
+
+#include "syzygy/tbprobe.h"
 
 #include <sstream>
 #include <fstream>
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 9f53e983..ebf47188 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -1,19 +1,23 @@
 ﻿#if defined(EVAL_LEARN)
 
 #include "gensfen.h"
-#include "packed_sfen.h"
 
-#include "../eval/evaluate_common.h"
-#include "../misc.h"
-#include "../nnue/evaluate_nnue_learner.h"
-#include "../position.h"
-#include "../syzygy/tbprobe.h"
-#include "../thread.h"
-#include "../tt.h"
-#include "../uci.h"
+#include "packed_sfen.h"
 #include "multi_think.h"
 
-#include "../extra/nnue_data_binpack_format.h"
+#include "misc.h"
+#include "position.h"
+#include "thread.h"
+#include "tt.h"
+#include "uci.h"
+
+#include "eval/evaluate_common.h"
+
+#include "extra/nnue_data_binpack_format.h"
+
+#include "nnue/evaluate_nnue_learner.h"
+
+#include "syzygy/tbprobe.h"
 
 #include <chrono>
 #include <climits>
diff --git a/src/learn/gensfen.h b/src/learn/gensfen.h
index dd0f71fb..45e4ca23 100644
--- a/src/learn/gensfen.h
+++ b/src/learn/gensfen.h
@@ -1,9 +1,9 @@
 #ifndef _GENSFEN_H_
 #define _GENSFEN_H_
 
-#include <sstream>
+#include "position.h"
 
-#include "../position.h"
+#include <sstream>
 
 #if defined(EVAL_LEARN)
 namespace Learner {
diff --git a/src/learn/half_float.h b/src/learn/half_float.h
index 30b3e482..ebe77526 100644
--- a/src/learn/half_float.h
+++ b/src/learn/half_float.h
@@ -7,7 +7,7 @@
 // Floating point operation by 16bit type
 // Assume that the float type code generated by the compiler is in IEEE 754 format and use it.
 
-#include "../types.h"
+#include "types.h"
 
 namespace HalfFloat
 {
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index f4f7b409..b5df2276 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -20,20 +20,24 @@
 #if defined(EVAL_LEARN)
 
 #include "learn.h"
-#include "convert.h"
 
-#include "../eval/evaluate_common.h"
-#include "../misc.h"
-#include "../nnue/evaluate_nnue_learner.h"
-#include "../position.h"
-#include "../syzygy/tbprobe.h"
-#include "../thread.h"
-#include "../tt.h"
-#include "../uci.h"
-#include "../search.h"
+#include "convert.h"
 #include "multi_think.h"
 
-#include "../extra/nnue_data_binpack_format.h"
+#include "misc.h"
+#include "position.h"
+#include "thread.h"
+#include "tt.h"
+#include "uci.h"
+#include "search.h"
+
+#include "eval/evaluate_common.h"
+
+#include "extra/nnue_data_binpack_format.h"
+
+#include "nnue/evaluate_nnue_learner.h"
+
+#include "syzygy/tbprobe.h"
 
 #include <chrono>
 #include <climits>
diff --git a/src/learn/learn.h b/src/learn/learn.h
index b8acc2df..7ee89009 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -3,8 +3,6 @@
 
 #if defined(EVAL_LEARN)
 
-#include <vector>
-
 // ----------------------
 // Floating point for learning
 // ----------------------
@@ -39,9 +37,10 @@ using LearnFloatType = float;
 
 #include "packed_sfen.h"
 
-#include "../position.h"
+#include "position.h"
 
 #include <sstream>
+#include <vector>
 
 namespace Learner
 {
diff --git a/src/learn/learning_tools.cpp b/src/learn/learning_tools.cpp
index eca11c47..285b3487 100644
--- a/src/learn/learning_tools.cpp
+++ b/src/learn/learning_tools.cpp
@@ -2,7 +2,7 @@
 
 #if defined (EVAL_LEARN)
 
-#include "../misc.h"
+#include "misc.h"
 
 using namespace Eval;
 
diff --git a/src/learn/learning_tools.h b/src/learn/learning_tools.h
index 1f9bdf96..194a9732 100644
--- a/src/learn/learning_tools.h
+++ b/src/learn/learning_tools.h
@@ -3,11 +3,11 @@
 
 // A set of machine learning tools related to the weight array used for machine learning of evaluation functions
 
-#include "learn.h"
-
 #if defined (EVAL_LEARN)
 
-#include "../misc.h"  // PRNG , my_insertion_sort
+#include "learn.h"
+
+#include "misc.h"  // PRNG , my_insertion_sort
 
 #include <array>
 #include <cmath>	// std::sqrt()
diff --git a/src/learn/multi_think.cpp b/src/learn/multi_think.cpp
index 82ebeabb..28b3e152 100644
--- a/src/learn/multi_think.cpp
+++ b/src/learn/multi_think.cpp
@@ -1,10 +1,10 @@
-﻿#include "../types.h"
-
-#if defined(EVAL_LEARN)
+﻿#if defined(EVAL_LEARN)
 
 #include "multi_think.h"
-#include "../tt.h"
-#include "../uci.h"
+
+#include "tt.h"
+#include "uci.h"
+#include "types.h"
 
 #include <thread>
 
@@ -35,13 +35,13 @@ void MultiThink::go_think()
 
 	// Secure end flag of worker thread
 	thread_finished.resize(thread_num);
-	
+
 	// start worker thread
 	for (size_t i = 0; i < thread_num; ++i)
 	{
 		thread_finished[i] = 0;
 		threads.push_back(std::thread([i, this]
-		{ 
+		{
 			// exhaust all processor threads.
 			WinProcGroup::bindThisThread(i);
 
diff --git a/src/learn/multi_think.h b/src/learn/multi_think.h
index 6225144c..4f423da0 100644
--- a/src/learn/multi_think.h
+++ b/src/learn/multi_think.h
@@ -3,15 +3,16 @@
 
 #if defined(EVAL_LEARN)
 
-#include <functional>
-#include <mutex>
+#include "learn.h"
 
-#include "../misc.h"
-#include "../learn/learn.h"
-#include "../thread_win32_osx.h"
+#include "misc.h"
+#include "thread_win32_osx.h"
 
 #include <atomic>
 #include <limits>
+#include <functional>
+#include <mutex>
+
 
 // Learning from a game record, when making yourself think and generating a fixed track, etc.
 // Helper class used when multiple threads want to call Search::think() individually.

From 3388c22d7165429c040e468c43b326364c19122b Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 11 Sep 2020 13:06:53 +0200
Subject: [PATCH 258/583] Fix incorrect use of UCI::Option of type "combo".

---
 src/evaluate.cpp  | 2 +-
 src/ucioption.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 94581998..3b0b0f88 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -37,7 +37,7 @@ namespace Eval {
   UseNNUEMode useNNUE;
   std::string eval_file_loaded="None";
 
-  static UseNNUEMode nnue_mode_from_option(const std::string& mode)
+  static UseNNUEMode nnue_mode_from_option(const UCI::Option& mode)
   {
     if (mode == "false")
       return UseNNUEMode::False;
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 61e47539..91fa199b 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -86,7 +86,7 @@ void init(OptionsMap& o) {
   o["SyzygyProbeDepth"]      << Option(1, 1, 100);
   o["Syzygy50MoveRule"]      << Option(true);
   o["SyzygyProbeLimit"]      << Option(7, 0, 7);
-#ifdef EVAL_LEARN
+#if defined(EVAL_LEARN)
   o["Use NNUE"]              << Option("true var true var false var pure", "true", on_use_NNUE);
 #else
   o["Use NNUE"]              << Option("true var true var false", "true", on_use_NNUE);

From bcfe28b2ae468d045e2f96b659401422531bacba Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 11 Sep 2020 13:07:16 +0200
Subject: [PATCH 259/583] Fix compilation of sfen_packer.cpp in debug.

---
 src/learn/sfen_packer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/learn/sfen_packer.cpp b/src/learn/sfen_packer.cpp
index 236c875f..791870ca 100644
--- a/src/learn/sfen_packer.cpp
+++ b/src/learn/sfen_packer.cpp
@@ -386,7 +386,7 @@ namespace Learner {
     pos.thisThread = th;
     pos.set_state(pos.st);
 
-    assert(pos_is_ok());
+    assert(pos.pos_is_ok());
 
     return 0;
   }

From 580b09381b0fa42d3bce3e5eb7acc10a15675cf3 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Sat, 12 Sep 2020 14:11:46 +0200
Subject: [PATCH 260/583] Add a learning command to CI

fixes a small issue, with ponder

Probably the learning command can be improved a bit, so that despite the limited data, the code coverage is better.
---
 src/learn/learn.cpp         |  2 ++
 tests/instrumented_learn.sh | 37 ++++++++++++++++++++++++++++++-------
 2 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index b5df2276..0459dd90 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -1981,6 +1981,8 @@ namespace Learner
         // Read evaluation function parameters
         Eval::init_NNUE();
 
+        Threads.main()->ponder = false;
+
         cout << "init_training.." << endl;
         Eval::NNUE::InitializeTraining(eta1, eta1_epoch, eta2, eta2_epoch, eta3);
         Eval::NNUE::SetBatchSize(nn_batch_size);
diff --git a/tests/instrumented_learn.sh b/tests/instrumented_learn.sh
index 147c0c97..71f9421c 100755
--- a/tests/instrumented_learn.sh
+++ b/tests/instrumented_learn.sh
@@ -64,8 +64,8 @@ EOF
   ;;
 esac
 
-mkdir -p training_data_01
-mkdir -p training_data_02
+mkdir -p training_data
+mkdir -p validation_data
 
 # gensfen testing 01
 cat << EOF > gensfen01.exp
@@ -78,9 +78,9 @@ cat << EOF > gensfen01.exp
  send "setoption name Threads value $threads\n"
  send "setoption name Use NNUE value false\n"
  send "isready\n"
- send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data_01/training_data.bin use_raw_nnue_eval 0 sfen_format bin\n"
+ send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data/training_data.bin use_raw_nnue_eval 0 sfen_format bin\n"
  expect "gensfen finished."
- send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data_01/training_data.binpack use_raw_nnue_eval 0 sfen_format binpack\n"
+ send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data/training_data.binpack use_raw_nnue_eval 0 sfen_format binpack\n"
  expect "gensfen finished."
 
  send "quit\n"
@@ -102,9 +102,9 @@ cat << EOF > gensfen02.exp
  send "setoption name Threads value $threads\n"
  send "setoption name Use NNUE value true\n"
  send "isready\n"
- send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data_01/training_data.bin use_raw_nnue_eval 0 sfen_format bin\n"
+ send "gensfen depth 4 loop 50 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name validation_data/valdidation_data.bin use_raw_nnue_eval 0 sfen_format bin\n"
  expect "gensfen finished."
- send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data_01/training_data.binpack use_raw_nnue_eval 0 sfen_format binpack\n"
+ send "gensfen depth 4 loop 50 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name validation_data/validation_data.binpack use_raw_nnue_eval 0 sfen_format binpack\n"
  expect "gensfen finished."
 
  send "quit\n"
@@ -115,7 +115,30 @@ cat << EOF > gensfen02.exp
  exit \$value
 EOF
 
-for exp in gensfen01.exp gensfen02.exp
+# simple learning
+cat << EOF > learn01.exp
+ set timeout 240
+ spawn $exeprefix ./stockfish
+
+ send "uci\n"
+ send "setoption name SkipLoadingEval value true\n"
+ send "setoption name Use NNUE value true\n"
+ send "setoption name Threads value $threads\n"
+ send "isready\n"
+ send "learn targetdir training_data loop 2 batchsize 100 use_draw_in_training 1 use_draw_in_validation 1 eta 1 lambda 1 eval_limit 32000 nn_batch_size 30 newbob_decay 0.5 eval_save_interval 30 loss_output_interval 10 mirror_percentage 50 validation_set_file_name validation_data/validation_data.bin\n"
+
+ expect "save_eval() finished."
+
+ send "quit\n"
+ expect eof
+
+ # return error code of the spawned program, useful for valgrind
+ lassign [wait] pid spawnid os_error_flag value
+ exit \$value
+
+EOF
+
+for exp in gensfen01.exp gensfen02.exp learn01.exp
 do
 
   echo "$prefix expect $exp $postfix"

From 8d499e6efa924c4214bfeef65a1368c3f8b025bf Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Sat, 12 Sep 2020 14:36:43 +0200
Subject: [PATCH 261/583] Fix flags for dependency generation
 (98f24570abe9605df21f786921a41f34fdfaf2fc)

---
 src/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index ac0b7338..35030be7 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -322,7 +322,7 @@ endif
 
 ### 3.1 Selecting compiler (default = gcc)
 CXXFLAGS += -g -Wall -Wcast-qual -fno-exceptions -std=c++17 -I. $(EXTRACXXFLAGS) $(LEARNCXXFLAGS)
-DEPENDFLAGS += -std=c++17
+DEPENDFLAGS += -std=c++17 -I.
 LDFLAGS += $(EXTRALDFLAGS) $(LEARNLDFLAGS)
 
 ifeq ($(COMP),)
@@ -928,6 +928,6 @@ profile-learn: net config-sanity objclean profileclean
 	rm generated_kifu.bin
 
 .depend:
-	-@$(CXX) $(DEPENDFLAGS) -MM $(SRCS) > $@ 2> /dev/null
+	-@$(CXX) $(DEPENDFLAGS) -MM $(SRCS) > $@
 
 -include .depend

From d33e7a9b07d1aae2edf72f87ae0ba00db5a15cd9 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 12 Sep 2020 16:19:24 +0200
Subject: [PATCH 262/583] Remove conditional compilation on EVAL_LEARN

---
 src/eval/evaluate_common.h                     |  4 ----
 src/evaluate.cpp                               |  5 -----
 src/evaluate.h                                 |  7 ++-----
 src/learn/convert.cpp                          |  3 ---
 src/learn/convert.h                            |  2 --
 src/learn/gensfen.cpp                          |  5 +----
 src/learn/gensfen.h                            |  2 --
 src/learn/learn.cpp                            |  4 ----
 src/learn/learn.h                              |  4 ----
 src/learn/learning_tools.cpp                   |  4 ----
 src/learn/learning_tools.h                     |  3 ---
 src/learn/multi_think.cpp                      |  7 +------
 src/learn/multi_think.h                        |  4 ----
 src/learn/packed_sfen.h                        |  3 ---
 src/learn/sfen_packer.cpp                      |  7 +------
 src/learn/sfen_packer.h                        |  4 ----
 src/nnue/evaluate_nnue_learner.cpp             |  4 ----
 src/nnue/evaluate_nnue_learner.h               |  4 ----
 src/nnue/trainer/trainer.h                     |  4 ----
 src/nnue/trainer/trainer_affine_transform.h    |  4 ----
 src/nnue/trainer/trainer_clipped_relu.h        |  4 ----
 src/nnue/trainer/trainer_feature_transformer.h |  4 ----
 src/nnue/trainer/trainer_input_slice.h         |  4 ----
 src/nnue/trainer/trainer_sum.h                 |  4 ----
 src/position.cpp                               |  6 ------
 src/position.h                                 |  4 ----
 src/search.cpp                                 | 10 ----------
 src/search.h                                   |  9 ---------
 src/tt.cpp                                     |  4 ----
 src/tt.h                                       |  2 --
 src/uci.cpp                                    |  7 +------
 src/ucioption.cpp                              |  8 --------
 32 files changed, 6 insertions(+), 144 deletions(-)

diff --git a/src/eval/evaluate_common.h b/src/eval/evaluate_common.h
index 7799fe79..47e69a44 100644
--- a/src/eval/evaluate_common.h
+++ b/src/eval/evaluate_common.h
@@ -1,8 +1,6 @@
 ﻿#ifndef _EVALUATE_COMMON_H_
 #define _EVALUATE_COMMON_H_
 
-#if defined(EVAL_LEARN)
-
 // A common header-like function for modern evaluation functions.
 
 #include <string>
@@ -21,6 +19,4 @@ namespace Eval
 	double get_eta();
 }
 
-#endif // defined(EVAL_LEARN)
-
 #endif // _EVALUATE_KPPT_COMMON_H_
diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 3b0b0f88..e619a747 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -43,11 +43,8 @@ namespace Eval {
       return UseNNUEMode::False;
     else if (mode == "true")
       return UseNNUEMode::True;
-
-#ifdef EVAL_LEARN
     else if (mode == "pure")
       return UseNNUEMode::Pure;
-#endif
 
     return UseNNUEMode::False;
   }
@@ -955,11 +952,9 @@ make_v:
 /// evaluation of the position from the point of view of the side to move.
 
 Value Eval::evaluate(const Position& pos) {
-#ifdef EVAL_LEARN
   if (useNNUE == UseNNUEMode::Pure) {
       return NNUE::evaluate(pos);
   }
-#endif
 
   bool classical = useNNUE == UseNNUEMode::False
                 || abs(eg_value(pos.psq_score())) * 16 > NNUEThreshold1 * (16 + pos.rule50_count());
diff --git a/src/evaluate.h b/src/evaluate.h
index 61052e90..900a77fc 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -29,11 +29,8 @@ namespace Eval {
   enum struct UseNNUEMode
   {
     False,
-    True
-
-#ifdef EVAL_LEARN
-    ,Pure
-#endif
+    True,
+    Pure
   };
 
   std::string trace(const Position& pos);
diff --git a/src/learn/convert.cpp b/src/learn/convert.cpp
index e9dcb10b..483296a1 100644
--- a/src/learn/convert.cpp
+++ b/src/learn/convert.cpp
@@ -1,5 +1,3 @@
-#if defined(EVAL_LEARN)
-
 #include "convert.h"
 
 #include "multi_think.h"
@@ -606,4 +604,3 @@ namespace Learner
         convert(args);
     }
 }
-#endif
diff --git a/src/learn/convert.h b/src/learn/convert.h
index a79820a3..a41885d9 100644
--- a/src/learn/convert.h
+++ b/src/learn/convert.h
@@ -5,7 +5,6 @@
 #include <string>
 #include <sstream>
 
-#if defined(EVAL_LEARN)
 namespace Learner {
     void convert_bin_from_pgn_extract(
         const std::vector<std::string>& filenames,
@@ -32,6 +31,5 @@ namespace Learner {
 
     void convert(std::istringstream& is);
 }
-#endif
 
 #endif
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index ebf47188..afbcce37 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -1,6 +1,4 @@
-﻿#if defined(EVAL_LEARN)
-
-#include "gensfen.h"
+﻿#include "gensfen.h"
 
 #include "packed_sfen.h"
 #include "multi_think.h"
@@ -1207,4 +1205,3 @@ namespace Learner
         std::cout << "gensfen finished." << endl;
     }
 }
-#endif
diff --git a/src/learn/gensfen.h b/src/learn/gensfen.h
index 45e4ca23..d39e44c9 100644
--- a/src/learn/gensfen.h
+++ b/src/learn/gensfen.h
@@ -5,12 +5,10 @@
 
 #include <sstream>
 
-#if defined(EVAL_LEARN)
 namespace Learner {
 
     // Automatic generation of teacher position
     void gen_sfen(Position& pos, std::istringstream& is);
 }
-#endif
 
 #endif
\ No newline at end of file
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 0459dd90..3f951888 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -17,8 +17,6 @@
 // → I will not be involved in the engine because it is a problem that the GUI should assist.
 // etc..
 
-#if defined(EVAL_LEARN)
-
 #include "learn.h"
 
 #include "convert.h"
@@ -2048,5 +2046,3 @@ namespace Learner
     }
 
 } // namespace Learner
-
-#endif // EVAL_LEARN
diff --git a/src/learn/learn.h b/src/learn/learn.h
index 7ee89009..4b09f825 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -1,8 +1,6 @@
 ﻿#ifndef _LEARN_H_
 #define _LEARN_H_
 
-#if defined(EVAL_LEARN)
-
 // ----------------------
 // Floating point for learning
 // ----------------------
@@ -78,6 +76,4 @@ namespace Learner
     void learn(Position& pos, std::istringstream& is);
 }
 
-#endif
-
 #endif // ifndef _LEARN_H_
diff --git a/src/learn/learning_tools.cpp b/src/learn/learning_tools.cpp
index 285b3487..925905c6 100644
--- a/src/learn/learning_tools.cpp
+++ b/src/learn/learning_tools.cpp
@@ -1,7 +1,5 @@
 ﻿#include "learning_tools.h"
 
-#if defined (EVAL_LEARN)
-
 #include "misc.h"
 
 using namespace Eval;
@@ -18,5 +16,3 @@ namespace EvalLearningTools
 	uint64_t Weight::eta1_epoch;
 	uint64_t Weight::eta2_epoch;
 }
-
-#endif
diff --git a/src/learn/learning_tools.h b/src/learn/learning_tools.h
index 194a9732..dcb2c4aa 100644
--- a/src/learn/learning_tools.h
+++ b/src/learn/learning_tools.h
@@ -3,8 +3,6 @@
 
 // A set of machine learning tools related to the weight array used for machine learning of evaluation functions
 
-#if defined (EVAL_LEARN)
-
 #include "learn.h"
 
 #include "misc.h"  // PRNG , my_insertion_sort
@@ -98,5 +96,4 @@ namespace EvalLearningTools
 	};
 }
 
-#endif // defined (EVAL_LEARN)
 #endif
diff --git a/src/learn/multi_think.cpp b/src/learn/multi_think.cpp
index 28b3e152..043238fa 100644
--- a/src/learn/multi_think.cpp
+++ b/src/learn/multi_think.cpp
@@ -1,6 +1,4 @@
-﻿#if defined(EVAL_LEARN)
-
-#include "multi_think.h"
+﻿#include "multi_think.h"
 
 #include "tt.h"
 #include "uci.h"
@@ -118,6 +116,3 @@ void MultiThink::go_think()
 		Options[s.first] = std::string(s.second);
 
 }
-
-
-#endif // defined(EVAL_LEARN)
diff --git a/src/learn/multi_think.h b/src/learn/multi_think.h
index 4f423da0..7de9d6b9 100644
--- a/src/learn/multi_think.h
+++ b/src/learn/multi_think.h
@@ -1,8 +1,6 @@
 ﻿#ifndef _MULTI_THINK_
 #define _MULTI_THINK_
 
-#if defined(EVAL_LEARN)
-
 #include "learn.h"
 
 #include "misc.h"
@@ -151,6 +149,4 @@ protected:
 	std::mutex task_mutex;
 };
 
-#endif // defined(EVAL_LEARN) && defined(YANEURAOU_2018_OTAFUKU_ENGINE)
-
 #endif
diff --git a/src/learn/packed_sfen.h b/src/learn/packed_sfen.h
index 101e5e34..3aa4fcac 100644
--- a/src/learn/packed_sfen.h
+++ b/src/learn/packed_sfen.h
@@ -4,7 +4,6 @@
 #include <vector>
 #include <cstdint>
 
-#if defined(EVAL_LEARN)
 namespace Learner {
 
     // packed sfen
@@ -45,5 +44,3 @@ namespace Learner {
     using PSVector = std::vector<PackedSfenValue>;
 }
 #endif
-
-#endif
diff --git a/src/learn/sfen_packer.cpp b/src/learn/sfen_packer.cpp
index 791870ca..734a477b 100644
--- a/src/learn/sfen_packer.cpp
+++ b/src/learn/sfen_packer.cpp
@@ -1,6 +1,4 @@
-﻿#if defined (EVAL_LEARN)
-
-#include "sfen_packer.h"
+﻿#include "sfen_packer.h"
 
 #include "packed_sfen.h"
 
@@ -402,6 +400,3 @@ namespace Learner {
     return sfen;
   }
 }
-
-
-#endif // USE_SFEN_PACKER
diff --git a/src/learn/sfen_packer.h b/src/learn/sfen_packer.h
index af900902..533d3fc9 100644
--- a/src/learn/sfen_packer.h
+++ b/src/learn/sfen_packer.h
@@ -1,8 +1,6 @@
 #ifndef _SFEN_PACKER_H_
 #define _SFEN_PACKER_H_
 
-#if defined(EVAL_LEARN)
-
 #include "types.h"
 
 #include "learn/packed_sfen.h"
@@ -19,6 +17,4 @@ namespace Learner {
     PackedSfen sfen_pack(Position& pos);
 }
 
-#endif
-
 #endif
\ No newline at end of file
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 8b0413e5..ea680e31 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -1,7 +1,5 @@
 ﻿// Code for learning NNUE evaluation function
 
-#if defined(EVAL_LEARN)
-
 #include <random>
 #include <fstream>
 #include <filesystem>
@@ -238,5 +236,3 @@ double get_eta() {
 }
 
 }  // namespace Eval
-
-#endif  // defined(EVAL_LEARN)
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index 0e5fbcd2..e9bd2fd2 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -3,8 +3,6 @@
 #ifndef _EVALUATE_NNUE_LEARNER_H_
 #define _EVALUATE_NNUE_LEARNER_H_
 
-#if defined(EVAL_LEARN)
-
 #include "../learn/learn.h"
 
 namespace Eval {
@@ -41,6 +39,4 @@ void CheckHealth();
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_LEARN)
-
 #endif
diff --git a/src/nnue/trainer/trainer.h b/src/nnue/trainer/trainer.h
index 94553c07..659863ad 100644
--- a/src/nnue/trainer/trainer.h
+++ b/src/nnue/trainer/trainer.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TRAINER_H_
 #define _NNUE_TRAINER_H_
 
-#if defined(EVAL_LEARN)
-
 #include "../nnue_common.h"
 #include "../features/index_list.h"
 
@@ -120,6 +118,4 @@ std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments) {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_LEARN)
-
 #endif
diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index 4b5ddee6..50751ffe 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TRAINER_AFFINE_TRANSFORM_H_
 #define _NNUE_TRAINER_AFFINE_TRANSFORM_H_
 
-#if defined(EVAL_LEARN)
-
 #include "../../learn/learn.h"
 #include "../layers/affine_transform.h"
 #include "trainer.h"
@@ -296,6 +294,4 @@ class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_LEARN)
-
 #endif
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index 72575bf8..cf7a2447 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TRAINER_CLIPPED_RELU_H_
 #define _NNUE_TRAINER_CLIPPED_RELU_H_
 
-#if defined(EVAL_LEARN)
-
 #include "../../learn/learn.h"
 #include "../layers/clipped_relu.h"
 #include "trainer.h"
@@ -137,6 +135,4 @@ class Trainer<Layers::ClippedReLU<PreviousLayer>> {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_LEARN)
-
 #endif
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 6b94d952..190e009a 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
 #define _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
 
-#if defined(EVAL_LEARN)
-
 #include "../../learn/learn.h"
 #include "../nnue_feature_transformer.h"
 #include "trainer.h"
@@ -372,6 +370,4 @@ class Trainer<FeatureTransformer> {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_LEARN)
-
 #endif
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index 6b0adc9f..e2cd0c25 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TRAINER_INPUT_SLICE_H_
 #define _NNUE_TRAINER_INPUT_SLICE_H_
 
-#if defined(EVAL_LEARN)
-
 #include "../../learn/learn.h"
 #include "../layers/input_slice.h"
 #include "trainer.h"
@@ -246,6 +244,4 @@ class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_LEARN)
-
 #endif
diff --git a/src/nnue/trainer/trainer_sum.h b/src/nnue/trainer/trainer_sum.h
index 0b7abe36..65a0b681 100644
--- a/src/nnue/trainer/trainer_sum.h
+++ b/src/nnue/trainer/trainer_sum.h
@@ -3,8 +3,6 @@
 #ifndef _NNUE_TRAINER_SUM_H_
 #define _NNUE_TRAINER_SUM_H_
 
-#if defined(EVAL_LEARN)
-
 #include "../../learn/learn.h"
 #include "../layers/sum.h"
 #include "trainer.h"
@@ -185,6 +183,4 @@ class Trainer<Layers::Sum<PreviousLayer>> {
 
 }  // namespace Eval
 
-#endif  // defined(EVAL_LEARN)
-
 #endif
diff --git a/src/position.cpp b/src/position.cpp
index 9465afbc..38ac7c5c 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -32,10 +32,8 @@
 #include "uci.h"
 #include "syzygy/tbprobe.h"
 
-#if defined(EVAL_LEARN)
 #include "learn/packed_sfen.h"
 #include "learn/sfen_packer.h"
-#endif
 
 using std::string;
 
@@ -1352,8 +1350,6 @@ bool Position::pos_is_ok() const {
   return true;
 }
 
-#if defined(EVAL_LEARN)
-
 // Add a function that directly unpacks for speed. It's pretty tough.
 // Write it by combining packer::unpack() and Position::set().
 // If there is a problem with the passed phase and there is an error, non-zero is returned.
@@ -1385,5 +1381,3 @@ void Position::sfen_pack(Learner::PackedSfen& sfen)
 {
   sfen = Learner::sfen_pack(*this);
 }
-
-#endif
\ No newline at end of file
diff --git a/src/position.h b/src/position.h
index aa2d34e7..2163dca3 100644
--- a/src/position.h
+++ b/src/position.h
@@ -30,10 +30,8 @@
 
 #include "nnue/nnue_accumulator.h"
 
-#if defined(EVAL_LEARN)
 #include "learn/packed_sfen.h"
 #include "learn/sfen_packer.h"
-#endif
 
 
 /// StateInfo struct stores information needed to restore a Position object to
@@ -177,7 +175,6 @@ public:
   // Used by NNUE
   StateInfo* state() const;
 
-#if defined(EVAL_LEARN)
   // --sfenization helper
 
   friend int Learner::set_from_packed_sfen(Position& pos, const Learner::PackedSfen& sfen, StateInfo* si, Thread* th, bool mirror);
@@ -199,7 +196,6 @@ public:
 
   // Returns the position of the ball on the c side.
   Square king_square(Color c) const { return pieceList[make_piece(c, KING)][0]; }
-#endif // EVAL_LEARN
 
 private:
   // Initialization helpers (used while setting up a position)
diff --git a/src/search.cpp b/src/search.cpp
index b92ea7c8..f8cf3cbc 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -54,9 +54,7 @@ using std::string;
 using Eval::evaluate;
 using namespace Search;
 
-#if defined(EVAL_LEARN)
 bool Search::prune_at_shallow_depth_on_pv_node = false;
-#endif
 
 namespace {
 
@@ -991,9 +989,7 @@ moves_loop: // When in check, search starts from here
       ss->moveCount = ++moveCount;
 
       if (rootNode && thisThread == Threads.main() && Time.elapsed() > 3000
-#if defined(EVAL_LEARN)
           && !Limits.silent
-#endif
           )
           sync_cout << "info depth " << depth
                     << " currmove " << UCI::move(move, pos.is_chess960())
@@ -1011,9 +1007,7 @@ moves_loop: // When in check, search starts from here
 
       // Step 12. Pruning at shallow depth (~200 Elo)
       if (  !rootNode
-#ifdef EVAL_LEARN
           && (PvNode ? prune_at_shallow_depth_on_pv_node : true)
-#endif
           && pos.non_pawn_material(us)
           && bestValue > VALUE_TB_LOSS_IN_MAX_PLY)
       {
@@ -1564,10 +1558,8 @@ moves_loop: // When in check, search starts from here
 
       // Check for legality just before making the move
       if (
-#if defined(EVAL_LEARN)
         // HACK: pos.piece_on(from_sq(m)) sometimes will be NO_PIECE during machine learning.
         !pos.pseudo_legal(move) ||
-#endif // EVAL_LEARN
         !pos.legal(move)
         )
       {
@@ -1978,7 +1970,6 @@ void Tablebases::rank_root_moves(Position& pos, Search::RootMoves& rootMoves) {
 
 // --- expose the functions such as fixed depth search used for learning to the outside
 
-#if defined (EVAL_LEARN)
 
 namespace Learner
 {
@@ -2278,4 +2269,3 @@ namespace Learner
   }
 
 }
-#endif
diff --git a/src/search.h b/src/search.h
index 5e092273..20dfe909 100644
--- a/src/search.h
+++ b/src/search.h
@@ -32,10 +32,7 @@ namespace Search {
 /// Threshold used for countermoves based pruning
 constexpr int CounterMovePruneThreshold = 0;
 
-
-#if defined(EVAL_LEARN)
 extern bool prune_at_shallow_depth_on_pv_node;
-#endif
 
 /// Stack struct keeps track of the information we need to remember from nodes
 /// shallower and deeper in the tree during the search. Each search thread has
@@ -90,9 +87,7 @@ struct LimitsType {
     time[WHITE] = time[BLACK] = inc[WHITE] = inc[BLACK] = npmsec = movetime = TimePoint(0);
     movestogo = depth = mate = perft = infinite = 0;
     nodes = 0;
-#if defined (EVAL_LEARN)
     silent = false;
-#endif
   }
 
   bool use_time_management() const {
@@ -103,11 +98,9 @@ struct LimitsType {
   TimePoint time[COLOR_NB], inc[COLOR_NB], npmsec, movetime, startTime;
   int movestogo, depth, mate, perft, infinite;
   int64_t nodes;
-#if defined (EVAL_LEARN)
   // Silent mode that does not output to the screen (for continuous self-play in process)
   // Do not output PV at this time.
   bool silent;
-#endif
 };
 
 extern LimitsType Limits;
@@ -117,7 +110,6 @@ void clear();
 
 } // namespace Search
 
-#if defined(EVAL_LEARN)
 namespace Learner {
 
   // A pair of reader and evaluation value. Returned by Learner::search(),Learner::qsearch().
@@ -126,6 +118,5 @@ namespace Learner {
   ValueAndPV qsearch(Position& pos);
   ValueAndPV search(Position& pos, int depth_, size_t multiPV = 1, uint64_t nodesLimit = 0);
 }
-#endif
 
 #endif // #ifndef SEARCH_H_INCLUDED
diff --git a/src/tt.cpp b/src/tt.cpp
index fc8ab3b1..c64670ac 100644
--- a/src/tt.cpp
+++ b/src/tt.cpp
@@ -28,9 +28,7 @@
 
 TranspositionTable TT; // Our global transposition table
 
-#ifdef EVAL_LEARN
 bool TranspositionTable::enable_transposition_table = true;
-#endif
 
 /// TTEntry::save() populates the TTEntry with a new node's data, possibly
 /// overwriting an old position. Update is not atomic and can be racy.
@@ -120,12 +118,10 @@ void TranspositionTable::clear() {
 
 TTEntry* TranspositionTable::probe(const Key key, bool& found) const {
 
-#ifdef EVAL_LEARN
   if (!enable_transposition_table) {
       found = false;
       return first_entry(0);
   }
-#endif
 
   TTEntry* const tte = first_entry(key);
   const uint16_t key16 = (uint16_t)key;  // Use the low 16 bits as key inside the cluster
diff --git a/src/tt.h b/src/tt.h
index e83b6f3c..29072bd8 100644
--- a/src/tt.h
+++ b/src/tt.h
@@ -84,9 +84,7 @@ public:
     return &table[mul_hi64(key, clusterCount)].entry[0];
   }
 
-#ifdef EVAL_LEARN
   static bool enable_transposition_table;
-#endif
 
 private:
   friend struct TTEntry;
diff --git a/src/uci.cpp b/src/uci.cpp
index 0a28fc1f..1128d4d9 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -245,7 +245,6 @@ double UCI::win_rate_model_double(double v, int ply) {
 // Call qsearch(),search() directly for testing
 // --------------------
 
-#if defined(EVAL_LEARN)
 void qsearch_cmd(Position& pos)
 {
   cout << "qsearch : ";
@@ -277,8 +276,6 @@ void search_cmd(Position& pos, istringstream& is)
   cout << endl;
 }
 
-#endif
-
 /// UCI::loop() waits for a command from stdin, parses it and calls the appropriate
 /// function. Also intercepts EOF from stdin to ensure gracefully exiting if the
 /// GUI dies unexpectedly. When called with some command line arguments, e.g. to
@@ -334,7 +331,7 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "d")        sync_cout << pos << sync_endl;
       else if (token == "eval")     trace_eval(pos);
       else if (token == "compiler") sync_cout << compiler_info() << sync_endl;
-#if defined (EVAL_LEARN)
+
       else if (token == "gensfen") Learner::gen_sfen(pos, is);
       else if (token == "learn") Learner::learn(pos, is);
       else if (token == "convert") Learner::convert(is);
@@ -343,8 +340,6 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "qsearch") qsearch_cmd(pos);
       else if (token == "search") search_cmd(pos, is);
 
-#endif
-
       // test command
       else if (token == "test") test_cmd(pos, is);
       else
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 91fa199b..aa85dc07 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -42,14 +42,12 @@ void on_threads(const Option& o) { Threads.set(size_t(o)); }
 void on_tb_path(const Option& o) { Tablebases::init(o); }
 void on_use_NNUE(const Option& ) { Eval::init_NNUE(); }
 void on_eval_file(const Option& ) { Eval::init_NNUE(); }
-#ifdef EVAL_LEARN
 void on_prune_at_shallow_depth_on_pv_node(const Option& o) {
     Search::prune_at_shallow_depth_on_pv_node = o;
 }
 void on_enable_transposition_table(const Option& o) {
     TranspositionTable::enable_transposition_table = o;
 }
-#endif
 
 /// Our case insensitive less() function as required by UCI protocol
 bool CaseInsensitiveLess::operator() (const string& s1, const string& s2) const {
@@ -86,11 +84,7 @@ void init(OptionsMap& o) {
   o["SyzygyProbeDepth"]      << Option(1, 1, 100);
   o["Syzygy50MoveRule"]      << Option(true);
   o["SyzygyProbeLimit"]      << Option(7, 0, 7);
-#if defined(EVAL_LEARN)
   o["Use NNUE"]              << Option("true var true var false var pure", "true", on_use_NNUE);
-#else
-  o["Use NNUE"]              << Option("true var true var false", "true", on_use_NNUE);
-#endif
   // The default must follow the format nn-[SHA256 first 12 digits].nnue
   // for the build process (profile-build and fishtest) to work.
   o["EvalFile"]              << Option("nn-82215d0fd0df.nnue", on_eval_file);
@@ -102,7 +96,6 @@ void init(OptionsMap& o) {
   o["SkipLoadingEval"]       << Option(false);
   // how many moves to use a fixed move
   // o["BookMoves"] << Option(16, 0, 10000);
-#if defined(EVAL_LEARN)
   // When learning the evaluation function, you can change the folder to save the evaluation function.
   // Evalsave by default. This folder shall be prepared in advance.
   // Automatically create a folder under this folder like "0/", "1/", ... and save the evaluation function file there.
@@ -111,7 +104,6 @@ void init(OptionsMap& o) {
   o["PruneAtShallowDepthOnPvNode"] << Option(false, on_prune_at_shallow_depth_on_pv_node);
   // Enable transposition table.
   o["EnableTranspositionTable"] << Option(true, on_enable_transposition_table);
-#endif
 }
 
 
From 1e2fca4040ef94c60c5318d1d707f395337fdb74 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 12 Sep 2020 16:23:49 +0200
Subject: [PATCH 263/583] Move learn target to build target and profile-learn
 to profile-build.

---
 src/Makefile | 41 ++++++++++++-----------------------------
 1 file changed, 12 insertions(+), 29 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 35030be7..b9ad8fbd 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -735,22 +735,31 @@ endif
         clang-profile-use clang-profile-make
 
 build: config-sanity
-	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) all
+	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
+	EXTRACXXFLAGS=' -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	EXTRALDFLAGS=' $(BLASLDFLAGS) -fopenmp  ' \
+	all
 
 profile-build: net config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 1/4. Building instrumented executable ..."
-	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make)
+	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) \
+	LEARNCXXFLAGS=' -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	LEARNLDFLAGS='  $(BLASLDFLAGS) -fopenmp '
 	@echo ""
 	@echo "Step 2/4. Running benchmark for pgo-build ..."
 	$(PGOBENCH) > /dev/null
+	$(PGOGENSFEN) > /dev/null
 	@echo ""
 	@echo "Step 3/4. Building optimized executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) objclean
-	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_use)
+	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_use) \
+	LEARNCXXFLAGS=' -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	LEARNLDFLAGS=' $(BLASLDFLAGS) -fopenmp '
 	@echo ""
 	@echo "Step 4/4. Deleting profile data ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) profileclean
+	rm generated_kifu.bin
 
 strip:
 	$(STRIP) $(EXE)
@@ -901,32 +910,6 @@ icc-profile-use:
 	EXTRACXXFLAGS='-prof_use -prof_dir ./profdir' \
 	all
 
-learn: config-sanity
-	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
-	EXTRACXXFLAGS=' -DEVAL_LEARN -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
-	EXTRALDFLAGS=' $(BLASLDFLAGS) -fopenmp  ' \
-	all
-
-profile-learn: net config-sanity objclean profileclean
-	@echo ""
-	@echo "Step 1/4. Building instrumented executable ..."
-	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) \
-	LEARNCXXFLAGS=' -DEVAL_LEARN -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
-	LEARNLDFLAGS='  $(BLASLDFLAGS) -fopenmp '
-	@echo ""
-	@echo "Step 2/4. Running benchmark for pgo-build ..."
-	$(PGOGENSFEN)
-	@echo ""
-	@echo "Step 3/4. Building optimized executable ..."
-	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) objclean
-	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_use) \
-	LEARNCXXFLAGS=' -DEVAL_LEARN -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
-	LEARNLDFLAGS=' $(BLASLDFLAGS) -fopenmp '
-	@echo ""
-	@echo "Step 4/4. Deleting profile data ..."
-	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) profileclean
-	rm generated_kifu.bin
-
 .depend:
 	-@$(CXX) $(DEPENDFLAGS) -MM $(SRCS) > $@
 

From 1da452029b3180769e206efc5a696fc37f37d1e6 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 12 Sep 2020 16:27:35 +0200
Subject: [PATCH 264/583] Update travis to use build target instead of learn.

---
 .travis.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 608d22c1..418888f6 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -108,5 +108,5 @@ script:
 
   # NNUE testing
   - export CXXFLAGS="-O1 -fno-inline"
-  - make clean && make -j2 ARCH=x86-64-modern debug=no optimize=no learn > /dev/null && ../tests/instrumented_learn.sh --valgrind
-  - make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=no learn > /dev/null && ../tests/instrumented_learn.sh --sanitizer-undefined
+  - make clean && make -j2 ARCH=x86-64-modern debug=no optimize=no build > /dev/null && ../tests/instrumented_learn.sh --valgrind
+  - make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=no build > /dev/null && ../tests/instrumented_learn.sh --sanitizer-undefined

From 9d84af11fe0fd1cf97f64efb490cd4fd35544326 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 12 Sep 2020 18:20:21 +0200
Subject: [PATCH 265/583] Remove remaining learn builds from CI. No replacement
 needed.

---
 .travis.yml | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 418888f6..6ebfeeb2 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -75,11 +75,6 @@ script:
   # - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-32 build && ../tests/signature.sh $benchref; fi
   - make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref
 
-  # start some basic learner CI
-  - make clean && make -j2 ARCH=x86-64-modern learn
-  - make clean && make -j2 ARCH=x86-64-modern profile-learn
-  - make clean && make -j2 ARCH=x86-64-modern debug=yes optimize=no learn
-
   # compile only for some more advanced architectures (might not run in travis)
   - make clean && make -j2 ARCH=x86-64-avx2 build
   - make clean && make -j2 ARCH=x86-64-bmi2 build

From a6b02a61b7da82611a2f2f4227eb2308185b1b8b Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 12 Sep 2020 18:22:09 +0200
Subject: [PATCH 266/583] Remove 32 bit builds.

---
 .travis.yml | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 6ebfeeb2..aa325412 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -67,12 +67,6 @@ script:
   - make clean && make -j2 ARCH=x86-64 build && ../tests/signature.sh $benchref
   # TODO avoid _mm_malloc
   # - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-64 build && ../tests/signature.sh $benchref; fi
-  # - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 optimize=no debug=yes build && ../tests/signature.sh $benchref; fi
-  - make clean && make -j2 ARCH=x86-32-sse41-popcnt build && ../tests/signature.sh $benchref
-  - make clean && make -j2 ARCH=x86-32-sse2 build && ../tests/signature.sh $benchref
-  # TODO avoid _mm_malloc
-  # - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 build && ../tests/signature.sh $benchref; fi
-  # - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-32 build && ../tests/signature.sh $benchref; fi
   - make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref
 
   # compile only for some more advanced architectures (might not run in travis)

From 8d1ad6fbf6795f2574dc954ee6fc255b25e68761 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 12 Sep 2020 21:16:27 +0200
Subject: [PATCH 267/583] Add a makefile option to enable use of BLAS. Default
 to "no"

---
 src/Makefile | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index b9ad8fbd..1c43d631 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -111,6 +111,7 @@ else
    SUPPORTED_ARCH=false
 endif
 
+blas = no
 optimize = yes
 debug = no
 sanitize = no
@@ -132,17 +133,25 @@ ARCH = x86-64-modern
 STRIP = strip
 
 ### BLAS libraries
-ifeq ($(KERNEL),Linux)
-	BLASCXXFLAGS =
-	BLASLDFLAGS = -lopenblas
-else
-	BLASCXXFLAGS = -I/mingw64/include/OpenBLAS
-
-	ifeq ($(debug),yes)
-		BLASLDFLAGS = -lopenblas -Wl,-static
+ifeq ($(blas), yes)
+	ifeq ($(KERNEL),Linux)
+		BLASCXXFLAGS =
+		BLASLDFLAGS = -lopenblas
 	else
-		BLASLDFLAGS = -lopenblas -Wl,-s -static
+		BLASCXXFLAGS = -I/mingw64/include/OpenBLAS
+
+		ifeq ($(debug),yes)
+			BLASLDFLAGS = -lopenblas -Wl,-static
+		else
+			BLASLDFLAGS = -lopenblas -Wl,-s -static
+		endif
 	endif
+
+	BLASDEFINE = -DUSE_BLAS
+else
+	BLASCXXFLAGS =
+	BLASLDFLAGS =
+	BLASDEFINE =
 endif
 
 ### 2.2 Architecture specific
@@ -736,7 +745,7 @@ endif
 
 build: config-sanity
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
-	EXTRACXXFLAGS=' -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	EXTRACXXFLAGS=' $(BLASDEFINE) $(BLASCXXFLAGS) -fopenmp ' \
 	EXTRALDFLAGS=' $(BLASLDFLAGS) -fopenmp  ' \
 	all
 
@@ -744,7 +753,7 @@ profile-build: net config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 1/4. Building instrumented executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) \
-	LEARNCXXFLAGS=' -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	LEARNCXXFLAGS=' $(BLASDEFINE) $(BLASCXXFLAGS) -fopenmp ' \
 	LEARNLDFLAGS='  $(BLASLDFLAGS) -fopenmp '
 	@echo ""
 	@echo "Step 2/4. Running benchmark for pgo-build ..."
@@ -754,7 +763,7 @@ profile-build: net config-sanity objclean profileclean
 	@echo "Step 3/4. Building optimized executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) objclean
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_use) \
-	LEARNCXXFLAGS=' -DUSE_BLAS $(BLASCXXFLAGS) -fopenmp ' \
+	LEARNCXXFLAGS=' $(BLASDEFINE) $(BLASCXXFLAGS) -fopenmp ' \
 	LEARNLDFLAGS=' $(BLASLDFLAGS) -fopenmp '
 	@echo ""
 	@echo "Step 4/4. Deleting profile data ..."

From f049c4776a78ec3d3b44198c1972c0a6768815d7 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 12 Sep 2020 21:19:15 +0200
Subject: [PATCH 268/583] Add tests in CI to cover compilation of both blas=no
 and blas=yes.

---
 .travis.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.travis.yml b/.travis.yml
index aa325412..204f2657 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -70,6 +70,8 @@ script:
   - make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref
 
   # compile only for some more advanced architectures (might not run in travis)
+  - make clean && make -j2 ARCH=x86-64-avx2 blas=yes build
+
   - make clean && make -j2 ARCH=x86-64-avx2 build
   - make clean && make -j2 ARCH=x86-64-bmi2 build
   - make clean && make -j2 ARCH=x86-64-avx512 build

From fbae6604b1332c64cef74e2f81c83b1ab8ba147b Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 13 Sep 2020 00:18:01 +0200
Subject: [PATCH 269/583] Remove LEARNCXXFLAGS, LEARNLDFLAGS, BLASDEFINE,
 BLASCXXFLAGS, BLASLDFLAGS in favor of directly modifying CXXFLAGS and
 LDFLAGS.

---
 src/Makefile | 63 ++++++++++++++++++++++------------------------------
 1 file changed, 26 insertions(+), 37 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 1c43d631..9b59c5bb 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -132,28 +132,6 @@ neon = no
 ARCH = x86-64-modern
 STRIP = strip
 
-### BLAS libraries
-ifeq ($(blas), yes)
-	ifeq ($(KERNEL),Linux)
-		BLASCXXFLAGS =
-		BLASLDFLAGS = -lopenblas
-	else
-		BLASCXXFLAGS = -I/mingw64/include/OpenBLAS
-
-		ifeq ($(debug),yes)
-			BLASLDFLAGS = -lopenblas -Wl,-static
-		else
-			BLASLDFLAGS = -lopenblas -Wl,-s -static
-		endif
-	endif
-
-	BLASDEFINE = -DUSE_BLAS
-else
-	BLASCXXFLAGS =
-	BLASLDFLAGS =
-	BLASDEFINE =
-endif
-
 ### 2.2 Architecture specific
 
 ifeq ($(findstring x86,$(ARCH)),x86)
@@ -330,9 +308,8 @@ endif
 ### ==========================================================================
 
 ### 3.1 Selecting compiler (default = gcc)
-CXXFLAGS += -g -Wall -Wcast-qual -fno-exceptions -std=c++17 -I. $(EXTRACXXFLAGS) $(LEARNCXXFLAGS)
-DEPENDFLAGS += -std=c++17 -I.
-LDFLAGS += $(EXTRALDFLAGS) $(LEARNLDFLAGS)
+CXXFLAGS += -g -Wall -Wcast-qual -fno-exceptions -std=c++17 -fopenmp -I. $(EXTRACXXFLAGS)
+DEPENDFLAGS += -std=c++17 -I. $(EXTRALDFLAGS)
 
 ifeq ($(COMP),)
 	COMP=gcc
@@ -487,14 +464,33 @@ ifneq ($(comp),mingw)
 endif
 endif
 
-### 3.2.1 Debugging
+### 3.2.1. BLAS libraries
+ifeq ($(blas), yes)
+	LDFLAGS += -lopenblas
+
+	ifeq ($(KERNEL),Linux)
+		LDFLAGS +=
+	else
+		CXXFLAGS += -I/mingw64/include/OpenBLAS
+
+		ifeq ($(debug),yes)
+			LDFLAGS += -Wl,-static
+		else
+			LDFLAGS += -Wl,-s -static
+		endif
+	endif
+
+	CXXFLAGS += -DUSE_BLAS
+endif
+
+### 3.2.2 Debugging
 ifeq ($(debug),no)
 	CXXFLAGS += -DNDEBUG
 else
 	CXXFLAGS += -g
 endif
 
-### 3.2.2 Debugging with undefined behavior sanitizers
+### 3.2.3 Debugging with undefined behavior sanitizers
 ifneq ($(sanitize),no)
         CXXFLAGS += -g3 -fsanitize=$(sanitize)
         LDFLAGS += -fsanitize=$(sanitize)
@@ -744,17 +740,12 @@ endif
         clang-profile-use clang-profile-make
 
 build: config-sanity
-	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
-	EXTRACXXFLAGS=' $(BLASDEFINE) $(BLASCXXFLAGS) -fopenmp ' \
-	EXTRALDFLAGS=' $(BLASLDFLAGS) -fopenmp  ' \
-	all
+	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) all
 
 profile-build: net config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 1/4. Building instrumented executable ..."
-	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) \
-	LEARNCXXFLAGS=' $(BLASDEFINE) $(BLASCXXFLAGS) -fopenmp ' \
-	LEARNLDFLAGS='  $(BLASLDFLAGS) -fopenmp '
+	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make)
 	@echo ""
 	@echo "Step 2/4. Running benchmark for pgo-build ..."
 	$(PGOBENCH) > /dev/null
@@ -762,9 +753,7 @@ profile-build: net config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 3/4. Building optimized executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) objclean
-	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_use) \
-	LEARNCXXFLAGS=' $(BLASDEFINE) $(BLASCXXFLAGS) -fopenmp ' \
-	LEARNLDFLAGS=' $(BLASLDFLAGS) -fopenmp '
+	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_use)
 	@echo ""
 	@echo "Step 4/4. Deleting profile data ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) profileclean

From 72164ba59ca4f0143b170e4721ba9aa38c591cc6 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 13 Sep 2020 02:06:33 +0200
Subject: [PATCH 270/583] Add missing -fopenmp LDFLAG

---
 src/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/Makefile b/src/Makefile
index 9b59c5bb..81e2ff17 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -309,6 +309,7 @@ endif
 
 ### 3.1 Selecting compiler (default = gcc)
 CXXFLAGS += -g -Wall -Wcast-qual -fno-exceptions -std=c++17 -fopenmp -I. $(EXTRACXXFLAGS)
+LDFLAGS += -fopenmp
 DEPENDFLAGS += -std=c++17 -I. $(EXTRALDFLAGS)
 
 ifeq ($(COMP),)

From 4b70f4bf23305ea6cb1e24e7fd9311cd20c6f46e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 13 Sep 2020 02:07:29 +0200
Subject: [PATCH 271/583] Add extra ld flags to the proper variable.

---
 src/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 81e2ff17..5477d68e 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -309,8 +309,8 @@ endif
 
 ### 3.1 Selecting compiler (default = gcc)
 CXXFLAGS += -g -Wall -Wcast-qual -fno-exceptions -std=c++17 -fopenmp -I. $(EXTRACXXFLAGS)
-LDFLAGS += -fopenmp
-DEPENDFLAGS += -std=c++17 -I. $(EXTRALDFLAGS)
+LDFLAGS += -fopenmp $(EXTRALDFLAGS)
+DEPENDFLAGS += -std=c++17 -I.
 
 ifeq ($(COMP),)
 	COMP=gcc

From 50b4ff83548632fc9070d701754abf0360c41839 Mon Sep 17 00:00:00 2001
From: Matthies <a.matthies@online.de>
Date: Sat, 12 Sep 2020 17:59:36 +0200
Subject: [PATCH 272/583] Add missing include to make MSVC compile

---
 src/extra/nnue_data_binpack_format.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index 2c555939..7ceafbc0 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -41,6 +41,7 @@ THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include <array>
 #include <limits>
 #include <climits>
+#include <optional>
 
 #if (defined(_MSC_VER) || defined(__INTEL_COMPILER)) && !defined(__clang__)
 #include <intrin.h>
@@ -7196,4 +7197,4 @@ namespace binpack
 
         std::cout << "Finished. Converted " << numProcessedPositions << " positions.\n";
     }
-}
\ No newline at end of file
+}

From 0a5893d337aac9a89cea1c4cddbd7a7d44a0ae81 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sun, 13 Sep 2020 14:05:52 +0900
Subject: [PATCH 273/583] Update README.md

Updated description according to recent option changes.
---
 README.md | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 6d28a998..081f75d5 100644
--- a/README.md
+++ b/README.md
@@ -17,12 +17,10 @@ setoption name Threads value x
 setoption name Hash value y
 setoption name SyzygyPath value path
 isready
-gensfen depth a loop b use_draw_in_training_data_generation 1 eval_limit 32000 use_raw_nnue_eval 0
+gensfen depth a loop b use_draw_in_training_data_generation 1 eval_limit 32000
 ```
 Specify how many threads and how much memory you would like to use with the x and y values. The option SyzygyPath is not necessary, but if you would like to use it, you must first have Syzygy endgame tablebases on your computer, which you can find [here](http://oics.olympuschess.com/tracker/index.php). You will need to have a torrent client to download these tablebases, as that is probably the fastest way to obtain them. The path is the path to the folder containing those tablebases. It does not have to be surrounded in quotes.
 
-use_raw_nnue_eval controls if the training data generator or trainer uses raw NNUE eval values.  Don't forget to set use_raw_nnue_eval 0 when initial training data are generated.  Otherwise, the gensfen command will crash.
-
 This will save a file named "generated_kifu.bin" in the same folder as the binary. Once generation is done, rename the file to something like "1billiondepth12.bin" to remember the depth and quantity of the positions and move it to a folder named "trainingdata" in the same directory as the binaries.
 #### Generation Parameters
 - Depth is the searched depth per move, or how far the engine looks forward. This value is an integer.
@@ -34,7 +32,7 @@ Use the "learn" binary. Create an empty folder named "evalsave" in the same dire
 ```
 uci
 setoption name SkipLoadingEval value true
-setoption name Use NNUE value true
+setoption name Use NNUE value pure
 setoption name Threads value x
 isready
 learn targetdir trainingdata loop 100 batchsize 1000000 use_draw_in_training 1 use_draw_in_validation 1 eta 1 lambda 1 eval_limit 32000 nn_batch_size 1000 newbob_decay 0.5 eval_save_interval 250000000 loss_output_interval 1000000 mirror_percentage 50 validation_set_file_name validationdata\val.bin
@@ -46,7 +44,7 @@ Nets get saved in the "evalsave" folder.
 - lambda is the amount of weight it puts to eval of learning data vs win/draw/loss results. 1 puts all weight on eval, lambda 0 puts all weight on WDL results.
 
 ### Reinforcement Learning
-If you would like to do some reinforcement learning on your original network, you must first generate training data using the learn binaries with the setting `Use NNUE` set to true. Make sure that your previously trained network is in the eval folder. Use the commands specified above. Make sure `SkipLoadingEval` is set to false so that the data generated is using the neural net's eval by typing the command `setoption name SkipLoadingEval value false` before typing the `isready` command. You should aim to generate less positions than the first run, around 1/10 of the number of positions generated in the first run. The depth should be higher as well. You should also do the same for validation data, with the depth being higher than the last run.
+If you would like to do some reinforcement learning on your original network, you must first generate training data using the learn binaries with the setting `Use NNUE` set to `pure`. Make sure that your previously trained network is in the eval folder. Use the commands specified above. Make sure `SkipLoadingEval` is set to false so that the data generated is using the neural net's eval by typing the command `setoption name SkipLoadingEval value false` before typing the `isready` command. You should aim to generate less positions than the first run, around 1/10 of the number of positions generated in the first run. The depth should be higher as well. You should also do the same for validation data, with the depth being higher than the last run.
 
 After you have generated the training data, you must move it into your training data folder and delete the older data so that the binary does not accidentally train on the same data again. Do the same for the validation data and name it to val-1.bin to make it less confusing. Make sure the evalsave folder is empty. Then, using the same binary, type in the training commands shown above. Do __NOT__ set `SkipLoadingEval` to true, it must be false or you will get a completely new network, instead of a network trained with reinforcement learning. You should also set eval_save_interval to a number that is lower than the amount of positions in your training data, perhaps also 1/10 of the original value. The validation file should be set to the new validation data, not the old data.
 

From 1c84da9caa08a142655bedf6def85e62e4736801 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sun, 13 Sep 2020 16:32:01 +0900
Subject: [PATCH 274/583] Fixed a bug that an assertion fails in the trainer.
 if the SkipLoading is false.

Fixes #128
---
 src/learn/learn.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 0459dd90..46c6a9dc 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -1988,7 +1988,13 @@ namespace Learner
         Eval::NNUE::SetBatchSize(nn_batch_size);
         Eval::NNUE::SetOptions(nn_options);
         if (newbob_decay != 1.0 && !Options["SkipLoadingEval"]) {
-            learn_think.best_nn_directory = std::string(Options["EvalDir"]);
+            // Save the current net to [EvalDir]\original.
+            Eval::save_eval("original");
+
+            // Set the folder above to best_nn_directory so that the trainer can
+            // resotre the network parameters from the original net file.
+            learn_think.best_nn_directory =
+                Path::Combine(Options["EvalSaveDir"], "original");
         }
 
         cout << "init done." << endl;

From a94a076e3925dcb47cc6a24182d35d01267642a4 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Sun, 13 Sep 2020 16:35:52 +0900
Subject: [PATCH 275/583] Fixed a comment.

---
 src/learn/learn.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 46c6a9dc..eaabc524 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -1988,7 +1988,7 @@ namespace Learner
         Eval::NNUE::SetBatchSize(nn_batch_size);
         Eval::NNUE::SetOptions(nn_options);
         if (newbob_decay != 1.0 && !Options["SkipLoadingEval"]) {
-            // Save the current net to [EvalDir]\original.
+            // Save the current net to [EvalSaveDir]\original.
             Eval::save_eval("original");
 
             // Set the folder above to best_nn_directory so that the trainer can

From 3ea2d5ef6198ac43e9beaae60bad8fd6f4e071f2 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Sun, 13 Sep 2020 08:34:22 +0200
Subject: [PATCH 276/583] Remove use of non-existent EvalDir option.

additionally allow all options to be converted to string.
Without this, restoring of the options (multi_think.cpp:117) can't work.

fixes https://github.com/nodchip/Stockfish/issues/128

Now gensfen/learn pass with debug=yes in CI
---
 .travis.yml         | 2 +-
 src/learn/learn.cpp | 3 ---
 src/ucioption.cpp   | 2 +-
 3 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 608d22c1..fee1bed2 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -109,4 +109,4 @@ script:
   # NNUE testing
   - export CXXFLAGS="-O1 -fno-inline"
   - make clean && make -j2 ARCH=x86-64-modern debug=no optimize=no learn > /dev/null && ../tests/instrumented_learn.sh --valgrind
-  - make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=no learn > /dev/null && ../tests/instrumented_learn.sh --sanitizer-undefined
+  - make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=yes learn > /dev/null && ../tests/instrumented_learn.sh --sanitizer-undefined
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 0459dd90..67b186b3 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -1987,9 +1987,6 @@ namespace Learner
         Eval::NNUE::InitializeTraining(eta1, eta1_epoch, eta2, eta2_epoch, eta3);
         Eval::NNUE::SetBatchSize(nn_batch_size);
         Eval::NNUE::SetOptions(nn_options);
-        if (newbob_decay != 1.0 && !Options["SkipLoadingEval"]) {
-            learn_think.best_nn_directory = std::string(Options["EvalDir"]);
-        }
 
         cout << "init done." << endl;
 
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 91fa199b..1a80efff 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -165,7 +165,7 @@ Option::operator double() const {
 }
 
 Option::operator std::string() const {
-  assert(type == "string");
+  assert(type == "check" || type == "spin" || type == "combo" || type == "button" || type == "string");
   return currentValue;
 }
 

From fb877c2c3ec28ca4bd4d8586f3028ebb6f2cd6ad Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 13 Sep 2020 12:14:35 +0200
Subject: [PATCH 277/583] Add some building instructions to readme.

---
 README.md | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/README.md b/README.md
index 6d28a998..cdcda0d4 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,31 @@
 ## Overview
 Stockfish NNUE is a port of a shogi neural network named NNUE (efficiently updateable neural network backwards) to Stockfish 11. To learn more about the Stockfish chess engine, look [here](stockfish.md) for an overview and [here](https://github.com/official-stockfish/Stockfish) for the official repository.
 
+## Building
+To compile:
+```
+make -jN ARCH=... build
+```
+
+To compile with Profile Guided Optimizations. Requires that the computer that is used for compilation supports the selected `ARCH`.
+```
+make -jN ARCH=... profile-build
+```
+
+`N` is the number of threads to use for compilation.
+
+`ARCH` is one of:
+`x86-64-vnni512`, `x86-64-vnni256`, `x86-64-avx512`, `x86-64-bmi2`, `x86-64-avx2`,
+`x86-64-sse41-popcnt`, `x86-64-modern`, `x86-64-ssse3`, `x86-64-sse3-popcnt`,
+`x86-64`, `x86-32-sse41-popcnt`, `x86-32-sse2`, `x86-32`, `ppc-64`, `ppc-32,
+armv7`, `armv7-neon`, `armv8`, `apple-silicon`, `general-64`, `general-32`.
+
+`ARCH` needs to be chosen based based on the instruction set of the CPU that will run stockfish. `x86-64-modern` will produce a binary that works on most common processors, but other options may increase performance for specific hardware.
+
+Additional options:
+
+- `blas=[yes/no]` - whether to use an external BLAS library. Default is `no`. Using an external BLAS library may have a significantly improve learning performance and by default expects openBLAS to be installed.
+
 ## Training Guide
 ### Generating Training Data
 To generate training data from the classic eval, use the gensfen command with the setting "Use NNUE" set to "false". The given example is generation in its simplest form. There are more commands. 

From bd434b80c677966865c2e343658aec98c2966415 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 13 Sep 2020 13:40:56 +0200
Subject: [PATCH 278/583] debug=yes for last CI test

---
 .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 204f2657..9dad6b1d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -100,4 +100,4 @@ script:
   # NNUE testing
   - export CXXFLAGS="-O1 -fno-inline"
   - make clean && make -j2 ARCH=x86-64-modern debug=no optimize=no build > /dev/null && ../tests/instrumented_learn.sh --valgrind
-  - make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=no build > /dev/null && ../tests/instrumented_learn.sh --sanitizer-undefined
+  - make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented_learn.sh --sanitizer-undefined

From 9ee8ce67bf6b0fa681ca7c29b5c33e52105f087e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 13 Sep 2020 13:42:13 +0200
Subject: [PATCH 279/583] Move removal of generate training data file to
 profileclean.

---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 5477d68e..3e10702f 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -758,7 +758,6 @@ profile-build: net config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 4/4. Deleting profile data ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) profileclean
-	rm generated_kifu.bin
 
 strip:
 	$(STRIP) $(EXE)
@@ -805,6 +804,7 @@ profileclean:
 	@rm -rf profdir
 	@rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda *.s ./learn/*.gcda ./extra/*.gcda ./eval/*.gcda
 	@rm -f stockfish.profdata *.profraw
+	@rm generated_kifu.bin
 
 default:
 	help

From e4a4f4001fe91604fed4ad01b1429d4674168aed Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 13 Sep 2020 13:44:19 +0200
Subject: [PATCH 280/583] parametrize the name of the training data file
 generated during pgo

---
 src/Makefile | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 3e10702f..982df26b 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -39,8 +39,9 @@ PREFIX = /usr/local
 BINDIR = $(PREFIX)/bin
 
 ### Built-in benchmark for pgo-builds
+PGO_TRAINING_DATA_FILE = pgo_training_data
 PGOBENCH = ./$(EXE) bench
-PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 1000
+PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 1000 output_file_name $(PGO_TRAINING_DATA_FILE)
 
 ### Source and object files
 SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \
@@ -804,7 +805,7 @@ profileclean:
 	@rm -rf profdir
 	@rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda *.s ./learn/*.gcda ./extra/*.gcda ./eval/*.gcda
 	@rm -f stockfish.profdata *.profraw
-	@rm generated_kifu.bin
+	@rm $(PGO_TRAINING_DATA_FILE)
 
 default:
 	help

From 2e2de7607bbb958e699bb2e76a60ad36b912f5b0 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 13 Sep 2020 13:47:19 +0200
Subject: [PATCH 281/583] Add extension to the PGO_TRAINING_DATA_FILE so that
 the generated file name matches the one we try to delete.

---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 982df26b..499e8d78 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -39,7 +39,7 @@ PREFIX = /usr/local
 BINDIR = $(PREFIX)/bin
 
 ### Built-in benchmark for pgo-builds
-PGO_TRAINING_DATA_FILE = pgo_training_data
+PGO_TRAINING_DATA_FILE = pgo_training_data.bin
 PGOBENCH = ./$(EXE) bench
 PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 1000 output_file_name $(PGO_TRAINING_DATA_FILE)
 

From 89f38c938bac12171abe5d778efd4857b478693b Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 13 Sep 2020 13:52:42 +0200
Subject: [PATCH 282/583] Don't prompt when the training data file doesn't
 exist when trying to delete it

---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 499e8d78..69517c3c 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -805,7 +805,7 @@ profileclean:
 	@rm -rf profdir
 	@rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda *.s ./learn/*.gcda ./extra/*.gcda ./eval/*.gcda
 	@rm -f stockfish.profdata *.profraw
-	@rm $(PGO_TRAINING_DATA_FILE)
+	@rm -f $(PGO_TRAINING_DATA_FILE)
 
 default:
 	help

From 30a1bc4c64e0cf41269c34868b457ed6b4b5acb5 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 13 Sep 2020 14:19:30 +0200
Subject: [PATCH 283/583] Change default value of "PruneAtShallowDepthOnPvNode"
 so that the bench matches master.

---
 src/search.cpp    | 2 +-
 src/ucioption.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index f8cf3cbc..7c6f8ace 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -54,7 +54,7 @@ using std::string;
 using Eval::evaluate;
 using namespace Search;
 
-bool Search::prune_at_shallow_depth_on_pv_node = false;
+bool Search::prune_at_shallow_depth_on_pv_node = true;
 
 namespace {
 
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index e4a26098..06298596 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -101,7 +101,7 @@ void init(OptionsMap& o) {
   // Automatically create a folder under this folder like "0/", "1/", ... and save the evaluation function file there.
   o["EvalSaveDir"] << Option("evalsave");
   // Prune at shallow depth on PV nodes. Setting this value to true gains elo in shallow search.
-  o["PruneAtShallowDepthOnPvNode"] << Option(false, on_prune_at_shallow_depth_on_pv_node);
+  o["PruneAtShallowDepthOnPvNode"] << Option(true, on_prune_at_shallow_depth_on_pv_node);
   // Enable transposition table.
   o["EnableTranspositionTable"] << Option(true, on_enable_transposition_table);
 }

From 5d088e02c8046c04536f00ffa2298b5982d153c0 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Sun, 13 Sep 2020 18:16:04 +0200
Subject: [PATCH 284/583] add convert_plain to CI

---
 tests/instrumented_learn.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/instrumented_learn.sh b/tests/instrumented_learn.sh
index 71f9421c..7f76fd76 100755
--- a/tests/instrumented_learn.sh
+++ b/tests/instrumented_learn.sh
@@ -80,6 +80,8 @@ cat << EOF > gensfen01.exp
  send "isready\n"
  send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data/training_data.bin use_raw_nnue_eval 0 sfen_format bin\n"
  expect "gensfen finished."
+ send "learn training_data/training_data.bin convert_plain output_file_name training_data.txt\n"
+ expect "all done"
  send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data/training_data.binpack use_raw_nnue_eval 0 sfen_format binpack\n"
  expect "gensfen finished."
 

From d160436921dec1675e18b8a2d2a1da1693002588 Mon Sep 17 00:00:00 2001
From: Joseph Ellis <jhellis3@gmail.com>
Date: Tue, 15 Sep 2020 15:02:44 -0500
Subject: [PATCH 285/583] Update description for PruneAtShallowDepthOnPvNode

---
 src/ucioption.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 06298596..dde3844a 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -100,7 +100,7 @@ void init(OptionsMap& o) {
   // Evalsave by default. This folder shall be prepared in advance.
   // Automatically create a folder under this folder like "0/", "1/", ... and save the evaluation function file there.
   o["EvalSaveDir"] << Option("evalsave");
-  // Prune at shallow depth on PV nodes. Setting this value to true gains elo in shallow search.
+  // Prune at shallow depth on PV nodes. False is recommended when using fixed depth search.
   o["PruneAtShallowDepthOnPvNode"] << Option(true, on_prune_at_shallow_depth_on_pv_node);
   // Enable transposition table.
   o["EnableTranspositionTable"] << Option(true, on_enable_transposition_table);

From 6ae09ba266021a61afe8f5a7b7a0d82f6609c8f6 Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Mon, 14 Sep 2020 19:11:57 +0900
Subject: [PATCH 286/583] Fixed a bug that the root color is wrong.

---
 src/learn/learn.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 753efafa..70459963 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -842,6 +842,8 @@ namespace Learner
         // EvalHash has been disabled in advance. (If not, the same value will be returned every time)
         const auto [_, pv] = qsearch(task_pos);
 
+        const auto rootColor = task_pos.side_to_move();
+
         std::vector<StateInfo, AlignedAllocator<StateInfo>> states(pv.size());
         for (size_t i = 0; i < pv.size(); ++i)
         {
@@ -849,7 +851,6 @@ namespace Learner
             Eval::NNUE::update_eval(task_pos);
         }
 
-        const auto rootColor = task_pos.side_to_move();
         const Value shallow_value =
             (rootColor == task_pos.side_to_move())
             ? Eval::evaluate(task_pos)

From bc9be5a71fd9cc81f1761b5f0a827461bb15ffd3 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 16 Sep 2020 14:22:39 +0200
Subject: [PATCH 287/583] Allow setting PRNG seed

---
 src/misc.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/src/misc.h b/src/misc.h
index 4c04d3f0..7537624c 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -19,6 +19,7 @@
 #ifndef MISC_H_INCLUDED
 #define MISC_H_INCLUDED
 
+#include <algorithm>
 #include <cassert>
 #include <chrono>
 #include <functional>
@@ -28,6 +29,7 @@
 #include <vector>
 #include <utility>
 #include <cmath>
+#include <cctype>
 
 #include "types.h"
 
@@ -85,6 +87,19 @@ std::ostream& operator<<(std::ostream&, SyncCout);
 /// For further analysis see
 ///   <http://vigna.di.unimi.it/ftp/papers/xorshift.pdf>
 
+static uint64_t string_hash(const std::string& str)
+{
+  uint64_t h = 525201411107845655ull;
+
+  for (auto c : str) {
+    h ^= static_cast<uint64_t>(c);
+    h *= 0x5bd1e9955bd1e995ull;
+    h ^= h >> 47;
+  }
+
+  return h;
+}
+
 class PRNG {
 
   uint64_t s;
@@ -109,6 +124,19 @@ public:
 
   // Return the random seed used internally.
   uint64_t get_seed() const { return s; }
+
+  void set_seed(uint64_t seed) { s = seed; }
+
+  void set_seed(const std::string& str)
+  {
+    if (std::all_of(str.begin(), str.end(), std::isdigit)) {
+      set_seed(std::stoull(str));
+    }
+    else
+    {
+      set_seed(string_hash(str));
+    }
+  }
 };
 
 // Display a random seed. (For debugging)

From efca5d561fcb7f685962d6d32fd5be8aac7a7f8f Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 16 Sep 2020 14:38:54 +0200
Subject: [PATCH 288/583] More PRNG seeding options

---
 src/misc.h | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/misc.h b/src/misc.h
index 7537624c..5b7c8870 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -111,7 +111,9 @@ class PRNG {
   }
 
 public:
+  PRNG() { set_seed_from_time(); }
   PRNG(uint64_t seed) : s(seed) { assert(seed); }
+  PRNG(const std::string& seed) { set_seed(seed); }
 
   template<typename T> T rand() { return T(rand64()); }
 
@@ -127,9 +129,18 @@ public:
 
   void set_seed(uint64_t seed) { s = seed; }
 
+  void set_seed_from_time()
+  {
+      set_seed(std::chrono::system_clock::now().time_since_epoch().count());
+  }
+
   void set_seed(const std::string& str)
   {
-    if (std::all_of(str.begin(), str.end(), std::isdigit)) {
+    if (str.empty())
+    {
+      set_seed_from_time();
+    }
+    else if (std::all_of(str.begin(), str.end(), [](char c) { return std::isdigit(c);} )) {
       set_seed(std::stoull(str));
     }
     else
@@ -196,7 +207,9 @@ int write_memory_to_file(std::string filename, void* ptr, uint64_t size);
 // async version of PRNG
 struct AsyncPRNG
 {
+  AsyncPRNG() : prng() { }
   AsyncPRNG(uint64_t seed) : prng(seed) { assert(seed); }
+  AsyncPRNG(const std::string& seed) : prng(seed) { }
   // [ASYNC] Extract one random number.
   template<typename T> T rand() {
     std::unique_lock<std::mutex> lk(mutex);

From 184bde47dc0b1703bc03177c467e735f156fb273 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 16 Sep 2020 14:43:21 +0200
Subject: [PATCH 289/583] Add "seed" option to gensfen and learn

---
 src/learn/gensfen.cpp   | 10 +++++++---
 src/learn/learn.cpp     | 33 ++++++++++++++++++---------------
 src/learn/multi_think.h | 11 +++++++----
 3 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index afbcce37..f7cc5669 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -355,7 +355,8 @@ namespace Learner
         // It must be 2**N because it will be used as the mask to calculate hash_index.
         static_assert((GENSFEN_HASH_SIZE& (GENSFEN_HASH_SIZE - 1)) == 0);
 
-        MultiThinkGenSfen(int search_depth_min_, int search_depth_max_, SfenWriter& sw_) :
+        MultiThinkGenSfen(int search_depth_min_, int search_depth_max_, SfenWriter& sw_, const std::string& seed) :
+            MultiThink(seed),
             search_depth_min(search_depth_min_),
             search_depth_max(search_depth_max_),
             sfen_writer(sw_)
@@ -1055,6 +1056,7 @@ namespace Learner
         bool random_file_name = false;
 
         std::string sfen_format;
+        std::string seed;
 
         while (true)
         {
@@ -1111,6 +1113,8 @@ namespace Learner
                 is >> detect_draw_by_insufficient_mating_material;
             else if (token == "sfen_format")
                 is >> sfen_format;
+            else if (token == "seed")
+                is >> seed;
             else
                 cout << "Error! : Illegal token " << token << endl;
         }
@@ -1137,7 +1141,7 @@ namespace Learner
         {
             // Give a random number to output_file_name at this point.
             // Do not use std::random_device().  Because it always the same integers on MinGW.
-            PRNG r(std::chrono::system_clock::now().time_since_epoch().count());
+            PRNG r(seed);
             // Just in case, reassign the random numbers.
             for (int i = 0; i < 10; ++i)
                 r.rand(1);
@@ -1182,7 +1186,7 @@ namespace Learner
             SfenWriter sfen_writer(output_file_name, thread_num);
             sfen_writer.set_save_interval(save_every);
 
-            MultiThinkGenSfen multi_think(search_depth_min, search_depth_max, sfen_writer);
+            MultiThinkGenSfen multi_think(search_depth_min, search_depth_max, sfen_writer, seed);
             multi_think.nodes = nodes;
             multi_think.set_loop_max(loop_max);
             multi_think.eval_limit = eval_limit;
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 70459963..6d0a777d 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -432,8 +432,8 @@ namespace Learner
 
         // Do not use std::random_device().
         // Because it always the same integers on MinGW.
-        SfenReader(int thread_num) :
-            prng(std::chrono::system_clock::now().time_since_epoch().count())
+        SfenReader(int thread_num, const std::string& seed) :
+            prng(seed)
         {
             packed_sfens.resize(thread_num);
             total_read = 0;
@@ -742,7 +742,8 @@ namespace Learner
     // Class to generate sfen with multiple threads
     struct LearnerThink : public MultiThink
     {
-        LearnerThink(SfenReader& sr_) :
+        LearnerThink(SfenReader& sr_, const std::string& seed) :
+            MultiThink(seed),
             sr(sr_),
             stop_flag(false),
             save_only_once(false)
@@ -1437,7 +1438,7 @@ namespace Learner
 
     // Subcontracting the teacher shuffle "learn shuffle" command.
     // output_file_name: name of the output file where the shuffled teacher positions will be written
-    void shuffle_files(const vector<string>& filenames, const string& output_file_name, uint64_t buffer_size)
+    void shuffle_files(const vector<string>& filenames, const string& output_file_name, uint64_t buffer_size, const std::string& seed)
     {
         // The destination folder is
         // tmp/ for temporary writing
@@ -1460,7 +1461,7 @@ namespace Learner
 
         // random number to shuffle
         // Do not use std::random_device().  Because it always the same integers on MinGW.
-        PRNG prng(std::chrono::system_clock::now().time_since_epoch().count());
+        PRNG prng(seed);
 
         // generate the name of the temporary file
         auto make_filename = [](uint64_t i)
@@ -1533,11 +1534,11 @@ namespace Learner
     // Subcontracting the teacher shuffle "learn shuffleq" command.
     // This is written in 1 pass.
     // output_file_name: name of the output file where the shuffled teacher positions will be written
-    void shuffle_files_quick(const vector<string>& filenames, const string& output_file_name)
+    void shuffle_files_quick(const vector<string>& filenames, const string& output_file_name, const std::string& seed)
     {
         // random number to shuffle
         // Do not use std::random_device().  Because it always the same integers on MinGW.
-        PRNG prng(std::chrono::system_clock::now().time_since_epoch().count());
+        PRNG prng(seed);
 
         // number of files
         const size_t file_count = filenames.size();
@@ -1573,7 +1574,7 @@ namespace Learner
 
     // Subcontracting the teacher shuffle "learn shufflem" command.
     // Read the whole memory and write it out with the specified file name.
-    void shuffle_files_on_memory(const vector<string>& filenames, const string output_file_name)
+    void shuffle_files_on_memory(const vector<string>& filenames, const string output_file_name, const std::string& seed)
     {
         PSVector buf;
 
@@ -1591,7 +1592,7 @@ namespace Learner
 
         // shuffle from buf[0] to buf[size-1]
         // Do not use std::random_device().  Because it always the same integers on MinGW.
-        PRNG prng(std::chrono::system_clock::now().time_since_epoch().count());
+        PRNG prng(seed);
         uint64_t size = (uint64_t)buf.size();
         std::cout << "shuffle buf.size() = " << size << std::endl;
 
@@ -1613,9 +1614,7 @@ namespace Learner
     void learn(Position&, istringstream& is)
     {
         const auto thread_num = (int)Options["Threads"];
-        SfenReader sr(thread_num);
 
-        LearnerThink learn_think(sr);
         vector<string> filenames;
 
         // mini_batch_size 1M aspect by default. This can be increased.
@@ -1704,6 +1703,7 @@ namespace Learner
         uint64_t mirror_percentage = 0;
 
         string validation_set_file_name;
+        string seed;
 
         // Assume the filenames are staggered.
         while (true)
@@ -1811,7 +1811,7 @@ namespace Learner
             else if (option == "dest_score_min_value") is >> dest_score_min_value;
             else if (option == "dest_score_max_value") is >> dest_score_max_value;
             else if (option == "convert_teacher_signal_to_winning_probability") is >> convert_teacher_signal_to_winning_probability;
-
+            else if (option == "seed") is >> seed;
             // Otherwise, it's a filename.
             else
                 filenames.push_back(option);
@@ -1829,6 +1829,9 @@ namespace Learner
         cout << "Warning! OpenMP disabled." << endl;
 #endif
 
+        SfenReader sr(thread_num, seed);
+        LearnerThink learn_think(sr, seed);
+
         // Display learning game file
         if (target_dir != "")
         {
@@ -1861,21 +1864,21 @@ namespace Learner
         {
             cout << "buffer_size     : " << buffer_size << endl;
             cout << "shuffle mode.." << endl;
-            shuffle_files(filenames, output_file_name, buffer_size);
+            shuffle_files(filenames, output_file_name, buffer_size, seed);
             return;
         }
 
         if (shuffle_quick)
         {
             cout << "quick shuffle mode.." << endl;
-            shuffle_files_quick(filenames, output_file_name);
+            shuffle_files_quick(filenames, output_file_name, seed);
             return;
         }
 
         if (shuffle_on_memory)
         {
             cout << "shuffle on memory.." << endl;
-            shuffle_files_on_memory(filenames, output_file_name);
+            shuffle_files_on_memory(filenames, output_file_name, seed);
             return;
         }
 
diff --git a/src/learn/multi_think.h b/src/learn/multi_think.h
index 7de9d6b9..4b5662aa 100644
--- a/src/learn/multi_think.h
+++ b/src/learn/multi_think.h
@@ -10,6 +10,8 @@
 #include <limits>
 #include <functional>
 #include <mutex>
+#include <string>
+#include <cstdint>
 
 
 // Learning from a game record, when making yourself think and generating a fixed track, etc.
@@ -19,10 +21,11 @@ struct MultiThink
 {
 	static constexpr std::uint64_t LOOP_COUNT_FINISHED = std::numeric_limits<std::uint64_t>::max();
 
-	MultiThink() : prng(std::chrono::system_clock::now().time_since_epoch().count())
-	{
-		loop_count = 0;
-	}
+	MultiThink() : prng{}, loop_count(0) { }
+
+	MultiThink(std::uint64_t seed) : prng(seed), loop_count(0) { }
+
+	MultiThink(const std::string& seed) : prng(seed), loop_count(0) { }
 
 	// Call this function from the master thread, each thread will think,
 	// Return control when the thought ending condition is satisfied.

From e8472b5fbe1eed1cbcdfe06eb8ae9206bac773e0 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Fri, 18 Sep 2020 20:22:01 +0200
Subject: [PATCH 290/583] Fix races in gensfen as detected with thread
 sanitizer.

RootInTB was an incorrectly shared global, probably leading to wrong scoreing

Minor:
 setting TB global state from input by all threads (all threads write same values)
 setting Limits global state by all threads (idem)
 thread counting for finalization

CI can be enabled once races are fixed in the learner, manually goes like:
```
make clean && make -j2 ARCH=x86-64-modern sanitize=thread    optimize=no debug=yes build
../tests/instrumented_learn.sh --sanitizer-thread
```

Needs some review.
---
 src/learn/multi_think.cpp | 33 ++++++++++++++-----
 src/learn/multi_think.h   |  5 +--
 src/search.cpp            | 69 ++++++++++++++-------------------------
 src/search.h              | 12 +++++++
 src/syzygy/tbprobe.h      |  2 --
 src/thread.cpp            |  2 ++
 src/thread.h              |  1 +
 7 files changed, 66 insertions(+), 58 deletions(-)

diff --git a/src/learn/multi_think.cpp b/src/learn/multi_think.cpp
index 043238fa..22e49e81 100644
--- a/src/learn/multi_think.cpp
+++ b/src/learn/multi_think.cpp
@@ -3,6 +3,7 @@
 #include "tt.h"
 #include "uci.h"
 #include "types.h"
+#include "search.h"
 
 #include <thread>
 
@@ -23,6 +24,27 @@ void MultiThink::go_think()
 	// Call the derived class's init().
 	init();
 
+        // init global vars
+        Tablebases::init();
+
+        // About Search::Limits
+        // Be careful because this member variable is global and affects other threads.
+        {
+          auto& limits = Search::Limits;
+
+          // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done)
+          limits.infinite = true;
+
+          // Since PV is an obstacle when displayed, erase it.
+          limits.silent = true;
+
+          // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
+          limits.nodes = 0;
+
+          // depth is also processed by the one passed as an argument of Learner::search().
+          limits.depth = 0;
+        }
+
 	// The loop upper limit is set with set_loop_max().
 	loop_count = 0;
 	done_count = 0;
@@ -32,12 +54,11 @@ void MultiThink::go_think()
 	auto thread_num = (size_t)Options["Threads"];
 
 	// Secure end flag of worker thread
-	thread_finished.resize(thread_num);
+        threads_finished=0;
 
 	// start worker thread
 	for (size_t i = 0; i < thread_num; ++i)
 	{
-		thread_finished[i] = 0;
 		threads.push_back(std::thread([i, this]
 		{
 			// exhaust all processor threads.
@@ -47,7 +68,7 @@ void MultiThink::go_think()
 			this->thread_worker(i);
 
 			// Set the end flag because the thread has ended
-			this->thread_finished[i] = 1;
+			this->threads_finished++;
 		}));
 	}
 
@@ -61,11 +82,7 @@ void MultiThink::go_think()
 	// function to determine if all threads have finished
 	auto threads_done = [&]()
 	{
-		// returns false if no one is finished
-		for (auto& f : thread_finished)
-			if (!f)
-				return false;
-		return true;
+		return threads_finished == thread_num;
 	};
 
 	// Call back if the callback function is set.
diff --git a/src/learn/multi_think.h b/src/learn/multi_think.h
index 4b5662aa..e6c436f8 100644
--- a/src/learn/multi_think.h
+++ b/src/learn/multi_think.h
@@ -96,10 +96,7 @@ private:
 	std::mutex loop_mutex;
 
 	// Thread end flag.
-	// vector<bool> may not be reflected properly when trying to rewrite from multiple threads...
-	typedef uint8_t Flag;
-	std::vector<Flag> thread_finished;
-
+        std::atomic<uint64_t> threads_finished;
 };
 
 // Mechanism to process task during idle time.
diff --git a/src/search.cpp b/src/search.cpp
index 7c6f8ace..9f5119a2 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -43,9 +43,24 @@ namespace Search {
 namespace Tablebases {
 
   int Cardinality;
-  bool RootInTB;
   bool UseRule50;
   Depth ProbeDepth;
+
+  void init() {
+
+      UseRule50 = bool(Options["Syzygy50MoveRule"]);
+      ProbeDepth = int(Options["SyzygyProbeDepth"]);
+      Cardinality = int(Options["SyzygyProbeLimit"]);
+
+      // Tables with fewer pieces than SyzygyProbeLimit are searched with
+      // ProbeDepth == DEPTH_ZERO
+      if (Cardinality > MaxCardinality)
+      {
+          Cardinality = MaxCardinality;
+          ProbeDepth = 0;
+      }
+  }
+
 }
 
 namespace TB = Tablebases;
@@ -1844,7 +1859,7 @@ string UCI::pv(const Position& pos, Depth depth, Value alpha, Value beta) {
   size_t pvIdx = pos.this_thread()->pvIdx;
   size_t multiPV = std::min((size_t)Options["MultiPV"], rootMoves.size());
   uint64_t nodesSearched = Threads.nodes_searched();
-  uint64_t tbHits = Threads.tb_hits() + (TB::RootInTB ? rootMoves.size() : 0);
+  uint64_t tbHits = Threads.tb_hits() + (pos.this_thread()->rootInTB ? rootMoves.size() : 0);
 
   for (size_t i = 0; i < multiPV; ++i)
   {
@@ -1856,7 +1871,7 @@ string UCI::pv(const Position& pos, Depth depth, Value alpha, Value beta) {
       Depth d = updated ? depth : depth - 1;
       Value v = updated ? rootMoves[i].score : rootMoves[i].previousScore;
 
-      bool tb = TB::RootInTB && abs(v) < VALUE_MATE_IN_MAX_PLY;
+      bool tb = pos.this_thread()->rootInTB && abs(v) < VALUE_MATE_IN_MAX_PLY;
       v = tb ? rootMoves[i].tbScore : v;
 
       if (ss.rdbuf()->in_avail()) // Not at first line
@@ -1923,10 +1938,8 @@ bool RootMove::extract_ponder_from_tt(Position& pos) {
 
 void Tablebases::rank_root_moves(Position& pos, Search::RootMoves& rootMoves) {
 
-    RootInTB = false;
-    UseRule50 = bool(Options["Syzygy50MoveRule"]);
-    ProbeDepth = int(Options["SyzygyProbeDepth"]);
-    Cardinality = int(Options["SyzygyProbeLimit"]);
+    auto& rootInTB = pos.this_thread()->rootInTB;
+    rootInTB = false;
     bool dtz_available = true;
 
     // Tables with fewer pieces than SyzygyProbeLimit are searched with
@@ -1940,17 +1953,17 @@ void Tablebases::rank_root_moves(Position& pos, Search::RootMoves& rootMoves) {
     if (Cardinality >= popcount(pos.pieces()) && !pos.can_castle(ANY_CASTLING))
     {
         // Rank moves using DTZ tables
-        RootInTB = root_probe(pos, rootMoves);
+        rootInTB = root_probe(pos, rootMoves);
 
-        if (!RootInTB)
+        if (!rootInTB)
         {
             // DTZ tables are missing; try to rank moves using WDL tables
             dtz_available = false;
-            RootInTB = root_probe_wdl(pos, rootMoves);
+            rootInTB = root_probe_wdl(pos, rootMoves);
         }
     }
 
-    if (RootInTB)
+    if (rootInTB)
     {
         // Sort moves according to TB rank
         std::sort(rootMoves.begin(), rootMoves.end(),
@@ -1966,6 +1979,7 @@ void Tablebases::rank_root_moves(Position& pos, Search::RootMoves& rootMoves) {
         for (auto& m : rootMoves)
             m.tbRank = 0;
     }
+
 }
 
 // --- expose the functions such as fixed depth search used for learning to the outside
@@ -1987,39 +2001,6 @@ namespace Learner
 
     std::memset(ss - 7, 0, 10 * sizeof(Stack));
 
-    // About Search::Limits
-    // Be careful because this member variable is global and affects other threads.
-    {
-      auto& limits = Search::Limits;
-
-      // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done)
-      limits.infinite = true;
-
-      // Since PV is an obstacle when displayed, erase it.
-      limits.silent = true;
-
-      // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
-      limits.nodes = 0;
-
-      // depth is also processed by the one passed as an argument of Learner::search().
-      limits.depth = 0;
-
-      // Set a large value to prevent the draw value from being returned due to the number of moves near the draw.
-      //limits.max_game_ply = 1 << 16;
-
-      // If you do not include the ball entry rule, it will be a draw and it will be difficult to settle.
-      //limits.enteringKingRule = EnteringKingRule::EKR_27_POINT;
-    }
-
-    // Set DrawValue
-    {
-      // Because it is not prepared for each thread
-      // May be overwritten by another thread. There is no help for it.
-      // If that happens, I think it should be 0.
-      //drawValueTable[REPETITION_DRAW][BLACK] = VALUE_ZERO;
-      //drawValueTable[REPETITION_DRAW][WHITE] = VALUE_ZERO;
-    }
-
     // Regarding this_thread.
 
     {
diff --git a/src/search.h b/src/search.h
index 20dfe909..fd5814ef 100644
--- a/src/search.h
+++ b/src/search.h
@@ -24,6 +24,7 @@
 #include "misc.h"
 #include "movepick.h"
 #include "types.h"
+#include "uci.h"
 
 class Position;
 
@@ -110,6 +111,17 @@ void clear();
 
 } // namespace Search
 
+namespace Tablebases {
+
+extern int MaxCardinality;
+extern int Cardinality;
+extern bool UseRule50;
+extern Depth ProbeDepth;
+
+void init();
+
+}
+
 namespace Learner {
 
   // A pair of reader and evaluation value. Returned by Learner::search(),Learner::qsearch().
diff --git a/src/syzygy/tbprobe.h b/src/syzygy/tbprobe.h
index b998989b..6af5d278 100644
--- a/src/syzygy/tbprobe.h
+++ b/src/syzygy/tbprobe.h
@@ -43,8 +43,6 @@ enum ProbeState {
     ZEROING_BEST_MOVE =  2  // Best move zeroes DTZ (capture or pawn move)
 };
 
-extern int MaxCardinality;
-
 void init(const std::string& paths);
 WDLScore probe_wdl(Position& pos, ProbeState* result);
 int probe_dtz(Position& pos, ProbeState* result);
diff --git a/src/thread.cpp b/src/thread.cpp
index 1aa66a81..ef4cb398 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -192,6 +192,8 @@ void ThreadPool::start_thinking(Position& pos, StateListPtr& states,
           || std::count(limits.searchmoves.begin(), limits.searchmoves.end(), m))
           rootMoves.emplace_back(m);
 
+  Tablebases::init();
+
   if (!rootMoves.empty())
       Tablebases::rank_root_moves(pos, rootMoves);
 
diff --git a/src/thread.h b/src/thread.h
index 042bc2e9..e0c838c8 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -74,6 +74,7 @@ public:
   CapturePieceToHistory captureHistory;
   ContinuationHistory continuationHistory[2][2];
   Score contempt;
+  bool rootInTB;
 };
 
 
From 61bc8d12d39cb31303ec9162b1ca8a015d896192 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Fri, 18 Sep 2020 23:06:45 +0200
Subject: [PATCH 291/583] Fix some races in learning

declare a few variables atomic.

Other races remain...
---
 src/learn/learn.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 6d0a777d..6142ce6b 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -695,14 +695,14 @@ namespace Learner
         uint64_t last_done;
 
         // If total_read exceeds this value, update_weights() and calculate mse.
-        uint64_t next_update_weights;
+        std::atomic<uint64_t> next_update_weights;
 
         uint64_t save_count;
 
         // Do not shuffle when reading the phase.
         bool no_shuffle;
 
-        bool stop_flag;
+        std::atomic<bool> stop_flag;
 
         vector<Key> hash;
 
@@ -785,7 +785,7 @@ namespace Learner
         // Mini batch size size. Be sure to set it on the side that uses this class.
         uint64_t mini_batch_size = LEARN_MINI_BATCH_SIZE;
 
-        bool stop_flag;
+        std::atomic<bool> stop_flag;
 
         // Discount rate
         double discount_rate;

From da28ce3339bd19356ec59d50a897fde3d5e213c1 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Sat, 19 Sep 2020 19:27:21 +0200
Subject: [PATCH 292/583] Add initialization also to learning patch

fixes https://github.com/nodchip/Stockfish/issues/160
---
 src/learn/learn.cpp | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 6142ce6b..c1900af3 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -1985,6 +1985,27 @@ namespace Learner
 
         Threads.main()->ponder = false;
 
+        // init global vars
+        Tablebases::init();
+
+        // About Search::Limits
+        // Be careful because this member variable is global and affects other threads.
+        {
+          auto& limits = Search::Limits;
+
+          // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done)
+          limits.infinite = true;
+
+          // Since PV is an obstacle when displayed, erase it.
+          limits.silent = true;
+
+          // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
+          limits.nodes = 0;
+
+          // depth is also processed by the one passed as an argument of Learner::search().
+          limits.depth = 0;
+        }
+
         cout << "init_training.." << endl;
         Eval::NNUE::InitializeTraining(eta1, eta1_epoch, eta2, eta2_epoch, eta3);
         Eval::NNUE::SetBatchSize(nn_batch_size);

From d4737819cd7aea0e7744df9973dd5c1db228000e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 20 Sep 2020 10:39:21 +0200
Subject: [PATCH 293/583] Fix castling rights feature encoding.

---
 src/nnue/features/castling_right.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/nnue/features/castling_right.cpp b/src/nnue/features/castling_right.cpp
index 86fe06fe..ee2c88cf 100644
--- a/src/nnue/features/castling_right.cpp
+++ b/src/nnue/features/castling_right.cpp
@@ -27,7 +27,7 @@ namespace Eval {
         }
 
         for (Eval::NNUE::IndexType i = 0; i < kDimensions; ++i) {
-          if (relative_castling_rights & (i << 1)) {
+          if (relative_castling_rights & (1 << i)) {
             active->push_back(i);
           }
         }
@@ -55,8 +55,8 @@ namespace Eval {
         }
 
         for (Eval::NNUE::IndexType i = 0; i < kDimensions; ++i) {
-          if ((relative_previous_castling_rights & (i << 1)) &&
-            (relative_current_castling_rights & (i << 1)) == 0) {
+          if ((relative_previous_castling_rights & (1 << i)) &&
+            (relative_current_castling_rights & (1 << i)) == 0) {
             removed->push_back(i);
           }
         }

From 2931463d3a8b2ea86ac223842dc775fb0ab68de6 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Sun, 20 Sep 2020 19:43:38 +0200
Subject: [PATCH 294/583] Revert earlier TB changes.

they were not correct. Unfortunately, also restores the race on RootInTB
---
 src/learn/learn.cpp       |  3 ---
 src/learn/multi_think.cpp |  3 ---
 src/search.cpp            | 36 +++++++++++-------------------------
 src/search.h              | 12 ------------
 src/syzygy/tbprobe.h      |  2 ++
 src/thread.cpp            |  2 --
 src/thread.h              |  1 -
 7 files changed, 13 insertions(+), 46 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index c1900af3..ba904e9d 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -1985,9 +1985,6 @@ namespace Learner
 
         Threads.main()->ponder = false;
 
-        // init global vars
-        Tablebases::init();
-
         // About Search::Limits
         // Be careful because this member variable is global and affects other threads.
         {
diff --git a/src/learn/multi_think.cpp b/src/learn/multi_think.cpp
index 22e49e81..7c389d40 100644
--- a/src/learn/multi_think.cpp
+++ b/src/learn/multi_think.cpp
@@ -24,9 +24,6 @@ void MultiThink::go_think()
 	// Call the derived class's init().
 	init();
 
-        // init global vars
-        Tablebases::init();
-
         // About Search::Limits
         // Be careful because this member variable is global and affects other threads.
         {
diff --git a/src/search.cpp b/src/search.cpp
index 9f5119a2..e1616c5c 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -43,24 +43,9 @@ namespace Search {
 namespace Tablebases {
 
   int Cardinality;
+  bool RootInTB;
   bool UseRule50;
   Depth ProbeDepth;
-
-  void init() {
-
-      UseRule50 = bool(Options["Syzygy50MoveRule"]);
-      ProbeDepth = int(Options["SyzygyProbeDepth"]);
-      Cardinality = int(Options["SyzygyProbeLimit"]);
-
-      // Tables with fewer pieces than SyzygyProbeLimit are searched with
-      // ProbeDepth == DEPTH_ZERO
-      if (Cardinality > MaxCardinality)
-      {
-          Cardinality = MaxCardinality;
-          ProbeDepth = 0;
-      }
-  }
-
 }
 
 namespace TB = Tablebases;
@@ -1859,7 +1844,7 @@ string UCI::pv(const Position& pos, Depth depth, Value alpha, Value beta) {
   size_t pvIdx = pos.this_thread()->pvIdx;
   size_t multiPV = std::min((size_t)Options["MultiPV"], rootMoves.size());
   uint64_t nodesSearched = Threads.nodes_searched();
-  uint64_t tbHits = Threads.tb_hits() + (pos.this_thread()->rootInTB ? rootMoves.size() : 0);
+  uint64_t tbHits = Threads.tb_hits() + (TB::RootInTB ? rootMoves.size() : 0);
 
   for (size_t i = 0; i < multiPV; ++i)
   {
@@ -1871,7 +1856,7 @@ string UCI::pv(const Position& pos, Depth depth, Value alpha, Value beta) {
       Depth d = updated ? depth : depth - 1;
       Value v = updated ? rootMoves[i].score : rootMoves[i].previousScore;
 
-      bool tb = pos.this_thread()->rootInTB && abs(v) < VALUE_MATE_IN_MAX_PLY;
+      bool tb = TB::RootInTB && abs(v) < VALUE_MATE_IN_MAX_PLY;
       v = tb ? rootMoves[i].tbScore : v;
 
       if (ss.rdbuf()->in_avail()) // Not at first line
@@ -1938,8 +1923,10 @@ bool RootMove::extract_ponder_from_tt(Position& pos) {
 
 void Tablebases::rank_root_moves(Position& pos, Search::RootMoves& rootMoves) {
 
-    auto& rootInTB = pos.this_thread()->rootInTB;
-    rootInTB = false;
+    RootInTB = false;
+    UseRule50 = bool(Options["Syzygy50MoveRule"]);
+    ProbeDepth = int(Options["SyzygyProbeDepth"]);
+    Cardinality = int(Options["SyzygyProbeLimit"]);
     bool dtz_available = true;
 
     // Tables with fewer pieces than SyzygyProbeLimit are searched with
@@ -1953,17 +1940,17 @@ void Tablebases::rank_root_moves(Position& pos, Search::RootMoves& rootMoves) {
     if (Cardinality >= popcount(pos.pieces()) && !pos.can_castle(ANY_CASTLING))
     {
         // Rank moves using DTZ tables
-        rootInTB = root_probe(pos, rootMoves);
+        RootInTB = root_probe(pos, rootMoves);
 
-        if (!rootInTB)
+        if (!RootInTB)
         {
             // DTZ tables are missing; try to rank moves using WDL tables
             dtz_available = false;
-            rootInTB = root_probe_wdl(pos, rootMoves);
+            RootInTB = root_probe_wdl(pos, rootMoves);
         }
     }
 
-    if (rootInTB)
+    if (RootInTB)
     {
         // Sort moves according to TB rank
         std::sort(rootMoves.begin(), rootMoves.end(),
@@ -1979,7 +1966,6 @@ void Tablebases::rank_root_moves(Position& pos, Search::RootMoves& rootMoves) {
         for (auto& m : rootMoves)
             m.tbRank = 0;
     }
-
 }
 
 // --- expose the functions such as fixed depth search used for learning to the outside
diff --git a/src/search.h b/src/search.h
index fd5814ef..20dfe909 100644
--- a/src/search.h
+++ b/src/search.h
@@ -24,7 +24,6 @@
 #include "misc.h"
 #include "movepick.h"
 #include "types.h"
-#include "uci.h"
 
 class Position;
 
@@ -111,17 +110,6 @@ void clear();
 
 } // namespace Search
 
-namespace Tablebases {
-
-extern int MaxCardinality;
-extern int Cardinality;
-extern bool UseRule50;
-extern Depth ProbeDepth;
-
-void init();
-
-}
-
 namespace Learner {
 
   // A pair of reader and evaluation value. Returned by Learner::search(),Learner::qsearch().
diff --git a/src/syzygy/tbprobe.h b/src/syzygy/tbprobe.h
index 6af5d278..b998989b 100644
--- a/src/syzygy/tbprobe.h
+++ b/src/syzygy/tbprobe.h
@@ -43,6 +43,8 @@ enum ProbeState {
     ZEROING_BEST_MOVE =  2  // Best move zeroes DTZ (capture or pawn move)
 };
 
+extern int MaxCardinality;
+
 void init(const std::string& paths);
 WDLScore probe_wdl(Position& pos, ProbeState* result);
 int probe_dtz(Position& pos, ProbeState* result);
diff --git a/src/thread.cpp b/src/thread.cpp
index ef4cb398..1aa66a81 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -192,8 +192,6 @@ void ThreadPool::start_thinking(Position& pos, StateListPtr& states,
           || std::count(limits.searchmoves.begin(), limits.searchmoves.end(), m))
           rootMoves.emplace_back(m);
 
-  Tablebases::init();
-
   if (!rootMoves.empty())
       Tablebases::rank_root_moves(pos, rootMoves);
 
diff --git a/src/thread.h b/src/thread.h
index e0c838c8..042bc2e9 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -74,7 +74,6 @@ public:
   CapturePieceToHistory captureHistory;
   ContinuationHistory continuationHistory[2][2];
   Score contempt;
-  bool rootInTB;
 };
 
 
From 9f3de8b40eda71b04e6b88f5deaf45a7d1efb402 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 24 Sep 2020 21:10:10 +0200
Subject: [PATCH 295/583] Revert some unwanted changes from merge conflict
 resolution.

---
 src/evaluate.h        |  2 +-
 src/learn/gensfen.cpp | 21 ++-------------------
 src/misc.cpp          | 12 ++++++++++++
 3 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/src/evaluate.h b/src/evaluate.h
index 74a490f6..e6ac7e1c 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -42,7 +42,7 @@ namespace Eval {
   // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
   // for the build process (profile-build and fishtest) to work. Do not change the
   // name of the macro, as it is used in the Makefile.
-  #define EvalFileDefaultName   "nn.bin"
+  #define EvalFileDefaultName   "nn-03744f8d56d8.nnue"
 
   namespace NNUE {
 
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index ba0c3be8..24d05c96 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -47,7 +47,6 @@ namespace Learner
     static bool detect_draw_by_consecutive_low_score = false;
     static bool detect_draw_by_insufficient_mating_material = false;
 
-    static std::vector<std::string> bookStart;
     static SfenOutputType sfen_output_type = SfenOutputType::Bin;
 
     static bool ends_with(const std::string& lhs, const std::string& end)
@@ -817,7 +816,7 @@ namespace Learner
             auto th = Threads[thread_id];
 
             auto& pos = th->rootPos;
-            pos.set(bookStart[prng.rand(bookStart.size())], false, &si, th);
+            pos.set(StartFEN, false, &si, th);
 
             int resign_counter = 0;
             bool should_resign = prng.rand(10) > 1;
@@ -1127,28 +1126,12 @@ namespace Learner
             output_file_name = output_file_name + "_" + to_hex(r.rand<uint64_t>()) + to_hex(r.rand<uint64_t>());
         }
 
-        bookStart.clear();
-        {
-          std::string line;
-          std::ifstream myfile ("3moves_v2.epd");
-          if (myfile.is_open())
-          {
-            while (getline(myfile,line))
-            {
-                bookStart.push_back(line);
-            }
-            myfile.close();
-          } else {
-            bookStart.push_back(StartFEN);
-          }
-        }
         std::cout << "gensfen : " << endl
             << "  search_depth_min = " << search_depth_min << " to " << search_depth_max << endl
             << "  nodes = " << nodes << endl
             << "  loop_max = " << loop_max << endl
             << "  eval_limit = " << eval_limit << endl
-            << "  thread_num             = " << thread_num << endl
-            << "  bookStart              = " << bookStart.size() << endl
+            << "  thread_num (set by USI setoption) = " << thread_num << endl
             << "  random_move_minply     = " << random_move_minply << endl
             << "  random_move_maxply     = " << random_move_maxply << endl
             << "  random_move_count      = " << random_move_count << endl
diff --git a/src/misc.cpp b/src/misc.cpp
index a0e01820..d31538fa 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -408,11 +408,23 @@ static void* aligned_large_pages_alloc_win(size_t allocSize) {
 
 void* aligned_large_pages_alloc(size_t allocSize) {
 
+  static bool firstCall = true;
   void* mem;
 
   // Try to allocate large pages
   mem = aligned_large_pages_alloc_win(allocSize);
 
+  // Suppress info strings on the first call. The first call occurs before 'uci'
+  // is received and in that case this output confuses some GUIs.
+  if (!firstCall)
+  {
+      if (mem)
+          sync_cout << "info string Hash table allocation: Windows large pages used." << sync_endl;
+      else
+          sync_cout << "info string Hash table allocation: Windows large pages not used." << sync_endl;
+  }
+  firstCall = false;
+
   // Fall back to regular, page aligned, allocation if necessary
   if (!mem)
       mem = VirtualAlloc(NULL, allocSize, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);

From 9f87282c6d2e9c81c1ca8997778ae996c40fbe62 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 24 Sep 2020 21:59:25 +0200
Subject: [PATCH 296/583] Fix net not being downloaded on build. Make PGO build
 faster by reverting gensfen command change.

---
 src/Makefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index d069dee6..0b2f99ed 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -41,7 +41,7 @@ BINDIR = $(PREFIX)/bin
 ### Built-in benchmark for pgo-builds
 PGO_TRAINING_DATA_FILE = pgo_training_data.bin
 PGOBENCH = ./$(EXE) bench
-PGOGENSFEN = ./$(EXE) gensfen depth 6 loop 10000 output_file_name $(PGO_TRAINING_DATA_FILE)
+PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 1000 output_file_name $(PGO_TRAINING_DATA_FILE)
 
 ### Source and object files
 SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \
@@ -746,10 +746,10 @@ endif
         config-sanity icc-profile-use icc-profile-make gcc-profile-use gcc-profile-make \
         clang-profile-use clang-profile-make
 
-build: config-sanity
+build: config-sanity net
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) all
 
-profile-build: config-sanity objclean profileclean
+profile-build: net config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 1/4. Building instrumented executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make)

From d4a5f917663fb1bdb2c085eb93d7791be9aef929 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 24 Sep 2020 22:57:55 +0200
Subject: [PATCH 297/583] Add info string when loading/failing to load an eval
 file.

---
 src/evaluate.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 6996e7ae..aa9bbd67 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -77,7 +77,14 @@ namespace Eval {
         {
             ifstream stream(directory + eval_file, ios::binary);
             if (load_eval(eval_file, stream))
+            {
+                sync_cout << "info string Loaded eval file " << directory + eval_file << sync_endl;
                 eval_file_loaded = eval_file;
+            }
+            else
+            {
+                sync_cout << "info string ERROR: failed to load eval file " << directory + eval_file << sync_endl;
+            }
         }
   }
 

From baf8b5beaf5dff1b335100801a8b88da4ede5813 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 24 Sep 2020 22:58:21 +0200
Subject: [PATCH 298/583] Change default net so that the architecture matches
 the architecture expected by the binary.

---
 src/evaluate.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/evaluate.h b/src/evaluate.h
index e6ac7e1c..ac67494d 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -42,7 +42,7 @@ namespace Eval {
   // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
   // for the build process (profile-build and fishtest) to work. Do not change the
   // name of the macro, as it is used in the Makefile.
-  #define EvalFileDefaultName   "nn-03744f8d56d8.nnue"
+  #define EvalFileDefaultName   "nn-28e08a9fe2ad.nnue"
 
   namespace NNUE {
 

From 9955f51215d51c35b63c5c88d5dcadcb314fe2b7 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 24 Sep 2020 23:23:37 +0200
Subject: [PATCH 299/583] Update bench signature. Bench: 4698761


From 0a3e070ffb8e47df46533c65bec638630e049300 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 25 Sep 2020 00:11:24 +0200
Subject: [PATCH 300/583] Adjust instrumented learn test for parameter changes.

---
 tests/instrumented_learn.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/instrumented_learn.sh b/tests/instrumented_learn.sh
index 7f76fd76..edbce5fe 100755
--- a/tests/instrumented_learn.sh
+++ b/tests/instrumented_learn.sh
@@ -127,7 +127,7 @@ cat << EOF > learn01.exp
  send "setoption name Use NNUE value true\n"
  send "setoption name Threads value $threads\n"
  send "isready\n"
- send "learn targetdir training_data loop 2 batchsize 100 use_draw_in_training 1 use_draw_in_validation 1 eta 1 lambda 1 eval_limit 32000 nn_batch_size 30 newbob_decay 0.5 eval_save_interval 30 loss_output_interval 10 mirror_percentage 50 validation_set_file_name validation_data/validation_data.bin\n"
+ send "learn targetdir training_data loop 2 batchsize 100 use_draw_in_training 1 use_draw_in_validation 1 lr 1 eval_limit 32000 nn_batch_size 30 newbob_decay 0.5 eval_save_interval 30 loss_output_interval 10 validation_set_file_name validation_data/validation_data.bin\n"
 
  expect "save_eval() finished."
 

From 654b94f0a7a8384d88b5f46cbbf250cceaa66417 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 25 Sep 2020 10:41:40 +0200
Subject: [PATCH 301/583] Remove old unused `use_raw_nnue_eval` option from
 gensfen tests

---
 tests/instrumented_learn.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/instrumented_learn.sh b/tests/instrumented_learn.sh
index edbce5fe..ce1fc429 100755
--- a/tests/instrumented_learn.sh
+++ b/tests/instrumented_learn.sh
@@ -78,11 +78,11 @@ cat << EOF > gensfen01.exp
  send "setoption name Threads value $threads\n"
  send "setoption name Use NNUE value false\n"
  send "isready\n"
- send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data/training_data.bin use_raw_nnue_eval 0 sfen_format bin\n"
+ send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data/training_data.bin sfen_format bin\n"
  expect "gensfen finished."
  send "learn training_data/training_data.bin convert_plain output_file_name training_data.txt\n"
  expect "all done"
- send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data/training_data.binpack use_raw_nnue_eval 0 sfen_format binpack\n"
+ send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data/training_data.binpack sfen_format binpack\n"
  expect "gensfen finished."
 
  send "quit\n"
@@ -104,9 +104,9 @@ cat << EOF > gensfen02.exp
  send "setoption name Threads value $threads\n"
  send "setoption name Use NNUE value true\n"
  send "isready\n"
- send "gensfen depth 4 loop 50 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name validation_data/valdidation_data.bin use_raw_nnue_eval 0 sfen_format bin\n"
+ send "gensfen depth 4 loop 50 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name validation_data/valdidation_data.bin sfen_format bin\n"
  expect "gensfen finished."
- send "gensfen depth 4 loop 50 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name validation_data/validation_data.binpack use_raw_nnue_eval 0 sfen_format binpack\n"
+ send "gensfen depth 4 loop 50 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name validation_data/validation_data.binpack sfen_format binpack\n"
  expect "gensfen finished."
 
  send "quit\n"

From 89eeb36835fe9987283cad1660bbacc6ff1e8fab Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 25 Sep 2020 13:42:27 +0200
Subject: [PATCH 302/583] Initialize Tablebases::MaxCardinality to 0 to prevent
 uninitialized variable read in rank_root_moves

---
 src/search.h           | 5 -----
 src/syzygy/tbprobe.cpp | 2 +-
 src/syzygy/tbprobe.h   | 2 ++
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/search.h b/src/search.h
index 9e453d9a..ab832ee2 100644
--- a/src/search.h
+++ b/src/search.h
@@ -112,11 +112,6 @@ void clear();
 
 } // namespace Search
 
-namespace Tablebases {
-
-extern int MaxCardinality;
-
-}
 namespace Learner {
 
   // A pair of reader and evaluation value. Returned by Learner::search(),Learner::qsearch().
diff --git a/src/syzygy/tbprobe.cpp b/src/syzygy/tbprobe.cpp
index 4d682f1a..f4b9447f 100644
--- a/src/syzygy/tbprobe.cpp
+++ b/src/syzygy/tbprobe.cpp
@@ -52,7 +52,7 @@
 
 using namespace Tablebases;
 
-int Tablebases::MaxCardinality;
+int Tablebases::MaxCardinality = 0;
 
 namespace {
 
diff --git a/src/syzygy/tbprobe.h b/src/syzygy/tbprobe.h
index 6af5d278..5f97c746 100644
--- a/src/syzygy/tbprobe.h
+++ b/src/syzygy/tbprobe.h
@@ -25,6 +25,8 @@
 
 namespace Tablebases {
 
+extern int MaxCardinality;
+
 enum WDLScore {
     WDLLoss        = -2, // Loss
     WDLBlessedLoss = -1, // Loss, but draw under 50-move rule

From b6e7733b4c047682f467414ba9f2959d67249705 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 25 Sep 2020 15:04:21 +0200
Subject: [PATCH 303/583] In gensfen call search before get_current_game_result
 so that rootMoves is initialized by Learner::init_for_search. Don't call
 Tablebases::rank_root_moves in get_current_game_result because it's called in
 Learner::init_for_search. This fixes accessing uninitialized variables
 related to tablebases.

---
 src/learn/gensfen.cpp | 182 ++++++++++++++++++++----------------------
 1 file changed, 86 insertions(+), 96 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 24d05c96..67d898ba 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -465,18 +465,7 @@ namespace Learner
             return 0;
         }
 
-        // Initialize the Syzygy Ending Tablebase and sort the moves.
-        Search::RootMoves rootMoves;
-        for (const auto& m : MoveList<LEGAL>(pos))
-        {
-            rootMoves.emplace_back(m);
-        }
-
-        if (!rootMoves.empty())
-        {
-            Tablebases::rank_root_moves(pos, rootMoves);
-        }
-        else
+        if(pos.this_thread()->rootMoves.empty())
         {
             // If there is no legal move
             return pos.checkers()
@@ -847,6 +836,11 @@ namespace Learner
                 // Current search depth
                 const int depth = search_depth_min + (int)prng.rand(search_depth_max - search_depth_min + 1);
 
+                // Starting search calls init_for_search
+                auto [search_value, search_pv] = search(pos, depth, 1, nodes);
+
+                // This has to be performed after search because it needs to know
+                // rootMoves which are filled in init_for_search.
                 const auto result = get_current_game_result(pos, move_hist_scores);
                 if (result.has_value())
                 {
@@ -854,102 +848,98 @@ namespace Learner
                     break;
                 }
 
+                // Always adjudivate by eval limit.
+                // Also because of this we don't have to check for TB/MATE scores
+                if (abs(search_value) >= eval_limit)
                 {
-                    auto [search_value, search_pv] = search(pos, depth, 1, nodes);
-
-                    // Always adjudivate by eval limit.
-                    // Also because of this we don't have to check for TB/MATE scores
-                    if (abs(search_value) >= eval_limit)
-                    {
-                        resign_counter++;
-                        if ((should_resign && resign_counter >= 4) || abs(search_value) >= 10000) {
-                            flush_psv((search_value >= eval_limit) ? 1 : -1);
-                            break;
-                        }
-                    } else {
-                        resign_counter = 0;
-                    }
-                    // Verification of a strange move
-                    if (search_pv.size() > 0
-                        && (search_pv[0] == MOVE_NONE || search_pv[0] == MOVE_NULL))
-                    {
-                        // (???)
-                        // MOVE_WIN is checking if it is the declaration victory stage before this
-                        // The declarative winning move should never come back here.
-                        // Also, when MOVE_RESIGN, search_value is a one-stop score, which should be the minimum value of eval_limit (-31998)...
-                        cout << "Error! : " << pos.fen() << next_move << search_value << endl;
+                    resign_counter++;
+                    if ((should_resign && resign_counter >= 4) || abs(search_value) >= 10000) {
+                        flush_psv((search_value >= eval_limit) ? 1 : -1);
                         break;
                     }
+                } else {
+                    resign_counter = 0;
+                }
+                // Verification of a strange move
+                if (search_pv.size() > 0
+                    && (search_pv[0] == MOVE_NONE || search_pv[0] == MOVE_NULL))
+                {
+                    // (???)
+                    // MOVE_WIN is checking if it is the declaration victory stage before this
+                    // The declarative winning move should never come back here.
+                    // Also, when MOVE_RESIGN, search_value is a one-stop score, which should be the minimum value of eval_limit (-31998)...
+                    cout << "Error! : " << pos.fen() << next_move << search_value << endl;
+                    break;
+                }
 
-                    // Save the move score for adjudication.
-                    move_hist_scores.push_back(search_value);
+                // Save the move score for adjudication.
+                move_hist_scores.push_back(search_value);
 
-                    // If depth 0, pv is not obtained, so search again at depth 2.
-                    if (search_depth_min <= 0)
+                // If depth 0, pv is not obtained, so search again at depth 2.
+                if (search_depth_min <= 0)
+                {
+                    auto [research_value, research_pv] = search(pos, 2);
+                    search_pv = research_pv;
+                }
+
+                // Discard stuff before write_minply is reached
+                // because it can harm training due to overfitting.
+                // Initial positions would be too common.
+                if (ply < write_minply - 1)
+                {
+                    a_psv.clear();
+                    goto SKIP_SAVE;
+                }
+
+                // Look into the position hashtable to see if the same
+                // position was seen before.
+                // This is a good heuristic to exlude already seen
+                // positions without many false positives.
+                {
+                    auto key = pos.key();
+                    auto hash_index = (size_t)(key & (GENSFEN_HASH_SIZE - 1));
+                    auto old_key = hash[hash_index];
+                    if (key == old_key)
                     {
-                        auto [research_value, research_pv] = search(pos, 2);
-                        search_pv = research_pv;
-                    }
-
-                    // Discard stuff before write_minply is reached
-                    // because it can harm training due to overfitting.
-                    // Initial positions would be too common.
-                    if (ply < write_minply - 1)
-                    {
-                        a_psv.clear();
                         goto SKIP_SAVE;
                     }
-
-                    // Look into the position hashtable to see if the same
-                    // position was seen before.
-                    // This is a good heuristic to exlude already seen
-                    // positions without many false positives.
+                    else
                     {
-                        auto key = pos.key();
-                        auto hash_index = (size_t)(key & (GENSFEN_HASH_SIZE - 1));
-                        auto old_key = hash[hash_index];
-                        if (key == old_key)
-                        {
-                            goto SKIP_SAVE;
-                        }
-                        else
-                        {
-                            // Replace with the current key.
-                            hash[hash_index] = key;
-                        }
+                        // Replace with the current key.
+                        hash[hash_index] = key;
                     }
-
-                    // Pack the current position into a packed sfen and save it into the buffer.
-                    {
-                        a_psv.emplace_back(PackedSfenValue());
-                        auto& psv = a_psv.back();
-
-                        // Here we only write the position data.
-                        // Result is added after the whole game is done.
-                        pos.sfen_pack(psv.sfen);
-
-                        psv.score = search_value;
-
-                        psv.gamePly = ply;
-
-                        // Take out the first PV move. This should be present unless depth 0.
-                        assert(search_pv.size() >= 1);
-                        psv.move = search_pv[0];
-                    }
-
-                SKIP_SAVE:;
-
-                    // For some reason, We could not get PV (hit the substitution table etc. and got stuck?)
-                    // so go to the next game. It's a rare case, so you can ignore it.
-                    if (search_pv.size() == 0)
-                    {
-                        break;
-                    }
-
-                    // Update the next move according to best search result.
-                    next_move = search_pv[0];
                 }
 
+                // Pack the current position into a packed sfen and save it into the buffer.
+                {
+                    a_psv.emplace_back(PackedSfenValue());
+                    auto& psv = a_psv.back();
+
+                    // Here we only write the position data.
+                    // Result is added after the whole game is done.
+                    pos.sfen_pack(psv.sfen);
+
+                    psv.score = search_value;
+
+                    psv.gamePly = ply;
+
+                    // Take out the first PV move. This should be present unless depth 0.
+                    assert(search_pv.size() >= 1);
+                    psv.move = search_pv[0];
+                }
+
+            SKIP_SAVE:;
+
+                // For some reason, We could not get PV (hit the substitution table etc. and got stuck?)
+                // so go to the next game. It's a rare case, so you can ignore it.
+                if (search_pv.size() == 0)
+                {
+                    break;
+                }
+
+                // Update the next move according to best search result.
+                next_move = search_pv[0];
+
                 // Random move.
                 auto random_move = choose_random_move(pos, random_move_flag, ply, actual_random_move_count);
                 if (random_move.has_value())

From c99541828fbf9bf529bdb1675bf67debe39ce48e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 25 Sep 2020 16:06:33 +0200
Subject: [PATCH 304/583] Remove the re-search on depth 0. It is correctly
 handled by search now.

---
 src/learn/gensfen.cpp | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 67d898ba..7e931726 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -875,13 +875,6 @@ namespace Learner
                 // Save the move score for adjudication.
                 move_hist_scores.push_back(search_value);
 
-                // If depth 0, pv is not obtained, so search again at depth 2.
-                if (search_depth_min <= 0)
-                {
-                    auto [research_value, research_pv] = search(pos, 2);
-                    search_pv = research_pv;
-                }
-
                 // Discard stuff before write_minply is reached
                 // because it can harm training due to overfitting.
                 // Initial positions would be too common.

From 5e8a49f7f23489605435b0f359c3c70116bec5e3 Mon Sep 17 00:00:00 2001
From: noobpwnftw <guo.bojun@gmail.com>
Date: Sat, 26 Sep 2020 10:03:03 +0800
Subject: [PATCH 305/583] Restore lambda and gradient function post-merge and
 minor fixes.

bench: 3788313
---
 README.md                              |   5 +-
 src/evaluate.h                         |   2 +-
 src/learn/learn.cpp                    | 110 ++++++++++++++++++++++---
 src/learn/learn.h                      |   6 +-
 src/nnue/features/castling_right.cpp   |  28 ++++++-
 src/nnue/features/castling_right.h     |   2 +-
 src/nnue/features/enpassant.cpp        |  20 ++++-
 src/nnue/features/enpassant.h          |   4 +-
 src/nnue/features/feature_set.h        |   6 +-
 src/nnue/features/half_kp.cpp          |   5 +-
 src/nnue/features/half_kp.h            |   7 +-
 src/nnue/features/half_relative_kp.cpp |   4 +-
 src/nnue/nnue_architecture.h           |   2 +-
 src/position.cpp                       |   1 +
 src/syzygy/tbprobe.h                   |   4 +-
 15 files changed, 170 insertions(+), 36 deletions(-)

diff --git a/README.md b/README.md
index 1bad4b06..f84a544a 100644
--- a/README.md
+++ b/README.md
@@ -37,6 +37,7 @@ Additional options:
 To generate training data from the classic eval, use the gensfen command with the setting "Use NNUE" set to "false". The given example is generation in its simplest form. There are more commands. 
 ```
 uci
+setoption name PruneAtShallowDepth value false
 setoption name Use NNUE value false
 setoption name Threads value x
 setoption name Hash value y
@@ -56,11 +57,13 @@ The process is the same as the generation of training data, except for the fact
 Use the "learn" binary. Create an empty folder named "evalsave" in the same directory as the binaries.
 ```
 uci
+setoption name EnableTranspositionTable value false
+setoption name PruneAtShallowDepth value false
 setoption name SkipLoadingEval value true
 setoption name Use NNUE value pure
 setoption name Threads value x
 isready
-learn targetdir trainingdata loop 100 batchsize 1000000 use_draw_in_training 1 use_draw_in_validation 1 eta 1 lambda 1 eval_limit 32000 nn_batch_size 1000 newbob_decay 0.5 eval_save_interval 250000000 loss_output_interval 1000000 mirror_percentage 50 validation_set_file_name validationdata\val.bin
+learn targetdir trainingdata loop 100 batchsize 1000000 use_draw_in_training 1 use_draw_in_validation 1 lr 1 lambda 1 eval_limit 32000 nn_batch_size 1000 newbob_decay 0.5 eval_save_interval 250000000 loss_output_interval 1000000 validation_set_file_name validationdata\val.bin
 ```
 Nets get saved in the "evalsave" folder. 
 
diff --git a/src/evaluate.h b/src/evaluate.h
index ac67494d..0c99fb5b 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -42,7 +42,7 @@ namespace Eval {
   // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
   // for the build process (profile-build and fishtest) to work. Do not change the
   // name of the macro, as it is used in the Makefile.
-  #define EvalFileDefaultName   "nn-28e08a9fe2ad.nnue"
+  #define EvalFileDefaultName   "nn-54f88d1580b4.nnue"
 
   namespace NNUE {
 
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index e2d9af1b..5320aaf8 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -157,6 +157,14 @@ namespace Learner
         return ((y2 - y1) / epsilon) / winning_probability_coefficient;
     }
 
+    // A constant used in elmo (WCSC27). Adjustment required.
+    // Since elmo does not internally divide the expression, the value is different.
+    // You can set this value with the learn command.
+    // 0.33 is equivalent to the constant (0.5) used in elmo (WCSC27)
+    double ELMO_LAMBDA = 0.33;
+    double ELMO_LAMBDA2 = 0.33;
+    double ELMO_LAMBDA_LIMIT = 32000;
+
     // Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
     double get_scaled_signal(double signal)
     {
@@ -182,6 +190,18 @@ namespace Learner
         return winning_percentage(scaled_teacher_signal, ply);
     }
 
+    double calculate_lambda(double teacher_signal)
+    {
+        // If the evaluation value in deep search exceeds ELMO_LAMBDA_LIMIT
+        // then apply ELMO_LAMBDA2 instead of ELMO_LAMBDA.
+        const double lambda =
+            (std::abs(teacher_signal) >= ELMO_LAMBDA_LIMIT)
+            ? ELMO_LAMBDA2
+            : ELMO_LAMBDA;
+
+        return lambda;
+    }
+
     double calculate_t(int game_result)
     {
         // Use 1 as the correction term if the expected win rate is 1,
@@ -192,6 +212,32 @@ namespace Learner
         return t;
     }
 
+    double calc_grad(Value teacher_signal, Value shallow, const PackedSfenValue& psv)
+    {
+        // elmo (WCSC27) method
+        // Correct with the actual game wins and losses.
+        const double q = winning_percentage(shallow, psv.gamePly);
+        const double p = calculate_p(teacher_signal, psv.gamePly);
+        const double t = calculate_t(psv.game_result);
+        const double lambda = calculate_lambda(teacher_signal);
+
+        double grad;
+        if (use_wdl)
+        {
+            const double dce_p = calc_d_cross_entropy_of_winning_percentage(p, shallow, psv.gamePly);
+            const double dce_t = calc_d_cross_entropy_of_winning_percentage(t, shallow, psv.gamePly);
+            grad = lambda * dce_p + (1.0 - lambda) * dce_t;
+        }
+        else
+        {
+            // Use the actual win rate as a correction term.
+            // This is the idea of ​​elmo (WCSC27), modern O-parts.
+            grad = lambda * (q - p) + (1.0 - lambda) * (q - t);
+        }
+
+        return grad;
+    }
+
     // Calculate cross entropy during learning
     // The individual cross entropy of the win/loss term and win
     // rate term of the elmo expression is returned
@@ -202,16 +248,21 @@ namespace Learner
         const PackedSfenValue& psv,
         double& cross_entropy_eval,
         double& cross_entropy_win,
+        double& cross_entropy,
         double& entropy_eval,
-        double& entropy_win)
+        double& entropy_win,
+        double& entropy)
     {
         // Teacher winning probability.
         const double q = winning_percentage(shallow, psv.gamePly);
         const double p = calculate_p(teacher_signal, psv.gamePly);
         const double t = calculate_t(psv.game_result);
+        const double lambda = calculate_lambda(teacher_signal);
 
         constexpr double epsilon = 0.000001;
 
+        const double m = (1.0 - lambda) * t + lambda * p;
+
         cross_entropy_eval =
             (-p * std::log(q + epsilon) - (1.0 - p) * std::log(1.0 - q + epsilon));
         cross_entropy_win =
@@ -220,12 +271,17 @@ namespace Learner
             (-p * std::log(p + epsilon) - (1.0 - p) * std::log(1.0 - p + epsilon));
         entropy_win =
             (-t * std::log(t + epsilon) - (1.0 - t) * std::log(1.0 - t + epsilon));
+
+        cross_entropy =
+            (-m * std::log(q + epsilon) - (1.0 - m) * std::log(1.0 - q + epsilon));
+        entropy =
+            (-m * std::log(m + epsilon) - (1.0 - m) * std::log(1.0 - m + epsilon));
     }
 
     // Other objective functions may be considered in the future...
     double calc_grad(Value shallow, const PackedSfenValue& psv)
     {
-        return (double)(shallow - (Value)psv.score) / 2400.0;
+        return calc_grad((Value)psv.score, shallow, psv);
     }
 
     struct BasicSfenInputStream
@@ -798,12 +854,14 @@ namespace Learner
         cout << ", learning rate = " << global_learning_rate << ", ";
 
         // For calculation of verification data loss
-        atomic<double> test_sum_cross_entropy_eval, test_sum_cross_entropy_win;
-        atomic<double> test_sum_entropy_eval, test_sum_entropy_win;
+        atomic<double> test_sum_cross_entropy_eval, test_sum_cross_entropy_win, test_sum_cross_entropy;
+        atomic<double> test_sum_entropy_eval, test_sum_entropy_win, test_sum_entropy;
         test_sum_cross_entropy_eval = 0;
         test_sum_cross_entropy_win = 0;
+        test_sum_cross_entropy = 0;
         test_sum_entropy_eval = 0;
         test_sum_entropy_win = 0;
+        test_sum_entropy = 0;
 
         // norm for learning
         atomic<double> sum_norm;
@@ -843,8 +901,10 @@ namespace Learner
                     &ps,
                     &test_sum_cross_entropy_eval,
                     &test_sum_cross_entropy_win,
+                    &test_sum_cross_entropy,
                     &test_sum_entropy_eval,
                     &test_sum_entropy_win,
+                    &test_sum_entropy,
                     &sum_norm,
                     &task_count,
                     &move_accord_count
@@ -872,22 +932,26 @@ namespace Learner
                 // For the time being, regarding the win rate and loss terms only in the elmo method
                 // Calculate and display the cross entropy.
 
-                double test_cross_entropy_eval, test_cross_entropy_win;
-                double test_entropy_eval, test_entropy_win;
+                double test_cross_entropy_eval, test_cross_entropy_win, test_cross_entropy;
+                double test_entropy_eval, test_entropy_win, test_entropy;
                 calc_cross_entropy(
                     deep_value,
                     shallow_value,
                     ps,
                     test_cross_entropy_eval,
                     test_cross_entropy_win,
+                    test_cross_entropy,
                     test_entropy_eval,
-                    test_entropy_win);
+                    test_entropy_win,
+                    test_entropy);
 
                 // The total cross entropy need not be abs() by definition.
                 test_sum_cross_entropy_eval += test_cross_entropy_eval;
                 test_sum_cross_entropy_win += test_cross_entropy_win;
+                test_sum_cross_entropy += test_cross_entropy;
                 test_sum_entropy_eval += test_entropy_eval;
                 test_sum_entropy_win += test_entropy_win;
+                test_sum_entropy += test_entropy;
                 sum_norm += (double)abs(shallow_value);
 
                 // Determine if the teacher's move and the score of the shallow search match
@@ -912,7 +976,7 @@ namespace Learner
         while (task_count)
             sleep(1);
 
-        latest_loss_sum += test_sum_cross_entropy_eval - test_sum_entropy_eval;
+        latest_loss_sum += test_sum_cross_entropy - test_sum_entropy;
         latest_loss_count += sr.sfen_for_mse.size();
 
         // learn_cross_entropy may be called train cross
@@ -927,6 +991,8 @@ namespace Learner
                 << " , test_cross_entropy_win = " << test_sum_cross_entropy_win / sr.sfen_for_mse.size()
                 << " , test_entropy_eval = " << test_sum_entropy_eval / sr.sfen_for_mse.size()
                 << " , test_entropy_win = " << test_sum_entropy_win / sr.sfen_for_mse.size()
+                << " , test_cross_entropy = " << test_sum_cross_entropy / sr.sfen_for_mse.size()
+                << " , test_entropy = " << test_sum_entropy / sr.sfen_for_mse.size()
                 << " , norm = " << sum_norm
                 << " , move accuracy = " << (move_accord_count * 100.0 / sr.sfen_for_mse.size()) << "%"
                 << endl;
@@ -938,6 +1004,8 @@ namespace Learner
                     << " , learn_cross_entropy_win = " << learn_sum_cross_entropy_win / done
                     << " , learn_entropy_eval = " << learn_sum_entropy_eval / done
                     << " , learn_entropy_win = " << learn_sum_entropy_win / done
+                    << " , learn_cross_entropy = " << learn_sum_cross_entropy / done
+                    << " , learn_entropy = " << learn_sum_entropy / done
                     << endl;
             }
         }
@@ -949,8 +1017,10 @@ namespace Learner
         // Clear 0 for next time.
         learn_sum_cross_entropy_eval = 0.0;
         learn_sum_cross_entropy_win = 0.0;
+        learn_sum_cross_entropy = 0.0;
         learn_sum_entropy_eval = 0.0;
         learn_sum_entropy_win = 0.0;
+        learn_sum_entropy = 0.0;
     }
 
     void LearnerThink::thread_worker(size_t thread_id)
@@ -1142,21 +1212,25 @@ namespace Learner
                     : -Eval::evaluate(pos);
 
                 // Calculate loss for training data
-                double learn_cross_entropy_eval, learn_cross_entropy_win;
-                double learn_entropy_eval, learn_entropy_win;
+                double learn_cross_entropy_eval, learn_cross_entropy_win, learn_cross_entropy;
+                double learn_entropy_eval, learn_entropy_win, learn_entropy;
                 calc_cross_entropy(
                     deep_value,
                     shallow_value,
                     ps,
                     learn_cross_entropy_eval,
                     learn_cross_entropy_win,
+                    learn_cross_entropy,
                     learn_entropy_eval,
-                    learn_entropy_win);
+                    learn_entropy_win,
+                    learn_entropy);
 
                 learn_sum_cross_entropy_eval += learn_cross_entropy_eval;
                 learn_sum_cross_entropy_win += learn_cross_entropy_win;
+                learn_sum_cross_entropy += learn_cross_entropy;
                 learn_sum_entropy_eval += learn_entropy_eval;
                 learn_sum_entropy_win += learn_entropy_win;
+                learn_sum_entropy += learn_entropy;
 
                 Eval::NNUE::AddExample(pos, rootColor, ps, 1.0);
 
@@ -1560,6 +1634,11 @@ namespace Learner
 
         global_learning_rate = 1.0;
 
+        // elmo lambda
+        ELMO_LAMBDA = 0.33;
+        ELMO_LAMBDA2 = 0.33;
+        ELMO_LAMBDA_LIMIT = 32000;
+
         // if (gamePly <rand(reduction_gameply)) continue;
         // An option to exclude the early stage from the learning target moderately like
         // If set to 1, rand(1)==0, so nothing is excluded.
@@ -1627,6 +1706,12 @@ namespace Learner
             // Using WDL with win rate model instead of sigmoid
             else if (option == "use_wdl") is >> use_wdl;
 
+
+            // LAMBDA
+            else if (option == "lambda")       is >> ELMO_LAMBDA;
+            else if (option == "lambda2")      is >> ELMO_LAMBDA2;
+            else if (option == "lambda_limit") is >> ELMO_LAMBDA_LIMIT;
+
             else if (option == "reduction_gameply") is >> reduction_gameply;
 
             // shuffle related
@@ -1814,6 +1899,9 @@ namespace Learner
         reduction_gameply = max(reduction_gameply, 1);
         cout << "reduction_gameply : " << reduction_gameply << endl;
 
+        cout << "LAMBDA            : " << ELMO_LAMBDA << endl;
+        cout << "LAMBDA2           : " << ELMO_LAMBDA2 << endl;
+        cout << "LAMBDA_LIMIT      : " << ELMO_LAMBDA_LIMIT << endl;
         cout << "eval_save_interval  : " << eval_save_interval << " sfens" << endl;
         cout << "loss_output_interval: " << loss_output_interval << " sfens" << endl;
 
diff --git a/src/learn/learn.h b/src/learn/learn.h
index c76d76c5..4b09f825 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -23,7 +23,11 @@ using LearnFloatType = float;
 // configure
 // ======================
 
-#define LOSS_FUNCTION "cross_entropy_eval"
+// ----------------------
+// Learning with the method of elmo (WCSC27)
+// ----------------------
+
+#define LOSS_FUNCTION "ELMO_METHOD(WCSC27)"
 
 // ----------------------
 // Definition of struct used in Learner
diff --git a/src/nnue/features/castling_right.cpp b/src/nnue/features/castling_right.cpp
index 2d7f563a..2b3f3209 100644
--- a/src/nnue/features/castling_right.cpp
+++ b/src/nnue/features/castling_right.cpp
@@ -31,10 +31,30 @@ namespace Eval::NNUE::Features {
 
   // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
   void CastlingRight::AppendChangedIndices(
-    const Position& /* pos */, Color /* perspective */,
-    IndexList* /* removed */, IndexList* /* added */) {
-    // Not implemented.
-    assert(false);
+      const Position& pos, Color perspective,
+      IndexList* removed, IndexList* /* added */) {
+    int previous_castling_rights = pos.state()->previous->castlingRights;
+    int current_castling_rights = pos.state()->castlingRights;
+    int relative_previous_castling_rights;
+    int relative_current_castling_rights;
+    if (perspective == WHITE) {
+      relative_previous_castling_rights = previous_castling_rights;
+      relative_current_castling_rights = current_castling_rights;
+    }
+    else {
+      // Invert the perspective.
+      relative_previous_castling_rights = ((previous_castling_rights & 3) << 2)
+        & ((previous_castling_rights >> 2) & 3);
+      relative_current_castling_rights = ((current_castling_rights & 3) << 2)
+        & ((current_castling_rights >> 2) & 3);
+    }
+
+    for (Eval::NNUE::IndexType i = 0; i < kDimensions; ++i) {
+      if ((relative_previous_castling_rights & (1 << i)) &&
+        (relative_current_castling_rights & (1 << i)) == 0) {
+        removed->push_back(i);
+      }
+    }
   }
 
 }  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/castling_right.h b/src/nnue/features/castling_right.h
index 3a09e14b..2d8c5322 100644
--- a/src/nnue/features/castling_right.h
+++ b/src/nnue/features/castling_right.h
@@ -19,7 +19,7 @@ namespace Eval::NNUE::Features {
     // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
     static constexpr IndexType kMaxActiveDimensions = 4;
     // Timing of full calculation instead of difference calculation
-    static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kAnyPieceMoved;
+    static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
 
     // Get a list of indices with a value of 1 among the features
     static void AppendActiveIndices(const Position& pos, Color perspective,
diff --git a/src/nnue/features/enpassant.cpp b/src/nnue/features/enpassant.cpp
index d771a85c..e5ceed5c 100644
--- a/src/nnue/features/enpassant.cpp
+++ b/src/nnue/features/enpassant.cpp
@@ -21,10 +21,22 @@ namespace Eval::NNUE::Features {
 
   // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
   void EnPassant::AppendChangedIndices(
-    const Position& /* pos */, Color /* perspective */,
-    IndexList* /* removed */, IndexList* /* added */) {
-    // Not implemented.
-    assert(false);
+      const Position& pos, Color /* perspective */,
+      IndexList* removed, IndexList* added) {
+
+    auto previous_epSquare = pos.state()->previous->epSquare;
+    auto epSquare = pos.state()->epSquare;
+
+    if (previous_epSquare != SQ_NONE) {
+      if (epSquare != SQ_NONE && file_of(epSquare) == file_of(previous_epSquare))
+        return;
+      auto file = file_of(previous_epSquare);
+      removed->push_back(file);
+    }
+    if (epSquare != SQ_NONE) {
+      auto file = file_of(epSquare);
+      added->push_back(file);
+    }
   }
 
 }  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/enpassant.h b/src/nnue/features/enpassant.h
index efa5eae9..065e74a0 100644
--- a/src/nnue/features/enpassant.h
+++ b/src/nnue/features/enpassant.h
@@ -19,13 +19,13 @@ namespace Eval::NNUE::Features {
     // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
     static constexpr IndexType kMaxActiveDimensions = 1;
     // Timing of full calculation instead of difference calculation
-    static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kAnyPieceMoved;
+    static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
 
     // Get a list of indices with a value of 1 among the features
     static void AppendActiveIndices(const Position& pos, Color perspective,
       IndexList* active);
 
-    // Get a list of indices whose values ??have changed from the previous one in the feature quantity
+    // Get a list of indices whose values have changed from the previous one in the feature quantity
     static void AppendChangedIndices(const Position& pos, Color perspective,
       IndexList* removed, IndexList* added);
   };
diff --git a/src/nnue/features/feature_set.h b/src/nnue/features/feature_set.h
index 2ef92e8e..24cdeb66 100644
--- a/src/nnue/features/feature_set.h
+++ b/src/nnue/features/feature_set.h
@@ -100,7 +100,6 @@ namespace Eval::NNUE::Features {
         IndexListType removed[2], IndexListType added[2], bool reset[2]) {
 
       const auto& dp = pos.state()->dirtyPiece;
-      if (dp.dirty_num == 0) return;
 
       for (Color perspective : { WHITE, BLACK }) {
         reset[perspective] = false;
@@ -108,12 +107,15 @@ namespace Eval::NNUE::Features {
           case TriggerEvent::kNone:
             break;
           case TriggerEvent::kFriendKingMoved:
+            if (dp.dirty_num == 0) continue;
             reset[perspective] = dp.piece[0] == make_piece(perspective, KING);
             break;
           case TriggerEvent::kEnemyKingMoved:
-              reset[perspective] = dp.piece[0] == make_piece(~perspective, KING);
+            if (dp.dirty_num == 0) continue;
+            reset[perspective] = dp.piece[0] == make_piece(~perspective, KING);
             break;
           case TriggerEvent::kAnyKingMoved:
+            if (dp.dirty_num == 0) continue;
             reset[perspective] = type_of(dp.piece[0]) == KING;
             break;
           case TriggerEvent::kAnyPieceMoved:
diff --git a/src/nnue/features/half_kp.cpp b/src/nnue/features/half_kp.cpp
index ff20a00a..ae1d697f 100644
--- a/src/nnue/features/half_kp.cpp
+++ b/src/nnue/features/half_kp.cpp
@@ -41,7 +41,7 @@ namespace Eval::NNUE::Features {
   void HalfKP<AssociatedKing>::AppendActiveIndices(
       const Position& pos, Color perspective, IndexList* active) {
 
-    Square ksq = orient(perspective, pos.square<KING>(perspective));
+    Square ksq = orient(perspective, pos.square<KING>(AssociatedKing == Side::kFriend ? perspective : ~perspective));
     Bitboard bb = pos.pieces() & ~pos.pieces(KING);
     while (bb) {
       Square s = pop_lsb(&bb);
@@ -55,7 +55,7 @@ namespace Eval::NNUE::Features {
       const Position& pos, Color perspective,
       IndexList* removed, IndexList* added) {
 
-    Square ksq = orient(perspective, pos.square<KING>(perspective));
+    Square ksq = orient(perspective, pos.square<KING>(AssociatedKing == Side::kFriend ? perspective : ~perspective));
     const auto& dp = pos.state()->dirtyPiece;
     for (int i = 0; i < dp.dirty_num; ++i) {
       Piece pc = dp.piece[i];
@@ -68,5 +68,6 @@ namespace Eval::NNUE::Features {
   }
 
   template class HalfKP<Side::kFriend>;
+  template class HalfKP<Side::kEnemy>;
 
 }  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/half_kp.h b/src/nnue/features/half_kp.h
index ee6a8df3..23e8beb6 100644
--- a/src/nnue/features/half_kp.h
+++ b/src/nnue/features/half_kp.h
@@ -33,7 +33,8 @@ namespace Eval::NNUE::Features {
 
    public:
     // Feature name
-    static constexpr const char* kName = "HalfKP(Friend)";
+    static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
+        "HalfKP(Friend)" : "HalfKP(Enemy)";
     // Hash value embedded in the evaluation file
     static constexpr std::uint32_t kHashValue =
         0x5D69D5B9u ^ (AssociatedKing == Side::kFriend);
@@ -43,7 +44,9 @@ namespace Eval::NNUE::Features {
     // Maximum number of simultaneously active features
     static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
     // Trigger for full calculation instead of difference calculation
-    static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kFriendKingMoved;
+    static constexpr TriggerEvent kRefreshTrigger =
+        (AssociatedKing == Side::kFriend) ?
+        TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
 
     // Get a list of indices for active features
     static void AppendActiveIndices(const Position& pos, Color perspective,
diff --git a/src/nnue/features/half_relative_kp.cpp b/src/nnue/features/half_relative_kp.cpp
index 8a61bada..6b456a1f 100644
--- a/src/nnue/features/half_relative_kp.cpp
+++ b/src/nnue/features/half_relative_kp.cpp
@@ -39,7 +39,7 @@ inline IndexType HalfRelativeKP<AssociatedKing>::MakeIndex(
 template <Side AssociatedKing>
 void HalfRelativeKP<AssociatedKing>::AppendActiveIndices(
     const Position& pos, Color perspective, IndexList* active) {
-  Square ksq = orient(perspective, pos.square<KING>(perspective));
+  Square ksq = orient(perspective, pos.square<KING>(AssociatedKing == Side::kFriend ? perspective : ~perspective));
   Bitboard bb = pos.pieces() & ~pos.pieces(KING);
   while (bb) {
     Square s = pop_lsb(&bb);
@@ -52,7 +52,7 @@ template <Side AssociatedKing>
 void HalfRelativeKP<AssociatedKing>::AppendChangedIndices(
     const Position& pos, Color perspective,
     IndexList* removed, IndexList* added) {
-  Square ksq = orient(perspective, pos.square<KING>(perspective));
+  Square ksq = orient(perspective, pos.square<KING>(AssociatedKing == Side::kFriend ? perspective : ~perspective));
   const auto& dp = pos.state()->dirtyPiece;
   for (int i = 0; i < dp.dirty_num; ++i) {
     Piece pc = dp.piece[i];
diff --git a/src/nnue/nnue_architecture.h b/src/nnue/nnue_architecture.h
index c395d515..91cdc4bd 100644
--- a/src/nnue/nnue_architecture.h
+++ b/src/nnue/nnue_architecture.h
@@ -22,7 +22,7 @@
 #define NNUE_ARCHITECTURE_H_INCLUDED
 
 // Defines the network structure
-#include "architectures/halfkp-cr-ep_256x2-32-32.h"
+#include "architectures/halfkp_256x2-32-32.h"
 
 namespace Eval::NNUE {
 
diff --git a/src/position.cpp b/src/position.cpp
index 52c47f66..5be655be 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -1013,6 +1013,7 @@ void Position::do_null_move(StateInfo& newSt) {
   {
       st->key ^= Zobrist::enpassant[file_of(st->epSquare)];
       st->epSquare = SQ_NONE;
+      st->accumulator.computed_accumulation = false;
   }
 
   st->key ^= Zobrist::side;
diff --git a/src/syzygy/tbprobe.h b/src/syzygy/tbprobe.h
index 5f97c746..b998989b 100644
--- a/src/syzygy/tbprobe.h
+++ b/src/syzygy/tbprobe.h
@@ -25,8 +25,6 @@
 
 namespace Tablebases {
 
-extern int MaxCardinality;
-
 enum WDLScore {
     WDLLoss        = -2, // Loss
     WDLBlessedLoss = -1, // Loss, but draw under 50-move rule
@@ -45,6 +43,8 @@ enum ProbeState {
     ZEROING_BEST_MOVE =  2  // Best move zeroes DTZ (capture or pawn move)
 };
 
+extern int MaxCardinality;
+
 void init(const std::string& paths);
 WDLScore probe_wdl(Position& pos, ProbeState* result);
 int probe_dtz(Position& pos, ProbeState* result);

From 96a31807705eefe85b5be19322a7ff0ba5588f5f Mon Sep 17 00:00:00 2001
From: noobpwnftw <noobpwnftw@users.noreply.github.com>
Date: Sun, 27 Sep 2020 02:17:30 +0800
Subject: [PATCH 306/583] Update instrumented_learn.sh

Fix typo.
---
 tests/instrumented_learn.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/instrumented_learn.sh b/tests/instrumented_learn.sh
index ce1fc429..44c5d7fa 100755
--- a/tests/instrumented_learn.sh
+++ b/tests/instrumented_learn.sh
@@ -104,7 +104,7 @@ cat << EOF > gensfen02.exp
  send "setoption name Threads value $threads\n"
  send "setoption name Use NNUE value true\n"
  send "isready\n"
- send "gensfen depth 4 loop 50 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name validation_data/valdidation_data.bin sfen_format bin\n"
+ send "gensfen depth 4 loop 50 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name validation_data/validation_data.bin sfen_format bin\n"
  expect "gensfen finished."
  send "gensfen depth 4 loop 50 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name validation_data/validation_data.binpack sfen_format binpack\n"
  expect "gensfen finished."

From 9d4bf4fe0c7cb2b3e207bf50a28fed958c6ffa27 Mon Sep 17 00:00:00 2001
From: noobpwnftw <guo.bojun@gmail.com>
Date: Sun, 27 Sep 2020 02:28:28 +0800
Subject: [PATCH 307/583] Optimize accumulators for null move.

---
 src/position.cpp | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/position.cpp b/src/position.cpp
index 5be655be..4e47f772 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -999,21 +999,20 @@ void Position::do_null_move(StateInfo& newSt) {
   assert(!checkers());
   assert(&newSt != st);
 
-  if (Eval::useNNUE != Eval::UseNNUEMode::False)
-  {
-      std::memcpy(&newSt, st, sizeof(StateInfo));
-  }
-  else
-      std::memcpy(&newSt, st, offsetof(StateInfo, accumulator));
+  std::memcpy(&newSt, st, offsetof(StateInfo, accumulator));
 
   newSt.previous = st;
   st = &newSt;
 
+  // Used by NNUE
+  st->accumulator.computed_accumulation = false;
+  auto& dp = st->dirtyPiece;
+  dp.dirty_num = 0;
+
   if (st->epSquare != SQ_NONE)
   {
       st->key ^= Zobrist::enpassant[file_of(st->epSquare)];
       st->epSquare = SQ_NONE;
-      st->accumulator.computed_accumulation = false;
   }
 
   st->key ^= Zobrist::side;

From b44d539c945d16508bafba375bd4d98c19ac1624 Mon Sep 17 00:00:00 2001
From: noobpwnftw <guo.bojun@gmail.com>
Date: Sun, 27 Sep 2020 23:13:13 +0800
Subject: [PATCH 308/583] Fix a bug that LR is not correctly scaled when
 initial LR is not 1.0

---
 src/learn/learn.cpp | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 5320aaf8..80de6a57 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -741,7 +741,6 @@ namespace Learner
             learn_sum_entropy_win = 0.0;
             learn_sum_entropy = 0.0;
 
-            newbob_scale = 1.0;
             newbob_decay = 1.0;
             newbob_num_trials = 2;
             best_loss = std::numeric_limits<double>::infinity();
@@ -795,7 +794,6 @@ namespace Learner
         atomic<double> learn_sum_entropy;
 
         shared_timed_mutex nn_mutex;
-        double newbob_scale;
         double newbob_decay;
         int newbob_num_trials;
         double best_loss;
@@ -1309,12 +1307,11 @@ namespace Learner
                     if (--trials > 0 && !is_final)
                     {
                         cout
-                            << "reducing learning rate from " << newbob_scale
-                            << " to " << (newbob_scale * newbob_decay)
+                            << "reducing learning rate from " << global_learning_rate
+                            << " to " << (global_learning_rate * newbob_decay)
                             << " (" << trials << " more trials)" << endl;
 
-                        newbob_scale *= newbob_decay;
-                        global_learning_rate = newbob_scale;
+                        global_learning_rate *= newbob_decay;
                     }
                 }
 
@@ -1956,7 +1953,6 @@ namespace Learner
         learn_think.sr.no_shuffle = no_shuffle;
         learn_think.reduction_gameply = reduction_gameply;
 
-        learn_think.newbob_scale = 1.0;
         learn_think.newbob_decay = newbob_decay;
         learn_think.newbob_num_trials = newbob_num_trials;
 

From d865159bd6e30b7c3b284286e3c8a2ce2cc21f8d Mon Sep 17 00:00:00 2001
From: noobpwnftw <guo.bojun@gmail.com>
Date: Tue, 29 Sep 2020 17:30:08 +0800
Subject: [PATCH 309/583] Fix variable initialization in test commands

---
 src/nnue/nnue_test_command.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/nnue/nnue_test_command.cpp b/src/nnue/nnue_test_command.cpp
index 5f0776ef..f6f05c2e 100644
--- a/src/nnue/nnue_test_command.cpp
+++ b/src/nnue/nnue_test_command.cpp
@@ -56,7 +56,7 @@ void TestFeatures(Position& pos) {
   auto update_index_sets = [&](const Position& position, auto* index_sets) {
     for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
       Features::IndexList removed_indices[2], added_indices[2];
-      bool reset[2];
+      bool reset[2] = { false, false };
       RawFeatures::AppendChangedIndices(position, kRefreshTriggers[i],
                                         removed_indices, added_indices, reset);
       for (const auto perspective : Colors) {

From f848d67341afb078df3f8de8095e07204bc3e044 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Wed, 30 Sep 2020 20:18:15 +0200
Subject: [PATCH 310/583] Use fair scheduling of threads under valgrind

fixes some rare case where the master search thread makes no progress,
observed in CI.
---
 tests/instrumented.sh       | 2 +-
 tests/instrumented_learn.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/instrumented.sh b/tests/instrumented.sh
index 03ded74a..03e9c9de 100755
--- a/tests/instrumented.sh
+++ b/tests/instrumented.sh
@@ -20,7 +20,7 @@ case $1 in
   --valgrind-thread)
     echo "valgrind-thread testing started"
     prefix=''
-    exeprefix='valgrind --error-exitcode=42'
+    exeprefix='valgrind --fair-sched=try --error-exitcode=42'
     postfix='1>/dev/null'
     threads="2"
   ;;
diff --git a/tests/instrumented_learn.sh b/tests/instrumented_learn.sh
index 44c5d7fa..267a3bb6 100755
--- a/tests/instrumented_learn.sh
+++ b/tests/instrumented_learn.sh
@@ -20,7 +20,7 @@ case $1 in
   --valgrind-thread)
     echo "valgrind-thread testing started"
     prefix=''
-    exeprefix='valgrind --error-exitcode=42'
+    exeprefix='valgrind --fair-sched=try --error-exitcode=42'
     postfix='1>/dev/null'
     threads="2"
   ;;

From 6f7a2287079682e5710c10106dd60e3c76abcc3e Mon Sep 17 00:00:00 2001
From: noobpwnftw <noobpwnftw@users.noreply.github.com>
Date: Thu, 1 Oct 2020 14:58:53 +0800
Subject: [PATCH 311/583] Minor cleanups

Remove unused code and magic numbers
---
 src/learn/gensfen.cpp | 58 +------------------------------------------
 1 file changed, 1 insertion(+), 57 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 7e931726..6fc59be9 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -387,12 +387,6 @@ namespace Learner
             int ply,
             int& random_move_c);
 
-        Value evaluate_leaf(
-            Position& pos,
-            std::vector<StateInfo, AlignedAllocator<StateInfo>>& states,
-            int ply,
-            vector<Move>& pv);
-
         // Min and max depths for search during gensfen
         int search_depth_min;
         int search_depth_max;
@@ -732,56 +726,6 @@ namespace Learner
         return random_move_flag;
     }
 
-    Value MultiThinkGenSfen::evaluate_leaf(
-        Position& pos,
-        std::vector<StateInfo, AlignedAllocator<StateInfo>>& states,
-        int ply,
-        vector<Move>& pv)
-    {
-        auto rootColor = pos.side_to_move();
-
-        for (auto m : pv)
-        {
-            // There should be no illegal move. This is as a debugging precaution.
-            if (!pos.pseudo_legal(m) || !pos.legal(m))
-            {
-                cout << "Error! : " << pos.fen() << m << endl;
-            }
-
-            pos.do_move(m, states[ply++]);
-        }
-
-        // Reach leaf
-        Value v;
-        if (pos.checkers())
-        {
-            // Sometime a king is checked.  An example is a case that a checkmate is
-            // found in the search.  If Eval::evaluate() is called whne a king is
-            // checked, classic eval crashes by an assertion. To avoid crashes, return
-            // VALUE_NONE and let the caller assign a value to the position.
-            v = VALUE_NONE;
-        }
-        else
-        {
-            v = Eval::evaluate(pos);
-
-            // evaluate() returns the evaluation value on the turn side, so
-            // If it's a turn different from root_color, you must invert v and return it.
-            if (rootColor != pos.side_to_move())
-            {
-                v = -v;
-            }
-        }
-
-        // Rewind the pv moves.
-        for (auto it = pv.rbegin(); it != pv.rend(); ++it)
-        {
-            pos.undo_move(*it);
-        }
-
-        return v;
-    }
-
     // thread_id = 0..Threads.size()-1
     void MultiThinkGenSfen::thread_worker(size_t thread_id)
     {
@@ -853,7 +797,7 @@ namespace Learner
                 if (abs(search_value) >= eval_limit)
                 {
                     resign_counter++;
-                    if ((should_resign && resign_counter >= 4) || abs(search_value) >= 10000) {
+                    if ((should_resign && resign_counter >= 4) || abs(search_value) >= VALUE_KNOWN_WIN) {
                         flush_psv((search_value >= eval_limit) ? 1 : -1);
                         break;
                     }

From 91cb4a6770fee0f8e586c3df5fd31f0f22dc7018 Mon Sep 17 00:00:00 2001
From: noobpwnftw <guo.bojun@gmail.com>
Date: Sat, 3 Oct 2020 15:35:54 +0800
Subject: [PATCH 312/583] Skip eval dampening in Use NNUE = pure case

---
 src/evaluate.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 607ff7eb..b3894fe8 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -996,6 +996,11 @@ Value Eval::evaluate(const Position& pos) {
 
   if (Eval::useNNUE == UseNNUEMode::Pure) {
       v = NNUE::evaluate(pos);
+
+      // Guarantee evaluation does not hit the tablebase range
+      v = std::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1);
+
+      return v;
   }
   else if (Eval::useNNUE == UseNNUEMode::False)
       v = Evaluation<NO_TRACE>(pos).value();

From 31f9d66f120f499f20b859a1e143fca0560b88a6 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 5 Oct 2020 20:33:47 +0200
Subject: [PATCH 313/583] Initial documentation for learn, gensfen, convert,
 and binpack.

---
 src/docs/binpack.md | 42 +++++++++++++++++++++
 src/docs/convert.md | 15 ++++++++
 src/docs/gensfen.md | 57 ++++++++++++++++++++++++++++
 src/docs/learn.md   | 92 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 206 insertions(+)
 create mode 100644 src/docs/binpack.md
 create mode 100644 src/docs/convert.md
 create mode 100644 src/docs/gensfen.md
 create mode 100644 src/docs/learn.md

diff --git a/src/docs/binpack.md b/src/docs/binpack.md
new file mode 100644
index 00000000..1940a5dc
--- /dev/null
+++ b/src/docs/binpack.md
@@ -0,0 +1,42 @@
+# Binpack
+
+Binpack is a binary training data storage format designed to take advantage of position chains differing by a single move. Therefore it is very good at compactly storing data generated from real games (as opposed to random positions for example sourced from an opening book).
+
+It is currently implemented through a single header library in `extra/nnue_data_binpack_format.h`.
+
+Below follows a rough description of the format in a BNF-like notation.
+
+```
+[[nodiscard]] std::uint16_t signedToUnsigned(std::int16_t a) {
+    std::uint16_t r;
+    std::memcpy(&r, &a, sizeof(std::uint16_t));
+    if (r & 0x8000) r ^= 0x7FFF; // flip value bits if negative
+    r = (r << 1) | (r >> 15); // store sign bit at bit 0
+    return r;
+}
+
+file := <block>*
+block := BINP<chain>*
+chain := <stem><movetext>
+stem := <pos><move><score><ply_and_result><rule50> (32 bytes)
+pos := https://github.com/Sopel97/nnue_data_compress/blob/master/src/chess/Position.h#L1166 (24 bytes)
+move := https://github.com/Sopel97/nnue_data_compress/blob/master/src/chess/Chess.h#L1044 (2 bytes)
+score := signedToUnsigned(score) (2 bytes, big endian)
+ply_and_result := ply bitwise_or (signedToUnsigned(result) << 14) (2 bytes, big endian)
+rule50 := rule_50_counter (2 bytes, big endian)
+    // this is a small defect from old version,
+    I didn't want to break backwards compatibility. Effectively means that there's
+    one byte left for something else in the future because rule50 always fits in one byte.
+
+movetext := <count><move_and_score>*
+count := number of plies in the movetext (2 bytes, big endian). Can be 0.
+move_and_score := <encoded_move><encoded_score> (~2 bytes)
+encoded_move := oof this one is complicated to explain.
+    https://github.com/Sopel97/nnue_data_compress/blob/master/src/compress_file.cpp#L827.
+    https://github.com/Sopel97/chess_pos_db/blob/master/docs/bcgn/variable_length.md
+
+encoded_score := https://en.wikipedia.org/wiki/Variable-width_encoding
+    with block size of 4 bits + 1 bit for extension bit.
+    Encoded value is signedToUnsigned(-prev_score - current_score)
+    (scores are always seen from the perspective of side to move in <pos>, that's why the '-' before prev_score)
+```
\ No newline at end of file
diff --git a/src/docs/convert.md b/src/docs/convert.md
new file mode 100644
index 00000000..05d230b2
--- /dev/null
+++ b/src/docs/convert.md
@@ -0,0 +1,15 @@
+# Convert
+
+`convert` allows conversion of training between any of `.plain`, `.bin`, and `.binpack`.
+
+As all commands in stockfish `convert` can be invoked either from command line (as `stockfish.exe convert ...`) or in the interactive prompt.
+
+The syntax of this command is as follows:
+```
+convert from_path to_path [append]
+```
+
+`from_path` is the path to the file to convert from. The type of the data is deduced based on its extension (one of `.plain`, `.bin`, `.binpack`).
+`to_path` is the path to an output file. The type of the data is deduced from its extension. If the file does not exist it is created.
+
+Last argument is optional. If not specified then they output file will be truncated prior to any writes. If the last argument is `append` then the converted training data will be appended to the end of the output file.
\ No newline at end of file
diff --git a/src/docs/gensfen.md b/src/docs/gensfen.md
new file mode 100644
index 00000000..c3e0a9c2
--- /dev/null
+++ b/src/docs/gensfen.md
@@ -0,0 +1,57 @@
+# Gensfen
+
+`gensfen` command allows generation of training data from self-play in a manner that suits training better than traditional games. It introduces random moves to diversify openings, allows reduced pruning, disabling of TT for less interference between searches, and fixed depth evaluation.
+
+As all commands in stockfish `gensfen` can be invoked either from command line (as `stockfish.exe gensfen ...`, but this is not recommended because it's not possible to specify UCI options before `gensfen` executes) or in the interactive prompt.
+
+`gensfen` takes named parameters in form `gensfen param_1_name param_1_value param_2_name param_2_value ...`.
+
+Currently the following options are available:
+
+`depth` - minimum depth of evaluation of each position. Default: 3.
+
+`depth2` - maximum depth of evaluation of each position. If not specified then the same as `depth`.
+
+`nodes` - the number of nodes to use for evaluation of each position. This number is multiplied by the number of PVs of the current search. This does NOT override the `depth` and `depth2` options. If specified then whichever of depth or nodes limit is reached first applies.
+
+`loop` - the number of training data entries to generate. 1 entry == 1 position.
+
+`output_file_name` - the name of the file to output to. If the extension is not present or doesn't match the selected training data format the right extension will be appened.
+
+`eval_limit` - evaluations with higher absolute value than this will not be written and will terminate a self-play game. Should not exceed 10000 which VALUE_KNOWN_WIN, but is only hardcapped at mate in 2 (\~30000).
+
+`random_move_minply` - the minimal ply at which a random move may be executed instead of a move chosen by search
+
+`random_move_maxply` - the maximal ply at which a random move may be executed instead of a move chosen by search
+
+`random_move_count` - maximum number of random moves in a single self-play game
+
+`random_move_like_apery` - either 0 or 1. If 1 then random king moves will be followed by a random king move from the opponent whenever possible with 50% probability.
+
+`random_multi_pv` - the number of PVs used for determining the random move. If not specified then a truly random move will be chosen. If specified then a multiPV search will be performed the random move will be one of the moves chosen by the search.
+
+`random_multi_pv_diff` - Makes the multiPV random move selection consider only moves that are at most `random_multi_pv_diff` worse than the next best move. Default: 30000 (all multiPV moves).
+
+`random_multi_pv_depth` - the depth to use for multiPV search for random move. Defaults to `depth2`.
+
+`write_minply` - minimum ply for which the training data entry will be emitted.
+
+`write_maxply` - maximum ply for which the training data entry will be emitted.
+
+`save_every` - the number of training data entries per file. If not specified then there will be always one file. If specified there may be more than one file generated (each having at most `save_every` training data entries) and each file will have a unique number attached.
+
+`random_file_name` - if specified then the output filename will be chosen randomly. Overrides `output_file_name`.
+
+`use_draw_in_training_data_generation` - either 0 or 1. If 1 then training data from drawn games will be emitted too. Default: 0.
+
+`write_out_draw_game_in_training_data_generation` - deprecated, alias for `use_draw_in_training_data_generation`
+
+`detect_draw_by_consecutive_low_score` - either 0 or 1. If 1 then drawn games will be adjudicated when the score remains 0 for at least 8 plies after ply 80. Default: 0.
+
+`use_game_draw_adjudication` - deprecated, alias for `detect_draw_by_consecutive_low_score`
+
+`detect_draw_by_insufficient_mating_material` - either 0 or 1. If 1 then position with insufficient material will be adjudicated as draws. Default: 0.
+
+`sfen_format` - format of the training data to use. Either `bin` or `binpack`. Default: `bin`.
+
+`seed` - seed for the PRNG. Can be either a number or a string. If it's a string then its hash will be used. If not specified then the current time will be used.
diff --git a/src/docs/learn.md b/src/docs/learn.md
new file mode 100644
index 00000000..d1347db1
--- /dev/null
+++ b/src/docs/learn.md
@@ -0,0 +1,92 @@
+# Learn
+
+`learn` command allows allows training a network from training data.
+
+As all commands in stockfish `learn` can be invoked either from command line (as `stockfish.exe learn ...`, but this is not recommended because it's not possible to specify UCI options before `learn` executes) or in the interactive prompt.
+
+`learn` takes named parameters in form `learn param_1_name param_1_value param_2_name param_2_value ...`. Unrecognized parameters form a list of paths to training data files.
+
+Currently the following options are available:
+
+`bat` - the size of a minibatch in multiples of 10000. The number of positions inbetween weights updates. Default: 1000 (meaning mini batch size of 1000000).
+
+`targetdir` - path to the direction from which training data will be read. All files in this directory are read sequentially. If not specified then only the list of files from positional arguments will be used. If specified then files from the given directory will be used after the explicitly specified files.
+
+`loop` - the number of times to loop over all training data.
+
+`basedir` - the base directory for the paths. Default: "" (current directory)
+
+`batchsize` - same as `bat` but doesn't scale by 10000
+
+`lr` - initial learning rate. Default: 1.
+
+`use_draw_games_in_training` - either 0 or 1. If 1 then draws will be used in training too. Default: 0.
+
+`use_draw_in_training` - deprecated, alias for `use_draw_games_in_training`
+
+`use_draw_games_in_validation` - either 0 or 1. If 1 then draws will be used in validation too. Default: 0.
+
+`use_draw_in_validation` - deprecated, alias for `use_draw_games_in_validation`
+
+`skip_duplicated_positions_in_training` - either 0 or 1. If 1 then a small hashtable will be used to try to eliminate duplicated position from training. Default: 0.
+
+`use_hash_in_training` - deprecated, alias for `skip_duplicated_positions_in_training`
+
+`winning_probability_coefficient` - some magic value for winning probability. If you need to read this then don't touch it. Default: 1.0 / PawnValueEg / 4.0 * std::log(10.0)
+
+`use_wdl` - either 0 or 1. If 1 then the evaluations will be converted to win/draw/loss percentages prior to learning on them. (Slightly changes the gradient because eval has a different derivative than wdl). Default: 0.
+
+`lambda` - value in range [0..1]. 1 means that only evaluation is used for learning, 0 means that only game result is used. Values inbetween result in interpolation between the two contributions. See `lambda_limit` for when this is applied. Default: 0.33.
+
+`lambda2` - value in range [0..1]. 1 means that only evaluation is used for learning, 0 means that only game result is used. Values inbetween result in interpolation between the two contributions. See `lambda_limit` for when this is applied. Default: 0.33.
+
+`lambda_limit` - the maximum absolute score value for which `lambda` is used as opposed to `lambda2`. For positions with absolute evaluation higher than `lambda_limit` `lambda2` will be used. Default: 32000 (so always `lambda`).
+
+`reduction_gameply` - the minimum ply after which positions won't be skipped. Positions at plies below this value are skipped with a probability that lessens linearly with the ply (reaching 0 at `reduction_gameply`). Default: 1.
+
+`eval_limit` - positions with absolute evaluation higher than this will be skipped. Default: 32000 (nothing is skipped).
+
+`save_only_once` - this is a modifier not a parameter, no value follows it. If specified then there will be only one network file generated.
+
+`no_shuffle` - this is a modifier not a parameter, no value follows it. If specified then data within a batch won't be shuffled.
+
+`nn_batch_size` - batch size used for learning. Default: 1000.
+
+`newbob_decay` - learning rate will be multiplied by this factor every time a net is rejected (so in other words it controls LR drops). Default: 1.0 (no LR drops)
+
+`newbob_num_trials` - determines after how many subsequent rejected nets the training process will be terminated. Default: 2.
+
+`nn_options` - if you're reading this you don't use it. It passes messages directly to the network evaluation. I don't know what it can do either.
+
+`eval_save_interval` - every `eval_save_interval` positions the network will be saved and either accepted or rejected (in which case an LR drop follows). Default: 1000000000 (1B). (generally people use values in 10M-100M range)
+
+`loss_output_interval` - every `loss_output_interval` fittness statistics are displayed. Default: `batchsize`
+
+`validation_set_file_name` - path to the file with training data to be used for validation (loss computation and move accuracy)
+
+`seed` - seed for the PRNG. Can be either a number or a string. If it's a string then its hash will be used. If not specified then the current time will be used.
+
+## Legacy subcommands and parameters
+
+### Convert
+
+`convert_plain`
+`convert_bin`
+`interpolate_eval`
+`check_invalid_fen`
+`check_illegal_move`
+`convert_bin_from_pgn-extract`
+`pgn_eval_side_to_move`
+`convert_no_eval_fens_as_score_zero`
+`src_score_min_value`
+`src_score_max_value`
+`dest_score_min_value`
+`dest_score_max_value`
+
+### Shuffle
+
+`shuffle`
+`buffer_size`
+`shuffleq`
+`shufflem`
+`output_file_name`
\ No newline at end of file

From 80cbc3ffee9f3c6d048107b437cdddbc3d69b34a Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Tue, 6 Oct 2020 10:55:42 +0200
Subject: [PATCH 314/583] Fix grammar and spelling. Add recommendations for UCI
 options.

---
 src/docs/convert.md |  4 ++--
 src/docs/gensfen.md | 14 +++++++++-----
 src/docs/learn.md   | 16 +++++++++++-----
 3 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/src/docs/convert.md b/src/docs/convert.md
index 05d230b2..2e07ec52 100644
--- a/src/docs/convert.md
+++ b/src/docs/convert.md
@@ -1,6 +1,6 @@
 # Convert
 
-`convert` allows conversion of training between any of `.plain`, `.bin`, and `.binpack`.
+`convert` allows conversion of training data between any of `.plain`, `.bin`, and `.binpack`.
 
 As all commands in stockfish `convert` can be invoked either from command line (as `stockfish.exe convert ...`) or in the interactive prompt.
 
@@ -12,4 +12,4 @@ convert from_path to_path [append]
 `from_path` is the path to the file to convert from. The type of the data is deduced based on its extension (one of `.plain`, `.bin`, `.binpack`).
 `to_path` is the path to an output file. The type of the data is deduced from its extension. If the file does not exist it is created.
 
-Last argument is optional. If not specified then they output file will be truncated prior to any writes. If the last argument is `append` then the converted training data will be appended to the end of the output file.
\ No newline at end of file
+The last argument is optional. If not specified then the output file will be truncated prior to any writes. If the last argument is `append` then the converted training data will be appended to the end of the output file.
\ No newline at end of file
diff --git a/src/docs/gensfen.md b/src/docs/gensfen.md
index c3e0a9c2..35c08582 100644
--- a/src/docs/gensfen.md
+++ b/src/docs/gensfen.md
@@ -1,10 +1,14 @@
 # Gensfen
 
-`gensfen` command allows generation of training data from self-play in a manner that suits training better than traditional games. It introduces random moves to diversify openings, allows reduced pruning, disabling of TT for less interference between searches, and fixed depth evaluation.
+`gensfen` command allows generation of training data from self-play in a manner that suits training better than traditional games. It introduces random moves to diversify openings, and fixed depth evaluation.
 
 As all commands in stockfish `gensfen` can be invoked either from command line (as `stockfish.exe gensfen ...`, but this is not recommended because it's not possible to specify UCI options before `gensfen` executes) or in the interactive prompt.
 
-`gensfen` takes named parameters in form `gensfen param_1_name param_1_value param_2_name param_2_value ...`.
+It is recommended to set the `PruneAtShallowDepth` UCI option to `false` as it will increase the quality of fixed depth searches.
+
+It is recommended to keep the `EnableTranspositionTable` UCI option at the default `true` value as it will make the generation process faster without noticably harming the uniformity of the data.
+
+`gensfen` takes named parameters in the form of `gensfen param_1_name param_1_value param_2_name param_2_value ...`.
 
 Currently the following options are available:
 
@@ -18,7 +22,7 @@ Currently the following options are available:
 
 `output_file_name` - the name of the file to output to. If the extension is not present or doesn't match the selected training data format the right extension will be appened.
 
-`eval_limit` - evaluations with higher absolute value than this will not be written and will terminate a self-play game. Should not exceed 10000 which VALUE_KNOWN_WIN, but is only hardcapped at mate in 2 (\~30000).
+`eval_limit` - evaluations with higher absolute value than this will not be written and will terminate a self-play game. Should not exceed 10000 which is VALUE_KNOWN_WIN, but is only hardcapped at mate in 2 (\~30000).
 
 `random_move_minply` - the minimal ply at which a random move may be executed instead of a move chosen by search
 
@@ -42,9 +46,9 @@ Currently the following options are available:
 
 `random_file_name` - if specified then the output filename will be chosen randomly. Overrides `output_file_name`.
 
-`use_draw_in_training_data_generation` - either 0 or 1. If 1 then training data from drawn games will be emitted too. Default: 0.
+`write_out_draw_game_in_training_data_generation` - either 0 or 1. If 1 then training data from drawn games will be emitted too. Default: 0.
 
-`write_out_draw_game_in_training_data_generation` - deprecated, alias for `use_draw_in_training_data_generation`
+`use_draw_in_training_data_generation` - deprecated, alias for `write_out_draw_game_in_training_data_generation`
 
 `detect_draw_by_consecutive_low_score` - either 0 or 1. If 1 then drawn games will be adjudicated when the score remains 0 for at least 8 plies after ply 80. Default: 0.
 
diff --git a/src/docs/learn.md b/src/docs/learn.md
index d1347db1..eab33607 100644
--- a/src/docs/learn.md
+++ b/src/docs/learn.md
@@ -1,14 +1,20 @@
 # Learn
 
-`learn` command allows allows training a network from training data.
+`learn` command allows training a network from training data.
 
 As all commands in stockfish `learn` can be invoked either from command line (as `stockfish.exe learn ...`, but this is not recommended because it's not possible to specify UCI options before `learn` executes) or in the interactive prompt.
 
-`learn` takes named parameters in form `learn param_1_name param_1_value param_2_name param_2_value ...`. Unrecognized parameters form a list of paths to training data files.
+`learn` takes named parameters in the form of `learn param_1_name param_1_value param_2_name param_2_value ...`. Unrecognized parameters form a list of paths to training data files.
+
+It is recommended to set the `EnableTranspositionTable` UCI option to `false` to reduce the interference between qsearches which are used to provide shallow evaluation. Using TT may cause the shallow evaluation to diverge from the real evaluation of the net, hiding imperfections.
+
+It is recommended to set the `PruneAtShallowDepth` UCI option to `false` as it will provide more accurate shallow evaluation.
+
+It is **required** to set the `Use NNUE` UCI option to `pure` as otherwise the function being optimized will not always match the function being probed, in which case not much can be learned.
 
 Currently the following options are available:
 
-`bat` - the size of a minibatch in multiples of 10000. The number of positions inbetween weights updates. Default: 1000 (meaning mini batch size of 1000000).
+`bat` - the size of a batch in multiples of 10000. This determines how many entries are read and shuffled at once during training. Default: 1000 (meaning batch size of 1000000).
 
 `targetdir` - path to the direction from which training data will be read. All files in this directory are read sequentially. If not specified then only the list of files from positional arguments will be used. If specified then files from the given directory will be used after the explicitly specified files.
 
@@ -50,7 +56,7 @@ Currently the following options are available:
 
 `no_shuffle` - this is a modifier not a parameter, no value follows it. If specified then data within a batch won't be shuffled.
 
-`nn_batch_size` - batch size used for learning. Default: 1000.
+`nn_batch_size` - minibatch size used for learning. Should be smaller than batch size. Default: 1000.
 
 `newbob_decay` - learning rate will be multiplied by this factor every time a net is rejected (so in other words it controls LR drops). Default: 1.0 (no LR drops)
 
@@ -60,7 +66,7 @@ Currently the following options are available:
 
 `eval_save_interval` - every `eval_save_interval` positions the network will be saved and either accepted or rejected (in which case an LR drop follows). Default: 1000000000 (1B). (generally people use values in 10M-100M range)
 
-`loss_output_interval` - every `loss_output_interval` fittness statistics are displayed. Default: `batchsize`
+`loss_output_interval` - every `loss_output_interval` fitness statistics are displayed. Default: `batchsize`
 
 `validation_set_file_name` - path to the file with training data to be used for validation (loss computation and move accuracy)
 

From 5fa28b12fa4dcece84db555ca19d15308b2f1e1a Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 7 Oct 2020 11:20:53 +0200
Subject: [PATCH 315/583] Allow setting UCI options programmatically.

---
 src/uci.cpp | 19 ++++++++++++-------
 src/uci.h   |  1 +
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/uci.cpp b/src/uci.cpp
index a123bbc0..166e437c 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -109,7 +109,7 @@ namespace {
   // setoption() is called when engine receives the "setoption" UCI command. The
   // function updates the UCI option ("name") to the given value ("value").
 
-  void setoption(istringstream& is) {
+  void setoption_from_stream(istringstream& is) {
 
     string token, name, value;
 
@@ -123,10 +123,7 @@ namespace {
     while (is >> token)
         value += (value.empty() ? "" : " ") + token;
 
-    if (Options.count(name))
-        Options[name] = value;
-    else
-        sync_cout << "No such option: " << name << sync_endl;
+    UCI::setoption(name, value);
   }
 
 
@@ -195,7 +192,7 @@ namespace {
             else
                trace_eval(pos);
         }
-        else if (token == "setoption")  setoption(is);
+        else if (token == "setoption")  setoption_from_stream(is);
         else if (token == "position")   position(pos, is, states);
         else if (token == "ucinewgame") { Search::clear(); elapsed = now(); } // Search::clear() may take some while
     }
@@ -212,6 +209,14 @@ namespace {
 
 } // namespace
 
+void UCI::setoption(const std::string& name, const std::string& value)
+{
+    if (Options.count(name))
+        Options[name] = value;
+    else
+        sync_cout << "No such option: " << name << sync_endl;
+}
+
 // The win rate model returns the probability (per mille) of winning given an eval
 // and a game-ply. The model fits rather accurately the LTC fishtest statistics.
 int UCI::win_rate_model(Value v, int ply) {
@@ -318,7 +323,7 @@ void UCI::loop(int argc, char* argv[]) {
                     << "\n"       << Options
                     << "\nuciok"  << sync_endl;
 
-      else if (token == "setoption")  setoption(is);
+      else if (token == "setoption")  setoption_from_stream(is);
       else if (token == "go")         go(pos, is, states);
       else if (token == "position")   position(pos, is, states);
       else if (token == "ucinewgame") Search::clear();
diff --git a/src/uci.h b/src/uci.h
index 2e0f5c11..192963cb 100644
--- a/src/uci.h
+++ b/src/uci.h
@@ -75,6 +75,7 @@ std::string wdl(Value v, int ply);
 int win_rate_model(Value v, int ply);
 double win_rate_model_double(double v, int ply);
 Move to_move(const Position& pos, std::string& str);
+void setoption(const std::string& name, const std::string& value);
 
 } // namespace UCI
 

From d1c44dca042392b2bf0ceb0c8901d52ca92fa023 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 7 Oct 2020 11:27:52 +0200
Subject: [PATCH 316/583] Switch to set recommended gensfen UCI options

---
 src/docs/gensfen.md   |  2 ++
 src/learn/gensfen.cpp | 10 ++++++++++
 2 files changed, 12 insertions(+)

diff --git a/src/docs/gensfen.md b/src/docs/gensfen.md
index 35c08582..e4263a92 100644
--- a/src/docs/gensfen.md
+++ b/src/docs/gensfen.md
@@ -12,6 +12,8 @@ It is recommended to keep the `EnableTranspositionTable` UCI option at the defau
 
 Currently the following options are available:
 
+`set_recommended_uci_options` - this is a modifier not a parameter, no value follows it. If specified then some UCI options are set to recommended values.
+
 `depth` - minimum depth of evaluation of each position. Default: 3.
 
 `depth2` - maximum depth of evaluation of each position. If not specified then the same as `depth`.
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 6fc59be9..5720236d 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -1014,6 +1014,16 @@ namespace Learner
                 is >> sfen_format;
             else if (token == "seed")
                 is >> seed;
+            else if (token == "set_recommended_uci_options")
+            {
+                UCI::setoption("Contempt", "0");
+                UCI::setoption("Skill Level", "20");
+                UCI::setoption("UCI_Chess960", "false");
+                UCI::setoption("UCI_AnalyseMode", "false");
+                UCI::setoption("UCI_LimitStrength", "false");
+                UCI::setoption("PruneAtShallowDepth", "false");
+                UCI::setoption("EnableTranspositionTable", "true");
+            }
             else
                 cout << "Error! : Illegal token " << token << endl;
         }

From 2e57f3fa222d9dd879662864799c1896f732de11 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 7 Oct 2020 11:30:46 +0200
Subject: [PATCH 317/583] Switch to set recommended learn UCI options

---
 src/docs/learn.md   |  2 ++
 src/learn/learn.cpp | 12 ++++++++++++
 2 files changed, 14 insertions(+)

diff --git a/src/docs/learn.md b/src/docs/learn.md
index eab33607..5cd2e8b5 100644
--- a/src/docs/learn.md
+++ b/src/docs/learn.md
@@ -14,6 +14,8 @@ It is **required** to set the `Use NNUE` UCI option to `pure` as otherwise the f
 
 Currently the following options are available:
 
+`set_recommended_uci_options` - this is a modifier not a parameter, no value follows it. If specified then some UCI options are set to recommended values.
+
 `bat` - the size of a batch in multiples of 10000. This determines how many entries are read and shuffled at once during training. Default: 1000 (meaning batch size of 1000000).
 
 `targetdir` - path to the direction from which training data will be read. All files in this directory are read sequentially. If not specified then only the list of files from positional arguments will be used. If specified then files from the given directory will be used after the explicitly specified files.
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 80de6a57..6bba1dda 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -1745,6 +1745,18 @@ namespace Learner
             else if (option == "dest_score_min_value") is >> dest_score_min_value;
             else if (option == "dest_score_max_value") is >> dest_score_max_value;
             else if (option == "seed") is >> seed;
+            else if (option == "set_recommended_uci_options")
+            {
+                UCI::setoption("Use NNUE", "pure");
+                UCI::setoption("MultiPV", "1");
+                UCI::setoption("Contempt", "0");
+                UCI::setoption("Skill Level", "20");
+                UCI::setoption("UCI_Chess960", "false");
+                UCI::setoption("UCI_AnalyseMode", "false");
+                UCI::setoption("UCI_LimitStrength", "false");
+                UCI::setoption("PruneAtShallowDepth", "false");
+                UCI::setoption("EnableTranspositionTable", "false");
+            }
             // Otherwise, it's a filename.
             else
                 filenames.push_back(option);

From 8830209125bcf9a5aca8eba995c0687ed3c93ab2 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 7 Oct 2020 11:46:26 +0200
Subject: [PATCH 318/583] Change some learn parameter defaults.

---
 src/docs/learn.md   | 16 ++++++++--------
 src/learn/learn.cpp | 14 +++++++-------
 src/learn/learn.h   |  2 +-
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/docs/learn.md b/src/docs/learn.md
index 5cd2e8b5..4c8c3fc1 100644
--- a/src/docs/learn.md
+++ b/src/docs/learn.md
@@ -28,11 +28,11 @@ Currently the following options are available:
 
 `lr` - initial learning rate. Default: 1.
 
-`use_draw_games_in_training` - either 0 or 1. If 1 then draws will be used in training too. Default: 0.
+`use_draw_games_in_training` - either 0 or 1. If 1 then draws will be used in training too. Default: 1.
 
 `use_draw_in_training` - deprecated, alias for `use_draw_games_in_training`
 
-`use_draw_games_in_validation` - either 0 or 1. If 1 then draws will be used in validation too. Default: 0.
+`use_draw_games_in_validation` - either 0 or 1. If 1 then draws will be used in validation too. Default: 1.
 
 `use_draw_in_validation` - deprecated, alias for `use_draw_games_in_validation`
 
@@ -44,9 +44,9 @@ Currently the following options are available:
 
 `use_wdl` - either 0 or 1. If 1 then the evaluations will be converted to win/draw/loss percentages prior to learning on them. (Slightly changes the gradient because eval has a different derivative than wdl). Default: 0.
 
-`lambda` - value in range [0..1]. 1 means that only evaluation is used for learning, 0 means that only game result is used. Values inbetween result in interpolation between the two contributions. See `lambda_limit` for when this is applied. Default: 0.33.
+`lambda` - value in range [0..1]. 1 means that only evaluation is used for learning, 0 means that only game result is used. Values inbetween result in interpolation between the two contributions. See `lambda_limit` for when this is applied. Default: 1.0.
 
-`lambda2` - value in range [0..1]. 1 means that only evaluation is used for learning, 0 means that only game result is used. Values inbetween result in interpolation between the two contributions. See `lambda_limit` for when this is applied. Default: 0.33.
+`lambda2` - value in range [0..1]. 1 means that only evaluation is used for learning, 0 means that only game result is used. Values inbetween result in interpolation between the two contributions. See `lambda_limit` for when this is applied. Default: 1.0.
 
 `lambda_limit` - the maximum absolute score value for which `lambda` is used as opposed to `lambda2`. For positions with absolute evaluation higher than `lambda_limit` `lambda2` will be used. Default: 32000 (so always `lambda`).
 
@@ -60,15 +60,15 @@ Currently the following options are available:
 
 `nn_batch_size` - minibatch size used for learning. Should be smaller than batch size. Default: 1000.
 
-`newbob_decay` - learning rate will be multiplied by this factor every time a net is rejected (so in other words it controls LR drops). Default: 1.0 (no LR drops)
+`newbob_decay` - learning rate will be multiplied by this factor every time a net is rejected (so in other words it controls LR drops). Default: 0.5 (no LR drops)
 
-`newbob_num_trials` - determines after how many subsequent rejected nets the training process will be terminated. Default: 2.
+`newbob_num_trials` - determines after how many subsequent rejected nets the training process will be terminated. Default: 4.
 
 `nn_options` - if you're reading this you don't use it. It passes messages directly to the network evaluation. I don't know what it can do either.
 
-`eval_save_interval` - every `eval_save_interval` positions the network will be saved and either accepted or rejected (in which case an LR drop follows). Default: 1000000000 (1B). (generally people use values in 10M-100M range)
+`eval_save_interval` - every `eval_save_interval` positions the network will be saved and either accepted or rejected (in which case an LR drop follows). Default: 100000000 (100M). (generally people use values in 10M-100M range)
 
-`loss_output_interval` - every `loss_output_interval` fitness statistics are displayed. Default: `batchsize`
+`loss_output_interval` - every `loss_output_interval` fitness statistics are displayed. Default: 1000000 (1M)
 
 `validation_set_file_name` - path to the file with training data to be used for validation (loss computation and move accuracy)
 
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 6bba1dda..c3335e37 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -77,8 +77,8 @@ T operator -= (std::atomic<T>& x, const T rhs) { return x += -rhs; }
 
 namespace Learner
 {
-    static bool use_draw_games_in_training = false;
-    static bool use_draw_games_in_validation = false;
+    static bool use_draw_games_in_training = true;
+    static bool use_draw_games_in_validation = true;
     static bool skip_duplicated_positions_in_training = true;
 
     static double winning_probability_coefficient = 1.0 / PawnValueEg / 4.0 * std::log(10.0);
@@ -1632,8 +1632,8 @@ namespace Learner
         global_learning_rate = 1.0;
 
         // elmo lambda
-        ELMO_LAMBDA = 0.33;
-        ELMO_LAMBDA2 = 0.33;
+        ELMO_LAMBDA = 1.0;
+        ELMO_LAMBDA2 = 1.0;
         ELMO_LAMBDA_LIMIT = 32000;
 
         // if (gamePly <rand(reduction_gameply)) continue;
@@ -1642,12 +1642,12 @@ namespace Learner
         int reduction_gameply = 1;
 
         uint64_t nn_batch_size = 1000;
-        double newbob_decay = 1.0;
-        int newbob_num_trials = 2;
+        double newbob_decay = 0.5;
+        int newbob_num_trials = 4;
         string nn_options;
 
         uint64_t eval_save_interval = LEARN_EVAL_SAVE_INTERVAL;
-        uint64_t loss_output_interval = 0;
+        uint64_t loss_output_interval = 1'000'000;
 
         string validation_set_file_name;
         string seed;
diff --git a/src/learn/learn.h b/src/learn/learn.h
index 4b09f825..3ba75ce3 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -64,7 +64,7 @@ namespace Learner
     // Needless to say, the longer the saving interval, the shorter the learning time.
     // Folder name is incremented for each save like 0/, 1/, 2/...
     // By default, once every 1 billion phases.
-    constexpr std::size_t LEARN_EVAL_SAVE_INTERVAL = 1000000000ULL;
+    constexpr std::size_t LEARN_EVAL_SAVE_INTERVAL = 100'000'000ULL;
 
     // Reduce the output of rmse during learning to 1 for this number of times.
     // rmse calculation is done in one thread, so it takes some time, so reducing the output is effective.

From 3f55b3af42fc569dfae7b6f5bd3d946ba4d5891e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 7 Oct 2020 11:46:34 +0200
Subject: [PATCH 319/583] Change some gensfen parameter defaults.

---
 src/docs/gensfen.md   | 28 ++++++++++++++--------------
 src/learn/gensfen.cpp |  8 ++++----
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/src/docs/gensfen.md b/src/docs/gensfen.md
index e4263a92..ce0f365c 100644
--- a/src/docs/gensfen.md
+++ b/src/docs/gensfen.md
@@ -20,44 +20,44 @@ Currently the following options are available:
 
 `nodes` - the number of nodes to use for evaluation of each position. This number is multiplied by the number of PVs of the current search. This does NOT override the `depth` and `depth2` options. If specified then whichever of depth or nodes limit is reached first applies.
 
-`loop` - the number of training data entries to generate. 1 entry == 1 position.
+`loop` - the number of training data entries to generate. 1 entry == 1 position. Default: 8000000000 (8B).
 
-`output_file_name` - the name of the file to output to. If the extension is not present or doesn't match the selected training data format the right extension will be appened.
+`output_file_name` - the name of the file to output to. If the extension is not present or doesn't match the selected training data format the right extension will be appened. Default: generated_kifu
 
-`eval_limit` - evaluations with higher absolute value than this will not be written and will terminate a self-play game. Should not exceed 10000 which is VALUE_KNOWN_WIN, but is only hardcapped at mate in 2 (\~30000).
+`eval_limit` - evaluations with higher absolute value than this will not be written and will terminate a self-play game. Should not exceed 10000 which is VALUE_KNOWN_WIN, but is only hardcapped at mate in 2 (\~30000). Default: 3000
 
-`random_move_minply` - the minimal ply at which a random move may be executed instead of a move chosen by search
+`random_move_minply` - the minimal ply at which a random move may be executed instead of a move chosen by search. Default: 1.
 
-`random_move_maxply` - the maximal ply at which a random move may be executed instead of a move chosen by search
+`random_move_maxply` - the maximal ply at which a random move may be executed instead of a move chosen by search. Default: 24.
 
-`random_move_count` - maximum number of random moves in a single self-play game
+`random_move_count` - maximum number of random moves in a single self-play game. Default: 5.
 
-`random_move_like_apery` - either 0 or 1. If 1 then random king moves will be followed by a random king move from the opponent whenever possible with 50% probability.
+`random_move_like_apery` - either 0 or 1. If 1 then random king moves will be followed by a random king move from the opponent whenever possible with 50% probability. Default: 0.
 
 `random_multi_pv` - the number of PVs used for determining the random move. If not specified then a truly random move will be chosen. If specified then a multiPV search will be performed the random move will be one of the moves chosen by the search.
 
 `random_multi_pv_diff` - Makes the multiPV random move selection consider only moves that are at most `random_multi_pv_diff` worse than the next best move. Default: 30000 (all multiPV moves).
 
-`random_multi_pv_depth` - the depth to use for multiPV search for random move. Defaults to `depth2`.
+`random_multi_pv_depth` - the depth to use for multiPV search for random move. Default: `depth2`.
 
-`write_minply` - minimum ply for which the training data entry will be emitted.
+`write_minply` - minimum ply for which the training data entry will be emitted. Default: 16.
 
-`write_maxply` - maximum ply for which the training data entry will be emitted.
+`write_maxply` - maximum ply for which the training data entry will be emitted. Default: 400.
 
 `save_every` - the number of training data entries per file. If not specified then there will be always one file. If specified there may be more than one file generated (each having at most `save_every` training data entries) and each file will have a unique number attached.
 
 `random_file_name` - if specified then the output filename will be chosen randomly. Overrides `output_file_name`.
 
-`write_out_draw_game_in_training_data_generation` - either 0 or 1. If 1 then training data from drawn games will be emitted too. Default: 0.
+`write_out_draw_game_in_training_data_generation` - either 0 or 1. If 1 then training data from drawn games will be emitted too. Default: 1.
 
 `use_draw_in_training_data_generation` - deprecated, alias for `write_out_draw_game_in_training_data_generation`
 
-`detect_draw_by_consecutive_low_score` - either 0 or 1. If 1 then drawn games will be adjudicated when the score remains 0 for at least 8 plies after ply 80. Default: 0.
+`detect_draw_by_consecutive_low_score` - either 0 or 1. If 1 then drawn games will be adjudicated when the score remains 0 for at least 8 plies after ply 80. Default: 1.
 
 `use_game_draw_adjudication` - deprecated, alias for `detect_draw_by_consecutive_low_score`
 
-`detect_draw_by_insufficient_mating_material` - either 0 or 1. If 1 then position with insufficient material will be adjudicated as draws. Default: 0.
+`detect_draw_by_insufficient_mating_material` - either 0 or 1. If 1 then position with insufficient material will be adjudicated as draws. Default: 1.
 
-`sfen_format` - format of the training data to use. Either `bin` or `binpack`. Default: `bin`.
+`sfen_format` - format of the training data to use. Either `bin` or `binpack`. Default: `binpack`.
 
 `seed` - seed for the PRNG. Can be either a number or a string. If it's a string then its hash will be used. If not specified then the current time will be used.
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 5720236d..8ceb04e2 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -43,9 +43,9 @@ namespace Learner
         Binpack
     };
 
-    static bool write_out_draw_game_in_training_data_generation = false;
-    static bool detect_draw_by_consecutive_low_score = false;
-    static bool detect_draw_by_insufficient_mating_material = false;
+    static bool write_out_draw_game_in_training_data_generation = true;
+    static bool detect_draw_by_consecutive_low_score = true;
+    static bool detect_draw_by_insufficient_mating_material = true;
 
     static SfenOutputType sfen_output_type = SfenOutputType::Bin;
 
@@ -954,7 +954,7 @@ namespace Learner
         // Add a random number to the end of the file name.
         bool random_file_name = false;
 
-        std::string sfen_format;
+        std::string sfen_format = "binpack";
         std::string seed;
 
         while (true)

From adddf339bba43f0b8210f8ccef376966bcc1ac61 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Wed, 7 Oct 2020 16:07:29 +0200
Subject: [PATCH 320/583] Output sfens/second in the trainer, to track
 performance more easily

---
 src/learn/learn.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index c3335e37..5a540d31 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -28,6 +28,7 @@
 #include "tt.h"
 #include "uci.h"
 #include "search.h"
+#include "timeman.h"
 
 #include "extra/nnue_data_binpack_format.h"
 
@@ -845,9 +846,11 @@ namespace Learner
         // so at this timing the generation of the replacement table is updated.
         // It doesn't matter if you have disabled the substitution table.
         TT.new_search();
+        TimePoint elapsed = now() - Search::Limits.startTime + 1;
 
         cout << "PROGRESS: " << now_string() << ", ";
-        cout << sr.total_done << " sfens";
+        cout << sr.total_done << " sfens, ";
+        cout << sr.total_done * 1000 / elapsed  << " sfens/second";
         cout << ", iteration " << epoch;
         cout << ", learning rate = " << global_learning_rate << ", ";
 
@@ -1930,6 +1933,8 @@ namespace Learner
         {
           auto& limits = Search::Limits;
 
+          limits.startTime = now();
+
           // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done)
           limits.infinite = true;
 

From ef57ac78a339f2233242aed1a04838d0727296eb Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 8 Oct 2020 17:07:07 +0200
Subject: [PATCH 321/583] Print gensfen speed when outputting status.

---
 src/learn/gensfen.cpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 8ceb04e2..5f7541f5 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -229,10 +229,18 @@ namespace Learner
         // Dedicated thread to write to file
         void file_write_worker()
         {
+            auto startTime = now();
+
             auto output_status = [&]()
             {
                 // Also output the current time to console.
-                sync_cout << endl << sfen_write_count << " sfens , at " << now_string() << sync_endl;
+                const auto nowTime = now();
+                const TimePoint elapsed = nowTime - startTime + 1;
+
+                sync_cout << endl
+                    << sfen_write_count << " sfens, "
+                    << sfen_write_count * 1000 / elapsed << " sfens/second, "
+                    << "at " << now_string() << sync_endl;
             };
 
             while (!finished || sfen_buffers_pool.size())

From 2af4bf7eacdfbe02bde6ce714bf2f91d19119e89 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 9 Oct 2020 10:16:02 +0200
Subject: [PATCH 322/583] Move the docs folder one above, it was in src by
 mistake.

---
 {src/docs => docs}/binpack.md | 0
 {src/docs => docs}/convert.md | 0
 {src/docs => docs}/gensfen.md | 0
 {src/docs => docs}/learn.md   | 0
 4 files changed, 0 insertions(+), 0 deletions(-)
 rename {src/docs => docs}/binpack.md (100%)
 rename {src/docs => docs}/convert.md (100%)
 rename {src/docs => docs}/gensfen.md (100%)
 rename {src/docs => docs}/learn.md (100%)

diff --git a/src/docs/binpack.md b/docs/binpack.md
similarity index 100%
rename from src/docs/binpack.md
rename to docs/binpack.md
diff --git a/src/docs/convert.md b/docs/convert.md
similarity index 100%
rename from src/docs/convert.md
rename to docs/convert.md
diff --git a/src/docs/gensfen.md b/docs/gensfen.md
similarity index 100%
rename from src/docs/gensfen.md
rename to docs/gensfen.md
diff --git a/src/docs/learn.md b/docs/learn.md
similarity index 100%
rename from src/docs/learn.md
rename to docs/learn.md

From de20887e110bc70eaeb4b52e33694fc4a3b22738 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 9 Oct 2020 10:53:47 +0200
Subject: [PATCH 323/583] Update readme. Link to docs.

---
 README.md | 71 +++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 50 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index f84a544a..84898792 100644
--- a/README.md
+++ b/README.md
@@ -5,9 +5,11 @@
 <h1 align="center">Stockfish NNUE</h1>
 
 ## Overview
+
 Stockfish NNUE is a port of a shogi neural network named NNUE (efficiently updateable neural network backwards) to Stockfish 11. To learn more about the Stockfish chess engine, look [here](stockfish.md) for an overview and [here](https://github.com/official-stockfish/Stockfish) for the official repository.
 
 ## Building
+
 To compile:
 ```
 make -jN ARCH=... build
@@ -33,8 +35,11 @@ Additional options:
 - `blas=[yes/no]` - whether to use an external BLAS library. Default is `no`. Using an external BLAS library may have a significantly improve learning performance and by default expects openBLAS to be installed.
 
 ## Training Guide
+
 ### Generating Training Data
-To generate training data from the classic eval, use the gensfen command with the setting "Use NNUE" set to "false". The given example is generation in its simplest form. There are more commands. 
+
+To generate training data from the classic eval, use the gensfen command with the setting "Use NNUE" set to "false". The given example is generation in its simplest form. There are more commands.
+
 ```
 uci
 setoption name PruneAtShallowDepth value false
@@ -45,16 +50,26 @@ setoption name SyzygyPath value path
 isready
 gensfen depth a loop b use_draw_in_training_data_generation 1 eval_limit 32000
 ```
-Specify how many threads and how much memory you would like to use with the x and y values. The option SyzygyPath is not necessary, but if you would like to use it, you must first have Syzygy endgame tablebases on your computer, which you can find [here](http://oics.olympuschess.com/tracker/index.php). You will need to have a torrent client to download these tablebases, as that is probably the fastest way to obtain them. The path is the path to the folder containing those tablebases. It does not have to be surrounded in quotes.
 
-This will save a file named "generated_kifu.bin" in the same folder as the binary. Once generation is done, rename the file to something like "1billiondepth12.bin" to remember the depth and quantity of the positions and move it to a folder named "trainingdata" in the same directory as the binaries.
-#### Generation Parameters
-- Depth is the searched depth per move, or how far the engine looks forward. This value is an integer.
-- Loop is the amount of positions generated. This value is also an integer
-### Generating Validation Data
-The process is the same as the generation of training data, except for the fact that you need to set loop to 1 million, because you don't need a lot of validation data. The depth should be the same as before or slightly higher than the depth of the training data. After generation rename the validation data file to val.bin and drop it in a folder named "validationdata" in the same directory to make it easier. 
-### Training a Completely New Network
-Use the "learn" binary. Create an empty folder named "evalsave" in the same directory as the binaries.
+- `depth` is the searched depth per move, or how far the engine looks forward. This value is an integer.
+- `loop` is the amount of positions generated. This value is also an integer.
+
+Specify how many threads and how much memory you would like to use with the `x` and `y` values. The option SyzygyPath is not necessary, but if you would like to use it, you must first have Syzygy endgame tablebases on your computer, which you can find [here](http://oics.olympuschess.com/tracker/index.php). You will need to have a torrent client to download these tablebases, as that is probably the fastest way to obtain them. The `path` is the path to the folder containing those tablebases. It does not have to be surrounded in quotes.
+
+This will create a file named "generated_kifu.binpack" in the same folder as the binary containing the generated training data. Once generation is done, you can rename the file to something like "1billiondepth12.binpack" to remember the depth and quantity of the positions and move it to a folder named "trainingdata" in the same directory as the binaries.
+
+You will also need validation data that is used for loss calculation and accuracy computation. Validation data is generated in the same way as training data, but generally at most 1 million positions should be used as there's no need for more and it would just slow the learning process down. It may also be better to slightly increase the depth for validation data. After generation you can rename the validation data file to "val.binpack" and drop it in a folder named "validationdata" in the same directory to make it easier.
+
+More information about gensfen and available options can be found in the [docs](docs/gensfen.md)
+
+### Training a network
+
+#### Training a Completely New Network
+
+Whether a new network is created or not is controlled by the UCI option `SkipLoadingEval`. If set to true then a new network will be created, which allows learning from scratch. If left at its default (false) then a network will be loaded and trained further. The second scenario is described in the reinforcement learning paragraph.
+
+A simple command chain to start with training could look like this:
+
 ```
 uci
 setoption name EnableTranspositionTable value false
@@ -63,31 +78,45 @@ setoption name SkipLoadingEval value true
 setoption name Use NNUE value pure
 setoption name Threads value x
 isready
-learn targetdir trainingdata loop 100 batchsize 1000000 use_draw_in_training 1 use_draw_in_validation 1 lr 1 lambda 1 eval_limit 32000 nn_batch_size 1000 newbob_decay 0.5 eval_save_interval 250000000 loss_output_interval 1000000 validation_set_file_name validationdata\val.bin
+learn targetdir trainingdata loop 100 batchsize 1000000 use_draw_in_training 1 use_draw_in_validation 1 lr 1 lambda 1 eval_limit 32000 nn_batch_size 1000 newbob_decay 0.5 eval_save_interval 250000000 loss_output_interval 1000000 validation_set_file_name validationdata\val.binpack
 ```
-Nets get saved in the "evalsave" folder. 
 
-#### Training Parameters
-- eta is the learning rate
-- lambda is the amount of weight it puts to eval of learning data vs win/draw/loss results. 1 puts all weight on eval, lambda 0 puts all weight on WDL results.
+This will utilize training data files in the "trainingdata" directory and validation data from file "validationdata\val.bin". Produced nets are saved in the "evalsave" folder.
 
-### Reinforcement Learning
-If you would like to do some reinforcement learning on your original network, you must first generate training data using the learn binaries with the setting `Use NNUE` set to `pure`. Make sure that your previously trained network is in the eval folder. Use the commands specified above. Make sure `SkipLoadingEval` is set to false so that the data generated is using the neural net's eval by typing the command `setoption name SkipLoadingEval value false` before typing the `isready` command. You should aim to generate less positions than the first run, around 1/10 of the number of positions generated in the first run. The depth should be higher as well. You should also do the same for validation data, with the depth being higher than the last run.
+More information about learn and available parameters can be found in the [docs](docs/learn.md)
 
-After you have generated the training data, you must move it into your training data folder and delete the older data so that the binary does not accidentally train on the same data again. Do the same for the validation data and name it to val-1.bin to make it less confusing. Make sure the evalsave folder is empty. Then, using the same binary, type in the training commands shown above. Do __NOT__ set `SkipLoadingEval` to true, it must be false or you will get a completely new network, instead of a network trained with reinforcement learning. You should also set eval_save_interval to a number that is lower than the amount of positions in your training data, perhaps also 1/10 of the original value. The validation file should be set to the new validation data, not the old data.
+#### Reinforcement Learning
 
-After training is finished, your new net should be located in the "final" folder under the "evalsave" directory. You should test this new network against the older network to see if there are any improvements.
+If you would like to do some reinforcement learning on your original network, you must first generate training data with the setting `Use NNUE` set to `pure` and using the previous network (either name it "nn.bin" and put into alongside the binary or provide the `EvalFile` UCI option). Use the commands specified above. You should aim to generate less positions than the first run, around 1/10 of the number of positions generated in the first run. The depth should be higher as well. You should also do the same for validation data, with the depth being higher than the last run.
+
+After you have generated the training data, you must move it into your training data folder and move the older data so that the binary does not train on the same data again. Do the same for the validation data. Make sure the "evalsave" folder is empty. Then, using the same binary, type in the training commands shown above. Do __NOT__ set `SkipLoadingEval` to true, it must be false or you will get a completely new network, instead of a network trained with reinforcement learning. You should also set `eval_save_interval` to a number that is lower than the amount of positions in your training data, perhaps also 1/10 of the original value.
+
+After training is finished, your new net should be located in the "final" folder under the "evalsave" directory. You should test this new network against the older network to see if there are any improvements. Don't rely on the automatic rejection for network quality, sometimes even rejected nets can be better than the previous ones.
 
 ## Using Your Trained Net
+
 If you want to use your generated net, copy the net located in the "final" folder under the "evalsave" directory and move it into a new folder named "eval" under the directory with the binaries. You can then use the halfkp_256x2 binaries pertaining to your CPU with a standard chess GUI, such as Cutechess. Refer to the [releases page](https://abrok.eu/stockfish) to find out which binary is best for your CPU.
 
-If the engine does not load any net file, or shows "Error! *** not found or wrong format", please try to specify the net with the full file path with the "EvalFile" option by typing the command `setoption name EvalFile value path` where path is the full file path. The "Use NNUE" option must be set to true with the command `setoption name Use NNUE value true`.
+If the engine does not load any net file, or shows "Error! *** not found or wrong format", please try to specify the net with the full file path with the `EvalFile` UCI option by typing the command `setoption name EvalFile value path` where path is the full file path. The `Use NNUE` UCI option must be set either to `true` or `pure` with the command `setoption name Use NNUE value true/pure`.
+
+## Training data formats.
+
+Currently there are 3 training data formats. Two of them are supported directly.
+
+- `.bin` - the original training data format. Uses 40 bytes per entry. Is supported directly by the `gensfen` and `learn` commands.
+- `.plain` - a human readable training data format. This one is not supported directly by the `gensfen` and `learn` commands. It should not be used for data exchange because it's less compact than other formats. It is mostly useful for inspection of the data.
+- `.binpack` - a compact binary training data format that exploits positions chains to further reduce size. It uses on average between 2 to 3 bytes per entry when generating data with `gensfen`. It is supported directly by `gensfen` and `learn` commands. It is currently the default for the `gensfen` command. A more in depth description can be found [here](docs/binpack.md)
+
+### Conversion between formats.
+
+There is a builting converted that support all 3 formats described above. Any of them can be converted to any other. For more information and usage guide see [here](docs/convert.md).
 
 ## Resources
+
 - [Stockfish NNUE Wiki](https://www.qhapaq.org/shogi/shogiwiki/stockfish-nnue/)
 - [Training instructions](https://twitter.com/mktakizawa/status/1273042640280252416) from the creator of the Elmo shogi engine
 - [Original Talkchess thread](http://talkchess.com/forum3/viewtopic.php?t=74059) discussing Stockfish NNUE
-- [Guide to Stockfish NNUE](http://yaneuraou.yaneu.com/2020/06/19/stockfish-nnue-the-complete-guide/) 
+- [Guide to Stockfish NNUE](http://yaneuraou.yaneu.com/2020/06/19/stockfish-nnue-the-complete-guide/)
 - [Unofficial Stockfish Discord](https://discord.gg/nv8gDtt)
 
 A more updated list can be found in the #sf-nnue-resources channel in the Discord.

From 7d62b3f79959c2c5d44bdd3118734a5f8dd7bc26 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 11 Oct 2020 12:01:23 +0200
Subject: [PATCH 324/583] Store additional bits for fullmove clock and 50 more
 rule halfmove clock at the end of the bit stream. This change keeps backwards
 compatibility.

---
 src/extra/nnue_data_binpack_format.h | 90 +++++++++++++++-------------
 src/learn/sfen_packer.cpp            | 76 +++++++++++------------
 2 files changed, 86 insertions(+), 80 deletions(-)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index 7ceafbc0..826b2959 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -4482,12 +4482,12 @@ namespace chess
             return m_ply;
         }
 
-        [[nodiscard]] inline std::uint16_t halfMove() const
+        [[nodiscard]] inline std::uint16_t fullMove() const
         {
             return (m_ply + 1) / 2;
         }
 
-        inline void setHalfMove(std::uint16_t hm)
+        inline void setFullMove(std::uint16_t hm)
         {
             m_ply = 2 * hm - 1 + (m_sideToMove == Color::Black);
         }
@@ -5366,10 +5366,10 @@ namespace chess
         }
 
         {
-            const auto halfMove = nextPart();
-            if (!halfMove.empty())
+            const auto fullMove = nextPart();
+            if (!fullMove.empty())
             {
-                m_ply = std::stoi(halfMove.data()) * 2 - (m_sideToMove == Color::White);
+                m_ply = std::stoi(fullMove.data()) * 2 - (m_sideToMove == Color::White);
             }
             else
             {
@@ -5419,7 +5419,7 @@ namespace chess
         fen += std::to_string(m_rule50Counter);
 
         fen += ' ';
-        fen += std::to_string(halfMove());
+        fen += std::to_string(fullMove());
 
         return fen;
     }
@@ -5862,43 +5862,24 @@ namespace binpack
         // Huffman coding
         // * is simplified from mini encoding to make conversion easier.
         //
-        // 1 box on the board (other than NO_PIECE) = 2 to 6 bits (+ 1-bit flag + 1-bit forward and backward)
-        // 1 piece of hand piece = 1-5bit (+ 1-bit flag + 1bit ahead and behind)
-        //
-        // empty xxxxx0 + 0 (none)
-        // step xxxx01 + 2 xxxx0 + 2
-        // incense xx0011 + 2 xx001 + 2
-        // Katsura xx1011 + 2 xx101 + 2
-        // silver xx0111 + 2 xx011 + 2
-        // Gold x01111 + 1 x0111 + 1 // Gold is valid and has no flags.
-        // corner 011111 + 2 01111 + 2
-        // Fly 111111 + 2 11111 + 2
-        //
-        // Assuming all pieces are on the board,
-        // Sky 81-40 pieces = 41 boxes = 41bit
-        // Walk 4bit*18 pieces = 72bit
-        // Incense 6bit*4 pieces = 24bit
-        // Katsura 6bit*4 pieces = 24bit
-        // Silver 6bit*4 pieces = 24bit
-        // Gold 6bit* 4 pieces = 24bit
-        // corner 8bit* 2 pieces = 16bit
-        // Fly 8bit* 2 pieces = 16bit
-        // -------
-        // 241bit + 1bit (turn) + 7bit × 2 (King's position after) = 256bit
-        //
-        // When the piece on the board moves to the hand piece, the piece on the board becomes empty, so the box on the board can be expressed with 1 bit,
-        // Since the hand piece can be expressed by 1 bit less than the piece on the board, the total number of bits does not change in the end.
-        // Therefore, in this expression, any aspect can be expressed by this bit number.
-        // It is a hand piece and no flag is required, but if you include this, the bit number of the piece on the board will be -1
-        // Since the total number of bits can be fixed, we will include this as well.
-
         // Huffman Encoding
         //
         // Empty  xxxxxxx0
-        // Pawn   xxxxx001 + 1 bit (Side to move)
-        // Knight xxxxx011 + 1 bit (Side to move)
-        // Bishop xxxxx101 + 1 bit (Side to move)
-        // Rook   xxxxx111 + 1 bit (Side to move)
+        // Pawn   xxxxx001 + 1 bit (Color)
+        // Knight xxxxx011 + 1 bit (Color)
+        // Bishop xxxxx101 + 1 bit (Color)
+        // Rook   xxxxx111 + 1 bit (Color)
+        // Queen   xxxx1001 + 1 bit (Color)
+        //
+        // Worst case:
+        // - 32 empty squares    32 bits
+        // - 30 pieces           150 bits
+        // - 2 kings             12 bits
+        // - castling rights     4 bits
+        // - ep square           7 bits
+        // - rule50              7 bits
+        // - game ply            16 bits
+        // - TOTAL               228 bits < 256 bits
 
         struct HuffmanedPiece
         {
@@ -5980,7 +5961,17 @@ namespace binpack
 
                 stream.write_n_bit(pos.rule50Counter(), 6);
 
-                stream.write_n_bit(pos.halfMove(), 8);
+                stream.write_n_bit(pos.fullMove(), 8);
+
+                // Write high bits of half move. This is a fix for the
+                // limited range of half move counter.
+                // This is backwards compatibile.
+                stream.write_n_bit(pos.fullMove() >> 8, 8);
+
+                // Write the highest bit of rule50 at the end. This is a backwards
+                // compatibile fix for rule50 having only 6 bits stored.
+                // This bit is just ignored by the old parsers.
+                stream.write_n_bit(pos.rule50Counter() >> 6, 1);
 
                 assert(stream.get_cursor() <= 256);
             }
@@ -6105,10 +6096,23 @@ namespace binpack
             }
 
             // Halfmove clock
-            pos.setRule50Counter(stream.read_n_bit(6));
+            std::uint8_t rule50 = stream.read_n_bit(6);
 
             // Fullmove number
-            pos.setHalfMove(stream.read_n_bit(8));
+            std::uint16_t fullmove = stream.read_n_bit(8);
+
+            // Fullmove number, high bits
+            // This was added as a fix for fullmove clock
+            // overflowing at 256. This change is backwards compatibile.
+            fullmove |= stream.read_n_bit(8) << 8;
+
+            // Read the highest bit of rule50. This was added as a fix for rule50
+            // counter having only 6 bits stored.
+            // In older entries this will just be a zero bit.
+            rule50 |= stream.read_n_bit(1) << 6;
+
+            pos.setFullMove(fullmove);
+            pos.setRule50Counter(rule50);
 
             assert(stream.get_cursor() <= 256);
 
diff --git a/src/learn/sfen_packer.cpp b/src/learn/sfen_packer.cpp
index 19c745ad..2de7efa4 100644
--- a/src/learn/sfen_packer.cpp
+++ b/src/learn/sfen_packer.cpp
@@ -113,43 +113,24 @@ namespace Learner {
   // Huffman coding
   // * is simplified from mini encoding to make conversion easier.
   //
-  // 1 box on the board (other than NO_PIECE) = 2 to 6 bits (+ 1-bit flag + 1-bit forward and backward)
-  // 1 piece of hand piece = 1-5bit (+ 1-bit flag + 1bit ahead and behind)
-  //
-  // empty xxxxx0 + 0 (none)
-  // step xxxx01 + 2 xxxx0 + 2
-  // incense xx0011 + 2 xx001 + 2
-  // Katsura xx1011 + 2 xx101 + 2
-  // silver xx0111 + 2 xx011 + 2
-  // Gold x01111 + 1 x0111 + 1 // Gold is valid and has no flags.
-  // corner 011111 + 2 01111 + 2
-  // Fly 111111 + 2 11111 + 2
-  //
-  // Assuming all pieces are on the board,
-  // Sky 81-40 pieces = 41 boxes = 41bit
-  // Walk 4bit*18 pieces = 72bit
-  // Incense 6bit*4 pieces = 24bit
-  // Katsura 6bit*4 pieces = 24bit
-  // Silver 6bit*4 pieces = 24bit
-  // Gold 6bit* 4 pieces = 24bit
-  // corner 8bit* 2 pieces = 16bit
-  // Fly 8bit* 2 pieces = 16bit
-  // -------
-  // 241bit + 1bit (turn) + 7bit × 2 (King's position after) = 256bit
-  //
-  // When the piece on the board moves to the hand piece, the piece on the board becomes empty, so the box on the board can be expressed with 1 bit,
-  // Since the hand piece can be expressed by 1 bit less than the piece on the board, the total number of bits does not change in the end.
-  // Therefore, in this expression, any aspect can be expressed by this bit number.
-  // It is a hand piece and no flag is required, but if you include this, the bit number of the piece on the board will be -1
-  // Since the total number of bits can be fixed, we will include this as well.
-
   // Huffman Encoding
   //
   // Empty  xxxxxxx0
-  // Pawn   xxxxx001 + 1 bit (Side to move)
-  // Knight xxxxx011 + 1 bit (Side to move)
-  // Bishop xxxxx101 + 1 bit (Side to move)
-  // Rook   xxxxx111 + 1 bit (Side to move)
+  // Pawn   xxxxx001 + 1 bit (Color)
+  // Knight xxxxx011 + 1 bit (Color)
+  // Bishop xxxxx101 + 1 bit (Color)
+  // Rook   xxxxx111 + 1 bit (Color)
+  // Queen   xxxx1001 + 1 bit (Color)
+  //
+  // Worst case:
+  // - 32 empty squares    32 bits
+  // - 30 pieces           150 bits
+  // - 2 kings             12 bits
+  // - castling rights     4 bits
+  // - ep square           7 bits
+  // - rule50              7 bits
+  // - game ply            16 bits
+  // - TOTAL               228 bits < 256 bits
 
   struct HuffmanedPiece
   {
@@ -212,7 +193,18 @@ namespace Learner {
 
     stream.write_n_bit(pos.state()->rule50, 6);
 
-    stream.write_n_bit(1 + (pos.game_ply()-(pos.side_to_move() == BLACK)) / 2, 8);
+    const int fm = 1 + (pos.game_ply()-(pos.side_to_move() == BLACK)) / 2;
+    stream.write_n_bit(fm, 8);
+
+    // Write high bits of half move. This is a fix for the
+    // limited range of half move counter.
+    // This is backwards compatibile.
+    stream.write_n_bit(fm >> 8, 8);
+
+    // Write the highest bit of rule50 at the end. This is a backwards
+    // compatibile fix for rule50 having only 6 bits stored.
+    // This bit is just ignored by the old parsers.
+    stream.write_n_bit(pos.state()->rule50 >> 6, 1);
 
     assert(stream.get_cursor() <= 256);
   }
@@ -355,10 +347,20 @@ namespace Learner {
     }
 
     // Halfmove clock
-    pos.st->rule50 = static_cast<Square>(stream.read_n_bit(6));
+    pos.st->rule50 = stream.read_n_bit(6);
 
     // Fullmove number
-    pos.gamePly = static_cast<Square>(stream.read_n_bit(8));
+    pos.gamePly = stream.read_n_bit(8);
+
+    // Read the highest bit of rule50. This was added as a fix for rule50
+    // counter having only 6 bits stored.
+    // In older entries this will just be a zero bit.
+    pos.gamePly |= stream.read_n_bit(8) << 8;
+
+    // Read the highest bit of rule50. This was added as a fix for rule50
+    // counter having only 6 bits stored.
+    // In older entries this will just be a zero bit.
+    pos.st->rule50 |= stream.read_n_bit(1) << 6;
 
     // Convert from fullmove starting from 1 to gamePly starting from 0,
     // handle also common incorrect FEN with fullmove = 0.

From 4a2bf16b3046d92522d52518a33985273d72cc22 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 12 Oct 2020 11:46:50 +0200
Subject: [PATCH 325/583] Add option "auto_lr_drop" that specifies the amount
 of positions from previous lr drop after which to reduce lr by newbob_decay.

---
 src/learn/learn.cpp | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 5a540d31..3648a40f 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -744,6 +744,8 @@ namespace Learner
 
             newbob_decay = 1.0;
             newbob_num_trials = 2;
+            auto_lr_drop = 0;
+            last_lr_drop = 0;
             best_loss = std::numeric_limits<double>::infinity();
             latest_loss_sum = 0.0;
             latest_loss_count = 0;
@@ -797,6 +799,8 @@ namespace Learner
         shared_timed_mutex nn_mutex;
         double newbob_decay;
         int newbob_num_trials;
+        uint64_t auto_lr_drop;
+        uint64_t last_lr_drop;
         double best_loss;
         double latest_loss_sum;
         uint64_t latest_loss_count;
@@ -1295,7 +1299,21 @@ namespace Learner
                 latest_loss_sum = 0.0;
                 latest_loss_count = 0;
                 cout << "loss: " << latest_loss;
-                if (latest_loss < best_loss)
+                auto tot = sr.total_done.load();
+                if (auto_lr_drop)
+                {
+                    cout << " < best (" << best_loss << "), accepted" << endl;
+                    best_loss = latest_loss;
+                    best_nn_directory = Path::Combine((std::string)Options["EvalSaveDir"], dir_name);
+                    trials = newbob_num_trials;
+
+                    if (tot >= last_lr_drop + auto_lr_drop)
+                    {
+                        last_lr_drop = tot;
+                        global_learning_rate *= newbob_decay;
+                    }
+                }
+                else if (latest_loss < best_loss)
                 {
                     cout << " < best (" << best_loss << "), accepted" << endl;
                     best_loss = latest_loss;
@@ -1647,6 +1665,7 @@ namespace Learner
         uint64_t nn_batch_size = 1000;
         double newbob_decay = 0.5;
         int newbob_num_trials = 4;
+        uint64_t auto_lr_drop = 0;
         string nn_options;
 
         uint64_t eval_save_interval = LEARN_EVAL_SAVE_INTERVAL;
@@ -1729,6 +1748,7 @@ namespace Learner
             else if (option == "newbob_decay") is >> newbob_decay;
             else if (option == "newbob_num_trials") is >> newbob_num_trials;
             else if (option == "nn_options") is >> nn_options;
+            else if (option == "auto_lr_drop") is >> auto_lr_drop;
 
             else if (option == "eval_save_interval") is >> eval_save_interval;
             else if (option == "loss_output_interval") is >> loss_output_interval;
@@ -1972,6 +1992,7 @@ namespace Learner
 
         learn_think.newbob_decay = newbob_decay;
         learn_think.newbob_num_trials = newbob_num_trials;
+        learn_think.auto_lr_drop = auto_lr_drop;
 
         learn_think.eval_save_interval = eval_save_interval;
         learn_think.loss_output_interval = loss_output_interval;

From 4a340ad3b28823ea26e502d4dc3a68b41a349d39 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 12 Oct 2020 11:48:57 +0200
Subject: [PATCH 326/583] Add docs for auto_lr_drop

---
 docs/learn.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/learn.md b/docs/learn.md
index 4c8c3fc1..3a580134 100644
--- a/docs/learn.md
+++ b/docs/learn.md
@@ -64,6 +64,8 @@ Currently the following options are available:
 
 `newbob_num_trials` - determines after how many subsequent rejected nets the training process will be terminated. Default: 4.
 
+`auto_lr_drop` - every time this many positions are processed the learning rate is multiplied by `newbob_decay`. In other words this value specifies for how many positions a single learning rate stage lasts. If 0 then doesn't have any effect. Default: 0.
+
 `nn_options` - if you're reading this you don't use it. It passes messages directly to the network evaluation. I don't know what it can do either.
 
 `eval_save_interval` - every `eval_save_interval` positions the network will be saved and either accepted or rejected (in which case an LR drop follows). Default: 100000000 (100M). (generally people use values in 10M-100M range)

From 0494adeb2c9dba82f3ffd78823822aab4d450764 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 19:06:47 +0200
Subject: [PATCH 327/583] Move nnue evaluation stuff from evaluate.h to
 nnue/evaluate_nnue.h

---
 src/evaluate.cpp           | 90 +++-----------------------------------
 src/evaluate.h             | 19 --------
 src/learn/gensfen.cpp      |  1 +
 src/learn/learn.cpp        |  1 +
 src/learn/multi_think.cpp  |  2 +
 src/main.cpp               |  2 +
 src/nnue/evaluate_nnue.cpp | 85 ++++++++++++++++++++++++++++++++++-
 src/nnue/evaluate_nnue.h   | 15 +++++++
 src/nnue/nnue_common.h     |  2 +
 src/position.cpp           | 10 +++--
 src/search.cpp             |  2 +
 src/uci.cpp                |  1 +
 src/ucioption.cpp          |  1 +
 13 files changed, 122 insertions(+), 109 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index b3894fe8..0326a2f8 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -27,6 +27,8 @@
 #include <streambuf>
 #include <vector>
 
+#include "nnue/evaluate_nnue.h"
+
 #include "bitboard.h"
 #include "evaluate.h"
 #include "material.h"
@@ -37,88 +39,6 @@
 #include "incbin/incbin.h"
 
 using namespace std;
-using namespace Eval::NNUE;
-
-namespace Eval {
-
-  UseNNUEMode useNNUE;
-  string eval_file_loaded = "None";
-
-  static UseNNUEMode nnue_mode_from_option(const UCI::Option& mode)
-  {
-    if (mode == "false")
-      return UseNNUEMode::False;
-    else if (mode == "true")
-      return UseNNUEMode::True;
-    else if (mode == "pure")
-      return UseNNUEMode::Pure;
-
-    return UseNNUEMode::False;
-  }
-
-  void NNUE::init() {
-
-    useNNUE = nnue_mode_from_option(Options["Use NNUE"]);
-    if (useNNUE == UseNNUEMode::False)
-        return;
-
-    string eval_file = string(Options["EvalFile"]);
-
-    #if defined(DEFAULT_NNUE_DIRECTORY)
-    #define stringify2(x) #x
-    #define stringify(x) stringify2(x)
-    vector<string> dirs = { "" , CommandLine::binaryDirectory , stringify(DEFAULT_NNUE_DIRECTORY) };
-    #else
-    vector<string> dirs = { "" , CommandLine::binaryDirectory };
-    #endif
-
-    for (string directory : dirs)
-        if (eval_file_loaded != eval_file)
-        {
-            ifstream stream(directory + eval_file, ios::binary);
-            if (load_eval(eval_file, stream))
-            {
-                sync_cout << "info string Loaded eval file " << directory + eval_file << sync_endl;
-                eval_file_loaded = eval_file;
-            }
-            else
-            {
-                sync_cout << "info string ERROR: failed to load eval file " << directory + eval_file << sync_endl;
-            }
-        }
-  }
-
-  /// NNUE::verify() verifies that the last net used was loaded successfully
-  void NNUE::verify() {
-
-    string eval_file = string(Options["EvalFile"]);
-
-    if (useNNUE != UseNNUEMode::False && eval_file_loaded != eval_file)
-    {
-        UCI::OptionsMap defaults;
-        UCI::init(defaults);
-
-        string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
-        string msg2 = "The option is set to true, but the network file " + eval_file + " was not loaded successfully.";
-        string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
-        string msg4 = "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/" + string(defaults["EvalFile"]);
-        string msg5 = "The engine will be terminated now.";
-
-        sync_cout << "info string ERROR: " << msg1 << sync_endl;
-        sync_cout << "info string ERROR: " << msg2 << sync_endl;
-        sync_cout << "info string ERROR: " << msg3 << sync_endl;
-        sync_cout << "info string ERROR: " << msg4 << sync_endl;
-        sync_cout << "info string ERROR: " << msg5 << sync_endl;
-
-        exit(EXIT_FAILURE);
-    }
-
-    if (useNNUE != UseNNUEMode::False)
-        sync_cout << "info string NNUE evaluation using " << eval_file << " enabled" << sync_endl;
-    else
-        sync_cout << "info string classical evaluation enabled" << sync_endl;
-  }
-}
 
 namespace Trace {
 
@@ -994,7 +914,7 @@ Value Eval::evaluate(const Position& pos) {
 
   Value v;
 
-  if (Eval::useNNUE == UseNNUEMode::Pure) {
+  if (NNUE::useNNUE == NNUE::UseNNUEMode::Pure) {
       v = NNUE::evaluate(pos);
 
       // Guarantee evaluation does not hit the tablebase range
@@ -1002,7 +922,7 @@ Value Eval::evaluate(const Position& pos) {
 
       return v;
   }
-  else if (Eval::useNNUE == UseNNUEMode::False)
+  else if (NNUE::useNNUE == NNUE::UseNNUEMode::False)
       v = Evaluation<NO_TRACE>(pos).value();
   else
   {
@@ -1085,7 +1005,7 @@ std::string Eval::trace(const Position& pos) {
 
   ss << "\nClassical evaluation: " << to_cp(v) << " (white side)\n";
 
-  if (useNNUE != UseNNUEMode::False)
+  if (NNUE::useNNUE != NNUE::UseNNUEMode::False)
   {
       v = NNUE::evaluate(pos);
       v = pos.side_to_move() == WHITE ? v : -v;
diff --git a/src/evaluate.h b/src/evaluate.h
index bce5488d..fc626698 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -26,33 +26,14 @@
 class Position;
 
 namespace Eval {
-  enum struct UseNNUEMode
-  {
-    False,
-    True,
-    Pure
-  };
-
   std::string trace(const Position& pos);
   Value evaluate(const Position& pos);
 
-  extern UseNNUEMode useNNUE;
-  extern std::string eval_file_loaded;
-
   // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
   // for the build process (profile-build and fishtest) to work. Do not change the
   // name of the macro, as it is used in the Makefile.
   #define EvalFileDefaultName   "nn-98a7585c85e9.nnue"
 
-  namespace NNUE {
-
-    Value evaluate(const Position& pos);
-    bool load_eval(std::string name, std::istream& stream);
-    void init();
-    void verify();
-
-  } // namespace NNUE
-
 } // namespace Eval
 
 #endif // #ifndef EVALUATE_H_INCLUDED
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 5f7541f5..7c5b20be 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -12,6 +12,7 @@
 
 #include "extra/nnue_data_binpack_format.h"
 
+#include "nnue/evaluate_nnue.h"
 #include "nnue/evaluate_nnue_learner.h"
 
 #include "syzygy/tbprobe.h"
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 3648a40f..b2ee5aa1 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -32,6 +32,7 @@
 
 #include "extra/nnue_data_binpack_format.h"
 
+#include "nnue/evaluate_nnue.h"
 #include "nnue/evaluate_nnue_learner.h"
 
 #include "syzygy/tbprobe.h"
diff --git a/src/learn/multi_think.cpp b/src/learn/multi_think.cpp
index 80bc72b5..daed3e96 100644
--- a/src/learn/multi_think.cpp
+++ b/src/learn/multi_think.cpp
@@ -1,5 +1,7 @@
 ﻿#include "multi_think.h"
 
+#include "nnue/evaluate_nnue.h"
+
 #include "tt.h"
 #include "uci.h"
 #include "types.h"
diff --git a/src/main.cpp b/src/main.cpp
index e6dff918..1a13dc62 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -18,6 +18,8 @@
 
 #include <iostream>
 
+#include "nnue/evaluate_nnue.h"
+
 #include "bitboard.h"
 #include "endgame.h"
 #include "position.h"
diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index 28c86feb..f7f9adcc 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -19,12 +19,14 @@
 // Code for calculating NNUE evaluation function
 
 #include <iostream>
+#include <string>
+#include <fstream>
 #include <set>
 
-#include "../evaluate.h"
 #include "../position.h"
 #include "../misc.h"
 #include "../uci.h"
+#include "../types.h"
 
 #include "evaluate_nnue.h"
 
@@ -69,6 +71,9 @@ namespace Eval::NNUE {
       ",Network=" + Network::GetStructureString();
   }
 
+  UseNNUEMode useNNUE;
+  std::string eval_file_loaded = "None";
+
   namespace Detail {
 
   // Initialize the evaluation function parameters
@@ -190,4 +195,82 @@ namespace Eval::NNUE {
     return ReadParameters(stream);
   }
 
+  static UseNNUEMode nnue_mode_from_option(const UCI::Option& mode)
+  {
+    if (mode == "false")
+      return UseNNUEMode::False;
+    else if (mode == "true")
+      return UseNNUEMode::True;
+    else if (mode == "pure")
+      return UseNNUEMode::Pure;
+
+    return UseNNUEMode::False;
+  }
+
+  void init() {
+
+    useNNUE = nnue_mode_from_option(Options["Use NNUE"]);
+    if (useNNUE == UseNNUEMode::False)
+        return;
+
+    std::string eval_file = std::string(Options["EvalFile"]);
+
+    #if defined(DEFAULT_NNUE_DIRECTORY)
+    #define stringify2(x) #x
+    #define stringify(x) stringify2(x)
+    std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory , stringify(DEFAULT_NNUE_DIRECTORY) };
+    #else
+    std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory };
+    #endif
+
+    for (std::string directory : dirs)
+        if (eval_file_loaded != eval_file)
+        {
+            std::ifstream stream(directory + eval_file, std::ios::binary);
+            if (load_eval(eval_file, stream))
+            {
+                sync_cout << "info string Loaded eval file " << directory + eval_file << sync_endl;
+                eval_file_loaded = eval_file;
+            }
+            else
+            {
+                sync_cout << "info string ERROR: failed to load eval file " << directory + eval_file << sync_endl;
+            }
+        }
+
+    #undef stringify2
+    #undef stringify
+  }
+
+  /// NNUE::verify() verifies that the last net used was loaded successfully
+  void verify() {
+
+    std::string eval_file = std::string(Options["EvalFile"]);
+
+    if (useNNUE != UseNNUEMode::False && eval_file_loaded != eval_file)
+    {
+        UCI::OptionsMap defaults;
+        UCI::init(defaults);
+
+        std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
+        std::string msg2 = "The option is set to true, but the network file " + eval_file + " was not loaded successfully.";
+        std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
+        std::string msg4 = "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/" + std::string(defaults["EvalFile"]);
+        std::string msg5 = "The engine will be terminated now.";
+
+        sync_cout << "info string ERROR: " << msg1 << sync_endl;
+        sync_cout << "info string ERROR: " << msg2 << sync_endl;
+        sync_cout << "info string ERROR: " << msg3 << sync_endl;
+        sync_cout << "info string ERROR: " << msg4 << sync_endl;
+        sync_cout << "info string ERROR: " << msg5 << sync_endl;
+
+        std::exit(EXIT_FAILURE);
+    }
+
+    if (useNNUE != UseNNUEMode::False)
+        sync_cout << "info string NNUE evaluation using " << eval_file << " enabled" << sync_endl;
+    else
+        sync_cout << "info string classical evaluation enabled" << sync_endl;
+  }
+
 } // namespace Eval::NNUE
diff --git a/src/nnue/evaluate_nnue.h b/src/nnue/evaluate_nnue.h
index 68153cac..dcfa071d 100644
--- a/src/nnue/evaluate_nnue.h
+++ b/src/nnue/evaluate_nnue.h
@@ -27,6 +27,13 @@
 
 namespace Eval::NNUE {
 
+  enum struct UseNNUEMode
+  {
+    False,
+    True,
+    Pure
+  };
+
   // Hash value of evaluation function structure
   constexpr std::uint32_t kHashValue =
       FeatureTransformer::GetHashValue() ^ Network::GetHashValue();
@@ -66,6 +73,9 @@ namespace Eval::NNUE {
   // Saved evaluation function file name
   extern std::string savedfileName;
 
+  extern UseNNUEMode useNNUE;
+  extern std::string eval_file_loaded;
+
   // Get a string that represents the structure of the evaluation function
   std::string GetArchitectureString();
 
@@ -83,6 +93,11 @@ namespace Eval::NNUE {
   // write evaluation function parameters
   bool WriteParameters(std::ostream& stream);
 
+  Value evaluate(const Position& pos);
+  bool load_eval(std::string name, std::istream& stream);
+  void init();
+  void verify();
+
 }  // namespace Eval::NNUE
 
 #endif // #ifndef NNUE_EVALUATE_NNUE_H_INCLUDED
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index 319f005b..9975134c 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -24,6 +24,8 @@
 #include <cstring>
 #include <iostream>
 
+#include "../types.h"
+
 #if defined(USE_AVX2)
 #include <immintrin.h>
 
diff --git a/src/position.cpp b/src/position.cpp
index 4e47f772..06a4e0b7 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -23,6 +23,8 @@
 #include <iomanip>
 #include <sstream>
 
+#include "nnue/evaluate_nnue.h"
+
 #include "bitboard.h"
 #include "misc.h"
 #include "movegen.h"
@@ -757,7 +759,7 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
       else
           st->nonPawnMaterial[them] -= PieceValue[MG][captured];
 
-      if (Eval::useNNUE != Eval::UseNNUEMode::False)
+      if (Eval::NNUE::useNNUE != Eval::NNUE::UseNNUEMode::False)
       {
           dp.dirty_num = 2;  // 1 piece moved, 1 piece captured
           dp.piece[1] = captured;
@@ -801,7 +803,7 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
   // Move the piece. The tricky Chess960 castling is handled earlier
   if (type_of(m) != CASTLING)
   {
-      if (Eval::useNNUE != Eval::UseNNUEMode::False)
+      if (Eval::NNUE::useNNUE != Eval::NNUE::UseNNUEMode::False)
       {
           dp.piece[0] = pc;
           dp.from[0] = from;
@@ -832,7 +834,7 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
           remove_piece(to);
           put_piece(promotion, to);
 
-          if (Eval::useNNUE != Eval::UseNNUEMode::False)
+          if (Eval::NNUE::useNNUE != Eval::NNUE::UseNNUEMode::False)
           {
               // Promoting pawn to SQ_NONE, promoted piece from SQ_NONE
               dp.to[0] = SQ_NONE;
@@ -970,7 +972,7 @@ void Position::do_castling(Color us, Square from, Square& to, Square& rfrom, Squ
   rto = relative_square(us, kingSide ? SQ_F1 : SQ_D1);
   to = relative_square(us, kingSide ? SQ_G1 : SQ_C1);
 
-  if (Do && Eval::useNNUE != Eval::UseNNUEMode::False)
+  if (Do && Eval::NNUE::useNNUE != Eval::NNUE::UseNNUEMode::False)
   {
       auto& dp = st->dirtyPiece;
       dp.piece[0] = make_piece(us, KING);
diff --git a/src/search.cpp b/src/search.cpp
index 1623ff06..26a675d7 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -23,6 +23,8 @@
 #include <iostream>
 #include <sstream>
 
+#include "nnue/evaluate_nnue.h"
+
 #include "evaluate.h"
 #include "misc.h"
 #include "movegen.h"
diff --git a/src/uci.cpp b/src/uci.cpp
index 166e437c..73ff0256 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -22,6 +22,7 @@
 #include <sstream>
 #include <string>
 
+#include "nnue/evaluate_nnue.h"
 #include "evaluate.h"
 #include "movegen.h"
 #include "nnue/nnue_test_command.h"
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 099ca2ae..bdb1c6b1 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -21,6 +21,7 @@
 #include <ostream>
 #include <sstream>
 
+#include "nnue/evaluate_nnue.h"
 #include "evaluate.h"
 #include "misc.h"
 #include "search.h"

From 14f83ad7b91ab5f62f269d6317436c08f658ec07 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 19:24:41 +0200
Subject: [PATCH 328/583] Move public search/qsearch interface from namespace
 Learner to namespace Search

---
 src/learn/gensfen.cpp |  4 ++--
 src/learn/learn.cpp   |  6 +++---
 src/search.cpp        |  9 ++-------
 src/search.h          | 11 ++++-------
 src/uci.cpp           |  4 ++--
 5 files changed, 13 insertions(+), 21 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 7c5b20be..7b135b81 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -673,7 +673,7 @@ namespace Learner
             }
             else
             {
-                Learner::search(pos, random_multi_pv_depth, random_multi_pv);
+                Search::search(pos, random_multi_pv_depth, random_multi_pv);
 
                 // Select one from the top N hands of root Moves
                 auto& rm = pos.this_thread()->rootMoves;
@@ -790,7 +790,7 @@ namespace Learner
                 const int depth = search_depth_min + (int)prng.rand(search_depth_max - search_depth_min + 1);
 
                 // Starting search calls init_for_search
-                auto [search_value, search_pv] = search(pos, depth, 1, nodes);
+                auto [search_value, search_pv] = Search::search(pos, depth, 1, nodes);
 
                 // This has to be performed after search because it needs to know
                 // rootMoves which are filled in init_for_search.
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index b2ee5aa1..452bd15f 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -824,7 +824,7 @@ namespace Learner
         // The value of evaluate() may be used, but when calculating loss, learn_cross_entropy and
         // Use qsearch() because it is difficult to compare the values.
         // EvalHash has been disabled in advance. (If not, the same value will be returned every time)
-        const auto [_, pv] = qsearch(task_pos);
+        const auto [_, pv] = Search::qsearch(task_pos);
 
         const auto rootColor = task_pos.side_to_move();
 
@@ -962,7 +962,7 @@ namespace Learner
 
                 // Determine if the teacher's move and the score of the shallow search match
                 {
-                    const auto [value, pv] = search(task_pos, 1);
+                    const auto [value, pv] = Search::search(task_pos, 1);
                     if ((uint16_t)pv[0] == ps.move)
                         move_accord_count.fetch_add(1, std::memory_order_relaxed);
                 }
@@ -1186,7 +1186,7 @@ namespace Learner
 				goto RETRY_READ;
 
             // Evaluation value of shallow search (qsearch)
-            const auto [_, pv] = qsearch(pos);
+            const auto [_, pv] = Search::qsearch(pos);
 
             // Evaluation value of deep search
             const auto deep_value = (Value)ps.score;
diff --git a/src/search.cpp b/src/search.cpp
index 26a675d7..79848812 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1968,9 +1968,7 @@ void Tablebases::rank_root_moves(Position& pos, Search::RootMoves& rootMoves) {
 }
 
 // --- expose the functions such as fixed depth search used for learning to the outside
-
-
-namespace Learner
+namespace Search
 {
   // For learning, prepare a stub that can call search,qsearch() from one thread.
   // From now on, it is better to have a Searcher and prepare a substitution table for each thread like Apery.
@@ -1978,7 +1976,7 @@ namespace Learner
 
   // Initialization for learning.
   // Called from Learner::search(),Learner::qsearch().
-  void init_for_search(Position& pos, Stack* ss)
+  static void init_for_search(Position& pos, Stack* ss)
   {
 
     // RootNode requires ss->ply == 0.
@@ -2046,9 +2044,6 @@ namespace Learner
     }
   }
 
-  // A pair of reader and evaluation value. Returned by Learner::search(),Learner::qsearch().
-  typedef std::pair<Value, std::vector<Move> > ValueAndPV;
-
   // Stationary search.
   //
   // Precondition) Search thread is set by pos.set_this_thread(Threads[thread_id]).
diff --git a/src/search.h b/src/search.h
index ab832ee2..13123323 100644
--- a/src/search.h
+++ b/src/search.h
@@ -110,15 +110,12 @@ extern LimitsType Limits;
 void init();
 void clear();
 
-} // namespace Search
+// A pair of reader and evaluation value. Returned by Learner::search(),Learner::qsearch().
+using ValueAndPV = std::pair<Value, std::vector<Move>>;
 
-namespace Learner {
+ValueAndPV qsearch(Position& pos);
+ValueAndPV search(Position& pos, int depth_, size_t multiPV = 1, uint64_t nodesLimit = 0);
 
-  // A pair of reader and evaluation value. Returned by Learner::search(),Learner::qsearch().
-  using ValueAndPV = std::pair<Value, std::vector<Move>>;
-
-  ValueAndPV qsearch(Position& pos);
-  ValueAndPV search(Position& pos, int depth_, size_t multiPV = 1, uint64_t nodesLimit = 0);
 }
 
 #endif // #ifndef SEARCH_H_INCLUDED
diff --git a/src/uci.cpp b/src/uci.cpp
index 73ff0256..ff735b2e 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -254,7 +254,7 @@ double UCI::win_rate_model_double(double v, int ply) {
 void qsearch_cmd(Position& pos)
 {
   cout << "qsearch : ";
-  auto pv = Learner::qsearch(pos);
+  auto pv = Search::qsearch(pos);
   cout << "Value = " << pv.first << " , " << UCI::value(pv.first) << " , PV = ";
   for (auto m : pv.second)
     cout << UCI::move(m, false) << " ";
@@ -275,7 +275,7 @@ void search_cmd(Position& pos, istringstream& is)
   }
 
   cout << "search depth = " << depth << " , multi_pv = " << multi_pv << " : ";
-  auto pv = Learner::search(pos, depth, multi_pv);
+  auto pv = Search::search(pos, depth, multi_pv);
   cout << "Value = " << pv.first << " , " << UCI::value(pv.first) << " , PV = ";
   for (auto m : pv.second)
     cout << UCI::move(m, false) << " ";

From 880d23af1c551e9122e95cd52c9aa155bfe11a38 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 19:44:15 +0200
Subject: [PATCH 329/583] Move sfen input/output streams to sfen_stream.h

---
 src/learn/gensfen.cpp   | 100 +------------------
 src/learn/learn.cpp     | 112 +--------------------
 src/learn/sfen_stream.h | 213 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 217 insertions(+), 208 deletions(-)
 create mode 100644 src/learn/sfen_stream.h

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 7b135b81..4a6f26dc 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -2,6 +2,7 @@
 
 #include "packed_sfen.h"
 #include "multi_think.h"
+#include "sfen_stream.h"
 #include "../syzygy/tbprobe.h"
 
 #include "misc.h"
@@ -38,107 +39,12 @@ using namespace std;
 
 namespace Learner
 {
-    enum struct SfenOutputType
-    {
-        Bin,
-        Binpack
-    };
-
     static bool write_out_draw_game_in_training_data_generation = true;
     static bool detect_draw_by_consecutive_low_score = true;
     static bool detect_draw_by_insufficient_mating_material = true;
 
     static SfenOutputType sfen_output_type = SfenOutputType::Bin;
 
-    static bool ends_with(const std::string& lhs, const std::string& end)
-    {
-        if (end.size() > lhs.size()) return false;
-
-        return std::equal(end.rbegin(), end.rend(), lhs.rbegin());
-    }
-
-    static std::string filename_with_extension(const std::string& filename, const std::string& ext)
-    {
-        if (ends_with(filename, ext))
-        {
-            return filename;
-        }
-        else
-        {
-            return filename + "." + ext;
-        }
-    }
-
-    struct BasicSfenOutputStream
-    {
-        virtual void write(const PSVector& sfens) = 0;
-        virtual ~BasicSfenOutputStream() {}
-    };
-
-    struct BinSfenOutputStream : BasicSfenOutputStream
-    {
-        static constexpr auto openmode = ios::out | ios::binary | ios::app;
-        static inline const std::string extension = "bin";
-
-        BinSfenOutputStream(std::string filename) :
-            m_stream(filename_with_extension(filename, extension), openmode)
-        {
-        }
-
-        void write(const PSVector& sfens) override
-        {
-            m_stream.write(reinterpret_cast<const char*>(sfens.data()), sizeof(PackedSfenValue) * sfens.size());
-        }
-
-        ~BinSfenOutputStream() override {}
-
-    private:
-        fstream m_stream;
-    };
-
-    struct BinpackSfenOutputStream : BasicSfenOutputStream
-    {
-        static constexpr auto openmode = ios::out | ios::binary | ios::app;
-        static inline const std::string extension = "binpack";
-
-        BinpackSfenOutputStream(std::string filename) :
-            m_stream(filename_with_extension(filename, extension), openmode)
-        {
-        }
-
-        void write(const PSVector& sfens) override
-        {
-            static_assert(sizeof(binpack::nodchip::PackedSfenValue) == sizeof(PackedSfenValue));
-
-            for(auto& sfen : sfens)
-            {
-                // The library uses a type that's different but layout-compatibile.
-                binpack::nodchip::PackedSfenValue e;
-                std::memcpy(&e, &sfen, sizeof(binpack::nodchip::PackedSfenValue));
-                m_stream.addTrainingDataEntry(binpack::packedSfenValueToTrainingDataEntry(e));
-            }
-        }
-
-        ~BinpackSfenOutputStream() override {}
-
-    private:
-        binpack::CompressedTrainingDataEntryWriter m_stream;
-    };
-
-    static std::unique_ptr<BasicSfenOutputStream> create_new_sfen_output(const std::string& filename)
-    {
-        switch(sfen_output_type)
-        {
-            case SfenOutputType::Bin:
-                return std::make_unique<BinSfenOutputStream>(filename);
-            case SfenOutputType::Binpack:
-                return std::make_unique<BinpackSfenOutputStream>(filename);
-        }
-
-        assert(false);
-        return nullptr;
-    }
-
     // Helper class for exporting Sfen
     struct SfenWriter
     {
@@ -155,7 +61,7 @@ namespace Learner
             sfen_buffers_pool.reserve((size_t)thread_num * 10);
             sfen_buffers.resize(thread_num);
 
-            output_file_stream = create_new_sfen_output(filename_);
+            output_file_stream = create_new_sfen_output(filename_, sfen_output_type);
             filename = filename_;
 
             finished = false;
@@ -283,7 +189,7 @@ namespace Learner
                             // Add ios::app in consideration of overwriting.
                             // (Depending on the operation, it may not be necessary.)
                             string new_filename = filename + "_" + std::to_string(n);
-                            output_file_stream = create_new_sfen_output(new_filename);
+                            output_file_stream = create_new_sfen_output(new_filename, sfen_output_type);
                             cout << endl << "output sfen file = " << new_filename << endl;
                         }
 
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 452bd15f..6c865d98 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -21,6 +21,7 @@
 
 #include "convert.h"
 #include "multi_think.h"
+#include "sfen_stream.h"
 
 #include "misc.h"
 #include "position.h"
@@ -30,8 +31,6 @@
 #include "search.h"
 #include "timeman.h"
 
-#include "extra/nnue_data_binpack_format.h"
-
 #include "nnue/evaluate_nnue.h"
 #include "nnue/evaluate_nnue_learner.h"
 
@@ -286,115 +285,6 @@ namespace Learner
         return calc_grad((Value)psv.score, shallow, psv);
     }
 
-    struct BasicSfenInputStream
-    {
-        virtual std::optional<PackedSfenValue> next() = 0;
-        virtual bool eof() const = 0;
-        virtual ~BasicSfenInputStream() {}
-    };
-
-    struct BinSfenInputStream : BasicSfenInputStream
-    {
-        static constexpr auto openmode = ios::in | ios::binary;
-        static inline const std::string extension = "bin";
-
-        BinSfenInputStream(std::string filename) :
-            m_stream(filename, openmode),
-            m_eof(!m_stream)
-        {
-        }
-
-        std::optional<PackedSfenValue> next() override
-        {
-            PackedSfenValue e;
-            if(m_stream.read(reinterpret_cast<char*>(&e), sizeof(PackedSfenValue)))
-            {
-                return e;
-            }
-            else
-            {
-                m_eof = true;
-                return std::nullopt;
-            }
-        }
-
-        bool eof() const override
-        {
-            return m_eof;
-        }
-
-        ~BinSfenInputStream() override {}
-
-    private:
-        fstream m_stream;
-        bool m_eof;
-    };
-
-    struct BinpackSfenInputStream : BasicSfenInputStream
-    {
-        static constexpr auto openmode = ios::in | ios::binary;
-        static inline const std::string extension = "binpack";
-
-        BinpackSfenInputStream(std::string filename) :
-            m_stream(filename, openmode),
-            m_eof(!m_stream.hasNext())
-        {
-        }
-
-        std::optional<PackedSfenValue> next() override
-        {
-            static_assert(sizeof(binpack::nodchip::PackedSfenValue) == sizeof(PackedSfenValue));
-
-            if (!m_stream.hasNext())
-            {
-                m_eof = true;
-                return std::nullopt;
-            }
-
-            auto training_data_entry = m_stream.next();
-            auto v = binpack::trainingDataEntryToPackedSfenValue(training_data_entry);
-            PackedSfenValue psv;
-            // same layout, different types. One is from generic library.
-            std::memcpy(&psv, &v, sizeof(PackedSfenValue));
-
-            return psv;
-        }
-
-        bool eof() const override
-        {
-            return m_eof;
-        }
-
-        ~BinpackSfenInputStream() override {}
-
-    private:
-        binpack::CompressedTrainingDataEntryReader m_stream;
-        bool m_eof;
-    };
-
-    static bool ends_with(const std::string& lhs, const std::string& end)
-    {
-        if (end.size() > lhs.size()) return false;
-
-        return std::equal(end.rbegin(), end.rend(), lhs.rbegin());
-    }
-
-    static bool has_extension(const std::string& filename, const std::string& extension)
-    {
-        return ends_with(filename, "." + extension);
-    }
-
-    static std::unique_ptr<BasicSfenInputStream> open_sfen_input_file(const std::string& filename)
-    {
-        if (has_extension(filename, BinSfenInputStream::extension))
-            return std::make_unique<BinSfenInputStream>(filename);
-        else if (has_extension(filename, BinpackSfenInputStream::extension))
-            return std::make_unique<BinpackSfenInputStream>(filename);
-
-        assert(false);
-        return nullptr;
-    }
-
     // Sfen reader
     struct SfenReader
     {
diff --git a/src/learn/sfen_stream.h b/src/learn/sfen_stream.h
new file mode 100644
index 00000000..4d44901b
--- /dev/null
+++ b/src/learn/sfen_stream.h
@@ -0,0 +1,213 @@
+#ifndef _SFEN_STREAM_H_
+#define _SFEN_STREAM_H_
+
+#include "packed_sfen.h"
+
+#include "extra/nnue_data_binpack_format.h"
+
+#include <optional>
+#include <fstream>
+#include <string>
+#include <memory>
+
+namespace Learner {
+
+    enum struct SfenOutputType
+    {
+        Bin,
+        Binpack
+    };
+
+    static bool ends_with(const std::string& lhs, const std::string& end)
+    {
+        if (end.size() > lhs.size()) return false;
+
+        return std::equal(end.rbegin(), end.rend(), lhs.rbegin());
+    }
+
+    static bool has_extension(const std::string& filename, const std::string& extension)
+    {
+        return ends_with(filename, "." + extension);
+    }
+
+    static std::string filename_with_extension(const std::string& filename, const std::string& ext)
+    {
+        if (ends_with(filename, ext))
+        {
+            return filename;
+        }
+        else
+        {
+            return filename + "." + ext;
+        }
+    }
+
+    struct BasicSfenInputStream
+    {
+        virtual std::optional<PackedSfenValue> next() = 0;
+        virtual bool eof() const = 0;
+        virtual ~BasicSfenInputStream() {}
+    };
+
+    struct BinSfenInputStream : BasicSfenInputStream
+    {
+        static constexpr auto openmode = std::ios::in | std::ios::binary;
+        static inline const std::string extension = "bin";
+
+        BinSfenInputStream(std::string filename) :
+            m_stream(filename, openmode),
+            m_eof(!m_stream)
+        {
+        }
+
+        std::optional<PackedSfenValue> next() override
+        {
+            PackedSfenValue e;
+            if(m_stream.read(reinterpret_cast<char*>(&e), sizeof(PackedSfenValue)))
+            {
+                return e;
+            }
+            else
+            {
+                m_eof = true;
+                return std::nullopt;
+            }
+        }
+
+        bool eof() const override
+        {
+            return m_eof;
+        }
+
+        ~BinSfenInputStream() override {}
+
+    private:
+        std::fstream m_stream;
+        bool m_eof;
+    };
+
+    struct BinpackSfenInputStream : BasicSfenInputStream
+    {
+        static constexpr auto openmode = std::ios::in | std::ios::binary;
+        static inline const std::string extension = "binpack";
+
+        BinpackSfenInputStream(std::string filename) :
+            m_stream(filename, openmode),
+            m_eof(!m_stream.hasNext())
+        {
+        }
+
+        std::optional<PackedSfenValue> next() override
+        {
+            static_assert(sizeof(binpack::nodchip::PackedSfenValue) == sizeof(PackedSfenValue));
+
+            if (!m_stream.hasNext())
+            {
+                m_eof = true;
+                return std::nullopt;
+            }
+
+            auto training_data_entry = m_stream.next();
+            auto v = binpack::trainingDataEntryToPackedSfenValue(training_data_entry);
+            PackedSfenValue psv;
+            // same layout, different types. One is from generic library.
+            std::memcpy(&psv, &v, sizeof(PackedSfenValue));
+
+            return psv;
+        }
+
+        bool eof() const override
+        {
+            return m_eof;
+        }
+
+        ~BinpackSfenInputStream() override {}
+
+    private:
+        binpack::CompressedTrainingDataEntryReader m_stream;
+        bool m_eof;
+    };
+
+    struct BasicSfenOutputStream
+    {
+        virtual void write(const PSVector& sfens) = 0;
+        virtual ~BasicSfenOutputStream() {}
+    };
+
+    struct BinSfenOutputStream : BasicSfenOutputStream
+    {
+        static constexpr auto openmode = std::ios::out | std::ios::binary | std::ios::app;
+        static inline const std::string extension = "bin";
+
+        BinSfenOutputStream(std::string filename) :
+            m_stream(filename_with_extension(filename, extension), openmode)
+        {
+        }
+
+        void write(const PSVector& sfens) override
+        {
+            m_stream.write(reinterpret_cast<const char*>(sfens.data()), sizeof(PackedSfenValue) * sfens.size());
+        }
+
+        ~BinSfenOutputStream() override {}
+
+    private:
+        std::fstream m_stream;
+    };
+
+    struct BinpackSfenOutputStream : BasicSfenOutputStream
+    {
+        static constexpr auto openmode = std::ios::out | std::ios::binary | std::ios::app;
+        static inline const std::string extension = "binpack";
+
+        BinpackSfenOutputStream(std::string filename) :
+            m_stream(filename_with_extension(filename, extension), openmode)
+        {
+        }
+
+        void write(const PSVector& sfens) override
+        {
+            static_assert(sizeof(binpack::nodchip::PackedSfenValue) == sizeof(PackedSfenValue));
+
+            for(auto& sfen : sfens)
+            {
+                // The library uses a type that's different but layout-compatibile.
+                binpack::nodchip::PackedSfenValue e;
+                std::memcpy(&e, &sfen, sizeof(binpack::nodchip::PackedSfenValue));
+                m_stream.addTrainingDataEntry(binpack::packedSfenValueToTrainingDataEntry(e));
+            }
+        }
+
+        ~BinpackSfenOutputStream() override {}
+
+    private:
+        binpack::CompressedTrainingDataEntryWriter m_stream;
+    };
+
+    inline std::unique_ptr<BasicSfenInputStream> open_sfen_input_file(const std::string& filename)
+    {
+        if (has_extension(filename, BinSfenInputStream::extension))
+            return std::make_unique<BinSfenInputStream>(filename);
+        else if (has_extension(filename, BinpackSfenInputStream::extension))
+            return std::make_unique<BinpackSfenInputStream>(filename);
+
+        assert(false);
+        return nullptr;
+    }
+
+    inline std::unique_ptr<BasicSfenOutputStream> create_new_sfen_output(const std::string& filename, SfenOutputType sfen_output_type)
+    {
+        switch(sfen_output_type)
+        {
+            case SfenOutputType::Bin:
+                return std::make_unique<BinSfenOutputStream>(filename);
+            case SfenOutputType::Binpack:
+                return std::make_unique<BinpackSfenOutputStream>(filename);
+        }
+
+        assert(false);
+        return nullptr;
+    }
+}
+
+#endif
\ No newline at end of file

From 904adb9a32f2d96f5ffb99fb5d44912adecaa518 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 19:53:41 +0200
Subject: [PATCH 330/583] Indentation consistency in learn folder

---
 src/learn/gensfen.cpp     |   3 +-
 src/learn/half_float.h    | 178 +++++-----
 src/learn/learn.cpp       |  16 +-
 src/learn/multi_think.cpp | 144 ++++----
 src/learn/multi_think.h   | 188 +++++------
 src/learn/sfen_packer.cpp | 690 +++++++++++++++++++-------------------
 6 files changed, 607 insertions(+), 612 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 4a6f26dc..1a9187ae 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -3,7 +3,6 @@
 #include "packed_sfen.h"
 #include "multi_think.h"
 #include "sfen_stream.h"
-#include "../syzygy/tbprobe.h"
 
 #include "misc.h"
 #include "position.h"
@@ -73,7 +72,7 @@ namespace Learner
             file_worker_thread.join();
             output_file_stream.reset();
 
-#if defined(_DEBUG)
+#if !defined(NDEBUG)
             {
                 // All buffers should be empty since file_worker_thread
                 // should have written everything before exiting.
diff --git a/src/learn/half_float.h b/src/learn/half_float.h
index ebe77526..5808a786 100644
--- a/src/learn/half_float.h
+++ b/src/learn/half_float.h
@@ -11,122 +11,122 @@
 
 namespace HalfFloat
 {
-	// IEEE 754 float 32 format is :
-	//   sign(1bit) + exponent(8bits) + fraction(23bits) = 32bits
-	//
-	// Our float16 format is :
-	//   sign(1bit) + exponent(5bits) + fraction(10bits) = 16bits
-	union float32_converter
-	{
-		int32_t n;
-		float f;
-	};
+    // IEEE 754 float 32 format is :
+    //   sign(1bit) + exponent(8bits) + fraction(23bits) = 32bits
+    //
+    // Our float16 format is :
+    //   sign(1bit) + exponent(5bits) + fraction(10bits) = 16bits
+    union float32_converter
+    {
+        int32_t n;
+        float f;
+    };
 
 
-	// 16-bit float
-	struct float16
-	{
-		// --- constructors
+    // 16-bit float
+    struct float16
+    {
+        // --- constructors
 
-		float16() {}
-		float16(int16_t n) { from_float((float)n);  }
-		float16(int32_t n) { from_float((float)n); }
-		float16(float n) { from_float(n); }
-		float16(double n) { from_float((float)n); }
+        float16() {}
+        float16(int16_t n) { from_float((float)n);  }
+        float16(int32_t n) { from_float((float)n); }
+        float16(float n) { from_float(n); }
+        float16(double n) { from_float((float)n); }
 
-		// build from a float
-		void from_float(float f) { *this = to_float16(f); }
+        // build from a float
+        void from_float(float f) { *this = to_float16(f); }
 
-		// --- implicit converters
+        // --- implicit converters
 
-		operator int32_t() const { return (int32_t)to_float(*this); }
-		operator float() const { return to_float(*this); }
-		operator double() const { return double(to_float(*this)); }
+        operator int32_t() const { return (int32_t)to_float(*this); }
+        operator float() const { return to_float(*this); }
+        operator double() const { return double(to_float(*this)); }
 
-		// --- operators
+        // --- operators
 
-		float16 operator += (float16 rhs) { from_float(to_float(*this) + to_float(rhs)); return *this; }
-		float16 operator -= (float16 rhs) { from_float(to_float(*this) - to_float(rhs)); return *this; }
-		float16 operator *= (float16 rhs) { from_float(to_float(*this) * to_float(rhs)); return *this; }
-		float16 operator /= (float16 rhs) { from_float(to_float(*this) / to_float(rhs)); return *this; }
-		float16 operator + (float16 rhs) const { return float16(*this) += rhs; }
-		float16 operator - (float16 rhs) const { return float16(*this) -= rhs; }
-		float16 operator * (float16 rhs) const { return float16(*this) *= rhs; }
-		float16 operator / (float16 rhs) const { return float16(*this) /= rhs; }
-		float16 operator - () const { return float16(-to_float(*this)); }
-		bool operator == (float16 rhs) const { return this->v_ == rhs.v_; }
-		bool operator != (float16 rhs) const { return !(*this == rhs); }
+        float16 operator += (float16 rhs) { from_float(to_float(*this) + to_float(rhs)); return *this; }
+        float16 operator -= (float16 rhs) { from_float(to_float(*this) - to_float(rhs)); return *this; }
+        float16 operator *= (float16 rhs) { from_float(to_float(*this) * to_float(rhs)); return *this; }
+        float16 operator /= (float16 rhs) { from_float(to_float(*this) / to_float(rhs)); return *this; }
+        float16 operator + (float16 rhs) const { return float16(*this) += rhs; }
+        float16 operator - (float16 rhs) const { return float16(*this) -= rhs; }
+        float16 operator * (float16 rhs) const { return float16(*this) *= rhs; }
+        float16 operator / (float16 rhs) const { return float16(*this) /= rhs; }
+        float16 operator - () const { return float16(-to_float(*this)); }
+        bool operator == (float16 rhs) const { return this->v_ == rhs.v_; }
+        bool operator != (float16 rhs) const { return !(*this == rhs); }
 
-		static void UnitTest() { unit_test(); }
+        static void UnitTest() { unit_test(); }
 
-	private:
+    private:
 
-		// --- entity
+        // --- entity
 
-		uint16_t v_;
+        uint16_t v_;
 
-		// --- conversion between float and float16
+        // --- conversion between float and float16
 
-		static float16 to_float16(float f)
-		{
-			float32_converter c;
-			c.f = f;
-			u32 n = c.n;
+        static float16 to_float16(float f)
+        {
+            float32_converter c;
+            c.f = f;
+            u32 n = c.n;
 
-			// The sign bit is MSB in common.
-			uint16_t sign_bit = (n >> 16) & 0x8000;
+            // The sign bit is MSB in common.
+            uint16_t sign_bit = (n >> 16) & 0x8000;
 
-			// The exponent of IEEE 754's float 32 is biased +127 , so we change this bias into +15 and limited to 5-bit.
-			uint16_t exponent = (((n >> 23) - 127 + 15) & 0x1f) << 10;
+            // The exponent of IEEE 754's float 32 is biased +127 , so we change this bias into +15 and limited to 5-bit.
+            uint16_t exponent = (((n >> 23) - 127 + 15) & 0x1f) << 10;
 
-			// The fraction is limited to 10-bit.
-			uint16_t fraction = (n >> (23-10)) & 0x3ff;
+            // The fraction is limited to 10-bit.
+            uint16_t fraction = (n >> (23-10)) & 0x3ff;
 
-			float16 f_;
-			f_.v_ = sign_bit | exponent | fraction;
+            float16 f_;
+            f_.v_ = sign_bit | exponent | fraction;
 
-			return f_;
-		}
+            return f_;
+        }
 
-		static float to_float(float16 v)
-		{
-			u32 sign_bit = (v.v_ & 0x8000) << 16;
-			u32 exponent = ((((v.v_ >> 10) & 0x1f) - 15 + 127) & 0xff) << 23;
-			u32 fraction = (v.v_ & 0x3ff) << (23 - 10);
+        static float to_float(float16 v)
+        {
+            u32 sign_bit = (v.v_ & 0x8000) << 16;
+            u32 exponent = ((((v.v_ >> 10) & 0x1f) - 15 + 127) & 0xff) << 23;
+            u32 fraction = (v.v_ & 0x3ff) << (23 - 10);
 
-			float32_converter c;
-			c.n = sign_bit | exponent | fraction;
-			return c.f;
-		}
+            float32_converter c;
+            c.n = sign_bit | exponent | fraction;
+            return c.f;
+        }
 
-		// It is not a unit test, but I confirmed that it can be calculated. I'll fix the code later (maybe).
-		static void unit_test()
-		{
-			float16 a, b, c, d;
-			a = 1;
-			std::cout << (float)a << std::endl;
-			b = -118.625;
-			std::cout << (float)b << std::endl;
-			c = 2.5;
-			std::cout << (float)c << std::endl;
-			d = a + c;
-			std::cout << (float)d << std::endl;
+        // It is not a unit test, but I confirmed that it can be calculated. I'll fix the code later (maybe).
+        static void unit_test()
+        {
+            float16 a, b, c, d;
+            a = 1;
+            std::cout << (float)a << std::endl;
+            b = -118.625;
+            std::cout << (float)b << std::endl;
+            c = 2.5;
+            std::cout << (float)c << std::endl;
+            d = a + c;
+            std::cout << (float)d << std::endl;
 
-			c *= 1.5;
-			std::cout << (float)c << std::endl;
+            c *= 1.5;
+            std::cout << (float)c << std::endl;
 
-			b /= 3;
-			std::cout << (float)b << std::endl;
+            b /= 3;
+            std::cout << (float)b << std::endl;
 
-			float f1 = 1.5;
-			a += f1;
-			std::cout << (float)a << std::endl;
+            float f1 = 1.5;
+            a += f1;
+            std::cout << (float)a << std::endl;
 
-			a += f1 * (float)a;
-			std::cout << (float)a << std::endl;
-		}
+            a += f1 * (float)a;
+            std::cout << (float)a << std::endl;
+        }
 
-	};
+    };
 
 }
 
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 6c865d98..b09700e9 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -1066,14 +1066,14 @@ namespace Learner
 
             pos.do_move((Move)ps.move, state[ply++]);
 
-			// There is a possibility that all the pieces are blocked and stuck.
-			// Also, the declaration win phase is excluded from
-			// learning because you cannot go to leaf with PV moves.
-			// (shouldn't write out such teacher aspect itself,
-			// but may have written it out with an old generation routine)
-			// Skip the position if there are no legal moves (=checkmated or stalemate).
-			if (MoveList<LEGAL>(pos).size() == 0)
-				goto RETRY_READ;
+            // There is a possibility that all the pieces are blocked and stuck.
+            // Also, the declaration win phase is excluded from
+            // learning because you cannot go to leaf with PV moves.
+            // (shouldn't write out such teacher aspect itself,
+            // but may have written it out with an old generation routine)
+            // Skip the position if there are no legal moves (=checkmated or stalemate).
+            if (MoveList<LEGAL>(pos).size() == 0)
+                goto RETRY_READ;
 
             // Evaluation value of shallow search (qsearch)
             const auto [_, pv] = Search::qsearch(pos);
diff --git a/src/learn/multi_think.cpp b/src/learn/multi_think.cpp
index daed3e96..d2ae65eb 100644
--- a/src/learn/multi_think.cpp
+++ b/src/learn/multi_think.cpp
@@ -1,103 +1,103 @@
 ﻿#include "multi_think.h"
 
-#include "nnue/evaluate_nnue.h"
-
 #include "tt.h"
 #include "uci.h"
 #include "types.h"
 #include "search.h"
 
+#include "nnue/evaluate_nnue.h"
+
 #include <thread>
 
 void MultiThink::go_think()
 {
-	// Read evaluation function, etc.
-	// In the case of the learn command, the value of the evaluation function may be corrected after reading the evaluation function, so
-	// Skip memory corruption check.
-	Eval::NNUE::init();
+    // Read evaluation function, etc.
+    // In the case of the learn command, the value of the evaluation function may be corrected after reading the evaluation function, so
+    // Skip memory corruption check.
+    Eval::NNUE::init();
 
-	// Call the derived class's init().
-	init();
+    // Call the derived class's init().
+    init();
 
-	// The loop upper limit is set with set_loop_max().
-	loop_count = 0;
-	done_count = 0;
+    // The loop upper limit is set with set_loop_max().
+    loop_count = 0;
+    done_count = 0;
 
-	// Create threads as many as Options["Threads"] and start thinking.
-	std::vector<std::thread> threads;
-	auto thread_num = (size_t)Options["Threads"];
+    // Create threads as many as Options["Threads"] and start thinking.
+    std::vector<std::thread> threads;
+    auto thread_num = (size_t)Options["Threads"];
 
-	// Secure end flag of worker thread
+    // Secure end flag of worker thread
         threads_finished=0;
 
-	// start worker thread
-	for (size_t i = 0; i < thread_num; ++i)
-	{
-		threads.push_back(std::thread([i, this]
-		{
-			// exhaust all processor threads.
-			WinProcGroup::bindThisThread(i);
+    // start worker thread
+    for (size_t i = 0; i < thread_num; ++i)
+    {
+        threads.push_back(std::thread([i, this]
+        {
+            // exhaust all processor threads.
+            WinProcGroup::bindThisThread(i);
 
-			// execute the overridden process
-			this->thread_worker(i);
+            // execute the overridden process
+            this->thread_worker(i);
 
-			// Set the end flag because the thread has ended
-			this->threads_finished++;
-		}));
-	}
+            // Set the end flag because the thread has ended
+            this->threads_finished++;
+        }));
+    }
 
-	// wait for all threads to finish
-	// for (auto& th :threads)
-	// th.join();
-	// If you write like, the thread will rush here while it is still working,
-	// During that time, callback_func() cannot be called and you cannot save.
-	// Therefore, you need to check the end flag yourself.
+    // wait for all threads to finish
+    // for (auto& th :threads)
+    // th.join();
+    // If you write like, the thread will rush here while it is still working,
+    // During that time, callback_func() cannot be called and you cannot save.
+    // Therefore, you need to check the end flag yourself.
 
-	// function to determine if all threads have finished
-	auto threads_done = [&]()
-	{
-		return threads_finished == thread_num;
-	};
+    // function to determine if all threads have finished
+    auto threads_done = [&]()
+    {
+        return threads_finished == thread_num;
+    };
 
-	// Call back if the callback function is set.
-	auto do_a_callback = [&]()
-	{
-		if (callback_func)
-			callback_func();
-	};
+    // Call back if the callback function is set.
+    auto do_a_callback = [&]()
+    {
+        if (callback_func)
+            callback_func();
+    };
 
 
-	for (uint64_t i = 0 ; ; )
-	{
-		// If all threads have finished, exit the loop.
-		if (threads_done())
-			break;
+    for (uint64_t i = 0 ; ; )
+    {
+        // If all threads have finished, exit the loop.
+        if (threads_done())
+            break;
 
-		sleep(1000);
+        sleep(1000);
 
-		// callback_func() is called every callback_seconds.
-		if (++i == callback_seconds)
-		{
-			do_a_callback();
-			// Since I am returning from ↑, I reset the counter, so
-			// no matter how long it takes to save() etc. in do_a_callback()
-			// The next call will take a certain amount of time.
-			i = 0;
-		}
-	}
+        // callback_func() is called every callback_seconds.
+        if (++i == callback_seconds)
+        {
+            do_a_callback();
+            // Since I am returning from ↑, I reset the counter, so
+            // no matter how long it takes to save() etc. in do_a_callback()
+            // The next call will take a certain amount of time.
+            i = 0;
+        }
+    }
 
-	// Last save.
-	std::cout << std::endl << "finalize..";
+    // Last save.
+    std::cout << std::endl << "finalize..";
 
-	// do_a_callback();
-	// → It should be saved by the caller, so I feel that it is not necessary here.
+    // do_a_callback();
+    // → It should be saved by the caller, so I feel that it is not necessary here.
 
-	// It is possible that the exit code of the thread is running but the exit code of the thread is running, so
-	// We need to wait for the end with join().
-	for (auto& th : threads)
-		th.join();
+    // It is possible that the exit code of the thread is running but the exit code of the thread is running, so
+    // We need to wait for the end with join().
+    for (auto& th : threads)
+        th.join();
 
-	// The file writing thread etc. are still running only when all threads are finished
-	// Since the work itself may not have completed, output only that all threads have finished.
-	std::cout << "all threads are joined." << std::endl;
+    // The file writing thread etc. are still running only when all threads are finished
+    // Since the work itself may not have completed, output only that all threads have finished.
+    std::cout << "all threads are joined." << std::endl;
 }
diff --git a/src/learn/multi_think.h b/src/learn/multi_think.h
index e6c436f8..7e541909 100644
--- a/src/learn/multi_think.h
+++ b/src/learn/multi_think.h
@@ -19,84 +19,84 @@
 // Derive and use this class.
 struct MultiThink
 {
-	static constexpr std::uint64_t LOOP_COUNT_FINISHED = std::numeric_limits<std::uint64_t>::max();
+    static constexpr std::uint64_t LOOP_COUNT_FINISHED = std::numeric_limits<std::uint64_t>::max();
 
-	MultiThink() : prng{}, loop_count(0) { }
+    MultiThink() : prng{}, loop_count(0) { }
 
-	MultiThink(std::uint64_t seed) : prng(seed), loop_count(0) { }
+    MultiThink(std::uint64_t seed) : prng(seed), loop_count(0) { }
 
-	MultiThink(const std::string& seed) : prng(seed), loop_count(0) { }
+    MultiThink(const std::string& seed) : prng(seed), loop_count(0) { }
 
-	// Call this function from the master thread, each thread will think,
-	// Return control when the thought ending condition is satisfied.
-	// Do something else.
-	// ・It is safe for each thread to call Learner::search(),qsearch()
-	// Separates the substitution table for each thread. (It will be restored after the end.)
-	// ・Book is not thread safe when in on the fly mode, so temporarily change this mode.
-	// Turn it off.
-	// [Requirements]
-	// 1) Override thread_worker()
-	// 2) Set the loop count with set_loop_max()
-	// 3) set a function to be called back periodically (if necessary)
-	// callback_func and callback_interval
-	void go_think();
+    // Call this function from the master thread, each thread will think,
+    // Return control when the thought ending condition is satisfied.
+    // Do something else.
+    // ・It is safe for each thread to call Learner::search(),qsearch()
+    // Separates the substitution table for each thread. (It will be restored after the end.)
+    // ・Book is not thread safe when in on the fly mode, so temporarily change this mode.
+    // Turn it off.
+    // [Requirements]
+    // 1) Override thread_worker()
+    // 2) Set the loop count with set_loop_max()
+    // 3) set a function to be called back periodically (if necessary)
+    // callback_func and callback_interval
+    void go_think();
 
-	// If there is something you want to initialize on the derived class side, override this,
-	// Called when initialization is completed with go_think().
-	// It is better to read the fixed trace at that timing.
-	virtual void init() {}
+    // If there is something you want to initialize on the derived class side, override this,
+    // Called when initialization is completed with go_think().
+    // It is better to read the fixed trace at that timing.
+    virtual void init() {}
 
-	// A thread worker that is called by creating a thread when you go_think()
-	// Override and use this.
-	virtual void thread_worker(size_t thread_id) = 0;
+    // A thread worker that is called by creating a thread when you go_think()
+    // Override and use this.
+    virtual void thread_worker(size_t thread_id) = 0;
 
-	// Called back every callback_seconds [seconds] when go_think().
-	std::function<void()> callback_func;
-	uint64_t callback_seconds = 600;
+    // Called back every callback_seconds [seconds] when go_think().
+    std::function<void()> callback_func;
+    uint64_t callback_seconds = 600;
 
-	// Set the number of times worker processes (calls Search::think()).
-	void set_loop_max(uint64_t loop_max_) { loop_max = loop_max_; }
+    // Set the number of times worker processes (calls Search::think()).
+    void set_loop_max(uint64_t loop_max_) { loop_max = loop_max_; }
 
-	// Get the value set by set_loop_max().
-	uint64_t get_loop_max() const { return loop_max; }
+    // Get the value set by set_loop_max().
+    uint64_t get_loop_max() const { return loop_max; }
 
-	// [ASYNC] Take the value of the loop counter and add the loop counter after taking it out.
-	// If the loop counter has reached loop_max, return UINT64_MAX.
-	// If you want to generate a phase, you must call this function at the time of generating the phase,
-	// Please note that the number of generated phases and the value of the counter will not match.
-	uint64_t get_next_loop_count() {
-		std::unique_lock<std::mutex> lk(loop_mutex);
-		if (loop_count >= loop_max)
-			return LOOP_COUNT_FINISHED;
-		return loop_count++;
-	}
+    // [ASYNC] Take the value of the loop counter and add the loop counter after taking it out.
+    // If the loop counter has reached loop_max, return UINT64_MAX.
+    // If you want to generate a phase, you must call this function at the time of generating the phase,
+    // Please note that the number of generated phases and the value of the counter will not match.
+    uint64_t get_next_loop_count() {
+        std::unique_lock<std::mutex> lk(loop_mutex);
+        if (loop_count >= loop_max)
+            return LOOP_COUNT_FINISHED;
+        return loop_count++;
+    }
 
-	// [ASYNC] For returning the processed number. Each time it is called, it returns a counter that is incremented.
-	uint64_t get_done_count() {
-		std::unique_lock<std::mutex> lk(loop_mutex);
-		return ++done_count;
-	}
+    // [ASYNC] For returning the processed number. Each time it is called, it returns a counter that is incremented.
+    uint64_t get_done_count() {
+        std::unique_lock<std::mutex> lk(loop_mutex);
+        return ++done_count;
+    }
 
-	// Mutex when worker thread accesses I/O
-	std::mutex io_mutex;
+    // Mutex when worker thread accesses I/O
+    std::mutex io_mutex;
 
 protected:
-	// Random number generator body
-	AsyncPRNG prng;
+    // Random number generator body
+    AsyncPRNG prng;
 
 private:
-	// number of times worker processes (calls Search::think())
-	std::atomic<uint64_t> loop_max;
-	// number of times the worker has processed (calls Search::think())
-	std::atomic<uint64_t> loop_count;
-	// To return the number of times it has been processed.
-	std::atomic<uint64_t> done_count;
+    // number of times worker processes (calls Search::think())
+    std::atomic<uint64_t> loop_max;
+    // number of times the worker has processed (calls Search::think())
+    std::atomic<uint64_t> loop_count;
+    // To return the number of times it has been processed.
+    std::atomic<uint64_t> done_count;
 
-	// Mutex when changing the variables in ↑
-	std::mutex loop_mutex;
+    // Mutex when changing the variables in ↑
+    std::mutex loop_mutex;
 
-	// Thread end flag.
-        std::atomic<uint64_t> threads_finished;
+    // Thread end flag.
+    std::atomic<uint64_t> threads_finished;
 };
 
 // Mechanism to process task during idle time.
@@ -105,48 +105,48 @@ private:
 // Convenient to use when you want to write MultiThink thread worker in master-slave method.
 struct TaskDispatcher
 {
-	typedef std::function<void(size_t /* thread_id */)> Task;
+    typedef std::function<void(size_t /* thread_id */)> Task;
 
-	// slave calls this function during idle.
-	void on_idle(size_t thread_id)
-	{
-		Task task;
-		while ((task = get_task_async()) != nullptr)
-			task(thread_id);
+    // slave calls this function during idle.
+    void on_idle(size_t thread_id)
+    {
+        Task task;
+        while ((task = get_task_async()) != nullptr)
+            task(thread_id);
 
-		sleep(1);
-	}
+        sleep(1);
+    }
 
-	// Stack [ASYNC] task.
-	void push_task_async(Task task)
-	{
-		std::unique_lock<std::mutex> lk(task_mutex);
-		tasks.push_back(task);
-	}
+    // Stack [ASYNC] task.
+    void push_task_async(Task task)
+    {
+        std::unique_lock<std::mutex> lk(task_mutex);
+        tasks.push_back(task);
+    }
 
-	// Allocate size array elements for task in advance.
-	void task_reserve(size_t size)
-	{
-		tasks.reserve(size);
-	}
+    // Allocate size array elements for task in advance.
+    void task_reserve(size_t size)
+    {
+        tasks.reserve(size);
+    }
 
 protected:
-	// set of tasks
-	std::vector<Task> tasks;
+    // set of tasks
+    std::vector<Task> tasks;
 
-	// Take out one [ASYNC] task. Called from on_idle().
-	Task get_task_async()
-	{
-		std::unique_lock<std::mutex> lk(task_mutex);
-		if (tasks.size() == 0)
-			return nullptr;
-		Task task = *tasks.rbegin();
-		tasks.pop_back();
-		return task;
-	}
+    // Take out one [ASYNC] task. Called from on_idle().
+    Task get_task_async()
+    {
+        std::unique_lock<std::mutex> lk(task_mutex);
+        if (tasks.size() == 0)
+            return nullptr;
+        Task task = *tasks.rbegin();
+        tasks.pop_back();
+        return task;
+    }
 
-	// a mutex for accessing tasks
-	std::mutex task_mutex;
+    // a mutex for accessing tasks
+    std::mutex task_mutex;
 };
 
 #endif
diff --git a/src/learn/sfen_packer.cpp b/src/learn/sfen_packer.cpp
index 2de7efa4..777b5943 100644
--- a/src/learn/sfen_packer.cpp
+++ b/src/learn/sfen_packer.cpp
@@ -13,378 +13,374 @@ using namespace std;
 
 namespace Learner {
 
-  // Class that handles bitstream
-  // useful when doing aspect encoding
-  struct BitStream
-  {
-    // Set the memory to store the data in advance.
-    // Assume that memory is cleared to 0.
-    void set_data(std::uint8_t* data_) { data = data_; reset(); }
-
-    // Get the pointer passed in set_data().
-    uint8_t* get_data() const { return data; }
-
-    // Get the cursor.
-    int get_cursor() const { return bit_cursor; }
-
-    // reset the cursor
-    void reset() { bit_cursor = 0; }
-
-    // Write 1bit to the stream.
-    // If b is non-zero, write out 1. If 0, write 0.
-    void write_one_bit(int b)
+    // Class that handles bitstream
+    // useful when doing aspect encoding
+    struct BitStream
     {
-      if (b)
-        data[bit_cursor / 8] |= 1 << (bit_cursor & 7);
+        // Set the memory to store the data in advance.
+        // Assume that memory is cleared to 0.
+        void set_data(std::uint8_t* data_) { data = data_; reset(); }
 
-      ++bit_cursor;
-    }
+        // Get the pointer passed in set_data().
+        uint8_t* get_data() const { return data; }
 
-    // Get 1 bit from the stream.
-    int read_one_bit()
+        // Get the cursor.
+        int get_cursor() const { return bit_cursor; }
+
+        // reset the cursor
+        void reset() { bit_cursor = 0; }
+
+        // Write 1bit to the stream.
+        // If b is non-zero, write out 1. If 0, write 0.
+        void write_one_bit(int b)
+        {
+            if (b)
+                data[bit_cursor / 8] |= 1 << (bit_cursor & 7);
+
+            ++bit_cursor;
+        }
+
+        // Get 1 bit from the stream.
+        int read_one_bit()
+        {
+            int b = (data[bit_cursor / 8] >> (bit_cursor & 7)) & 1;
+            ++bit_cursor;
+
+            return b;
+        }
+
+        // write n bits of data
+        // Data shall be written out from the lower order of d.
+        void write_n_bit(int d, int n)
+        {
+            for (int i = 0; i <n; ++i)
+                write_one_bit(d & (1 << i));
+        }
+
+        // read n bits of data
+        // Reverse conversion of write_n_bit().
+        int read_n_bit(int n)
+        {
+            int result = 0;
+            for (int i = 0; i < n; ++i)
+                result |= read_one_bit() ? (1 << i) : 0;
+
+            return result;
+        }
+
+    private:
+        // Next bit position to read/write.
+        int bit_cursor;
+
+        // data entity
+        std::uint8_t* data;
+    };
+
+    // Class for compressing/decompressing sfen
+    // sfen can be packed to 256bit (32bytes) by Huffman coding.
+    // This is proven by mini. The above is Huffman coding.
+    //
+    // Internal format = 1-bit turn + 7-bit king position *2 + piece on board (Huffman coding) + hand piece (Huffman coding)
+    // Side to move (White = 0, Black = 1) (1bit)
+    // White King Position (6 bits)
+    // Black King Position (6 bits)
+    // Huffman Encoding of the board
+    // Castling availability (1 bit x 4)
+    // En passant square (1 or 1 + 6 bits)
+    // Rule 50 (6 bits)
+    // Game play (8 bits)
+    //
+    // TODO(someone): Rename SFEN to FEN.
+    //
+    struct SfenPacker
     {
-      int b = (data[bit_cursor / 8] >> (bit_cursor & 7)) & 1;
-      ++bit_cursor;
+        void pack(const Position& pos);
 
-      return b;
-    }
+        // sfen packed by pack() (256bit = 32bytes)
+        // Or sfen to decode with unpack()
+        uint8_t *data; // uint8_t[32];
 
-    // write n bits of data
-    // Data shall be written out from the lower order of d.
-    void write_n_bit(int d, int n)
+        BitStream stream;
+
+        // Output the board pieces to stream.
+        void write_board_piece_to_stream(Piece pc);
+
+        // Read one board piece from stream
+        Piece read_board_piece_from_stream();
+    };
+
+
+    // Huffman coding
+    // * is simplified from mini encoding to make conversion easier.
+    //
+    // Huffman Encoding
+    //
+    // Empty  xxxxxxx0
+    // Pawn   xxxxx001 + 1 bit (Color)
+    // Knight xxxxx011 + 1 bit (Color)
+    // Bishop xxxxx101 + 1 bit (Color)
+    // Rook   xxxxx111 + 1 bit (Color)
+    // Queen   xxxx1001 + 1 bit (Color)
+    //
+    // Worst case:
+    // - 32 empty squares    32 bits
+    // - 30 pieces           150 bits
+    // - 2 kings             12 bits
+    // - castling rights     4 bits
+    // - ep square           7 bits
+    // - rule50              7 bits
+    // - game ply            16 bits
+    // - TOTAL               228 bits < 256 bits
+
+    struct HuffmanedPiece
     {
-      for (int i = 0; i <n; ++i)
-        write_one_bit(d & (1 << i));
-    }
+        int code; // how it will be coded
+        int bits; // How many bits do you have
+    };
 
-    // read n bits of data
-    // Reverse conversion of write_n_bit().
-    int read_n_bit(int n)
+    constexpr HuffmanedPiece huffman_table[] =
     {
-      int result = 0;
-      for (int i = 0; i < n; ++i)
-        result |= read_one_bit() ? (1 << i) : 0;
+        {0b0000,1}, // NO_PIECE
+        {0b0001,4}, // PAWN
+        {0b0011,4}, // KNIGHT
+        {0b0101,4}, // BISHOP
+        {0b0111,4}, // ROOK
+        {0b1001,4}, // QUEEN
+    };
 
-      return result;
+    // Pack sfen and store in data[32].
+    void SfenPacker::pack(const Position& pos)
+    {
+        memset(data, 0, 32 /* 256bit */);
+        stream.set_data(data);
+
+        // turn
+        // Side to move.
+        stream.write_one_bit((int)(pos.side_to_move()));
+
+        // 7-bit positions for leading and trailing balls
+        // White king and black king, 6 bits for each.
+        for(auto c: Colors)
+            stream.write_n_bit(pos.king_square(c), 6);
+
+        // Write the pieces on the board other than the kings.
+        for (Rank r = RANK_8; r >= RANK_1; --r)
+        {
+            for (File f = FILE_A; f <= FILE_H; ++f)
+            {
+                Piece pc = pos.piece_on(make_square(f, r));
+                if (type_of(pc) == KING)
+                    continue;
+                write_board_piece_to_stream(pc);
+            }
+        }
+
+        // TODO(someone): Support chess960.
+        stream.write_one_bit(pos.can_castle(WHITE_OO));
+        stream.write_one_bit(pos.can_castle(WHITE_OOO));
+        stream.write_one_bit(pos.can_castle(BLACK_OO));
+        stream.write_one_bit(pos.can_castle(BLACK_OOO));
+
+        if (pos.ep_square() == SQ_NONE) {
+            stream.write_one_bit(0);
+        }
+        else {
+            stream.write_one_bit(1);
+            stream.write_n_bit(static_cast<int>(pos.ep_square()), 6);
+        }
+
+        stream.write_n_bit(pos.state()->rule50, 6);
+
+        const int fm = 1 + (pos.game_ply()-(pos.side_to_move() == BLACK)) / 2;
+        stream.write_n_bit(fm, 8);
+
+        // Write high bits of half move. This is a fix for the
+        // limited range of half move counter.
+        // This is backwards compatibile.
+        stream.write_n_bit(fm >> 8, 8);
+
+        // Write the highest bit of rule50 at the end. This is a backwards
+        // compatibile fix for rule50 having only 6 bits stored.
+        // This bit is just ignored by the old parsers.
+        stream.write_n_bit(pos.state()->rule50 >> 6, 1);
+
+        assert(stream.get_cursor() <= 256);
     }
 
-  private:
-    // Next bit position to read/write.
-    int bit_cursor;
-
-    // data entity
-    std::uint8_t* data;
-  };
-
-  // Class for compressing/decompressing sfen
-  // sfen can be packed to 256bit (32bytes) by Huffman coding.
-  // This is proven by mini. The above is Huffman coding.
-  //
-  // Internal format = 1-bit turn + 7-bit king position *2 + piece on board (Huffman coding) + hand piece (Huffman coding)
-  // Side to move (White = 0, Black = 1) (1bit)
-  // White King Position (6 bits)
-  // Black King Position (6 bits)
-  // Huffman Encoding of the board
-  // Castling availability (1 bit x 4)
-  // En passant square (1 or 1 + 6 bits)
-  // Rule 50 (6 bits)
-  // Game play (8 bits)
-  //
-  // TODO(someone): Rename SFEN to FEN.
-  //
-  struct SfenPacker
-  {
-    void pack(const Position& pos);
-
-    // sfen packed by pack() (256bit = 32bytes)
-    // Or sfen to decode with unpack()
-    uint8_t *data; // uint8_t[32];
-
-    BitStream stream;
-
     // Output the board pieces to stream.
-    void write_board_piece_to_stream(Piece pc);
+    void SfenPacker::write_board_piece_to_stream(Piece pc)
+    {
+        // piece type
+        PieceType pr = type_of(pc);
+        auto c = huffman_table[pr];
+        stream.write_n_bit(c.code, c.bits);
+
+        if (pc == NO_PIECE)
+            return;
+
+        // first and second flag
+        stream.write_one_bit(color_of(pc));
+    }
 
     // Read one board piece from stream
-    Piece read_board_piece_from_stream();
-  };
-
-
-  // Huffman coding
-  // * is simplified from mini encoding to make conversion easier.
-  //
-  // Huffman Encoding
-  //
-  // Empty  xxxxxxx0
-  // Pawn   xxxxx001 + 1 bit (Color)
-  // Knight xxxxx011 + 1 bit (Color)
-  // Bishop xxxxx101 + 1 bit (Color)
-  // Rook   xxxxx111 + 1 bit (Color)
-  // Queen   xxxx1001 + 1 bit (Color)
-  //
-  // Worst case:
-  // - 32 empty squares    32 bits
-  // - 30 pieces           150 bits
-  // - 2 kings             12 bits
-  // - castling rights     4 bits
-  // - ep square           7 bits
-  // - rule50              7 bits
-  // - game ply            16 bits
-  // - TOTAL               228 bits < 256 bits
-
-  struct HuffmanedPiece
-  {
-    int code; // how it will be coded
-    int bits; // How many bits do you have
-  };
-
-  constexpr HuffmanedPiece huffman_table[] =
-  {
-    {0b0000,1}, // NO_PIECE
-    {0b0001,4}, // PAWN
-    {0b0011,4}, // KNIGHT
-    {0b0101,4}, // BISHOP
-    {0b0111,4}, // ROOK
-    {0b1001,4}, // QUEEN
-  };
-
-  // Pack sfen and store in data[32].
-  void SfenPacker::pack(const Position& pos)
-  {
-  // cout << pos;
-
-    memset(data, 0, 32 /* 256bit */);
-    stream.set_data(data);
-
-    // turn
-    // Side to move.
-    stream.write_one_bit((int)(pos.side_to_move()));
-
-    // 7-bit positions for leading and trailing balls
-    // White king and black king, 6 bits for each.
-    for(auto c: Colors)
-      stream.write_n_bit(pos.king_square(c), 6);
-
-    // Write the pieces on the board other than the kings.
-    for (Rank r = RANK_8; r >= RANK_1; --r)
+    Piece SfenPacker::read_board_piece_from_stream()
     {
-      for (File f = FILE_A; f <= FILE_H; ++f)
-      {
-        Piece pc = pos.piece_on(make_square(f, r));
-        if (type_of(pc) == KING)
-          continue;
-        write_board_piece_to_stream(pc);
-      }
-    }
-
-    // TODO(someone): Support chess960.
-    stream.write_one_bit(pos.can_castle(WHITE_OO));
-    stream.write_one_bit(pos.can_castle(WHITE_OOO));
-    stream.write_one_bit(pos.can_castle(BLACK_OO));
-    stream.write_one_bit(pos.can_castle(BLACK_OOO));
-
-    if (pos.ep_square() == SQ_NONE) {
-      stream.write_one_bit(0);
-    }
-    else {
-      stream.write_one_bit(1);
-      stream.write_n_bit(static_cast<int>(pos.ep_square()), 6);
-    }
-
-    stream.write_n_bit(pos.state()->rule50, 6);
-
-    const int fm = 1 + (pos.game_ply()-(pos.side_to_move() == BLACK)) / 2;
-    stream.write_n_bit(fm, 8);
-
-    // Write high bits of half move. This is a fix for the
-    // limited range of half move counter.
-    // This is backwards compatibile.
-    stream.write_n_bit(fm >> 8, 8);
-
-    // Write the highest bit of rule50 at the end. This is a backwards
-    // compatibile fix for rule50 having only 6 bits stored.
-    // This bit is just ignored by the old parsers.
-    stream.write_n_bit(pos.state()->rule50 >> 6, 1);
-
-    assert(stream.get_cursor() <= 256);
-  }
-
-  // Output the board pieces to stream.
-  void SfenPacker::write_board_piece_to_stream(Piece pc)
-  {
-    // piece type
-    PieceType pr = type_of(pc);
-    auto c = huffman_table[pr];
-    stream.write_n_bit(c.code, c.bits);
-
-    if (pc == NO_PIECE)
-      return;
-
-    // first and second flag
-    stream.write_one_bit(color_of(pc));
-  }
-
-  // Read one board piece from stream
-  Piece SfenPacker::read_board_piece_from_stream()
-  {
-    PieceType pr = NO_PIECE_TYPE;
-    int code = 0, bits = 0;
-    while (true)
-    {
-      code |= stream.read_one_bit() << bits;
-      ++bits;
-
-      assert(bits <= 6);
-
-      for (pr = NO_PIECE_TYPE; pr <KING; ++pr)
-        if (huffman_table[pr].code == code
-          && huffman_table[pr].bits == bits)
-          goto Found;
-    }
-  Found:;
-    if (pr == NO_PIECE_TYPE)
-      return NO_PIECE;
-
-    // first and second flag
-    Color c = (Color)stream.read_one_bit();
-
-    return make_piece(c, pr);
-  }
-
-  int set_from_packed_sfen(Position& pos, const PackedSfen& sfen, StateInfo* si, Thread* th)
-  {
-    SfenPacker packer;
-    auto& stream = packer.stream;
-
-    // TODO: separate streams for writing and reading. Here we actually have to
-    // const_cast which is not safe in the long run.
-    stream.set_data(const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(&sfen)));
-
-    pos.clear();
-    std::memset(si, 0, sizeof(StateInfo));
-    std::fill_n(&pos.pieceList[0][0], sizeof(pos.pieceList) / sizeof(Square), SQ_NONE);
-    pos.st = si;
-
-    // Active color
-    pos.sideToMove = (Color)stream.read_one_bit();
-
-    pos.pieceList[W_KING][0] = SQUARE_NB;
-    pos.pieceList[B_KING][0] = SQUARE_NB;
-
-    // First the position of the ball
-    for (auto c : Colors)
-      pos.board[stream.read_n_bit(6)] = make_piece(c, KING);
-
-    // Piece placement
-    for (Rank r = RANK_8; r >= RANK_1; --r)
-    {
-      for (File f = FILE_A; f <= FILE_H; ++f)
-      {
-        auto sq = make_square(f, r);
-
-        // it seems there are already balls
-        Piece pc;
-        if (type_of(pos.board[sq]) != KING)
+        PieceType pr = NO_PIECE_TYPE;
+        int code = 0, bits = 0;
+        while (true)
         {
-          assert(pos.board[sq] == NO_PIECE);
-          pc = packer.read_board_piece_from_stream();
+            code |= stream.read_one_bit() << bits;
+            ++bits;
+
+            assert(bits <= 6);
+
+            for (pr = NO_PIECE_TYPE; pr <KING; ++pr)
+                if (huffman_table[pr].code == code
+                    && huffman_table[pr].bits == bits)
+                    goto Found;
         }
-        else
+    Found:;
+        if (pr == NO_PIECE_TYPE)
+            return NO_PIECE;
+
+        // first and second flag
+        Color c = (Color)stream.read_one_bit();
+
+        return make_piece(c, pr);
+    }
+
+    int set_from_packed_sfen(Position& pos, const PackedSfen& sfen, StateInfo* si, Thread* th)
+    {
+        SfenPacker packer;
+        auto& stream = packer.stream;
+
+        // TODO: separate streams for writing and reading. Here we actually have to
+        // const_cast which is not safe in the long run.
+        stream.set_data(const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(&sfen)));
+
+        pos.clear();
+        std::memset(si, 0, sizeof(StateInfo));
+        std::fill_n(&pos.pieceList[0][0], sizeof(pos.pieceList) / sizeof(Square), SQ_NONE);
+        pos.st = si;
+
+        // Active color
+        pos.sideToMove = (Color)stream.read_one_bit();
+
+        pos.pieceList[W_KING][0] = SQUARE_NB;
+        pos.pieceList[B_KING][0] = SQUARE_NB;
+
+        // First the position of the ball
+        for (auto c : Colors)
+            pos.board[stream.read_n_bit(6)] = make_piece(c, KING);
+
+        // Piece placement
+        for (Rank r = RANK_8; r >= RANK_1; --r)
         {
-          pc = pos.board[sq];
-          // put_piece() will catch ASSERT unless you remove it all.
-          pos.board[sq] = NO_PIECE;
+            for (File f = FILE_A; f <= FILE_H; ++f)
+            {
+                auto sq = make_square(f, r);
+
+                // it seems there are already balls
+                Piece pc;
+                if (type_of(pos.board[sq]) != KING)
+                {
+                    assert(pos.board[sq] == NO_PIECE);
+                    pc = packer.read_board_piece_from_stream();
+                }
+                else
+                {
+                    pc = pos.board[sq];
+                    // put_piece() will catch ASSERT unless you remove it all.
+                    pos.board[sq] = NO_PIECE;
+                }
+
+                // There may be no pieces, so skip in that case.
+                if (pc == NO_PIECE)
+                    continue;
+
+                pos.put_piece(Piece(pc), sq);
+
+                if (stream.get_cursor()> 256)
+                    return 1;
+            }
         }
 
-        // There may be no pieces, so skip in that case.
-        if (pc == NO_PIECE)
-          continue;
+        // Castling availability.
+        // TODO(someone): Support chess960.
+        pos.st->castlingRights = 0;
+        if (stream.read_one_bit()) {
+            Square rsq;
+            for (rsq = relative_square(WHITE, SQ_H1); pos.piece_on(rsq) != W_ROOK; --rsq) {}
+            pos.set_castling_right(WHITE, rsq);
+        }
+        if (stream.read_one_bit()) {
+            Square rsq;
+            for (rsq = relative_square(WHITE, SQ_A1); pos.piece_on(rsq) != W_ROOK; ++rsq) {}
+            pos.set_castling_right(WHITE, rsq);
+        }
+        if (stream.read_one_bit()) {
+            Square rsq;
+            for (rsq = relative_square(BLACK, SQ_H1); pos.piece_on(rsq) != B_ROOK; --rsq) {}
+            pos.set_castling_right(BLACK, rsq);
+        }
+        if (stream.read_one_bit()) {
+            Square rsq;
+            for (rsq = relative_square(BLACK, SQ_A1); pos.piece_on(rsq) != B_ROOK; ++rsq) {}
+            pos.set_castling_right(BLACK, rsq);
+        }
 
-        pos.put_piece(Piece(pc), sq);
+        // En passant square. Ignore if no pawn capture is possible
+        if (stream.read_one_bit()) {
+            Square ep_square = static_cast<Square>(stream.read_n_bit(6));
+            pos.st->epSquare = ep_square;
 
-        if (stream.get_cursor()> 256)
-          return 1;
+            if (!(pos.attackers_to(pos.st->epSquare) & pos.pieces(pos.sideToMove, PAWN))
+                || !(pos.pieces(~pos.sideToMove, PAWN) & (pos.st->epSquare + pawn_push(~pos.sideToMove))))
+                pos.st->epSquare = SQ_NONE;
+        }
+        else {
+            pos.st->epSquare = SQ_NONE;
+        }
 
-        //assert(stream.get_cursor() <= 256);
-      }
+        // Halfmove clock
+        pos.st->rule50 = stream.read_n_bit(6);
+
+        // Fullmove number
+        pos.gamePly = stream.read_n_bit(8);
+
+        // Read the highest bit of rule50. This was added as a fix for rule50
+        // counter having only 6 bits stored.
+        // In older entries this will just be a zero bit.
+        pos.gamePly |= stream.read_n_bit(8) << 8;
+
+        // Read the highest bit of rule50. This was added as a fix for rule50
+        // counter having only 6 bits stored.
+        // In older entries this will just be a zero bit.
+        pos.st->rule50 |= stream.read_n_bit(1) << 6;
+
+        // Convert from fullmove starting from 1 to gamePly starting from 0,
+        // handle also common incorrect FEN with fullmove = 0.
+        pos.gamePly = std::max(2 * (pos.gamePly - 1), 0) + (pos.sideToMove == BLACK);
+
+        assert(stream.get_cursor() <= 256);
+
+        pos.chess960 = false;
+        pos.thisThread = th;
+        pos.set_state(pos.st);
+
+        assert(pos.pos_is_ok());
+
+        return 0;
     }
 
-    // Castling availability.
-    // TODO(someone): Support chess960.
-    pos.st->castlingRights = 0;
-    if (stream.read_one_bit()) {
-      Square rsq;
-      for (rsq = relative_square(WHITE, SQ_H1); pos.piece_on(rsq) != W_ROOK; --rsq) {}
-      pos.set_castling_right(WHITE, rsq);
+    PackedSfen sfen_pack(Position& pos)
+    {
+        PackedSfen sfen;
+
+        SfenPacker sp;
+        sp.data = (uint8_t*)&sfen;
+        sp.pack(pos);
+
+        return sfen;
     }
-    if (stream.read_one_bit()) {
-      Square rsq;
-      for (rsq = relative_square(WHITE, SQ_A1); pos.piece_on(rsq) != W_ROOK; ++rsq) {}
-      pos.set_castling_right(WHITE, rsq);
-    }
-    if (stream.read_one_bit()) {
-      Square rsq;
-      for (rsq = relative_square(BLACK, SQ_H1); pos.piece_on(rsq) != B_ROOK; --rsq) {}
-      pos.set_castling_right(BLACK, rsq);
-    }
-    if (stream.read_one_bit()) {
-      Square rsq;
-      for (rsq = relative_square(BLACK, SQ_A1); pos.piece_on(rsq) != B_ROOK; ++rsq) {}
-      pos.set_castling_right(BLACK, rsq);
-    }
-
-    // En passant square. Ignore if no pawn capture is possible
-    if (stream.read_one_bit()) {
-      Square ep_square = static_cast<Square>(stream.read_n_bit(6));
-      pos.st->epSquare = ep_square;
-
-      if (!(pos.attackers_to(pos.st->epSquare) & pos.pieces(pos.sideToMove, PAWN))
-        || !(pos.pieces(~pos.sideToMove, PAWN) & (pos.st->epSquare + pawn_push(~pos.sideToMove))))
-        pos.st->epSquare = SQ_NONE;
-    }
-    else {
-      pos.st->epSquare = SQ_NONE;
-    }
-
-    // Halfmove clock
-    pos.st->rule50 = stream.read_n_bit(6);
-
-    // Fullmove number
-    pos.gamePly = stream.read_n_bit(8);
-
-    // Read the highest bit of rule50. This was added as a fix for rule50
-    // counter having only 6 bits stored.
-    // In older entries this will just be a zero bit.
-    pos.gamePly |= stream.read_n_bit(8) << 8;
-
-    // Read the highest bit of rule50. This was added as a fix for rule50
-    // counter having only 6 bits stored.
-    // In older entries this will just be a zero bit.
-    pos.st->rule50 |= stream.read_n_bit(1) << 6;
-
-    // Convert from fullmove starting from 1 to gamePly starting from 0,
-    // handle also common incorrect FEN with fullmove = 0.
-    pos.gamePly = std::max(2 * (pos.gamePly - 1), 0) + (pos.sideToMove == BLACK);
-
-    assert(stream.get_cursor() <= 256);
-
-    pos.chess960 = false;
-    pos.thisThread = th;
-    pos.set_state(pos.st);
-
-    assert(pos.pos_is_ok());
-
-    return 0;
-  }
-
-  PackedSfen sfen_pack(Position& pos)
-  {
-    PackedSfen sfen;
-
-    SfenPacker sp;
-    sp.data = (uint8_t*)&sfen;
-    sp.pack(pos);
-
-    return sfen;
-  }
 }

From 5856237e3f397ae2db2a0c69e5648386507b019f Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 19:59:31 +0200
Subject: [PATCH 331/583] Rename hirate to startpos

---
 src/learn/learn.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index b09700e9..d0e84945 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -768,12 +768,11 @@ namespace Learner
         atomic<int> move_accord_count;
         move_accord_count = 0;
 
-        // Display the value of eval() in the initial stage of Hirate and see the shaking.
         auto th = Threads[thread_id];
         auto& pos = th->rootPos;
         StateInfo si;
         pos.set(StartFEN, false, &si, th);
-        cout << "hirate eval = " << Eval::evaluate(pos) << endl;
+        cout << "startpos eval = " << Eval::evaluate(pos) << endl;
 
         // It's better to parallelize here, but it's a bit
         // troublesome because the search before slave has not finished.

From e503cc4ea80920d96be58f11c36c828076b380de Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 19:59:41 +0200
Subject: [PATCH 332/583] Add one more empty line between progress reports.

---
 src/learn/learn.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index d0e84945..0fce5d95 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -903,6 +903,10 @@ namespace Learner
                     << " , learn_entropy = " << learn_sum_entropy / done
                     << endl;
             }
+
+            // Bigger space between progress reports so that they can be more
+            // easly disinguished. Looking for timestamps is hard.
+            cout << endl;
         }
         else
         {

From 5db46d0c82a12f834e65d6464e43b9aa346d3b3f Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 20:20:10 +0200
Subject: [PATCH 333/583] Verify whether there is a network being used during
 training.

---
 src/learn/gensfen.cpp      |  2 +-
 src/learn/learn.cpp        | 46 ++++++++++++++++++++------------------
 src/learn/multi_think.cpp  |  5 -----
 src/nnue/evaluate_nnue.cpp | 30 ++++++++++++++++++++++++-
 src/nnue/evaluate_nnue.h   |  3 ++-
 src/search.cpp             |  2 +-
 src/uci.cpp                |  2 +-
 7 files changed, 58 insertions(+), 32 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 1a9187ae..22fddafb 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -1000,7 +1000,7 @@ namespace Learner
             << "  detect_draw_by_insufficient_mating_material = " << detect_draw_by_insufficient_mating_material << endl;
 
         // Show if the training data generator uses NNUE.
-        Eval::NNUE::verify();
+        Eval::NNUE::verify_eval_file_loaded();
 
         Threads.main()->ponder = false;
 
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 0fce5d95..a0a8ec07 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -1486,6 +1486,27 @@ namespace Learner
         std::cout << "..shuffle_on_memory done." << std::endl;
     }
 
+    static void set_learning_search_limits()
+    {
+        // About Search::Limits
+        // Be careful because this member variable is global and affects other threads.
+        auto& limits = Search::Limits;
+
+        limits.startTime = now();
+
+        // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done)
+        limits.infinite = true;
+
+        // Since PV is an obstacle when displayed, erase it.
+        limits.silent = true;
+
+        // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
+        limits.nodes = 0;
+
+        // depth is also processed by the one passed as an argument of Learner::search().
+        limits.depth = 0;
+    }
+
     // Learning from the generated game record
     void learn(Position&, istringstream& is)
     {
@@ -1837,30 +1858,9 @@ namespace Learner
 
         cout << "init.." << endl;
 
-        // Read evaluation function parameters
-        Eval::NNUE::init();
-
         Threads.main()->ponder = false;
 
-        // About Search::Limits
-        // Be careful because this member variable is global and affects other threads.
-        {
-          auto& limits = Search::Limits;
-
-          limits.startTime = now();
-
-          // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done)
-          limits.infinite = true;
-
-          // Since PV is an obstacle when displayed, erase it.
-          limits.silent = true;
-
-          // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
-          limits.nodes = 0;
-
-          // depth is also processed by the one passed as an argument of Learner::search().
-          limits.depth = 0;
-        }
+        set_learning_search_limits();
 
         cout << "init_training.." << endl;
         Eval::NNUE::InitializeTraining(seed);
@@ -1907,6 +1907,8 @@ namespace Learner
             sr.read_validation_set(validation_set_file_name, eval_limit);
         }
 
+        Eval::NNUE::verify_any_net_loaded();
+
         // Calculate rmse once at this point (timing of 0 sfen)
         // sr.calc_rmse();
 
diff --git a/src/learn/multi_think.cpp b/src/learn/multi_think.cpp
index d2ae65eb..bf1ab29b 100644
--- a/src/learn/multi_think.cpp
+++ b/src/learn/multi_think.cpp
@@ -11,11 +11,6 @@
 
 void MultiThink::go_think()
 {
-    // Read evaluation function, etc.
-    // In the case of the learn command, the value of the evaluation function may be corrected after reading the evaluation function, so
-    // Skip memory corruption check.
-    Eval::NNUE::init();
-
     // Call the derived class's init().
     init();
 
diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index f7f9adcc..e3a7be63 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -235,6 +235,7 @@ namespace Eval::NNUE {
             else
             {
                 sync_cout << "info string ERROR: failed to load eval file " << directory + eval_file << sync_endl;
+                eval_file_loaded.clear();
             }
         }
 
@@ -243,7 +244,7 @@ namespace Eval::NNUE {
   }
 
   /// NNUE::verify() verifies that the last net used was loaded successfully
-  void verify() {
+  void verify_eval_file_loaded() {
 
     std::string eval_file = std::string(Options["EvalFile"]);
 
@@ -273,4 +274,31 @@ namespace Eval::NNUE {
         sync_cout << "info string classical evaluation enabled" << sync_endl;
   }
 
+  /// In training we override eval file so this is useful.
+  void verify_any_net_loaded() {
+
+    if (useNNUE != UseNNUEMode::False && eval_file_loaded.empty())
+    {
+        UCI::OptionsMap defaults;
+        UCI::init(defaults);
+
+        std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
+        std::string msg2 = "The option is set to true, but the network file was not loaded successfully.";
+        std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
+        std::string msg5 = "The engine will be terminated now.";
+
+        sync_cout << "info string ERROR: " << msg1 << sync_endl;
+        sync_cout << "info string ERROR: " << msg2 << sync_endl;
+        sync_cout << "info string ERROR: " << msg3 << sync_endl;
+        sync_cout << "info string ERROR: " << msg5 << sync_endl;
+
+        std::exit(EXIT_FAILURE);
+    }
+
+    if (useNNUE != UseNNUEMode::False)
+        sync_cout << "info string NNUE evaluation using " << eval_file_loaded << " enabled" << sync_endl;
+    else
+        sync_cout << "info string classical evaluation enabled" << sync_endl;
+  }
+
 } // namespace Eval::NNUE
diff --git a/src/nnue/evaluate_nnue.h b/src/nnue/evaluate_nnue.h
index dcfa071d..5335713b 100644
--- a/src/nnue/evaluate_nnue.h
+++ b/src/nnue/evaluate_nnue.h
@@ -96,7 +96,8 @@ namespace Eval::NNUE {
   Value evaluate(const Position& pos);
   bool load_eval(std::string name, std::istream& stream);
   void init();
-  void verify();
+  void verify_eval_file_loaded();
+  void verify_any_net_loaded();
 
 }  // namespace Eval::NNUE
 
diff --git a/src/search.cpp b/src/search.cpp
index 79848812..436e11fd 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -219,7 +219,7 @@ void MainThread::search() {
   Time.init(Limits, us, rootPos.game_ply());
   TT.new_search();
 
-  Eval::NNUE::verify();
+  Eval::NNUE::verify_eval_file_loaded();
 
   if (rootMoves.empty())
   {
diff --git a/src/uci.cpp b/src/uci.cpp
index ff735b2e..896f6db8 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -101,7 +101,7 @@ namespace {
     Position p;
     p.set(pos.fen(), Options["UCI_Chess960"], &states->back(), Threads.main());
 
-    Eval::NNUE::verify();
+    Eval::NNUE::verify_eval_file_loaded();
 
     sync_cout << "\n" << Eval::trace(p) << sync_endl;
   }

From 3cf193a90eb400b0bea0dfd562cf41ba8a2d420b Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 16 Oct 2020 19:37:53 +0200
Subject: [PATCH 334/583] Properly handle cases in verify and init when
 SkipLoadingEval is set.

---
 src/nnue/evaluate_nnue.cpp | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index e3a7be63..4d8a4b66 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -186,11 +186,6 @@ namespace Eval::NNUE {
 
     Initialize();
 
-    if (Options["SkipLoadingEval"])
-    {
-      std::cout << "info string SkipLoadingEval set to true, Net not loaded!" << std::endl;
-      return true;
-    }
     fileName = name;
     return ReadParameters(stream);
   }
@@ -210,8 +205,12 @@ namespace Eval::NNUE {
   void init() {
 
     useNNUE = nnue_mode_from_option(Options["Use NNUE"]);
-    if (useNNUE == UseNNUEMode::False)
-        return;
+
+    if (Options["SkipLoadingEval"] || useNNUE == UseNNUEMode::False)
+    {
+      eval_file_loaded.clear();
+      return;
+    }
 
     std::string eval_file = std::string(Options["EvalFile"]);
 
@@ -277,7 +276,7 @@ namespace Eval::NNUE {
   /// In training we override eval file so this is useful.
   void verify_any_net_loaded() {
 
-    if (useNNUE != UseNNUEMode::False && eval_file_loaded.empty())
+    if (!Options["SkipLoadingEval"] && useNNUE != UseNNUEMode::False && eval_file_loaded.empty())
     {
         UCI::OptionsMap defaults;
         UCI::init(defaults);

From c93f8732bfcdebcb23518c3ffe58ce9a5356cfac Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 16 Oct 2020 19:40:05 +0200
Subject: [PATCH 335/583] Force Use NNUE to pure when learning.

---
 src/learn/learn.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index a0a8ec07..95cbe4bb 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -1685,7 +1685,6 @@ namespace Learner
             else if (option == "seed") is >> seed;
             else if (option == "set_recommended_uci_options")
             {
-                UCI::setoption("Use NNUE", "pure");
                 UCI::setoption("MultiPV", "1");
                 UCI::setoption("Contempt", "0");
                 UCI::setoption("Skill Level", "20");
@@ -1907,6 +1906,9 @@ namespace Learner
             sr.read_validation_set(validation_set_file_name, eval_limit);
         }
 
+        cout << "Forcing Use NNUE pure.\n";
+        UCI::setoption("Use NNUE", "pure");
+
         Eval::NNUE::verify_any_net_loaded();
 
         // Calculate rmse once at this point (timing of 0 sfen)

From ca760c3a5b78e74a06f3790492e91281a6c1159c Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 20:24:26 +0200
Subject: [PATCH 336/583] Cleanup architecture files.

---
 .../architectures/halfkp-cr-ep_256x2-32-32.h  | 66 +++++++++----------
 .../architectures/halfkp-cr_256x2-32-32.h     | 38 +++++------
 src/nnue/architectures/halfkp_256x2-32-32.h   | 60 ++++++++---------
 src/nnue/architectures/halfkp_384x2-32-32.h   | 44 ++++++-------
 .../architectures/k-p-cr-ep_256x2-32-32.h     | 36 +++++-----
 src/nnue/architectures/k-p-cr_256x2-32-32.h   | 34 +++++-----
 src/nnue/architectures/k-p_256x2-32-32.h      | 44 ++++++-------
 7 files changed, 153 insertions(+), 169 deletions(-)

diff --git a/src/nnue/architectures/halfkp-cr-ep_256x2-32-32.h b/src/nnue/architectures/halfkp-cr-ep_256x2-32-32.h
index a90de8e6..6327b78a 100644
--- a/src/nnue/architectures/halfkp-cr-ep_256x2-32-32.h
+++ b/src/nnue/architectures/halfkp-cr-ep_256x2-32-32.h
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 // Definition of input features and network structure used in NNUE evaluation function
@@ -21,36 +21,36 @@
 #ifndef NNUE_HALFKP_CR_EP_256X2_32_32_H_INCLUDED
 #define NNUE_HALFKP_CR_EP_256X2_32_32_H_INCLUDED
 
-#include "../features/feature_set.h"
-#include "../features/half_kp.h"
-#include "../features/castling_right.h"
-#include "../features/enpassant.h"
+#include "nnue/features/feature_set.h"
+#include "nnue/features/half_kp.h"
+#include "nnue/features/castling_right.h"
+#include "nnue/features/enpassant.h"
 
-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"
 
 namespace Eval::NNUE {
 
-// Input features used in evaluation function
-using RawFeatures = Features::FeatureSet<
-    Features::HalfKP<Features::Side::kFriend>, Features::CastlingRight,
-    Features::EnPassant>;
+    // Input features used in evaluation function
+    using RawFeatures = Features::FeatureSet<
+        Features::HalfKP<Features::Side::kFriend>, Features::CastlingRight,
+        Features::EnPassant>;
 
-// Number of input feature dimensions after conversion
-constexpr IndexType kTransformedFeatureDimensions = 256;
+    // Number of input feature dimensions after conversion
+    constexpr IndexType kTransformedFeatureDimensions = 256;
 
-namespace Layers {
+    namespace Layers {
 
-// Define network structure
-using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+        // Define network structure
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
 
-}  // namespace Layers
+    }  // namespace Layers
 
-using Network = Layers::OutputLayer;
+    using Network = Layers::OutputLayer;
 
 }  // namespace Eval::NNUE
 
diff --git a/src/nnue/architectures/halfkp-cr_256x2-32-32.h b/src/nnue/architectures/halfkp-cr_256x2-32-32.h
index df14f499..dd587d1d 100644
--- a/src/nnue/architectures/halfkp-cr_256x2-32-32.h
+++ b/src/nnue/architectures/halfkp-cr_256x2-32-32.h
@@ -3,34 +3,34 @@
 #ifndef NNUE_HALFKP_CR_256X2_32_32_H_INCLUDED
 #define NNUE_HALFKP_CR_256X2_32_32_H_INCLUDED
 
-#include "../features/feature_set.h"
-#include "../features/half_kp.h"
-#include "../features/castling_right.h"
+#include "nnue/features/feature_set.h"
+#include "nnue/features/half_kp.h"
+#include "nnue/features/castling_right.h"
 
-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"
 
 namespace Eval::NNUE {
 
-// Input features used in evaluation function
-using RawFeatures = Features::FeatureSet<
-    Features::HalfKP<Features::Side::kFriend>, Features::CastlingRight>;
+    // Input features used in evaluation function
+    using RawFeatures = Features::FeatureSet<
+        Features::HalfKP<Features::Side::kFriend>, Features::CastlingRight>;
 
-// Number of input feature dimensions after conversion
-constexpr IndexType kTransformedFeatureDimensions = 256;
+    // Number of input feature dimensions after conversion
+    constexpr IndexType kTransformedFeatureDimensions = 256;
 
-namespace Layers {
+    namespace Layers {
 
-// Define network structure
-using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+        // Define network structure
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
 
-}  // namespace Layers
+    }  // namespace Layers
 
-using Network = Layers::OutputLayer;
+    using Network = Layers::OutputLayer;
 
 }  // namespace Eval::NNUE
 
diff --git a/src/nnue/architectures/halfkp_256x2-32-32.h b/src/nnue/architectures/halfkp_256x2-32-32.h
index 9216bd41..333feb83 100644
--- a/src/nnue/architectures/halfkp_256x2-32-32.h
+++ b/src/nnue/architectures/halfkp_256x2-32-32.h
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 // Definition of input features and network structure used in NNUE evaluation function
@@ -21,33 +21,33 @@
 #ifndef NNUE_HALFKP_256X2_32_32_H_INCLUDED
 #define NNUE_HALFKP_256X2_32_32_H_INCLUDED
 
-#include "../features/feature_set.h"
-#include "../features/half_kp.h"
+#include "nnue/features/feature_set.h"
+#include "nnue/features/half_kp.h"
 
-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"
 
 namespace Eval::NNUE {
 
-// Input features used in evaluation function
-using RawFeatures = Features::FeatureSet<
-    Features::HalfKP<Features::Side::kFriend>>;
+    // Input features used in evaluation function
+    using RawFeatures = Features::FeatureSet<
+        Features::HalfKP<Features::Side::kFriend>>;
 
-// Number of input feature dimensions after conversion
-constexpr IndexType kTransformedFeatureDimensions = 256;
+    // Number of input feature dimensions after conversion
+    constexpr IndexType kTransformedFeatureDimensions = 256;
 
-namespace Layers {
+    namespace Layers {
 
-// Define network structure
-using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+        // Define network structure
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
 
-}  // namespace Layers
+    }  // namespace Layers
 
-using Network = Layers::OutputLayer;
+    using Network = Layers::OutputLayer;
 
 }  // namespace Eval::NNUE
 
diff --git a/src/nnue/architectures/halfkp_384x2-32-32.h b/src/nnue/architectures/halfkp_384x2-32-32.h
index 3d28139a..96913295 100644
--- a/src/nnue/architectures/halfkp_384x2-32-32.h
+++ b/src/nnue/architectures/halfkp_384x2-32-32.h
@@ -3,37 +3,33 @@
 #ifndef HALFKP_384X2_32_32_H
 #define HALFKP_384X2_32_32_H
 
-#include "../features/feature_set.h"
-#include "../features/half_kp.h"
+#include "nnue/features/feature_set.h"
+#include "nnue/features/half_kp.h"
 
-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"
 
-namespace Eval {
+namespace Eval::NNUE {
 
-namespace NNUE {
+    // Input features used in evaluation function
+    using RawFeatures = Features::FeatureSet<
+        Features::HalfKP<Features::Side::kFriend>>;
 
-// Input features used in evaluation function
-using RawFeatures = Features::FeatureSet<
-    Features::HalfKP<Features::Side::kFriend>>;
+    // Number of input feature dimensions after conversion
+    constexpr IndexType kTransformedFeatureDimensions = 384;
 
-// Number of input feature dimensions after conversion
-constexpr IndexType kTransformedFeatureDimensions = 384;
+    namespace Layers {
 
-namespace Layers {
+        // define network structure
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
 
-// define network structure
-using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+    }  // namespace Layers
 
-}  // namespace Layers
+    using Network = Layers::OutputLayer;
 
-using Network = Layers::OutputLayer;
-
-}  // namespace NNUE
-
-}  // namespace Eval
+}  // namespace Eval::NNUE
 #endif // HALFKP_384X2_32_32_H
diff --git a/src/nnue/architectures/k-p-cr-ep_256x2-32-32.h b/src/nnue/architectures/k-p-cr-ep_256x2-32-32.h
index e178b57b..14eeba54 100644
--- a/src/nnue/architectures/k-p-cr-ep_256x2-32-32.h
+++ b/src/nnue/architectures/k-p-cr-ep_256x2-32-32.h
@@ -3,40 +3,36 @@
 #ifndef K_P_CR_EP_256X2_32_32_H
 #define K_P_CR_EP_256X2_32_32_H
 
-#include "../features/feature_set.h"
-#include "../features/k.h"
-#include "../features/p.h"
-#include "../features/castling_right.h"
-#include "../features/enpassant.h"
+#include "nnue/features/feature_set.h"
+#include "nnue/features/k.h"
+#include "nnue/features/p.h"
+#include "nnue/features/castling_right.h"
+#include "nnue/features/enpassant.h"
 
-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"
 
-namespace Eval {
-
-  namespace NNUE {
+namespace Eval::NNUE {
 
     // Input features used in evaluation function
     using RawFeatures = Features::FeatureSet<Features::K, Features::P,
-      Features::CastlingRight, Features::EnPassant>;
+        Features::CastlingRight, Features::EnPassant>;
 
     // Number of input feature dimensions after conversion
     constexpr IndexType kTransformedFeatureDimensions = 256;
 
     namespace Layers {
 
-      // define network structure
-      using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-      using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-      using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-      using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+        // define network structure
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
 
     }  // namespace Layers
 
     using Network = Layers::OutputLayer;
 
-  }  // namespace NNUE
-
-}  // namespace Eval
+}  // namespace Eval::NNUE
 #endif // K_P_CR_EP_256X2_32_32_H
diff --git a/src/nnue/architectures/k-p-cr_256x2-32-32.h b/src/nnue/architectures/k-p-cr_256x2-32-32.h
index d3c187c0..1db34b22 100644
--- a/src/nnue/architectures/k-p-cr_256x2-32-32.h
+++ b/src/nnue/architectures/k-p-cr_256x2-32-32.h
@@ -3,39 +3,35 @@
 #ifndef K_P_CR_256X2_32_32_H
 #define K_P_CR_256X2_32_32_H
 
-#include "../features/feature_set.h"
-#include "../features/k.h"
-#include "../features/p.h"
-#include "../features/castling_right.h"
+#include "nnue/features/feature_set.h"
+#include "nnue/features/k.h"
+#include "nnue/features/p.h"
+#include "nnue/features/castling_right.h"
 
-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"
 
-namespace Eval {
-
-  namespace NNUE {
+namespace Eval::NNUE {
 
     // Input features used in evaluation function
     using RawFeatures = Features::FeatureSet<Features::K, Features::P,
-      Features::CastlingRight>;
+        Features::CastlingRight>;
 
     // Number of input feature dimensions after conversion
     constexpr IndexType kTransformedFeatureDimensions = 256;
 
     namespace Layers {
 
-      // define network structure
-      using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-      using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-      using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-      using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+        // define network structure
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
 
     }  // namespace Layers
 
     using Network = Layers::OutputLayer;
 
-  }  // namespace NNUE
-
-}  // namespace Eval
+}  // namespace Eval::NNUE
 #endif // K_P_CR_256X2_32_32_H
diff --git a/src/nnue/architectures/k-p_256x2-32-32.h b/src/nnue/architectures/k-p_256x2-32-32.h
index 0f340dee..92c9efcd 100644
--- a/src/nnue/architectures/k-p_256x2-32-32.h
+++ b/src/nnue/architectures/k-p_256x2-32-32.h
@@ -3,37 +3,33 @@
 #ifndef K_P_256X2_32_32_H
 #define K_P_256X2_32_32_H
 
-#include "../features/feature_set.h"
-#include "../features/k.h"
-#include "../features/p.h"
+#include "nnue/features/feature_set.h"
+#include "nnue/features/k.h"
+#include "nnue/features/p.h"
 
-#include "../layers/input_slice.h"
-#include "../layers/affine_transform.h"
-#include "../layers/clipped_relu.h"
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"
 
-namespace Eval {
+namespace Eval::NNUE {
 
-namespace NNUE {
+    // Input features used in evaluation function
+    using RawFeatures = Features::FeatureSet<Features::K, Features::P>;
 
-// Input features used in evaluation function
-using RawFeatures = Features::FeatureSet<Features::K, Features::P>;
+    // Number of input feature dimensions after conversion
+    constexpr IndexType kTransformedFeatureDimensions = 256;
 
-// Number of input feature dimensions after conversion
-constexpr IndexType kTransformedFeatureDimensions = 256;
+    namespace Layers {
 
-namespace Layers {
+        // define network structure
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
 
-// define network structure
-using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+    }  // namespace Layers
 
-}  // namespace Layers
+    using Network = Layers::OutputLayer;
 
-using Network = Layers::OutputLayer;
-
-}  // namespace NNUE
-
-}  // namespace Eval
+}  // namespace Eval::NNUE
 #endif // K_P_256X2_32_32_H

From 0d4c3014caf06ee4382e2600264051c3dacc11a9 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 20:44:47 +0200
Subject: [PATCH 337/583] Cleanup features.

---
 src/nnue/features/castling_right.cpp   |  94 ++---
 src/nnue/features/castling_right.h     |  48 +--
 src/nnue/features/enpassant.cpp        |  61 ++--
 src/nnue/features/enpassant.h          |  44 +--
 src/nnue/features/feature_set.h        | 470 +++++++++++++------------
 src/nnue/features/features_common.h    |  59 ++--
 src/nnue/features/half_kp.cpp          | 114 +++---
 src/nnue/features/half_kp.h            |  94 ++---
 src/nnue/features/half_relative_kp.cpp | 128 +++----
 src/nnue/features/half_relative_kp.h   |  92 ++---
 src/nnue/features/index_list.h         |  84 ++---
 src/nnue/features/k.cpp                |  67 ++--
 src/nnue/features/k.h                  |  62 ++--
 src/nnue/features/p.cpp                |  79 ++---
 src/nnue/features/p.h                  |  62 ++--
 15 files changed, 797 insertions(+), 761 deletions(-)

diff --git a/src/nnue/features/castling_right.cpp b/src/nnue/features/castling_right.cpp
index 2b3f3209..eb8a36a1 100644
--- a/src/nnue/features/castling_right.cpp
+++ b/src/nnue/features/castling_right.cpp
@@ -1,60 +1,60 @@
-//Definition of input feature quantity CastlingRight of NNUE evaluation function
-
 #include "castling_right.h"
 #include "index_list.h"
 
+//Definition of input feature quantity CastlingRight of NNUE evaluation function
 namespace Eval::NNUE::Features {
 
-  // Get a list of indices with a value of 1 among the features
-  void CastlingRight::AppendActiveIndices(
-    const Position& pos, Color perspective, IndexList* active) {
-    // do nothing if array size is small to avoid compiler warning
-    if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
+    // Get a list of indices with a value of 1 among the features
+    void CastlingRight::AppendActiveIndices(
+        const Position& pos, Color perspective, IndexList* active) {
+        // do nothing if array size is small to avoid compiler warning
+        if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
 
-    int castling_rights = pos.state()->castlingRights;
-    int relative_castling_rights;
-    if (perspective == WHITE) {
-      relative_castling_rights = castling_rights;
-    }
-    else {
-      // Invert the perspective.
-      relative_castling_rights = ((castling_rights & 3) << 2)
-        & ((castling_rights >> 2) & 3);
+        int castling_rights = pos.state()->castlingRights;
+        int relative_castling_rights;
+        if (perspective == WHITE) {
+            relative_castling_rights = castling_rights;
+        }
+        else {
+            // Invert the perspective.
+            relative_castling_rights = ((castling_rights & 3) << 2)
+                & ((castling_rights >> 2) & 3);
+        }
+
+        for (Eval::NNUE::IndexType i = 0; i < kDimensions; ++i) {
+            if (relative_castling_rights & (1 << i)) {
+                active->push_back(i);
+            }
+        }
     }
 
-    for (Eval::NNUE::IndexType i = 0; i < kDimensions; ++i) {
-      if (relative_castling_rights & (1 << i)) {
-        active->push_back(i);
-      }
-    }
-  }
+    // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+    void CastlingRight::AppendChangedIndices(
+        const Position& pos, Color perspective,
+        IndexList* removed, IndexList* /* added */) {
 
-  // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-  void CastlingRight::AppendChangedIndices(
-      const Position& pos, Color perspective,
-      IndexList* removed, IndexList* /* added */) {
-    int previous_castling_rights = pos.state()->previous->castlingRights;
-    int current_castling_rights = pos.state()->castlingRights;
-    int relative_previous_castling_rights;
-    int relative_current_castling_rights;
-    if (perspective == WHITE) {
-      relative_previous_castling_rights = previous_castling_rights;
-      relative_current_castling_rights = current_castling_rights;
-    }
-    else {
-      // Invert the perspective.
-      relative_previous_castling_rights = ((previous_castling_rights & 3) << 2)
-        & ((previous_castling_rights >> 2) & 3);
-      relative_current_castling_rights = ((current_castling_rights & 3) << 2)
-        & ((current_castling_rights >> 2) & 3);
-    }
+        int previous_castling_rights = pos.state()->previous->castlingRights;
+        int current_castling_rights = pos.state()->castlingRights;
+        int relative_previous_castling_rights;
+        int relative_current_castling_rights;
+        if (perspective == WHITE) {
+            relative_previous_castling_rights = previous_castling_rights;
+            relative_current_castling_rights = current_castling_rights;
+        }
+        else {
+            // Invert the perspective.
+            relative_previous_castling_rights = ((previous_castling_rights & 3) << 2)
+                & ((previous_castling_rights >> 2) & 3);
+            relative_current_castling_rights = ((current_castling_rights & 3) << 2)
+                & ((current_castling_rights >> 2) & 3);
+        }
 
-    for (Eval::NNUE::IndexType i = 0; i < kDimensions; ++i) {
-      if ((relative_previous_castling_rights & (1 << i)) &&
-        (relative_current_castling_rights & (1 << i)) == 0) {
-        removed->push_back(i);
-      }
+        for (Eval::NNUE::IndexType i = 0; i < kDimensions; ++i) {
+            if ((relative_previous_castling_rights & (1 << i)) &&
+                (relative_current_castling_rights & (1 << i)) == 0) {
+                removed->push_back(i);
+            }
+        }
     }
-  }
 
 }  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/castling_right.h b/src/nnue/features/castling_right.h
index 2d8c5322..3e35e432 100644
--- a/src/nnue/features/castling_right.h
+++ b/src/nnue/features/castling_right.h
@@ -1,34 +1,38 @@
-//Definition of input feature quantity CastlingRight of NNUE evaluation function
-
 #ifndef _NNUE_FEATURES_CASTLING_RIGHT_H_
 #define _NNUE_FEATURES_CASTLING_RIGHT_H_
 
-#include "../../evaluate.h"
 #include "features_common.h"
 
+#include "evaluate.h"
+
+//Definition of input feature quantity CastlingRight of NNUE evaluation function
 namespace Eval::NNUE::Features {
 
-  class CastlingRight {
-  public:
-    // feature quantity name
-    static constexpr const char* kName = "CastlingRight";
-    // Hash value embedded in the evaluation function file
-    static constexpr std::uint32_t kHashValue = 0x913968AAu;
-    // number of feature dimensions
-    static constexpr IndexType kDimensions = 4;
-    // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-    static constexpr IndexType kMaxActiveDimensions = 4;
-    // Timing of full calculation instead of difference calculation
-    static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+    class CastlingRight {
+    public:
+        // feature quantity name
+        static constexpr const char* kName = "CastlingRight";
 
-    // Get a list of indices with a value of 1 among the features
-    static void AppendActiveIndices(const Position& pos, Color perspective,
-      IndexList* active);
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue = 0x913968AAu;
 
-    // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-    static void AppendChangedIndices(const Position& pos, Color perspective,
-      IndexList* removed, IndexList* added);
-  };
+        // number of feature dimensions
+        static constexpr IndexType kDimensions = 4;
+
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions = 4;
+
+        // Timing of full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+
+        // Get a list of indices with a value of 1 among the features
+        static void AppendActiveIndices(const Position& pos, Color perspective,
+            IndexList* active);
+
+        // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+        static void AppendChangedIndices(const Position& pos, Color perspective,
+            IndexList* removed, IndexList* added);
+    };
 
 }  // namespace Eval::NNUE::Features
 
diff --git a/src/nnue/features/enpassant.cpp b/src/nnue/features/enpassant.cpp
index e5ceed5c..7aa8988b 100644
--- a/src/nnue/features/enpassant.cpp
+++ b/src/nnue/features/enpassant.cpp
@@ -1,42 +1,45 @@
-//Definition of input feature quantity EnPassant of NNUE evaluation function
-
 #include "enpassant.h"
 #include "index_list.h"
 
+//Definition of input feature quantity EnPassant of NNUE evaluation function
 namespace Eval::NNUE::Features {
 
-  // Get a list of indices with a value of 1 among the features
-  void EnPassant::AppendActiveIndices(
-    const Position& pos, Color /* perspective */, IndexList* active) {
-    // do nothing if array size is small to avoid compiler warning
-    if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
+    // Get a list of indices with a value of 1 among the features
+    void EnPassant::AppendActiveIndices(
+        const Position& pos, Color /* perspective */, IndexList* active) {
 
-    auto epSquare = pos.state()->epSquare;
-    if (epSquare == SQ_NONE) {
-      return;
+        // do nothing if array size is small to avoid compiler warning
+        if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions)
+            return;
+
+        auto epSquare = pos.state()->epSquare;
+        if (epSquare == SQ_NONE)
+            return;
+
+        auto file = file_of(epSquare);
+        active->push_back(file);
     }
-    auto file = file_of(epSquare);
-    active->push_back(file);
-  }
 
-  // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-  void EnPassant::AppendChangedIndices(
-      const Position& pos, Color /* perspective */,
-      IndexList* removed, IndexList* added) {
+    // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+    void EnPassant::AppendChangedIndices(
+        const Position& pos, Color /* perspective */,
+        IndexList* removed, IndexList* added) {
 
-    auto previous_epSquare = pos.state()->previous->epSquare;
-    auto epSquare = pos.state()->epSquare;
+        auto previous_epSquare = pos.state()->previous->epSquare;
+        auto epSquare = pos.state()->epSquare;
 
-    if (previous_epSquare != SQ_NONE) {
-      if (epSquare != SQ_NONE && file_of(epSquare) == file_of(previous_epSquare))
-        return;
-      auto file = file_of(previous_epSquare);
-      removed->push_back(file);
+        if (previous_epSquare != SQ_NONE) {
+            if (epSquare != SQ_NONE && file_of(epSquare) == file_of(previous_epSquare))
+                return;
+
+            auto file = file_of(previous_epSquare);
+            removed->push_back(file);
+        }
+
+        if (epSquare != SQ_NONE) {
+            auto file = file_of(epSquare);
+            added->push_back(file);
+        }
     }
-    if (epSquare != SQ_NONE) {
-      auto file = file_of(epSquare);
-      added->push_back(file);
-    }
-  }
 
 }  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/enpassant.h b/src/nnue/features/enpassant.h
index 065e74a0..65819a96 100644
--- a/src/nnue/features/enpassant.h
+++ b/src/nnue/features/enpassant.h
@@ -1,34 +1,34 @@
-//Definition of input feature quantity EnPassant of NNUE evaluation function
-
 #ifndef _NNUE_FEATURES_ENPASSANT_H_
 #define _NNUE_FEATURES_ENPASSANT_H_
 
-#include "../../evaluate.h"
 #include "features_common.h"
 
+#include "evaluate.h"
+
+//Definition of input feature quantity EnPassant of NNUE evaluation function
 namespace Eval::NNUE::Features {
 
-  class EnPassant {
-  public:
-    // feature quantity name
-    static constexpr const char* kName = "EnPassant";
-    // Hash value embedded in the evaluation function file
-    static constexpr std::uint32_t kHashValue = 0x02924F91u;
-    // number of feature dimensions
-    static constexpr IndexType kDimensions = 8;
-    // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-    static constexpr IndexType kMaxActiveDimensions = 1;
-    // Timing of full calculation instead of difference calculation
-    static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+    class EnPassant {
+    public:
+        // feature quantity name
+        static constexpr const char* kName = "EnPassant";
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue = 0x02924F91u;
+        // number of feature dimensions
+        static constexpr IndexType kDimensions = 8;
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions = 1;
+        // Timing of full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
 
-    // Get a list of indices with a value of 1 among the features
-    static void AppendActiveIndices(const Position& pos, Color perspective,
-      IndexList* active);
+        // Get a list of indices with a value of 1 among the features
+        static void AppendActiveIndices(const Position& pos, Color perspective,
+            IndexList* active);
 
-    // Get a list of indices whose values have changed from the previous one in the feature quantity
-    static void AppendChangedIndices(const Position& pos, Color perspective,
-      IndexList* removed, IndexList* added);
-  };
+        // Get a list of indices whose values have changed from the previous one in the feature quantity
+        static void AppendChangedIndices(const Position& pos, Color perspective,
+            IndexList* removed, IndexList* added);
+    };
 
 }  // namespace Eval::NNUE::Features
 
diff --git a/src/nnue/features/feature_set.h b/src/nnue/features/feature_set.h
index a057142c..5b243424 100644
--- a/src/nnue/features/feature_set.h
+++ b/src/nnue/features/feature_set.h
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 // A class template that represents the input feature set of the NNUE evaluation function
@@ -22,238 +22,266 @@
 #define NNUE_FEATURE_SET_H_INCLUDED
 
 #include "features_common.h"
+
 #include <array>
 
 namespace Eval::NNUE::Features {
 
-  // Class template that represents a list of values
-  template <typename T, T... Values>
-  struct CompileTimeList;
+    // Class template that represents a list of values
+    template <typename T, T... Values>
+    struct CompileTimeList;
 
-  template <typename T, T First, T... Remaining>
-  struct CompileTimeList<T, First, Remaining...> {
-    static constexpr bool Contains(T value) {
-      return value == First || CompileTimeList<T, Remaining...>::Contains(value);
-    }
-    static constexpr std::array<T, sizeof...(Remaining) + 1>
-        kValues = {{First, Remaining...}};
-  };
-
-  template <typename T, T First, T... Remaining>
-  constexpr std::array<T, sizeof...(Remaining) + 1>
-    CompileTimeList<T, First, Remaining...>::kValues;
-  template <typename T>
-  struct CompileTimeList<T> {
-    static constexpr bool Contains(T /*value*/) {
-      return false;
-    }
-    static constexpr std::array<T, 0> kValues = { {} };
-  };
-
-  // Class template that adds to the beginning of the list
-  template <typename T, typename ListType, T Value>
-  struct AppendToList;
-  template <typename T, T... Values, T AnotherValue>
-  struct AppendToList<T, CompileTimeList<T, Values...>, AnotherValue> {
-    using Result = CompileTimeList<T, AnotherValue, Values...>;
-  };
-
-  // Class template for adding to a sorted, unique list
-  template <typename T, typename ListType, T Value>
-  struct InsertToSet;
-  template <typename T, T First, T... Remaining, T AnotherValue>
-  struct InsertToSet<T, CompileTimeList<T, First, Remaining...>, AnotherValue> {
-    using Result = std::conditional_t<
-      CompileTimeList<T, First, Remaining...>::Contains(AnotherValue),
-      CompileTimeList<T, First, Remaining...>,
-      std::conditional_t<(AnotherValue < First),
-      CompileTimeList<T, AnotherValue, First, Remaining...>,
-      typename AppendToList<T, typename InsertToSet<
-      T, CompileTimeList<T, Remaining...>, AnotherValue>::Result,
-      First>::Result>>;
-  };
-  template <typename T, T Value>
-  struct InsertToSet<T, CompileTimeList<T>, Value> {
-    using Result = CompileTimeList<T, Value>;
-  };
-
-  // Base class of feature set
-  template <typename Derived>
-  class FeatureSetBase {
-
-   public:
-    // Get a list of indices for active features
-    template <typename IndexListType>
-    static void AppendActiveIndices(
-        const Position& pos, TriggerEvent trigger, IndexListType active[2]) {
-
-      for (Color perspective : { WHITE, BLACK }) {
-        Derived::CollectActiveIndices(
-            pos, trigger, perspective, &active[perspective]);
-      }
-    }
-
-    // Get a list of indices for recently changed features
-    template <typename PositionType, typename IndexListType>
-    static void AppendChangedIndices(
-        const PositionType& pos, TriggerEvent trigger,
-        IndexListType removed[2], IndexListType added[2], bool reset[2]) {
-
-      const auto& dp = pos.state()->dirtyPiece;
-
-      for (Color perspective : { WHITE, BLACK }) {
-        switch (trigger) {
-          case TriggerEvent::kNone:
-            break;
-          case TriggerEvent::kFriendKingMoved:
-            if (dp.dirty_num == 0) continue;
-            reset[perspective] = dp.piece[0] == make_piece(perspective, KING);
-            break;
-          case TriggerEvent::kEnemyKingMoved:
-            if (dp.dirty_num == 0) continue;
-            reset[perspective] = dp.piece[0] == make_piece(~perspective, KING);
-            break;
-          case TriggerEvent::kAnyKingMoved:
-            if (dp.dirty_num == 0) continue;
-            reset[perspective] = type_of(dp.piece[0]) == KING;
-            break;
-          case TriggerEvent::kAnyPieceMoved:
-            reset[perspective] = true;
-            break;
-          default:
-            assert(false);
-            break;
+    template <typename T, T First, T... Remaining>
+    struct CompileTimeList<T, First, Remaining...> {
+        static constexpr bool Contains(T value) {
+            return value == First || CompileTimeList<T, Remaining...>::Contains(value);
         }
-        if (reset[perspective]) {
-          Derived::CollectActiveIndices(
-              pos, trigger, perspective, &added[perspective]);
-        } else {
-          Derived::CollectChangedIndices(
-              pos, trigger, perspective,
-              &removed[perspective], &added[perspective]);
+
+        static constexpr std::array<T, sizeof...(Remaining) + 1>
+            kValues = {{First, Remaining...}};
+    };
+
+    template <typename T, T First, T... Remaining>
+    constexpr std::array<T, sizeof...(Remaining) + 1>
+        CompileTimeList<T, First, Remaining...>::kValues;
+
+    template <typename T>
+    struct CompileTimeList<T> {
+        static constexpr bool Contains(T /*value*/) {
+            return false;
         }
-      }
-    }
-  };
+        static constexpr std::array<T, 0> kValues = { {} };
+    };
 
-  // Class template that represents the feature set
-  // do internal processing in reverse order of template arguments in order to linearize the amount of calculation at runtime
-  template <typename FirstFeatureType, typename... RemainingFeatureTypes>
-  class FeatureSet<FirstFeatureType, RemainingFeatureTypes...> :
-    public FeatureSetBase<
-    FeatureSet<FirstFeatureType, RemainingFeatureTypes...>> {
-  private:
-    using Head = FirstFeatureType;
-    using Tail = FeatureSet<RemainingFeatureTypes...>;
+    // Class template that adds to the beginning of the list
+    template <typename T, typename ListType, T Value>
+    struct AppendToList;
 
-  public:
-    // Hash value embedded in the evaluation function file
-    static constexpr std::uint32_t kHashValue =
-      Head::kHashValue ^ (Tail::kHashValue << 1) ^ (Tail::kHashValue >> 31);
-    // number of feature dimensions
-    static constexpr IndexType kDimensions =
-      Head::kDimensions + Tail::kDimensions;
-    // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-    static constexpr IndexType kMaxActiveDimensions =
-      Head::kMaxActiveDimensions + Tail::kMaxActiveDimensions;
-    // List of timings to perform all calculations instead of difference calculation
-    using SortedTriggerSet = typename InsertToSet<TriggerEvent,
-      typename Tail::SortedTriggerSet, Head::kRefreshTrigger>::Result;
-    static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
+    template <typename T, T... Values, T AnotherValue>
+    struct AppendToList<T, CompileTimeList<T, Values...>, AnotherValue> {
+        using Result = CompileTimeList<T, AnotherValue, Values...>;
+    };
 
-    // Get the feature quantity name
-    static std::string GetName() {
-      return std::string(Head::kName) + "+" + Tail::GetName();
-    }
+    // Class template for adding to a sorted, unique list
+    template <typename T, typename ListType, T Value>
+    struct InsertToSet;
 
-  private:
-    // Get a list of indices with a value of 1 among the features
-    template <typename IndexListType>
-    static void CollectActiveIndices(
-      const Position& pos, const TriggerEvent trigger, const Color perspective,
-      IndexListType* const active) {
-      Tail::CollectActiveIndices(pos, trigger, perspective, active);
-      if (Head::kRefreshTrigger == trigger) {
-        const auto start = active->size();
-        Head::AppendActiveIndices(pos, perspective, active);
-        for (auto i = start; i < active->size(); ++i) {
-          (*active)[i] += Tail::kDimensions;
+    template <typename T, T First, T... Remaining, T AnotherValue>
+    struct InsertToSet<T, CompileTimeList<T, First, Remaining...>, AnotherValue> {
+        using Result =
+            std::conditional_t<
+                CompileTimeList<T, First, Remaining...>::Contains(AnotherValue),
+                CompileTimeList<T, First, Remaining...>,
+                std::conditional_t<
+                    (AnotherValue < First),
+                    CompileTimeList<T, AnotherValue, First, Remaining...>,
+                    typename AppendToList<T, typename InsertToSet<
+                        T, CompileTimeList<T, Remaining...>, AnotherValue>::Result,
+                        First
+                    >::Result
+                >
+            >;
+    };
+
+    template <typename T, T Value>
+    struct InsertToSet<T, CompileTimeList<T>, Value> {
+        using Result = CompileTimeList<T, Value>;
+    };
+
+    // Base class of feature set
+    template <typename Derived>
+    class FeatureSetBase {
+
+       public:
+        // Get a list of indices for active features
+        template <typename IndexListType>
+        static void AppendActiveIndices(
+            const Position& pos, TriggerEvent trigger, IndexListType active[2]) {
+
+            for (Color perspective : { WHITE, BLACK }) {
+                Derived::CollectActiveIndices(
+                    pos, trigger, perspective, &active[perspective]);
+            }
         }
-      }
-    }
 
-    // Get a list of indices whose values have changed from the previous one in the feature quantity
-    template <typename IndexListType>
-    static void CollectChangedIndices(
-      const Position& pos, const TriggerEvent trigger, const Color perspective,
-      IndexListType* const removed, IndexListType* const added) {
-      Tail::CollectChangedIndices(pos, trigger, perspective, removed, added);
-      if (Head::kRefreshTrigger == trigger) {
-        const auto start_removed = removed->size();
-        const auto start_added = added->size();
-        Head::AppendChangedIndices(pos, perspective, removed, added);
-        for (auto i = start_removed; i < removed->size(); ++i) {
-          (*removed)[i] += Tail::kDimensions;
+        // Get a list of indices for recently changed features
+        template <typename PositionType, typename IndexListType>
+        static void AppendChangedIndices(
+            const PositionType& pos, TriggerEvent trigger,
+            IndexListType removed[2], IndexListType added[2], bool reset[2]) {
+
+            const auto& dp = pos.state()->dirtyPiece;
+
+            for (Color perspective : { WHITE, BLACK }) {
+                switch (trigger) {
+                    case TriggerEvent::kNone:
+                        break;
+                    case TriggerEvent::kFriendKingMoved:
+                        if (dp.dirty_num == 0) continue;
+                        reset[perspective] = dp.piece[0] == make_piece(perspective, KING);
+                        break;
+                    case TriggerEvent::kEnemyKingMoved:
+                        if (dp.dirty_num == 0) continue;
+                        reset[perspective] = dp.piece[0] == make_piece(~perspective, KING);
+                        break;
+                    case TriggerEvent::kAnyKingMoved:
+                        if (dp.dirty_num == 0) continue;
+                        reset[perspective] = type_of(dp.piece[0]) == KING;
+                        break;
+                    case TriggerEvent::kAnyPieceMoved:
+                        reset[perspective] = true;
+                        break;
+                    default:
+                        assert(false);
+                        break;
+                }
+
+                if (reset[perspective]) {
+                    Derived::CollectActiveIndices(
+                        pos, trigger, perspective, &added[perspective]);
+                } else {
+                    Derived::CollectChangedIndices(
+                        pos, trigger, perspective,
+                        &removed[perspective], &added[perspective]);
+                }
+            }
         }
-        for (auto i = start_added; i < added->size(); ++i) {
-          (*added)[i] += Tail::kDimensions;
+    };
+
+    // Class template that represents the feature set
+    // do internal processing in reverse order of template arguments in order to linearize the amount of calculation at runtime
+    template <typename FirstFeatureType, typename... RemainingFeatureTypes>
+    class FeatureSet<FirstFeatureType, RemainingFeatureTypes...> :
+      public FeatureSetBase<
+          FeatureSet<FirstFeatureType, RemainingFeatureTypes...>
+      > {
+
+    private:
+        using Head = FirstFeatureType;
+        using Tail = FeatureSet<RemainingFeatureTypes...>;
+
+    public:
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue =
+            Head::kHashValue ^ (Tail::kHashValue << 1) ^ (Tail::kHashValue >> 31);
+
+        // number of feature dimensions
+        static constexpr IndexType kDimensions =
+            Head::kDimensions + Tail::kDimensions;
+
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions =
+            Head::kMaxActiveDimensions + Tail::kMaxActiveDimensions;
+
+        // List of timings to perform all calculations instead of difference calculation
+        using SortedTriggerSet = typename InsertToSet<TriggerEvent,
+            typename Tail::SortedTriggerSet, Head::kRefreshTrigger>::Result;
+
+        static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
+
+        // Get the feature quantity name
+        static std::string GetName() {
+            return std::string(Head::kName) + "+" + Tail::GetName();
         }
-      }
-    }
 
-    // Make the base class and the class template that recursively uses itself a friend
-    friend class FeatureSetBase<FeatureSet>;
-    template <typename... FeatureTypes>
-    friend class FeatureSet;
-  };
+    private:
+        // Get a list of indices with a value of 1 among the features
+        template <typename IndexListType>
+        static void CollectActiveIndices(
+            const Position& pos, const TriggerEvent trigger, const Color perspective,
+            IndexListType* const active) {
+            Tail::CollectActiveIndices(pos, trigger, perspective, active);
+            if (Head::kRefreshTrigger == trigger) {
+                const auto start = active->size();
+                Head::AppendActiveIndices(pos, perspective, active);
 
-  // Class template that represents the feature set
-  template <typename FeatureType>
-  class FeatureSet<FeatureType> : public FeatureSetBase<FeatureSet<FeatureType>> {
+                for (auto i = start; i < active->size(); ++i) {
+                    (*active)[i] += Tail::kDimensions;
+                }
+            }
+        }
 
-   public:
-    // Hash value embedded in the evaluation file
-    static constexpr std::uint32_t kHashValue = FeatureType::kHashValue;
-    // Number of feature dimensions
-    static constexpr IndexType kDimensions = FeatureType::kDimensions;
-    // Maximum number of simultaneously active features
-    static constexpr IndexType kMaxActiveDimensions =
-        FeatureType::kMaxActiveDimensions;
-    // Trigger for full calculation instead of difference calculation
-    using SortedTriggerSet =
-        CompileTimeList<TriggerEvent, FeatureType::kRefreshTrigger>;
-    static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
+        // Get a list of indices whose values have changed from the previous one in the feature quantity
+        template <typename IndexListType>
+        static void CollectChangedIndices(
+            const Position& pos, const TriggerEvent trigger, const Color perspective,
+            IndexListType* const removed, IndexListType* const added) {
+            Tail::CollectChangedIndices(pos, trigger, perspective, removed, added);
+            if (Head::kRefreshTrigger == trigger) {
+                const auto start_removed = removed->size();
+                const auto start_added = added->size();
+                Head::AppendChangedIndices(pos, perspective, removed, added);
 
-    // Get the feature quantity name
-    static std::string GetName() {
-      return FeatureType::kName;
-    }
+                for (auto i = start_removed; i < removed->size(); ++i) {
+                    (*removed)[i] += Tail::kDimensions;
+                }
 
-   private:
-    // Get a list of indices for active features
-    static void CollectActiveIndices(
-        const Position& pos, const TriggerEvent trigger, const Color perspective,
-        IndexList* const active) {
-      if (FeatureType::kRefreshTrigger == trigger) {
-        FeatureType::AppendActiveIndices(pos, perspective, active);
-      }
-    }
+                for (auto i = start_added; i < added->size(); ++i) {
+                    (*added)[i] += Tail::kDimensions;
+                }
+            }
+        }
 
-    // Get a list of indices for recently changed features
-    static void CollectChangedIndices(
-        const Position& pos, const TriggerEvent trigger, const Color perspective,
-        IndexList* const removed, IndexList* const added) {
+        // Make the base class and the class template that recursively uses itself a friend
+        friend class FeatureSetBase<FeatureSet>;
 
-      if (FeatureType::kRefreshTrigger == trigger) {
-        FeatureType::AppendChangedIndices(pos, perspective, removed, added);
-      }
-    }
+        template <typename... FeatureTypes>
+        friend class FeatureSet;
+    };
 
-    // Make the base class and the class template that recursively uses itself a friend
-    friend class FeatureSetBase<FeatureSet>;
-    template <typename... FeatureTypes>
-    friend class FeatureSet;
-  };
+    // Class template that represents the feature set
+    template <typename FeatureType>
+    class FeatureSet<FeatureType> : public FeatureSetBase<FeatureSet<FeatureType>> {
+
+    public:
+        // Hash value embedded in the evaluation file
+        static constexpr std::uint32_t kHashValue = FeatureType::kHashValue;
+
+        // Number of feature dimensions
+        static constexpr IndexType kDimensions = FeatureType::kDimensions;
+
+        // Maximum number of simultaneously active features
+        static constexpr IndexType kMaxActiveDimensions =
+            FeatureType::kMaxActiveDimensions;
+
+        // Trigger for full calculation instead of difference calculation
+        using SortedTriggerSet =
+            CompileTimeList<TriggerEvent, FeatureType::kRefreshTrigger>;
+
+        static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
+
+        // Get the feature quantity name
+        static std::string GetName() {
+            return FeatureType::kName;
+        }
+
+    private:
+        // Get a list of indices for active features
+        static void CollectActiveIndices(
+            const Position& pos, const TriggerEvent trigger, const Color perspective,
+            IndexList* const active) {
+
+            if (FeatureType::kRefreshTrigger == trigger) {
+              FeatureType::AppendActiveIndices(pos, perspective, active);
+            }
+        }
+
+        // Get a list of indices for recently changed features
+        static void CollectChangedIndices(
+            const Position& pos, const TriggerEvent trigger, const Color perspective,
+            IndexList* const removed, IndexList* const added) {
+
+            if (FeatureType::kRefreshTrigger == trigger) {
+              FeatureType::AppendChangedIndices(pos, perspective, removed, added);
+            }
+        }
+
+        // Make the base class and the class template that recursively uses itself a friend
+        friend class FeatureSetBase<FeatureSet>;
+
+        template <typename... FeatureTypes>
+        friend class FeatureSet;
+    };
 
 }  // namespace Eval::NNUE::Features
 
diff --git a/src/nnue/features/features_common.h b/src/nnue/features/features_common.h
index 656502a3..671ceeb9 100644
--- a/src/nnue/features/features_common.h
+++ b/src/nnue/features/features_common.h
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 //Common header of input features of NNUE evaluation function
@@ -21,29 +21,30 @@
 #ifndef NNUE_FEATURES_COMMON_H_INCLUDED
 #define NNUE_FEATURES_COMMON_H_INCLUDED
 
-#include "../../evaluate.h"
-#include "../nnue_common.h"
+#include "evaluate.h"
+
+#include "nnue/nnue_common.h"
 
 namespace Eval::NNUE::Features {
 
-  class IndexList;
+    class IndexList;
 
-  template <typename... FeatureTypes>
-  class FeatureSet;
+    template <typename... FeatureTypes>
+    class FeatureSet;
 
-  // Trigger to perform full calculations instead of difference only
-  enum class TriggerEvent {
-    kNone, // Calculate the difference whenever possible
-    kFriendKingMoved, // calculate full evaluation when own king moves
-    kEnemyKingMoved, // calculate full evaluation when opponent king moves
-    kAnyKingMoved, // calculate full evaluation when any king moves
-    kAnyPieceMoved, // always calculate full evaluation
-  };
+    // Trigger to perform full calculations instead of difference only
+    enum class TriggerEvent {
+        kNone, // Calculate the difference whenever possible
+        kFriendKingMoved, // calculate full evaluation when own king moves
+        kEnemyKingMoved, // calculate full evaluation when opponent king moves
+        kAnyKingMoved, // calculate full evaluation when any king moves
+        kAnyPieceMoved, // always calculate full evaluation
+    };
 
-  enum class Side {
-    kFriend, // side to move
-    kEnemy, // opponent
-  };
+    enum class Side {
+        kFriend, // side to move
+        kEnemy, // opponent
+    };
 
 }  // namespace Eval::NNUE::Features
 
diff --git a/src/nnue/features/half_kp.cpp b/src/nnue/features/half_kp.cpp
index ae1d697f..17b50472 100644
--- a/src/nnue/features/half_kp.cpp
+++ b/src/nnue/features/half_kp.cpp
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 //Definition of input features HalfKP of NNUE evaluation function
@@ -23,51 +23,59 @@
 
 namespace Eval::NNUE::Features {
 
-  // Orient a square according to perspective (flip rank for black)
-  inline Square orient(Color perspective, Square s) {
-    return Square(int(s) ^ (bool(perspective) * SQ_A8));
-  }
-
-  // Find the index of the feature quantity from the king position and PieceSquare
-  template <Side AssociatedKing>
-  inline IndexType HalfKP<AssociatedKing>::MakeIndex(
-      Color perspective, Square s, Piece pc, Square ksq) {
-
-    return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective] + PS_END * ksq);
-  }
-
-  // Get a list of indices for active features
-  template <Side AssociatedKing>
-  void HalfKP<AssociatedKing>::AppendActiveIndices(
-      const Position& pos, Color perspective, IndexList* active) {
-
-    Square ksq = orient(perspective, pos.square<KING>(AssociatedKing == Side::kFriend ? perspective : ~perspective));
-    Bitboard bb = pos.pieces() & ~pos.pieces(KING);
-    while (bb) {
-      Square s = pop_lsb(&bb);
-      active->push_back(MakeIndex(perspective, s, pos.piece_on(s), ksq));
+    // Orient a square according to perspective (flip rank for black)
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * SQ_A8));
     }
-  }
 
-  // Get a list of indices for recently changed features
-  template <Side AssociatedKing>
-  void HalfKP<AssociatedKing>::AppendChangedIndices(
-      const Position& pos, Color perspective,
-      IndexList* removed, IndexList* added) {
+    // Find the index of the feature quantity from the king position and PieceSquare
+    template <Side AssociatedKing>
+    inline IndexType HalfKP<AssociatedKing>::MakeIndex(
+        Color perspective, Square s, Piece pc, Square ksq) {
 
-    Square ksq = orient(perspective, pos.square<KING>(AssociatedKing == Side::kFriend ? perspective : ~perspective));
-    const auto& dp = pos.state()->dirtyPiece;
-    for (int i = 0; i < dp.dirty_num; ++i) {
-      Piece pc = dp.piece[i];
-      if (type_of(pc) == KING) continue;
-      if (dp.from[i] != SQ_NONE)
-        removed->push_back(MakeIndex(perspective, dp.from[i], pc, ksq));
-      if (dp.to[i] != SQ_NONE)
-        added->push_back(MakeIndex(perspective, dp.to[i], pc, ksq));
+        return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective] + PS_END * ksq);
     }
-  }
 
-  template class HalfKP<Side::kFriend>;
-  template class HalfKP<Side::kEnemy>;
+    // Get a list of indices for active features
+    template <Side AssociatedKing>
+    void HalfKP<AssociatedKing>::AppendActiveIndices(
+        const Position& pos, Color perspective, IndexList* active) {
+
+        Square ksq = orient(perspective, pos.square<KING>(AssociatedKing == Side::kFriend ? perspective : ~perspective));
+        Bitboard bb = pos.pieces() & ~pos.pieces(KING);
+        while (bb) {
+            Square s = pop_lsb(&bb);
+            active->push_back(MakeIndex(perspective, s, pos.piece_on(s), ksq));
+        }
+    }
+
+    // Get a list of indices for recently changed features
+    template <Side AssociatedKing>
+    void HalfKP<AssociatedKing>::AppendChangedIndices(
+        const Position& pos, Color perspective,
+        IndexList* removed, IndexList* added) {
+
+        Square ksq = orient(
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));
+
+        const auto& dp = pos.state()->dirtyPiece;
+        for (int i = 0; i < dp.dirty_num; ++i) {
+            Piece pc = dp.piece[i];
+
+            if (type_of(pc) == KING)
+                continue;
+
+            if (dp.from[i] != SQ_NONE)
+                removed->push_back(MakeIndex(perspective, dp.from[i], pc, ksq));
+
+            if (dp.to[i] != SQ_NONE)
+                added->push_back(MakeIndex(perspective, dp.to[i], pc, ksq));
+        }
+    }
+
+    template class HalfKP<Side::kFriend>;
+    template class HalfKP<Side::kEnemy>;
 
 }  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/half_kp.h b/src/nnue/features/half_kp.h
index 23e8beb6..834f800e 100644
--- a/src/nnue/features/half_kp.h
+++ b/src/nnue/features/half_kp.h
@@ -1,65 +1,69 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
-//Definition of input features HalfKP of NNUE evaluation function
-
 #ifndef NNUE_FEATURES_HALF_KP_H_INCLUDED
 #define NNUE_FEATURES_HALF_KP_H_INCLUDED
 
-#include "../../evaluate.h"
 #include "features_common.h"
 
+#include "evaluate.h"
+
+//Definition of input features HalfKP of NNUE evaluation function
 namespace Eval::NNUE::Features {
 
-  // Feature HalfKP: Combination of the position of own king
-  // and the position of pieces other than kings
-  template <Side AssociatedKing>
-  class HalfKP {
+    // Feature HalfKP: Combination of the position of own king
+    // and the position of pieces other than kings
+    template <Side AssociatedKing>
+    class HalfKP {
 
-   public:
-    // Feature name
-    static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
-        "HalfKP(Friend)" : "HalfKP(Enemy)";
-    // Hash value embedded in the evaluation file
-    static constexpr std::uint32_t kHashValue =
-        0x5D69D5B9u ^ (AssociatedKing == Side::kFriend);
-    // Number of feature dimensions
-    static constexpr IndexType kDimensions =
-        static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(PS_END);
-    // Maximum number of simultaneously active features
-    static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
-    // Trigger for full calculation instead of difference calculation
-    static constexpr TriggerEvent kRefreshTrigger =
-        (AssociatedKing == Side::kFriend) ?
-        TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
+    public:
+        // Feature name
+        static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
+            "HalfKP(Friend)" : "HalfKP(Enemy)";
 
-    // Get a list of indices for active features
-    static void AppendActiveIndices(const Position& pos, Color perspective,
-                                    IndexList* active);
+        // Hash value embedded in the evaluation file
+        static constexpr std::uint32_t kHashValue =
+            0x5D69D5B9u ^ (AssociatedKing == Side::kFriend);
 
-    // Get a list of indices for recently changed features
-    static void AppendChangedIndices(const Position& pos, Color perspective,
-                                     IndexList* removed, IndexList* added);
+        // Number of feature dimensions
+        static constexpr IndexType kDimensions =
+            static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(PS_END);
 
-   private:
-    // Index of a feature for a given king position and another piece on some square
-    static IndexType MakeIndex(Color perspective, Square s, Piece pc, Square sq_k);
-  };
+        // Maximum number of simultaneously active features
+        static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
+
+        // Trigger for full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger =
+            (AssociatedKing == Side::kFriend) ?
+            TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
+
+        // Get a list of indices for active features
+        static void AppendActiveIndices(const Position& pos, Color perspective,
+                                        IndexList* active);
+
+        // Get a list of indices for recently changed features
+        static void AppendChangedIndices(const Position& pos, Color perspective,
+                                         IndexList* removed, IndexList* added);
+
+    private:
+        // Index of a feature for a given king position and another piece on some square
+        static IndexType MakeIndex(Color perspective, Square s, Piece pc, Square sq_k);
+    };
 
 }  // namespace Eval::NNUE::Features
 
diff --git a/src/nnue/features/half_relative_kp.cpp b/src/nnue/features/half_relative_kp.cpp
index 6b456a1f..5ab22890 100644
--- a/src/nnue/features/half_relative_kp.cpp
+++ b/src/nnue/features/half_relative_kp.cpp
@@ -1,74 +1,80 @@
-﻿//Definition of input features HalfRelativeKP of NNUE evaluation function
-
-#include "half_relative_kp.h"
+﻿#include "half_relative_kp.h"
 #include "index_list.h"
 
-namespace Eval {
+//Definition of input features HalfRelativeKP of NNUE evaluation function
+namespace Eval::NNUE::Features {
 
-namespace NNUE {
+    // Orient a square according to perspective (flip rank for black)
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * SQ_A8));
+    }
 
-namespace Features {
+    // Find the index of the feature quantity from the ball position and PieceSquare
+    template <Side AssociatedKing>
+    inline IndexType HalfRelativeKP<AssociatedKing>::MakeIndex(
+        Color perspective, Square s, Piece pc, Square sq_k) {
+        const IndexType p = IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
+        return MakeIndex(sq_k, p);
+    }
 
-// Orient a square according to perspective (flip rank for black)
-inline Square orient(Color perspective, Square s) {
-  return Square(int(s) ^ (bool(perspective) * SQ_A8));
-}
+    // Find the index of the feature quantity from the ball position and PieceSquare
+    template <Side AssociatedKing>
+    inline IndexType HalfRelativeKP<AssociatedKing>::MakeIndex(
+        Square sq_k, IndexType p) {
 
-// Find the index of the feature quantity from the ball position and PieceSquare
-template <Side AssociatedKing>
-inline IndexType HalfRelativeKP<AssociatedKing>::MakeIndex(
-  Color perspective, Square s, Piece pc, Square sq_k) {
-  const IndexType p = IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
-  return MakeIndex(sq_k, p);
-}
+        constexpr IndexType W = kBoardWidth;
+        constexpr IndexType H = kBoardHeight;
+        const IndexType piece_index = (p - PS_W_PAWN) / SQUARE_NB;
+        const Square sq_p = static_cast<Square>((p - PS_W_PAWN) % SQUARE_NB);
+        const IndexType relative_file = file_of(sq_p) - file_of(sq_k) + (W / 2);
+        const IndexType relative_rank = rank_of(sq_p) - rank_of(sq_k) + (H / 2);
+        return H * W * piece_index + H * relative_file + relative_rank;
+    }
 
-// Find the index of the feature quantity from the ball position and PieceSquare
-template <Side AssociatedKing>
-inline IndexType HalfRelativeKP<AssociatedKing>::MakeIndex(
-    Square sq_k, IndexType p) {
-  constexpr IndexType W = kBoardWidth;
-  constexpr IndexType H = kBoardHeight;
-  const IndexType piece_index = (p - PS_W_PAWN) / SQUARE_NB;
-  const Square sq_p = static_cast<Square>((p - PS_W_PAWN) % SQUARE_NB);
-  const IndexType relative_file = file_of(sq_p) - file_of(sq_k) + (W / 2);
-  const IndexType relative_rank = rank_of(sq_p) - rank_of(sq_k) + (H / 2);
-  return H * W * piece_index + H * relative_file + relative_rank;
-}
+    // Get a list of indices with a value of 1 among the features
+    template <Side AssociatedKing>
+    void HalfRelativeKP<AssociatedKing>::AppendActiveIndices(
+        const Position& pos, Color perspective, IndexList* active) {
 
-// Get a list of indices with a value of 1 among the features
-template <Side AssociatedKing>
-void HalfRelativeKP<AssociatedKing>::AppendActiveIndices(
-    const Position& pos, Color perspective, IndexList* active) {
-  Square ksq = orient(perspective, pos.square<KING>(AssociatedKing == Side::kFriend ? perspective : ~perspective));
-  Bitboard bb = pos.pieces() & ~pos.pieces(KING);
-  while (bb) {
-    Square s = pop_lsb(&bb);
-    active->push_back(MakeIndex(perspective, s, pos.piece_on(s), ksq));
-  }
-}
+        Square ksq = orient(
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));
 
-// Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-template <Side AssociatedKing>
-void HalfRelativeKP<AssociatedKing>::AppendChangedIndices(
-    const Position& pos, Color perspective,
-    IndexList* removed, IndexList* added) {
-  Square ksq = orient(perspective, pos.square<KING>(AssociatedKing == Side::kFriend ? perspective : ~perspective));
-  const auto& dp = pos.state()->dirtyPiece;
-  for (int i = 0; i < dp.dirty_num; ++i) {
-    Piece pc = dp.piece[i];
-    if (type_of(pc) == KING) continue;
-    if (dp.from[i] != SQ_NONE)
-      removed->push_back(MakeIndex(perspective, dp.from[i], pc, ksq));
-    if (dp.to[i] != SQ_NONE)
-      added->push_back(MakeIndex(perspective, dp.to[i], pc, ksq));
-  }
-}
+        Bitboard bb = pos.pieces() & ~pos.pieces(KING);
+        while (bb) {
+            Square s = pop_lsb(&bb);
+            active->push_back(MakeIndex(perspective, s, pos.piece_on(s), ksq));
+        }
+    }
 
-template class HalfRelativeKP<Side::kFriend>;
-template class HalfRelativeKP<Side::kEnemy>;
+    // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+    template <Side AssociatedKing>
+    void HalfRelativeKP<AssociatedKing>::AppendChangedIndices(
+        const Position& pos, Color perspective,
+        IndexList* removed, IndexList* added) {
 
-}  // namespace Features
+        Square ksq = orient(
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));
 
-}  // namespace NNUE
+        const auto& dp = pos.state()->dirtyPiece;
+        for (int i = 0; i < dp.dirty_num; ++i) {
+            Piece pc = dp.piece[i];
 
-}  // namespace Eval
+            if (type_of(pc) == KING)
+                continue;
+
+            if (dp.from[i] != SQ_NONE)
+                removed->push_back(MakeIndex(perspective, dp.from[i], pc, ksq));
+
+            if (dp.to[i] != SQ_NONE)
+                added->push_back(MakeIndex(perspective, dp.to[i], pc, ksq));
+        }
+    }
+
+    template class HalfRelativeKP<Side::kFriend>;
+    template class HalfRelativeKP<Side::kEnemy>;
+
+}  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/half_relative_kp.h b/src/nnue/features/half_relative_kp.h
index 1b384c14..cc1e136f 100644
--- a/src/nnue/features/half_relative_kp.h
+++ b/src/nnue/features/half_relative_kp.h
@@ -1,61 +1,61 @@
-﻿//Definition of input features HalfRelativeKP of NNUE evaluation function
-
-#ifndef _NNUE_FEATURES_HALF_RELATIVE_KP_H_
+﻿#ifndef _NNUE_FEATURES_HALF_RELATIVE_KP_H_
 #define _NNUE_FEATURES_HALF_RELATIVE_KP_H_
 
-#include "../../evaluate.h"
 #include "features_common.h"
 
-namespace Eval {
+#include "evaluate.h"
 
-namespace NNUE {
+//Definition of input features HalfRelativeKP of NNUE evaluation function
+namespace Eval::NNUE::Features {
 
-namespace Features {
+    // Feature HalfRelativeKP: Relative position of each piece other than the ball based on own ball or enemy ball
+    template <Side AssociatedKing>
+    class HalfRelativeKP {
+    public:
+        // feature quantity name
+        static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
+            "HalfRelativeKP(Friend)" : "HalfRelativeKP(Enemy)";
 
-// Feature HalfRelativeKP: Relative position of each piece other than the ball based on own ball or enemy ball
-template <Side AssociatedKing>
-class HalfRelativeKP {
- public:
-  // feature quantity name
-  static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
-      "HalfRelativeKP(Friend)" : "HalfRelativeKP(Enemy)";
-  // Hash value embedded in the evaluation function file
-  static constexpr std::uint32_t kHashValue =
-      0xF9180919u ^ (AssociatedKing == Side::kFriend);
-  // Piece type excluding balls
-  static constexpr IndexType kNumPieceKinds = 5 * 2;
-  // width of the virtual board with the ball in the center
-  static constexpr IndexType kBoardWidth = FILE_NB * 2 - 1;
-  // height of a virtual board with balls in the center
-  static constexpr IndexType kBoardHeight = RANK_NB * 2 - 1;
-  // number of feature dimensions
-  static constexpr IndexType kDimensions =
-      kNumPieceKinds * kBoardHeight * kBoardWidth;
-  // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-  static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
-  // Timing of full calculation instead of difference calculation
-  static constexpr TriggerEvent kRefreshTrigger =
-      (AssociatedKing == Side::kFriend) ?
-      TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue =
+            0xF9180919u ^ (AssociatedKing == Side::kFriend);
 
-  // Get a list of indices with a value of 1 among the features
-  static void AppendActiveIndices(const Position& pos, Color perspective,
-                                  IndexList* active);
+        // Piece type excluding balls
+        static constexpr IndexType kNumPieceKinds = 5 * 2;
 
-  // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-  static void AppendChangedIndices(const Position& pos, Color perspective,
-                                   IndexList* removed, IndexList* added);
+        // width of the virtual board with the ball in the center
+        static constexpr IndexType kBoardWidth = FILE_NB * 2 - 1;
 
-  // Find the index of the feature quantity from the ball position and PieceSquare
-  static IndexType MakeIndex(Square s, IndexType p);
-  // Find the index of the feature quantity from the ball position and PieceSquare
-  static IndexType MakeIndex(Color perspective, Square s, Piece pc, Square sq_k);
-};
+        // height of a virtual board with balls in the center
+        static constexpr IndexType kBoardHeight = RANK_NB * 2 - 1;
 
-}  // namespace Features
+        // number of feature dimensions
+        static constexpr IndexType kDimensions =
+            kNumPieceKinds * kBoardHeight * kBoardWidth;
 
-}  // namespace NNUE
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
 
-}  // namespace Eval
+        // Timing of full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger =
+            (AssociatedKing == Side::kFriend) ?
+            TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
+
+        // Get a list of indices with a value of 1 among the features
+        static void AppendActiveIndices(const Position& pos, Color perspective,
+                                        IndexList* active);
+
+        // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+        static void AppendChangedIndices(const Position& pos, Color perspective,
+                                         IndexList* removed, IndexList* added);
+
+        // Find the index of the feature quantity from the ball position and PieceSquare
+        static IndexType MakeIndex(Square s, IndexType p);
+
+        // Find the index of the feature quantity from the ball position and PieceSquare
+        static IndexType MakeIndex(Color perspective, Square s, Piece pc, Square sq_k);
+    };
+
+}  // namespace Eval::NNUE::Features
 
 #endif
diff --git a/src/nnue/features/index_list.h b/src/nnue/features/index_list.h
index dd055fb3..6751b26c 100644
--- a/src/nnue/features/index_list.h
+++ b/src/nnue/features/index_list.h
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 // Definition of index list of input features
@@ -21,43 +21,43 @@
 #ifndef NNUE_FEATURES_INDEX_LIST_H_INCLUDED
 #define NNUE_FEATURES_INDEX_LIST_H_INCLUDED
 
-#include "../../position.h"
-#include "../nnue_architecture.h"
+#include "position.h"
+
+#include "nnue/nnue_architecture.h"
 
 namespace Eval::NNUE::Features {
 
-  // Class template used for feature index list
-  template <typename T, std::size_t MaxSize>
-  class ValueList {
+    // Class template used for feature index list
+    template <typename T, std::size_t MaxSize>
+    class ValueList {
 
-   public:
-    std::size_t size() const { return size_; }
-    void resize(std::size_t size) { size_ = size; }
-    void push_back(const T& value) { values_[size_++] = value; }
-    T& operator[](std::size_t index) { return values_[index]; }
-    T* begin() { return values_; }
-    T* end() { return values_ + size_; }
-    const T& operator[](std::size_t index) const { return values_[index]; }
-    const T* begin() const { return values_; }
-    const T* end() const { return values_ + size_; }
+    public:
+        std::size_t size() const { return size_; }
+        void resize(std::size_t size) { size_ = size; }
+        void push_back(const T& value) { values_[size_++] = value; }
+        T& operator[](std::size_t index) { return values_[index]; }
+        T* begin() { return values_; }
+        T* end() { return values_ + size_; }
+        const T& operator[](std::size_t index) const { return values_[index]; }
+        const T* begin() const { return values_; }
+        const T* end() const { return values_ + size_; }
 
-    void swap(ValueList& other) {
-      const std::size_t max_size = std::max(size_, other.size_);
-      for (std::size_t i = 0; i < max_size; ++i) {
-        std::swap(values_[i], other.values_[i]);
-      }
-      std::swap(size_, other.size_);
-    }
+        void swap(ValueList& other) {
+            const std::size_t max_size = std::max(size_, other.size_);
+            for (std::size_t i = 0; i < max_size; ++i) {
+                std::swap(values_[i], other.values_[i]);
+            }
+            std::swap(size_, other.size_);
+        }
 
-   private:
-    T values_[MaxSize] = {};
-    std::size_t size_ = 0;
-  };
+    private:
+        T values_[MaxSize] = {};
+        std::size_t size_ = 0;
+    };
 
-  //Type of feature index list
-  class IndexList
-      : public ValueList<IndexType, RawFeatures::kMaxActiveDimensions> {
-  };
+    //Type of feature index list
+    class IndexList : public ValueList<IndexType, RawFeatures::kMaxActiveDimensions> {
+    };
 
 }  // namespace Eval::NNUE::Features
 
diff --git a/src/nnue/features/k.cpp b/src/nnue/features/k.cpp
index bd8d7dd0..8911abb7 100644
--- a/src/nnue/features/k.cpp
+++ b/src/nnue/features/k.cpp
@@ -1,46 +1,39 @@
-﻿//Definition of input feature quantity K of NNUE evaluation function
-
-#include "k.h"
+﻿#include "k.h"
 #include "index_list.h"
 
-namespace Eval {
+//Definition of input feature quantity K of NNUE evaluation function
+namespace Eval::NNUE::Features {
 
-namespace NNUE {
+    // Orient a square according to perspective (flip rank for black)
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * SQ_A8));
+    }
 
-namespace Features {
+    // Index of a feature for a given king position.
+    IndexType K::MakeIndex(Color perspective, Square s, Color king_color) {
+        return IndexType(orient(perspective, s) + bool(perspective ^ king_color) * 64);
+    }
 
-// Orient a square according to perspective (flip rank for black)
-inline Square orient(Color perspective, Square s) {
-  return Square(int(s) ^ (bool(perspective) * SQ_A8));
-}
+    // Get a list of indices with a value of 1 among the features
+    void K::AppendActiveIndices(
+        const Position& pos, Color perspective, IndexList* active) {
 
-// Index of a feature for a given king position.
-IndexType K::MakeIndex(Color perspective, Square s, Color king_color) {
-  return IndexType(orient(perspective, s) + bool(perspective ^ king_color) * 64);
-}
+        for (auto color : Colors) {
+          active->push_back(MakeIndex(perspective, pos.square<KING>(color), color));
+        }
+    }
 
-// Get a list of indices with a value of 1 among the features
-void K::AppendActiveIndices(
-    const Position& pos, Color perspective, IndexList* active) {
-  for (auto color : Colors) {
-    active->push_back(MakeIndex(perspective, pos.square<KING>(color), color));
-  }
-}
+    // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+    void K::AppendChangedIndices(
+        const Position& pos, Color perspective,
+        IndexList* removed, IndexList* added) {
 
-// Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-void K::AppendChangedIndices(
-    const Position& pos, Color perspective,
-    IndexList* removed, IndexList* added) {
-  const auto& dp = pos.state()->dirtyPiece;
-  if (type_of(dp.piece[0]) == KING)
-  {
-    removed->push_back(MakeIndex(perspective, dp.from[0], color_of(dp.piece[0])));
-    added->push_back(MakeIndex(perspective, dp.to[0], color_of(dp.piece[0])));
-  }
-}
+        const auto& dp = pos.state()->dirtyPiece;
+        if (type_of(dp.piece[0]) == KING)
+        {
+            removed->push_back(MakeIndex(perspective, dp.from[0], color_of(dp.piece[0])));
+            added->push_back(MakeIndex(perspective, dp.to[0], color_of(dp.piece[0])));
+        }
+    }
 
-}  // namespace Features
-
-}  // namespace NNUE
-
-}  // namespace Eval
+}  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/k.h b/src/nnue/features/k.h
index 9a0be4bb..c9726ab2 100644
--- a/src/nnue/features/k.h
+++ b/src/nnue/features/k.h
@@ -1,48 +1,44 @@
-﻿//Definition of input feature quantity K of NNUE evaluation function
-
-#ifndef _NNUE_FEATURES_K_H_
+﻿#ifndef _NNUE_FEATURES_K_H_
 #define _NNUE_FEATURES_K_H_
 
-#include "../../evaluate.h"
 #include "features_common.h"
 
-namespace Eval {
+#include "evaluate.h"
 
-namespace NNUE {
+//Definition of input feature quantity K of NNUE evaluation function
+namespace Eval::NNUE::Features {
 
-namespace Features {
+  // Feature K: Ball position
+  class K {
+  public:
+      // feature quantity name
+      static constexpr const char* kName = "K";
 
-// Feature K: Ball position
-class K {
- public:
-  // feature quantity name
-  static constexpr const char* kName = "K";
-  // Hash value embedded in the evaluation function file
-  static constexpr std::uint32_t kHashValue = 0xD3CEE169u;
-  // number of feature dimensions
-  static constexpr IndexType kDimensions = SQUARE_NB * 2;
-  // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-  static constexpr IndexType kMaxActiveDimensions = 2;
-  // Timing of full calculation instead of difference calculation
-  static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+      // Hash value embedded in the evaluation function file
+      static constexpr std::uint32_t kHashValue = 0xD3CEE169u;
 
-  // Get a list of indices with a value of 1 among the features
-  static void AppendActiveIndices(const Position& pos, Color perspective,
-                                  IndexList* active);
+      // number of feature dimensions
+      static constexpr IndexType kDimensions = SQUARE_NB * 2;
 
-  // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-  static void AppendChangedIndices(const Position& pos, Color perspective,
-                                   IndexList* removed, IndexList* added);
+      // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+      static constexpr IndexType kMaxActiveDimensions = 2;
 
-private:
-  // Index of a feature for a given king position.
-  static IndexType MakeIndex(Color perspective, Square s, Color king_color);
-};
+      // Timing of full calculation instead of difference calculation
+      static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
 
-}  // namespace Features
+      // Get a list of indices with a value of 1 among the features
+      static void AppendActiveIndices(const Position& pos, Color perspective,
+                                      IndexList* active);
 
-}  // namespace NNUE
+      // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+      static void AppendChangedIndices(const Position& pos, Color perspective,
+                                       IndexList* removed, IndexList* added);
 
-}  // namespace Eval
+  private:
+      // Index of a feature for a given king position.
+      static IndexType MakeIndex(Color perspective, Square s, Color king_color);
+  };
+
+}  // namespace Eval::NNUE::Features
 
 #endif
diff --git a/src/nnue/features/p.cpp b/src/nnue/features/p.cpp
index 012311ac..b4757284 100644
--- a/src/nnue/features/p.cpp
+++ b/src/nnue/features/p.cpp
@@ -1,52 +1,49 @@
-﻿//Definition of input feature P of NNUE evaluation function
-
-#include "p.h"
+﻿#include "p.h"
 #include "index_list.h"
 
-namespace Eval {
+//Definition of input feature P of NNUE evaluation function
+namespace Eval::NNUE::Features {
 
-namespace NNUE {
+    // Orient a square according to perspective (flip rank for black)
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * SQ_A8));
+    }
 
-namespace Features {
+    // Find the index of the feature quantity from the king position and PieceSquare
+    inline IndexType P::MakeIndex(
+        Color perspective, Square s, Piece pc) {
+        return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
+    }
 
-// Orient a square according to perspective (flip rank for black)
-inline Square orient(Color perspective, Square s) {
-  return Square(int(s) ^ (bool(perspective) * SQ_A8));
-}
+    // Get a list of indices with a value of 1 among the features
+    void P::AppendActiveIndices(
+        const Position& pos, Color perspective, IndexList* active) {
 
-// Find the index of the feature quantity from the king position and PieceSquare
-inline IndexType P::MakeIndex(
-  Color perspective, Square s, Piece pc) {
-  return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
-}
+        Bitboard bb = pos.pieces() & ~pos.pieces(KING);
+        while (bb) {
+            Square s = pop_lsb(&bb);
+            active->push_back(MakeIndex(perspective, s, pos.piece_on(s)));
+        }
+    }
 
-// Get a list of indices with a value of 1 among the features
-void P::AppendActiveIndices(
-    const Position& pos, Color perspective, IndexList* active) {
-  Bitboard bb = pos.pieces() & ~pos.pieces(KING);
-  while (bb) {
-    Square s = pop_lsb(&bb);
-    active->push_back(MakeIndex(perspective, s, pos.piece_on(s)));
-  }
-}
+    // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+    void P::AppendChangedIndices(
+        const Position& pos, Color perspective,
+        IndexList* removed, IndexList* added) {
 
-// Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-void P::AppendChangedIndices(
-    const Position& pos, Color perspective,
-    IndexList* removed, IndexList* added) {
-  const auto& dp = pos.state()->dirtyPiece;
-  for (int i = 0; i < dp.dirty_num; ++i) {
-    Piece pc = dp.piece[i];
-    if (type_of(pc) == KING) continue;
-    if (dp.from[i] != SQ_NONE)
-      removed->push_back(MakeIndex(perspective, dp.from[i], pc));
-    if (dp.to[i] != SQ_NONE)
-      added->push_back(MakeIndex(perspective, dp.to[i], pc));
-  }
-}
+        const auto& dp = pos.state()->dirtyPiece;
+        for (int i = 0; i < dp.dirty_num; ++i) {
+            Piece pc = dp.piece[i];
 
-}  // namespace Features
+            if (type_of(pc) == KING)
+              continue;
 
-}  // namespace NNUE
+            if (dp.from[i] != SQ_NONE)
+              removed->push_back(MakeIndex(perspective, dp.from[i], pc));
 
-}  // namespace Eval
+            if (dp.to[i] != SQ_NONE)
+              added->push_back(MakeIndex(perspective, dp.to[i], pc));
+        }
+    }
+
+}  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/p.h b/src/nnue/features/p.h
index 07d88952..6a8a5392 100644
--- a/src/nnue/features/p.h
+++ b/src/nnue/features/p.h
@@ -1,48 +1,44 @@
-﻿//Definition of input feature P of NNUE evaluation function
-
-#ifndef _NNUE_FEATURES_P_H_
+﻿#ifndef _NNUE_FEATURES_P_H_
 #define _NNUE_FEATURES_P_H_
 
-#include "../../evaluate.h"
 #include "features_common.h"
 
-namespace Eval {
+#include "evaluate.h"
 
-namespace NNUE {
+//Definition of input feature P of NNUE evaluation function
+namespace Eval::NNUE::Features {
 
-namespace Features {
+  // Feature P: PieceSquare of pieces other than balls
+  class P {
+  public:
+      // feature quantity name
+      static constexpr const char* kName = "P";
 
-// Feature P: PieceSquare of pieces other than balls
-class P {
- public:
-  // feature quantity name
-  static constexpr const char* kName = "P";
-  // Hash value embedded in the evaluation function file
-  static constexpr std::uint32_t kHashValue = 0x764CFB4Bu;
-  // number of feature dimensions
-  static constexpr IndexType kDimensions = PS_END;
-  // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-  static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
-  // Timing of full calculation instead of difference calculation
-  static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+      // Hash value embedded in the evaluation function file
+      static constexpr std::uint32_t kHashValue = 0x764CFB4Bu;
 
-  // Get a list of indices with a value of 1 among the features
-  static void AppendActiveIndices(const Position& pos, Color perspective,
-                                  IndexList* active);
+      // number of feature dimensions
+      static constexpr IndexType kDimensions = PS_END;
 
-  // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-  static void AppendChangedIndices(const Position& pos, Color perspective,
-                                   IndexList* removed, IndexList* added);
+      // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+      static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
 
- private:
-  // Index of a feature for a given piece on some square
-  static IndexType MakeIndex(Color perspective, Square s, Piece pc);
-};
+      // Timing of full calculation instead of difference calculation
+      static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
 
-}  // namespace Features
+      // Get a list of indices with a value of 1 among the features
+      static void AppendActiveIndices(const Position& pos, Color perspective,
+                                      IndexList* active);
 
-}  // namespace NNUE
+      // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+      static void AppendChangedIndices(const Position& pos, Color perspective,
+                                       IndexList* removed, IndexList* added);
 
-}  // namespace Eval
+  private:
+      // Index of a feature for a given piece on some square
+      static IndexType MakeIndex(Color perspective, Square s, Piece pc);
+  };
+
+}  // namespace Eval::NNUE::Features
 
 #endif

From 3041adb080558700cf4b77833305d974a1ca82c2 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 20:55:35 +0200
Subject: [PATCH 338/583] Cleanup layers.

---
 src/nnue/layers/affine_transform.h | 523 +++++++++++++++--------------
 src/nnue/layers/clipped_relu.h     | 299 +++++++++--------
 src/nnue/layers/input_slice.h      | 110 +++---
 src/nnue/layers/sum.h              | 257 +++++++-------
 4 files changed, 616 insertions(+), 573 deletions(-)

diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index f24578a8..cc5e5eef 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 // Definition of layer AffineTransform of NNUE evaluation function
@@ -21,267 +21,290 @@
 #ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
 #define NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
 
-#include <iostream>
-#include "../nnue_common.h"
+#include "nnue/nnue_common.h"
+
+#include <string>
+#include <type_traits>
+#include <cstdint>
 
 namespace Eval::NNUE::Layers {
 
-  // Affine transformation layer
-  template <typename PreviousLayer, IndexType OutputDimensions>
-  class AffineTransform {
-   public:
-    // Input/output type
-    using InputType = typename PreviousLayer::OutputType;
-    using OutputType = std::int32_t;
-    static_assert(std::is_same<InputType, std::uint8_t>::value, "");
+    // Affine transformation layer
+    template <typename PreviousLayer, IndexType OutputDimensions>
+    class AffineTransform {
+    public:
+        // Input/output type
+        using InputType = typename PreviousLayer::OutputType;
 
-    // Number of input/output dimensions
-    static constexpr IndexType kInputDimensions =
-        PreviousLayer::kOutputDimensions;
-    static constexpr IndexType kOutputDimensions = OutputDimensions;
-    static constexpr IndexType kPaddedInputDimensions =
-        CeilToMultiple<IndexType>(kInputDimensions, kMaxSimdWidth);
+        using OutputType = std::int32_t;
 
-    // Size of forward propagation buffer used in this layer
-    static constexpr std::size_t kSelfBufferSize =
-        CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+        static_assert(std::is_same<InputType, std::uint8_t>::value, "");
 
-    // Size of the forward propagation buffer used from the input layer to this layer
-    static constexpr std::size_t kBufferSize =
-        PreviousLayer::kBufferSize + kSelfBufferSize;
+        // Number of input/output dimensions
+        static constexpr IndexType kInputDimensions =
+            PreviousLayer::kOutputDimensions;
 
-    // Hash value embedded in the evaluation file
-    static constexpr std::uint32_t GetHashValue() {
-      std::uint32_t hash_value = 0xCC03DAE4u;
-      hash_value += kOutputDimensions;
-      hash_value ^= PreviousLayer::GetHashValue() >> 1;
-      hash_value ^= PreviousLayer::GetHashValue() << 31;
-      return hash_value;
-    }
+        static constexpr IndexType kOutputDimensions = OutputDimensions;
 
-    // A string that represents the structure from the input layer to this layer
-    static std::string GetStructureString() {
-      return "AffineTransform[" +
-        std::to_string(kOutputDimensions) + "<-" +
-        std::to_string(kInputDimensions) + "](" +
-        PreviousLayer::GetStructureString() + ")";
-    }
-    
-   // Read network parameters
-    bool ReadParameters(std::istream& stream) {
-      if (!previous_layer_.ReadParameters(stream)) return false;
-      for (std::size_t i = 0; i < kOutputDimensions; ++i)
-        biases_[i] = read_little_endian<BiasType>(stream);
-      for (std::size_t i = 0; i < kOutputDimensions * kPaddedInputDimensions; ++i)
-        weights_[i] = read_little_endian<WeightType>(stream);
-      return !stream.fail();
-    }
+        static constexpr IndexType kPaddedInputDimensions =
+            CeilToMultiple<IndexType>(kInputDimensions, kMaxSimdWidth);
 
-    // write parameters
-    bool WriteParameters(std::ostream& stream) const {
-      if (!previous_layer_.WriteParameters(stream)) return false;
-      stream.write(reinterpret_cast<const char*>(biases_),
-        kOutputDimensions * sizeof(BiasType));
-      stream.write(reinterpret_cast<const char*>(weights_),
-        kOutputDimensions * kPaddedInputDimensions *
-        sizeof(WeightType));
-      return !stream.fail();
-    }
+        // Size of forward propagation buffer used in this layer
+        static constexpr std::size_t kSelfBufferSize =
+            CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
 
-    // Forward propagation
-    const OutputType* Propagate(
-        const TransformedFeatureType* transformed_features, char* buffer) const {
-      const auto input = previous_layer_.Propagate(
-          transformed_features, buffer + kSelfBufferSize);
-      const auto output = reinterpret_cast<OutputType*>(buffer);
+        // Size of the forward propagation buffer used from the input layer to this layer
+        static constexpr std::size_t kBufferSize =
+            PreviousLayer::kBufferSize + kSelfBufferSize;
 
-  #if defined(USE_AVX512)
-      constexpr IndexType kNumChunks = kPaddedInputDimensions / (kSimdWidth * 2);
-      const auto input_vector = reinterpret_cast<const __m512i*>(input);
-  #if !defined(USE_VNNI)
-      const __m512i kOnes = _mm512_set1_epi16(1);
-  #endif
-
-  #elif defined(USE_AVX2)
-      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
-      const auto input_vector = reinterpret_cast<const __m256i*>(input);
-  #if !defined(USE_VNNI)
-      const __m256i kOnes = _mm256_set1_epi16(1);
-  #endif
-
-  #elif defined(USE_SSE2)
-      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
-  #ifndef USE_SSSE3
-      const __m128i kZeros = _mm_setzero_si128();
-  #else
-      const __m128i kOnes = _mm_set1_epi16(1);
-  #endif
-      const auto input_vector = reinterpret_cast<const __m128i*>(input);
-
-  #elif defined(USE_MMX)
-      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
-      const __m64 kZeros = _mm_setzero_si64();
-      const auto input_vector = reinterpret_cast<const __m64*>(input);
-
-  #elif defined(USE_NEON)
-      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
-      const auto input_vector = reinterpret_cast<const int8x8_t*>(input);
-  #endif
-
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        const IndexType offset = i * kPaddedInputDimensions;
-
-  #if defined(USE_AVX512)
-        __m512i sum = _mm512_setzero_si512();
-        const auto row = reinterpret_cast<const __m512i*>(&weights_[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-  #if defined(USE_VNNI)
-            sum = _mm512_dpbusd_epi32(sum, _mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
-  #else
-            __m512i product = _mm512_maddubs_epi16(_mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
-            product = _mm512_madd_epi16(product, kOnes);
-            sum = _mm512_add_epi32(sum, product);
-  #endif
+        // Hash value embedded in the evaluation file
+        static constexpr std::uint32_t GetHashValue() {
+            std::uint32_t hash_value = 0xCC03DAE4u;
+            hash_value += kOutputDimensions;
+            hash_value ^= PreviousLayer::GetHashValue() >> 1;
+            hash_value ^= PreviousLayer::GetHashValue() << 31;
+            return hash_value;
         }
 
-        // Note: Changing kMaxSimdWidth from 32 to 64 breaks loading existing networks.
-        // As a result kPaddedInputDimensions may not be an even multiple of 64(512bit)
-        // and we have to do one more 256bit chunk.
-        if (kPaddedInputDimensions != kNumChunks * kSimdWidth * 2)
-        {
-            const auto iv256  = reinterpret_cast<const __m256i*>(&input_vector[kNumChunks]);
-            const auto row256 = reinterpret_cast<const __m256i*>(&row[kNumChunks]);
-  #if defined(USE_VNNI)
-            __m256i product256 = _mm256_dpbusd_epi32(
-                _mm512_castsi512_si256(sum), _mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
-            sum = _mm512_inserti32x8(sum, product256, 0);
-  #else
-            __m256i product256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
-            sum = _mm512_add_epi32(sum, _mm512_cvtepi16_epi32(product256));
-  #endif
+        // A string that represents the structure from the input layer to this layer
+        static std::string GetStructureString() {
+            return "AffineTransform[" +
+                std::to_string(kOutputDimensions) + "<-" +
+                std::to_string(kInputDimensions) + "](" +
+                PreviousLayer::GetStructureString() + ")";
         }
-        output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];
 
-  #elif defined(USE_AVX2)
-        __m256i sum = _mm256_setzero_si256();
-        const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-  #if defined(USE_VNNI)
-          sum = _mm256_dpbusd_epi32(sum, _mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
-  #else
-          __m256i product = _mm256_maddubs_epi16(_mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
-          product = _mm256_madd_epi16(product, kOnes);
-          sum = _mm256_add_epi32(sum, product);
-  #endif
+       // Read network parameters
+        bool ReadParameters(std::istream& stream) {
+            if (!previous_layer_.ReadParameters(stream))
+                return false;
+
+            for (std::size_t i = 0; i < kOutputDimensions; ++i)
+                biases_[i] = read_little_endian<BiasType>(stream);
+
+            for (std::size_t i = 0; i < kOutputDimensions * kPaddedInputDimensions; ++i)
+                weights_[i] = read_little_endian<WeightType>(stream);
+
+            return !stream.fail();
         }
-        __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
-        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
-        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB));
-        output[i] = _mm_cvtsi128_si32(sum128) + biases_[i];
 
-  #elif defined(USE_SSSE3)
-        __m128i sum = _mm_setzero_si128();
-        const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
-        for (int j = 0; j < (int)kNumChunks - 1; j += 2) {
-          __m128i product0 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
-          product0 = _mm_madd_epi16(product0, kOnes);
-          sum = _mm_add_epi32(sum, product0);
-          __m128i product1 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j+1]), _mm_load_si128(&row[j+1]));
-          product1 = _mm_madd_epi16(product1, kOnes);
-          sum = _mm_add_epi32(sum, product1);
+        // write parameters
+        bool WriteParameters(std::ostream& stream) const {
+            if (!previous_layer_.WriteParameters(stream))
+                return false;
+
+            stream.write(reinterpret_cast<const char*>(biases_),
+                kOutputDimensions * sizeof(BiasType));
+
+            stream.write(reinterpret_cast<const char*>(weights_),
+                kOutputDimensions * kPaddedInputDimensions *
+                sizeof(WeightType));
+
+            return !stream.fail();
         }
-        if (kNumChunks & 0x1) {
-          __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[kNumChunks-1]), _mm_load_si128(&row[kNumChunks-1]));
-          product = _mm_madd_epi16(product, kOnes);
-          sum = _mm_add_epi32(sum, product);
+
+        // Forward propagation
+        const OutputType* Propagate(
+            const TransformedFeatureType* transformed_features, char* buffer) const {
+
+            const auto input = previous_layer_.Propagate(
+                transformed_features, buffer + kSelfBufferSize);
+            const auto output = reinterpret_cast<OutputType*>(buffer);
+
+#if defined(USE_AVX512)
+            constexpr IndexType kNumChunks = kPaddedInputDimensions / (kSimdWidth * 2);
+            const auto input_vector = reinterpret_cast<const __m512i*>(input);
+#if !defined(USE_VNNI)
+            const __m512i kOnes = _mm512_set1_epi16(1);
+#endif
+
+#elif defined(USE_AVX2)
+            constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+            const auto input_vector = reinterpret_cast<const __m256i*>(input);
+#if !defined(USE_VNNI)
+            const __m256i kOnes = _mm256_set1_epi16(1);
+#endif
+
+#elif defined(USE_SSE2)
+            constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+#ifndef USE_SSSE3
+            const __m128i kZeros = _mm_setzero_si128();
+#else
+            const __m128i kOnes = _mm_set1_epi16(1);
+#endif
+            const auto input_vector = reinterpret_cast<const __m128i*>(input);
+
+#elif defined(USE_MMX)
+            constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+            const __m64 kZeros = _mm_setzero_si64();
+            const auto input_vector = reinterpret_cast<const __m64*>(input);
+
+#elif defined(USE_NEON)
+            constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+            const auto input_vector = reinterpret_cast<const int8x8_t*>(input);
+#endif
+
+            for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                const IndexType offset = i * kPaddedInputDimensions;
+
+#if defined(USE_AVX512)
+                __m512i sum = _mm512_setzero_si512();
+                const auto row = reinterpret_cast<const __m512i*>(&weights_[offset]);
+                for (IndexType j = 0; j < kNumChunks; ++j) {
+#if defined(USE_VNNI)
+                    sum = _mm512_dpbusd_epi32(sum, _mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
+#else
+                    __m512i product = _mm512_maddubs_epi16(_mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
+                    product = _mm512_madd_epi16(product, kOnes);
+                    sum = _mm512_add_epi32(sum, product);
+#endif
+                }
+
+                // Note: Changing kMaxSimdWidth from 32 to 64 breaks loading existing networks.
+                // As a result kPaddedInputDimensions may not be an even multiple of 64(512bit)
+                // and we have to do one more 256bit chunk.
+                if (kPaddedInputDimensions != kNumChunks * kSimdWidth * 2)
+                {
+                    const auto iv256  = reinterpret_cast<const __m256i*>(&input_vector[kNumChunks]);
+                    const auto row256 = reinterpret_cast<const __m256i*>(&row[kNumChunks]);
+#if defined(USE_VNNI)
+                    __m256i product256 = _mm256_dpbusd_epi32(
+                        _mm512_castsi512_si256(sum), _mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
+                    sum = _mm512_inserti32x8(sum, product256, 0);
+#else
+                    __m256i product256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
+                    sum = _mm512_add_epi32(sum, _mm512_cvtepi16_epi32(product256));
+#endif
+                }
+
+                output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];
+
+#elif defined(USE_AVX2)
+                __m256i sum = _mm256_setzero_si256();
+                const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
+                for (IndexType j = 0; j < kNumChunks; ++j) {
+#if defined(USE_VNNI)
+                    sum = _mm256_dpbusd_epi32(sum, _mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
+#else
+                    __m256i product = _mm256_maddubs_epi16(_mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
+                    product = _mm256_madd_epi16(product, kOnes);
+                    sum = _mm256_add_epi32(sum, product);
+#endif
+                }
+
+                __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
+                sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
+                sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB));
+                output[i] = _mm_cvtsi128_si32(sum128) + biases_[i];
+
+#elif defined(USE_SSSE3)
+                __m128i sum = _mm_setzero_si128();
+                const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
+                for (int j = 0; j < (int)kNumChunks - 1; j += 2) {
+                    __m128i product0 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
+                    product0 = _mm_madd_epi16(product0, kOnes);
+                    sum = _mm_add_epi32(sum, product0);
+                    __m128i product1 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j+1]), _mm_load_si128(&row[j+1]));
+                    product1 = _mm_madd_epi16(product1, kOnes);
+                    sum = _mm_add_epi32(sum, product1);
+                }
+
+                if (kNumChunks & 0x1) {
+                    __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[kNumChunks-1]), _mm_load_si128(&row[kNumChunks-1]));
+                    product = _mm_madd_epi16(product, kOnes);
+                    sum = _mm_add_epi32(sum, product);
+                }
+
+                sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC
+                sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
+                output[i] = _mm_cvtsi128_si32(sum) + biases_[i];
+
+#elif defined(USE_SSE2)
+                __m128i sum_lo = _mm_cvtsi32_si128(biases_[i]);
+                __m128i sum_hi = kZeros;
+                const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
+                for (IndexType j = 0; j < kNumChunks; ++j) {
+                    __m128i row_j = _mm_load_si128(&row[j]);
+                    __m128i input_j = _mm_load_si128(&input_vector[j]);
+                    __m128i row_signs = _mm_cmpgt_epi8(kZeros, row_j);
+                    __m128i extended_row_lo = _mm_unpacklo_epi8(row_j, row_signs);
+                    __m128i extended_row_hi = _mm_unpackhi_epi8(row_j, row_signs);
+                    __m128i extended_input_lo = _mm_unpacklo_epi8(input_j, kZeros);
+                    __m128i extended_input_hi = _mm_unpackhi_epi8(input_j, kZeros);
+                    __m128i product_lo = _mm_madd_epi16(extended_row_lo, extended_input_lo);
+                    __m128i product_hi = _mm_madd_epi16(extended_row_hi, extended_input_hi);
+                    sum_lo = _mm_add_epi32(sum_lo, product_lo);
+                    sum_hi = _mm_add_epi32(sum_hi, product_hi);
+                }
+
+                __m128i sum = _mm_add_epi32(sum_lo, sum_hi);
+                __m128i sum_high_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2));
+                sum = _mm_add_epi32(sum, sum_high_64);
+                __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2));
+                sum = _mm_add_epi32(sum, sum_second_32);
+                output[i] = _mm_cvtsi128_si32(sum);
+
+#elif defined(USE_MMX)
+                __m64 sum_lo = _mm_cvtsi32_si64(biases_[i]);
+                __m64 sum_hi = kZeros;
+                const auto row = reinterpret_cast<const __m64*>(&weights_[offset]);
+                for (IndexType j = 0; j < kNumChunks; ++j) {
+                    __m64 row_j = row[j];
+                    __m64 input_j = input_vector[j];
+                    __m64 row_signs = _mm_cmpgt_pi8(kZeros, row_j);
+                    __m64 extended_row_lo = _mm_unpacklo_pi8(row_j, row_signs);
+                    __m64 extended_row_hi = _mm_unpackhi_pi8(row_j, row_signs);
+                    __m64 extended_input_lo = _mm_unpacklo_pi8(input_j, kZeros);
+                    __m64 extended_input_hi = _mm_unpackhi_pi8(input_j, kZeros);
+                    __m64 product_lo = _mm_madd_pi16(extended_row_lo, extended_input_lo);
+                    __m64 product_hi = _mm_madd_pi16(extended_row_hi, extended_input_hi);
+                    sum_lo = _mm_add_pi32(sum_lo, product_lo);
+                    sum_hi = _mm_add_pi32(sum_hi, product_hi);
+                }
+
+                __m64 sum = _mm_add_pi32(sum_lo, sum_hi);
+                sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum));
+                output[i] = _mm_cvtsi64_si32(sum);
+
+#elif defined(USE_NEON)
+                int32x4_t sum = {biases_[i]};
+                const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]);
+                for (IndexType j = 0; j < kNumChunks; ++j) {
+                    int16x8_t product = vmull_s8(input_vector[j * 2], row[j * 2]);
+                    product = vmlal_s8(product, input_vector[j * 2 + 1], row[j * 2 + 1]);
+                    sum = vpadalq_s16(sum, product);
+                }
+
+                output[i] = sum[0] + sum[1] + sum[2] + sum[3];
+
+#else
+                OutputType sum = biases_[i];
+                for (IndexType j = 0; j < kInputDimensions; ++j) {
+                    sum += weights_[offset + j] * input[j];
+                }
+
+                output[i] = sum;
+#endif
+
+            }
+#if defined(USE_MMX)
+            _mm_empty();
+#endif
+            return output;
         }
-        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC
-        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
-        output[i] = _mm_cvtsi128_si32(sum) + biases_[i];
 
-  #elif defined(USE_SSE2)
-        __m128i sum_lo = _mm_cvtsi32_si128(biases_[i]);
-        __m128i sum_hi = kZeros;
-        const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m128i row_j = _mm_load_si128(&row[j]);
-          __m128i input_j = _mm_load_si128(&input_vector[j]);
-          __m128i row_signs = _mm_cmpgt_epi8(kZeros, row_j);
-          __m128i extended_row_lo = _mm_unpacklo_epi8(row_j, row_signs);
-          __m128i extended_row_hi = _mm_unpackhi_epi8(row_j, row_signs);
-          __m128i extended_input_lo = _mm_unpacklo_epi8(input_j, kZeros);
-          __m128i extended_input_hi = _mm_unpackhi_epi8(input_j, kZeros);
-          __m128i product_lo = _mm_madd_epi16(extended_row_lo, extended_input_lo);
-          __m128i product_hi = _mm_madd_epi16(extended_row_hi, extended_input_hi);
-          sum_lo = _mm_add_epi32(sum_lo, product_lo);
-          sum_hi = _mm_add_epi32(sum_hi, product_hi);
-        }
-        __m128i sum = _mm_add_epi32(sum_lo, sum_hi);
-        __m128i sum_high_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2));
-        sum = _mm_add_epi32(sum, sum_high_64);
-        __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2));
-        sum = _mm_add_epi32(sum, sum_second_32);
-        output[i] = _mm_cvtsi128_si32(sum);
+    private:
+        using BiasType = OutputType;
+        using WeightType = std::int8_t;
 
-  #elif defined(USE_MMX)
-        __m64 sum_lo = _mm_cvtsi32_si64(biases_[i]);
-        __m64 sum_hi = kZeros;
-        const auto row = reinterpret_cast<const __m64*>(&weights_[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m64 row_j = row[j];
-          __m64 input_j = input_vector[j];
-          __m64 row_signs = _mm_cmpgt_pi8(kZeros, row_j);
-          __m64 extended_row_lo = _mm_unpacklo_pi8(row_j, row_signs);
-          __m64 extended_row_hi = _mm_unpackhi_pi8(row_j, row_signs);
-          __m64 extended_input_lo = _mm_unpacklo_pi8(input_j, kZeros);
-          __m64 extended_input_hi = _mm_unpackhi_pi8(input_j, kZeros);
-          __m64 product_lo = _mm_madd_pi16(extended_row_lo, extended_input_lo);
-          __m64 product_hi = _mm_madd_pi16(extended_row_hi, extended_input_hi);
-          sum_lo = _mm_add_pi32(sum_lo, product_lo);
-          sum_hi = _mm_add_pi32(sum_hi, product_hi);
-        }
-        __m64 sum = _mm_add_pi32(sum_lo, sum_hi);
-        sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum));
-        output[i] = _mm_cvtsi64_si32(sum);
+        // Make the learning class a friend
+        friend class Trainer<AffineTransform>;
 
-  #elif defined(USE_NEON)
-        int32x4_t sum = {biases_[i]};
-        const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-          int16x8_t product = vmull_s8(input_vector[j * 2], row[j * 2]);
-          product = vmlal_s8(product, input_vector[j * 2 + 1], row[j * 2 + 1]);
-          sum = vpadalq_s16(sum, product);
-        }
-        output[i] = sum[0] + sum[1] + sum[2] + sum[3];
+        PreviousLayer previous_layer_;
 
-  #else
-        OutputType sum = biases_[i];
-        for (IndexType j = 0; j < kInputDimensions; ++j) {
-          sum += weights_[offset + j] * input[j];
-        }
-        output[i] = sum;
-  #endif
-
-      }
-  #if defined(USE_MMX)
-      _mm_empty();
-  #endif
-      return output;
-    }
-
-   private:
-    using BiasType = OutputType;
-    using WeightType = std::int8_t;
-
-    // Make the learning class a friend
-    friend class Trainer<AffineTransform>;
-
-    PreviousLayer previous_layer_;
-
-    alignas(kCacheLineSize) BiasType biases_[kOutputDimensions];
-    alignas(kCacheLineSize)
-        WeightType weights_[kOutputDimensions * kPaddedInputDimensions];
-  };
+        alignas(kCacheLineSize) BiasType biases_[kOutputDimensions];
+        alignas(kCacheLineSize) WeightType weights_[kOutputDimensions * kPaddedInputDimensions];
+    };
 
 }  // namespace Eval::NNUE::Layers
 
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h
index d923986e..0846f3df 100644
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 // Definition of layer ClippedReLU of NNUE evaluation function
@@ -21,160 +21,169 @@
 #ifndef NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
 #define NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
 
-#include "../nnue_common.h"
+#include "nnue/nnue_common.h"
+
+#include <string>
+#include <cstdint>
+#include <type_traits>
 
 namespace Eval::NNUE::Layers {
 
-  // Clipped ReLU
-  template <typename PreviousLayer>
-  class ClippedReLU {
-   public:
-    // Input/output type
-    using InputType = typename PreviousLayer::OutputType;
-    using OutputType = std::uint8_t;
-    static_assert(std::is_same<InputType, std::int32_t>::value, "");
+    // Clipped ReLU
+    template <typename PreviousLayer>
+    class ClippedReLU {
+    public:
+        // Input/output type
+        using InputType = typename PreviousLayer::OutputType;
 
-    // Number of input/output dimensions
-    static constexpr IndexType kInputDimensions =
-        PreviousLayer::kOutputDimensions;
-    static constexpr IndexType kOutputDimensions = kInputDimensions;
+        using OutputType = std::uint8_t;
 
-    // Size of forward propagation buffer used in this layer
-    static constexpr std::size_t kSelfBufferSize =
-        CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+        static_assert(std::is_same<InputType, std::int32_t>::value, "");
 
-    // Size of the forward propagation buffer used from the input layer to this layer
-    static constexpr std::size_t kBufferSize =
-        PreviousLayer::kBufferSize + kSelfBufferSize;
+        // Number of input/output dimensions
+        static constexpr IndexType kInputDimensions =
+            PreviousLayer::kOutputDimensions;
 
-    // Hash value embedded in the evaluation file
-    static constexpr std::uint32_t GetHashValue() {
-      std::uint32_t hash_value = 0x538D24C7u;
-      hash_value += PreviousLayer::GetHashValue();
-      return hash_value;
-    }
+        static constexpr IndexType kOutputDimensions = kInputDimensions;
 
-    // A string that represents the structure from the input layer to this layer
-    static std::string GetStructureString() {
-      return "ClippedReLU[" +
-        std::to_string(kOutputDimensions) + "](" +
-        PreviousLayer::GetStructureString() + ")";
-    }
+        // Size of forward propagation buffer used in this layer
+        static constexpr std::size_t kSelfBufferSize =
+            CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
 
-    // Read network parameters
-    bool ReadParameters(std::istream& stream) {
-      return previous_layer_.ReadParameters(stream);
-    }
+        // Size of the forward propagation buffer used from the input layer to this layer
+        static constexpr std::size_t kBufferSize =
+            PreviousLayer::kBufferSize + kSelfBufferSize;
 
-    // write parameters
-    bool WriteParameters(std::ostream& stream) const {
-      return previous_layer_.WriteParameters(stream);
-    }
+        // Hash value embedded in the evaluation file
+        static constexpr std::uint32_t GetHashValue() {
+            std::uint32_t hash_value = 0x538D24C7u;
+            hash_value += PreviousLayer::GetHashValue();
+            return hash_value;
+        }
 
-    // Forward propagation
-    const OutputType* Propagate(
-        const TransformedFeatureType* transformed_features, char* buffer) const {
-      const auto input = previous_layer_.Propagate(
-          transformed_features, buffer + kSelfBufferSize);
-      const auto output = reinterpret_cast<OutputType*>(buffer);
+        // A string that represents the structure from the input layer to this layer
+        static std::string GetStructureString() {
+            return "ClippedReLU[" +
+                std::to_string(kOutputDimensions) + "](" +
+                PreviousLayer::GetStructureString() + ")";
+        }
 
-  #if defined(USE_AVX2)
-      constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
-      const __m256i kZero = _mm256_setzero_si256();
-      const __m256i kOffsets = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
-      const auto in = reinterpret_cast<const __m256i*>(input);
-      const auto out = reinterpret_cast<__m256i*>(output);
-      for (IndexType i = 0; i < kNumChunks; ++i) {
-        const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
-            _mm256_loadA_si256(&in[i * 4 + 0]),
-            _mm256_loadA_si256(&in[i * 4 + 1])), kWeightScaleBits);
-        const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
-            _mm256_loadA_si256(&in[i * 4 + 2]),
-            _mm256_loadA_si256(&in[i * 4 + 3])), kWeightScaleBits);
-        _mm256_storeA_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
-            _mm256_packs_epi16(words0, words1), kZero), kOffsets));
-      }
-      constexpr IndexType kStart = kNumChunks * kSimdWidth;
+        // Read network parameters
+        bool ReadParameters(std::istream& stream) {
+            return previous_layer_.ReadParameters(stream);
+        }
 
-  #elif defined(USE_SSE2)
-      constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
+        // write parameters
+        bool WriteParameters(std::ostream& stream) const {
+            return previous_layer_.WriteParameters(stream);
+        }
 
-  #ifdef USE_SSE41
-      const __m128i kZero = _mm_setzero_si128();
-  #else
-      const __m128i k0x80s = _mm_set1_epi8(-128);
-  #endif
+        // Forward propagation
+        const OutputType* Propagate(
+            const TransformedFeatureType* transformed_features, char* buffer) const {
 
-      const auto in = reinterpret_cast<const __m128i*>(input);
-      const auto out = reinterpret_cast<__m128i*>(output);
-      for (IndexType i = 0; i < kNumChunks; ++i) {
-        const __m128i words0 = _mm_srai_epi16(_mm_packs_epi32(
-            _mm_load_si128(&in[i * 4 + 0]),
-            _mm_load_si128(&in[i * 4 + 1])), kWeightScaleBits);
-        const __m128i words1 = _mm_srai_epi16(_mm_packs_epi32(
-            _mm_load_si128(&in[i * 4 + 2]),
-            _mm_load_si128(&in[i * 4 + 3])), kWeightScaleBits);
-        const __m128i packedbytes = _mm_packs_epi16(words0, words1);
-        _mm_store_si128(&out[i],
+            const auto input = previous_layer_.Propagate(
+                transformed_features, buffer + kSelfBufferSize);
+            const auto output = reinterpret_cast<OutputType*>(buffer);
 
-  #ifdef USE_SSE41
-          _mm_max_epi8(packedbytes, kZero)
-  #else
-          _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
-  #endif
+#if defined(USE_AVX2)
+            constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
+            const __m256i kZero = _mm256_setzero_si256();
+            const __m256i kOffsets = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+            const auto in = reinterpret_cast<const __m256i*>(input);
+            const auto out = reinterpret_cast<__m256i*>(output);
+            for (IndexType i = 0; i < kNumChunks; ++i) {
+                const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
+                    _mm256_loadA_si256(&in[i * 4 + 0]),
+                    _mm256_loadA_si256(&in[i * 4 + 1])), kWeightScaleBits);
+                const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
+                    _mm256_loadA_si256(&in[i * 4 + 2]),
+                    _mm256_loadA_si256(&in[i * 4 + 3])), kWeightScaleBits);
+                _mm256_storeA_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
+                    _mm256_packs_epi16(words0, words1), kZero), kOffsets));
+            }
 
-        );
-      }
-      constexpr IndexType kStart = kNumChunks * kSimdWidth;
+            constexpr IndexType kStart = kNumChunks * kSimdWidth;
 
-  #elif defined(USE_MMX)
-      constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
-      const __m64 k0x80s = _mm_set1_pi8(-128);
-      const auto in = reinterpret_cast<const __m64*>(input);
-      const auto out = reinterpret_cast<__m64*>(output);
-      for (IndexType i = 0; i < kNumChunks; ++i) {
-        const __m64 words0 = _mm_srai_pi16(
-            _mm_packs_pi32(in[i * 4 + 0], in[i * 4 + 1]),
-            kWeightScaleBits);
-        const __m64 words1 = _mm_srai_pi16(
-            _mm_packs_pi32(in[i * 4 + 2], in[i * 4 + 3]),
-            kWeightScaleBits);
-        const __m64 packedbytes = _mm_packs_pi16(words0, words1);
-        out[i] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
-      }
-      _mm_empty();
-      constexpr IndexType kStart = kNumChunks * kSimdWidth;
+#elif defined(USE_SSE2)
+            constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
 
-  #elif defined(USE_NEON)
-      constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2);
-      const int8x8_t kZero = {0};
-      const auto in = reinterpret_cast<const int32x4_t*>(input);
-      const auto out = reinterpret_cast<int8x8_t*>(output);
-      for (IndexType i = 0; i < kNumChunks; ++i) {
-        int16x8_t shifted;
-        const auto pack = reinterpret_cast<int16x4_t*>(&shifted);
-        pack[0] = vqshrn_n_s32(in[i * 2 + 0], kWeightScaleBits);
-        pack[1] = vqshrn_n_s32(in[i * 2 + 1], kWeightScaleBits);
-        out[i] = vmax_s8(vqmovn_s16(shifted), kZero);
-      }
-      constexpr IndexType kStart = kNumChunks * (kSimdWidth / 2);
-  #else
-      constexpr IndexType kStart = 0;
-  #endif
+#if defined(USE_SSE41)
+            const __m128i kZero = _mm_setzero_si128();
+#else
+            const __m128i k0x80s = _mm_set1_epi8(-128);
+#endif
 
-      for (IndexType i = kStart; i < kInputDimensions; ++i) {
-        output[i] = static_cast<OutputType>(
-            std::max(0, std::min(127, input[i] >> kWeightScaleBits)));
-      }
-      return output;
-    }
+            const auto in = reinterpret_cast<const __m128i*>(input);
+            const auto out = reinterpret_cast<__m128i*>(output);
+            for (IndexType i = 0; i < kNumChunks; ++i) {
+                const __m128i words0 = _mm_srai_epi16(_mm_packs_epi32(
+                    _mm_load_si128(&in[i * 4 + 0]),
+                    _mm_load_si128(&in[i * 4 + 1])), kWeightScaleBits);
+                const __m128i words1 = _mm_srai_epi16(_mm_packs_epi32(
+                    _mm_load_si128(&in[i * 4 + 2]),
+                    _mm_load_si128(&in[i * 4 + 3])), kWeightScaleBits);
+                const __m128i packedbytes = _mm_packs_epi16(words0, words1);
+                _mm_store_si128(&out[i],
 
-   private:
-     // Make the learning class a friend
-     friend class Trainer<ClippedReLU>;
-     
-    PreviousLayer previous_layer_;
-  };
+#if defined(USE_SSE41)
+                    _mm_max_epi8(packedbytes, kZero)
+            #else
+                    _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
+#endif
+
+                );
+            }
+            constexpr IndexType kStart = kNumChunks * kSimdWidth;
+
+#elif defined(USE_MMX)
+            constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
+            const __m64 k0x80s = _mm_set1_pi8(-128);
+            const auto in = reinterpret_cast<const __m64*>(input);
+            const auto out = reinterpret_cast<__m64*>(output);
+            for (IndexType i = 0; i < kNumChunks; ++i) {
+                const __m64 words0 = _mm_srai_pi16(
+                    _mm_packs_pi32(in[i * 4 + 0], in[i * 4 + 1]),
+                    kWeightScaleBits);
+                const __m64 words1 = _mm_srai_pi16(
+                    _mm_packs_pi32(in[i * 4 + 2], in[i * 4 + 3]),
+                    kWeightScaleBits);
+                const __m64 packedbytes = _mm_packs_pi16(words0, words1);
+                out[i] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
+            }
+            _mm_empty();
+            constexpr IndexType kStart = kNumChunks * kSimdWidth;
+
+#elif defined(USE_NEON)
+            constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2);
+            const int8x8_t kZero = {0};
+            const auto in = reinterpret_cast<const int32x4_t*>(input);
+            const auto out = reinterpret_cast<int8x8_t*>(output);
+            for (IndexType i = 0; i < kNumChunks; ++i) {
+                int16x8_t shifted;
+                const auto pack = reinterpret_cast<int16x4_t*>(&shifted);
+                pack[0] = vqshrn_n_s32(in[i * 2 + 0], kWeightScaleBits);
+                pack[1] = vqshrn_n_s32(in[i * 2 + 1], kWeightScaleBits);
+                out[i] = vmax_s8(vqmovn_s16(shifted), kZero);
+            }
+            constexpr IndexType kStart = kNumChunks * (kSimdWidth / 2);
+#else
+            constexpr IndexType kStart = 0;
+#endif
+
+            for (IndexType i = kStart; i < kInputDimensions; ++i) {
+                output[i] = static_cast<OutputType>(
+                    std::max(0, std::min(127, input[i] >> kWeightScaleBits)));
+            }
+            return output;
+        }
+
+    private:
+        // Make the learning class a friend
+        friend class Trainer<ClippedReLU>;
+
+        PreviousLayer previous_layer_;
+    };
 
 }  // namespace Eval::NNUE::Layers
 
diff --git a/src/nnue/layers/input_slice.h b/src/nnue/layers/input_slice.h
index 78756a39..9d9476a5 100644
--- a/src/nnue/layers/input_slice.h
+++ b/src/nnue/layers/input_slice.h
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 // NNUE evaluation function layer InputSlice definition
@@ -21,59 +21,63 @@
 #ifndef NNUE_LAYERS_INPUT_SLICE_H_INCLUDED
 #define NNUE_LAYERS_INPUT_SLICE_H_INCLUDED
 
-#include "../nnue_common.h"
+#include "nnue/nnue_common.h"
+
+#include <string>
+#include <cstdint>
 
 namespace Eval::NNUE::Layers {
 
-// Input layer
-template <IndexType OutputDimensions, IndexType Offset = 0>
-class InputSlice {
- public:
-  // Need to maintain alignment
-  static_assert(Offset % kMaxSimdWidth == 0, "");
+  // Input layer
+  template <IndexType OutputDimensions, IndexType Offset = 0>
+  class InputSlice {
+  public:
+      // Need to maintain alignment
+      static_assert(Offset % kMaxSimdWidth == 0, "");
 
-  // Output type
-  using OutputType = TransformedFeatureType;
+      // Output type
+      using OutputType = TransformedFeatureType;
 
-  // Output dimensionality
-  static constexpr IndexType kOutputDimensions = OutputDimensions;
+      // Output dimensionality
+      static constexpr IndexType kOutputDimensions = OutputDimensions;
 
-  // Size of forward propagation buffer used from the input layer to this layer
-  static constexpr std::size_t kBufferSize = 0;
+      // Size of forward propagation buffer used from the input layer to this layer
+      static constexpr std::size_t kBufferSize = 0;
 
-  // Hash value embedded in the evaluation file
-  static constexpr std::uint32_t GetHashValue() {
-    std::uint32_t hash_value = 0xEC42E90Du;
-    hash_value ^= kOutputDimensions ^ (Offset << 10);
-    return hash_value;
-  }
+      // Hash value embedded in the evaluation file
+      static constexpr std::uint32_t GetHashValue() {
+          std::uint32_t hash_value = 0xEC42E90Du;
+          hash_value ^= kOutputDimensions ^ (Offset << 10);
+          return hash_value;
+      }
 
-  // A string that represents the structure from the input layer to this layer
-  static std::string GetStructureString() {
-    return "InputSlice[" + std::to_string(kOutputDimensions) + "(" +
-      std::to_string(Offset) + ":" +
-      std::to_string(Offset + kOutputDimensions) + ")]";
-  }
+      // A string that represents the structure from the input layer to this layer
+      static std::string GetStructureString() {
+          return "InputSlice[" + std::to_string(kOutputDimensions) + "(" +
+              std::to_string(Offset) + ":" +
+              std::to_string(Offset + kOutputDimensions) + ")]";
+      }
 
-  // Read network parameters
-  bool ReadParameters(std::istream& /*stream*/) {
-    return true;
-  }
+      // Read network parameters
+      bool ReadParameters(std::istream& /*stream*/) {
+          return true;
+      }
 
-  // write parameters
-  bool WriteParameters(std::ostream& /*stream*/) const {
-    return true;
-  }
+      // write parameters
+      bool WriteParameters(std::ostream& /*stream*/) const {
+          return true;
+      }
 
-  // Forward propagation
-  const OutputType* Propagate(
-      const TransformedFeatureType* transformed_features,
-      char* /*buffer*/) const {
-    return transformed_features + Offset;
-  }
+      // Forward propagation
+      const OutputType* Propagate(
+          const TransformedFeatureType* transformed_features,
+          char* /*buffer*/) const {
 
- private:
-};
+          return transformed_features + Offset;
+      }
+
+  private:
+  };
 
 }  // namespace Layers
 
diff --git a/src/nnue/layers/sum.h b/src/nnue/layers/sum.h
index 419ced89..c81f5850 100644
--- a/src/nnue/layers/sum.h
+++ b/src/nnue/layers/sum.h
@@ -1,159 +1,166 @@
-﻿// Definition of layer Sum of NNUE evaluation function
-
-#ifndef _NNUE_LAYERS_SUM_H_
+﻿#ifndef _NNUE_LAYERS_SUM_H_
 #define _NNUE_LAYERS_SUM_H_
 
-#include "../nnue_common.h"
+#include "nnue/nnue_common.h"
 
-namespace Eval {
+// Definition of layer Sum of NNUE evaluation function
+namespace Eval::NNUE::Layers {
 
-namespace NNUE {
+    // Layer that sums the output of multiple layers
+    template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
+    class Sum : public Sum<RemainingPreviousLayers...> {
+    private:
+        using Head = FirstPreviousLayer;
+        using Tail = Sum<RemainingPreviousLayers...>;
 
-namespace Layers {
+     public:
+        // Input/output type
+        using InputType = typename Head::OutputType;
 
-// Layer that sums the output of multiple layers
-template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
-class Sum : public Sum<RemainingPreviousLayers...> {
- private:
-  using Head = FirstPreviousLayer;
-  using Tail = Sum<RemainingPreviousLayers...>;
+        using OutputType = InputType;
 
- public:
-  // Input/output type
-  using InputType = typename Head::OutputType;
-  using OutputType = InputType;
-  static_assert(std::is_same<InputType, typename Tail::InputType>::value, "");
+        static_assert(std::is_same<InputType, typename Tail::InputType>::value, "");
 
-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions = Head::kOutputDimensions;
-  static constexpr IndexType kOutputDimensions = kInputDimensions;
-  static_assert(kInputDimensions == Tail::kInputDimensions ,"");
+        // number of input/output dimensions
+        static constexpr IndexType kInputDimensions = Head::kOutputDimensions;
 
-  // Size of forward propagation buffer used in this layer
-  static constexpr std::size_t kSelfBufferSize =
-      CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+        static constexpr IndexType kOutputDimensions = kInputDimensions;
 
-  // Size of the forward propagation buffer used from the input layer to this layer
-  static constexpr std::size_t kBufferSize =
-      std::max(Head::kBufferSize + kSelfBufferSize, Tail::kBufferSize);
+        static_assert(kInputDimensions == Tail::kInputDimensions ,"");
 
-  // Hash value embedded in the evaluation function file
-  static constexpr std::uint32_t GetHashValue() {
-    std::uint32_t hash_value = 0xBCE400B4u;
-    hash_value ^= Head::GetHashValue() >> 1;
-    hash_value ^= Head::GetHashValue() << 31;
-    hash_value ^= Tail::GetHashValue() >> 2;
-    hash_value ^= Tail::GetHashValue() << 30;
-    return hash_value;
-  }
+        // Size of forward propagation buffer used in this layer
+        static constexpr std::size_t kSelfBufferSize =
+            CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
 
-  // A string that represents the structure from the input layer to this layer
-  static std::string GetStructureString() {
-    return "Sum[" +
-        std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
-  }
+        // Size of the forward propagation buffer used from the input layer to this layer
+        static constexpr std::size_t kBufferSize =
+            std::max(Head::kBufferSize + kSelfBufferSize, Tail::kBufferSize);
 
-  // read parameters
-  bool ReadParameters(std::istream& stream) {
-    if (!Tail::ReadParameters(stream)) return false;
-    return previous_layer_.ReadParameters(stream);
-  }
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t GetHashValue() {
+            std::uint32_t hash_value = 0xBCE400B4u;
+            hash_value ^= Head::GetHashValue() >> 1;
+            hash_value ^= Head::GetHashValue() << 31;
+            hash_value ^= Tail::GetHashValue() >> 2;
+            hash_value ^= Tail::GetHashValue() << 30;
+            return hash_value;
+        }
 
-  // write parameters
-  bool WriteParameters(std::ostream& stream) const {
-    if (!Tail::WriteParameters(stream)) return false;
-    return previous_layer_.WriteParameters(stream);
-  }
+        // A string that represents the structure from the input layer to this layer
+        static std::string GetStructureString() {
+            return "Sum[" +
+                std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
+        }
 
-  // forward propagation
-  const OutputType* Propagate(
-      const TransformedFeatureType* transformed_features, char* buffer) const {
-    Tail::Propagate(transformed_features, buffer);
-    const auto head_output = previous_layer_.Propagate(
-        transformed_features, buffer + kSelfBufferSize);
-    const auto output = reinterpret_cast<OutputType*>(buffer);
-    for (IndexType i = 0; i <kOutputDimensions; ++i) {
-      output[i] += head_output[i];
-    }
-    return output;
-  }
+        // read parameters
+        bool ReadParameters(std::istream& stream) {
+            if (!Tail::ReadParameters(stream))
+                return false;
 
- protected:
-  // A string that represents the list of layers to be summed
-  static std::string GetSummandsString() {
-    return Head::GetStructureString() + "," + Tail::GetSummandsString();
-  }
+            return previous_layer_.ReadParameters(stream);
+        }
 
-  // Make the learning class a friend
-  friend class Trainer<Sum>;
+        // write parameters
+        bool WriteParameters(std::ostream& stream) const {
+            if (!Tail::WriteParameters(stream))
+                return false;
 
-  // the layer immediately before this layer
-  FirstPreviousLayer previous_layer_;
-};
+            return previous_layer_.WriteParameters(stream);
+        }
 
-// Layer that sums the output of multiple layers (when there is one template argument)
-template <typename PreviousLayer>
-class Sum<PreviousLayer> {
- public:
-  // Input/output type
-  using InputType = typename PreviousLayer::OutputType;
-  using OutputType = InputType;
+        // forward propagation
+        const OutputType* Propagate(
+            const TransformedFeatureType* transformed_features, char* buffer) const {
 
-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions =
-      PreviousLayer::kOutputDimensions;
-  static constexpr IndexType kOutputDimensions = kInputDimensions;
+            Tail::Propagate(transformed_features, buffer);
 
-  // Size of the forward propagation buffer used from the input layer to this layer
-  static constexpr std::size_t kBufferSize = PreviousLayer::kBufferSize;
+            const auto head_output = previous_layer_.Propagate(
+                transformed_features, buffer + kSelfBufferSize);
 
-  // Hash value embedded in the evaluation function file
-  static constexpr std::uint32_t GetHashValue() {
-    std::uint32_t hash_value = 0xBCE400B4u;
-    hash_value ^= PreviousLayer::GetHashValue() >> 1;
-    hash_value ^= PreviousLayer::GetHashValue() << 31;
-    return hash_value;
-  }
+            const auto output = reinterpret_cast<OutputType*>(buffer);
 
-  // A string that represents the structure from the input layer to this layer
-  static std::string GetStructureString() {
-    return "Sum[" +
-        std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
-  }
+            for (IndexType i = 0; i <kOutputDimensions; ++i) {
+                output[i] += head_output[i];
+            }
 
-  // read parameters
-  bool ReadParameters(std::istream& stream) {
-    return previous_layer_.ReadParameters(stream);
-  }
+            return output;
+        }
 
-  // write parameters
-  bool WriteParameters(std::ostream& stream) const {
-    return previous_layer_.WriteParameters(stream);
-  }
+    protected:
+        // A string that represents the list of layers to be summed
+        static std::string GetSummandsString() {
+            return Head::GetStructureString() + "," + Tail::GetSummandsString();
+        }
 
-  // forward propagation
-  const OutputType* Propagate(
-      const TransformedFeatureType* transformed_features, char* buffer) const {
-    return previous_layer_.Propagate(transformed_features, buffer);
-  }
+        // Make the learning class a friend
+        friend class Trainer<Sum>;
 
- protected:
-  // A string that represents the list of layers to be summed
-  static std::string GetSummandsString() {
-    return PreviousLayer::GetStructureString();
-  }
+        // the layer immediately before this layer
+        FirstPreviousLayer previous_layer_;
+    };
 
-  // Make the learning class a friend
-  friend class Trainer<Sum>;
+    // Layer that sums the output of multiple layers (when there is one template argument)
+    template <typename PreviousLayer>
+    class Sum<PreviousLayer> {
+    public:
+        // Input/output type
+        using InputType = typename PreviousLayer::OutputType;
 
-  // the layer immediately before this layer
-  PreviousLayer previous_layer_;
-};
+        using OutputType = InputType;
 
-}  // namespace Layers
+        // number of input/output dimensions
+        static constexpr IndexType kInputDimensions =
+            PreviousLayer::kOutputDimensions;
 
-}  // namespace NNUE
+        static constexpr IndexType kOutputDimensions = kInputDimensions;
 
-}  // namespace Eval
+        // Size of the forward propagation buffer used from the input layer to this layer
+        static constexpr std::size_t kBufferSize = PreviousLayer::kBufferSize;
+
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t GetHashValue() {
+            std::uint32_t hash_value = 0xBCE400B4u;
+            hash_value ^= PreviousLayer::GetHashValue() >> 1;
+            hash_value ^= PreviousLayer::GetHashValue() << 31;
+            return hash_value;
+        }
+
+        // A string that represents the structure from the input layer to this layer
+        static std::string GetStructureString() {
+            return "Sum[" +
+                std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
+        }
+
+        // read parameters
+        bool ReadParameters(std::istream& stream) {
+            return previous_layer_.ReadParameters(stream);
+        }
+
+        // write parameters
+        bool WriteParameters(std::ostream& stream) const {
+            return previous_layer_.WriteParameters(stream);
+        }
+
+        // forward propagation
+        const OutputType* Propagate(
+            const TransformedFeatureType* transformed_features, char* buffer) const {
+
+            return previous_layer_.Propagate(transformed_features, buffer);
+        }
+
+    protected:
+        // A string that represents the list of layers to be summed
+        static std::string GetSummandsString() {
+            return PreviousLayer::GetStructureString();
+        }
+
+        // Make the learning class a friend
+        friend class Trainer<Sum>;
+
+        // the layer immediately before this layer
+        PreviousLayer previous_layer_;
+    };
+
+}  // namespace Eval::NNUE::Layers
 
 #endif

From ea8eb415de3bea3f6943e9257b747721741e4197 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 21:00:05 +0200
Subject: [PATCH 339/583] Cleanup trainer features.

---
 src/nnue/trainer/features/factorizer.h        | 181 +++++++++---------
 .../trainer/features/factorizer_feature_set.h | 171 +++++++++--------
 .../trainer/features/factorizer_half_kp.h     | 155 ++++++++-------
 3 files changed, 256 insertions(+), 251 deletions(-)

diff --git a/src/nnue/trainer/features/factorizer.h b/src/nnue/trainer/features/factorizer.h
index 43950de2..784fe047 100644
--- a/src/nnue/trainer/features/factorizer.h
+++ b/src/nnue/trainer/features/factorizer.h
@@ -1,106 +1,109 @@
-﻿// NNUE evaluation function feature conversion class template
-
-#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_H_
+﻿#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_H_
 #define _NNUE_TRAINER_FEATURES_FACTORIZER_H_
 
-#include "../../nnue_common.h"
-#include "../trainer.h"
+#include "nnue/nnue_common.h"
 
-namespace Eval {
+#include "nnue/trainer/trainer.h"
 
-namespace NNUE {
+// NNUE evaluation function feature conversion class template
+namespace Eval::NNUE::Features {
 
-namespace Features {
+    // Class template that converts input features into learning features
+    // By default, the learning feature is the same as the original input feature, and specialized as necessary
+    template <typename FeatureType>
+    class Factorizer {
+    public:
+        // Get the dimensionality of the learning feature
+        static constexpr IndexType GetDimensions() {
+            return FeatureType::kDimensions;
+        }
 
-// Class template that converts input features into learning features
-// By default, the learning feature is the same as the original input feature, and specialized as necessary
-template <typename FeatureType>
-class Factorizer {
- public:
-  // Get the dimensionality of the learning feature
-  static constexpr IndexType GetDimensions() {
-    return FeatureType::kDimensions;
-  }
+        // Get index of learning feature and scale of learning rate
+        static void AppendTrainingFeatures(
+            IndexType base_index, std::vector<TrainingFeature>* training_features) {
 
-  // Get index of learning feature and scale of learning rate
-  static void AppendTrainingFeatures(
-      IndexType base_index, std::vector<TrainingFeature>* training_features) {
-    assert(base_index <FeatureType::kDimensions);
-    training_features->emplace_back(base_index);
-  }
-};
+            assert(base_index <FeatureType::kDimensions);
+            training_features->emplace_back(base_index);
+        }
+    };
 
-// Learning feature information
-struct FeatureProperties {
-  bool active;
-  IndexType dimensions;
-};
+    // Learning feature information
+    struct FeatureProperties {
+        bool active;
+        IndexType dimensions;
+    };
 
-// Add the original input features to the learning features
-template <typename FeatureType>
-IndexType AppendBaseFeature(
-    FeatureProperties properties, IndexType base_index,
-    std::vector<TrainingFeature>* training_features) {
-  assert(properties.dimensions == FeatureType::kDimensions);
-  assert(base_index < FeatureType::kDimensions);
-  training_features->emplace_back(base_index);
-  return properties.dimensions;
-}
+    // Add the original input features to the learning features
+    template <typename FeatureType>
+    IndexType AppendBaseFeature(
+        FeatureProperties properties, IndexType base_index,
+        std::vector<TrainingFeature>* training_features) {
 
-// If the learning rate scale is not 0, inherit other types of learning features
-template <typename FeatureType>
-IndexType InheritFeaturesIfRequired(
-    IndexType index_offset, FeatureProperties properties, IndexType base_index,
-    std::vector<TrainingFeature>* training_features) {
-  if (!properties.active) {
-    return 0;
-  }
-  assert(properties.dimensions == Factorizer<FeatureType>::GetDimensions());
-  assert(base_index < FeatureType::kDimensions);
-  const auto start = training_features->size();
-  Factorizer<FeatureType>::AppendTrainingFeatures(
-      base_index, training_features);
-  for (auto i = start; i < training_features->size(); ++i) {
-    auto& feature = (*training_features)[i];
-    assert(feature.GetIndex() < Factorizer<FeatureType>::GetDimensions());
-    feature.ShiftIndex(index_offset);
-  }
-  return properties.dimensions;
-}
-
-// Return the index difference as needed, without adding learning features
-// Call instead of InheritFeaturesIfRequired() if there are no corresponding features
-IndexType SkipFeatures(FeatureProperties properties) {
-  if (!properties.active) {
-    return 0;
-  }
-  return properties.dimensions;
-}
-
-// Get the dimensionality of the learning feature
-template <std::size_t N>
-constexpr IndexType GetActiveDimensions(
-    const FeatureProperties (&properties)[N]) {
-  static_assert(N > 0, "");
-  IndexType dimensions = properties[0].dimensions;
-  for (std::size_t i = 1; i < N; ++i) {
-    if (properties[i].active) {
-      dimensions += properties[i].dimensions;
+        assert(properties.dimensions == FeatureType::kDimensions);
+        assert(base_index < FeatureType::kDimensions);
+        training_features->emplace_back(base_index);
+        return properties.dimensions;
     }
-  }
-  return dimensions;
-}
 
-// get the number of elements in the array
-template <typename T, std::size_t N>
-constexpr std::size_t GetArrayLength(const T (&/*array*/)[N]) {
-  return N;
-}
+    // If the learning rate scale is not 0, inherit other types of learning features
+    template <typename FeatureType>
+    IndexType InheritFeaturesIfRequired(
+        IndexType index_offset, FeatureProperties properties, IndexType base_index,
+        std::vector<TrainingFeature>* training_features) {
 
-}  // namespace Features
+        if (!properties.active) {
+            return 0;
+        }
 
-}  // namespace NNUE
+        assert(properties.dimensions == Factorizer<FeatureType>::GetDimensions());
+        assert(base_index < FeatureType::kDimensions);
 
-}  // namespace Eval
+        const auto start = training_features->size();
+        Factorizer<FeatureType>::AppendTrainingFeatures(
+            base_index, training_features);
+
+        for (auto i = start; i < training_features->size(); ++i) {
+            auto& feature = (*training_features)[i];
+            assert(feature.GetIndex() < Factorizer<FeatureType>::GetDimensions());
+            feature.ShiftIndex(index_offset);
+        }
+
+        return properties.dimensions;
+    }
+
+    // Return the index difference as needed, without adding learning features
+    // Call instead of InheritFeaturesIfRequired() if there are no corresponding features
+    IndexType SkipFeatures(FeatureProperties properties) {
+        if (!properties.active)
+            return 0;
+
+        return properties.dimensions;
+    }
+
+    // Get the dimensionality of the learning feature
+    template <std::size_t N>
+    constexpr IndexType GetActiveDimensions(
+        const FeatureProperties (&properties)[N]) {
+
+        static_assert(N > 0, "");
+
+        IndexType dimensions = properties[0].dimensions;
+
+        for (std::size_t i = 1; i < N; ++i) {
+            if (properties[i].active) {
+                dimensions += properties[i].dimensions;
+            }
+        }
+
+        return dimensions;
+    }
+
+    // get the number of elements in the array
+    template <typename T, std::size_t N>
+    constexpr std::size_t GetArrayLength(const T (&/*array*/)[N]) {
+        return N;
+    }
+
+}  // namespace Eval::NNUE::Features
 
 #endif
diff --git a/src/nnue/trainer/features/factorizer_feature_set.h b/src/nnue/trainer/features/factorizer_feature_set.h
index caf6608b..d272a453 100644
--- a/src/nnue/trainer/features/factorizer_feature_set.h
+++ b/src/nnue/trainer/features/factorizer_feature_set.h
@@ -1,100 +1,105 @@
-﻿// Specialization for feature set of feature conversion class template of NNUE evaluation function
-
-#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
+﻿#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
 #define _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
 
-#include "../../features/feature_set.h"
 #include "factorizer.h"
 
-namespace Eval {
+#include "nnue/features/feature_set.h"
 
-namespace NNUE {
+// Specialization for feature set of feature conversion class template of NNUE evaluation function
+namespace Eval::NNUE::Features {
 
-namespace Features {
+    // Class template that converts input features into learning features
+    // Specialization for FeatureSet
+    template <typename FirstFeatureType, typename... RemainingFeatureTypes>
+    class Factorizer<FeatureSet<FirstFeatureType, RemainingFeatureTypes...>> {
+    private:
+        using Head = Factorizer<FeatureSet<FirstFeatureType>>;
+        using Tail = Factorizer<FeatureSet<RemainingFeatureTypes...>>;
 
-// Class template that converts input features into learning features
-// Specialization for FeatureSet
-template <typename FirstFeatureType, typename... RemainingFeatureTypes>
-class Factorizer<FeatureSet<FirstFeatureType, RemainingFeatureTypes...>> {
- private:
-  using Head = Factorizer<FeatureSet<FirstFeatureType>>;
-  using Tail = Factorizer<FeatureSet<RemainingFeatureTypes...>>;
+    public:
+        // number of dimensions of original input features
+        static constexpr IndexType kBaseDimensions =
+            FeatureSet<FirstFeatureType, RemainingFeatureTypes...>::kDimensions;
 
- public:
-  // number of dimensions of original input features
-  static constexpr IndexType kBaseDimensions =
-      FeatureSet<FirstFeatureType, RemainingFeatureTypes...>::kDimensions;
-
-  // Get the dimensionality of the learning feature
-  static constexpr IndexType GetDimensions() {
-    return Head::GetDimensions() + Tail::GetDimensions();
-  }
-
-  // Get index of learning feature and scale of learning rate
-  static void AppendTrainingFeatures(
-      IndexType base_index, std::vector<TrainingFeature>* training_features,
-      IndexType base_dimensions = kBaseDimensions) {
-    assert(base_index < kBaseDimensions);
-    constexpr auto boundary = FeatureSet<RemainingFeatureTypes...>::kDimensions;
-    if (base_index < boundary) {
-      Tail::AppendTrainingFeatures(
-          base_index, training_features, base_dimensions);
-    } else {
-      const auto start = training_features->size();
-      Head::AppendTrainingFeatures(
-          base_index - boundary, training_features, base_dimensions);
-      for (auto i = start; i < training_features->size(); ++i) {
-        auto& feature = (*training_features)[i];
-        const auto index = feature.GetIndex();
-        assert(index < Head::GetDimensions() ||
-                   (index >= base_dimensions &&
-                    index < base_dimensions +
-                            Head::GetDimensions() - Head::kBaseDimensions));
-        if (index < Head::kBaseDimensions) {
-          feature.ShiftIndex(Tail::kBaseDimensions);
-        } else {
-          feature.ShiftIndex(Tail::GetDimensions() - Tail::kBaseDimensions);
+        // Get the dimensionality of the learning feature
+        static constexpr IndexType GetDimensions() {
+            return Head::GetDimensions() + Tail::GetDimensions();
         }
-      }
-    }
-  }
-};
 
-// Class template that converts input features into learning features
-// Specialization when FeatureSet has one template argument
-template <typename FeatureType>
-class Factorizer<FeatureSet<FeatureType>> {
-public:
-  // number of dimensions of original input features
-  static constexpr IndexType kBaseDimensions = FeatureType::kDimensions;
+        // Get index of learning feature and scale of learning rate
+        static void AppendTrainingFeatures(
+            IndexType base_index, std::vector<TrainingFeature>* training_features,
+            IndexType base_dimensions = kBaseDimensions) {
 
-  // Get the dimensionality of the learning feature
-  static constexpr IndexType GetDimensions() {
-    return Factorizer<FeatureType>::GetDimensions();
-  }
+            assert(base_index < kBaseDimensions);
 
-  // Get index of learning feature and scale of learning rate
-  static void AppendTrainingFeatures(
-      IndexType base_index, std::vector<TrainingFeature>* training_features,
-      IndexType base_dimensions = kBaseDimensions) {
-    assert(base_index < kBaseDimensions);
-    const auto start = training_features->size();
-    Factorizer<FeatureType>::AppendTrainingFeatures(
-        base_index, training_features);
-    for (auto i = start; i < training_features->size(); ++i) {
-      auto& feature = (*training_features)[i];
-      assert(feature.GetIndex() < Factorizer<FeatureType>::GetDimensions());
-      if (feature.GetIndex() >= kBaseDimensions) {
-        feature.ShiftIndex(base_dimensions - kBaseDimensions);
-      }
-    }
-  }
-};
+            constexpr auto boundary = FeatureSet<RemainingFeatureTypes...>::kDimensions;
 
-}  // namespace Features
+            if (base_index < boundary) {
+                Tail::AppendTrainingFeatures(
+                    base_index, training_features, base_dimensions);
+            }
+            else {
+                const auto start = training_features->size();
 
-}  // namespace NNUE
+                Head::AppendTrainingFeatures(
+                    base_index - boundary, training_features, base_dimensions);
 
-}  // namespace Eval
+                for (auto i = start; i < training_features->size(); ++i) {
+                    auto& feature = (*training_features)[i];
+                    const auto index = feature.GetIndex();
+
+                    assert(index < Head::GetDimensions() ||
+                               (index >= base_dimensions &&
+                                index < base_dimensions +
+                                        Head::GetDimensions() - Head::kBaseDimensions));
+
+                    if (index < Head::kBaseDimensions) {
+                        feature.ShiftIndex(Tail::kBaseDimensions);
+                    }
+                    else {
+                        feature.ShiftIndex(Tail::GetDimensions() - Tail::kBaseDimensions);
+                    }
+                }
+            }
+        }
+    };
+
+    // Class template that converts input features into learning features
+    // Specialization when FeatureSet has one template argument
+    template <typename FeatureType>
+    class Factorizer<FeatureSet<FeatureType>> {
+    public:
+        // number of dimensions of original input features
+        static constexpr IndexType kBaseDimensions = FeatureType::kDimensions;
+
+        // Get the dimensionality of the learning feature
+        static constexpr IndexType GetDimensions() {
+            return Factorizer<FeatureType>::GetDimensions();
+        }
+
+        // Get index of learning feature and scale of learning rate
+        static void AppendTrainingFeatures(
+            IndexType base_index, std::vector<TrainingFeature>* training_features,
+            IndexType base_dimensions = kBaseDimensions) {
+
+            assert(base_index < kBaseDimensions);
+
+            const auto start = training_features->size();
+
+            Factorizer<FeatureType>::AppendTrainingFeatures(
+                base_index, training_features);
+
+            for (auto i = start; i < training_features->size(); ++i) {
+                auto& feature = (*training_features)[i];
+                assert(feature.GetIndex() < Factorizer<FeatureType>::GetDimensions());
+                if (feature.GetIndex() >= kBaseDimensions) {
+                    feature.ShiftIndex(base_dimensions - kBaseDimensions);
+                }
+            }
+        }
+    };
+
+}  // namespace Eval::NNUE::Features
 
 #endif
diff --git a/src/nnue/trainer/features/factorizer_half_kp.h b/src/nnue/trainer/features/factorizer_half_kp.h
index 70a6acca..1ed5bdd3 100644
--- a/src/nnue/trainer/features/factorizer_half_kp.h
+++ b/src/nnue/trainer/features/factorizer_half_kp.h
@@ -1,99 +1,96 @@
-﻿// Specialization of NNUE evaluation function feature conversion class template for HalfKP
-
-#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
+﻿#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
 #define _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
 
-#include "../../features/half_kp.h"
-#include "../../features/p.h"
-#include "../../features/half_relative_kp.h"
 #include "factorizer.h"
 
-namespace Eval {
+#include "nnue/features/half_kp.h"
+#include "nnue/features/p.h"
+#include "nnue/features/half_relative_kp.h"
 
-namespace NNUE {
+// Specialization of NNUE evaluation function feature conversion class template for HalfKP
+namespace Eval::NNUE::Features {
 
-namespace Features {
+    // Class template that converts input features into learning features
+    // Specialization for HalfKP
+    template <Side AssociatedKing>
+    class Factorizer<HalfKP<AssociatedKing>> {
+    private:
+        using FeatureType = HalfKP<AssociatedKing>;
 
-// Class template that converts input features into learning features
-// Specialization for HalfKP
-template <Side AssociatedKing>
-class Factorizer<HalfKP<AssociatedKing>> {
- private:
-  using FeatureType = HalfKP<AssociatedKing>;
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions =
+            FeatureType::kMaxActiveDimensions;
 
-  // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-  static constexpr IndexType kMaxActiveDimensions =
-      FeatureType::kMaxActiveDimensions;
+        // Type of learning feature
+        enum TrainingFeatureType {
+            kFeaturesHalfKP,
+            kFeaturesHalfK,
+            kFeaturesP,
+            kFeaturesHalfRelativeKP,
+            kNumTrainingFeatureTypes,
+        };
 
-  // Type of learning feature
-  enum TrainingFeatureType {
-    kFeaturesHalfKP,
-    kFeaturesHalfK,
-    kFeaturesP,
-    kFeaturesHalfRelativeKP,
-    kNumTrainingFeatureTypes,
-  };
+        // Learning feature information
+        static constexpr FeatureProperties kProperties[] = {
+            // kFeaturesHalfKP
+            {true, FeatureType::kDimensions},
+            // kFeaturesHalfK
+            {true, SQUARE_NB},
+            // kFeaturesP
+            {true, Factorizer<P>::GetDimensions()},
+            // kFeaturesHalfRelativeKP
+            {true, Factorizer<HalfRelativeKP<AssociatedKing>>::GetDimensions()},
+        };
 
-  // Learning feature information
-  static constexpr FeatureProperties kProperties[] = {
-    // kFeaturesHalfKP
-    {true, FeatureType::kDimensions},
-    // kFeaturesHalfK
-    {true, SQUARE_NB},
-    // kFeaturesP
-    {true, Factorizer<P>::GetDimensions()},
-    // kFeaturesHalfRelativeKP
-    {true, Factorizer<HalfRelativeKP<AssociatedKing>>::GetDimensions()},
-  };
-  static_assert(GetArrayLength(kProperties) == kNumTrainingFeatureTypes, "");
+        static_assert(GetArrayLength(kProperties) == kNumTrainingFeatureTypes, "");
 
- public:
-  // Get the dimensionality of the learning feature
-  static constexpr IndexType GetDimensions() {
-    return GetActiveDimensions(kProperties);
-  }
+    public:
+        // Get the dimensionality of the learning feature
+        static constexpr IndexType GetDimensions() {
+            return GetActiveDimensions(kProperties);
+        }
 
-  // Get index of learning feature and scale of learning rate
-  static void AppendTrainingFeatures(
-      IndexType base_index, std::vector<TrainingFeature>* training_features) {
-    // kFeaturesHalfKP
-    IndexType index_offset = AppendBaseFeature<FeatureType>(
-        kProperties[kFeaturesHalfKP], base_index, training_features);
+        // Get index of learning feature and scale of learning rate
+        static void AppendTrainingFeatures(
+            IndexType base_index, std::vector<TrainingFeature>* training_features) {
 
-    const auto sq_k = static_cast<Square>(base_index / PS_END);
-    const auto p = static_cast<IndexType>(base_index % PS_END);
-    // kFeaturesHalfK
-    {
-      const auto& properties = kProperties[kFeaturesHalfK];
-      if (properties.active) {
-        training_features->emplace_back(index_offset + sq_k);
-        index_offset += properties.dimensions;
-      }
-    }
-    // kFeaturesP
-    index_offset += InheritFeaturesIfRequired<P>(
-        index_offset, kProperties[kFeaturesP], p, training_features);
-    // kFeaturesHalfRelativeKP
-    if (p >= PS_W_PAWN) {
-      index_offset += InheritFeaturesIfRequired<HalfRelativeKP<AssociatedKing>>(
-          index_offset, kProperties[kFeaturesHalfRelativeKP],
-          HalfRelativeKP<AssociatedKing>::MakeIndex(sq_k, p),
-          training_features);
-    } else {
-      index_offset += SkipFeatures(kProperties[kFeaturesHalfRelativeKP]);
-    }
+            // kFeaturesHalfKP
+            IndexType index_offset = AppendBaseFeature<FeatureType>(
+                kProperties[kFeaturesHalfKP], base_index, training_features);
 
-    assert(index_offset == GetDimensions());
-  }
-};
+            const auto sq_k = static_cast<Square>(base_index / PS_END);
+            const auto p = static_cast<IndexType>(base_index % PS_END);
 
-template <Side AssociatedKing>
-constexpr FeatureProperties Factorizer<HalfKP<AssociatedKing>>::kProperties[];
+            // kFeaturesHalfK
+            {
+                const auto& properties = kProperties[kFeaturesHalfK];
+                if (properties.active) {
+                    training_features->emplace_back(index_offset + sq_k);
+                    index_offset += properties.dimensions;
+                }
+            }
 
-}  // namespace Features
+            // kFeaturesP
+            index_offset += InheritFeaturesIfRequired<P>(
+                index_offset, kProperties[kFeaturesP], p, training_features);
+            // kFeaturesHalfRelativeKP
+            if (p >= PS_W_PAWN) {
+                index_offset += InheritFeaturesIfRequired<HalfRelativeKP<AssociatedKing>>(
+                    index_offset, kProperties[kFeaturesHalfRelativeKP],
+                    HalfRelativeKP<AssociatedKing>::MakeIndex(sq_k, p),
+                    training_features);
+            }
+            else {
+                index_offset += SkipFeatures(kProperties[kFeaturesHalfRelativeKP]);
+            }
 
-}  // namespace NNUE
+            assert(index_offset == GetDimensions());
+        }
+    };
 
-}  // namespace Eval
+    template <Side AssociatedKing>
+    constexpr FeatureProperties Factorizer<HalfKP<AssociatedKing>>::kProperties[];
+
+}  // namespace Eval::NNUE::Features
 
 #endif

From c286f9cd7d875aa4bf61ae12998e68f2e8fcb1a4 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 21:26:03 +0200
Subject: [PATCH 340/583] Cleanup trainer.

---
 src/nnue/trainer/trainer.h                    | 205 +++---
 src/nnue/trainer/trainer_affine_transform.h   | 578 +++++++--------
 src/nnue/trainer/trainer_clipped_relu.h       | 228 +++---
 .../trainer/trainer_feature_transformer.h     | 661 +++++++++---------
 src/nnue/trainer/trainer_input_slice.h        | 432 ++++++------
 src/nnue/trainer/trainer_sum.h                | 312 +++++----
 6 files changed, 1263 insertions(+), 1153 deletions(-)

diff --git a/src/nnue/trainer/trainer.h b/src/nnue/trainer/trainer.h
index 659863ad..7d9b66ee 100644
--- a/src/nnue/trainer/trainer.h
+++ b/src/nnue/trainer/trainer.h
@@ -1,121 +1,134 @@
-﻿// Common header of class template for learning NNUE evaluation function
-
-#ifndef _NNUE_TRAINER_H_
+﻿#ifndef _NNUE_TRAINER_H_
 #define _NNUE_TRAINER_H_
 
-#include "../nnue_common.h"
-#include "../features/index_list.h"
+#include "nnue/nnue_common.h"
+#include "nnue/features/index_list.h"
 
 #include <sstream>
+
 #if defined(USE_BLAS)
 static_assert(std::is_same<LearnFloatType, float>::value, "");
 #include <cblas.h>
 #endif
 
-namespace Eval {
+// Common header of class template for learning NNUE evaluation function
+namespace Eval::NNUE {
 
-namespace NNUE {
+    // Ponanza constant used in the relation between evaluation value and winning percentage
+    constexpr double kPonanzaConstant = 600.0;
 
-// Ponanza constant used in the relation between evaluation value and winning percentage
-constexpr double kPonanzaConstant = 600.0;
+    // Class that represents one index of learning feature
+    class TrainingFeature {
+        using StorageType = std::uint32_t;
+        static_assert(std::is_unsigned<StorageType>::value, "");
 
-// Class that represents one index of learning feature
-class TrainingFeature {
-  using StorageType = std::uint32_t;
-  static_assert(std::is_unsigned<StorageType>::value, "");
+    public:
+        static constexpr std::uint32_t kIndexBits = 24;
 
- public:
-  static constexpr std::uint32_t kIndexBits = 24;
-  static_assert(kIndexBits < std::numeric_limits<StorageType>::digits, "");
-  static constexpr std::uint32_t kCountBits =
-      std::numeric_limits<StorageType>::digits - kIndexBits;
+        static_assert(kIndexBits < std::numeric_limits<StorageType>::digits, "");
 
-  explicit TrainingFeature(IndexType index) :
-      index_and_count_((index << kCountBits) | 1) {
-    assert(index < (1 << kIndexBits));
-  }
-  TrainingFeature& operator+=(const TrainingFeature& other) {
-    assert(other.GetIndex() == GetIndex());
-    assert(other.GetCount() + GetCount() < (1 << kCountBits));
-    index_and_count_ += other.GetCount();
-    return *this;
-  }
-  IndexType GetIndex() const {
-    return static_cast<IndexType>(index_and_count_ >> kCountBits);
-  }
-  void ShiftIndex(IndexType offset) {
-    assert(GetIndex() + offset < (1 << kIndexBits));
-    index_and_count_ += offset << kCountBits;
-  }
-  IndexType GetCount() const {
-    return static_cast<IndexType>(index_and_count_ & ((1 << kCountBits) - 1));
-  }
-  bool operator<(const TrainingFeature& other) const {
-    return index_and_count_ < other.index_and_count_;
-  }
+        static constexpr std::uint32_t kCountBits =
+            std::numeric_limits<StorageType>::digits - kIndexBits;
 
- private:
-  StorageType index_and_count_;
-};
+        explicit TrainingFeature(IndexType index) :
+            index_and_count_((index << kCountBits) | 1) {
 
-// Structure that represents one sample of training data
-struct Example {
-  std::vector<TrainingFeature> training_features[2];
-  Learner::PackedSfenValue psv;
-  int sign;
-  double weight;
-};
+            assert(index < (1 << kIndexBits));
+        }
 
-// Message used for setting hyperparameters
-struct Message {
-  Message(const std::string& message_name, const std::string& message_value = ""):
-      name(message_name), value(message_value), num_peekers(0), num_receivers(0) {}
-  const std::string name;
-  const std::string value;
-  std::uint32_t num_peekers;
-  std::uint32_t num_receivers;
-};
+        TrainingFeature& operator+=(const TrainingFeature& other) {
+            assert(other.GetIndex() == GetIndex());
+            assert(other.GetCount() + GetCount() < (1 << kCountBits));
+            index_and_count_ += other.GetCount();
+            return *this;
+        }
 
-// determine whether to accept the message
-bool ReceiveMessage(const std::string& name, Message* message) {
-  const auto subscript = "[" + std::to_string(message->num_peekers) + "]";
-  if (message->name.substr(0, name.size() + 1) == name + "[") {
-    ++message->num_peekers;
-  }
-  if (message->name == name || message->name == name + subscript) {
-    ++message->num_receivers;
-    return true;
-  }
-  return false;
-}
+        IndexType GetIndex() const {
+            return static_cast<IndexType>(index_and_count_ >> kCountBits);
+        }
 
-// split the string
-std::vector<std::string> Split(const std::string& input, char delimiter) {
-  std::istringstream stream(input);
-  std::string field;
-  std::vector<std::string> fields;
-  while (std::getline(stream, field, delimiter)) {
-    fields.push_back(field);
-  }
-  return fields;
-}
+        void ShiftIndex(IndexType offset) {
+            assert(GetIndex() + offset < (1 << kIndexBits));
+            index_and_count_ += offset << kCountBits;
+        }
 
-// round a floating point number to an integer
-template <typename IntType>
-IntType Round(double value) {
-  return static_cast<IntType>(std::floor(value + 0.5));
-}
+        IndexType GetCount() const {
+            return static_cast<IndexType>(index_and_count_ & ((1 << kCountBits) - 1));
+        }
 
-// make_shared with alignment
-template <typename T, typename... ArgumentTypes>
-std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments) {
-  const auto ptr = new(std_aligned_alloc(alignof(T), sizeof(T)))
-      T(std::forward<ArgumentTypes>(arguments)...);
-  return std::shared_ptr<T>(ptr, AlignedDeleter<T>());
-}
+        bool operator<(const TrainingFeature& other) const {
+            return index_and_count_ < other.index_and_count_;
+        }
 
-}  // namespace NNUE
+    private:
+        StorageType index_and_count_;
+    };
 
-}  // namespace Eval
+    // Structure that represents one sample of training data
+    struct Example {
+        std::vector<TrainingFeature> training_features[2];
+        Learner::PackedSfenValue psv;
+        int sign;
+        double weight;
+    };
+
+    // Message used for setting hyperparameters
+    struct Message {
+        Message(const std::string& message_name, const std::string& message_value = "") :
+            name(message_name), value(message_value), num_peekers(0), num_receivers(0)
+        {
+        }
+
+        const std::string name;
+        const std::string value;
+        std::uint32_t num_peekers;
+        std::uint32_t num_receivers;
+    };
+
+    // determine whether to accept the message
+    bool ReceiveMessage(const std::string& name, Message* message) {
+        const auto subscript = "[" + std::to_string(message->num_peekers) + "]";
+
+        if (message->name.substr(0, name.size() + 1) == name + "[") {
+            ++message->num_peekers;
+        }
+
+        if (message->name == name || message->name == name + subscript) {
+            ++message->num_receivers;
+            return true;
+        }
+
+        return false;
+    }
+
+    // split the string
+    std::vector<std::string> Split(const std::string& input, char delimiter) {
+        std::istringstream stream(input);
+        std::string field;
+        std::vector<std::string> fields;
+
+        while (std::getline(stream, field, delimiter)) {
+            fields.push_back(field);
+        }
+
+        return fields;
+    }
+
+    // round a floating point number to an integer
+    template <typename IntType>
+    IntType Round(double value) {
+        return static_cast<IntType>(std::floor(value + 0.5));
+    }
+
+    // make_shared with alignment
+    template <typename T, typename... ArgumentTypes>
+    std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments) {
+        const auto ptr = new(std_aligned_alloc(alignof(T), sizeof(T)))
+            T(std::forward<ArgumentTypes>(arguments)...);
+
+        return std::shared_ptr<T>(ptr, AlignedDeleter<T>());
+    }
+
+}  // namespace Eval::NNUE
 
 #endif
diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index 415b7dc8..dd70b8fb 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -1,297 +1,329 @@
-﻿// Specialization of NNUE evaluation function learning class template for AffineTransform
-
-#ifndef _NNUE_TRAINER_AFFINE_TRANSFORM_H_
+﻿#ifndef _NNUE_TRAINER_AFFINE_TRANSFORM_H_
 #define _NNUE_TRAINER_AFFINE_TRANSFORM_H_
 
-#include "../../learn/learn.h"
-#include "../layers/affine_transform.h"
 #include "trainer.h"
 
+#include "learn/learn.h"
+
+#include "nnue/layers/affine_transform.h"
+
 #include <random>
 
-namespace Eval {
+// Specialization of NNUE evaluation function learning class template for AffineTransform
+namespace Eval::NNUE {
 
-namespace NNUE {
+    // Learning: Affine transformation layer
+    template <typename PreviousLayer, IndexType OutputDimensions>
+    class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
+    private:
+        // Type of layer to learn
+        using LayerType = Layers::AffineTransform<PreviousLayer, OutputDimensions>;
 
-// Learning: Affine transformation layer
-template <typename PreviousLayer, IndexType OutputDimensions>
-class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
- private:
-  // Type of layer to learn
-  using LayerType = Layers::AffineTransform<PreviousLayer, OutputDimensions>;
+    public:
+        // factory function
+        static std::shared_ptr<Trainer> Create(
+            LayerType* target_layer, FeatureTransformer* ft) {
 
- public:
-  // factory function
-  static std::shared_ptr<Trainer> Create(
-      LayerType* target_layer, FeatureTransformer* ft) {
-    return std::shared_ptr<Trainer>(
-        new Trainer(target_layer, ft));
-  }
-
-  // Set options such as hyperparameters
-  void SendMessage(Message* message) {
-    previous_layer_trainer_->SendMessage(message);
-    if (ReceiveMessage("momentum", message)) {
-      momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
-    }
-    if (ReceiveMessage("learning_rate_scale", message)) {
-      learning_rate_scale_ =
-          static_cast<LearnFloatType>(std::stod(message->value));
-    }
-    if (ReceiveMessage("reset", message)) {
-      DequantizeParameters();
-    }
-    if (ReceiveMessage("quantize_parameters", message)) {
-      QuantizeParameters();
-    }
-  }
-
-  // Initialize the parameters with random numbers
-  template <typename RNG>
-  void Initialize(RNG& rng) {
-    previous_layer_trainer_->Initialize(rng);
-    if (kIsOutputLayer) {
-      // Initialize output layer with 0
-      std::fill(std::begin(biases_), std::end(biases_),
-                static_cast<LearnFloatType>(0.0));
-      std::fill(std::begin(weights_), std::end(weights_),
-                static_cast<LearnFloatType>(0.0));
-    } else {
-      // Assuming that the input distribution is unit-mean 0.5, equal variance,
-      // Initialize the output distribution so that each unit has a mean of 0.5 and the same variance as the input
-      const double kSigma = 1.0 / std::sqrt(kInputDimensions);
-      auto distribution = std::normal_distribution<double>(0.0, kSigma);
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        double sum = 0.0;
-        for (IndexType j = 0; j < kInputDimensions; ++j) {
-          const auto weight = static_cast<LearnFloatType>(distribution(rng));
-          weights_[kInputDimensions * i + j] = weight;
-          sum += weight;
+            return std::shared_ptr<Trainer>(
+                new Trainer(target_layer, ft));
         }
-        biases_[i] = static_cast<LearnFloatType>(0.5 - 0.5 * sum);
-      }
-    }
-    QuantizeParameters();
-  }
 
-  // forward propagation
-  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
-    if (output_.size() < kOutputDimensions * batch.size()) {
-      output_.resize(kOutputDimensions * batch.size());
-      gradients_.resize(kInputDimensions * batch.size());
-    }
-    batch_size_ = static_cast<IndexType>(batch.size());
-    batch_input_ = previous_layer_trainer_->Propagate(batch);
+        // Set options such as hyperparameters
+        void SendMessage(Message* message) {
+            previous_layer_trainer_->SendMessage(message);
+
+            if (ReceiveMessage("momentum", message)) {
+                momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
+            }
+
+            if (ReceiveMessage("learning_rate_scale", message)) {
+                learning_rate_scale_ =
+                    static_cast<LearnFloatType>(std::stod(message->value));
+            }
+
+            if (ReceiveMessage("reset", message)) {
+                DequantizeParameters();
+            }
+
+            if (ReceiveMessage("quantize_parameters", message)) {
+                QuantizeParameters();
+            }
+        }
+
+        // Initialize the parameters with random numbers
+        template <typename RNG>
+        void Initialize(RNG& rng) {
+            previous_layer_trainer_->Initialize(rng);
+
+            if (kIsOutputLayer) {
+                // Initialize output layer with 0
+                std::fill(std::begin(biases_), std::end(biases_),
+                          static_cast<LearnFloatType>(0.0));
+                std::fill(std::begin(weights_), std::end(weights_),
+                          static_cast<LearnFloatType>(0.0));
+            }
+            else {
+                // Assuming that the input distribution is unit-mean 0.5, equal variance,
+                // Initialize the output distribution so that each unit has a mean of 0.5 and the same variance as the input
+                const double kSigma = 1.0 / std::sqrt(kInputDimensions);
+                auto distribution = std::normal_distribution<double>(0.0, kSigma);
+
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    double sum = 0.0;
+                      for (IndexType j = 0; j < kInputDimensions; ++j) {
+                          const auto weight = static_cast<LearnFloatType>(distribution(rng));
+                          weights_[kInputDimensions * i + j] = weight;
+                          sum += weight;
+                      }
+
+                    biases_[i] = static_cast<LearnFloatType>(0.5 - 0.5 * sum);
+                }
+            }
+
+            QuantizeParameters();
+        }
+
+        // forward propagation
+        const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+            if (output_.size() < kOutputDimensions * batch.size()) {
+                output_.resize(kOutputDimensions * batch.size());
+                gradients_.resize(kInputDimensions * batch.size());
+            }
+
+            batch_size_ = static_cast<IndexType>(batch.size());
+            batch_input_ = previous_layer_trainer_->Propagate(batch);
 #if defined(USE_BLAS)
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      cblas_scopy(kOutputDimensions, biases_, 1, &output_[batch_offset], 1);
-    }
-    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
-                kOutputDimensions, batch_size_, kInputDimensions, 1.0,
-                weights_, kInputDimensions,
-                batch_input_, kInputDimensions,
-                1.0, &output_[0], kOutputDimensions);
-#else
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType input_batch_offset = kInputDimensions * b;
-      const IndexType output_batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        double sum = biases_[i];
-        for (IndexType j = 0; j < kInputDimensions; ++j) {
-          const IndexType index = kInputDimensions * i + j;
-          sum += weights_[index] * batch_input_[input_batch_offset + j];
-        }
-        output_[output_batch_offset + i] = static_cast<LearnFloatType>(sum);
-      }
-    }
-#endif
-    return output_.data();
-  }
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                cblas_scopy(kOutputDimensions, biases_, 1, &output_[batch_offset], 1);
+            }
+
+            cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
+                        kOutputDimensions, batch_size_, kInputDimensions, 1.0,
+                        weights_, kInputDimensions,
+                        batch_input_, kInputDimensions,
+                        1.0, &output_[0], kOutputDimensions);
+#else
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType input_batch_offset = kInputDimensions * b;
+                const IndexType output_batch_offset = kOutputDimensions * b;
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    double sum = biases_[i];
+                    for (IndexType j = 0; j < kInputDimensions; ++j) {
+                        const IndexType index = kInputDimensions * i + j;
+                        sum += weights_[index] * batch_input_[input_batch_offset + j];
+                    }
+
+                    output_[output_batch_offset + i] = static_cast<LearnFloatType>(sum);
+                }
+            }
+
+#endif
+            return output_.data();
+        }
+
+        // backpropagation
+        void Backpropagate(const LearnFloatType* gradients,
+                           LearnFloatType learning_rate) {
+
+            const LearnFloatType local_learning_rate =
+                learning_rate * learning_rate_scale_;
 
-  // backpropagation
-  void Backpropagate(const LearnFloatType* gradients,
-                     LearnFloatType learning_rate) {
-    const LearnFloatType local_learning_rate =
-        learning_rate * learning_rate_scale_;
 #if defined(USE_BLAS)
-    // backpropagate
-    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
-                kInputDimensions, batch_size_, kOutputDimensions, 1.0,
-                weights_, kInputDimensions,
-                gradients, kOutputDimensions,
-                0.0, &gradients_[0], kInputDimensions);
-    // update
-    cblas_sscal(kOutputDimensions, momentum_, biases_diff_, 1);
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      cblas_saxpy(kOutputDimensions, 1.0,
-                  &gradients[batch_offset], 1, biases_diff_, 1);
-    }
-    cblas_saxpy(kOutputDimensions, -local_learning_rate,
-                biases_diff_, 1, biases_, 1);
-    cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
-                kOutputDimensions, kInputDimensions, batch_size_, 1.0,
-                gradients, kOutputDimensions,
-                batch_input_, kInputDimensions,
-                momentum_, weights_diff_, kInputDimensions);
-    cblas_saxpy(kOutputDimensions * kInputDimensions, -local_learning_rate,
-                weights_diff_, 1, weights_, 1);
+            // backpropagate
+            cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+                        kInputDimensions, batch_size_, kOutputDimensions, 1.0,
+                        weights_, kInputDimensions,
+                        gradients, kOutputDimensions,
+                        0.0, &gradients_[0], kInputDimensions);
+
+            // update
+            cblas_sscal(kOutputDimensions, momentum_, biases_diff_, 1);
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                cblas_saxpy(kOutputDimensions, 1.0,
+                          &gradients[batch_offset], 1, biases_diff_, 1);
+            }
+
+            cblas_saxpy(kOutputDimensions, -local_learning_rate,
+                        biases_diff_, 1, biases_, 1);
+
+            cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
+                        kOutputDimensions, kInputDimensions, batch_size_, 1.0,
+                        gradients, kOutputDimensions,
+                        batch_input_, kInputDimensions,
+                        momentum_, weights_diff_, kInputDimensions);
+            cblas_saxpy(kOutputDimensions * kInputDimensions, -local_learning_rate,
+                        weights_diff_, 1, weights_, 1);
+
 #else
-    // backpropagate
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType input_batch_offset = kInputDimensions * b;
-      const IndexType output_batch_offset = kOutputDimensions * b;
-      for (IndexType j = 0; j < kInputDimensions; ++j) {
-        double sum = 0.0;
-        for (IndexType i = 0; i < kOutputDimensions; ++i) {
-          const IndexType index = kInputDimensions * i + j;
-          sum += weights_[index] * gradients[output_batch_offset + i];
-        }
-        gradients_[input_batch_offset + j] = static_cast<LearnFloatType>(sum);
-      }
-    }
-    // update
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-      biases_diff_[i] *= momentum_;
-    }
-    for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-      weights_diff_[i] *= momentum_;
-    }
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType input_batch_offset = kInputDimensions * b;
-      const IndexType output_batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        biases_diff_[i] += gradients[output_batch_offset + i];
-      }
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        for (IndexType j = 0; j < kInputDimensions; ++j) {
-          const IndexType index = kInputDimensions * i + j;
-          weights_diff_[index] += gradients[output_batch_offset + i] *
-              batch_input_[input_batch_offset + j];
-        }
-      }
-    }
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-      biases_[i] -= local_learning_rate * biases_diff_[i];
-    }
-    for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-      weights_[i] -= local_learning_rate * weights_diff_[i];
-    }
-#endif
-    previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate);
-  }
+            // backpropagate
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType input_batch_offset = kInputDimensions * b;
+                const IndexType output_batch_offset = kOutputDimensions * b;
+                for (IndexType j = 0; j < kInputDimensions; ++j) {
+                    double sum = 0.0;
+                    for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                        const IndexType index = kInputDimensions * i + j;
+                        sum += weights_[index] * gradients[output_batch_offset + i];
+                    }
+                    gradients_[input_batch_offset + j] = static_cast<LearnFloatType>(sum);
+                }
+            }
 
- private:
-  // constructor
-  Trainer(LayerType* target_layer, FeatureTransformer* ft) :
-      batch_size_(0),
-      batch_input_(nullptr),
-      previous_layer_trainer_(Trainer<PreviousLayer>::Create(
-          &target_layer->previous_layer_, ft)),
-      target_layer_(target_layer),
-      biases_(),
-      weights_(),
-      biases_diff_(),
-      weights_diff_(),
-      momentum_(0.2),
-      learning_rate_scale_(1.0) {
-    DequantizeParameters();
-  }
+            // update
+            for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                biases_diff_[i] *= momentum_;
+            }
 
-  // Weight saturation and parameterization
-  void QuantizeParameters() {
-    for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-      weights_[i] = std::max(-kMaxWeightMagnitude,
-                             std::min(+kMaxWeightMagnitude, weights_[i]));
-    }
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-      target_layer_->biases_[i] =
-          Round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
-    }
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-      const auto offset = kInputDimensions * i;
-      const auto padded_offset = LayerType::kPaddedInputDimensions * i;
-      for (IndexType j = 0; j < kInputDimensions; ++j) {
-        target_layer_->weights_[padded_offset + j] =
-            Round<typename LayerType::WeightType>(
-                weights_[offset + j] * kWeightScale);
-      }
-    }
-  }
+            for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
+                weights_diff_[i] *= momentum_;
+            }
 
-  // read parameterized integer
-  void DequantizeParameters() {
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-      biases_[i] = static_cast<LearnFloatType>(
-          target_layer_->biases_[i] / kBiasScale);
-    }
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-      const auto offset = kInputDimensions * i;
-      const auto padded_offset = LayerType::kPaddedInputDimensions * i;
-      for (IndexType j = 0; j < kInputDimensions; ++j) {
-        weights_[offset + j] = static_cast<LearnFloatType>(
-            target_layer_->weights_[padded_offset + j] / kWeightScale);
-      }
-    }
-    std::fill(std::begin(biases_diff_), std::end(biases_diff_),
-              static_cast<LearnFloatType>(0.0));
-    std::fill(std::begin(weights_diff_), std::end(weights_diff_),
-              static_cast<LearnFloatType>(0.0));
-  }
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType input_batch_offset = kInputDimensions * b;
+                const IndexType output_batch_offset = kOutputDimensions * b;
 
-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions = LayerType::kInputDimensions;
-  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    biases_diff_[i] += gradients[output_batch_offset + i];
+                }
 
-  // If the output dimensionality is 1, the output layer
-  static constexpr bool kIsOutputLayer = kOutputDimensions == 1;
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    for (IndexType j = 0; j < kInputDimensions; ++j) {
+                        const IndexType index = kInputDimensions * i + j;
+                        weights_diff_[index] += gradients[output_batch_offset + i] *
+                            batch_input_[input_batch_offset + j];
+                    }
+                }
+            }
 
-  // Coefficient used for parameterization
-  static constexpr LearnFloatType kActivationScale =
-      std::numeric_limits<std::int8_t>::max();
-  static constexpr LearnFloatType kBiasScale = kIsOutputLayer ?
-      (kPonanzaConstant * FV_SCALE) :
-      ((1 << kWeightScaleBits) * kActivationScale);
-  static constexpr LearnFloatType kWeightScale = kBiasScale / kActivationScale;
+            for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                biases_[i] -= local_learning_rate * biases_diff_[i];
+            }
 
-  // Upper limit of absolute value of weight used to prevent overflow when parameterizing integers
-  static constexpr LearnFloatType kMaxWeightMagnitude =
-      std::numeric_limits<typename LayerType::WeightType>::max() / kWeightScale;
-
-  // number of samples in mini-batch
-  IndexType batch_size_;
-
-  // Input mini batch
-  const LearnFloatType* batch_input_;
-
-  // Trainer of the previous layer
-  const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
-
-  // layer to learn
-  LayerType* const target_layer_;
-
-  // parameter
-  LearnFloatType biases_[kOutputDimensions];
-  LearnFloatType weights_[kOutputDimensions * kInputDimensions];
-
-  // Buffer used for updating parameters
-  LearnFloatType biases_diff_[kOutputDimensions];
-  LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions];
-
-  // Forward propagation buffer
-  std::vector<LearnFloatType> output_;
-
-  // buffer for back propagation
-  std::vector<LearnFloatType> gradients_;
-
-  // hyper parameter
-  LearnFloatType momentum_;
-  LearnFloatType learning_rate_scale_;
-};
-
-}  // namespace NNUE
-
-}  // namespace Eval
+            for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
+                weights_[i] -= local_learning_rate * weights_diff_[i];
+            }
+
+#endif
+            previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate);
+        }
+
+    private:
+        // constructor
+        Trainer(LayerType* target_layer, FeatureTransformer* ft) :
+            batch_size_(0),
+            batch_input_(nullptr),
+            previous_layer_trainer_(Trainer<PreviousLayer>::Create(
+                &target_layer->previous_layer_, ft)),
+            target_layer_(target_layer),
+            biases_(),
+            weights_(),
+            biases_diff_(),
+            weights_diff_(),
+            momentum_(0.2),
+            learning_rate_scale_(1.0) {
+
+            DequantizeParameters();
+        }
+
+        // Weight saturation and parameterization
+        void QuantizeParameters() {
+            for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
+                weights_[i] = std::max(-kMaxWeightMagnitude,
+                                       std::min(+kMaxWeightMagnitude, weights_[i]));
+            }
+
+            for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                target_layer_->biases_[i] =
+                    Round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
+            }
+
+            for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                const auto offset = kInputDimensions * i;
+                const auto padded_offset = LayerType::kPaddedInputDimensions * i;
+                for (IndexType j = 0; j < kInputDimensions; ++j) {
+                    target_layer_->weights_[padded_offset + j] =
+                        Round<typename LayerType::WeightType>(
+                            weights_[offset + j] * kWeightScale);
+                }
+            }
+        }
+
+        // read parameterized integer
+        void DequantizeParameters() {
+            for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                biases_[i] = static_cast<LearnFloatType>(
+                    target_layer_->biases_[i] / kBiasScale);
+            }
+
+            for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                const auto offset = kInputDimensions * i;
+                const auto padded_offset = LayerType::kPaddedInputDimensions * i;
+                for (IndexType j = 0; j < kInputDimensions; ++j) {
+                    weights_[offset + j] = static_cast<LearnFloatType>(
+                        target_layer_->weights_[padded_offset + j] / kWeightScale);
+                }
+            }
+
+            std::fill(std::begin(biases_diff_), std::end(biases_diff_),
+                      static_cast<LearnFloatType>(0.0));
+            std::fill(std::begin(weights_diff_), std::end(weights_diff_),
+                      static_cast<LearnFloatType>(0.0));
+        }
+
+        // number of input/output dimensions
+        static constexpr IndexType kInputDimensions = LayerType::kInputDimensions;
+        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+
+        // If the output dimensionality is 1, the output layer
+        static constexpr bool kIsOutputLayer = kOutputDimensions == 1;
+
+        // Coefficient used for parameterization
+        static constexpr LearnFloatType kActivationScale =
+            std::numeric_limits<std::int8_t>::max();
+
+        static constexpr LearnFloatType kBiasScale = kIsOutputLayer ?
+            (kPonanzaConstant * FV_SCALE) :
+            ((1 << kWeightScaleBits) * kActivationScale);
+
+        static constexpr LearnFloatType kWeightScale = kBiasScale / kActivationScale;
+
+        // Upper limit of absolute value of weight used to prevent overflow when parameterizing integers
+        static constexpr LearnFloatType kMaxWeightMagnitude =
+            std::numeric_limits<typename LayerType::WeightType>::max() / kWeightScale;
+
+        // number of samples in mini-batch
+        IndexType batch_size_;
+
+        // Input mini batch
+        const LearnFloatType* batch_input_;
+
+        // Trainer of the previous layer
+        const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+
+        // layer to learn
+        LayerType* const target_layer_;
+
+        // parameter
+        LearnFloatType biases_[kOutputDimensions];
+        LearnFloatType weights_[kOutputDimensions * kInputDimensions];
+
+        // Buffer used for updating parameters
+        LearnFloatType biases_diff_[kOutputDimensions];
+        LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions];
+
+        // Forward propagation buffer
+        std::vector<LearnFloatType> output_;
+
+        // buffer for back propagation
+        std::vector<LearnFloatType> gradients_;
+
+        // hyper parameter
+        LearnFloatType momentum_;
+        LearnFloatType learning_rate_scale_;
+    };
+
+}  // namespace Eval::NNUE
 
 #endif
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index cf7a2447..902c2747 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -1,138 +1,142 @@
-﻿// Specialization of NNUE evaluation function learning class template for ClippedReLU
-
-#ifndef _NNUE_TRAINER_CLIPPED_RELU_H_
+﻿#ifndef _NNUE_TRAINER_CLIPPED_RELU_H_
 #define _NNUE_TRAINER_CLIPPED_RELU_H_
 
-#include "../../learn/learn.h"
-#include "../layers/clipped_relu.h"
 #include "trainer.h"
 
-namespace Eval {
+#include "learn/learn.h"
 
-namespace NNUE {
+#include "nnue/layers/clipped_relu.h"
 
-// Learning: Affine transformation layer
-template <typename PreviousLayer>
-class Trainer<Layers::ClippedReLU<PreviousLayer>> {
- private:
-  // Type of layer to learn
-  using LayerType = Layers::ClippedReLU<PreviousLayer>;
+// Specialization of NNUE evaluation function learning class template for ClippedReLU
+namespace Eval::NNUE {
 
- public:
-  // factory function
-  static std::shared_ptr<Trainer> Create(
-      LayerType* target_layer, FeatureTransformer* ft) {
-    return std::shared_ptr<Trainer>(
-        new Trainer(target_layer, ft));
-  }
+    // Learning: Affine transformation layer
+    template <typename PreviousLayer>
+    class Trainer<Layers::ClippedReLU<PreviousLayer>> {
+    private:
+        // Type of layer to learn
+        using LayerType = Layers::ClippedReLU<PreviousLayer>;
 
-  // Set options such as hyperparameters
-  void SendMessage(Message* message) {
-    previous_layer_trainer_->SendMessage(message);
-    if (ReceiveMessage("check_health", message)) {
-      CheckHealth();
-    }
-  }
+    public:
+        // factory function
+        static std::shared_ptr<Trainer> Create(
+            LayerType* target_layer, FeatureTransformer* ft) {
 
-  // Initialize the parameters with random numbers
-  template <typename RNG>
-  void Initialize(RNG& rng) {
-    previous_layer_trainer_->Initialize(rng);
-  }
+            return std::shared_ptr<Trainer>(
+                new Trainer(target_layer, ft));
+        }
 
-  // forward propagation
-  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
-    if (output_.size() < kOutputDimensions * batch.size()) {
-      output_.resize(kOutputDimensions * batch.size());
-      gradients_.resize(kInputDimensions * batch.size());
-    }
-    const auto input = previous_layer_trainer_->Propagate(batch);
-    batch_size_ = static_cast<IndexType>(batch.size());
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        const IndexType index = batch_offset + i;
-        output_[index] = std::max(+kZero, std::min(+kOne, input[index]));
-        min_activations_[i] = std::min(min_activations_[i], output_[index]);
-        max_activations_[i] = std::max(max_activations_[i], output_[index]);
-      }
-    }
-    return output_.data();
-  }
+        // Set options such as hyperparameters
+        void SendMessage(Message* message) {
+            previous_layer_trainer_->SendMessage(message);
+            if (ReceiveMessage("check_health", message)) {
+                CheckHealth();
+            }
+        }
 
-  // backpropagation
-  void Backpropagate(const LearnFloatType* gradients,
-                     LearnFloatType learning_rate) {
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        const IndexType index = batch_offset + i;
-        gradients_[index] = gradients[index] *
-            (output_[index] > kZero) * (output_[index] < kOne);
-      }
-    }
-    previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate);
-  }
+        // Initialize the parameters with random numbers
+        template <typename RNG>
+        void Initialize(RNG& rng) {
+            previous_layer_trainer_->Initialize(rng);
+        }
 
- private:
-  // constructor
-  Trainer(LayerType* target_layer, FeatureTransformer* ft) :
-      batch_size_(0),
-      previous_layer_trainer_(Trainer<PreviousLayer>::Create(
-          &target_layer->previous_layer_, ft)),
-      target_layer_(target_layer) {
-    std::fill(std::begin(min_activations_), std::end(min_activations_),
-              std::numeric_limits<LearnFloatType>::max());
-    std::fill(std::begin(max_activations_), std::end(max_activations_),
-              std::numeric_limits<LearnFloatType>::lowest());
-  }
+        // forward propagation
+        const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+            if (output_.size() < kOutputDimensions * batch.size()) {
+              output_.resize(kOutputDimensions * batch.size());
+              gradients_.resize(kInputDimensions * batch.size());
+            }
 
-  // Check if there are any problems with learning
-  void CheckHealth() {
-    const auto largest_min_activation = *std::max_element(
-        std::begin(min_activations_), std::end(min_activations_));
-    const auto smallest_max_activation = *std::min_element(
-        std::begin(max_activations_), std::end(max_activations_));
-    std::cout << "INFO: largest min activation = " << largest_min_activation
-              << ", smallest max activation = " << smallest_max_activation
-              << std::endl;
+            const auto input = previous_layer_trainer_->Propagate(batch);
+            batch_size_ = static_cast<IndexType>(batch.size());
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    const IndexType index = batch_offset + i;
+                    output_[index] = std::max(+kZero, std::min(+kOne, input[index]));
+                    min_activations_[i] = std::min(min_activations_[i], output_[index]);
+                    max_activations_[i] = std::max(max_activations_[i], output_[index]);
+                }
+            }
 
-    std::fill(std::begin(min_activations_), std::end(min_activations_),
-              std::numeric_limits<LearnFloatType>::max());
-    std::fill(std::begin(max_activations_), std::end(max_activations_),
-              std::numeric_limits<LearnFloatType>::lowest());
-  }
+            return output_.data();
+        }
 
-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions = LayerType::kOutputDimensions;
-  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+        // backpropagation
+        void Backpropagate(const LearnFloatType* gradients,
+                           LearnFloatType learning_rate) {
 
-  // LearnFloatType constant
-  static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
-  static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    const IndexType index = batch_offset + i;
+                    gradients_[index] = gradients[index] *
+                        (output_[index] > kZero) * (output_[index] < kOne);
+                }
+            }
 
-  // number of samples in mini-batch
-  IndexType batch_size_;
+            previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate);
+        }
 
-  // Trainer of the previous layer
-  const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+    private:
+        // constructor
+        Trainer(LayerType* target_layer, FeatureTransformer* ft) :
+            batch_size_(0),
+            previous_layer_trainer_(Trainer<PreviousLayer>::Create(
+                &target_layer->previous_layer_, ft)),
+            target_layer_(target_layer) {
 
-  // layer to learn
-  LayerType* const target_layer_;
+            std::fill(std::begin(min_activations_), std::end(min_activations_),
+                      std::numeric_limits<LearnFloatType>::max());
+            std::fill(std::begin(max_activations_), std::end(max_activations_),
+                      std::numeric_limits<LearnFloatType>::lowest());
+        }
 
-  // Forward propagation buffer
-  std::vector<LearnFloatType> output_;
+        // Check if there are any problems with learning
+        void CheckHealth() {
+            const auto largest_min_activation = *std::max_element(
+                std::begin(min_activations_), std::end(min_activations_));
+            const auto smallest_max_activation = *std::min_element(
+                std::begin(max_activations_), std::end(max_activations_));
 
-  // buffer for back propagation
-  std::vector<LearnFloatType> gradients_;
+            std::cout << "INFO: largest min activation = " << largest_min_activation
+                      << ", smallest max activation = " << smallest_max_activation
+                      << std::endl;
 
-  // Health check statistics
-  LearnFloatType min_activations_[kOutputDimensions];
-  LearnFloatType max_activations_[kOutputDimensions];
-};
+            std::fill(std::begin(min_activations_), std::end(min_activations_),
+                      std::numeric_limits<LearnFloatType>::max());
+            std::fill(std::begin(max_activations_), std::end(max_activations_),
+                      std::numeric_limits<LearnFloatType>::lowest());
+        }
 
-}  // namespace NNUE
+        // number of input/output dimensions
+        static constexpr IndexType kInputDimensions = LayerType::kOutputDimensions;
+        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
 
-}  // namespace Eval
+        // LearnFloatType constant
+        static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
+        static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
+
+        // number of samples in mini-batch
+        IndexType batch_size_;
+
+        // Trainer of the previous layer
+        const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+
+        // layer to learn
+        LayerType* const target_layer_;
+
+        // Forward propagation buffer
+        std::vector<LearnFloatType> output_;
+
+        // buffer for back propagation
+        std::vector<LearnFloatType> gradients_;
+
+        // Health check statistics
+        LearnFloatType min_activations_[kOutputDimensions];
+        LearnFloatType max_activations_[kOutputDimensions];
+    };
+
+}  // namespace Eval::NNUE
 
 #endif
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 225c91fc..f403e413 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -1,13 +1,14 @@
-﻿// Specialization for feature transformer of learning class template of NNUE evaluation function
-
-#ifndef _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
+﻿#ifndef _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
 #define _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
 
-#include "../../learn/learn.h"
-#include "../nnue_feature_transformer.h"
 #include "trainer.h"
+
 #include "features/factorizer_feature_set.h"
 
+#include "learn/learn.h"
+
+#include "nnue/nnue_feature_transformer.h"
+
 #include <array>
 #include <bitset>
 #include <numeric>
@@ -18,356 +19,392 @@
 #include <omp.h>
 #endif
 
-namespace Eval {
+// Specialization for feature transformer of learning class template of NNUE evaluation function
+namespace Eval::NNUE {
 
-namespace NNUE {
+    // Learning: Input feature converter
+    template <>
+    class Trainer<FeatureTransformer> {
+    private:
+        // Type of layer to learn
+        using LayerType = FeatureTransformer;
 
-// Learning: Input feature converter
-template <>
-class Trainer<FeatureTransformer> {
- private:
-  // Type of layer to learn
-  using LayerType = FeatureTransformer;
+    public:
+        template <typename T>
+        friend struct AlignedDeleter;
 
- public:
-  template <typename T>
-  friend struct AlignedDeleter;
-  template <typename T, typename... ArgumentTypes>
-  friend std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments);
+        template <typename T, typename... ArgumentTypes>
+        friend std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments);
 
-  // factory function
-  static std::shared_ptr<Trainer> Create(LayerType* target_layer) {
-    return MakeAlignedSharedPtr<Trainer>(target_layer);
-  }
+        // factory function
+        static std::shared_ptr<Trainer> Create(LayerType* target_layer) {
+            return MakeAlignedSharedPtr<Trainer>(target_layer);
+        }
 
-  // Set options such as hyperparameters
-  void SendMessage(Message* message) {
-    if (ReceiveMessage("momentum", message)) {
-      momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
-    }
-    if (ReceiveMessage("learning_rate_scale", message)) {
-      learning_rate_scale_ =
-          static_cast<LearnFloatType>(std::stod(message->value));
-    }
-    if (ReceiveMessage("reset", message)) {
-      DequantizeParameters();
-    }
-    if (ReceiveMessage("quantize_parameters", message)) {
-      QuantizeParameters();
-    }
-    if (ReceiveMessage("clear_unobserved_feature_weights", message)) {
-      ClearUnobservedFeatureWeights();
-    }
-    if (ReceiveMessage("check_health", message)) {
-      CheckHealth();
-    }
-  }
+        // Set options such as hyperparameters
+        void SendMessage(Message* message) {
+            if (ReceiveMessage("momentum", message)) {
+                momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
+            }
 
-  // Initialize the parameters with random numbers
-  template <typename RNG>
-  void Initialize(RNG& rng) {
-    std::fill(std::begin(weights_), std::end(weights_), +kZero);
-    const double kSigma = 0.1 / std::sqrt(RawFeatures::kMaxActiveDimensions);
-    auto distribution = std::normal_distribution<double>(0.0, kSigma);
-    for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) {
-      const auto weight = static_cast<LearnFloatType>(distribution(rng));
-      weights_[i] = weight;
-    }
-    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-      biases_[i] = static_cast<LearnFloatType>(0.5);
-    }
-    QuantizeParameters();
-  }
+            if (ReceiveMessage("learning_rate_scale", message)) {
+                learning_rate_scale_ =
+                    static_cast<LearnFloatType>(std::stod(message->value));
+            }
 
-  // forward propagation
-  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
-    if (output_.size() < kOutputDimensions * batch.size()) {
-      output_.resize(kOutputDimensions * batch.size());
-      gradients_.resize(kOutputDimensions * batch.size());
-    }
-    batch_ = &batch;
-    // affine transform
+            if (ReceiveMessage("reset", message)) {
+                DequantizeParameters();
+            }
+
+            if (ReceiveMessage("quantize_parameters", message)) {
+                QuantizeParameters();
+            }
+
+            if (ReceiveMessage("clear_unobserved_feature_weights", message)) {
+                ClearUnobservedFeatureWeights();
+            }
+
+            if (ReceiveMessage("check_health", message)) {
+                CheckHealth();
+            }
+        }
+
+        // Initialize the parameters with random numbers
+        template <typename RNG>
+        void Initialize(RNG& rng) {
+            std::fill(std::begin(weights_), std::end(weights_), +kZero);
+
+            const double kSigma = 0.1 / std::sqrt(RawFeatures::kMaxActiveDimensions);
+            auto distribution = std::normal_distribution<double>(0.0, kSigma);
+
+            for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) {
+                const auto weight = static_cast<LearnFloatType>(distribution(rng));
+                weights_[i] = weight;
+            }
+
+            for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                biases_[i] = static_cast<LearnFloatType>(0.5);
+            }
+
+            QuantizeParameters();
+        }
+
+        // forward propagation
+        const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+            if (output_.size() < kOutputDimensions * batch.size()) {
+                output_.resize(kOutputDimensions * batch.size());
+                gradients_.resize(kOutputDimensions * batch.size());
+            }
+
+            batch_ = &batch;
+            // affine transform
 #pragma omp parallel for
-    for (IndexType b = 0; b < batch.size(); ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType c = 0; c < 2; ++c) {
-        const IndexType output_offset = batch_offset + kHalfDimensions * c;
+            for (IndexType b = 0; b < batch.size(); ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType c = 0; c < 2; ++c) {
+                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
 #if defined(USE_BLAS)
-        cblas_scopy(kHalfDimensions, biases_, 1, &output_[output_offset], 1);
-        for (const auto& feature : batch[b].training_features[c]) {
-          const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
-          cblas_saxpy(kHalfDimensions, (float)feature.GetCount(),
-                      &weights_[weights_offset], 1, &output_[output_offset], 1);
-        }
+                    cblas_scopy(kHalfDimensions, biases_, 1, &output_[output_offset], 1);
+                    for (const auto& feature : batch[b].training_features[c]) {
+                        const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
+                        cblas_saxpy(kHalfDimensions, (float)feature.GetCount(),
+                                    &weights_[weights_offset], 1, &output_[output_offset], 1);
+                    }
 #else
-        for (IndexType i = 0; i < kHalfDimensions; ++i) {
-          output_[output_offset + i] = biases_[i];
-        }
-        for (const auto& feature : batch[b].training_features[c]) {
-          const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
-          for (IndexType i = 0; i < kHalfDimensions; ++i) {
-            output_[output_offset + i] +=
-                feature.GetCount() * weights_[weights_offset + i];
-          }
-        }
+                    for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                        output_[output_offset + i] = biases_[i];
+                    }
+                    for (const auto& feature : batch[b].training_features[c]) {
+                        const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
+                        for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                            output_[output_offset + i] +=
+                                feature.GetCount() * weights_[weights_offset + i];
+                        }
+                    }
 #endif
-      }
-    }
-    // clipped ReLU
-    for (IndexType b = 0; b < batch.size(); ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        const IndexType index = batch_offset + i;
-        min_pre_activation_ = std::min(min_pre_activation_, output_[index]);
-        max_pre_activation_ = std::max(max_pre_activation_, output_[index]);
-        output_[index] = std::max(+kZero, std::min(+kOne, output_[index]));
-        const IndexType t = i % kHalfDimensions;
-        min_activations_[t] = std::min(min_activations_[t], output_[index]);
-        max_activations_[t] = std::max(max_activations_[t], output_[index]);
-      }
-    }
-    return output_.data();
-  }
+                }
+            }
 
-  // backpropagation
-  void Backpropagate(const LearnFloatType* gradients,
-                     LearnFloatType learning_rate) {
-    const LearnFloatType local_learning_rate =
-        learning_rate * learning_rate_scale_;
-    for (IndexType b = 0; b < batch_->size(); ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        const IndexType index = batch_offset + i;
-        gradients_[index] = gradients[index] *
-            ((output_[index] > kZero) * (output_[index] < kOne));
-      }
-    }
-    // Since the weight matrix updates only the columns corresponding to the features that appeared in the input,
-    // Correct the learning rate and adjust the scale without using momentum
-    const LearnFloatType effective_learning_rate =
-        static_cast<LearnFloatType>(local_learning_rate / (1.0 - momentum_));
+            // clipped ReLU
+            for (IndexType b = 0; b < batch.size(); ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    const IndexType index = batch_offset + i;
+                    min_pre_activation_ = std::min(min_pre_activation_, output_[index]);
+                    max_pre_activation_ = std::max(max_pre_activation_, output_[index]);
+                    output_[index] = std::max(+kZero, std::min(+kOne, output_[index]));
+                    const IndexType t = i % kHalfDimensions;
+                    min_activations_[t] = std::min(min_activations_[t], output_[index]);
+                    max_activations_[t] = std::max(max_activations_[t], output_[index]);
+                }
+            }
+
+            return output_.data();
+        }
+
+        // backpropagation
+        void Backpropagate(const LearnFloatType* gradients,
+                           LearnFloatType learning_rate) {
+
+            const LearnFloatType local_learning_rate =
+                learning_rate * learning_rate_scale_;
+
+            for (IndexType b = 0; b < batch_->size(); ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    const IndexType index = batch_offset + i;
+                    gradients_[index] = gradients[index] *
+                        ((output_[index] > kZero) * (output_[index] < kOne));
+                }
+            }
+
+            // Since the weight matrix updates only the columns corresponding to the features that appeared in the input,
+            // Correct the learning rate and adjust the scale without using momentum
+            const LearnFloatType effective_learning_rate =
+                static_cast<LearnFloatType>(local_learning_rate / (1.0 - momentum_));
 #if defined(USE_BLAS)
-    cblas_sscal(kHalfDimensions, momentum_, biases_diff_, 1);
-    for (IndexType b = 0; b < batch_->size(); ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType c = 0; c < 2; ++c) {
-        const IndexType output_offset = batch_offset + kHalfDimensions * c;
-        cblas_saxpy(kHalfDimensions, 1.0,
-                    &gradients_[output_offset], 1, biases_diff_, 1);
-      }
-    }
-    cblas_saxpy(kHalfDimensions, -local_learning_rate,
-                biases_diff_, 1, biases_, 1);
+            cblas_sscal(kHalfDimensions, momentum_, biases_diff_, 1);
+            for (IndexType b = 0; b < batch_->size(); ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType c = 0; c < 2; ++c) {
+                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
+                    cblas_saxpy(kHalfDimensions, 1.0,
+                                &gradients_[output_offset], 1, biases_diff_, 1);
+                }
+            }
+
+            cblas_saxpy(kHalfDimensions, -local_learning_rate,
+                        biases_diff_, 1, biases_, 1);
+
 #pragma omp parallel
-    {
+            {
 #if defined(_OPENMP)
-      const IndexType num_threads = omp_get_num_threads();
-      const IndexType thread_index = omp_get_thread_num();
+                const IndexType num_threads = omp_get_num_threads();
+                const IndexType thread_index = omp_get_thread_num();
 #endif
-      for (IndexType b = 0; b < batch_->size(); ++b) {
-        const IndexType batch_offset = kOutputDimensions * b;
-        for (IndexType c = 0; c < 2; ++c) {
-          const IndexType output_offset = batch_offset + kHalfDimensions * c;
-          for (const auto& feature : (*batch_)[b].training_features[c]) {
+                for (IndexType b = 0; b < batch_->size(); ++b) {
+                    const IndexType batch_offset = kOutputDimensions * b;
+                    for (IndexType c = 0; c < 2; ++c) {
+                        const IndexType output_offset = batch_offset + kHalfDimensions * c;
+                        for (const auto& feature : (*batch_)[b].training_features[c]) {
 #if defined(_OPENMP)
-            if (feature.GetIndex() % num_threads != thread_index) continue;
+                            if (feature.GetIndex() % num_threads != thread_index)
+                                continue;
 #endif
-            const IndexType weights_offset =
-                kHalfDimensions * feature.GetIndex();
-            const auto scale = static_cast<LearnFloatType>(
-                effective_learning_rate / feature.GetCount());
-            cblas_saxpy(kHalfDimensions, -scale,
-                        &gradients_[output_offset], 1,
-                        &weights_[weights_offset], 1);
-          }
-        }
-      }
-    }
+                            const IndexType weights_offset =
+                                kHalfDimensions * feature.GetIndex();
+                            const auto scale = static_cast<LearnFloatType>(
+                                effective_learning_rate / feature.GetCount());
+
+                            cblas_saxpy(kHalfDimensions, -scale,
+                                        &gradients_[output_offset], 1,
+                                        &weights_[weights_offset], 1);
+                        }
+                    }
+                }
+            }
+
 #else
-    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-      biases_diff_[i] *= momentum_;
-    }
-    for (IndexType b = 0; b < batch_->size(); ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType c = 0; c < 2; ++c) {
-        const IndexType output_offset = batch_offset + kHalfDimensions * c;
-        for (IndexType i = 0; i < kHalfDimensions; ++i) {
-          biases_diff_[i] += gradients_[output_offset + i];
-        }
-      }
-    }
-    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-      biases_[i] -= local_learning_rate * biases_diff_[i];
-    }
-    for (IndexType b = 0; b < batch_->size(); ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType c = 0; c < 2; ++c) {
-        const IndexType output_offset = batch_offset + kHalfDimensions * c;
-        for (const auto& feature : (*batch_)[b].training_features[c]) {
-          const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
-          const auto scale = static_cast<LearnFloatType>(
-              effective_learning_rate / feature.GetCount());
-          for (IndexType i = 0; i < kHalfDimensions; ++i) {
-            weights_[weights_offset + i] -=
-                scale * gradients_[output_offset + i];
-          }
-        }
-      }
-    }
+            for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                biases_diff_[i] *= momentum_;
+            }
+
+            for (IndexType b = 0; b < batch_->size(); ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType c = 0; c < 2; ++c) {
+                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
+                    for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                        biases_diff_[i] += gradients_[output_offset + i];
+                    }
+                }
+            }
+
+            for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                biases_[i] -= local_learning_rate * biases_diff_[i];
+            }
+
+            for (IndexType b = 0; b < batch_->size(); ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType c = 0; c < 2; ++c) {
+                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
+                    for (const auto& feature : (*batch_)[b].training_features[c]) {
+                        const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
+                        const auto scale = static_cast<LearnFloatType>(
+                            effective_learning_rate / feature.GetCount());
+
+                        for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                            weights_[weights_offset + i] -=
+                                scale * gradients_[output_offset + i];
+                        }
+                    }
+                }
+            }
+
 #endif
-    for (IndexType b = 0; b < batch_->size(); ++b) {
-      for (IndexType c = 0; c < 2; ++c) {
-        for (const auto& feature : (*batch_)[b].training_features[c]) {
-          observed_features.set(feature.GetIndex());
+            for (IndexType b = 0; b < batch_->size(); ++b) {
+                for (IndexType c = 0; c < 2; ++c) {
+                    for (const auto& feature : (*batch_)[b].training_features[c]) {
+                        observed_features.set(feature.GetIndex());
+                    }
+                }
+            }
         }
-      }
-    }
-  }
 
- private:
-  // constructor
-  Trainer(LayerType* target_layer) :
-      batch_(nullptr),
-      target_layer_(target_layer),
-      biases_(),
-      weights_(),
-      biases_diff_(),
-      momentum_(0.2),
-      learning_rate_scale_(1.0) {
-    min_pre_activation_ = std::numeric_limits<LearnFloatType>::max();
-    max_pre_activation_ = std::numeric_limits<LearnFloatType>::lowest();
-    std::fill(std::begin(min_activations_), std::end(min_activations_),
-              std::numeric_limits<LearnFloatType>::max());
-    std::fill(std::begin(max_activations_), std::end(max_activations_),
-              std::numeric_limits<LearnFloatType>::lowest());
-    DequantizeParameters();
-  }
+    private:
+        // constructor
+        Trainer(LayerType* target_layer) :
+            batch_(nullptr),
+            target_layer_(target_layer),
+            biases_(),
+            weights_(),
+            biases_diff_(),
+            momentum_(0.2),
+            learning_rate_scale_(1.0) {
+
+            min_pre_activation_ = std::numeric_limits<LearnFloatType>::max();
+            max_pre_activation_ = std::numeric_limits<LearnFloatType>::lowest();
+
+            std::fill(std::begin(min_activations_), std::end(min_activations_),
+                      std::numeric_limits<LearnFloatType>::max());
+            std::fill(std::begin(max_activations_), std::end(max_activations_),
+                      std::numeric_limits<LearnFloatType>::lowest());
+
+            DequantizeParameters();
+        }
+
+        // Weight saturation and parameterization
+        void QuantizeParameters() {
+            for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                target_layer_->biases_[i] =
+                    Round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
+            }
+
+            std::vector<TrainingFeature> training_features;
 
-  // Weight saturation and parameterization
-  void QuantizeParameters() {
-    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-      target_layer_->biases_[i] =
-          Round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
-    }
-    std::vector<TrainingFeature> training_features;
 #pragma omp parallel for private(training_features)
-    for (IndexType j = 0; j < RawFeatures::kDimensions; ++j) {
-      training_features.clear();
-      Features::Factorizer<RawFeatures>::AppendTrainingFeatures(
-          j, &training_features);
-      for (IndexType i = 0; i < kHalfDimensions; ++i) {
-        double sum = 0.0;
-        for (const auto& feature : training_features) {
-          sum += weights_[kHalfDimensions * feature.GetIndex() + i];
+            for (IndexType j = 0; j < RawFeatures::kDimensions; ++j) {
+                training_features.clear();
+                Features::Factorizer<RawFeatures>::AppendTrainingFeatures(
+                    j, &training_features);
+
+                for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                    double sum = 0.0;
+                    for (const auto& feature : training_features) {
+                        sum += weights_[kHalfDimensions * feature.GetIndex() + i];
+                    }
+
+                    target_layer_->weights_[kHalfDimensions * j + i] =
+                        Round<typename LayerType::WeightType>(sum * kWeightScale);
+                }
+            }
         }
-        target_layer_->weights_[kHalfDimensions * j + i] =
-            Round<typename LayerType::WeightType>(sum * kWeightScale);
-      }
-    }
-  }
 
-  // read parameterized integer
-  void DequantizeParameters() {
-    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-      biases_[i] = static_cast<LearnFloatType>(
-          target_layer_->biases_[i] / kBiasScale);
-    }
-    std::fill(std::begin(weights_), std::end(weights_), +kZero);
-    for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) {
-      weights_[i] = static_cast<LearnFloatType>(
-          target_layer_->weights_[i] / kWeightScale);
-    }
-    std::fill(std::begin(biases_diff_), std::end(biases_diff_), +kZero);
-  }
+        // read parameterized integer
+        void DequantizeParameters() {
+            for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                biases_[i] = static_cast<LearnFloatType>(
+                    target_layer_->biases_[i] / kBiasScale);
+            }
 
-  // Set the weight corresponding to the feature that does not appear in the learning data to 0
-  void ClearUnobservedFeatureWeights() {
-    for (IndexType i = 0; i < kInputDimensions; ++i) {
-      if (!observed_features.test(i)) {
-        std::fill(std::begin(weights_) + kHalfDimensions * i,
-                  std::begin(weights_) + kHalfDimensions * (i + 1), +kZero);
-      }
-    }
-    QuantizeParameters();
-  }
+            std::fill(std::begin(weights_), std::end(weights_), +kZero);
 
-  // Check if there are any problems with learning
-  void CheckHealth() {
-    std::cout << "INFO: observed " << observed_features.count()
-              << " (out of " << kInputDimensions << ") features" << std::endl;
+            for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) {
+                weights_[i] = static_cast<LearnFloatType>(
+                    target_layer_->weights_[i] / kWeightScale);
+            }
 
-    constexpr LearnFloatType kPreActivationLimit =
-        std::numeric_limits<typename LayerType::WeightType>::max() /
-        kWeightScale;
-    std::cout << "INFO: (min, max) of pre-activations = "
-              << min_pre_activation_ << ", "
-              << max_pre_activation_ << " (limit = "
-              << kPreActivationLimit << ")" << std::endl;
+            std::fill(std::begin(biases_diff_), std::end(biases_diff_), +kZero);
+        }
 
-    const auto largest_min_activation = *std::max_element(
-        std::begin(min_activations_), std::end(min_activations_));
-    const auto smallest_max_activation = *std::min_element(
-        std::begin(max_activations_), std::end(max_activations_));
-    std::cout << "INFO: largest min activation = " << largest_min_activation
-              << ", smallest max activation = " << smallest_max_activation
-              << std::endl;
+        // Set the weight corresponding to the feature that does not appear in the learning data to 0
+        void ClearUnobservedFeatureWeights() {
+            for (IndexType i = 0; i < kInputDimensions; ++i) {
+                if (!observed_features.test(i)) {
+                    std::fill(std::begin(weights_) + kHalfDimensions * i,
+                              std::begin(weights_) + kHalfDimensions * (i + 1), +kZero);
+                }
+            }
 
-    std::fill(std::begin(min_activations_), std::end(min_activations_),
-              std::numeric_limits<LearnFloatType>::max());
-    std::fill(std::begin(max_activations_), std::end(max_activations_),
-              std::numeric_limits<LearnFloatType>::lowest());
-  }
+            QuantizeParameters();
+        }
 
-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions =
-      Features::Factorizer<RawFeatures>::GetDimensions();
-  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
-  static constexpr IndexType kHalfDimensions = LayerType::kHalfDimensions;
+        // Check if there are any problems with learning
+        void CheckHealth() {
+            std::cout << "INFO: observed " << observed_features.count()
+                      << " (out of " << kInputDimensions << ") features" << std::endl;
 
-  // Coefficient used for parameterization
-  static constexpr LearnFloatType kActivationScale =
-      std::numeric_limits<std::int8_t>::max();
-  static constexpr LearnFloatType kBiasScale = kActivationScale;
-  static constexpr LearnFloatType kWeightScale = kActivationScale;
+            constexpr LearnFloatType kPreActivationLimit =
+                std::numeric_limits<typename LayerType::WeightType>::max() /
+                kWeightScale;
 
-  // LearnFloatType constant
-  static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
-  static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
+            std::cout << "INFO: (min, max) of pre-activations = "
+                      << min_pre_activation_ << ", "
+                      << max_pre_activation_ << " (limit = "
+                      << kPreActivationLimit << ")" << std::endl;
 
-  // mini batch
-  const std::vector<Example>* batch_;
+            const auto largest_min_activation = *std::max_element(
+                std::begin(min_activations_), std::end(min_activations_));
+            const auto smallest_max_activation = *std::min_element(
+                std::begin(max_activations_), std::end(max_activations_));
 
-  // layer to learn
-  LayerType* const target_layer_;
+            std::cout << "INFO: largest min activation = " << largest_min_activation
+                      << ", smallest max activation = " << smallest_max_activation
+                      << std::endl;
 
-  // parameter
-  alignas(kCacheLineSize) LearnFloatType biases_[kHalfDimensions];
-  alignas(kCacheLineSize)
-      LearnFloatType weights_[kHalfDimensions * kInputDimensions];
+            std::fill(std::begin(min_activations_), std::end(min_activations_),
+                      std::numeric_limits<LearnFloatType>::max());
+            std::fill(std::begin(max_activations_), std::end(max_activations_),
+                      std::numeric_limits<LearnFloatType>::lowest());
+        }
 
-  // Buffer used for updating parameters
-  LearnFloatType biases_diff_[kHalfDimensions];
-  std::vector<LearnFloatType> gradients_;
+        // number of input/output dimensions
+        static constexpr IndexType kInputDimensions =
+            Features::Factorizer<RawFeatures>::GetDimensions();
+        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+        static constexpr IndexType kHalfDimensions = LayerType::kHalfDimensions;
 
-  // Forward propagation buffer
-  std::vector<LearnFloatType> output_;
+        // Coefficient used for parameterization
+        static constexpr LearnFloatType kActivationScale =
+            std::numeric_limits<std::int8_t>::max();
+        static constexpr LearnFloatType kBiasScale = kActivationScale;
+        static constexpr LearnFloatType kWeightScale = kActivationScale;
 
-  // Features that appeared in the training data
-  std::bitset<kInputDimensions> observed_features;
+        // LearnFloatType constant
+        static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
+        static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
 
-  // hyper parameter
-  LearnFloatType momentum_;
-  LearnFloatType learning_rate_scale_;
+        // mini batch
+        const std::vector<Example>* batch_;
 
-  // Health check statistics
-  LearnFloatType min_pre_activation_;
-  LearnFloatType max_pre_activation_;
-  LearnFloatType min_activations_[kHalfDimensions];
-  LearnFloatType max_activations_[kHalfDimensions];
-};
+        // layer to learn
+        LayerType* const target_layer_;
 
-}  // namespace NNUE
+        // parameter
+        alignas(kCacheLineSize) LearnFloatType biases_[kHalfDimensions];
+        alignas(kCacheLineSize)
+            LearnFloatType weights_[kHalfDimensions * kInputDimensions];
 
-}  // namespace Eval
+        // Buffer used for updating parameters
+        LearnFloatType biases_diff_[kHalfDimensions];
+        std::vector<LearnFloatType> gradients_;
+
+        // Forward propagation buffer
+        std::vector<LearnFloatType> output_;
+
+        // Features that appeared in the training data
+        std::bitset<kInputDimensions> observed_features;
+
+        // hyper parameter
+        LearnFloatType momentum_;
+        LearnFloatType learning_rate_scale_;
+
+        // Health check statistics
+        LearnFloatType min_pre_activation_;
+        LearnFloatType max_pre_activation_;
+        LearnFloatType min_activations_[kHalfDimensions];
+        LearnFloatType max_activations_[kHalfDimensions];
+    };
+
+}  // namespace Eval::NNUE
 
 #endif
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index e2cd0c25..45dcbacc 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -1,247 +1,267 @@
-﻿// Specialization of NNUE evaluation function learning class template for InputSlice
-
-#ifndef _NNUE_TRAINER_INPUT_SLICE_H_
+﻿#ifndef _NNUE_TRAINER_INPUT_SLICE_H_
 #define _NNUE_TRAINER_INPUT_SLICE_H_
 
-#include "../../learn/learn.h"
-#include "../layers/input_slice.h"
 #include "trainer.h"
 
-namespace Eval {
+#include "learn/learn.h"
 
-namespace NNUE {
+#include "nnue/layers/input_slice.h"
 
-// Learning: Input layer
-class SharedInputTrainer {
- public:
-  // factory function
-  static std::shared_ptr<SharedInputTrainer> Create(
-      FeatureTransformer* ft) {
-    static std::shared_ptr<SharedInputTrainer> instance;
-    if (!instance) {
-      instance.reset(new SharedInputTrainer(ft));
-    }
-    ++instance->num_referrers_;
-    return instance;
-  }
+// Specialization of NNUE evaluation function learning class template for InputSlice
+namespace Eval::NNUE {
 
-  // Set options such as hyperparameters
-  void SendMessage(Message* message) {
-    if (num_calls_ == 0) {
-      current_operation_ = Operation::kSendMessage;
-      feature_transformer_trainer_->SendMessage(message);
-    }
-    assert(current_operation_ == Operation::kSendMessage);
-    if (++num_calls_ == num_referrers_) {
-      num_calls_ = 0;
-      current_operation_ = Operation::kNone;
-    }
-  }
+    // Learning: Input layer
+    class SharedInputTrainer {
+    public:
+        // factory function
+        static std::shared_ptr<SharedInputTrainer> Create(
+            FeatureTransformer* ft) {
 
-  // Initialize the parameters with random numbers
-  template <typename RNG>
-  void Initialize(RNG& rng) {
-    if (num_calls_ == 0) {
-      current_operation_ = Operation::kInitialize;
-      feature_transformer_trainer_->Initialize(rng);
-    }
-    assert(current_operation_ == Operation::kInitialize);
-    if (++num_calls_ == num_referrers_) {
-      num_calls_ = 0;
-      current_operation_ = Operation::kNone;
-    }
-  }
+            static std::shared_ptr<SharedInputTrainer> instance;
 
-  // forward propagation
-  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
-    if (gradients_.size() < kInputDimensions * batch.size()) {
-      gradients_.resize(kInputDimensions * batch.size());
-    }
-    batch_size_ = static_cast<IndexType>(batch.size());
-    if (num_calls_ == 0) {
-      current_operation_ = Operation::kPropagate;
-      output_ = feature_transformer_trainer_->Propagate(batch);
-    }
-    assert(current_operation_ == Operation::kPropagate);
-    if (++num_calls_ == num_referrers_) {
-      num_calls_ = 0;
-      current_operation_ = Operation::kNone;
-    }
-    return output_;
-  }
+            if (!instance) {
+                instance.reset(new SharedInputTrainer(ft));
+            }
 
-  // backpropagation
-  void Backpropagate(const LearnFloatType* gradients,
-                     LearnFloatType learning_rate) {
-    if (num_referrers_ == 1) {
-      feature_transformer_trainer_->Backpropagate(gradients, learning_rate);
-      return;
-    }
-    if (num_calls_ == 0) {
-      current_operation_ = Operation::kBackPropagate;
-      for (IndexType b = 0; b < batch_size_; ++b) {
-        const IndexType batch_offset = kInputDimensions * b;
-        for (IndexType i = 0; i < kInputDimensions; ++i) {
-          gradients_[batch_offset + i] = static_cast<LearnFloatType>(0.0);
+            ++instance->num_referrers_;
+
+            return instance;
         }
-      }
-    }
-    assert(current_operation_ == Operation::kBackPropagate);
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kInputDimensions * b;
-      for (IndexType i = 0; i < kInputDimensions; ++i) {
-        gradients_[batch_offset + i] += gradients[batch_offset + i];
-      }
-    }
-    if (++num_calls_ == num_referrers_) {
-      feature_transformer_trainer_->Backpropagate(
-          gradients_.data(), learning_rate);
-      num_calls_ = 0;
-      current_operation_ = Operation::kNone;
-    }
-  }
 
- private:
-  // constructor
-  SharedInputTrainer(FeatureTransformer* ft) :
-      batch_size_(0),
-      num_referrers_(0),
-      num_calls_(0),
-      current_operation_(Operation::kNone),
-      feature_transformer_trainer_(Trainer<FeatureTransformer>::Create(
-          ft)),
-      output_(nullptr) {
-  }
+        // Set options such as hyperparameters
+        void SendMessage(Message* message) {
+            if (num_calls_ == 0) {
+                current_operation_ = Operation::kSendMessage;
+                feature_transformer_trainer_->SendMessage(message);
+            }
 
-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions =
-      FeatureTransformer::kOutputDimensions;
+            assert(current_operation_ == Operation::kSendMessage);
 
-  // type of processing
-  enum class Operation {
-    kNone,
-    kSendMessage,
-    kInitialize,
-    kPropagate,
-    kBackPropagate,
-  };
+            if (++num_calls_ == num_referrers_) {
+                num_calls_ = 0;
+                current_operation_ = Operation::kNone;
+            }
+        }
 
-  // number of samples in mini-batch
-  IndexType batch_size_;
+        // Initialize the parameters with random numbers
+        template <typename RNG>
+        void Initialize(RNG& rng) {
+            if (num_calls_ == 0) {
+                current_operation_ = Operation::kInitialize;
+                feature_transformer_trainer_->Initialize(rng);
+            }
 
-  // number of layers sharing this layer as input
-  std::uint32_t num_referrers_;
+            assert(current_operation_ == Operation::kInitialize);
 
-  // Number of times the current process has been called
-  std::uint32_t num_calls_;
+            if (++num_calls_ == num_referrers_) {
+                num_calls_ = 0;
+                current_operation_ = Operation::kNone;
+            }
+        }
 
-  // current processing type
-  Operation current_operation_;
+        // forward propagation
+        const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+            if (gradients_.size() < kInputDimensions * batch.size()) {
+                gradients_.resize(kInputDimensions * batch.size());
+            }
 
-  // Trainer of input feature converter
-  const std::shared_ptr<Trainer<FeatureTransformer>>
-      feature_transformer_trainer_;
+            batch_size_ = static_cast<IndexType>(batch.size());
 
-  // pointer to output shared for forward propagation
-  const LearnFloatType* output_;
+            if (num_calls_ == 0) {
+                current_operation_ = Operation::kPropagate;
+                output_ = feature_transformer_trainer_->Propagate(batch);
+            }
 
-  // buffer for back propagation
-  std::vector<LearnFloatType> gradients_;
-};
+            assert(current_operation_ == Operation::kPropagate);
 
-// Learning: Input layer
-template <IndexType OutputDimensions, IndexType Offset>
-class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
- private:
-  // Type of layer to learn
-  using LayerType = Layers::InputSlice<OutputDimensions, Offset>;
+            if (++num_calls_ == num_referrers_) {
+                num_calls_ = 0;
+                current_operation_ = Operation::kNone;
+            }
 
- public:
-  // factory function
-  static std::shared_ptr<Trainer> Create(
-      LayerType* /*target_layer*/, FeatureTransformer* ft) {
-    return std::shared_ptr<Trainer>(new Trainer(ft));
-  }
+            return output_;
+        }
 
-  // Set options such as hyperparameters
-  void SendMessage(Message* message) {
-    shared_input_trainer_->SendMessage(message);
-  }
+        // backpropagation
+        void Backpropagate(const LearnFloatType* gradients,
+                           LearnFloatType learning_rate) {
 
-  // Initialize the parameters with random numbers
-  template <typename RNG>
-  void Initialize(RNG& rng) {
-    shared_input_trainer_->Initialize(rng);
-  }
+            if (num_referrers_ == 1) {
+                feature_transformer_trainer_->Backpropagate(gradients, learning_rate);
+                return;
+            }
 
-  // forward propagation
-  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
-    if (output_.size() < kOutputDimensions * batch.size()) {
-      output_.resize(kOutputDimensions * batch.size());
-      gradients_.resize(kInputDimensions * batch.size());
-    }
-    batch_size_ = static_cast<IndexType>(batch.size());
-    const auto input = shared_input_trainer_->Propagate(batch);
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType input_offset = kInputDimensions * b;
-      const IndexType output_offset = kOutputDimensions * b;
+            if (num_calls_ == 0) {
+                current_operation_ = Operation::kBackPropagate;
+                for (IndexType b = 0; b < batch_size_; ++b) {
+                    const IndexType batch_offset = kInputDimensions * b;
+                    for (IndexType i = 0; i < kInputDimensions; ++i) {
+                        gradients_[batch_offset + i] = static_cast<LearnFloatType>(0.0);
+                    }
+                }
+            }
+
+            assert(current_operation_ == Operation::kBackPropagate);
+
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType batch_offset = kInputDimensions * b;
+                for (IndexType i = 0; i < kInputDimensions; ++i) {
+                    gradients_[batch_offset + i] += gradients[batch_offset + i];
+                }
+            }
+
+            if (++num_calls_ == num_referrers_) {
+                feature_transformer_trainer_->Backpropagate(
+                    gradients_.data(), learning_rate);
+                num_calls_ = 0;
+                current_operation_ = Operation::kNone;
+            }
+        }
+
+    private:
+        // constructor
+        SharedInputTrainer(FeatureTransformer* ft) :
+            batch_size_(0),
+            num_referrers_(0),
+            num_calls_(0),
+            current_operation_(Operation::kNone),
+            feature_transformer_trainer_(Trainer<FeatureTransformer>::Create(
+                ft)),
+            output_(nullptr) {
+        }
+
+        // number of input/output dimensions
+        static constexpr IndexType kInputDimensions =
+            FeatureTransformer::kOutputDimensions;
+
+        // type of processing
+        enum class Operation {
+            kNone,
+            kSendMessage,
+            kInitialize,
+            kPropagate,
+            kBackPropagate,
+        };
+
+        // number of samples in mini-batch
+        IndexType batch_size_;
+
+        // number of layers sharing this layer as input
+        std::uint32_t num_referrers_;
+
+        // Number of times the current process has been called
+        std::uint32_t num_calls_;
+
+        // current processing type
+        Operation current_operation_;
+
+        // Trainer of input feature converter
+        const std::shared_ptr<Trainer<FeatureTransformer>>
+            feature_transformer_trainer_;
+
+        // pointer to output shared for forward propagation
+        const LearnFloatType* output_;
+
+        // buffer for back propagation
+        std::vector<LearnFloatType> gradients_;
+    };
+
+    // Learning: Input layer
+    template <IndexType OutputDimensions, IndexType Offset>
+    class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
+    private:
+        // Type of layer to learn
+        using LayerType = Layers::InputSlice<OutputDimensions, Offset>;
+
+    public:
+        // factory function
+        static std::shared_ptr<Trainer> Create(
+            LayerType* /*target_layer*/, FeatureTransformer* ft) {
+
+            return std::shared_ptr<Trainer>(new Trainer(ft));
+        }
+
+        // Set options such as hyperparameters
+        void SendMessage(Message* message) {
+            shared_input_trainer_->SendMessage(message);
+        }
+
+        // Initialize the parameters with random numbers
+        template <typename RNG>
+        void Initialize(RNG& rng) {
+            shared_input_trainer_->Initialize(rng);
+        }
+
+        // forward propagation
+        const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+            if (output_.size() < kOutputDimensions * batch.size()) {
+              output_.resize(kOutputDimensions * batch.size());
+              gradients_.resize(kInputDimensions * batch.size());
+            }
+
+            batch_size_ = static_cast<IndexType>(batch.size());
+
+            const auto input = shared_input_trainer_->Propagate(batch);
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType input_offset = kInputDimensions * b;
+                const IndexType output_offset = kOutputDimensions * b;
 #if defined(USE_BLAS)
-      cblas_scopy(kOutputDimensions, &input[input_offset + Offset], 1,
-                  &output_[output_offset], 1);
+                cblas_scopy(kOutputDimensions, &input[input_offset + Offset], 1,
+                            &output_[output_offset], 1);
 #else
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        output_[output_offset + i] = input[input_offset + Offset + i];
-      }
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    output_[output_offset + i] = input[input_offset + Offset + i];
+                }
 #endif
-    }
-    return output_.data();
-  }
+            }
 
-  // backpropagation
-  void Backpropagate(const LearnFloatType* gradients,
-                     LearnFloatType learning_rate) {
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType input_offset = kInputDimensions * b;
-      const IndexType output_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kInputDimensions; ++i) {
-        if ((int)i < (int)Offset || i >= Offset + kOutputDimensions) {
-          gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
-        } else {
-          gradients_[input_offset + i] = gradients[output_offset + i - Offset];
+            return output_.data();
         }
-      }
-    }
-    shared_input_trainer_->Backpropagate(gradients_.data(), learning_rate);
-  }
 
- private:
-  // constructor
-  Trainer(FeatureTransformer* ft):
-      batch_size_(0),
-      shared_input_trainer_(SharedInputTrainer::Create(ft)) {
-  }
+        // backpropagation
+        void Backpropagate(const LearnFloatType* gradients,
+                           LearnFloatType learning_rate) {
 
-  // number of input/output dimensions
-  static constexpr IndexType kInputDimensions =
-      FeatureTransformer::kOutputDimensions;
-  static constexpr IndexType kOutputDimensions = OutputDimensions;
-  static_assert(Offset + kOutputDimensions <= kInputDimensions, "");
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType input_offset = kInputDimensions * b;
+                const IndexType output_offset = kOutputDimensions * b;
+                for (IndexType i = 0; i < kInputDimensions; ++i) {
+                    if ((int)i < (int)Offset || i >= Offset + kOutputDimensions) {
+                        gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
+                    } else {
+                        gradients_[input_offset + i] = gradients[output_offset + i - Offset];
+                    }
+                }
+            }
+            shared_input_trainer_->Backpropagate(gradients_.data(), learning_rate);
+        }
 
-  // number of samples in mini-batch
-  IndexType batch_size_;
+    private:
+        // constructor
+        Trainer(FeatureTransformer* ft):
+            batch_size_(0),
+            shared_input_trainer_(SharedInputTrainer::Create(ft)) {
+        }
 
-  // Trainer of shared input layer
-  const std::shared_ptr<SharedInputTrainer> shared_input_trainer_;
+        // number of input/output dimensions
+        static constexpr IndexType kInputDimensions =
+            FeatureTransformer::kOutputDimensions;
+        static constexpr IndexType kOutputDimensions = OutputDimensions;
+        static_assert(Offset + kOutputDimensions <= kInputDimensions, "");
 
-  // Forward propagation buffer
-  std::vector<LearnFloatType> output_;
+        // number of samples in mini-batch
+        IndexType batch_size_;
 
-  // buffer for back propagation
-  std::vector<LearnFloatType> gradients_;
-};
+        // Trainer of shared input layer
+        const std::shared_ptr<SharedInputTrainer> shared_input_trainer_;
 
-}  // namespace NNUE
+        // Forward propagation buffer
+        std::vector<LearnFloatType> output_;
 
-}  // namespace Eval
+        // buffer for back propagation
+        std::vector<LearnFloatType> gradients_;
+    };
+
+}  // namespace Eval::NNUE
 
 #endif
diff --git a/src/nnue/trainer/trainer_sum.h b/src/nnue/trainer/trainer_sum.h
index 65a0b681..9904704b 100644
--- a/src/nnue/trainer/trainer_sum.h
+++ b/src/nnue/trainer/trainer_sum.h
@@ -1,186 +1,190 @@
-﻿// Specialization of NNUE evaluation function learning class template for Sum
-
-#ifndef _NNUE_TRAINER_SUM_H_
+﻿#ifndef _NNUE_TRAINER_SUM_H_
 #define _NNUE_TRAINER_SUM_H_
 
 #include "../../learn/learn.h"
 #include "../layers/sum.h"
 #include "trainer.h"
 
-namespace Eval {
+// Specialization of NNUE evaluation function learning class template for Sum
+namespace Eval::NNUE {
 
-namespace NNUE {
+    // Learning: A layer that sums the outputs of multiple layers
+    template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
+    class Trainer<Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>> :
+          Trainer<Layers::Sum<RemainingPreviousLayers...>> {
+    private:
+        // Type of layer to learn
+        using LayerType = Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>;
+        using Tail = Trainer<Layers::Sum<RemainingPreviousLayers...>>;
 
-// Learning: A layer that sums the outputs of multiple layers
-template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
-class Trainer<Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>> :
-      Trainer<Layers::Sum<RemainingPreviousLayers...>> {
- private:
-  // Type of layer to learn
-  using LayerType = Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>;
-  using Tail = Trainer<Layers::Sum<RemainingPreviousLayers...>>;
+    public:
+        // factory function
+        static std::shared_ptr<Trainer> Create(
+            LayerType* target_layer, FeatureTransformer* ft) {
 
- public:
-  // factory function
-  static std::shared_ptr<Trainer> Create(
-      LayerType* target_layer, FeatureTransformer* ft) {
-    return std::shared_ptr<Trainer>(
-        new Trainer(target_layer, ft));
-  }
+            return std::shared_ptr<Trainer>(
+                new Trainer(target_layer, ft));
+        }
 
-  // Set options such as hyperparameters
-  void SendMessage(Message* message) {
-    // The results of other member functions do not depend on the processing order, so
-    // Tail is processed first for the purpose of simplifying the implementation, but
-    // SendMessage processes Head first to make it easier to understand subscript correspondence
-    previous_layer_trainer_->SendMessage(message);
-    Tail::SendMessage(message);
-  }
+        // Set options such as hyperparameters
+        void SendMessage(Message* message) {
+            // The results of other member functions do not depend on the processing order, so
+            // Tail is processed first for the purpose of simplifying the implementation, but
+            // SendMessage processes Head first to make it easier to understand subscript correspondence
+            previous_layer_trainer_->SendMessage(message);
+            Tail::SendMessage(message);
+        }
 
-  // Initialize the parameters with random numbers
-  template <typename RNG>
-  void Initialize(RNG& rng) {
-    Tail::Initialize(rng);
-    previous_layer_trainer_->Initialize(rng);
-  }
+        // Initialize the parameters with random numbers
+        template <typename RNG>
+        void Initialize(RNG& rng) {
+            Tail::Initialize(rng);
+            previous_layer_trainer_->Initialize(rng);
+        }
+
+        // forward propagation
+        /*const*/ LearnFloatType* Propagate(const std::vector<Example>& batch) {
+            batch_size_ = static_cast<IndexType>(batch.size());
+            auto output = Tail::Propagate(batch);
+            const auto head_output = previous_layer_trainer_->Propagate(batch);
 
-  // forward propagation
-  /*const*/ LearnFloatType* Propagate(const std::vector<Example>& batch) {
-    batch_size_ = static_cast<IndexType>(batch.size());
-    auto output = Tail::Propagate(batch);
-    const auto head_output = previous_layer_trainer_->Propagate(batch);
 #if defined(USE_BLAS)
-    cblas_saxpy(kOutputDimensions * batch_size_, 1.0,
-                head_output, 1, output, 1);
+            cblas_saxpy(kOutputDimensions * batch_size_, 1.0,
+                        head_output, 1, output, 1);
 #else
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        output[batch_offset + i] += head_output[batch_offset + i];
-      }
-    }
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    output[batch_offset + i] += head_output[batch_offset + i];
+                }
+            }
+
 #endif
-    return output;
-  }
+            return output;
+        }
 
-  // backpropagation
-  void Backpropagate(const LearnFloatType* gradients,
-                     LearnFloatType learning_rate) {
-    Tail::Backpropagate(gradients, learning_rate);
-    previous_layer_trainer_->Backpropagate(gradients, learning_rate);
-  }
+        // backpropagation
+        void Backpropagate(const LearnFloatType* gradients,
+                           LearnFloatType learning_rate) {
 
- private:
-  // constructor
-  Trainer(LayerType* target_layer, FeatureTransformer* ft):
-      Tail(target_layer, ft),
-      batch_size_(0),
-      previous_layer_trainer_(Trainer<FirstPreviousLayer>::Create(
-          &target_layer->previous_layer_, ft)),
-      target_layer_(target_layer) {
-  }
+            Tail::Backpropagate(gradients, learning_rate);
+            previous_layer_trainer_->Backpropagate(gradients, learning_rate);
+        }
 
-  // number of input/output dimensions
-  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+    private:
+        // constructor
+        Trainer(LayerType* target_layer, FeatureTransformer* ft):
+            Tail(target_layer, ft),
+            batch_size_(0),
+            previous_layer_trainer_(Trainer<FirstPreviousLayer>::Create(
+                &target_layer->previous_layer_, ft)),
+            target_layer_(target_layer) {
+        }
 
-  // make subclass friend
-  template <typename SumLayer>
-  friend class Trainer;
+        // number of input/output dimensions
+        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
 
-  // number of samples in mini-batch
-  IndexType batch_size_;
+        // make subclass friend
+        template <typename SumLayer>
+        friend class Trainer;
 
-  // Trainer of the previous layer
-  const std::shared_ptr<Trainer<FirstPreviousLayer>> previous_layer_trainer_;
+        // number of samples in mini-batch
+        IndexType batch_size_;
 
-  // layer to learn
-  LayerType* const target_layer_;
-};
+        // Trainer of the previous layer
+        const std::shared_ptr<Trainer<FirstPreviousLayer>> previous_layer_trainer_;
+
+        // layer to learn
+        LayerType* const target_layer_;
+    };
 
 
-// Learning: Layer that takes the sum of the outputs of multiple layers (when there is one template argument)
-template <typename PreviousLayer>
-class Trainer<Layers::Sum<PreviousLayer>> {
- private:
-  // Type of layer to learn
-  using LayerType = Layers::Sum<PreviousLayer>;
+    // Learning: Layer that takes the sum of the outputs of multiple layers (when there is one template argument)
+    template <typename PreviousLayer>
+    class Trainer<Layers::Sum<PreviousLayer>> {
+    private:
+        // Type of layer to learn
+        using LayerType = Layers::Sum<PreviousLayer>;
 
- public:
-  // factory function
-  static std::shared_ptr<Trainer> Create(
-      LayerType* target_layer, FeatureTransformer* ft) {
-    return std::shared_ptr<Trainer>(
-        new Trainer(target_layer, ft));
-  }
+    public:
+        // factory function
+        static std::shared_ptr<Trainer> Create(
+            LayerType* target_layer, FeatureTransformer* ft) {
 
-  // Set options such as hyperparameters
-  void SendMessage(Message* message) {
-    previous_layer_trainer_->SendMessage(message);
-  }
+            return std::shared_ptr<Trainer>(
+                new Trainer(target_layer, ft));
+        }
 
-  // Initialize the parameters with random numbers
-  template <typename RNG>
-  void Initialize(RNG& rng) {
-    previous_layer_trainer_->Initialize(rng);
-  }
+        // Set options such as hyperparameters
+        void SendMessage(Message* message) {
+            previous_layer_trainer_->SendMessage(message);
+        }
+
+        // Initialize the parameters with random numbers
+        template <typename RNG>
+        void Initialize(RNG& rng) {
+            previous_layer_trainer_->Initialize(rng);
+        }
+
+        // forward propagation
+        /*const*/ LearnFloatType* Propagate(const std::vector<Example>& batch) {
+            if (output_.size() < kOutputDimensions * batch.size()) {
+                output_.resize(kOutputDimensions * batch.size());
+            }
+
+            batch_size_ = static_cast<IndexType>(batch.size());
+            const auto output = previous_layer_trainer_->Propagate(batch);
 
-  // forward propagation
-  /*const*/ LearnFloatType* Propagate(const std::vector<Example>& batch) {
-    if (output_.size() < kOutputDimensions * batch.size()) {
-      output_.resize(kOutputDimensions * batch.size());
-    }
-    batch_size_ = static_cast<IndexType>(batch.size());
-    const auto output = previous_layer_trainer_->Propagate(batch);
 #if defined(USE_BLAS)
-    cblas_scopy(kOutputDimensions * batch_size_, output, 1, &output_[0], 1);
+            cblas_scopy(kOutputDimensions * batch_size_, output, 1, &output_[0], 1);
 #else
-    for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        output_[batch_offset + i] = output[batch_offset + i];
-      }
-    }
-#endif
-    return output_.data();
-  }
-
-  // backpropagation
-  void Backpropagate(const LearnFloatType* gradients,
-                     LearnFloatType learning_rate) {
-    previous_layer_trainer_->Backpropagate(gradients, learning_rate);
-  }
-
- private:
-  // constructor
-  Trainer(LayerType* target_layer, FeatureTransformer* ft) :
-      batch_size_(0),
-      previous_layer_trainer_(Trainer<PreviousLayer>::Create(
-          &target_layer->previous_layer_, ft)),
-      target_layer_(target_layer) {
-  }
-
-  // number of input/output dimensions
-  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
-
-  // make subclass friend
-  template <typename SumLayer>
-  friend class Trainer;
-
-  // number of samples in mini-batch
-  IndexType batch_size_;
-
-  // Trainer of the previous layer
-  const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
-
-  // layer to learn
-  LayerType* const target_layer_;
-
-  // Forward propagation buffer
-  std::vector<LearnFloatType> output_;
-};
-
-}  // namespace NNUE
-
-}  // namespace Eval
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    output_[batch_offset + i] = output[batch_offset + i];
+                }
+            }
+
+#endif
+            return output_.data();
+        }
+
+        // backpropagation
+        void Backpropagate(const LearnFloatType* gradients,
+                           LearnFloatType learning_rate) {
+
+            previous_layer_trainer_->Backpropagate(gradients, learning_rate);
+        }
+
+    private:
+        // constructor
+        Trainer(LayerType* target_layer, FeatureTransformer* ft) :
+            batch_size_(0),
+            previous_layer_trainer_(Trainer<PreviousLayer>::Create(
+                &target_layer->previous_layer_, ft)),
+            target_layer_(target_layer) {
+        }
+
+        // number of input/output dimensions
+        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+
+        // make subclass friend
+        template <typename SumLayer>
+        friend class Trainer;
+
+        // number of samples in mini-batch
+        IndexType batch_size_;
+
+        // Trainer of the previous layer
+        const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+
+        // layer to learn
+        LayerType* const target_layer_;
+
+        // Forward propagation buffer
+        std::vector<LearnFloatType> output_;
+    };
+
+}  // namespace Eval::NNUE
 
 #endif

From 497f689aa360dafc7e4b5d4b702b09c524cb84b2 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 21:45:38 +0200
Subject: [PATCH 341/583] Cleanup nnue

---
 src/nnue/evaluate_nnue.cpp          |  14 +-
 src/nnue/evaluate_nnue.h            | 139 ++---
 src/nnue/evaluate_nnue_learner.cpp  | 354 +++++++------
 src/nnue/evaluate_nnue_learner.h    |  41 +-
 src/nnue/nnue_accumulator.h         |  38 +-
 src/nnue/nnue_architecture.h        |  37 +-
 src/nnue/nnue_common.h              | 152 +++---
 src/nnue/nnue_feature_transformer.h | 791 ++++++++++++++--------------
 src/nnue/nnue_test_command.cpp      | 390 +++++++-------
 src/nnue/nnue_test_command.h        |  17 +-
 10 files changed, 1003 insertions(+), 970 deletions(-)

diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index 4d8a4b66..0d504468 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -214,13 +214,13 @@ namespace Eval::NNUE {
 
     std::string eval_file = std::string(Options["EvalFile"]);
 
-    #if defined(DEFAULT_NNUE_DIRECTORY)
-    #define stringify2(x) #x
-    #define stringify(x) stringify2(x)
+#if defined(DEFAULT_NNUE_DIRECTORY)
+#define stringify2(x) #x
+#define stringify(x) stringify2(x)
     std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory , stringify(DEFAULT_NNUE_DIRECTORY) };
-    #else
+#else
     std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory };
-    #endif
+#endif
 
     for (std::string directory : dirs)
         if (eval_file_loaded != eval_file)
@@ -238,8 +238,8 @@ namespace Eval::NNUE {
             }
         }
 
-    #undef stringify2
-    #undef stringify
+#undef stringify2
+#undef stringify
   }
 
   /// NNUE::verify() verifies that the last net used was loaded successfully
diff --git a/src/nnue/evaluate_nnue.h b/src/nnue/evaluate_nnue.h
index 5335713b..e6ddc7fd 100644
--- a/src/nnue/evaluate_nnue.h
+++ b/src/nnue/evaluate_nnue.h
@@ -1,23 +1,21 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
-// header used in NNUE evaluation function
-
 #ifndef NNUE_EVALUATE_NNUE_H_INCLUDED
 #define NNUE_EVALUATE_NNUE_H_INCLUDED
 
@@ -25,79 +23,82 @@
 
 #include <memory>
 
+// header used in NNUE evaluation function
 namespace Eval::NNUE {
 
-  enum struct UseNNUEMode
-  {
-    False,
-    True,
-    Pure
-  };
+    enum struct UseNNUEMode
+    {
+        False,
+        True,
+        Pure
+    };
 
-  // Hash value of evaluation function structure
-  constexpr std::uint32_t kHashValue =
-      FeatureTransformer::GetHashValue() ^ Network::GetHashValue();
+    // Hash value of evaluation function structure
+    constexpr std::uint32_t kHashValue =
+        FeatureTransformer::GetHashValue() ^ Network::GetHashValue();
 
-  // Deleter for automating release of memory area
-  template <typename T>
-  struct AlignedDeleter {
-    void operator()(T* ptr) const {
-      ptr->~T();
-      std_aligned_free(ptr);
-    }
-  };
+    // Deleter for automating release of memory area
+    template <typename T>
+    struct AlignedDeleter {
+        void operator()(T* ptr) const {
+            ptr->~T();
+            std_aligned_free(ptr);
+        }
+    };
 
-  template <typename T>
-  struct LargePageDeleter {
-    void operator()(T* ptr) const {
-      ptr->~T();
-      aligned_large_pages_free(ptr);
-    }
-  };
+    template <typename T>
+    struct LargePageDeleter {
+        void operator()(T* ptr) const {
+            ptr->~T();
+            aligned_large_pages_free(ptr);
+        }
+    };
 
-  template <typename T>
-  using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>;
+    template <typename T>
+    using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>;
 
-  template <typename T>
-  using LargePagePtr = std::unique_ptr<T, LargePageDeleter<T>>;
+    template <typename T>
+    using LargePagePtr = std::unique_ptr<T, LargePageDeleter<T>>;
 
-  // Input feature converter
-  extern LargePagePtr<FeatureTransformer> feature_transformer;
+    // Input feature converter
+    extern LargePagePtr<FeatureTransformer> feature_transformer;
 
-  // Evaluation function
-  extern AlignedPtr<Network> network;
+    // Evaluation function
+    extern AlignedPtr<Network> network;
 
-  // Evaluation function file name
-  extern std::string fileName;
+    // Evaluation function file name
+    extern std::string fileName;
 
-  // Saved evaluation function file name
-  extern std::string savedfileName;
+    // Saved evaluation function file name
+    extern std::string savedfileName;
 
-  extern UseNNUEMode useNNUE;
-  extern std::string eval_file_loaded;
+    extern UseNNUEMode useNNUE;
 
-  // Get a string that represents the structure of the evaluation function
-  std::string GetArchitectureString();
+    extern std::string eval_file_loaded;
 
-  // read the header
-  bool ReadHeader(std::istream& stream,
-    std::uint32_t* hash_value, std::string* architecture);
+    // Get a string that represents the structure of the evaluation function
+    std::string GetArchitectureString();
 
-  // write the header
-  bool WriteHeader(std::ostream& stream,
-    std::uint32_t hash_value, const std::string& architecture);
+    // read the header
+    bool ReadHeader(std::istream& stream,
+        std::uint32_t* hash_value, std::string* architecture);
 
-  // read evaluation function parameters
-  bool ReadParameters(std::istream& stream);
+    // write the header
+    bool WriteHeader(std::ostream& stream,
+        std::uint32_t hash_value, const std::string& architecture);
 
-  // write evaluation function parameters
-  bool WriteParameters(std::ostream& stream);
+    // read evaluation function parameters
+    bool ReadParameters(std::istream& stream);
 
-  Value evaluate(const Position& pos);
-  bool load_eval(std::string name, std::istream& stream);
-  void init();
-  void verify_eval_file_loaded();
-  void verify_any_net_loaded();
+    // write evaluation function parameters
+    bool WriteParameters(std::ostream& stream);
+
+    Value evaluate(const Position& pos);
+    bool load_eval(std::string name, std::istream& stream);
+    void init();
+
+    void verify_eval_file_loaded();
+    void verify_any_net_loaded();
 
 }  // namespace Eval::NNUE
 
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 2d6c6db3..92ecd8d2 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -1,18 +1,10 @@
-﻿// Code for learning NNUE evaluation function
-
-#include <random>
+﻿#include <random>
 #include <fstream>
 #include <filesystem>
 
-#include "../learn/learn.h"
-
-#include "../position.h"
-#include "../uci.h"
-#include "../misc.h"
-#include "../thread_win32_osx.h"
-
 #include "evaluate_nnue.h"
 #include "evaluate_nnue_learner.h"
+
 #include "trainer/features/factorizer_feature_set.h"
 #include "trainer/features/factorizer_half_kp.h"
 #include "trainer/trainer_feature_transformer.h"
@@ -21,191 +13,207 @@
 #include "trainer/trainer_clipped_relu.h"
 #include "trainer/trainer_sum.h"
 
+#include "position.h"
+#include "uci.h"
+#include "misc.h"
+#include "thread_win32_osx.h"
+
+#include "learn/learn.h"
+
 // Learning rate scale
 double global_learning_rate;
 
+// Code for learning NNUE evaluation function
 namespace Eval::NNUE {
 
-  namespace {
+    namespace {
 
-    // learning data
-    std::vector<Example> examples;
+        // learning data
+        std::vector<Example> examples;
 
-    // Mutex for exclusive control of examples
-    std::mutex examples_mutex;
+        // Mutex for exclusive control of examples
+        std::mutex examples_mutex;
 
-    // number of samples in mini-batch
-    uint64_t batch_size;
+        // number of samples in mini-batch
+        uint64_t batch_size;
 
-    // random number generator
-    std::mt19937 rng;
+        // random number generator
+        std::mt19937 rng;
 
-    // learner
-    std::shared_ptr<Trainer<Network>> trainer;
+        // learner
+        std::shared_ptr<Trainer<Network>> trainer;
 
-    // Tell the learner options such as hyperparameters
-    void SendMessages(std::vector<Message> messages) {
-      for (auto& message : messages) {
-        trainer->SendMessage(&message);
-        assert(message.num_receivers > 0);
-      }
-    }
-
-  }  // namespace
-
-  // Initialize learning
-  void InitializeTraining(const std::string& seed) {
-    std::cout << "Initializing NN training for "
-              << GetArchitectureString() << std::endl;
-
-    assert(feature_transformer);
-    assert(network);
-    trainer = Trainer<Network>::Create(network.get(), feature_transformer.get());
-    rng.seed(PRNG(seed).rand<uint64_t>());
-
-    if (Options["SkipLoadingEval"]) {
-      trainer->Initialize(rng);
-    }
-  }
-
-  // set the number of samples in the mini-batch
-  void SetBatchSize(uint64_t size) {
-    assert(size > 0);
-    batch_size = size;
-  }
-  
-  // Set options such as hyperparameters
-  void SetOptions(const std::string& options) {
-    std::vector<Message> messages;
-    for (const auto& option : Split(options, ',')) {
-      const auto fields = Split(option, '=');
-      assert(fields.size() == 1 || fields.size() == 2);
-      if (fields.size() == 1) {
-        messages.emplace_back(fields[0]);
-      } else {
-        messages.emplace_back(fields[0], fields[1]);
-      }
-    }
-    SendMessages(std::move(messages));
-  }
-
-  // Reread the evaluation function parameters for learning from the file
-  void RestoreParameters(const std::string& dir_name) {
-    const std::string file_name = Path::Combine(dir_name, NNUE::savedfileName);
-    std::ifstream stream(file_name, std::ios::binary);
-#ifndef NDEBUG
-    bool result =
-#endif
-    ReadParameters(stream);
-#ifndef NDEBUG
-    assert(result);
-#endif
-
-    SendMessages({{"reset"}});
-  }
-
-  void FinalizeNet() {
-    SendMessages({{"clear_unobserved_feature_weights"}});
-  }
-
-  // Add 1 sample of learning data
-  void AddExample(Position& pos, Color rootColor,
-                  const Learner::PackedSfenValue& psv, double weight) {
-    Example example;
-    if (rootColor == pos.side_to_move()) {
-      example.sign = 1;
-    } else {
-      example.sign = -1;
-    }
-    example.psv = psv;
-    example.weight = weight;
-
-    Features::IndexList active_indices[2];
-    for (const auto trigger : kRefreshTriggers) {
-      RawFeatures::AppendActiveIndices(pos, trigger, active_indices);
-    }
-    if (pos.side_to_move() != WHITE) {
-      active_indices[0].swap(active_indices[1]);
-    }
-    for (const auto color : Colors) {
-      std::vector<TrainingFeature> training_features;
-      for (const auto base_index : active_indices[color]) {
-        static_assert(Features::Factorizer<RawFeatures>::GetDimensions() <
-                      (1 << TrainingFeature::kIndexBits), "");
-        Features::Factorizer<RawFeatures>::AppendTrainingFeatures(
-            base_index, &training_features);
-      }
-      std::sort(training_features.begin(), training_features.end());
-
-      auto& unique_features = example.training_features[color];
-      for (const auto& feature : training_features) {
-        if (!unique_features.empty() &&
-            feature.GetIndex() == unique_features.back().GetIndex()) {
-          unique_features.back() += feature;
-        } else {
-          unique_features.push_back(feature);
+        // Tell the learner options such as hyperparameters
+        void SendMessages(std::vector<Message> messages) {
+            for (auto& message : messages) {
+                trainer->SendMessage(&message);
+                assert(message.num_receivers > 0);
+            }
+        }
+
+    }  // namespace
+
+    // Initialize learning
+    void InitializeTraining(const std::string& seed) {
+        std::cout << "Initializing NN training for "
+                  << GetArchitectureString() << std::endl;
+
+        assert(feature_transformer);
+        assert(network);
+        trainer = Trainer<Network>::Create(network.get(), feature_transformer.get());
+        rng.seed(PRNG(seed).rand<uint64_t>());
+
+        if (Options["SkipLoadingEval"]) {
+            trainer->Initialize(rng);
         }
-      }
     }
 
-    std::lock_guard<std::mutex> lock(examples_mutex);
-    examples.push_back(std::move(example));
-  }
-
-  // update the evaluation function parameters
-  void UpdateParameters() {
-    assert(batch_size > 0);
-
-    const auto learning_rate = static_cast<LearnFloatType>(
-        global_learning_rate / batch_size);
-
-    std::lock_guard<std::mutex> lock(examples_mutex);
-    std::shuffle(examples.begin(), examples.end(), rng);
-    while (examples.size() >= batch_size) {
-      std::vector<Example> batch(examples.end() - batch_size, examples.end());
-      examples.resize(examples.size() - batch_size);
-
-      const auto network_output = trainer->Propagate(batch);
-
-      std::vector<LearnFloatType> gradients(batch.size());
-      for (std::size_t b = 0; b < batch.size(); ++b) {
-        const auto shallow = static_cast<Value>(Round<std::int32_t>(
-            batch[b].sign * network_output[b] * kPonanzaConstant));
-        const auto& psv = batch[b].psv;
-        const double gradient = batch[b].sign * Learner::calc_grad(shallow, psv);
-        gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
-      }
-
-      trainer->Backpropagate(gradients.data(), learning_rate);
+    // set the number of samples in the mini-batch
+    void SetBatchSize(uint64_t size) {
+        assert(size > 0);
+        batch_size = size;
     }
-    SendMessages({{"quantize_parameters"}});
-  }
 
-  // Check if there are any problems with learning
-  void CheckHealth() {
-    SendMessages({{"check_health"}});
-  }
+    // Set options such as hyperparameters
+    void SetOptions(const std::string& options) {
+        std::vector<Message> messages;
+        for (const auto& option : Split(options, ',')) {
+          const auto fields = Split(option, '=');
+          assert(fields.size() == 1 || fields.size() == 2);
 
-  // save merit function parameters to a file
-  void save_eval(std::string dir_name) {
-    auto eval_dir = Path::Combine(Options["EvalSaveDir"], dir_name);
-    std::cout << "save_eval() start. folder = " << eval_dir << std::endl;
+          if (fields.size() == 1) {
+              messages.emplace_back(fields[0]);
+          } else {
+              messages.emplace_back(fields[0], fields[1]);
+          }
+        }
 
-    // mkdir() will fail if this folder already exists, but
-    // Apart from that. If not, I just want you to make it.
-    // Also, assume that the folders up to EvalSaveDir have been dug.
-    std::filesystem::create_directories(eval_dir);
+        SendMessages(std::move(messages));
+    }
 
-    const std::string file_name = Path::Combine(eval_dir, NNUE::savedfileName);
-    std::ofstream stream(file_name, std::ios::binary);
+    // Reread the evaluation function parameters for learning from the file
+    void RestoreParameters(const std::string& dir_name) {
+        const std::string file_name = Path::Combine(dir_name, NNUE::savedfileName);
+        std::ifstream stream(file_name, std::ios::binary);
 #ifndef NDEBUG
-    bool result =
+        bool result =
 #endif
-    WriteParameters(stream);
+        ReadParameters(stream);
 #ifndef NDEBUG
-    assert(result);
+        assert(result);
 #endif
 
-    std::cout << "save_eval() finished. folder = " << eval_dir << std::endl;
-  }
+        SendMessages({{"reset"}});
+    }
+
+    void FinalizeNet() {
+        SendMessages({{"clear_unobserved_feature_weights"}});
+    }
+
+    // Add 1 sample of learning data
+    void AddExample(Position& pos, Color rootColor,
+                    const Learner::PackedSfenValue& psv, double weight) {
+
+        Example example;
+        if (rootColor == pos.side_to_move()) {
+            example.sign = 1;
+        } else {
+            example.sign = -1;
+        }
+
+        example.psv = psv;
+        example.weight = weight;
+
+        Features::IndexList active_indices[2];
+        for (const auto trigger : kRefreshTriggers) {
+            RawFeatures::AppendActiveIndices(pos, trigger, active_indices);
+        }
+
+        if (pos.side_to_move() != WHITE) {
+            active_indices[0].swap(active_indices[1]);
+        }
+
+        for (const auto color : Colors) {
+            std::vector<TrainingFeature> training_features;
+            for (const auto base_index : active_indices[color]) {
+                static_assert(Features::Factorizer<RawFeatures>::GetDimensions() <
+                              (1 << TrainingFeature::kIndexBits), "");
+                Features::Factorizer<RawFeatures>::AppendTrainingFeatures(
+                    base_index, &training_features);
+            }
+
+            std::sort(training_features.begin(), training_features.end());
+
+            auto& unique_features = example.training_features[color];
+            for (const auto& feature : training_features) {
+                if (!unique_features.empty() &&
+                    feature.GetIndex() == unique_features.back().GetIndex()) {
+
+                    unique_features.back() += feature;
+                } else {
+                    unique_features.push_back(feature);
+                }
+            }
+        }
+
+        std::lock_guard<std::mutex> lock(examples_mutex);
+        examples.push_back(std::move(example));
+    }
+
+    // update the evaluation function parameters
+    void UpdateParameters() {
+        assert(batch_size > 0);
+
+        const auto learning_rate = static_cast<LearnFloatType>(
+            global_learning_rate / batch_size);
+
+        std::lock_guard<std::mutex> lock(examples_mutex);
+        std::shuffle(examples.begin(), examples.end(), rng);
+        while (examples.size() >= batch_size) {
+            std::vector<Example> batch(examples.end() - batch_size, examples.end());
+            examples.resize(examples.size() - batch_size);
+
+            const auto network_output = trainer->Propagate(batch);
+
+            std::vector<LearnFloatType> gradients(batch.size());
+            for (std::size_t b = 0; b < batch.size(); ++b) {
+                const auto shallow = static_cast<Value>(Round<std::int32_t>(
+                    batch[b].sign * network_output[b] * kPonanzaConstant));
+                const auto& psv = batch[b].psv;
+                const double gradient = batch[b].sign * Learner::calc_grad(shallow, psv);
+                gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
+            }
+
+            trainer->Backpropagate(gradients.data(), learning_rate);
+        }
+        SendMessages({{"quantize_parameters"}});
+    }
+
+    // Check if there are any problems with learning
+    void CheckHealth() {
+        SendMessages({{"check_health"}});
+    }
+
+    // save merit function parameters to a file
+    void save_eval(std::string dir_name) {
+        auto eval_dir = Path::Combine(Options["EvalSaveDir"], dir_name);
+        std::cout << "save_eval() start. folder = " << eval_dir << std::endl;
+
+        // mkdir() will fail if this folder already exists, but
+        // Apart from that. If not, I just want you to make it.
+        // Also, assume that the folders up to EvalSaveDir have been dug.
+        std::filesystem::create_directories(eval_dir);
+
+        const std::string file_name = Path::Combine(eval_dir, NNUE::savedfileName);
+        std::ofstream stream(file_name, std::ios::binary);
+#ifndef NDEBUG
+        bool result =
+#endif
+        WriteParameters(stream);
+#ifndef NDEBUG
+        assert(result);
+#endif
+
+        std::cout << "save_eval() finished. folder = " << eval_dir << std::endl;
+    }
 }  // namespace Eval::NNUE
\ No newline at end of file
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index c41d8d6b..525b286a 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -1,37 +1,36 @@
-﻿// Interface used for learning NNUE evaluation function
-
-#ifndef _EVALUATE_NNUE_LEARNER_H_
+﻿#ifndef _EVALUATE_NNUE_LEARNER_H_
 #define _EVALUATE_NNUE_LEARNER_H_
 
-#include "../learn/learn.h"
+#include "learn/learn.h"
 
+// Interface used for learning NNUE evaluation function
 namespace Eval::NNUE {
 
-  // Initialize learning
-  void InitializeTraining(const std::string& seed);
+    // Initialize learning
+    void InitializeTraining(const std::string& seed);
 
-  // set the number of samples in the mini-batch
-  void SetBatchSize(uint64_t size);
+    // set the number of samples in the mini-batch
+    void SetBatchSize(uint64_t size);
 
-  // Set options such as hyperparameters
-  void SetOptions(const std::string& options);
+    // Set options such as hyperparameters
+    void SetOptions(const std::string& options);
 
-  // Reread the evaluation function parameters for learning from the file
-  void RestoreParameters(const std::string& dir_name);
+    // Reread the evaluation function parameters for learning from the file
+    void RestoreParameters(const std::string& dir_name);
 
-// Add 1 sample of learning data
-  void AddExample(Position& pos, Color rootColor,
-  	const Learner::PackedSfenValue& psv, double weight);
+    // Add 1 sample of learning data
+    void AddExample(Position& pos, Color rootColor,
+    	 const Learner::PackedSfenValue& psv, double weight);
 
-  // update the evaluation function parameters
-  void UpdateParameters();
+    // update the evaluation function parameters
+    void UpdateParameters();
 
-  // Check if there are any problems with learning
-  void CheckHealth();
+    // Check if there are any problems with learning
+    void CheckHealth();
 
-  void FinalizeNet();
+    void FinalizeNet();
 
-  void save_eval(std::string suffix);
+    void save_eval(std::string suffix);
 }  // namespace Eval::NNUE
 
 #endif
diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h
index 26370710..8b60dafc 100644
--- a/src/nnue/nnue_accumulator.h
+++ b/src/nnue/nnue_accumulator.h
@@ -1,36 +1,34 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
-// Class for difference calculation of NNUE evaluation function
-
 #ifndef NNUE_ACCUMULATOR_H_INCLUDED
 #define NNUE_ACCUMULATOR_H_INCLUDED
 
 #include "nnue_architecture.h"
 
+// Class for difference calculation of NNUE evaluation function
 namespace Eval::NNUE {
 
-  // Class that holds the result of affine transformation of input features
-  struct alignas(kCacheLineSize) Accumulator {
-    std::int16_t
-        accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];
-    bool computed_accumulation;
-  };
+    // Class that holds the result of affine transformation of input features
+    struct alignas(kCacheLineSize) Accumulator {
+        std::int16_t accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];
+        bool computed_accumulation;
+    };
 
 }  // namespace Eval::NNUE
 
diff --git a/src/nnue/nnue_architecture.h b/src/nnue/nnue_architecture.h
index 91cdc4bd..2ecb6999 100644
--- a/src/nnue/nnue_architecture.h
+++ b/src/nnue/nnue_architecture.h
@@ -1,37 +1,36 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
-// Input features and network structure used in NNUE evaluation function
-
 #ifndef NNUE_ARCHITECTURE_H_INCLUDED
 #define NNUE_ARCHITECTURE_H_INCLUDED
 
 // Defines the network structure
 #include "architectures/halfkp_256x2-32-32.h"
 
+// Input features and network structure used in NNUE evaluation function
 namespace Eval::NNUE {
 
-  static_assert(kTransformedFeatureDimensions % kMaxSimdWidth == 0, "");
-  static_assert(Network::kOutputDimensions == 1, "");
-  static_assert(std::is_same<Network::OutputType, std::int32_t>::value, "");
+    static_assert(kTransformedFeatureDimensions % kMaxSimdWidth == 0, "");
+    static_assert(Network::kOutputDimensions == 1, "");
+    static_assert(std::is_same<Network::OutputType, std::int32_t>::value, "");
 
-  // Trigger for full calculation instead of difference calculation
-  constexpr auto kRefreshTriggers = RawFeatures::kRefreshTriggers;
+    // Trigger for full calculation instead of difference calculation
+    constexpr auto kRefreshTriggers = RawFeatures::kRefreshTriggers;
 
 }  // namespace Eval::NNUE
 
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index 9975134c..70c7596d 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 // Constants used in NNUE evaluation function
@@ -21,11 +21,11 @@
 #ifndef NNUE_COMMON_H_INCLUDED
 #define NNUE_COMMON_H_INCLUDED
 
+#include "types.h"
+
 #include <cstring>
 #include <iostream>
 
-#include "../types.h"
-
 #if defined(USE_AVX2)
 #include <immintrin.h>
 
@@ -70,84 +70,84 @@
 
 namespace Eval::NNUE {
 
-  // Version of the evaluation file
-  constexpr std::uint32_t kVersion = 0x7AF32F17u;
+    // Version of the evaluation file
+    constexpr std::uint32_t kVersion = 0x7AF32F17u;
 
-  // Constant used in evaluation value calculation
-  constexpr int FV_SCALE = 16;
-  constexpr int kWeightScaleBits = 6;
+    // Constant used in evaluation value calculation
+    constexpr int FV_SCALE = 16;
+    constexpr int kWeightScaleBits = 6;
 
-  // Size of cache line (in bytes)
-  constexpr std::size_t kCacheLineSize = 64;
+    // Size of cache line (in bytes)
+    constexpr std::size_t kCacheLineSize = 64;
 
-  // SIMD width (in bytes)
-  #if defined(USE_AVX2)
-  constexpr std::size_t kSimdWidth = 32;
+    // SIMD width (in bytes)
+#if defined(USE_AVX2)
+    constexpr std::size_t kSimdWidth = 32;
 
-  #elif defined(USE_SSE2)
-  constexpr std::size_t kSimdWidth = 16;
+#elif defined(USE_SSE2)
+    constexpr std::size_t kSimdWidth = 16;
 
-  #elif defined(USE_MMX)
-  constexpr std::size_t kSimdWidth = 8;
+#elif defined(USE_MMX)
+    constexpr std::size_t kSimdWidth = 8;
 
-  #elif defined(USE_NEON)
-  constexpr std::size_t kSimdWidth = 16;
-  #endif
+#elif defined(USE_NEON)
+    constexpr std::size_t kSimdWidth = 16;
+#endif
 
-  constexpr std::size_t kMaxSimdWidth = 32;
+    constexpr std::size_t kMaxSimdWidth = 32;
 
-  // unique number for each piece type on each square
-  enum {
-    PS_NONE     =  0,
-    PS_W_PAWN   =  1,
-    PS_B_PAWN   =  1 * SQUARE_NB + 1,
-    PS_W_KNIGHT =  2 * SQUARE_NB + 1,
-    PS_B_KNIGHT =  3 * SQUARE_NB + 1,
-    PS_W_BISHOP =  4 * SQUARE_NB + 1,
-    PS_B_BISHOP =  5 * SQUARE_NB + 1,
-    PS_W_ROOK   =  6 * SQUARE_NB + 1,
-    PS_B_ROOK   =  7 * SQUARE_NB + 1,
-    PS_W_QUEEN  =  8 * SQUARE_NB + 1,
-    PS_B_QUEEN  =  9 * SQUARE_NB + 1,
-    PS_W_KING   = 10 * SQUARE_NB + 1,
-    PS_END      = PS_W_KING, // pieces without kings (pawns included)
-    PS_B_KING   = 11 * SQUARE_NB + 1,
-    PS_END2     = 12 * SQUARE_NB + 1
-  };
+    // unique number for each piece type on each square
+    enum {
+        PS_NONE     =  0,
+        PS_W_PAWN   =  1,
+        PS_B_PAWN   =  1 * SQUARE_NB + 1,
+        PS_W_KNIGHT =  2 * SQUARE_NB + 1,
+        PS_B_KNIGHT =  3 * SQUARE_NB + 1,
+        PS_W_BISHOP =  4 * SQUARE_NB + 1,
+        PS_B_BISHOP =  5 * SQUARE_NB + 1,
+        PS_W_ROOK   =  6 * SQUARE_NB + 1,
+        PS_B_ROOK   =  7 * SQUARE_NB + 1,
+        PS_W_QUEEN  =  8 * SQUARE_NB + 1,
+        PS_B_QUEEN  =  9 * SQUARE_NB + 1,
+        PS_W_KING   = 10 * SQUARE_NB + 1,
+        PS_END      = PS_W_KING, // pieces without kings (pawns included)
+        PS_B_KING   = 11 * SQUARE_NB + 1,
+        PS_END2     = 12 * SQUARE_NB + 1
+    };
 
-  extern const uint32_t kpp_board_index[PIECE_NB][COLOR_NB];
+    extern const uint32_t kpp_board_index[PIECE_NB][COLOR_NB];
 
-  // Type of input feature after conversion
-  using TransformedFeatureType = std::uint8_t;
-  using IndexType = std::uint32_t;
+    // Type of input feature after conversion
+    using TransformedFeatureType = std::uint8_t;
+    using IndexType = std::uint32_t;
 
-  // Forward declaration of learning class template
-  template <typename Layer>
-  class Trainer;
+    // Forward declaration of learning class template
+    template <typename Layer>
+    class Trainer;
 
-  // Round n up to be a multiple of base
-  template <typename IntType>
-  constexpr IntType CeilToMultiple(IntType n, IntType base) {
-      return (n + base - 1) / base * base;
-  }
+    // Round n up to be a multiple of base
+    template <typename IntType>
+    constexpr IntType CeilToMultiple(IntType n, IntType base) {
+        return (n + base - 1) / base * base;
+    }
 
-  // read_little_endian() is our utility to read an integer (signed or unsigned, any size)
-  // from a stream in little-endian order. We swap the byte order after the read if
-  // necessary to return a result with the byte ordering of the compiling machine.
-  template <typename IntType>
-  inline IntType read_little_endian(std::istream& stream) {
+    // read_little_endian() is our utility to read an integer (signed or unsigned, any size)
+    // from a stream in little-endian order. We swap the byte order after the read if
+    // necessary to return a result with the byte ordering of the compiling machine.
+    template <typename IntType>
+    inline IntType read_little_endian(std::istream& stream) {
 
-      IntType result;
-      std::uint8_t u[sizeof(IntType)];
-      typename std::make_unsigned<IntType>::type v = 0;
+        IntType result;
+        std::uint8_t u[sizeof(IntType)];
+        typename std::make_unsigned<IntType>::type v = 0;
 
-      stream.read(reinterpret_cast<char*>(u), sizeof(IntType));
-      for (std::size_t i = 0; i < sizeof(IntType); ++i)
-          v = (v << 8) | u[sizeof(IntType) - i - 1];
+        stream.read(reinterpret_cast<char*>(u), sizeof(IntType));
+        for (std::size_t i = 0; i < sizeof(IntType); ++i)
+            v = (v << 8) | u[sizeof(IntType) - i - 1];
 
-      std::memcpy(&result, &v, sizeof(IntType));
-      return result;
-  }
+        std::memcpy(&result, &v, sizeof(IntType));
+        return result;
+    }
 
 }  // namespace Eval::NNUE
 
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index c9d8e0d2..2fc24dab 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -1,19 +1,19 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 // A class that converts the input features of the NNUE evaluation function
@@ -23,435 +23,450 @@
 
 #include "nnue_common.h"
 #include "nnue_architecture.h"
+
 #include "features/index_list.h"
 
-#include <cstring> // std::memset()
+#include <cstring>
+#include <string>
 
 namespace Eval::NNUE {
 
-  // If vector instructions are enabled, we update and refresh the
-  // accumulator tile by tile such that each tile fits in the CPU's
-  // vector registers.
-  #define TILING
+    // If vector instructions are enabled, we update and refresh the
+    // accumulator tile by tile such that each tile fits in the CPU's
+    // vector registers.
+#define TILING
 
-  #ifdef USE_AVX512
-  typedef __m512i vec_t;
-  #define vec_load(a) _mm512_loadA_si512(a)
-  #define vec_store(a,b) _mm512_storeA_si512(a,b)
-  #define vec_add_16(a,b) _mm512_add_epi16(a,b)
-  #define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
-  #define vec_zero _mm512_setzero_si512()
-  static constexpr IndexType kNumRegs = 8; // only 8 are needed
+#ifdef USE_AVX512
+    typedef __m512i vec_t;
+#define vec_load(a) _mm512_loadA_si512(a)
+#define vec_store(a,b) _mm512_storeA_si512(a,b)
+#define vec_add_16(a,b) _mm512_add_epi16(a,b)
+#define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
+#define vec_zero _mm512_setzero_si512()
+    static constexpr IndexType kNumRegs = 8; // only 8 are needed
 
-  #elif USE_AVX2
-  typedef __m256i vec_t;
-  #define vec_load(a) _mm256_loadA_si256(a)
-  #define vec_store(a,b) _mm256_storeA_si256(a,b)
-  #define vec_add_16(a,b) _mm256_add_epi16(a,b)
-  #define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
-  #define vec_zero _mm256_setzero_si256()
-  static constexpr IndexType kNumRegs = 16;
+#elif USE_AVX2
+    typedef __m256i vec_t;
+#define vec_load(a) _mm256_loadA_si256(a)
+#define vec_store(a,b) _mm256_storeA_si256(a,b)
+#define vec_add_16(a,b) _mm256_add_epi16(a,b)
+#define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
+#define vec_zero _mm256_setzero_si256()
+    static constexpr IndexType kNumRegs = 16;
 
-  #elif USE_SSE2
-  typedef __m128i vec_t;
-  #define vec_load(a) (*(a))
-  #define vec_store(a,b) *(a)=(b)
-  #define vec_add_16(a,b) _mm_add_epi16(a,b)
-  #define vec_sub_16(a,b) _mm_sub_epi16(a,b)
-  #define vec_zero _mm_setzero_si128()
-  static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8;
+#elif USE_SSE2
+    typedef __m128i vec_t;
+#define vec_load(a) (*(a))
+#define vec_store(a,b) *(a)=(b)
+#define vec_add_16(a,b) _mm_add_epi16(a,b)
+#define vec_sub_16(a,b) _mm_sub_epi16(a,b)
+#define vec_zero _mm_setzero_si128()
+    static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8;
 
-  #elif USE_MMX
-  typedef __m64 vec_t;
-  #define vec_load(a) (*(a))
-  #define vec_store(a,b) *(a)=(b)
-  #define vec_add_16(a,b) _mm_add_pi16(a,b)
-  #define vec_sub_16(a,b) _mm_sub_pi16(a,b)
-  #define vec_zero _mm_setzero_si64()
-  static constexpr IndexType kNumRegs = 8;
+#elif USE_MMX
+    typedef __m64 vec_t;
+#define vec_load(a) (*(a))
+#define vec_store(a,b) *(a)=(b)
+#define vec_add_16(a,b) _mm_add_pi16(a,b)
+#define vec_sub_16(a,b) _mm_sub_pi16(a,b)
+#define vec_zero _mm_setzero_si64()
+    static constexpr IndexType kNumRegs = 8;
 
-  #elif USE_NEON
-  typedef int16x8_t vec_t;
-  #define vec_load(a) (*(a))
-  #define vec_store(a,b) *(a)=(b)
-  #define vec_add_16(a,b) vaddq_s16(a,b)
-  #define vec_sub_16(a,b) vsubq_s16(a,b)
-  #define vec_zero {0}
-  static constexpr IndexType kNumRegs = 16;
+#elif USE_NEON
+    typedef int16x8_t vec_t;
+#define vec_load(a) (*(a))
+#define vec_store(a,b) *(a)=(b)
+#define vec_add_16(a,b) vaddq_s16(a,b)
+#define vec_sub_16(a,b) vsubq_s16(a,b)
+#define vec_zero {0}
+    static constexpr IndexType kNumRegs = 16;
 
-  #else
-  #undef TILING
+#else
+#undef TILING
 
-  #endif
+#endif
 
-  // Input feature converter
-  class FeatureTransformer {
+    // Input feature converter
+    class FeatureTransformer {
 
-   private:
-    // Number of output dimensions for one side
-    static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;
+    private:
+        // Number of output dimensions for one side
+        static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;
 
-    #ifdef TILING
-    static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2;
-    static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions");
-    #endif
+#ifdef TILING
+        static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2;
+        static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions");
+#endif
 
-   public:
-    // Output type
-    using OutputType = TransformedFeatureType;
+    public:
+        // Output type
+        using OutputType = TransformedFeatureType;
 
-    // Number of input/output dimensions
-    static constexpr IndexType kInputDimensions = RawFeatures::kDimensions;
-    static constexpr IndexType kOutputDimensions = kHalfDimensions * 2;
+        // Number of input/output dimensions
+        static constexpr IndexType kInputDimensions = RawFeatures::kDimensions;
+        static constexpr IndexType kOutputDimensions = kHalfDimensions * 2;
 
-    // Size of forward propagation buffer
-    static constexpr std::size_t kBufferSize =
-        kOutputDimensions * sizeof(OutputType);
+        // Size of forward propagation buffer
+        static constexpr std::size_t kBufferSize =
+            kOutputDimensions * sizeof(OutputType);
 
-    // Hash value embedded in the evaluation file
-    static constexpr std::uint32_t GetHashValue() {
+        // Hash value embedded in the evaluation file
+        static constexpr std::uint32_t GetHashValue() {
 
-      return RawFeatures::kHashValue ^ kOutputDimensions;
-    }
-
-    // a string representing the structure
-    static std::string GetStructureString() {
-      return RawFeatures::GetName() + "[" +
-        std::to_string(kInputDimensions) + "->" +
-        std::to_string(kHalfDimensions) + "x2]";
-    }
-
-    // Read network parameters
-    bool ReadParameters(std::istream& stream) {
-
-      for (std::size_t i = 0; i < kHalfDimensions; ++i)
-        biases_[i] = read_little_endian<BiasType>(stream);
-      for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i)
-        weights_[i] = read_little_endian<WeightType>(stream);
-      return !stream.fail();
-    }
-
-    // write parameters
-    bool WriteParameters(std::ostream& stream) const {
-      stream.write(reinterpret_cast<const char*>(biases_),
-        kHalfDimensions * sizeof(BiasType));
-      stream.write(reinterpret_cast<const char*>(weights_),
-        kHalfDimensions * kInputDimensions * sizeof(WeightType));
-      return !stream.fail();
-    }
-
-    // Proceed with the difference calculation if possible
-    bool UpdateAccumulatorIfPossible(const Position& pos) const {
-
-      const auto now = pos.state();
-      if (now->accumulator.computed_accumulation)
-        return true;
-
-      const auto prev = now->previous;
-      if (prev && prev->accumulator.computed_accumulation) {
-        UpdateAccumulator(pos);
-        return true;
-      }
-
-      return false;
-    }
-
-    // Convert input features
-    void Transform(const Position& pos, OutputType* output) const {
-
-      if (!UpdateAccumulatorIfPossible(pos))
-        RefreshAccumulator(pos);
-
-      const auto& accumulation = pos.state()->accumulator.accumulation;
-
-  #if defined(USE_AVX2)
-      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
-      constexpr int kControl = 0b11011000;
-      const __m256i kZero = _mm256_setzero_si256();
-
-  #elif defined(USE_SSE2)
-      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
-
-  #ifdef USE_SSE41
-      const __m128i kZero = _mm_setzero_si128();
-  #else
-      const __m128i k0x80s = _mm_set1_epi8(-128);
-  #endif
-
-  #elif defined(USE_MMX)
-      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
-      const __m64 k0x80s = _mm_set1_pi8(-128);
-
-  #elif defined(USE_NEON)
-      constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-      const int8x8_t kZero = {0};
-  #endif
-
-      const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
-      for (IndexType p = 0; p < 2; ++p) {
-        const IndexType offset = kHalfDimensions * p;
-
-  #if defined(USE_AVX2)
-        auto out = reinterpret_cast<__m256i*>(&output[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m256i sum0 = _mm256_loadA_si256(
-              &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
-          __m256i sum1 = _mm256_loadA_si256(
-            &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
-          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-            sum0 = _mm256_add_epi16(sum0, reinterpret_cast<const __m256i*>(
-                accumulation[perspectives[p]][i])[j * 2 + 0]);
-            sum1 = _mm256_add_epi16(sum1, reinterpret_cast<const __m256i*>(
-                accumulation[perspectives[p]][i])[j * 2 + 1]);
-          }
-          _mm256_storeA_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
-              _mm256_packs_epi16(sum0, sum1), kZero), kControl));
+            return RawFeatures::kHashValue ^ kOutputDimensions;
         }
 
-  #elif defined(USE_SSE2)
-        auto out = reinterpret_cast<__m128i*>(&output[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
-              accumulation[perspectives[p]][0])[j * 2 + 0]);
-          __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
-              accumulation[perspectives[p]][0])[j * 2 + 1]);
-          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-            sum0 = _mm_add_epi16(sum0, reinterpret_cast<const __m128i*>(
-                accumulation[perspectives[p]][i])[j * 2 + 0]);
-            sum1 = _mm_add_epi16(sum1, reinterpret_cast<const __m128i*>(
-                accumulation[perspectives[p]][i])[j * 2 + 1]);
-          }
-      const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
-
-          _mm_store_si128(&out[j],
-
-  #ifdef USE_SSE41
-            _mm_max_epi8(packedbytes, kZero)
-  #else
-            _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
-  #endif
-
-          );
+        // a string representing the structure
+        static std::string GetStructureString() {
+            return RawFeatures::GetName() + "[" +
+                std::to_string(kInputDimensions) + "->" +
+                std::to_string(kHalfDimensions) + "x2]";
         }
 
-  #elif defined(USE_MMX)
-        auto out = reinterpret_cast<__m64*>(&output[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m64 sum0 = *(&reinterpret_cast<const __m64*>(
-              accumulation[perspectives[p]][0])[j * 2 + 0]);
-          __m64 sum1 = *(&reinterpret_cast<const __m64*>(
-              accumulation[perspectives[p]][0])[j * 2 + 1]);
-          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-            sum0 = _mm_add_pi16(sum0, reinterpret_cast<const __m64*>(
-                accumulation[perspectives[p]][i])[j * 2 + 0]);
-            sum1 = _mm_add_pi16(sum1, reinterpret_cast<const __m64*>(
-                accumulation[perspectives[p]][i])[j * 2 + 1]);
-          }
-          const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
-          out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
+        // Read network parameters
+        bool ReadParameters(std::istream& stream) {
+
+            for (std::size_t i = 0; i < kHalfDimensions; ++i)
+                biases_[i] = read_little_endian<BiasType>(stream);
+
+            for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i)
+                weights_[i] = read_little_endian<WeightType>(stream);
+
+            return !stream.fail();
         }
 
-  #elif defined(USE_NEON)
-        const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-          int16x8_t sum = reinterpret_cast<const int16x8_t*>(
-              accumulation[perspectives[p]][0])[j];
-          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-            sum = vaddq_s16(sum, reinterpret_cast<const int16x8_t*>(
-                accumulation[perspectives[p]][i])[j]);
-          }
-          out[j] = vmax_s8(vqmovn_s16(sum), kZero);
+        // write parameters
+        bool WriteParameters(std::ostream& stream) const {
+            stream.write(reinterpret_cast<const char*>(biases_),
+                kHalfDimensions * sizeof(BiasType));
+
+            stream.write(reinterpret_cast<const char*>(weights_),
+                kHalfDimensions * kInputDimensions * sizeof(WeightType));
+
+            return !stream.fail();
         }
 
-  #else
-        for (IndexType j = 0; j < kHalfDimensions; ++j) {
-          BiasType sum = accumulation[static_cast<int>(perspectives[p])][0][j];
-          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-            sum += accumulation[static_cast<int>(perspectives[p])][i][j];
-          }
-          output[offset + j] = static_cast<OutputType>(
-              std::max<int>(0, std::min<int>(127, sum)));
-        }
-  #endif
+        // Proceed with the difference calculation if possible
+        bool UpdateAccumulatorIfPossible(const Position& pos) const {
 
-      }
-  #if defined(USE_MMX)
-      _mm_empty();
-  #endif
-    }
+            const auto now = pos.state();
+            if (now->accumulator.computed_accumulation)
+                return true;
 
-   private:
-    // Calculate cumulative value without using difference calculation
-    void RefreshAccumulator(const Position& pos) const {
-
-      auto& accumulator = pos.state()->accumulator;
-      for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-        Features::IndexList active_indices[2];
-        RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
-                                         active_indices);
-        for (Color perspective : { WHITE, BLACK }) {
-    #ifdef TILING
-          for (unsigned j = 0; j < kHalfDimensions / kTileHeight; ++j) {
-            auto accTile = reinterpret_cast<vec_t*>(
-                &accumulator.accumulation[perspective][i][j * kTileHeight]);
-            vec_t acc[kNumRegs];
-
-            if (i == 0) {
-              auto biasesTile = reinterpret_cast<const vec_t*>(
-                  &biases_[j * kTileHeight]);
-              for (unsigned k = 0; k < kNumRegs; ++k)
-                acc[k] = biasesTile[k];
-            } else {
-              for (unsigned k = 0; k < kNumRegs; ++k)
-                acc[k] = vec_zero;
-            }
-            for (const auto index : active_indices[perspective]) {
-              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
-              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
-
-              for (unsigned k = 0; k < kNumRegs; ++k)
-                acc[k] = vec_add_16(acc[k], column[k]);
+            const auto prev = now->previous;
+            if (prev && prev->accumulator.computed_accumulation) {
+                UpdateAccumulator(pos);
+                return true;
             }
 
-            for (unsigned k = 0; k < kNumRegs; k++)
-              vec_store(&accTile[k], acc[k]);
-          }
-    #else
-          if (i == 0) {
-            std::memcpy(accumulator.accumulation[perspective][i], biases_,
-                        kHalfDimensions * sizeof(BiasType));
-          } else {
-            std::memset(accumulator.accumulation[perspective][i], 0,
-                        kHalfDimensions * sizeof(BiasType));
-          }
-
-          for (const auto index : active_indices[perspective]) {
-            const IndexType offset = kHalfDimensions * index;
-
-            for (IndexType j = 0; j < kHalfDimensions; ++j)
-              accumulator.accumulation[perspective][i][j] += weights_[offset + j];
-          }
-    #endif
+            return false;
         }
 
-      }
+        // Convert input features
+        void Transform(const Position& pos, OutputType* output) const {
 
-  #if defined(USE_MMX)
-      _mm_empty();
-  #endif
+            if (!UpdateAccumulatorIfPossible(pos))
+              RefreshAccumulator(pos);
 
-      accumulator.computed_accumulation = true;
-    }
+            const auto& accumulation = pos.state()->accumulator.accumulation;
 
-    // Calculate cumulative value using difference calculation
-    void UpdateAccumulator(const Position& pos) const {
+#if defined(USE_AVX2)
+            constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+            constexpr int kControl = 0b11011000;
+            const __m256i kZero = _mm256_setzero_si256();
 
-      const auto& prev_accumulator = pos.state()->previous->accumulator;
-      auto& accumulator = pos.state()->accumulator;
-      for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-        Features::IndexList removed_indices[2], added_indices[2];
-        bool reset[2] = { false, false };
-        RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
-                                          removed_indices, added_indices, reset);
+#elif defined(USE_SSE2)
+            constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
 
-    #ifdef TILING
-        for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
-          for (Color perspective : { WHITE, BLACK }) {
-            auto accTile = reinterpret_cast<vec_t*>(
-                &accumulator.accumulation[perspective][i][j * kTileHeight]);
-            vec_t acc[kNumRegs];
+#ifdef USE_SSE41
+            const __m128i kZero = _mm_setzero_si128();
+#else
+            const __m128i k0x80s = _mm_set1_epi8(-128);
+#endif
 
-            if (reset[perspective]) {
-              if (i == 0) {
-                auto biasesTile = reinterpret_cast<const vec_t*>(
-                    &biases_[j * kTileHeight]);
-                for (unsigned k = 0; k < kNumRegs; ++k)
-                  acc[k] = biasesTile[k];
-              } else {
-                for (unsigned k = 0; k < kNumRegs; ++k)
-                  acc[k] = vec_zero;
-              }
-            } else {
-              auto prevAccTile = reinterpret_cast<const vec_t*>(
-                  &prev_accumulator.accumulation[perspective][i][j * kTileHeight]);
-              for (IndexType k = 0; k < kNumRegs; ++k)
-                acc[k] = vec_load(&prevAccTile[k]);
+#elif defined(USE_MMX)
+            constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+            const __m64 k0x80s = _mm_set1_pi8(-128);
 
-              // Difference calculation for the deactivated features
-              for (const auto index : removed_indices[perspective]) {
-                const IndexType offset = kHalfDimensions * index + j * kTileHeight;
-                auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+#elif defined(USE_NEON)
+            constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+            const int8x8_t kZero = {0};
+#endif
+
+            const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
+            for (IndexType p = 0; p < 2; ++p) {
+                const IndexType offset = kHalfDimensions * p;
+
+#if defined(USE_AVX2)
+                auto out = reinterpret_cast<__m256i*>(&output[offset]);
+                for (IndexType j = 0; j < kNumChunks; ++j) {
+                    __m256i sum0 = _mm256_loadA_si256(
+                        &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
+                    __m256i sum1 = _mm256_loadA_si256(
+                      &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
+                    for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+                        sum0 = _mm256_add_epi16(sum0, reinterpret_cast<const __m256i*>(
+                            accumulation[perspectives[p]][i])[j * 2 + 0]);
+                        sum1 = _mm256_add_epi16(sum1, reinterpret_cast<const __m256i*>(
+                            accumulation[perspectives[p]][i])[j * 2 + 1]);
+                    }
+
+                    _mm256_storeA_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
+                        _mm256_packs_epi16(sum0, sum1), kZero), kControl));
+                }
+
+#elif defined(USE_SSE2)
+                auto out = reinterpret_cast<__m128i*>(&output[offset]);
+                for (IndexType j = 0; j < kNumChunks; ++j) {
+                    __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
+                        accumulation[perspectives[p]][0])[j * 2 + 0]);
+                    __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
+                        accumulation[perspectives[p]][0])[j * 2 + 1]);
+                    for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+                        sum0 = _mm_add_epi16(sum0, reinterpret_cast<const __m128i*>(
+                            accumulation[perspectives[p]][i])[j * 2 + 0]);
+                        sum1 = _mm_add_epi16(sum1, reinterpret_cast<const __m128i*>(
+                            accumulation[perspectives[p]][i])[j * 2 + 1]);
+                    }
+
+                    const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
+
+                    _mm_store_si128(&out[j],
+
+#ifdef USE_SSE41
+                        _mm_max_epi8(packedbytes, kZero)
+#else
+                        _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
+#endif
+
+                    );
+                }
+
+#elif defined(USE_MMX)
+                auto out = reinterpret_cast<__m64*>(&output[offset]);
+                for (IndexType j = 0; j < kNumChunks; ++j) {
+                    __m64 sum0 = *(&reinterpret_cast<const __m64*>(
+                        accumulation[perspectives[p]][0])[j * 2 + 0]);
+                    __m64 sum1 = *(&reinterpret_cast<const __m64*>(
+                        accumulation[perspectives[p]][0])[j * 2 + 1]);
+                    for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+                        sum0 = _mm_add_pi16(sum0, reinterpret_cast<const __m64*>(
+                            accumulation[perspectives[p]][i])[j * 2 + 0]);
+                        sum1 = _mm_add_pi16(sum1, reinterpret_cast<const __m64*>(
+                            accumulation[perspectives[p]][i])[j * 2 + 1]);
+                    }
+
+                    const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
+                    out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
+                }
+
+#elif defined(USE_NEON)
+                const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
+                for (IndexType j = 0; j < kNumChunks; ++j) {
+                    int16x8_t sum = reinterpret_cast<const int16x8_t*>(
+                        accumulation[perspectives[p]][0])[j];
+
+                    for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+                        sum = vaddq_s16(sum, reinterpret_cast<const int16x8_t*>(
+                            accumulation[perspectives[p]][i])[j]);
+                    }
+
+                    out[j] = vmax_s8(vqmovn_s16(sum), kZero);
+                }
+
+#else
+                for (IndexType j = 0; j < kHalfDimensions; ++j) {
+                    BiasType sum = accumulation[static_cast<int>(perspectives[p])][0][j];
+                    for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+                        sum += accumulation[static_cast<int>(perspectives[p])][i][j];
+                    }
+
+                    output[offset + j] = static_cast<OutputType>(
+                        std::max<int>(0, std::min<int>(127, sum)));
+                }
+#endif
 
-                for (IndexType k = 0; k < kNumRegs; ++k)
-                  acc[k] = vec_sub_16(acc[k], column[k]);
-              }
             }
-            { // Difference calculation for the activated features
-              for (const auto index : added_indices[perspective]) {
-                const IndexType offset = kHalfDimensions * index + j * kTileHeight;
-                auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
-
-                for (IndexType k = 0; k < kNumRegs; ++k)
-                  acc[k] = vec_add_16(acc[k], column[k]);
-              }
-            }
-
-            for (IndexType k = 0; k < kNumRegs; ++k)
-              vec_store(&accTile[k], acc[k]);
-          }
+#if defined(USE_MMX)
+            _mm_empty();
+#endif
         }
-    #if defined(USE_MMX)
-        _mm_empty();
-    #endif
 
-    #else
-        for (Color perspective : { WHITE, BLACK }) {
+    private:
+        // Calculate cumulative value without using difference calculation
+        void RefreshAccumulator(const Position& pos) const {
+
+            auto& accumulator = pos.state()->accumulator;
+            for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+                Features::IndexList active_indices[2];
+                RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
+                                                 active_indices);
+                for (Color perspective : { WHITE, BLACK }) {
+#ifdef TILING
+                    for (unsigned j = 0; j < kHalfDimensions / kTileHeight; ++j) {
+                        auto accTile = reinterpret_cast<vec_t*>(
+                            &accumulator.accumulation[perspective][i][j * kTileHeight]);
+                        vec_t acc[kNumRegs];
+
+                        if (i == 0) {
+                            auto biasesTile = reinterpret_cast<const vec_t*>(
+                                &biases_[j * kTileHeight]);
+                            for (unsigned k = 0; k < kNumRegs; ++k)
+                                acc[k] = biasesTile[k];
+                        } else {
+                            for (unsigned k = 0; k < kNumRegs; ++k)
+                                acc[k] = vec_zero;
+                        }
+
+                        for (const auto index : active_indices[perspective]) {
+                            const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+                            auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+
+                            for (unsigned k = 0; k < kNumRegs; ++k)
+                                acc[k] = vec_add_16(acc[k], column[k]);
+                        }
+
+                        for (unsigned k = 0; k < kNumRegs; k++)
+                            vec_store(&accTile[k], acc[k]);
+                    }
+#else
+                    if (i == 0) {
+                        std::memcpy(accumulator.accumulation[perspective][i], biases_,
+                                    kHalfDimensions * sizeof(BiasType));
+                    } else {
+                        std::memset(accumulator.accumulation[perspective][i], 0,
+                                    kHalfDimensions * sizeof(BiasType));
+                    }
+
+                    for (const auto index : active_indices[perspective]) {
+                        const IndexType offset = kHalfDimensions * index;
+
+                        for (IndexType j = 0; j < kHalfDimensions; ++j)
+                            accumulator.accumulation[perspective][i][j] += weights_[offset + j];
+                    }
+#endif
+                }
 
-          if (reset[perspective]) {
-            if (i == 0) {
-              std::memcpy(accumulator.accumulation[perspective][i], biases_,
-                          kHalfDimensions * sizeof(BiasType));
-            } else {
-              std::memset(accumulator.accumulation[perspective][i], 0,
-                          kHalfDimensions * sizeof(BiasType));
             }
-          } else {
-            std::memcpy(accumulator.accumulation[perspective][i],
-                        prev_accumulator.accumulation[perspective][i],
-                        kHalfDimensions * sizeof(BiasType));
-            // Difference calculation for the deactivated features
-            for (const auto index : removed_indices[perspective]) {
-              const IndexType offset = kHalfDimensions * index;
 
-              for (IndexType j = 0; j < kHalfDimensions; ++j)
-                accumulator.accumulation[perspective][i][j] -= weights_[offset + j];
-            }
-          }
-          { // Difference calculation for the activated features
-            for (const auto index : added_indices[perspective]) {
-              const IndexType offset = kHalfDimensions * index;
+#if defined(USE_MMX)
+            _mm_empty();
+#endif
 
-              for (IndexType j = 0; j < kHalfDimensions; ++j)
-                accumulator.accumulation[perspective][i][j] += weights_[offset + j];
-            }
-          }
+            accumulator.computed_accumulation = true;
         }
-    #endif
-      }
-      accumulator.computed_accumulation = true;
-    }
 
-    using BiasType = std::int16_t;
-    using WeightType = std::int16_t;
+        // Calculate cumulative value using difference calculation
+        void UpdateAccumulator(const Position& pos) const {
 
-    // Make the learning class a friend
-    friend class Trainer<FeatureTransformer>;
+            const auto& prev_accumulator = pos.state()->previous->accumulator;
+            auto& accumulator = pos.state()->accumulator;
+            for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+                Features::IndexList removed_indices[2], added_indices[2];
+                bool reset[2] = { false, false };
+                RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
+                                                  removed_indices, added_indices, reset);
 
-    alignas(kCacheLineSize) BiasType biases_[kHalfDimensions];
-    alignas(kCacheLineSize)
-        WeightType weights_[kHalfDimensions * kInputDimensions];
-  };
+#ifdef TILING
+                for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
+                    for (Color perspective : { WHITE, BLACK }) {
+                        auto accTile = reinterpret_cast<vec_t*>(
+                            &accumulator.accumulation[perspective][i][j * kTileHeight]);
+                        vec_t acc[kNumRegs];
+
+                        if (reset[perspective]) {
+                            if (i == 0) {
+                                auto biasesTile = reinterpret_cast<const vec_t*>(
+                                    &biases_[j * kTileHeight]);
+                                for (unsigned k = 0; k < kNumRegs; ++k)
+                                    acc[k] = biasesTile[k];
+                            } else {
+                                for (unsigned k = 0; k < kNumRegs; ++k)
+                                    acc[k] = vec_zero;
+                            }
+                        } else {
+                            auto prevAccTile = reinterpret_cast<const vec_t*>(
+                                &prev_accumulator.accumulation[perspective][i][j * kTileHeight]);
+
+                            for (IndexType k = 0; k < kNumRegs; ++k)
+                                acc[k] = vec_load(&prevAccTile[k]);
+
+                            // Difference calculation for the deactivated features
+                            for (const auto index : removed_indices[perspective]) {
+                                const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+                                auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+
+                                for (IndexType k = 0; k < kNumRegs; ++k)
+                                    acc[k] = vec_sub_16(acc[k], column[k]);
+                            }
+                        }
+
+                        { // Difference calculation for the activated features
+                          for (const auto index : added_indices[perspective]) {
+                              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+                              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+
+                              for (IndexType k = 0; k < kNumRegs; ++k)
+                                  acc[k] = vec_add_16(acc[k], column[k]);
+                          }
+                        }
+
+                        for (IndexType k = 0; k < kNumRegs; ++k)
+                          vec_store(&accTile[k], acc[k]);
+                    }
+                }
+#if defined(USE_MMX)
+                _mm_empty();
+#endif
+
+#else
+                for (Color perspective : { WHITE, BLACK }) {
+
+                    if (reset[perspective]) {
+                        if (i == 0) {
+                            std::memcpy(accumulator.accumulation[perspective][i], biases_,
+                                        kHalfDimensions * sizeof(BiasType));
+                        } else {
+                            std::memset(accumulator.accumulation[perspective][i], 0,
+                                        kHalfDimensions * sizeof(BiasType));
+                        }
+                    } else {
+                        std::memcpy(accumulator.accumulation[perspective][i],
+                                    prev_accumulator.accumulation[perspective][i],
+                                    kHalfDimensions * sizeof(BiasType));
+                        // Difference calculation for the deactivated features
+                        for (const auto index : removed_indices[perspective]) {
+                            const IndexType offset = kHalfDimensions * index;
+
+                            for (IndexType j = 0; j < kHalfDimensions; ++j)
+                                accumulator.accumulation[perspective][i][j] -= weights_[offset + j];
+                        }
+                    }
+                    { // Difference calculation for the activated features
+                        for (const auto index : added_indices[perspective]) {
+                          const IndexType offset = kHalfDimensions * index;
+
+                          for (IndexType j = 0; j < kHalfDimensions; ++j)
+                              accumulator.accumulation[perspective][i][j] += weights_[offset + j];
+                        }
+                    }
+                }
+#endif
+            }
+            accumulator.computed_accumulation = true;
+        }
+
+        using BiasType = std::int16_t;
+        using WeightType = std::int16_t;
+
+        // Make the learning class a friend
+        friend class Trainer<FeatureTransformer>;
+
+        alignas(kCacheLineSize) BiasType biases_[kHalfDimensions];
+        alignas(kCacheLineSize)
+            WeightType weights_[kHalfDimensions * kInputDimensions];
+    };
 
 }  // namespace Eval::NNUE
 
-#endif // #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
+#endif //#ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
diff --git a/src/nnue/nnue_test_command.cpp b/src/nnue/nnue_test_command.cpp
index f6f05c2e..55fa603a 100644
--- a/src/nnue/nnue_test_command.cpp
+++ b/src/nnue/nnue_test_command.cpp
@@ -1,197 +1,215 @@
-﻿// USI extended command for NNUE evaluation function
-
-#include "../thread.h"
-#include "../uci.h"
-#include "evaluate_nnue.h"
+﻿#include "evaluate_nnue.h"
 #include "nnue_test_command.h"
 
+#include "thread.h"
+#include "uci.h"
+
 #include <set>
 #include <fstream>
 
-#define ASSERT(X) { if (!(X)) { std::cout << "\nError : ASSERT(" << #X << "), " << __FILE__ << "(" << __LINE__ << "): " << __func__ << std::endl; \
- std::this_thread::sleep_for(std::chrono::microseconds(3000)); *(int*)1 =0;} }
-
-namespace Eval {
-
-namespace NNUE {
-
-namespace {
-
-// Testing RawFeatures mainly for difference calculation
-void TestFeatures(Position& pos) {
-  const std::uint64_t num_games = 1000;
-  StateInfo si;
-  pos.set(StartFEN, false, &si, Threads.main());
-  const int MAX_PLY = 256; // test up to 256 hands
-
-  StateInfo state[MAX_PLY]; // StateInfo only for the maximum number of steps
-  int ply; // Trouble from the initial phase
-
-  PRNG prng(20171128);
-
-  std::uint64_t num_moves = 0;
-  std::vector<std::uint64_t> num_updates(kRefreshTriggers.size() + 1);
-  std::vector<std::uint64_t> num_resets(kRefreshTriggers.size());
-  constexpr IndexType kUnknown = -1;
-  std::vector<IndexType> trigger_map(RawFeatures::kDimensions, kUnknown);
-  auto make_index_sets = [&](const Position& position) {
-    std::vector<std::vector<std::set<IndexType>>> index_sets(
-        kRefreshTriggers.size(), std::vector<std::set<IndexType>>(2));
-    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-      Features::IndexList active_indices[2];
-      RawFeatures::AppendActiveIndices(position, kRefreshTriggers[i],
-                                       active_indices);
-      for (const auto perspective : Colors) {
-        for (const auto index : active_indices[perspective]) {
-          ASSERT(index < RawFeatures::kDimensions);
-          ASSERT(index_sets[i][perspective].count(index) == 0);
-          ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
-          index_sets[i][perspective].insert(index);
-          trigger_map[index] = i;
-        }
-      }
-    }
-    return index_sets;
-  };
-  auto update_index_sets = [&](const Position& position, auto* index_sets) {
-    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-      Features::IndexList removed_indices[2], added_indices[2];
-      bool reset[2] = { false, false };
-      RawFeatures::AppendChangedIndices(position, kRefreshTriggers[i],
-                                        removed_indices, added_indices, reset);
-      for (const auto perspective : Colors) {
-        if (reset[perspective]) {
-          (*index_sets)[i][perspective].clear();
-          ++num_resets[i];
-        } else {
-          for (const auto index : removed_indices[perspective]) {
-            ASSERT(index < RawFeatures::kDimensions);
-            ASSERT((*index_sets)[i][perspective].count(index) == 1);
-            ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
-            (*index_sets)[i][perspective].erase(index);
-            ++num_updates.back();
-            ++num_updates[i];
-            trigger_map[index] = i;
-          }
-        }
-        for (const auto index : added_indices[perspective]) {
-          ASSERT(index < RawFeatures::kDimensions);
-          ASSERT((*index_sets)[i][perspective].count(index) == 0);
-          ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
-          (*index_sets)[i][perspective].insert(index);
-          ++num_updates.back();
-          ++num_updates[i];
-          trigger_map[index] = i;
-        }
-      }
-    }
-  };
-
-  std::cout << "feature set: " << RawFeatures::GetName()
-            << "[" << RawFeatures::kDimensions << "]" << std::endl;
-  std::cout << "start testing with random games";
-
-  for (std::uint64_t i = 0; i < num_games; ++i) {
-    auto index_sets = make_index_sets(pos);
-    for (ply = 0; ply < MAX_PLY; ++ply) {
-      MoveList<LEGAL> mg(pos); // Generate all legal hands
-
-      // There was no legal move == Clog
-      if (mg.size() == 0)
-        break;
-
-      // Randomly choose from the generated moves and advance the phase with the moves.
-      Move m = mg.begin()[prng.rand(mg.size())];
-      pos.do_move(m, state[ply]);
-
-      ++num_moves;
-      update_index_sets(pos, &index_sets);
-      ASSERT(index_sets == make_index_sets(pos));
-    }
-
-    pos.set(StartFEN, false, &si, Threads.main());
-
-    // Output'.' every 100 times (so you can see that it's progressing)
-    if ((i % 100) == 0)
-      std::cout << "." << std::flush;
-  }
-  std::cout << "passed." << std::endl;
-  std::cout << num_games << " games, " << num_moves << " moves, "
-            << num_updates.back() << " updates, "
-            << (1.0 * num_updates.back() / num_moves)
-            << " updates per move" << std::endl;
-  std::size_t num_observed_indices = 0;
-  for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-    const auto count = std::count(trigger_map.begin(), trigger_map.end(), i);
-    num_observed_indices += count;
-    std::cout << "TriggerEvent(" << static_cast<int>(kRefreshTriggers[i])
-              << "): " << count << " features ("
-              << (100.0 * count / RawFeatures::kDimensions) << "%), "
-              << num_updates[i] << " updates ("
-              << (1.0 * num_updates[i] / num_moves) << " per move), "
-              << num_resets[i] << " resets ("
-              << (100.0 * num_resets[i] / num_moves) << "%)"
-              << std::endl;
-  }
-  std::cout << "observed " << num_observed_indices << " ("
-            << (100.0 * num_observed_indices / RawFeatures::kDimensions)
-            << "% of " << RawFeatures::kDimensions
-            << ") features" << std::endl;
+#define ASSERT(X) { \
+    if (!(X)) { \
+        std::cout \
+            << "\nError : ASSERT(" << #X << "), " \
+            << __FILE__ << "(" << __LINE__ << "): " \
+            << __func__ << std::endl; \
+            std::this_thread::sleep_for(std::chrono::microseconds(3000)); \
+            *(int*)1 =0; \
+    } \
 }
 
-// Output a string that represents the structure of the evaluation function
-void PrintInfo(std::istream& stream) {
-  std::cout << "network architecture: " << GetArchitectureString() << std::endl;
-
-  while (true) {
-    std::string file_name;
-    stream >> file_name;
-    if (file_name.empty()) break;
-
-    std::uint32_t hash_value;
-    std::string architecture;
-    const bool success = [&]() {
-      std::ifstream file_stream(file_name, std::ios::binary);
-      if (!file_stream) return false;
-      if (!ReadHeader(file_stream, &hash_value, &architecture)) return false;
-      return true;
-    }();
-
-    std::cout << file_name << ": ";
-    if (success) {
-      if (hash_value == kHashValue) {
-        std::cout << "matches with this binary";
-        if (architecture != GetArchitectureString()) {
-          std::cout << ", but architecture string differs: " << architecture;
-        }
-        std::cout << std::endl;
-      } else {
-        std::cout << architecture << std::endl;
-      }
-    } else {
-      std::cout << "failed to read header" << std::endl;
-    }
-  }
-}
-
-}  // namespace
-
 // USI extended command for NNUE evaluation function
-void TestCommand(Position& pos, std::istream& stream) {
-  std::string sub_command;
-  stream >> sub_command;
+namespace Eval::NNUE {
 
-  if (sub_command == "test_features") {
-    TestFeatures(pos);
-  } else if (sub_command == "info") {
-    PrintInfo(stream);
-  } else {
-    std::cout << "usage:" << std::endl;
-    std::cout << " test nnue test_features" << std::endl;
-    std::cout << " test nnue info [path/to/" << fileName << "...]" << std::endl;
-  }
-}
+    namespace {
 
-}  // namespace NNUE
+        // Testing RawFeatures mainly for difference calculation
+        void TestFeatures(Position& pos) {
+            const std::uint64_t num_games = 1000;
+            StateInfo si;
+            pos.set(StartFEN, false, &si, Threads.main());
+            const int MAX_PLY = 256; // test up to 256 hands
 
-}  // namespace Eval
+            StateInfo state[MAX_PLY]; // StateInfo only for the maximum number of steps
+            int ply; // Trouble from the initial phase
+
+            PRNG prng(20171128);
+
+            std::uint64_t num_moves = 0;
+            std::vector<std::uint64_t> num_updates(kRefreshTriggers.size() + 1);
+            std::vector<std::uint64_t> num_resets(kRefreshTriggers.size());
+            constexpr IndexType kUnknown = -1;
+            std::vector<IndexType> trigger_map(RawFeatures::kDimensions, kUnknown);
+
+            auto make_index_sets = [&](const Position& position) {
+                std::vector<std::vector<std::set<IndexType>>> index_sets(
+                    kRefreshTriggers.size(), std::vector<std::set<IndexType>>(2));
+
+                for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+                    Features::IndexList active_indices[2];
+                    RawFeatures::AppendActiveIndices(position, kRefreshTriggers[i],
+                                                     active_indices);
+
+                    for (const auto perspective : Colors) {
+                        for (const auto index : active_indices[perspective]) {
+                            ASSERT(index < RawFeatures::kDimensions);
+                            ASSERT(index_sets[i][perspective].count(index) == 0);
+                            ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
+                            index_sets[i][perspective].insert(index);
+                            trigger_map[index] = i;
+                        }
+                    }
+                }
+
+                return index_sets;
+            };
+
+            auto update_index_sets = [&](const Position& position, auto* index_sets) {
+                for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+                    Features::IndexList removed_indices[2], added_indices[2];
+                    bool reset[2] = { false, false };
+                    RawFeatures::AppendChangedIndices(position, kRefreshTriggers[i],
+                                                      removed_indices, added_indices, reset);
+                    for (const auto perspective : Colors) {
+                        if (reset[perspective]) {
+                            (*index_sets)[i][perspective].clear();
+                            ++num_resets[i];
+                        } else {
+                            for (const auto index : removed_indices[perspective]) {
+                                ASSERT(index < RawFeatures::kDimensions);
+                                ASSERT((*index_sets)[i][perspective].count(index) == 1);
+                                ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
+                                (*index_sets)[i][perspective].erase(index);
+                                ++num_updates.back();
+                                ++num_updates[i];
+                                trigger_map[index] = i;
+                            }
+                        }
+
+                        for (const auto index : added_indices[perspective]) {
+                            ASSERT(index < RawFeatures::kDimensions);
+                            ASSERT((*index_sets)[i][perspective].count(index) == 0);
+                            ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
+                            (*index_sets)[i][perspective].insert(index);
+                            ++num_updates.back();
+                            ++num_updates[i];
+                            trigger_map[index] = i;
+                        }
+                    }
+                }
+            };
+
+            std::cout << "feature set: " << RawFeatures::GetName()
+                      << "[" << RawFeatures::kDimensions << "]" << std::endl;
+            std::cout << "start testing with random games";
+
+            for (std::uint64_t i = 0; i < num_games; ++i) {
+                auto index_sets = make_index_sets(pos);
+                for (ply = 0; ply < MAX_PLY; ++ply) {
+                    MoveList<LEGAL> mg(pos); // Generate all legal hands
+
+                    // There was no legal move == Clog
+                    if (mg.size() == 0)
+                        break;
+
+                    // Randomly choose from the generated moves and advance the phase with the moves.
+                    Move m = mg.begin()[prng.rand(mg.size())];
+                    pos.do_move(m, state[ply]);
+
+                    ++num_moves;
+                    update_index_sets(pos, &index_sets);
+                    ASSERT(index_sets == make_index_sets(pos));
+                }
+
+                pos.set(StartFEN, false, &si, Threads.main());
+
+                // Output'.' every 100 times (so you can see that it's progressing)
+                if ((i % 100) == 0)
+                    std::cout << "." << std::flush;
+            }
+
+            std::cout << "passed." << std::endl;
+            std::cout << num_games << " games, " << num_moves << " moves, "
+                      << num_updates.back() << " updates, "
+                      << (1.0 * num_updates.back() / num_moves)
+                      << " updates per move" << std::endl;
+            std::size_t num_observed_indices = 0;
+
+            for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+                const auto count = std::count(trigger_map.begin(), trigger_map.end(), i);
+                num_observed_indices += count;
+                std::cout << "TriggerEvent(" << static_cast<int>(kRefreshTriggers[i])
+                          << "): " << count << " features ("
+                          << (100.0 * count / RawFeatures::kDimensions) << "%), "
+                          << num_updates[i] << " updates ("
+                          << (1.0 * num_updates[i] / num_moves) << " per move), "
+                          << num_resets[i] << " resets ("
+                          << (100.0 * num_resets[i] / num_moves) << "%)"
+                          << std::endl;
+            }
+            std::cout << "observed " << num_observed_indices << " ("
+                      << (100.0 * num_observed_indices / RawFeatures::kDimensions)
+                      << "% of " << RawFeatures::kDimensions
+                      << ") features" << std::endl;
+        }
+
+        // Output a string that represents the structure of the evaluation function
+        void PrintInfo(std::istream& stream) {
+            std::cout << "network architecture: " << GetArchitectureString() << std::endl;
+
+            while (true) {
+                std::string file_name;
+                stream >> file_name;
+                if (file_name.empty())
+                    break;
+
+                std::uint32_t hash_value;
+                std::string architecture;
+                const bool success = [&]() {
+                    std::ifstream file_stream(file_name, std::ios::binary);
+
+                    if (!file_stream)
+                        return false;
+                    if (!ReadHeader(file_stream, &hash_value, &architecture))
+                        return false;
+
+                    return true;
+                }();
+
+                std::cout << file_name << ": ";
+                if (success) {
+                    if (hash_value == kHashValue) {
+                        std::cout << "matches with this binary";
+                        if (architecture != GetArchitectureString()) {
+                            std::cout << ", but architecture string differs: " << architecture;
+                        }
+
+                        std::cout << std::endl;
+                    } else {
+                        std::cout << architecture << std::endl;
+                    }
+                } else {
+                    std::cout << "failed to read header" << std::endl;
+                }
+            }
+        }
+
+    }  // namespace
+
+    // USI extended command for NNUE evaluation function
+    void TestCommand(Position& pos, std::istream& stream) {
+        std::string sub_command;
+        stream >> sub_command;
+
+        if (sub_command == "test_features") {
+            TestFeatures(pos);
+        } else if (sub_command == "info") {
+            PrintInfo(stream);
+        } else {
+            std::cout << "usage:" << std::endl;
+            std::cout << " test nnue test_features" << std::endl;
+            std::cout << " test nnue info [path/to/" << fileName << "...]" << std::endl;
+        }
+    }
+
+}  // namespace Eval::NNUE
diff --git a/src/nnue/nnue_test_command.h b/src/nnue/nnue_test_command.h
index 75d33e82..989731d6 100644
--- a/src/nnue/nnue_test_command.h
+++ b/src/nnue/nnue_test_command.h
@@ -1,17 +1,12 @@
-﻿// USI extended command interface for NNUE evaluation function
-
-#ifndef _NNUE_TEST_COMMAND_H_
+﻿#ifndef _NNUE_TEST_COMMAND_H_
 #define _NNUE_TEST_COMMAND_H_
 
-namespace Eval {
+// USI extended command interface for NNUE evaluation function
+namespace Eval::NNUE {
 
-namespace NNUE {
+    // USI extended command for NNUE evaluation function
+    void TestCommand(Position& pos, std::istream& stream);
 
-// USI extended command for NNUE evaluation function
-void TestCommand(Position& pos, std::istream& stream);
-
-}  // namespace NNUE
-
-}  // namespace Eval
+}  // namespace Eval::NNUE
 
 #endif

From 77624addf2763de1418162a1ed34527dadc83da5 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 21:46:40 +0200
Subject: [PATCH 342/583] Cleanup last ".." in include paths.

---
 src/nnue/trainer/trainer_sum.h |  6 ++++--
 src/syzygy/tbprobe.cpp         | 12 ++++++------
 src/syzygy/tbprobe.h           |  2 +-
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/nnue/trainer/trainer_sum.h b/src/nnue/trainer/trainer_sum.h
index 9904704b..24fc6152 100644
--- a/src/nnue/trainer/trainer_sum.h
+++ b/src/nnue/trainer/trainer_sum.h
@@ -1,10 +1,12 @@
 ﻿#ifndef _NNUE_TRAINER_SUM_H_
 #define _NNUE_TRAINER_SUM_H_
 
-#include "../../learn/learn.h"
-#include "../layers/sum.h"
 #include "trainer.h"
 
+#include "learn/learn.h"
+
+#include "nnue/layers/sum.h"
+
 // Specialization of NNUE evaluation function learning class template for Sum
 namespace Eval::NNUE {
 
diff --git a/src/syzygy/tbprobe.cpp b/src/syzygy/tbprobe.cpp
index f4b9447f..191986da 100644
--- a/src/syzygy/tbprobe.cpp
+++ b/src/syzygy/tbprobe.cpp
@@ -28,12 +28,12 @@
 #include <type_traits>
 #include <mutex>
 
-#include "../bitboard.h"
-#include "../movegen.h"
-#include "../position.h"
-#include "../search.h"
-#include "../types.h"
-#include "../uci.h"
+#include "bitboard.h"
+#include "movegen.h"
+#include "position.h"
+#include "search.h"
+#include "types.h"
+#include "uci.h"
 
 #include "tbprobe.h"
 
diff --git a/src/syzygy/tbprobe.h b/src/syzygy/tbprobe.h
index b998989b..efc4b6b7 100644
--- a/src/syzygy/tbprobe.h
+++ b/src/syzygy/tbprobe.h
@@ -21,7 +21,7 @@
 
 #include <ostream>
 
-#include "../search.h"
+#include "search.h"
 
 namespace Tablebases {
 

From 9023edc3c864e1932cab7cec7a1608c5d7dce27a Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 21:48:31 +0200
Subject: [PATCH 343/583] Add missing includes.

---
 src/nnue/evaluate_nnue.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/nnue/evaluate_nnue.h b/src/nnue/evaluate_nnue.h
index e6ddc7fd..264d24fe 100644
--- a/src/nnue/evaluate_nnue.h
+++ b/src/nnue/evaluate_nnue.h
@@ -21,6 +21,8 @@
 
 #include "nnue_feature_transformer.h"
 
+#include "misc.h"
+
 #include <memory>
 
 // header used in NNUE evaluation function

From 69ea3d30b241b268cc5b521ce6b6a6c6274c94e9 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 21:58:25 +0200
Subject: [PATCH 344/583] Move the extra new line to after check health.

---
 src/learn/learn.cpp                            | 4 ----
 src/nnue/trainer/trainer_feature_transformer.h | 2 +-
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 95cbe4bb..205b9220 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -903,10 +903,6 @@ namespace Learner
                     << " , learn_entropy = " << learn_sum_entropy / done
                     << endl;
             }
-
-            // Bigger space between progress reports so that they can be more
-            // easly disinguished. Looking for timestamps is hard.
-            cout << endl;
         }
         else
         {
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index f403e413..4173f46d 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -349,7 +349,7 @@ namespace Eval::NNUE {
 
             std::cout << "INFO: largest min activation = " << largest_min_activation
                       << ", smallest max activation = " << smallest_max_activation
-                      << std::endl;
+                      << std::endl << std::endl;
 
             std::fill(std::begin(min_activations_), std::end(min_activations_),
                       std::numeric_limits<LearnFloatType>::max());

From 2398d34e87226df0244fda050440f6a63115b79f Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 22:35:35 +0200
Subject: [PATCH 345/583] Move string split to misc

---
 src/misc.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/misc.h b/src/misc.h
index 6696b0a8..ae1d69d4 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -30,6 +30,7 @@
 #include <utility>
 #include <cmath>
 #include <cctype>
+#include <sstream>
 
 #include "types.h"
 
@@ -273,6 +274,19 @@ namespace Algo {
         for (uint64_t i = 0; i < size; ++i)
             std::swap(buf[i], buf[prng.rand(size - i) + i]);
     }
+
+    // split the string
+    inline std::vector<std::string> split(const std::string& input, char delimiter) {
+        std::istringstream stream(input);
+        std::string field;
+        std::vector<std::string> fields;
+
+        while (std::getline(stream, field, delimiter)) {
+            fields.push_back(field);
+        }
+
+        return fields;
+    }
 }
 
 // --------------------

From 146a6b056ed2daef9a06da0ae28ce5bcdb351dbf Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 14 Oct 2020 22:42:58 +0200
Subject: [PATCH 346/583] PascalCase -> snake_case for consistency with the
 rest of the codebase.

---
 src/learn/learn.cpp                           |  28 +-
 src/misc.h                                    |   4 +-
 src/nnue/evaluate_nnue.cpp                    | 547 ++++++++++--------
 src/nnue/evaluate_nnue.h                      |  12 +-
 src/nnue/evaluate_nnue_learner.cpp            |  64 +-
 src/nnue/evaluate_nnue_learner.h              |  21 +-
 src/nnue/features/castling_right.cpp          |  15 +-
 src/nnue/features/castling_right.h            |  11 +-
 src/nnue/features/enpassant.cpp               |  14 +-
 src/nnue/features/enpassant.h                 |  11 +-
 src/nnue/features/feature_set.h               |  75 ++-
 src/nnue/features/half_kp.cpp                 |  33 +-
 src/nnue/features/half_kp.h                   |  15 +-
 src/nnue/features/half_relative_kp.cpp        |  35 +-
 src/nnue/features/half_relative_kp.h          |  17 +-
 src/nnue/features/k.cpp                       |  22 +-
 src/nnue/features/k.h                         |  51 +-
 src/nnue/features/p.cpp                       |  22 +-
 src/nnue/features/p.h                         |  51 +-
 src/nnue/layers/affine_transform.h            |  26 +-
 src/nnue/layers/clipped_relu.h                |  22 +-
 src/nnue/layers/input_slice.h                 |  10 +-
 src/nnue/layers/sum.h                         |  64 +-
 src/nnue/nnue_common.h                        |   2 +-
 src/nnue/nnue_feature_transformer.h           |  32 +-
 src/nnue/nnue_test_command.cpp                |  22 +-
 src/nnue/nnue_test_command.h                  |   2 +-
 src/nnue/trainer/features/factorizer.h        |  22 +-
 .../trainer/features/factorizer_feature_set.h |  34 +-
 .../trainer/features/factorizer_half_kp.h     |  24 +-
 src/nnue/trainer/trainer.h                    |  33 +-
 src/nnue/trainer/trainer_affine_transform.h   |  44 +-
 src/nnue/trainer/trainer_clipped_relu.h       |  26 +-
 .../trainer/trainer_feature_transformer.h     |  78 +--
 src/nnue/trainer/trainer_input_slice.h        |  42 +-
 src/nnue/trainer/trainer_sum.h                |  48 +-
 src/uci.cpp                                   |   2 +-
 37 files changed, 844 insertions(+), 737 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 205b9220..dfbba391 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -964,7 +964,7 @@ namespace Learner
 
                         // Lock the evaluation function so that it is not used during updating.
                         lock_guard<shared_timed_mutex> write_lock(nn_mutex);
-                        Eval::NNUE::UpdateParameters();
+                        Eval::NNUE::update_parameters();
                     }
 
                     ++epoch;
@@ -998,7 +998,7 @@ namespace Learner
                         // loss calculation
                         calc_loss(thread_id, done);
 
-                        Eval::NNUE::CheckHealth();
+                        Eval::NNUE::check_health();
 
                         // Make a note of how far you have totaled.
                         sr.last_done = sr.total_done;
@@ -1127,7 +1127,7 @@ namespace Learner
                 learn_sum_entropy_win += learn_entropy_win;
                 learn_sum_entropy += learn_entropy;
 
-                Eval::NNUE::AddExample(pos, rootColor, ps, 1.0);
+                Eval::NNUE::add_example(pos, rootColor, ps, 1.0);
 
                 // Since the processing is completed, the counter of the processed number is incremented
                 sr.total_done++;
@@ -1194,7 +1194,7 @@ namespace Learner
                 {
                     cout << " < best (" << best_loss << "), accepted" << endl;
                     best_loss = latest_loss;
-                    best_nn_directory = Path::Combine((std::string)Options["EvalSaveDir"], dir_name);
+                    best_nn_directory = Path::combine((std::string)Options["EvalSaveDir"], dir_name);
                     trials = newbob_num_trials;
 
                     if (tot >= last_lr_drop + auto_lr_drop)
@@ -1207,13 +1207,13 @@ namespace Learner
                 {
                     cout << " < best (" << best_loss << "), accepted" << endl;
                     best_loss = latest_loss;
-                    best_nn_directory = Path::Combine((std::string)Options["EvalSaveDir"], dir_name);
+                    best_nn_directory = Path::combine((std::string)Options["EvalSaveDir"], dir_name);
                     trials = newbob_num_trials;
                 }
                 else
                 {
                     cout << " >= best (" << best_loss << "), rejected" << endl;
-                    best_nn_directory = Path::Combine((std::string)Options["EvalSaveDir"], dir_name);
+                    best_nn_directory = Path::combine((std::string)Options["EvalSaveDir"], dir_name);
 
                     if (--trials > 0 && !is_final)
                     {
@@ -1713,14 +1713,14 @@ namespace Learner
         // Display learning game file
         if (target_dir != "")
         {
-            string kif_base_dir = Path::Combine(base_dir, target_dir);
+            string kif_base_dir = Path::combine(base_dir, target_dir);
 
             namespace sys = std::filesystem;
             sys::path p(kif_base_dir); // Origin of enumeration
             std::for_each(sys::directory_iterator(p), sys::directory_iterator(),
                 [&](const sys::path& path) {
                     if (sys::is_regular_file(path))
-                        filenames.push_back(Path::Combine(target_dir, path.filename().generic_string()));
+                        filenames.push_back(Path::combine(target_dir, path.filename().generic_string()));
                 });
         }
 
@@ -1814,7 +1814,7 @@ namespace Learner
             // order so I'll reverse it here. I'm sorry.
             for (auto it = filenames.rbegin(); it != filenames.rend(); ++it)
             {
-                sr.filenames.push_back(Path::Combine(base_dir, *it));
+                sr.filenames.push_back(Path::combine(base_dir, *it));
             }
         }
 
@@ -1858,9 +1858,9 @@ namespace Learner
         set_learning_search_limits();
 
         cout << "init_training.." << endl;
-        Eval::NNUE::InitializeTraining(seed);
-        Eval::NNUE::SetBatchSize(nn_batch_size);
-        Eval::NNUE::SetOptions(nn_options);
+        Eval::NNUE::initialize_training(seed);
+        Eval::NNUE::set_batch_size(nn_batch_size);
+        Eval::NNUE::set_options(nn_options);
         if (newbob_decay != 1.0 && !Options["SkipLoadingEval"]) {
             // Save the current net to [EvalSaveDir]\original.
             Eval::NNUE::save_eval("original");
@@ -1868,7 +1868,7 @@ namespace Learner
             // Set the folder above to best_nn_directory so that the trainer can
             // resotre the network parameters from the original net file.
             learn_think.best_nn_directory =
-                Path::Combine(Options["EvalSaveDir"], "original");
+                Path::combine(Options["EvalSaveDir"], "original");
         }
 
         cout << "init done." << endl;
@@ -1925,7 +1925,7 @@ namespace Learner
         // Start learning.
         learn_think.go_think();
 
-        Eval::NNUE::FinalizeNet();
+        Eval::NNUE::finalize_net();
 
         // Save once at the end.
         learn_think.save(true);
diff --git a/src/misc.h b/src/misc.h
index ae1d69d4..320eea76 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -299,7 +299,7 @@ struct Path
 {
 	// Combine the path name and file name and return it.
 	// If the folder name is not an empty string, append it if there is no'/' or'\\' at the end.
-	static std::string Combine(const std::string& folder, const std::string& filename)
+	static std::string combine(const std::string& folder, const std::string& filename)
 	{
 		if (folder.length() >= 1 && *folder.rbegin() != '/' && *folder.rbegin() != '\\')
 			return folder + "/" + filename;
@@ -308,7 +308,7 @@ struct Path
 	}
 
 	// Get the file name part (excluding the folder name) from the full path expression.
-	static std::string GetFileName(const std::string& path)
+	static std::string get_file_name(const std::string& path)
 	{
 		// I don't know which "\" or "/" is used.
 		auto path_index1 = path.find_last_of("\\") + 1;
diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index 0d504468..67398f81 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -1,303 +1,338 @@
 /*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
 
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
 
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 // Code for calculating NNUE evaluation function
 
+#include "evaluate_nnue.h"
+
+#include "position.h"
+#include "misc.h"
+#include "uci.h"
+#include "types.h"
+
 #include <iostream>
 #include <string>
 #include <fstream>
 #include <set>
 
-#include "../position.h"
-#include "../misc.h"
-#include "../uci.h"
-#include "../types.h"
-
-#include "evaluate_nnue.h"
-
 namespace Eval::NNUE {
 
-  const uint32_t kpp_board_index[PIECE_NB][COLOR_NB] = {
-   // convention: W - us, B - them
-   // viewed from other side, W and B are reversed
-      { PS_NONE,     PS_NONE     },
-      { PS_W_PAWN,   PS_B_PAWN   },
-      { PS_W_KNIGHT, PS_B_KNIGHT },
-      { PS_W_BISHOP, PS_B_BISHOP },
-      { PS_W_ROOK,   PS_B_ROOK   },
-      { PS_W_QUEEN,  PS_B_QUEEN  },
-      { PS_W_KING,   PS_B_KING   },
-      { PS_NONE,     PS_NONE     },
-      { PS_NONE,     PS_NONE     },
-      { PS_B_PAWN,   PS_W_PAWN   },
-      { PS_B_KNIGHT, PS_W_KNIGHT },
-      { PS_B_BISHOP, PS_W_BISHOP },
-      { PS_B_ROOK,   PS_W_ROOK   },
-      { PS_B_QUEEN,  PS_W_QUEEN  },
-      { PS_B_KING,   PS_W_KING   },
-      { PS_NONE,     PS_NONE     }
-  };
+    const uint32_t kpp_board_index[PIECE_NB][COLOR_NB] = {
+        // convention: W - us, B - them
+        // viewed from other side, W and B are reversed
+        { PS_NONE,     PS_NONE     },
+        { PS_W_PAWN,   PS_B_PAWN   },
+        { PS_W_KNIGHT, PS_B_KNIGHT },
+        { PS_W_BISHOP, PS_B_BISHOP },
+        { PS_W_ROOK,   PS_B_ROOK   },
+        { PS_W_QUEEN,  PS_B_QUEEN  },
+        { PS_W_KING,   PS_B_KING   },
+        { PS_NONE,     PS_NONE     },
+        { PS_NONE,     PS_NONE     },
+        { PS_B_PAWN,   PS_W_PAWN   },
+        { PS_B_KNIGHT, PS_W_KNIGHT },
+        { PS_B_BISHOP, PS_W_BISHOP },
+        { PS_B_ROOK,   PS_W_ROOK   },
+        { PS_B_QUEEN,  PS_W_QUEEN  },
+        { PS_B_KING,   PS_W_KING   },
+        { PS_NONE,     PS_NONE     }
+    };
 
-  // Input feature converter
-  LargePagePtr<FeatureTransformer> feature_transformer;
+    // Input feature converter
+    LargePagePtr<FeatureTransformer> feature_transformer;
 
-  // Evaluation function
-  AlignedPtr<Network> network;
+    // Evaluation function
+    AlignedPtr<Network> network;
 
-  // Evaluation function file name
-  std::string fileName;
+    // Evaluation function file name
+    std::string fileName;
 
-  // Saved evaluation function file name
-  std::string savedfileName = "nn.bin";
+    // Saved evaluation function file name
+    std::string savedfileName = "nn.bin";
 
-  // Get a string that represents the structure of the evaluation function
-  std::string GetArchitectureString() {
-    return "Features=" + FeatureTransformer::GetStructureString() +
-      ",Network=" + Network::GetStructureString();
-  }
-
-  UseNNUEMode useNNUE;
-  std::string eval_file_loaded = "None";
-
-  namespace Detail {
-
-  // Initialize the evaluation function parameters
-  template <typename T>
-  void Initialize(AlignedPtr<T>& pointer) {
-
-    pointer.reset(reinterpret_cast<T*>(std_aligned_alloc(alignof(T), sizeof(T))));
-    std::memset(pointer.get(), 0, sizeof(T));
-  }
-
-  template <typename T>
-  void Initialize(LargePagePtr<T>& pointer) {
-
-    static_assert(alignof(T) <= 4096, "aligned_large_pages_alloc() may fail for such a big alignment requirement of T");
-    pointer.reset(reinterpret_cast<T*>(aligned_large_pages_alloc(sizeof(T))));
-    std::memset(pointer.get(), 0, sizeof(T));
-  }
-
-  // Read evaluation function parameters
-  template <typename T>
-  bool ReadParameters(std::istream& stream, T& reference) {
-
-    std::uint32_t header;
-    header = read_little_endian<std::uint32_t>(stream);
-    if (!stream || header != T::GetHashValue()) return false;
-    return reference.ReadParameters(stream);
-  }
-
-  // write evaluation function parameters
-  template <typename T>
-  bool WriteParameters(std::ostream& stream, const AlignedPtr<T>& pointer) {
-    constexpr std::uint32_t header = T::GetHashValue();
-    stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
-    return pointer->WriteParameters(stream);
-  }
-
-  template <typename T>
-  bool WriteParameters(std::ostream& stream, const LargePagePtr<T>& pointer) {
-    constexpr std::uint32_t header = T::GetHashValue();
-    stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
-    return pointer->WriteParameters(stream);
-  }
-
-  }  // namespace Detail
-
-  // Initialize the evaluation function parameters
-  void Initialize() {
-
-    Detail::Initialize(feature_transformer);
-    Detail::Initialize(network);
-  }
-
-  // Read network header
-  bool ReadHeader(std::istream& stream, std::uint32_t* hash_value, std::string* architecture)
-  {
-    std::uint32_t version, size;
-
-    version     = read_little_endian<std::uint32_t>(stream);
-    *hash_value = read_little_endian<std::uint32_t>(stream);
-    size        = read_little_endian<std::uint32_t>(stream);
-    if (!stream || version != kVersion) return false;
-    architecture->resize(size);
-    stream.read(&(*architecture)[0], size);
-    return !stream.fail();
-  }
-
-  // write the header
-  bool WriteHeader(std::ostream& stream,
-    std::uint32_t hash_value, const std::string& architecture) {
-    stream.write(reinterpret_cast<const char*>(&kVersion), sizeof(kVersion));
-    stream.write(reinterpret_cast<const char*>(&hash_value), sizeof(hash_value));
-    const std::uint32_t size = static_cast<std::uint32_t>(architecture.size());
-    stream.write(reinterpret_cast<const char*>(&size), sizeof(size));
-    stream.write(architecture.data(), size);
-    return !stream.fail();
-  }
-
-  // Read network parameters
-  bool ReadParameters(std::istream& stream) {
-
-    std::uint32_t hash_value;
-    std::string architecture;
-    if (!ReadHeader(stream, &hash_value, &architecture)) return false;
-    if (hash_value != kHashValue) return false;
-    if (!Detail::ReadParameters(stream, *feature_transformer)) return false;
-    if (!Detail::ReadParameters(stream, *network)) return false;
-    return stream && stream.peek() == std::ios::traits_type::eof();
-  }
-  // write evaluation function parameters
-  bool WriteParameters(std::ostream& stream) {
-    if (!WriteHeader(stream, kHashValue, GetArchitectureString())) return false;
-    if (!Detail::WriteParameters(stream, feature_transformer)) return false;
-    if (!Detail::WriteParameters(stream, network)) return false;
-    return !stream.fail();
-  }
-  // Evaluation function. Perform differential calculation.
-  Value evaluate(const Position& pos) {
-
-    alignas(kCacheLineSize) TransformedFeatureType
-        transformed_features[FeatureTransformer::kBufferSize];
-    feature_transformer->Transform(pos, transformed_features);
-    alignas(kCacheLineSize) char buffer[Network::kBufferSize];
-    const auto output = network->Propagate(transformed_features, buffer);
-
-    return static_cast<Value>(output[0] / FV_SCALE);
-  }
-
-  // Load eval, from a file stream or a memory stream
-  bool load_eval(std::string name, std::istream& stream) {
-
-    Initialize();
-
-    fileName = name;
-    return ReadParameters(stream);
-  }
-
-  static UseNNUEMode nnue_mode_from_option(const UCI::Option& mode)
-  {
-    if (mode == "false")
-      return UseNNUEMode::False;
-    else if (mode == "true")
-      return UseNNUEMode::True;
-    else if (mode == "pure")
-      return UseNNUEMode::Pure;
-
-    return UseNNUEMode::False;
-  }
-
-  void init() {
-
-    useNNUE = nnue_mode_from_option(Options["Use NNUE"]);
-
-    if (Options["SkipLoadingEval"] || useNNUE == UseNNUEMode::False)
-    {
-      eval_file_loaded.clear();
-      return;
+    // Get a string that represents the structure of the evaluation function
+    std::string get_architecture_string() {
+        return "Features=" + FeatureTransformer::get_structure_string() +
+            ",Network=" + Network::get_structure_string();
     }
 
-    std::string eval_file = std::string(Options["EvalFile"]);
+    UseNNUEMode useNNUE;
+    std::string eval_file_loaded = "None";
+
+    namespace Detail {
+
+        // Initialize the evaluation function parameters
+        template <typename T>
+        void initialize(AlignedPtr<T>& pointer) {
+
+            pointer.reset(reinterpret_cast<T*>(std_aligned_alloc(alignof(T), sizeof(T))));
+            std::memset(pointer.get(), 0, sizeof(T));
+        }
+
+        template <typename T>
+        void initialize(LargePagePtr<T>& pointer) {
+
+            static_assert(alignof(T) <= 4096, "aligned_large_pages_alloc() may fail for such a big alignment requirement of T");
+
+            pointer.reset(reinterpret_cast<T*>(aligned_large_pages_alloc(sizeof(T))));
+            std::memset(pointer.get(), 0, sizeof(T));
+        }
+
+        // Read evaluation function parameters
+        template <typename T>
+        bool read_parameters(std::istream& stream, T& reference) {
+
+            std::uint32_t header;
+            header = read_little_endian<std::uint32_t>(stream);
+
+            if (!stream || header != T::get_hash_value())
+                return false;
+
+            return reference.read_parameters(stream);
+        }
+
+        // write evaluation function parameters
+        template <typename T>
+        bool write_parameters(std::ostream& stream, const AlignedPtr<T>& pointer) {
+            constexpr std::uint32_t header = T::get_hash_value();
+
+            stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
+
+            return pointer->write_parameters(stream);
+        }
+
+        template <typename T>
+        bool write_parameters(std::ostream& stream, const LargePagePtr<T>& pointer) {
+            constexpr std::uint32_t header = T::get_hash_value();
+
+            stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
+
+            return pointer->write_parameters(stream);
+        }
+    }  // namespace Detail
+
+    // Initialize the evaluation function parameters
+    void initialize() {
+
+        Detail::initialize(feature_transformer);
+        Detail::initialize(network);
+    }
+
+    // Read network header
+    bool read_header(std::istream& stream, std::uint32_t* hash_value, std::string* architecture)
+    {
+        std::uint32_t version, size;
+
+        version     = read_little_endian<std::uint32_t>(stream);
+        *hash_value = read_little_endian<std::uint32_t>(stream);
+        size        = read_little_endian<std::uint32_t>(stream);
+
+        if (!stream || version != kVersion)
+            return false;
+
+        architecture->resize(size);
+        stream.read(&(*architecture)[0], size);
+
+        return !stream.fail();
+    }
+
+    // write the header
+    bool write_header(std::ostream& stream,
+        std::uint32_t hash_value, const std::string& architecture) {
+
+        stream.write(reinterpret_cast<const char*>(&kVersion), sizeof(kVersion));
+        stream.write(reinterpret_cast<const char*>(&hash_value), sizeof(hash_value));
+
+        const std::uint32_t size = static_cast<std::uint32_t>(architecture.size());
+
+        stream.write(reinterpret_cast<const char*>(&size), sizeof(size));
+        stream.write(architecture.data(), size);
+
+        return !stream.fail();
+    }
+
+    // Read network parameters
+    bool read_parameters(std::istream& stream) {
+
+        std::uint32_t hash_value;
+        std::string architecture;
+        if (!read_header(stream, &hash_value, &architecture))
+            return false;
+
+        if (hash_value != kHashValue)
+            return false;
+
+        if (!Detail::read_parameters(stream, *feature_transformer))
+            return false;
+
+        if (!Detail::read_parameters(stream, *network))
+            return false;
+
+        return stream && stream.peek() == std::ios::traits_type::eof();
+    }
+    // write evaluation function parameters
+    bool write_parameters(std::ostream& stream) {
+
+        if (!write_header(stream, kHashValue, get_architecture_string()))
+            return false;
+
+        if (!Detail::write_parameters(stream, feature_transformer))
+            return false;
+
+        if (!Detail::write_parameters(stream, network))
+            return false;
+
+        return !stream.fail();
+    }
+    // Evaluation function. Perform differential calculation.
+    Value evaluate(const Position& pos) {
+
+        alignas(kCacheLineSize) TransformedFeatureType
+            transformed_features[FeatureTransformer::kBufferSize];
+
+        feature_transformer->transform(pos, transformed_features);
+
+        alignas(kCacheLineSize) char buffer[Network::kBufferSize];
+
+        const auto output = network->propagate(transformed_features, buffer);
+
+        return static_cast<Value>(output[0] / FV_SCALE);
+    }
+
+    // Load eval, from a file stream or a memory stream
+    bool load_eval(std::string name, std::istream& stream) {
+
+        initialize();
+
+        fileName = name;
+        return read_parameters(stream);
+    }
+
+    static UseNNUEMode nnue_mode_from_option(const UCI::Option& mode)
+    {
+        if (mode == "false")
+          return UseNNUEMode::False;
+        else if (mode == "true")
+          return UseNNUEMode::True;
+        else if (mode == "pure")
+          return UseNNUEMode::Pure;
+
+        return UseNNUEMode::False;
+    }
+
+    void init() {
+
+        useNNUE = nnue_mode_from_option(Options["Use NNUE"]);
+
+        if (Options["SkipLoadingEval"] || useNNUE == UseNNUEMode::False)
+        {
+            eval_file_loaded.clear();
+            return;
+        }
+
+        std::string eval_file = std::string(Options["EvalFile"]);
 
 #if defined(DEFAULT_NNUE_DIRECTORY)
 #define stringify2(x) #x
 #define stringify(x) stringify2(x)
-    std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory , stringify(DEFAULT_NNUE_DIRECTORY) };
+        std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory , stringify(DEFAULT_NNUE_DIRECTORY) };
 #else
-    std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory };
+        std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory };
 #endif
 
-    for (std::string directory : dirs)
-        if (eval_file_loaded != eval_file)
+        for (std::string directory : dirs)
         {
-            std::ifstream stream(directory + eval_file, std::ios::binary);
-            if (load_eval(eval_file, stream))
+            if (eval_file_loaded != eval_file)
             {
-                sync_cout << "info string Loaded eval file " << directory + eval_file << sync_endl;
-                eval_file_loaded = eval_file;
-            }
-            else
-            {
-                sync_cout << "info string ERROR: failed to load eval file " << directory + eval_file << sync_endl;
-                eval_file_loaded.clear();
+                std::ifstream stream(directory + eval_file, std::ios::binary);
+                if (load_eval(eval_file, stream))
+                {
+                    sync_cout << "info string Loaded eval file " << directory + eval_file << sync_endl;
+                    eval_file_loaded = eval_file;
+                }
+                else
+                {
+                    sync_cout << "info string ERROR: failed to load eval file " << directory + eval_file << sync_endl;
+                    eval_file_loaded.clear();
+                }
             }
         }
 
 #undef stringify2
 #undef stringify
-  }
-
-  /// NNUE::verify() verifies that the last net used was loaded successfully
-  void verify_eval_file_loaded() {
-
-    std::string eval_file = std::string(Options["EvalFile"]);
-
-    if (useNNUE != UseNNUEMode::False && eval_file_loaded != eval_file)
-    {
-        UCI::OptionsMap defaults;
-        UCI::init(defaults);
-
-        std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
-        std::string msg2 = "The option is set to true, but the network file " + eval_file + " was not loaded successfully.";
-        std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
-        std::string msg4 = "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/" + std::string(defaults["EvalFile"]);
-        std::string msg5 = "The engine will be terminated now.";
-
-        sync_cout << "info string ERROR: " << msg1 << sync_endl;
-        sync_cout << "info string ERROR: " << msg2 << sync_endl;
-        sync_cout << "info string ERROR: " << msg3 << sync_endl;
-        sync_cout << "info string ERROR: " << msg4 << sync_endl;
-        sync_cout << "info string ERROR: " << msg5 << sync_endl;
-
-        std::exit(EXIT_FAILURE);
     }
 
-    if (useNNUE != UseNNUEMode::False)
-        sync_cout << "info string NNUE evaluation using " << eval_file << " enabled" << sync_endl;
-    else
-        sync_cout << "info string classical evaluation enabled" << sync_endl;
-  }
+    /// NNUE::verify() verifies that the last net used was loaded successfully
+    void verify_eval_file_loaded() {
 
-  /// In training we override eval file so this is useful.
-  void verify_any_net_loaded() {
+        std::string eval_file = std::string(Options["EvalFile"]);
 
-    if (!Options["SkipLoadingEval"] && useNNUE != UseNNUEMode::False && eval_file_loaded.empty())
-    {
-        UCI::OptionsMap defaults;
-        UCI::init(defaults);
+        if (useNNUE != UseNNUEMode::False && eval_file_loaded != eval_file)
+        {
+            UCI::OptionsMap defaults;
+            UCI::init(defaults);
 
-        std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
-        std::string msg2 = "The option is set to true, but the network file was not loaded successfully.";
-        std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
-        std::string msg5 = "The engine will be terminated now.";
+            std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
+            std::string msg2 = "The option is set to true, but the network file " + eval_file + " was not loaded successfully.";
+            std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
+            std::string msg4 = "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/" + std::string(defaults["EvalFile"]);
+            std::string msg5 = "The engine will be terminated now.";
 
-        sync_cout << "info string ERROR: " << msg1 << sync_endl;
-        sync_cout << "info string ERROR: " << msg2 << sync_endl;
-        sync_cout << "info string ERROR: " << msg3 << sync_endl;
-        sync_cout << "info string ERROR: " << msg5 << sync_endl;
+            sync_cout << "info string ERROR: " << msg1 << sync_endl;
+            sync_cout << "info string ERROR: " << msg2 << sync_endl;
+            sync_cout << "info string ERROR: " << msg3 << sync_endl;
+            sync_cout << "info string ERROR: " << msg4 << sync_endl;
+            sync_cout << "info string ERROR: " << msg5 << sync_endl;
 
-        std::exit(EXIT_FAILURE);
+            std::exit(EXIT_FAILURE);
+        }
+
+        if (useNNUE != UseNNUEMode::False)
+            sync_cout << "info string NNUE evaluation using " << eval_file << " enabled" << sync_endl;
+        else
+            sync_cout << "info string classical evaluation enabled" << sync_endl;
     }
 
-    if (useNNUE != UseNNUEMode::False)
-        sync_cout << "info string NNUE evaluation using " << eval_file_loaded << " enabled" << sync_endl;
-    else
-        sync_cout << "info string classical evaluation enabled" << sync_endl;
-  }
+    /// In training we override eval file so this is useful.
+    void verify_any_net_loaded() {
+
+        if (!Options["SkipLoadingEval"] && useNNUE != UseNNUEMode::False && eval_file_loaded.empty())
+        {
+            UCI::OptionsMap defaults;
+            UCI::init(defaults);
+
+            std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
+            std::string msg2 = "The option is set to true, but the network file was not loaded successfully.";
+            std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
+            std::string msg5 = "The engine will be terminated now.";
+
+            sync_cout << "info string ERROR: " << msg1 << sync_endl;
+            sync_cout << "info string ERROR: " << msg2 << sync_endl;
+            sync_cout << "info string ERROR: " << msg3 << sync_endl;
+            sync_cout << "info string ERROR: " << msg5 << sync_endl;
+
+            std::exit(EXIT_FAILURE);
+        }
+
+        if (useNNUE != UseNNUEMode::False)
+            sync_cout << "info string NNUE evaluation using " << eval_file_loaded << " enabled" << sync_endl;
+        else
+            sync_cout << "info string classical evaluation enabled" << sync_endl;
+    }
 
 } // namespace Eval::NNUE
diff --git a/src/nnue/evaluate_nnue.h b/src/nnue/evaluate_nnue.h
index 264d24fe..d0f61644 100644
--- a/src/nnue/evaluate_nnue.h
+++ b/src/nnue/evaluate_nnue.h
@@ -37,7 +37,7 @@ namespace Eval::NNUE {
 
     // Hash value of evaluation function structure
     constexpr std::uint32_t kHashValue =
-        FeatureTransformer::GetHashValue() ^ Network::GetHashValue();
+        FeatureTransformer::get_hash_value() ^ Network::get_hash_value();
 
     // Deleter for automating release of memory area
     template <typename T>
@@ -79,21 +79,21 @@ namespace Eval::NNUE {
     extern std::string eval_file_loaded;
 
     // Get a string that represents the structure of the evaluation function
-    std::string GetArchitectureString();
+    std::string get_architecture_string();
 
     // read the header
-    bool ReadHeader(std::istream& stream,
+    bool read_header(std::istream& stream,
         std::uint32_t* hash_value, std::string* architecture);
 
     // write the header
-    bool WriteHeader(std::ostream& stream,
+    bool write_header(std::ostream& stream,
         std::uint32_t hash_value, const std::string& architecture);
 
     // read evaluation function parameters
-    bool ReadParameters(std::istream& stream);
+    bool read_parameters(std::istream& stream);
 
     // write evaluation function parameters
-    bool WriteParameters(std::ostream& stream);
+    bool write_parameters(std::ostream& stream);
 
     Value evaluate(const Position& pos);
     bool load_eval(std::string name, std::istream& stream);
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 92ecd8d2..e0236781 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -44,9 +44,9 @@ namespace Eval::NNUE {
         std::shared_ptr<Trainer<Network>> trainer;
 
         // Tell the learner options such as hyperparameters
-        void SendMessages(std::vector<Message> messages) {
+        void send_messages(std::vector<Message> messages) {
             for (auto& message : messages) {
-                trainer->SendMessage(&message);
+                trainer->send_message(&message);
                 assert(message.num_receivers > 0);
             }
         }
@@ -54,31 +54,31 @@ namespace Eval::NNUE {
     }  // namespace
 
     // Initialize learning
-    void InitializeTraining(const std::string& seed) {
+    void initialize_training(const std::string& seed) {
         std::cout << "Initializing NN training for "
-                  << GetArchitectureString() << std::endl;
+                  << get_architecture_string() << std::endl;
 
         assert(feature_transformer);
         assert(network);
-        trainer = Trainer<Network>::Create(network.get(), feature_transformer.get());
+        trainer = Trainer<Network>::create(network.get(), feature_transformer.get());
         rng.seed(PRNG(seed).rand<uint64_t>());
 
         if (Options["SkipLoadingEval"]) {
-            trainer->Initialize(rng);
+            trainer->initialize(rng);
         }
     }
 
     // set the number of samples in the mini-batch
-    void SetBatchSize(uint64_t size) {
+    void set_batch_size(uint64_t size) {
         assert(size > 0);
         batch_size = size;
     }
 
     // Set options such as hyperparameters
-    void SetOptions(const std::string& options) {
+    void set_options(const std::string& options) {
         std::vector<Message> messages;
-        for (const auto& option : Split(options, ',')) {
-          const auto fields = Split(option, '=');
+        for (const auto& option : Algo::split(options, ',')) {
+          const auto fields = Algo::split(option, '=');
           assert(fields.size() == 1 || fields.size() == 2);
 
           if (fields.size() == 1) {
@@ -88,30 +88,30 @@ namespace Eval::NNUE {
           }
         }
 
-        SendMessages(std::move(messages));
+        send_messages(std::move(messages));
     }
 
     // Reread the evaluation function parameters for learning from the file
-    void RestoreParameters(const std::string& dir_name) {
-        const std::string file_name = Path::Combine(dir_name, NNUE::savedfileName);
+    void restore_parameters(const std::string& dir_name) {
+        const std::string file_name = Path::combine(dir_name, NNUE::savedfileName);
         std::ifstream stream(file_name, std::ios::binary);
 #ifndef NDEBUG
         bool result =
 #endif
-        ReadParameters(stream);
+        read_parameters(stream);
 #ifndef NDEBUG
         assert(result);
 #endif
 
-        SendMessages({{"reset"}});
+        send_messages({{"reset"}});
     }
 
-    void FinalizeNet() {
-        SendMessages({{"clear_unobserved_feature_weights"}});
+    void finalize_net() {
+        send_messages({{"clear_unobserved_feature_weights"}});
     }
 
     // Add 1 sample of learning data
-    void AddExample(Position& pos, Color rootColor,
+    void add_example(Position& pos, Color rootColor,
                     const Learner::PackedSfenValue& psv, double weight) {
 
         Example example;
@@ -126,7 +126,7 @@ namespace Eval::NNUE {
 
         Features::IndexList active_indices[2];
         for (const auto trigger : kRefreshTriggers) {
-            RawFeatures::AppendActiveIndices(pos, trigger, active_indices);
+            RawFeatures::append_active_indices(pos, trigger, active_indices);
         }
 
         if (pos.side_to_move() != WHITE) {
@@ -136,9 +136,9 @@ namespace Eval::NNUE {
         for (const auto color : Colors) {
             std::vector<TrainingFeature> training_features;
             for (const auto base_index : active_indices[color]) {
-                static_assert(Features::Factorizer<RawFeatures>::GetDimensions() <
+                static_assert(Features::Factorizer<RawFeatures>::get_dimensions() <
                               (1 << TrainingFeature::kIndexBits), "");
-                Features::Factorizer<RawFeatures>::AppendTrainingFeatures(
+                Features::Factorizer<RawFeatures>::append_training_features(
                     base_index, &training_features);
             }
 
@@ -147,7 +147,7 @@ namespace Eval::NNUE {
             auto& unique_features = example.training_features[color];
             for (const auto& feature : training_features) {
                 if (!unique_features.empty() &&
-                    feature.GetIndex() == unique_features.back().GetIndex()) {
+                    feature.get_index() == unique_features.back().get_index()) {
 
                     unique_features.back() += feature;
                 } else {
@@ -161,7 +161,7 @@ namespace Eval::NNUE {
     }
 
     // update the evaluation function parameters
-    void UpdateParameters() {
+    void update_parameters() {
         assert(batch_size > 0);
 
         const auto learning_rate = static_cast<LearnFloatType>(
@@ -173,30 +173,30 @@ namespace Eval::NNUE {
             std::vector<Example> batch(examples.end() - batch_size, examples.end());
             examples.resize(examples.size() - batch_size);
 
-            const auto network_output = trainer->Propagate(batch);
+            const auto network_output = trainer->propagate(batch);
 
             std::vector<LearnFloatType> gradients(batch.size());
             for (std::size_t b = 0; b < batch.size(); ++b) {
-                const auto shallow = static_cast<Value>(Round<std::int32_t>(
+                const auto shallow = static_cast<Value>(round<std::int32_t>(
                     batch[b].sign * network_output[b] * kPonanzaConstant));
                 const auto& psv = batch[b].psv;
                 const double gradient = batch[b].sign * Learner::calc_grad(shallow, psv);
                 gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
             }
 
-            trainer->Backpropagate(gradients.data(), learning_rate);
+            trainer->backpropagate(gradients.data(), learning_rate);
         }
-        SendMessages({{"quantize_parameters"}});
+        send_messages({{"quantize_parameters"}});
     }
 
     // Check if there are any problems with learning
-    void CheckHealth() {
-        SendMessages({{"check_health"}});
+    void check_health() {
+        send_messages({{"check_health"}});
     }
 
     // save merit function parameters to a file
     void save_eval(std::string dir_name) {
-        auto eval_dir = Path::Combine(Options["EvalSaveDir"], dir_name);
+        auto eval_dir = Path::combine(Options["EvalSaveDir"], dir_name);
         std::cout << "save_eval() start. folder = " << eval_dir << std::endl;
 
         // mkdir() will fail if this folder already exists, but
@@ -204,12 +204,12 @@ namespace Eval::NNUE {
         // Also, assume that the folders up to EvalSaveDir have been dug.
         std::filesystem::create_directories(eval_dir);
 
-        const std::string file_name = Path::Combine(eval_dir, NNUE::savedfileName);
+        const std::string file_name = Path::combine(eval_dir, NNUE::savedfileName);
         std::ofstream stream(file_name, std::ios::binary);
 #ifndef NDEBUG
         bool result =
 #endif
-        WriteParameters(stream);
+        write_parameters(stream);
 #ifndef NDEBUG
         assert(result);
 #endif
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index 525b286a..431fb02e 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -7,28 +7,31 @@
 namespace Eval::NNUE {
 
     // Initialize learning
-    void InitializeTraining(const std::string& seed);
+    void initialize_training(const std::string& seed);
 
     // set the number of samples in the mini-batch
-    void SetBatchSize(uint64_t size);
+    void set_batch_size(uint64_t size);
 
     // Set options such as hyperparameters
-    void SetOptions(const std::string& options);
+    void set_options(const std::string& options);
 
     // Reread the evaluation function parameters for learning from the file
-    void RestoreParameters(const std::string& dir_name);
+    void restore_parameters(const std::string& dir_name);
 
     // Add 1 sample of learning data
-    void AddExample(Position& pos, Color rootColor,
-    	 const Learner::PackedSfenValue& psv, double weight);
+    void add_example(
+        Position& pos,
+        Color rootColor,
+    	const Learner::PackedSfenValue& psv,
+        double weight);
 
     // update the evaluation function parameters
-    void UpdateParameters();
+    void update_parameters();
 
     // Check if there are any problems with learning
-    void CheckHealth();
+    void check_health();
 
-    void FinalizeNet();
+    void finalize_net();
 
     void save_eval(std::string suffix);
 }  // namespace Eval::NNUE
diff --git a/src/nnue/features/castling_right.cpp b/src/nnue/features/castling_right.cpp
index eb8a36a1..cbac0851 100644
--- a/src/nnue/features/castling_right.cpp
+++ b/src/nnue/features/castling_right.cpp
@@ -5,8 +5,11 @@
 namespace Eval::NNUE::Features {
 
     // Get a list of indices with a value of 1 among the features
-    void CastlingRight::AppendActiveIndices(
-        const Position& pos, Color perspective, IndexList* active) {
+    void CastlingRight::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
+
         // do nothing if array size is small to avoid compiler warning
         if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
 
@@ -29,9 +32,11 @@ namespace Eval::NNUE::Features {
     }
 
     // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-    void CastlingRight::AppendChangedIndices(
-        const Position& pos, Color perspective,
-        IndexList* removed, IndexList* /* added */) {
+    void CastlingRight::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* /* added */) {
 
         int previous_castling_rights = pos.state()->previous->castlingRights;
         int current_castling_rights = pos.state()->castlingRights;
diff --git a/src/nnue/features/castling_right.h b/src/nnue/features/castling_right.h
index 3e35e432..cada24b6 100644
--- a/src/nnue/features/castling_right.h
+++ b/src/nnue/features/castling_right.h
@@ -26,12 +26,17 @@ namespace Eval::NNUE::Features {
         static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
 
         // Get a list of indices with a value of 1 among the features
-        static void AppendActiveIndices(const Position& pos, Color perspective,
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
             IndexList* active);
 
         // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-        static void AppendChangedIndices(const Position& pos, Color perspective,
-            IndexList* removed, IndexList* added);
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
     };
 
 }  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/enpassant.cpp b/src/nnue/features/enpassant.cpp
index 7aa8988b..06ba2d49 100644
--- a/src/nnue/features/enpassant.cpp
+++ b/src/nnue/features/enpassant.cpp
@@ -5,8 +5,10 @@
 namespace Eval::NNUE::Features {
 
     // Get a list of indices with a value of 1 among the features
-    void EnPassant::AppendActiveIndices(
-        const Position& pos, Color /* perspective */, IndexList* active) {
+    void EnPassant::append_active_indices(
+        const Position& pos,
+        Color /* perspective */,
+        IndexList* active) {
 
         // do nothing if array size is small to avoid compiler warning
         if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions)
@@ -21,9 +23,11 @@ namespace Eval::NNUE::Features {
     }
 
     // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-    void EnPassant::AppendChangedIndices(
-        const Position& pos, Color /* perspective */,
-        IndexList* removed, IndexList* added) {
+    void EnPassant::append_changed_indices(
+        const Position& pos,
+        Color /* perspective */,
+        IndexList* removed,
+        IndexList* added) {
 
         auto previous_epSquare = pos.state()->previous->epSquare;
         auto epSquare = pos.state()->epSquare;
diff --git a/src/nnue/features/enpassant.h b/src/nnue/features/enpassant.h
index 65819a96..6ccb6046 100644
--- a/src/nnue/features/enpassant.h
+++ b/src/nnue/features/enpassant.h
@@ -22,12 +22,17 @@ namespace Eval::NNUE::Features {
         static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
 
         // Get a list of indices with a value of 1 among the features
-        static void AppendActiveIndices(const Position& pos, Color perspective,
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
             IndexList* active);
 
         // Get a list of indices whose values have changed from the previous one in the feature quantity
-        static void AppendChangedIndices(const Position& pos, Color perspective,
-            IndexList* removed, IndexList* added);
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
     };
 
 }  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/feature_set.h b/src/nnue/features/feature_set.h
index 5b243424..32ef24ef 100644
--- a/src/nnue/features/feature_set.h
+++ b/src/nnue/features/feature_set.h
@@ -33,8 +33,8 @@ namespace Eval::NNUE::Features {
 
     template <typename T, T First, T... Remaining>
     struct CompileTimeList<T, First, Remaining...> {
-        static constexpr bool Contains(T value) {
-            return value == First || CompileTimeList<T, Remaining...>::Contains(value);
+        static constexpr bool contains(T value) {
+            return value == First || CompileTimeList<T, Remaining...>::contains(value);
         }
 
         static constexpr std::array<T, sizeof...(Remaining) + 1>
@@ -47,7 +47,7 @@ namespace Eval::NNUE::Features {
 
     template <typename T>
     struct CompileTimeList<T> {
-        static constexpr bool Contains(T /*value*/) {
+        static constexpr bool contains(T /*value*/) {
             return false;
         }
         static constexpr std::array<T, 0> kValues = { {} };
@@ -70,7 +70,7 @@ namespace Eval::NNUE::Features {
     struct InsertToSet<T, CompileTimeList<T, First, Remaining...>, AnotherValue> {
         using Result =
             std::conditional_t<
-                CompileTimeList<T, First, Remaining...>::Contains(AnotherValue),
+                CompileTimeList<T, First, Remaining...>::contains(AnotherValue),
                 CompileTimeList<T, First, Remaining...>,
                 std::conditional_t<
                     (AnotherValue < First),
@@ -95,20 +95,23 @@ namespace Eval::NNUE::Features {
        public:
         // Get a list of indices for active features
         template <typename IndexListType>
-        static void AppendActiveIndices(
+        static void append_active_indices(
             const Position& pos, TriggerEvent trigger, IndexListType active[2]) {
 
             for (Color perspective : { WHITE, BLACK }) {
-                Derived::CollectActiveIndices(
+                Derived::collect_active_indices(
                     pos, trigger, perspective, &active[perspective]);
             }
         }
 
         // Get a list of indices for recently changed features
         template <typename PositionType, typename IndexListType>
-        static void AppendChangedIndices(
-            const PositionType& pos, TriggerEvent trigger,
-            IndexListType removed[2], IndexListType added[2], bool reset[2]) {
+        static void append_changed_indices(
+            const PositionType& pos,
+            TriggerEvent trigger,
+            IndexListType removed[2],
+            IndexListType added[2],
+            bool reset[2]) {
 
             const auto& dp = pos.state()->dirtyPiece;
 
@@ -137,10 +140,10 @@ namespace Eval::NNUE::Features {
                 }
 
                 if (reset[perspective]) {
-                    Derived::CollectActiveIndices(
+                    Derived::collect_active_indices(
                         pos, trigger, perspective, &added[perspective]);
                 } else {
-                    Derived::CollectChangedIndices(
+                    Derived::collect_changed_indices(
                         pos, trigger, perspective,
                         &removed[perspective], &added[perspective]);
                 }
@@ -180,20 +183,23 @@ namespace Eval::NNUE::Features {
         static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
 
         // Get the feature quantity name
-        static std::string GetName() {
-            return std::string(Head::kName) + "+" + Tail::GetName();
+        static std::string get_name() {
+            return std::string(Head::kName) + "+" + Tail::get_name();
         }
 
     private:
         // Get a list of indices with a value of 1 among the features
         template <typename IndexListType>
-        static void CollectActiveIndices(
-            const Position& pos, const TriggerEvent trigger, const Color perspective,
+        static void collect_active_indices(
+            const Position& pos,
+            const TriggerEvent trigger,
+            const Color perspective,
             IndexListType* const active) {
-            Tail::CollectActiveIndices(pos, trigger, perspective, active);
+
+            Tail::collect_active_indices(pos, trigger, perspective, active);
             if (Head::kRefreshTrigger == trigger) {
                 const auto start = active->size();
-                Head::AppendActiveIndices(pos, perspective, active);
+                Head::append_active_indices(pos, perspective, active);
 
                 for (auto i = start; i < active->size(); ++i) {
                     (*active)[i] += Tail::kDimensions;
@@ -203,14 +209,18 @@ namespace Eval::NNUE::Features {
 
         // Get a list of indices whose values have changed from the previous one in the feature quantity
         template <typename IndexListType>
-        static void CollectChangedIndices(
-            const Position& pos, const TriggerEvent trigger, const Color perspective,
-            IndexListType* const removed, IndexListType* const added) {
-            Tail::CollectChangedIndices(pos, trigger, perspective, removed, added);
+        static void collect_changed_indices(
+            const Position& pos,
+            const TriggerEvent trigger,
+            const Color perspective,
+            IndexListType* const removed,
+            IndexListType* const added) {
+
+            Tail::collect_changed_indices(pos, trigger, perspective, removed, added);
             if (Head::kRefreshTrigger == trigger) {
                 const auto start_removed = removed->size();
                 const auto start_added = added->size();
-                Head::AppendChangedIndices(pos, perspective, removed, added);
+                Head::append_changed_indices(pos, perspective, removed, added);
 
                 for (auto i = start_removed; i < removed->size(); ++i) {
                     (*removed)[i] += Tail::kDimensions;
@@ -251,28 +261,33 @@ namespace Eval::NNUE::Features {
         static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
 
         // Get the feature quantity name
-        static std::string GetName() {
+        static std::string get_name() {
             return FeatureType::kName;
         }
 
     private:
         // Get a list of indices for active features
-        static void CollectActiveIndices(
-            const Position& pos, const TriggerEvent trigger, const Color perspective,
+        static void collect_active_indices(
+            const Position& pos,
+            const TriggerEvent trigger,
+            const Color perspective,
             IndexList* const active) {
 
             if (FeatureType::kRefreshTrigger == trigger) {
-              FeatureType::AppendActiveIndices(pos, perspective, active);
+              FeatureType::append_active_indices(pos, perspective, active);
             }
         }
 
         // Get a list of indices for recently changed features
-        static void CollectChangedIndices(
-            const Position& pos, const TriggerEvent trigger, const Color perspective,
-            IndexList* const removed, IndexList* const added) {
+        static void collect_changed_indices(
+            const Position& pos,
+            const TriggerEvent trigger,
+            const Color perspective,
+            IndexList* const removed,
+            IndexList* const added) {
 
             if (FeatureType::kRefreshTrigger == trigger) {
-              FeatureType::AppendChangedIndices(pos, perspective, removed, added);
+              FeatureType::append_changed_indices(pos, perspective, removed, added);
             }
         }
 
diff --git a/src/nnue/features/half_kp.cpp b/src/nnue/features/half_kp.cpp
index 17b50472..18e82004 100644
--- a/src/nnue/features/half_kp.cpp
+++ b/src/nnue/features/half_kp.cpp
@@ -30,30 +30,41 @@ namespace Eval::NNUE::Features {
 
     // Find the index of the feature quantity from the king position and PieceSquare
     template <Side AssociatedKing>
-    inline IndexType HalfKP<AssociatedKing>::MakeIndex(
-        Color perspective, Square s, Piece pc, Square ksq) {
+    inline IndexType HalfKP<AssociatedKing>::make_index(
+        Color perspective,
+        Square s,
+        Piece pc,
+        Square ksq) {
 
         return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective] + PS_END * ksq);
     }
 
     // Get a list of indices for active features
     template <Side AssociatedKing>
-    void HalfKP<AssociatedKing>::AppendActiveIndices(
-        const Position& pos, Color perspective, IndexList* active) {
+    void HalfKP<AssociatedKing>::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
+
+        Square ksq = orient(
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));
 
-        Square ksq = orient(perspective, pos.square<KING>(AssociatedKing == Side::kFriend ? perspective : ~perspective));
         Bitboard bb = pos.pieces() & ~pos.pieces(KING);
         while (bb) {
             Square s = pop_lsb(&bb);
-            active->push_back(MakeIndex(perspective, s, pos.piece_on(s), ksq));
+            active->push_back(make_index(perspective, s, pos.piece_on(s), ksq));
         }
     }
 
     // Get a list of indices for recently changed features
     template <Side AssociatedKing>
-    void HalfKP<AssociatedKing>::AppendChangedIndices(
-        const Position& pos, Color perspective,
-        IndexList* removed, IndexList* added) {
+    void HalfKP<AssociatedKing>::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {
 
         Square ksq = orient(
             perspective,
@@ -68,10 +79,10 @@ namespace Eval::NNUE::Features {
                 continue;
 
             if (dp.from[i] != SQ_NONE)
-                removed->push_back(MakeIndex(perspective, dp.from[i], pc, ksq));
+                removed->push_back(make_index(perspective, dp.from[i], pc, ksq));
 
             if (dp.to[i] != SQ_NONE)
-                added->push_back(MakeIndex(perspective, dp.to[i], pc, ksq));
+                added->push_back(make_index(perspective, dp.to[i], pc, ksq));
         }
     }
 
diff --git a/src/nnue/features/half_kp.h b/src/nnue/features/half_kp.h
index 834f800e..4a4329e8 100644
--- a/src/nnue/features/half_kp.h
+++ b/src/nnue/features/half_kp.h
@@ -53,16 +53,21 @@ namespace Eval::NNUE::Features {
             TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
 
         // Get a list of indices for active features
-        static void AppendActiveIndices(const Position& pos, Color perspective,
-                                        IndexList* active);
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
 
         // Get a list of indices for recently changed features
-        static void AppendChangedIndices(const Position& pos, Color perspective,
-                                         IndexList* removed, IndexList* added);
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
 
     private:
         // Index of a feature for a given king position and another piece on some square
-        static IndexType MakeIndex(Color perspective, Square s, Piece pc, Square sq_k);
+        static IndexType make_index(Color perspective, Square s, Piece pc, Square sq_k);
     };
 
 }  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/half_relative_kp.cpp b/src/nnue/features/half_relative_kp.cpp
index 5ab22890..240e20c0 100644
--- a/src/nnue/features/half_relative_kp.cpp
+++ b/src/nnue/features/half_relative_kp.cpp
@@ -11,16 +11,21 @@ namespace Eval::NNUE::Features {
 
     // Find the index of the feature quantity from the ball position and PieceSquare
     template <Side AssociatedKing>
-    inline IndexType HalfRelativeKP<AssociatedKing>::MakeIndex(
-        Color perspective, Square s, Piece pc, Square sq_k) {
+    inline IndexType HalfRelativeKP<AssociatedKing>::make_index(
+        Color perspective,
+        Square s,
+        Piece pc,
+        Square sq_k) {
+
         const IndexType p = IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
-        return MakeIndex(sq_k, p);
+        return make_index(sq_k, p);
     }
 
     // Find the index of the feature quantity from the ball position and PieceSquare
     template <Side AssociatedKing>
-    inline IndexType HalfRelativeKP<AssociatedKing>::MakeIndex(
-        Square sq_k, IndexType p) {
+    inline IndexType HalfRelativeKP<AssociatedKing>::make_index(
+        Square sq_k,
+        IndexType p) {
 
         constexpr IndexType W = kBoardWidth;
         constexpr IndexType H = kBoardHeight;
@@ -33,8 +38,10 @@ namespace Eval::NNUE::Features {
 
     // Get a list of indices with a value of 1 among the features
     template <Side AssociatedKing>
-    void HalfRelativeKP<AssociatedKing>::AppendActiveIndices(
-        const Position& pos, Color perspective, IndexList* active) {
+    void HalfRelativeKP<AssociatedKing>::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
 
         Square ksq = orient(
             perspective,
@@ -44,15 +51,17 @@ namespace Eval::NNUE::Features {
         Bitboard bb = pos.pieces() & ~pos.pieces(KING);
         while (bb) {
             Square s = pop_lsb(&bb);
-            active->push_back(MakeIndex(perspective, s, pos.piece_on(s), ksq));
+            active->push_back(make_index(perspective, s, pos.piece_on(s), ksq));
         }
     }
 
     // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
     template <Side AssociatedKing>
-    void HalfRelativeKP<AssociatedKing>::AppendChangedIndices(
-        const Position& pos, Color perspective,
-        IndexList* removed, IndexList* added) {
+    void HalfRelativeKP<AssociatedKing>::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {
 
         Square ksq = orient(
             perspective,
@@ -67,10 +76,10 @@ namespace Eval::NNUE::Features {
                 continue;
 
             if (dp.from[i] != SQ_NONE)
-                removed->push_back(MakeIndex(perspective, dp.from[i], pc, ksq));
+                removed->push_back(make_index(perspective, dp.from[i], pc, ksq));
 
             if (dp.to[i] != SQ_NONE)
-                added->push_back(MakeIndex(perspective, dp.to[i], pc, ksq));
+                added->push_back(make_index(perspective, dp.to[i], pc, ksq));
         }
     }
 
diff --git a/src/nnue/features/half_relative_kp.h b/src/nnue/features/half_relative_kp.h
index cc1e136f..590a01a3 100644
--- a/src/nnue/features/half_relative_kp.h
+++ b/src/nnue/features/half_relative_kp.h
@@ -42,18 +42,23 @@ namespace Eval::NNUE::Features {
             TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
 
         // Get a list of indices with a value of 1 among the features
-        static void AppendActiveIndices(const Position& pos, Color perspective,
-                                        IndexList* active);
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
 
         // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-        static void AppendChangedIndices(const Position& pos, Color perspective,
-                                         IndexList* removed, IndexList* added);
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
 
         // Find the index of the feature quantity from the ball position and PieceSquare
-        static IndexType MakeIndex(Square s, IndexType p);
+        static IndexType make_index(Square s, IndexType p);
 
         // Find the index of the feature quantity from the ball position and PieceSquare
-        static IndexType MakeIndex(Color perspective, Square s, Piece pc, Square sq_k);
+        static IndexType make_index(Color perspective, Square s, Piece pc, Square sq_k);
     };
 
 }  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/k.cpp b/src/nnue/features/k.cpp
index 8911abb7..f01a6ce0 100644
--- a/src/nnue/features/k.cpp
+++ b/src/nnue/features/k.cpp
@@ -10,29 +10,33 @@ namespace Eval::NNUE::Features {
     }
 
     // Index of a feature for a given king position.
-    IndexType K::MakeIndex(Color perspective, Square s, Color king_color) {
+    IndexType K::make_index(Color perspective, Square s, Color king_color) {
         return IndexType(orient(perspective, s) + bool(perspective ^ king_color) * 64);
     }
 
     // Get a list of indices with a value of 1 among the features
-    void K::AppendActiveIndices(
-        const Position& pos, Color perspective, IndexList* active) {
+    void K::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
 
         for (auto color : Colors) {
-          active->push_back(MakeIndex(perspective, pos.square<KING>(color), color));
+          active->push_back(make_index(perspective, pos.square<KING>(color), color));
         }
     }
 
     // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-    void K::AppendChangedIndices(
-        const Position& pos, Color perspective,
-        IndexList* removed, IndexList* added) {
+    void K::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {
 
         const auto& dp = pos.state()->dirtyPiece;
         if (type_of(dp.piece[0]) == KING)
         {
-            removed->push_back(MakeIndex(perspective, dp.from[0], color_of(dp.piece[0])));
-            added->push_back(MakeIndex(perspective, dp.to[0], color_of(dp.piece[0])));
+            removed->push_back(make_index(perspective, dp.from[0], color_of(dp.piece[0])));
+            added->push_back(make_index(perspective, dp.to[0], color_of(dp.piece[0])));
         }
     }
 
diff --git a/src/nnue/features/k.h b/src/nnue/features/k.h
index c9726ab2..928d77de 100644
--- a/src/nnue/features/k.h
+++ b/src/nnue/features/k.h
@@ -8,36 +8,41 @@
 //Definition of input feature quantity K of NNUE evaluation function
 namespace Eval::NNUE::Features {
 
-  // Feature K: Ball position
-  class K {
-  public:
-      // feature quantity name
-      static constexpr const char* kName = "K";
+    // Feature K: Ball position
+    class K {
+    public:
+        // feature quantity name
+        static constexpr const char* kName = "K";
 
-      // Hash value embedded in the evaluation function file
-      static constexpr std::uint32_t kHashValue = 0xD3CEE169u;
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue = 0xD3CEE169u;
 
-      // number of feature dimensions
-      static constexpr IndexType kDimensions = SQUARE_NB * 2;
+        // number of feature dimensions
+        static constexpr IndexType kDimensions = SQUARE_NB * 2;
 
-      // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-      static constexpr IndexType kMaxActiveDimensions = 2;
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions = 2;
 
-      // Timing of full calculation instead of difference calculation
-      static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+        // Timing of full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
 
-      // Get a list of indices with a value of 1 among the features
-      static void AppendActiveIndices(const Position& pos, Color perspective,
-                                      IndexList* active);
+        // Get a list of indices with a value of 1 among the features
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
 
-      // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-      static void AppendChangedIndices(const Position& pos, Color perspective,
-                                       IndexList* removed, IndexList* added);
+        // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
 
-  private:
-      // Index of a feature for a given king position.
-      static IndexType MakeIndex(Color perspective, Square s, Color king_color);
-  };
+    private:
+        // Index of a feature for a given king position.
+        static IndexType make_index(Color perspective, Square s, Color king_color);
+    };
 
 }  // namespace Eval::NNUE::Features
 
diff --git a/src/nnue/features/p.cpp b/src/nnue/features/p.cpp
index b4757284..1621e8b2 100644
--- a/src/nnue/features/p.cpp
+++ b/src/nnue/features/p.cpp
@@ -10,26 +10,30 @@ namespace Eval::NNUE::Features {
     }
 
     // Find the index of the feature quantity from the king position and PieceSquare
-    inline IndexType P::MakeIndex(
+    inline IndexType P::make_index(
         Color perspective, Square s, Piece pc) {
         return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
     }
 
     // Get a list of indices with a value of 1 among the features
-    void P::AppendActiveIndices(
-        const Position& pos, Color perspective, IndexList* active) {
+    void P::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
 
         Bitboard bb = pos.pieces() & ~pos.pieces(KING);
         while (bb) {
             Square s = pop_lsb(&bb);
-            active->push_back(MakeIndex(perspective, s, pos.piece_on(s)));
+            active->push_back(make_index(perspective, s, pos.piece_on(s)));
         }
     }
 
     // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-    void P::AppendChangedIndices(
-        const Position& pos, Color perspective,
-        IndexList* removed, IndexList* added) {
+    void P::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {
 
         const auto& dp = pos.state()->dirtyPiece;
         for (int i = 0; i < dp.dirty_num; ++i) {
@@ -39,10 +43,10 @@ namespace Eval::NNUE::Features {
               continue;
 
             if (dp.from[i] != SQ_NONE)
-              removed->push_back(MakeIndex(perspective, dp.from[i], pc));
+              removed->push_back(make_index(perspective, dp.from[i], pc));
 
             if (dp.to[i] != SQ_NONE)
-              added->push_back(MakeIndex(perspective, dp.to[i], pc));
+              added->push_back(make_index(perspective, dp.to[i], pc));
         }
     }
 
diff --git a/src/nnue/features/p.h b/src/nnue/features/p.h
index 6a8a5392..d461086b 100644
--- a/src/nnue/features/p.h
+++ b/src/nnue/features/p.h
@@ -8,36 +8,41 @@
 //Definition of input feature P of NNUE evaluation function
 namespace Eval::NNUE::Features {
 
-  // Feature P: PieceSquare of pieces other than balls
-  class P {
-  public:
-      // feature quantity name
-      static constexpr const char* kName = "P";
+    // Feature P: PieceSquare of pieces other than balls
+    class P {
+    public:
+        // feature quantity name
+        static constexpr const char* kName = "P";
 
-      // Hash value embedded in the evaluation function file
-      static constexpr std::uint32_t kHashValue = 0x764CFB4Bu;
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue = 0x764CFB4Bu;
 
-      // number of feature dimensions
-      static constexpr IndexType kDimensions = PS_END;
+        // number of feature dimensions
+        static constexpr IndexType kDimensions = PS_END;
 
-      // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-      static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
 
-      // Timing of full calculation instead of difference calculation
-      static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+        // Timing of full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
 
-      // Get a list of indices with a value of 1 among the features
-      static void AppendActiveIndices(const Position& pos, Color perspective,
-                                      IndexList* active);
+        // Get a list of indices with a value of 1 among the features
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
 
-      // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-      static void AppendChangedIndices(const Position& pos, Color perspective,
-                                       IndexList* removed, IndexList* added);
+        // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
 
-  private:
-      // Index of a feature for a given piece on some square
-      static IndexType MakeIndex(Color perspective, Square s, Piece pc);
-  };
+    private:
+        // Index of a feature for a given piece on some square
+        static IndexType make_index(Color perspective, Square s, Piece pc);
+    };
 
 }  // namespace Eval::NNUE::Features
 
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index cc5e5eef..6efaecbc 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -47,36 +47,36 @@ namespace Eval::NNUE::Layers {
         static constexpr IndexType kOutputDimensions = OutputDimensions;
 
         static constexpr IndexType kPaddedInputDimensions =
-            CeilToMultiple<IndexType>(kInputDimensions, kMaxSimdWidth);
+            ceil_to_multiple<IndexType>(kInputDimensions, kMaxSimdWidth);
 
         // Size of forward propagation buffer used in this layer
         static constexpr std::size_t kSelfBufferSize =
-            CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+            ceil_to_multiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
 
         // Size of the forward propagation buffer used from the input layer to this layer
         static constexpr std::size_t kBufferSize =
             PreviousLayer::kBufferSize + kSelfBufferSize;
 
         // Hash value embedded in the evaluation file
-        static constexpr std::uint32_t GetHashValue() {
+        static constexpr std::uint32_t get_hash_value() {
             std::uint32_t hash_value = 0xCC03DAE4u;
             hash_value += kOutputDimensions;
-            hash_value ^= PreviousLayer::GetHashValue() >> 1;
-            hash_value ^= PreviousLayer::GetHashValue() << 31;
+            hash_value ^= PreviousLayer::get_hash_value() >> 1;
+            hash_value ^= PreviousLayer::get_hash_value() << 31;
             return hash_value;
         }
 
         // A string that represents the structure from the input layer to this layer
-        static std::string GetStructureString() {
+        static std::string get_structure_string() {
             return "AffineTransform[" +
                 std::to_string(kOutputDimensions) + "<-" +
                 std::to_string(kInputDimensions) + "](" +
-                PreviousLayer::GetStructureString() + ")";
+                PreviousLayer::get_structure_string() + ")";
         }
 
        // Read network parameters
-        bool ReadParameters(std::istream& stream) {
-            if (!previous_layer_.ReadParameters(stream))
+        bool read_parameters(std::istream& stream) {
+            if (!previous_layer_.read_parameters(stream))
                 return false;
 
             for (std::size_t i = 0; i < kOutputDimensions; ++i)
@@ -89,8 +89,8 @@ namespace Eval::NNUE::Layers {
         }
 
         // write parameters
-        bool WriteParameters(std::ostream& stream) const {
-            if (!previous_layer_.WriteParameters(stream))
+        bool write_parameters(std::ostream& stream) const {
+            if (!previous_layer_.write_parameters(stream))
                 return false;
 
             stream.write(reinterpret_cast<const char*>(biases_),
@@ -104,10 +104,10 @@ namespace Eval::NNUE::Layers {
         }
 
         // Forward propagation
-        const OutputType* Propagate(
+        const OutputType* propagate(
             const TransformedFeatureType* transformed_features, char* buffer) const {
 
-            const auto input = previous_layer_.Propagate(
+            const auto input = previous_layer_.propagate(
                 transformed_features, buffer + kSelfBufferSize);
             const auto output = reinterpret_cast<OutputType*>(buffer);
 
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h
index 0846f3df..889effa7 100644
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -48,41 +48,41 @@ namespace Eval::NNUE::Layers {
 
         // Size of forward propagation buffer used in this layer
         static constexpr std::size_t kSelfBufferSize =
-            CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+            ceil_to_multiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
 
         // Size of the forward propagation buffer used from the input layer to this layer
         static constexpr std::size_t kBufferSize =
             PreviousLayer::kBufferSize + kSelfBufferSize;
 
         // Hash value embedded in the evaluation file
-        static constexpr std::uint32_t GetHashValue() {
+        static constexpr std::uint32_t get_hash_value() {
             std::uint32_t hash_value = 0x538D24C7u;
-            hash_value += PreviousLayer::GetHashValue();
+            hash_value += PreviousLayer::get_hash_value();
             return hash_value;
         }
 
         // A string that represents the structure from the input layer to this layer
-        static std::string GetStructureString() {
+        static std::string get_structure_string() {
             return "ClippedReLU[" +
                 std::to_string(kOutputDimensions) + "](" +
-                PreviousLayer::GetStructureString() + ")";
+                PreviousLayer::get_structure_string() + ")";
         }
 
         // Read network parameters
-        bool ReadParameters(std::istream& stream) {
-            return previous_layer_.ReadParameters(stream);
+        bool read_parameters(std::istream& stream) {
+            return previous_layer_.read_parameters(stream);
         }
 
         // write parameters
-        bool WriteParameters(std::ostream& stream) const {
-            return previous_layer_.WriteParameters(stream);
+        bool write_parameters(std::ostream& stream) const {
+            return previous_layer_.write_parameters(stream);
         }
 
         // Forward propagation
-        const OutputType* Propagate(
+        const OutputType* propagate(
             const TransformedFeatureType* transformed_features, char* buffer) const {
 
-            const auto input = previous_layer_.Propagate(
+            const auto input = previous_layer_.propagate(
                 transformed_features, buffer + kSelfBufferSize);
             const auto output = reinterpret_cast<OutputType*>(buffer);
 
diff --git a/src/nnue/layers/input_slice.h b/src/nnue/layers/input_slice.h
index 9d9476a5..b69028ab 100644
--- a/src/nnue/layers/input_slice.h
+++ b/src/nnue/layers/input_slice.h
@@ -45,31 +45,31 @@ namespace Eval::NNUE::Layers {
       static constexpr std::size_t kBufferSize = 0;
 
       // Hash value embedded in the evaluation file
-      static constexpr std::uint32_t GetHashValue() {
+      static constexpr std::uint32_t get_hash_value() {
           std::uint32_t hash_value = 0xEC42E90Du;
           hash_value ^= kOutputDimensions ^ (Offset << 10);
           return hash_value;
       }
 
       // A string that represents the structure from the input layer to this layer
-      static std::string GetStructureString() {
+      static std::string get_structure_string() {
           return "InputSlice[" + std::to_string(kOutputDimensions) + "(" +
               std::to_string(Offset) + ":" +
               std::to_string(Offset + kOutputDimensions) + ")]";
       }
 
       // Read network parameters
-      bool ReadParameters(std::istream& /*stream*/) {
+      bool read_parameters(std::istream& /*stream*/) {
           return true;
       }
 
       // write parameters
-      bool WriteParameters(std::ostream& /*stream*/) const {
+      bool write_parameters(std::ostream& /*stream*/) const {
           return true;
       }
 
       // Forward propagation
-      const OutputType* Propagate(
+      const OutputType* propagate(
           const TransformedFeatureType* transformed_features,
           char* /*buffer*/) const {
 
diff --git a/src/nnue/layers/sum.h b/src/nnue/layers/sum.h
index c81f5850..64ef30f9 100644
--- a/src/nnue/layers/sum.h
+++ b/src/nnue/layers/sum.h
@@ -30,51 +30,51 @@ namespace Eval::NNUE::Layers {
 
         // Size of forward propagation buffer used in this layer
         static constexpr std::size_t kSelfBufferSize =
-            CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+            ceil_to_multiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
 
         // Size of the forward propagation buffer used from the input layer to this layer
         static constexpr std::size_t kBufferSize =
             std::max(Head::kBufferSize + kSelfBufferSize, Tail::kBufferSize);
 
         // Hash value embedded in the evaluation function file
-        static constexpr std::uint32_t GetHashValue() {
+        static constexpr std::uint32_t get_hash_value() {
             std::uint32_t hash_value = 0xBCE400B4u;
-            hash_value ^= Head::GetHashValue() >> 1;
-            hash_value ^= Head::GetHashValue() << 31;
-            hash_value ^= Tail::GetHashValue() >> 2;
-            hash_value ^= Tail::GetHashValue() << 30;
+            hash_value ^= Head::get_hash_value() >> 1;
+            hash_value ^= Head::get_hash_value() << 31;
+            hash_value ^= Tail::get_hash_value() >> 2;
+            hash_value ^= Tail::get_hash_value() << 30;
             return hash_value;
         }
 
         // A string that represents the structure from the input layer to this layer
-        static std::string GetStructureString() {
+        static std::string get_structure_string() {
             return "Sum[" +
-                std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
+                std::to_string(kOutputDimensions) + "](" + get_summands_string() + ")";
         }
 
         // read parameters
-        bool ReadParameters(std::istream& stream) {
-            if (!Tail::ReadParameters(stream))
+        bool read_parameters(std::istream& stream) {
+            if (!Tail::read_parameters(stream))
                 return false;
 
-            return previous_layer_.ReadParameters(stream);
+            return previous_layer_.read_parameters(stream);
         }
 
         // write parameters
-        bool WriteParameters(std::ostream& stream) const {
-            if (!Tail::WriteParameters(stream))
+        bool write_parameters(std::ostream& stream) const {
+            if (!Tail::write_parameters(stream))
                 return false;
 
-            return previous_layer_.WriteParameters(stream);
+            return previous_layer_.write_parameters(stream);
         }
 
         // forward propagation
-        const OutputType* Propagate(
+        const OutputType* propagate(
             const TransformedFeatureType* transformed_features, char* buffer) const {
 
-            Tail::Propagate(transformed_features, buffer);
+            Tail::propagate(transformed_features, buffer);
 
-            const auto head_output = previous_layer_.Propagate(
+            const auto head_output = previous_layer_.propagate(
                 transformed_features, buffer + kSelfBufferSize);
 
             const auto output = reinterpret_cast<OutputType*>(buffer);
@@ -88,8 +88,8 @@ namespace Eval::NNUE::Layers {
 
     protected:
         // A string that represents the list of layers to be summed
-        static std::string GetSummandsString() {
-            return Head::GetStructureString() + "," + Tail::GetSummandsString();
+        static std::string get_summands_string() {
+            return Head::get_structure_string() + "," + Tail::get_summands_string();
         }
 
         // Make the learning class a friend
@@ -118,40 +118,40 @@ namespace Eval::NNUE::Layers {
         static constexpr std::size_t kBufferSize = PreviousLayer::kBufferSize;
 
         // Hash value embedded in the evaluation function file
-        static constexpr std::uint32_t GetHashValue() {
+        static constexpr std::uint32_t get_hash_value() {
             std::uint32_t hash_value = 0xBCE400B4u;
-            hash_value ^= PreviousLayer::GetHashValue() >> 1;
-            hash_value ^= PreviousLayer::GetHashValue() << 31;
+            hash_value ^= PreviousLayer::get_hash_value() >> 1;
+            hash_value ^= PreviousLayer::get_hash_value() << 31;
             return hash_value;
         }
 
         // A string that represents the structure from the input layer to this layer
-        static std::string GetStructureString() {
+        static std::string get_structure_string() {
             return "Sum[" +
-                std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
+                std::to_string(kOutputDimensions) + "](" + get_summands_string() + ")";
         }
 
         // read parameters
-        bool ReadParameters(std::istream& stream) {
-            return previous_layer_.ReadParameters(stream);
+        bool read_parameters(std::istream& stream) {
+            return previous_layer_.read_parameters(stream);
         }
 
         // write parameters
-        bool WriteParameters(std::ostream& stream) const {
-            return previous_layer_.WriteParameters(stream);
+        bool write_parameters(std::ostream& stream) const {
+            return previous_layer_.write_parameters(stream);
         }
 
         // forward propagation
-        const OutputType* Propagate(
+        const OutputType* propagate(
             const TransformedFeatureType* transformed_features, char* buffer) const {
 
-            return previous_layer_.Propagate(transformed_features, buffer);
+            return previous_layer_.propagate(transformed_features, buffer);
         }
 
     protected:
         // A string that represents the list of layers to be summed
-        static std::string GetSummandsString() {
-            return PreviousLayer::GetStructureString();
+        static std::string get_summands_string() {
+            return PreviousLayer::get_structure_string();
         }
 
         // Make the learning class a friend
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index 70c7596d..bd4294a3 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -127,7 +127,7 @@ namespace Eval::NNUE {
 
     // Round n up to be a multiple of base
     template <typename IntType>
-    constexpr IntType CeilToMultiple(IntType n, IntType base) {
+    constexpr IntType ceil_to_multiple(IntType n, IntType base) {
         return (n + base - 1) / base * base;
     }
 
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 2fc24dab..87b8ee58 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -111,20 +111,20 @@ namespace Eval::NNUE {
             kOutputDimensions * sizeof(OutputType);
 
         // Hash value embedded in the evaluation file
-        static constexpr std::uint32_t GetHashValue() {
+        static constexpr std::uint32_t get_hash_value() {
 
             return RawFeatures::kHashValue ^ kOutputDimensions;
         }
 
         // a string representing the structure
-        static std::string GetStructureString() {
-            return RawFeatures::GetName() + "[" +
+        static std::string get_structure_string() {
+            return RawFeatures::get_name() + "[" +
                 std::to_string(kInputDimensions) + "->" +
                 std::to_string(kHalfDimensions) + "x2]";
         }
 
         // Read network parameters
-        bool ReadParameters(std::istream& stream) {
+        bool read_parameters(std::istream& stream) {
 
             for (std::size_t i = 0; i < kHalfDimensions; ++i)
                 biases_[i] = read_little_endian<BiasType>(stream);
@@ -136,7 +136,7 @@ namespace Eval::NNUE {
         }
 
         // write parameters
-        bool WriteParameters(std::ostream& stream) const {
+        bool write_parameters(std::ostream& stream) const {
             stream.write(reinterpret_cast<const char*>(biases_),
                 kHalfDimensions * sizeof(BiasType));
 
@@ -147,7 +147,7 @@ namespace Eval::NNUE {
         }
 
         // Proceed with the difference calculation if possible
-        bool UpdateAccumulatorIfPossible(const Position& pos) const {
+        bool update_accumulator_if_possible(const Position& pos) const {
 
             const auto now = pos.state();
             if (now->accumulator.computed_accumulation)
@@ -155,7 +155,7 @@ namespace Eval::NNUE {
 
             const auto prev = now->previous;
             if (prev && prev->accumulator.computed_accumulation) {
-                UpdateAccumulator(pos);
+                update_accumulator(pos);
                 return true;
             }
 
@@ -163,10 +163,10 @@ namespace Eval::NNUE {
         }
 
         // Convert input features
-        void Transform(const Position& pos, OutputType* output) const {
+        void transform(const Position& pos, OutputType* output) const {
 
-            if (!UpdateAccumulatorIfPossible(pos))
-              RefreshAccumulator(pos);
+            if (!update_accumulator_if_possible(pos))
+              refresh_accumulator(pos);
 
             const auto& accumulation = pos.state()->accumulator.accumulation;
 
@@ -294,13 +294,13 @@ namespace Eval::NNUE {
 
     private:
         // Calculate cumulative value without using difference calculation
-        void RefreshAccumulator(const Position& pos) const {
+        void refresh_accumulator(const Position& pos) const {
 
             auto& accumulator = pos.state()->accumulator;
             for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
                 Features::IndexList active_indices[2];
-                RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
-                                                 active_indices);
+                RawFeatures::append_active_indices(pos, kRefreshTriggers[i],
+                                                   active_indices);
                 for (Color perspective : { WHITE, BLACK }) {
 #ifdef TILING
                     for (unsigned j = 0; j < kHalfDimensions / kTileHeight; ++j) {
@@ -357,15 +357,15 @@ namespace Eval::NNUE {
         }
 
         // Calculate cumulative value using difference calculation
-        void UpdateAccumulator(const Position& pos) const {
+        void update_accumulator(const Position& pos) const {
 
             const auto& prev_accumulator = pos.state()->previous->accumulator;
             auto& accumulator = pos.state()->accumulator;
             for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
                 Features::IndexList removed_indices[2], added_indices[2];
                 bool reset[2] = { false, false };
-                RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
-                                                  removed_indices, added_indices, reset);
+                RawFeatures::append_changed_indices(pos, kRefreshTriggers[i],
+                                                    removed_indices, added_indices, reset);
 
 #ifdef TILING
                 for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
diff --git a/src/nnue/nnue_test_command.cpp b/src/nnue/nnue_test_command.cpp
index 55fa603a..d892222b 100644
--- a/src/nnue/nnue_test_command.cpp
+++ b/src/nnue/nnue_test_command.cpp
@@ -24,7 +24,7 @@ namespace Eval::NNUE {
     namespace {
 
         // Testing RawFeatures mainly for difference calculation
-        void TestFeatures(Position& pos) {
+        void test_features(Position& pos) {
             const std::uint64_t num_games = 1000;
             StateInfo si;
             pos.set(StartFEN, false, &si, Threads.main());
@@ -47,7 +47,7 @@ namespace Eval::NNUE {
 
                 for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
                     Features::IndexList active_indices[2];
-                    RawFeatures::AppendActiveIndices(position, kRefreshTriggers[i],
+                    RawFeatures::append_active_indices(position, kRefreshTriggers[i],
                                                      active_indices);
 
                     for (const auto perspective : Colors) {
@@ -68,7 +68,7 @@ namespace Eval::NNUE {
                 for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
                     Features::IndexList removed_indices[2], added_indices[2];
                     bool reset[2] = { false, false };
-                    RawFeatures::AppendChangedIndices(position, kRefreshTriggers[i],
+                    RawFeatures::append_changed_indices(position, kRefreshTriggers[i],
                                                       removed_indices, added_indices, reset);
                     for (const auto perspective : Colors) {
                         if (reset[perspective]) {
@@ -99,7 +99,7 @@ namespace Eval::NNUE {
                 }
             };
 
-            std::cout << "feature set: " << RawFeatures::GetName()
+            std::cout << "feature set: " << RawFeatures::get_name()
                       << "[" << RawFeatures::kDimensions << "]" << std::endl;
             std::cout << "start testing with random games";
 
@@ -154,8 +154,8 @@ namespace Eval::NNUE {
         }
 
         // Output a string that represents the structure of the evaluation function
-        void PrintInfo(std::istream& stream) {
-            std::cout << "network architecture: " << GetArchitectureString() << std::endl;
+        void print_info(std::istream& stream) {
+            std::cout << "network architecture: " << get_architecture_string() << std::endl;
 
             while (true) {
                 std::string file_name;
@@ -170,7 +170,7 @@ namespace Eval::NNUE {
 
                     if (!file_stream)
                         return false;
-                    if (!ReadHeader(file_stream, &hash_value, &architecture))
+                    if (!read_header(file_stream, &hash_value, &architecture))
                         return false;
 
                     return true;
@@ -180,7 +180,7 @@ namespace Eval::NNUE {
                 if (success) {
                     if (hash_value == kHashValue) {
                         std::cout << "matches with this binary";
-                        if (architecture != GetArchitectureString()) {
+                        if (architecture != get_architecture_string()) {
                             std::cout << ", but architecture string differs: " << architecture;
                         }
 
@@ -197,14 +197,14 @@ namespace Eval::NNUE {
     }  // namespace
 
     // USI extended command for NNUE evaluation function
-    void TestCommand(Position& pos, std::istream& stream) {
+    void test_command(Position& pos, std::istream& stream) {
         std::string sub_command;
         stream >> sub_command;
 
         if (sub_command == "test_features") {
-            TestFeatures(pos);
+            test_features(pos);
         } else if (sub_command == "info") {
-            PrintInfo(stream);
+            print_info(stream);
         } else {
             std::cout << "usage:" << std::endl;
             std::cout << " test nnue test_features" << std::endl;
diff --git a/src/nnue/nnue_test_command.h b/src/nnue/nnue_test_command.h
index 989731d6..fcfe16f6 100644
--- a/src/nnue/nnue_test_command.h
+++ b/src/nnue/nnue_test_command.h
@@ -5,7 +5,7 @@
 namespace Eval::NNUE {
 
     // USI extended command for NNUE evaluation function
-    void TestCommand(Position& pos, std::istream& stream);
+    void test_command(Position& pos, std::istream& stream);
 
 }  // namespace Eval::NNUE
 
diff --git a/src/nnue/trainer/features/factorizer.h b/src/nnue/trainer/features/factorizer.h
index 784fe047..49a2fe26 100644
--- a/src/nnue/trainer/features/factorizer.h
+++ b/src/nnue/trainer/features/factorizer.h
@@ -14,12 +14,12 @@ namespace Eval::NNUE::Features {
     class Factorizer {
     public:
         // Get the dimensionality of the learning feature
-        static constexpr IndexType GetDimensions() {
+        static constexpr IndexType get_dimensions() {
             return FeatureType::kDimensions;
         }
 
         // Get index of learning feature and scale of learning rate
-        static void AppendTrainingFeatures(
+        static void append_training_features(
             IndexType base_index, std::vector<TrainingFeature>* training_features) {
 
             assert(base_index <FeatureType::kDimensions);
@@ -35,7 +35,7 @@ namespace Eval::NNUE::Features {
 
     // Add the original input features to the learning features
     template <typename FeatureType>
-    IndexType AppendBaseFeature(
+    IndexType append_base_feature(
         FeatureProperties properties, IndexType base_index,
         std::vector<TrainingFeature>* training_features) {
 
@@ -47,7 +47,7 @@ namespace Eval::NNUE::Features {
 
     // If the learning rate scale is not 0, inherit other types of learning features
     template <typename FeatureType>
-    IndexType InheritFeaturesIfRequired(
+    IndexType inherit_features_if_required(
         IndexType index_offset, FeatureProperties properties, IndexType base_index,
         std::vector<TrainingFeature>* training_features) {
 
@@ -55,17 +55,17 @@ namespace Eval::NNUE::Features {
             return 0;
         }
 
-        assert(properties.dimensions == Factorizer<FeatureType>::GetDimensions());
+        assert(properties.dimensions == Factorizer<FeatureType>::get_dimensions());
         assert(base_index < FeatureType::kDimensions);
 
         const auto start = training_features->size();
-        Factorizer<FeatureType>::AppendTrainingFeatures(
+        Factorizer<FeatureType>::append_training_features(
             base_index, training_features);
 
         for (auto i = start; i < training_features->size(); ++i) {
             auto& feature = (*training_features)[i];
-            assert(feature.GetIndex() < Factorizer<FeatureType>::GetDimensions());
-            feature.ShiftIndex(index_offset);
+            assert(feature.get_index() < Factorizer<FeatureType>::get_dimensions());
+            feature.shift_index(index_offset);
         }
 
         return properties.dimensions;
@@ -73,7 +73,7 @@ namespace Eval::NNUE::Features {
 
     // Return the index difference as needed, without adding learning features
     // Call instead of InheritFeaturesIfRequired() if there are no corresponding features
-    IndexType SkipFeatures(FeatureProperties properties) {
+    IndexType skip_features(FeatureProperties properties) {
         if (!properties.active)
             return 0;
 
@@ -82,7 +82,7 @@ namespace Eval::NNUE::Features {
 
     // Get the dimensionality of the learning feature
     template <std::size_t N>
-    constexpr IndexType GetActiveDimensions(
+    constexpr IndexType get_active_dimensions(
         const FeatureProperties (&properties)[N]) {
 
         static_assert(N > 0, "");
@@ -100,7 +100,7 @@ namespace Eval::NNUE::Features {
 
     // get the number of elements in the array
     template <typename T, std::size_t N>
-    constexpr std::size_t GetArrayLength(const T (&/*array*/)[N]) {
+    constexpr std::size_t get_array_length(const T (&/*array*/)[N]) {
         return N;
     }
 
diff --git a/src/nnue/trainer/features/factorizer_feature_set.h b/src/nnue/trainer/features/factorizer_feature_set.h
index d272a453..032a449b 100644
--- a/src/nnue/trainer/features/factorizer_feature_set.h
+++ b/src/nnue/trainer/features/factorizer_feature_set.h
@@ -22,12 +22,12 @@ namespace Eval::NNUE::Features {
             FeatureSet<FirstFeatureType, RemainingFeatureTypes...>::kDimensions;
 
         // Get the dimensionality of the learning feature
-        static constexpr IndexType GetDimensions() {
-            return Head::GetDimensions() + Tail::GetDimensions();
+        static constexpr IndexType get_dimensions() {
+            return Head::get_dimensions() + Tail::get_dimensions();
         }
 
         // Get index of learning feature and scale of learning rate
-        static void AppendTrainingFeatures(
+        static void append_training_features(
             IndexType base_index, std::vector<TrainingFeature>* training_features,
             IndexType base_dimensions = kBaseDimensions) {
 
@@ -36,29 +36,29 @@ namespace Eval::NNUE::Features {
             constexpr auto boundary = FeatureSet<RemainingFeatureTypes...>::kDimensions;
 
             if (base_index < boundary) {
-                Tail::AppendTrainingFeatures(
+                Tail::append_training_features(
                     base_index, training_features, base_dimensions);
             }
             else {
                 const auto start = training_features->size();
 
-                Head::AppendTrainingFeatures(
+                Head::append_training_features(
                     base_index - boundary, training_features, base_dimensions);
 
                 for (auto i = start; i < training_features->size(); ++i) {
                     auto& feature = (*training_features)[i];
-                    const auto index = feature.GetIndex();
+                    const auto index = feature.get_index();
 
-                    assert(index < Head::GetDimensions() ||
+                    assert(index < Head::get_dimensions() ||
                                (index >= base_dimensions &&
                                 index < base_dimensions +
-                                        Head::GetDimensions() - Head::kBaseDimensions));
+                                        Head::get_dimensions() - Head::kBaseDimensions));
 
                     if (index < Head::kBaseDimensions) {
-                        feature.ShiftIndex(Tail::kBaseDimensions);
+                        feature.shift_index(Tail::kBaseDimensions);
                     }
                     else {
-                        feature.ShiftIndex(Tail::GetDimensions() - Tail::kBaseDimensions);
+                        feature.shift_index(Tail::get_dimensions() - Tail::kBaseDimensions);
                     }
                 }
             }
@@ -74,12 +74,12 @@ namespace Eval::NNUE::Features {
         static constexpr IndexType kBaseDimensions = FeatureType::kDimensions;
 
         // Get the dimensionality of the learning feature
-        static constexpr IndexType GetDimensions() {
-            return Factorizer<FeatureType>::GetDimensions();
+        static constexpr IndexType get_dimensions() {
+            return Factorizer<FeatureType>::get_dimensions();
         }
 
         // Get index of learning feature and scale of learning rate
-        static void AppendTrainingFeatures(
+        static void append_training_features(
             IndexType base_index, std::vector<TrainingFeature>* training_features,
             IndexType base_dimensions = kBaseDimensions) {
 
@@ -87,14 +87,14 @@ namespace Eval::NNUE::Features {
 
             const auto start = training_features->size();
 
-            Factorizer<FeatureType>::AppendTrainingFeatures(
+            Factorizer<FeatureType>::append_training_features(
                 base_index, training_features);
 
             for (auto i = start; i < training_features->size(); ++i) {
                 auto& feature = (*training_features)[i];
-                assert(feature.GetIndex() < Factorizer<FeatureType>::GetDimensions());
-                if (feature.GetIndex() >= kBaseDimensions) {
-                    feature.ShiftIndex(base_dimensions - kBaseDimensions);
+                assert(feature.get_index() < Factorizer<FeatureType>::get_dimensions());
+                if (feature.get_index() >= kBaseDimensions) {
+                    feature.shift_index(base_dimensions - kBaseDimensions);
                 }
             }
         }
diff --git a/src/nnue/trainer/features/factorizer_half_kp.h b/src/nnue/trainer/features/factorizer_half_kp.h
index 1ed5bdd3..152722ac 100644
--- a/src/nnue/trainer/features/factorizer_half_kp.h
+++ b/src/nnue/trainer/features/factorizer_half_kp.h
@@ -37,25 +37,25 @@ namespace Eval::NNUE::Features {
             // kFeaturesHalfK
             {true, SQUARE_NB},
             // kFeaturesP
-            {true, Factorizer<P>::GetDimensions()},
+            {true, Factorizer<P>::get_dimensions()},
             // kFeaturesHalfRelativeKP
-            {true, Factorizer<HalfRelativeKP<AssociatedKing>>::GetDimensions()},
+            {true, Factorizer<HalfRelativeKP<AssociatedKing>>::get_dimensions()},
         };
 
-        static_assert(GetArrayLength(kProperties) == kNumTrainingFeatureTypes, "");
+        static_assert(get_array_length(kProperties) == kNumTrainingFeatureTypes, "");
 
     public:
         // Get the dimensionality of the learning feature
-        static constexpr IndexType GetDimensions() {
-            return GetActiveDimensions(kProperties);
+        static constexpr IndexType get_dimensions() {
+            return get_active_dimensions(kProperties);
         }
 
         // Get index of learning feature and scale of learning rate
-        static void AppendTrainingFeatures(
+        static void append_training_features(
             IndexType base_index, std::vector<TrainingFeature>* training_features) {
 
             // kFeaturesHalfKP
-            IndexType index_offset = AppendBaseFeature<FeatureType>(
+            IndexType index_offset = append_base_feature<FeatureType>(
                 kProperties[kFeaturesHalfKP], base_index, training_features);
 
             const auto sq_k = static_cast<Square>(base_index / PS_END);
@@ -71,20 +71,20 @@ namespace Eval::NNUE::Features {
             }
 
             // kFeaturesP
-            index_offset += InheritFeaturesIfRequired<P>(
+            index_offset += inherit_features_if_required<P>(
                 index_offset, kProperties[kFeaturesP], p, training_features);
             // kFeaturesHalfRelativeKP
             if (p >= PS_W_PAWN) {
-                index_offset += InheritFeaturesIfRequired<HalfRelativeKP<AssociatedKing>>(
+                index_offset += inherit_features_if_required<HalfRelativeKP<AssociatedKing>>(
                     index_offset, kProperties[kFeaturesHalfRelativeKP],
-                    HalfRelativeKP<AssociatedKing>::MakeIndex(sq_k, p),
+                    HalfRelativeKP<AssociatedKing>::make_index(sq_k, p),
                     training_features);
             }
             else {
-                index_offset += SkipFeatures(kProperties[kFeaturesHalfRelativeKP]);
+                index_offset += skip_features(kProperties[kFeaturesHalfRelativeKP]);
             }
 
-            assert(index_offset == GetDimensions());
+            assert(index_offset == get_dimensions());
         }
     };
 
diff --git a/src/nnue/trainer/trainer.h b/src/nnue/trainer/trainer.h
index 7d9b66ee..85666576 100644
--- a/src/nnue/trainer/trainer.h
+++ b/src/nnue/trainer/trainer.h
@@ -37,22 +37,22 @@ namespace Eval::NNUE {
         }
 
         TrainingFeature& operator+=(const TrainingFeature& other) {
-            assert(other.GetIndex() == GetIndex());
-            assert(other.GetCount() + GetCount() < (1 << kCountBits));
-            index_and_count_ += other.GetCount();
+            assert(other.get_index() == get_index());
+            assert(other.get_index() + get_count() < (1 << kCountBits));
+            index_and_count_ += other.get_count();
             return *this;
         }
 
-        IndexType GetIndex() const {
+        IndexType get_index() const {
             return static_cast<IndexType>(index_and_count_ >> kCountBits);
         }
 
-        void ShiftIndex(IndexType offset) {
-            assert(GetIndex() + offset < (1 << kIndexBits));
+        void shift_index(IndexType offset) {
+            assert(get_index() + offset < (1 << kIndexBits));
             index_and_count_ += offset << kCountBits;
         }
 
-        IndexType GetCount() const {
+        IndexType get_count() const {
             return static_cast<IndexType>(index_and_count_ & ((1 << kCountBits) - 1));
         }
 
@@ -86,7 +86,7 @@ namespace Eval::NNUE {
     };
 
     // determine whether to accept the message
-    bool ReceiveMessage(const std::string& name, Message* message) {
+    bool receive_message(const std::string& name, Message* message) {
         const auto subscript = "[" + std::to_string(message->num_peekers) + "]";
 
         if (message->name.substr(0, name.size() + 1) == name + "[") {
@@ -101,28 +101,15 @@ namespace Eval::NNUE {
         return false;
     }
 
-    // split the string
-    std::vector<std::string> Split(const std::string& input, char delimiter) {
-        std::istringstream stream(input);
-        std::string field;
-        std::vector<std::string> fields;
-
-        while (std::getline(stream, field, delimiter)) {
-            fields.push_back(field);
-        }
-
-        return fields;
-    }
-
     // round a floating point number to an integer
     template <typename IntType>
-    IntType Round(double value) {
+    IntType round(double value) {
         return static_cast<IntType>(std::floor(value + 0.5));
     }
 
     // make_shared with alignment
     template <typename T, typename... ArgumentTypes>
-    std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments) {
+    std::shared_ptr<T> make_aligned_shared_ptr(ArgumentTypes&&... arguments) {
         const auto ptr = new(std_aligned_alloc(alignof(T), sizeof(T)))
             T(std::forward<ArgumentTypes>(arguments)...);
 
diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index dd70b8fb..f6d374ef 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -21,7 +21,7 @@ namespace Eval::NNUE {
 
     public:
         // factory function
-        static std::shared_ptr<Trainer> Create(
+        static std::shared_ptr<Trainer> create(
             LayerType* target_layer, FeatureTransformer* ft) {
 
             return std::shared_ptr<Trainer>(
@@ -29,31 +29,31 @@ namespace Eval::NNUE {
         }
 
         // Set options such as hyperparameters
-        void SendMessage(Message* message) {
-            previous_layer_trainer_->SendMessage(message);
+        void send_message(Message* message) {
+            previous_layer_trainer_->send_message(message);
 
-            if (ReceiveMessage("momentum", message)) {
+            if (receive_message("momentum", message)) {
                 momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
             }
 
-            if (ReceiveMessage("learning_rate_scale", message)) {
+            if (receive_message("learning_rate_scale", message)) {
                 learning_rate_scale_ =
                     static_cast<LearnFloatType>(std::stod(message->value));
             }
 
-            if (ReceiveMessage("reset", message)) {
-                DequantizeParameters();
+            if (receive_message("reset", message)) {
+                dequantize_parameters();
             }
 
-            if (ReceiveMessage("quantize_parameters", message)) {
-                QuantizeParameters();
+            if (receive_message("quantize_parameters", message)) {
+                quantize_parameters();
             }
         }
 
         // Initialize the parameters with random numbers
         template <typename RNG>
-        void Initialize(RNG& rng) {
-            previous_layer_trainer_->Initialize(rng);
+        void initialize(RNG& rng) {
+            previous_layer_trainer_->initialize(rng);
 
             if (kIsOutputLayer) {
                 // Initialize output layer with 0
@@ -80,18 +80,18 @@ namespace Eval::NNUE {
                 }
             }
 
-            QuantizeParameters();
+            quantize_parameters();
         }
 
         // forward propagation
-        const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+        const LearnFloatType* propagate(const std::vector<Example>& batch) {
             if (output_.size() < kOutputDimensions * batch.size()) {
                 output_.resize(kOutputDimensions * batch.size());
                 gradients_.resize(kInputDimensions * batch.size());
             }
 
             batch_size_ = static_cast<IndexType>(batch.size());
-            batch_input_ = previous_layer_trainer_->Propagate(batch);
+            batch_input_ = previous_layer_trainer_->propagate(batch);
 #if defined(USE_BLAS)
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
@@ -123,7 +123,7 @@ namespace Eval::NNUE {
         }
 
         // backpropagation
-        void Backpropagate(const LearnFloatType* gradients,
+        void backpropagate(const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {
 
             const LearnFloatType local_learning_rate =
@@ -206,7 +206,7 @@ namespace Eval::NNUE {
             }
 
 #endif
-            previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate);
+            previous_layer_trainer_->backpropagate(gradients_.data(), learning_rate);
         }
 
     private:
@@ -214,7 +214,7 @@ namespace Eval::NNUE {
         Trainer(LayerType* target_layer, FeatureTransformer* ft) :
             batch_size_(0),
             batch_input_(nullptr),
-            previous_layer_trainer_(Trainer<PreviousLayer>::Create(
+            previous_layer_trainer_(Trainer<PreviousLayer>::create(
                 &target_layer->previous_layer_, ft)),
             target_layer_(target_layer),
             biases_(),
@@ -224,11 +224,11 @@ namespace Eval::NNUE {
             momentum_(0.2),
             learning_rate_scale_(1.0) {
 
-            DequantizeParameters();
+            dequantize_parameters();
         }
 
         // Weight saturation and parameterization
-        void QuantizeParameters() {
+        void quantize_parameters() {
             for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
                 weights_[i] = std::max(-kMaxWeightMagnitude,
                                        std::min(+kMaxWeightMagnitude, weights_[i]));
@@ -236,7 +236,7 @@ namespace Eval::NNUE {
 
             for (IndexType i = 0; i < kOutputDimensions; ++i) {
                 target_layer_->biases_[i] =
-                    Round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
+                    round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
             }
 
             for (IndexType i = 0; i < kOutputDimensions; ++i) {
@@ -244,14 +244,14 @@ namespace Eval::NNUE {
                 const auto padded_offset = LayerType::kPaddedInputDimensions * i;
                 for (IndexType j = 0; j < kInputDimensions; ++j) {
                     target_layer_->weights_[padded_offset + j] =
-                        Round<typename LayerType::WeightType>(
+                        round<typename LayerType::WeightType>(
                             weights_[offset + j] * kWeightScale);
                 }
             }
         }
 
         // read parameterized integer
-        void DequantizeParameters() {
+        void dequantize_parameters() {
             for (IndexType i = 0; i < kOutputDimensions; ++i) {
                 biases_[i] = static_cast<LearnFloatType>(
                     target_layer_->biases_[i] / kBiasScale);
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index 902c2747..35503493 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -19,7 +19,7 @@ namespace Eval::NNUE {
 
     public:
         // factory function
-        static std::shared_ptr<Trainer> Create(
+        static std::shared_ptr<Trainer> create(
             LayerType* target_layer, FeatureTransformer* ft) {
 
             return std::shared_ptr<Trainer>(
@@ -27,27 +27,27 @@ namespace Eval::NNUE {
         }
 
         // Set options such as hyperparameters
-        void SendMessage(Message* message) {
-            previous_layer_trainer_->SendMessage(message);
-            if (ReceiveMessage("check_health", message)) {
-                CheckHealth();
+        void send_message(Message* message) {
+            previous_layer_trainer_->send_message(message);
+            if (receive_message("check_health", message)) {
+                check_health();
             }
         }
 
         // Initialize the parameters with random numbers
         template <typename RNG>
-        void Initialize(RNG& rng) {
-            previous_layer_trainer_->Initialize(rng);
+        void initialize(RNG& rng) {
+            previous_layer_trainer_->initialize(rng);
         }
 
         // forward propagation
-        const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+        const LearnFloatType* propagate(const std::vector<Example>& batch) {
             if (output_.size() < kOutputDimensions * batch.size()) {
               output_.resize(kOutputDimensions * batch.size());
               gradients_.resize(kInputDimensions * batch.size());
             }
 
-            const auto input = previous_layer_trainer_->Propagate(batch);
+            const auto input = previous_layer_trainer_->propagate(batch);
             batch_size_ = static_cast<IndexType>(batch.size());
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
@@ -63,7 +63,7 @@ namespace Eval::NNUE {
         }
 
         // backpropagation
-        void Backpropagate(const LearnFloatType* gradients,
+        void backpropagate(const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {
 
             for (IndexType b = 0; b < batch_size_; ++b) {
@@ -75,14 +75,14 @@ namespace Eval::NNUE {
                 }
             }
 
-            previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate);
+            previous_layer_trainer_->backpropagate(gradients_.data(), learning_rate);
         }
 
     private:
         // constructor
         Trainer(LayerType* target_layer, FeatureTransformer* ft) :
             batch_size_(0),
-            previous_layer_trainer_(Trainer<PreviousLayer>::Create(
+            previous_layer_trainer_(Trainer<PreviousLayer>::create(
                 &target_layer->previous_layer_, ft)),
             target_layer_(target_layer) {
 
@@ -93,7 +93,7 @@ namespace Eval::NNUE {
         }
 
         // Check if there are any problems with learning
-        void CheckHealth() {
+        void check_health() {
             const auto largest_min_activation = *std::max_element(
                 std::begin(min_activations_), std::end(min_activations_));
             const auto smallest_max_activation = *std::min_element(
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 4173f46d..a3d6c16a 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -34,44 +34,44 @@ namespace Eval::NNUE {
         friend struct AlignedDeleter;
 
         template <typename T, typename... ArgumentTypes>
-        friend std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments);
+        friend std::shared_ptr<T> make_aligned_shared_ptr(ArgumentTypes&&... arguments);
 
         // factory function
-        static std::shared_ptr<Trainer> Create(LayerType* target_layer) {
-            return MakeAlignedSharedPtr<Trainer>(target_layer);
+        static std::shared_ptr<Trainer> create(LayerType* target_layer) {
+            return make_aligned_shared_ptr<Trainer>(target_layer);
         }
 
         // Set options such as hyperparameters
-        void SendMessage(Message* message) {
-            if (ReceiveMessage("momentum", message)) {
+        void send_message(Message* message) {
+            if (receive_message("momentum", message)) {
                 momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
             }
 
-            if (ReceiveMessage("learning_rate_scale", message)) {
+            if (receive_message("learning_rate_scale", message)) {
                 learning_rate_scale_ =
                     static_cast<LearnFloatType>(std::stod(message->value));
             }
 
-            if (ReceiveMessage("reset", message)) {
-                DequantizeParameters();
+            if (receive_message("reset", message)) {
+                dequantize_parameters();
             }
 
-            if (ReceiveMessage("quantize_parameters", message)) {
-                QuantizeParameters();
+            if (receive_message("quantize_parameters", message)) {
+                quantize_parameters();
             }
 
-            if (ReceiveMessage("clear_unobserved_feature_weights", message)) {
-                ClearUnobservedFeatureWeights();
+            if (receive_message("clear_unobserved_feature_weights", message)) {
+                clear_unobserved_feature_weights();
             }
 
-            if (ReceiveMessage("check_health", message)) {
-                CheckHealth();
+            if (receive_message("check_health", message)) {
+                check_health();
             }
         }
 
         // Initialize the parameters with random numbers
         template <typename RNG>
-        void Initialize(RNG& rng) {
+        void initialize(RNG& rng) {
             std::fill(std::begin(weights_), std::end(weights_), +kZero);
 
             const double kSigma = 0.1 / std::sqrt(RawFeatures::kMaxActiveDimensions);
@@ -86,11 +86,11 @@ namespace Eval::NNUE {
                 biases_[i] = static_cast<LearnFloatType>(0.5);
             }
 
-            QuantizeParameters();
+            quantize_parameters();
         }
 
         // forward propagation
-        const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+        const LearnFloatType* propagate(const std::vector<Example>& batch) {
             if (output_.size() < kOutputDimensions * batch.size()) {
                 output_.resize(kOutputDimensions * batch.size());
                 gradients_.resize(kOutputDimensions * batch.size());
@@ -106,8 +106,8 @@ namespace Eval::NNUE {
 #if defined(USE_BLAS)
                     cblas_scopy(kHalfDimensions, biases_, 1, &output_[output_offset], 1);
                     for (const auto& feature : batch[b].training_features[c]) {
-                        const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
-                        cblas_saxpy(kHalfDimensions, (float)feature.GetCount(),
+                        const IndexType weights_offset = kHalfDimensions * feature.get_index();
+                        cblas_saxpy(kHalfDimensions, (float)feature.get_count(),
                                     &weights_[weights_offset], 1, &output_[output_offset], 1);
                     }
 #else
@@ -115,10 +115,10 @@ namespace Eval::NNUE {
                         output_[output_offset + i] = biases_[i];
                     }
                     for (const auto& feature : batch[b].training_features[c]) {
-                        const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
+                        const IndexType weights_offset = kHalfDimensions * feature.get_index();
                         for (IndexType i = 0; i < kHalfDimensions; ++i) {
                             output_[output_offset + i] +=
-                                feature.GetCount() * weights_[weights_offset + i];
+                                feature.get_count() * weights_[weights_offset + i];
                         }
                     }
 #endif
@@ -143,7 +143,7 @@ namespace Eval::NNUE {
         }
 
         // backpropagation
-        void Backpropagate(const LearnFloatType* gradients,
+        void backpropagate(const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {
 
             const LearnFloatType local_learning_rate =
@@ -188,13 +188,13 @@ namespace Eval::NNUE {
                         const IndexType output_offset = batch_offset + kHalfDimensions * c;
                         for (const auto& feature : (*batch_)[b].training_features[c]) {
 #if defined(_OPENMP)
-                            if (feature.GetIndex() % num_threads != thread_index)
+                            if (feature.get_index() % num_threads != thread_index)
                                 continue;
 #endif
                             const IndexType weights_offset =
-                                kHalfDimensions * feature.GetIndex();
+                                kHalfDimensions * feature.get_index();
                             const auto scale = static_cast<LearnFloatType>(
-                                effective_learning_rate / feature.GetCount());
+                                effective_learning_rate / feature.get_count());
 
                             cblas_saxpy(kHalfDimensions, -scale,
                                         &gradients_[output_offset], 1,
@@ -228,9 +228,9 @@ namespace Eval::NNUE {
                 for (IndexType c = 0; c < 2; ++c) {
                     const IndexType output_offset = batch_offset + kHalfDimensions * c;
                     for (const auto& feature : (*batch_)[b].training_features[c]) {
-                        const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
+                        const IndexType weights_offset = kHalfDimensions * feature.get_index();
                         const auto scale = static_cast<LearnFloatType>(
-                            effective_learning_rate / feature.GetCount());
+                            effective_learning_rate / feature.get_count());
 
                         for (IndexType i = 0; i < kHalfDimensions; ++i) {
                             weights_[weights_offset + i] -=
@@ -244,7 +244,7 @@ namespace Eval::NNUE {
             for (IndexType b = 0; b < batch_->size(); ++b) {
                 for (IndexType c = 0; c < 2; ++c) {
                     for (const auto& feature : (*batch_)[b].training_features[c]) {
-                        observed_features.set(feature.GetIndex());
+                        observed_features.set(feature.get_index());
                     }
                 }
             }
@@ -269,14 +269,14 @@ namespace Eval::NNUE {
             std::fill(std::begin(max_activations_), std::end(max_activations_),
                       std::numeric_limits<LearnFloatType>::lowest());
 
-            DequantizeParameters();
+            dequantize_parameters();
         }
 
         // Weight saturation and parameterization
-        void QuantizeParameters() {
+        void quantize_parameters() {
             for (IndexType i = 0; i < kHalfDimensions; ++i) {
                 target_layer_->biases_[i] =
-                    Round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
+                    round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
             }
 
             std::vector<TrainingFeature> training_features;
@@ -284,23 +284,23 @@ namespace Eval::NNUE {
 #pragma omp parallel for private(training_features)
             for (IndexType j = 0; j < RawFeatures::kDimensions; ++j) {
                 training_features.clear();
-                Features::Factorizer<RawFeatures>::AppendTrainingFeatures(
+                Features::Factorizer<RawFeatures>::append_training_features(
                     j, &training_features);
 
                 for (IndexType i = 0; i < kHalfDimensions; ++i) {
                     double sum = 0.0;
                     for (const auto& feature : training_features) {
-                        sum += weights_[kHalfDimensions * feature.GetIndex() + i];
+                        sum += weights_[kHalfDimensions * feature.get_index() + i];
                     }
 
                     target_layer_->weights_[kHalfDimensions * j + i] =
-                        Round<typename LayerType::WeightType>(sum * kWeightScale);
+                        round<typename LayerType::WeightType>(sum * kWeightScale);
                 }
             }
         }
 
         // read parameterized integer
-        void DequantizeParameters() {
+        void dequantize_parameters() {
             for (IndexType i = 0; i < kHalfDimensions; ++i) {
                 biases_[i] = static_cast<LearnFloatType>(
                     target_layer_->biases_[i] / kBiasScale);
@@ -317,7 +317,7 @@ namespace Eval::NNUE {
         }
 
         // Set the weight corresponding to the feature that does not appear in the learning data to 0
-        void ClearUnobservedFeatureWeights() {
+        void clear_unobserved_feature_weights() {
             for (IndexType i = 0; i < kInputDimensions; ++i) {
                 if (!observed_features.test(i)) {
                     std::fill(std::begin(weights_) + kHalfDimensions * i,
@@ -325,11 +325,11 @@ namespace Eval::NNUE {
                 }
             }
 
-            QuantizeParameters();
+            quantize_parameters();
         }
 
         // Check if there are any problems with learning
-        void CheckHealth() {
+        void check_health() {
             std::cout << "INFO: observed " << observed_features.count()
                       << " (out of " << kInputDimensions << ") features" << std::endl;
 
@@ -359,7 +359,7 @@ namespace Eval::NNUE {
 
         // number of input/output dimensions
         static constexpr IndexType kInputDimensions =
-            Features::Factorizer<RawFeatures>::GetDimensions();
+            Features::Factorizer<RawFeatures>::get_dimensions();
         static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
         static constexpr IndexType kHalfDimensions = LayerType::kHalfDimensions;
 
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index 45dcbacc..43968776 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -14,7 +14,7 @@ namespace Eval::NNUE {
     class SharedInputTrainer {
     public:
         // factory function
-        static std::shared_ptr<SharedInputTrainer> Create(
+        static std::shared_ptr<SharedInputTrainer> create(
             FeatureTransformer* ft) {
 
             static std::shared_ptr<SharedInputTrainer> instance;
@@ -29,10 +29,10 @@ namespace Eval::NNUE {
         }
 
         // Set options such as hyperparameters
-        void SendMessage(Message* message) {
+        void send_message(Message* message) {
             if (num_calls_ == 0) {
                 current_operation_ = Operation::kSendMessage;
-                feature_transformer_trainer_->SendMessage(message);
+                feature_transformer_trainer_->send_message(message);
             }
 
             assert(current_operation_ == Operation::kSendMessage);
@@ -45,10 +45,10 @@ namespace Eval::NNUE {
 
         // Initialize the parameters with random numbers
         template <typename RNG>
-        void Initialize(RNG& rng) {
+        void initialize(RNG& rng) {
             if (num_calls_ == 0) {
                 current_operation_ = Operation::kInitialize;
-                feature_transformer_trainer_->Initialize(rng);
+                feature_transformer_trainer_->initialize(rng);
             }
 
             assert(current_operation_ == Operation::kInitialize);
@@ -60,7 +60,7 @@ namespace Eval::NNUE {
         }
 
         // forward propagation
-        const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+        const LearnFloatType* propagate(const std::vector<Example>& batch) {
             if (gradients_.size() < kInputDimensions * batch.size()) {
                 gradients_.resize(kInputDimensions * batch.size());
             }
@@ -69,7 +69,7 @@ namespace Eval::NNUE {
 
             if (num_calls_ == 0) {
                 current_operation_ = Operation::kPropagate;
-                output_ = feature_transformer_trainer_->Propagate(batch);
+                output_ = feature_transformer_trainer_->propagate(batch);
             }
 
             assert(current_operation_ == Operation::kPropagate);
@@ -83,11 +83,11 @@ namespace Eval::NNUE {
         }
 
         // backpropagation
-        void Backpropagate(const LearnFloatType* gradients,
+        void backpropagate(const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {
 
             if (num_referrers_ == 1) {
-                feature_transformer_trainer_->Backpropagate(gradients, learning_rate);
+                feature_transformer_trainer_->backpropagate(gradients, learning_rate);
                 return;
             }
 
@@ -111,7 +111,7 @@ namespace Eval::NNUE {
             }
 
             if (++num_calls_ == num_referrers_) {
-                feature_transformer_trainer_->Backpropagate(
+                feature_transformer_trainer_->backpropagate(
                     gradients_.data(), learning_rate);
                 num_calls_ = 0;
                 current_operation_ = Operation::kNone;
@@ -125,7 +125,7 @@ namespace Eval::NNUE {
             num_referrers_(0),
             num_calls_(0),
             current_operation_(Operation::kNone),
-            feature_transformer_trainer_(Trainer<FeatureTransformer>::Create(
+            feature_transformer_trainer_(Trainer<FeatureTransformer>::create(
                 ft)),
             output_(nullptr) {
         }
@@ -175,25 +175,25 @@ namespace Eval::NNUE {
 
     public:
         // factory function
-        static std::shared_ptr<Trainer> Create(
+        static std::shared_ptr<Trainer> create(
             LayerType* /*target_layer*/, FeatureTransformer* ft) {
 
             return std::shared_ptr<Trainer>(new Trainer(ft));
         }
 
         // Set options such as hyperparameters
-        void SendMessage(Message* message) {
-            shared_input_trainer_->SendMessage(message);
+        void send_message(Message* message) {
+            shared_input_trainer_->send_message(message);
         }
 
         // Initialize the parameters with random numbers
         template <typename RNG>
-        void Initialize(RNG& rng) {
-            shared_input_trainer_->Initialize(rng);
+        void initialize(RNG& rng) {
+            shared_input_trainer_->initialize(rng);
         }
 
         // forward propagation
-        const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+        const LearnFloatType* propagate(const std::vector<Example>& batch) {
             if (output_.size() < kOutputDimensions * batch.size()) {
               output_.resize(kOutputDimensions * batch.size());
               gradients_.resize(kInputDimensions * batch.size());
@@ -201,7 +201,7 @@ namespace Eval::NNUE {
 
             batch_size_ = static_cast<IndexType>(batch.size());
 
-            const auto input = shared_input_trainer_->Propagate(batch);
+            const auto input = shared_input_trainer_->propagate(batch);
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType input_offset = kInputDimensions * b;
                 const IndexType output_offset = kOutputDimensions * b;
@@ -219,7 +219,7 @@ namespace Eval::NNUE {
         }
 
         // backpropagation
-        void Backpropagate(const LearnFloatType* gradients,
+        void backpropagate(const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {
 
             for (IndexType b = 0; b < batch_size_; ++b) {
@@ -233,14 +233,14 @@ namespace Eval::NNUE {
                     }
                 }
             }
-            shared_input_trainer_->Backpropagate(gradients_.data(), learning_rate);
+            shared_input_trainer_->backpropagate(gradients_.data(), learning_rate);
         }
 
     private:
         // constructor
         Trainer(FeatureTransformer* ft):
             batch_size_(0),
-            shared_input_trainer_(SharedInputTrainer::Create(ft)) {
+            shared_input_trainer_(SharedInputTrainer::create(ft)) {
         }
 
         // number of input/output dimensions
diff --git a/src/nnue/trainer/trainer_sum.h b/src/nnue/trainer/trainer_sum.h
index 24fc6152..c2e40b1c 100644
--- a/src/nnue/trainer/trainer_sum.h
+++ b/src/nnue/trainer/trainer_sum.h
@@ -21,7 +21,7 @@ namespace Eval::NNUE {
 
     public:
         // factory function
-        static std::shared_ptr<Trainer> Create(
+        static std::shared_ptr<Trainer> create(
             LayerType* target_layer, FeatureTransformer* ft) {
 
             return std::shared_ptr<Trainer>(
@@ -29,26 +29,26 @@ namespace Eval::NNUE {
         }
 
         // Set options such as hyperparameters
-        void SendMessage(Message* message) {
+        void send_message(Message* message) {
             // The results of other member functions do not depend on the processing order, so
             // Tail is processed first for the purpose of simplifying the implementation, but
             // SendMessage processes Head first to make it easier to understand subscript correspondence
-            previous_layer_trainer_->SendMessage(message);
-            Tail::SendMessage(message);
+            previous_layer_trainer_->send_message(message);
+            Tail::send_message(message);
         }
 
         // Initialize the parameters with random numbers
         template <typename RNG>
-        void Initialize(RNG& rng) {
-            Tail::Initialize(rng);
-            previous_layer_trainer_->Initialize(rng);
+        void initialize(RNG& rng) {
+            Tail::initialize(rng);
+            previous_layer_trainer_->initialize(rng);
         }
 
         // forward propagation
-        /*const*/ LearnFloatType* Propagate(const std::vector<Example>& batch) {
+        /*const*/ LearnFloatType* propagate(const std::vector<Example>& batch) {
             batch_size_ = static_cast<IndexType>(batch.size());
-            auto output = Tail::Propagate(batch);
-            const auto head_output = previous_layer_trainer_->Propagate(batch);
+            auto output = Tail::propagate(batch);
+            const auto head_output = previous_layer_trainer_->propagate(batch);
 
 #if defined(USE_BLAS)
             cblas_saxpy(kOutputDimensions * batch_size_, 1.0,
@@ -66,11 +66,11 @@ namespace Eval::NNUE {
         }
 
         // backpropagation
-        void Backpropagate(const LearnFloatType* gradients,
+        void backpropagate(const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {
 
-            Tail::Backpropagate(gradients, learning_rate);
-            previous_layer_trainer_->Backpropagate(gradients, learning_rate);
+            Tail::backpropagate(gradients, learning_rate);
+            previous_layer_trainer_->backpropagate(gradients, learning_rate);
         }
 
     private:
@@ -78,7 +78,7 @@ namespace Eval::NNUE {
         Trainer(LayerType* target_layer, FeatureTransformer* ft):
             Tail(target_layer, ft),
             batch_size_(0),
-            previous_layer_trainer_(Trainer<FirstPreviousLayer>::Create(
+            previous_layer_trainer_(Trainer<FirstPreviousLayer>::create(
                 &target_layer->previous_layer_, ft)),
             target_layer_(target_layer) {
         }
@@ -110,7 +110,7 @@ namespace Eval::NNUE {
 
     public:
         // factory function
-        static std::shared_ptr<Trainer> Create(
+        static std::shared_ptr<Trainer> create(
             LayerType* target_layer, FeatureTransformer* ft) {
 
             return std::shared_ptr<Trainer>(
@@ -118,24 +118,24 @@ namespace Eval::NNUE {
         }
 
         // Set options such as hyperparameters
-        void SendMessage(Message* message) {
-            previous_layer_trainer_->SendMessage(message);
+        void send_message(Message* message) {
+            previous_layer_trainer_->send_message(message);
         }
 
         // Initialize the parameters with random numbers
         template <typename RNG>
-        void Initialize(RNG& rng) {
-            previous_layer_trainer_->Initialize(rng);
+        void initialize(RNG& rng) {
+            previous_layer_trainer_->initialize(rng);
         }
 
         // forward propagation
-        /*const*/ LearnFloatType* Propagate(const std::vector<Example>& batch) {
+        /*const*/ LearnFloatType* propagate(const std::vector<Example>& batch) {
             if (output_.size() < kOutputDimensions * batch.size()) {
                 output_.resize(kOutputDimensions * batch.size());
             }
 
             batch_size_ = static_cast<IndexType>(batch.size());
-            const auto output = previous_layer_trainer_->Propagate(batch);
+            const auto output = previous_layer_trainer_->propagate(batch);
 
 #if defined(USE_BLAS)
             cblas_scopy(kOutputDimensions * batch_size_, output, 1, &output_[0], 1);
@@ -152,17 +152,17 @@ namespace Eval::NNUE {
         }
 
         // backpropagation
-        void Backpropagate(const LearnFloatType* gradients,
+        void backpropagate(const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {
 
-            previous_layer_trainer_->Backpropagate(gradients, learning_rate);
+            previous_layer_trainer_->backpropagate(gradients, learning_rate);
         }
 
     private:
         // constructor
         Trainer(LayerType* target_layer, FeatureTransformer* ft) :
             batch_size_(0),
-            previous_layer_trainer_(Trainer<PreviousLayer>::Create(
+            previous_layer_trainer_(Trainer<PreviousLayer>::create(
                 &target_layer->previous_layer_, ft)),
             target_layer_(target_layer) {
         }
diff --git a/src/uci.cpp b/src/uci.cpp
index 896f6db8..b5a0524c 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -53,7 +53,7 @@ void test_cmd(Position& pos, istringstream& is)
     std::string param;
     is >> param;
 
-    if (param == "nnue") Eval::NNUE::TestCommand(pos, is);
+    if (param == "nnue") Eval::NNUE::test_command(pos, is);
 }
 
 namespace {

From 5188c26b2081740fc668aced2a544822a3ce479b Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 17 Oct 2020 23:26:29 +0200
Subject: [PATCH 347/583] Allow execution of tasks on the global thread pool.

---
 src/thread.cpp | 27 +++++++++++++++++++++++++--
 src/thread.h   |  6 ++++++
 src/uci.cpp    |  6 ++++++
 3 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/src/thread.cpp b/src/thread.cpp
index c81ac43d..e4226769 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -80,6 +80,13 @@ void Thread::start_searching() {
   cv.notify_one(); // Wake up the thread in idle_loop()
 }
 
+void Thread::execute_task(std::function<void(Thread&)> t)
+{
+  std::lock_guard<std::mutex> lk(mutex);
+  task = std::move(t);
+  cv.notify_one(); // Wake up the thread in idle_loop()
+}
+
 
 /// Thread::wait_for_search_finished() blocks on the condition variable
 /// until the thread has finished searching.
@@ -109,14 +116,22 @@ void Thread::idle_loop() {
       std::unique_lock<std::mutex> lk(mutex);
       searching = false;
       cv.notify_one(); // Wake up anyone waiting for search finished
-      cv.wait(lk, [&]{ return searching; });
+      cv.wait(lk, [&]{ return searching || task; });
 
       if (exit)
           return;
 
       lk.unlock();
 
-      search();
+      if (task)
+      {
+        task(*this);
+        task = nullptr;
+      }
+      else
+      {
+        search();
+      }
   }
 }
 
@@ -162,6 +177,14 @@ void ThreadPool::clear() {
 }
 
 
+void ThreadPool::execute_parallel(std::function<void(Thread&)> task)
+{
+  for(Thread* th : *this)
+  {
+    th->execute_task(task);
+  }
+}
+
 /// ThreadPool::start_thinking() wakes up main thread waiting in idle_loop() and
 /// returns immediately. Main thread will wake up other threads and start the search.
 
diff --git a/src/thread.h b/src/thread.h
index 501a6042..8e9e6fba 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -24,6 +24,7 @@
 #include <mutex>
 #include <thread>
 #include <vector>
+#include <functional>
 
 #include "material.h"
 #include "movepick.h"
@@ -50,10 +51,12 @@ public:
   explicit Thread(size_t);
   virtual ~Thread();
   virtual void search();
+  virtual void execute_task(std::function<void(Thread&)> t);
   void clear();
   void idle_loop();
   void start_searching();
   void wait_for_search_finished();
+  size_t thread_idx() const { return idx; }
 
   Pawns::Table pawnsTable;
   Material::Table materialTable;
@@ -78,6 +81,7 @@ public:
   bool UseRule50;
   Depth ProbeDepth;
 
+  std::function<void(Thread&)> task;
 };
 
 
@@ -105,6 +109,8 @@ struct MainThread : public Thread {
 
 struct ThreadPool : public std::vector<Thread*> {
 
+  void execute_parallel(std::function<void(Thread&)> task);
+
   void start_thinking(Position&, StateListPtr&, const Search::LimitsType&, bool = false);
   void clear();
   void set(size_t);
diff --git a/src/uci.cpp b/src/uci.cpp
index b5a0524c..1aa9f95e 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -345,6 +345,12 @@ void UCI::loop(int argc, char* argv[]) {
       // Command to call qsearch(),search() directly for testing
       else if (token == "qsearch") qsearch_cmd(pos);
       else if (token == "search") search_cmd(pos, is);
+      else if (token == "tasktest")
+      {
+        Threads.execute_parallel([](auto& th) {
+          std::cout << th.thread_idx() << '\n';
+        });
+      }
 
       // test command
       else if (token == "test") test_cmd(pos, is);

From 97fb9a89e46f485c64c55d585981c46f032c81d0 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 18 Oct 2020 10:34:48 +0200
Subject: [PATCH 348/583] allow waiting for task completion.

---
 src/thread.cpp | 13 +++++++++++++
 src/thread.h   |  2 ++
 2 files changed, 15 insertions(+)

diff --git a/src/thread.cpp b/src/thread.cpp
index e4226769..874b09ee 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -98,6 +98,12 @@ void Thread::wait_for_search_finished() {
 }
 
 
+void Thread::wait_for_task_finished() {
+
+  std::unique_lock<std::mutex> lk(mutex);
+  cv.wait(lk, [&]{ return !task; });
+}
+
 /// Thread::idle_loop() is where the thread is parked, blocked on the
 /// condition variable, when it has no work to do.
 
@@ -293,3 +299,10 @@ void ThreadPool::wait_for_search_finished() const {
         if (th != front())
             th->wait_for_search_finished();
 }
+
+
+void ThreadPool::wait_for_tasks_finished() const {
+
+    for (Thread* th : *this)
+        th->wait_for_task_finished();
+}
diff --git a/src/thread.h b/src/thread.h
index 8e9e6fba..8be6eb5a 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -56,6 +56,7 @@ public:
   void idle_loop();
   void start_searching();
   void wait_for_search_finished();
+  void wait_for_task_finished();
   size_t thread_idx() const { return idx; }
 
   Pawns::Table pawnsTable;
@@ -121,6 +122,7 @@ struct ThreadPool : public std::vector<Thread*> {
   Thread* get_best_thread() const;
   void start_searching();
   void wait_for_search_finished() const;
+  void wait_for_tasks_finished() const;
 
   std::atomic_bool stop, increaseDepth;
 

From fd229c0768d80e7a71353e044556bbf74dd5c145 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 18 Oct 2020 22:35:16 +0200
Subject: [PATCH 349/583] Fix races and UBs

---
 src/thread.cpp | 28 ++++++++++++++++------------
 src/thread.h   | 11 +++++------
 2 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/src/thread.cpp b/src/thread.cpp
index 874b09ee..2ecd167a 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -35,6 +35,7 @@ ThreadPool Threads; // Global object
 Thread::Thread(size_t n) : idx(n), stdThread(&Thread::idle_loop, this) {
 
   wait_for_search_finished();
+  wait_for_worker_finished();
 }
 
 
@@ -80,10 +81,11 @@ void Thread::start_searching() {
   cv.notify_one(); // Wake up the thread in idle_loop()
 }
 
-void Thread::execute_task(std::function<void(Thread&)> t)
+void Thread::execute_with_worker(std::function<void(Thread&)> t)
 {
   std::lock_guard<std::mutex> lk(mutex);
-  task = std::move(t);
+  worker = std::move(t);
+  searching = true;
   cv.notify_one(); // Wake up the thread in idle_loop()
 }
 
@@ -98,10 +100,10 @@ void Thread::wait_for_search_finished() {
 }
 
 
-void Thread::wait_for_task_finished() {
+void Thread::wait_for_worker_finished() {
 
   std::unique_lock<std::mutex> lk(mutex);
-  cv.wait(lk, [&]{ return !task; });
+  cv.wait(lk, [&]{ return !searching; });
 }
 
 /// Thread::idle_loop() is where the thread is parked, blocked on the
@@ -121,18 +123,20 @@ void Thread::idle_loop() {
   {
       std::unique_lock<std::mutex> lk(mutex);
       searching = false;
+      worker = nullptr;
       cv.notify_one(); // Wake up anyone waiting for search finished
-      cv.wait(lk, [&]{ return searching || task; });
+      cv.wait(lk, [&]{ return searching; });
 
       if (exit)
           return;
 
+      auto wrk = std::move(worker);
+
       lk.unlock();
 
-      if (task)
+      if (wrk)
       {
-        task(*this);
-        task = nullptr;
+        wrk(*this);
       }
       else
       {
@@ -183,11 +187,11 @@ void ThreadPool::clear() {
 }
 
 
-void ThreadPool::execute_parallel(std::function<void(Thread&)> task)
+void ThreadPool::execute_with_workers(std::function<void(Thread&)> worker)
 {
   for(Thread* th : *this)
   {
-    th->execute_task(task);
+    th->execute_with_worker(std::move(worker));
   }
 }
 
@@ -301,8 +305,8 @@ void ThreadPool::wait_for_search_finished() const {
 }
 
 
-void ThreadPool::wait_for_tasks_finished() const {
+void ThreadPool::wait_for_workers_finished() const {
 
     for (Thread* th : *this)
-        th->wait_for_task_finished();
+        th->wait_for_worker_finished();
 }
diff --git a/src/thread.h b/src/thread.h
index 8be6eb5a..7474ea44 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -45,18 +45,19 @@ class Thread {
   std::condition_variable cv;
   size_t idx;
   bool exit = false, searching = true; // Set before starting std::thread
+  std::function<void(Thread&)> worker;
   NativeThread stdThread;
 
 public:
   explicit Thread(size_t);
   virtual ~Thread();
   virtual void search();
-  virtual void execute_task(std::function<void(Thread&)> t);
+  virtual void execute_with_worker(std::function<void(Thread&)> t);
   void clear();
   void idle_loop();
   void start_searching();
   void wait_for_search_finished();
-  void wait_for_task_finished();
+  void wait_for_worker_finished();
   size_t thread_idx() const { return idx; }
 
   Pawns::Table pawnsTable;
@@ -81,8 +82,6 @@ public:
   int Cardinality;
   bool UseRule50;
   Depth ProbeDepth;
-
-  std::function<void(Thread&)> task;
 };
 
 
@@ -110,7 +109,7 @@ struct MainThread : public Thread {
 
 struct ThreadPool : public std::vector<Thread*> {
 
-  void execute_parallel(std::function<void(Thread&)> task);
+  void execute_with_workers(std::function<void(Thread&)> worker);
 
   void start_thinking(Position&, StateListPtr&, const Search::LimitsType&, bool = false);
   void clear();
@@ -122,7 +121,7 @@ struct ThreadPool : public std::vector<Thread*> {
   Thread* get_best_thread() const;
   void start_searching();
   void wait_for_search_finished() const;
-  void wait_for_tasks_finished() const;
+  void wait_for_workers_finished() const;
 
   std::atomic_bool stop, increaseDepth;
 

From 71862e2ebbf91527e0ca18ae44757425797b1f9e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 19 Oct 2020 13:27:39 +0200
Subject: [PATCH 350/583] remove incorrect move in execute_with_workers

---
 src/thread.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/thread.cpp b/src/thread.cpp
index 2ecd167a..72333078 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -191,7 +191,7 @@ void ThreadPool::execute_with_workers(std::function<void(Thread&)> worker)
 {
   for(Thread* th : *this)
   {
-    th->execute_with_worker(std::move(worker));
+    th->execute_with_worker(worker);
   }
 }
 

From 74af28763718258f250500dfd19b5d68c12339b8 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 19 Oct 2020 15:27:33 +0200
Subject: [PATCH 351/583] Fix execute_with_workers test call in uci

---
 src/uci.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/uci.cpp b/src/uci.cpp
index 1aa9f95e..b05c7eeb 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -347,7 +347,7 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "search") search_cmd(pos, is);
       else if (token == "tasktest")
       {
-        Threads.execute_parallel([](auto& th) {
+        Threads.execute_with_workers([](auto& th) {
           std::cout << th.thread_idx() << '\n';
         });
       }

From f2ad307de313d18c56b147f8a682971cd8ca088a Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Tue, 20 Oct 2020 10:50:59 +0200
Subject: [PATCH 352/583] Clarify the behaviour of execute_with_worker[s]

---
 src/thread.cpp |  3 +--
 src/thread.h   | 11 ++++++++++-
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/thread.cpp b/src/thread.cpp
index 72333078..e867048d 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -186,8 +186,7 @@ void ThreadPool::clear() {
   main()->previousTimeReduction = 1.0;
 }
 
-
-void ThreadPool::execute_with_workers(std::function<void(Thread&)> worker)
+void ThreadPool::execute_with_workers(const std::function<void(Thread&)>& worker)
 {
   for(Thread* th : *this)
   {
diff --git a/src/thread.h b/src/thread.h
index 7474ea44..c0a01770 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -52,7 +52,13 @@ public:
   explicit Thread(size_t);
   virtual ~Thread();
   virtual void search();
+
+  // The function object to be executed is taken by value to remove
+  // the need for separate lvalue and rvalue overloads.
+  // The worker thread needs to have ownership of the task
+  // to be executed because otherwise there's no way to manage its lifetime.
   virtual void execute_with_worker(std::function<void(Thread&)> t);
+
   void clear();
   void idle_loop();
   void start_searching();
@@ -109,7 +115,10 @@ struct MainThread : public Thread {
 
 struct ThreadPool : public std::vector<Thread*> {
 
-  void execute_with_workers(std::function<void(Thread&)> worker);
+  // Each thread gets its own copy of the `worker` function object.
+  // This means that each worker thread will have exclusive access
+  // to the state of the `worker` function object.
+  void execute_with_workers(const std::function<void(Thread&)>& worker);
 
   void start_thinking(Position&, StateListPtr&, const Search::LimitsType&, bool = false);
   void clear();

From ff06d1e0ad571a5d6f12de6b1b0f7b0a354d05d8 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 18 Oct 2020 11:23:58 +0200
Subject: [PATCH 353/583] Rewrite learner to be based on stockfish's thread
 pool. Reduce coupling along the way

---
 src/learn/learn.cpp | 1013 ++++++++++++++++++++-----------------------
 src/misc.h          |   12 +
 2 files changed, 472 insertions(+), 553 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index dfbba391..411e0016 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -20,7 +20,6 @@
 #include "learn.h"
 
 #include "convert.h"
-#include "multi_think.h"
 #include "sfen_stream.h"
 
 #include "misc.h"
@@ -95,6 +94,68 @@ namespace Learner
     // Using stockfish's WDL with win rate model instead of sigmoid
     static bool use_wdl = false;
 
+    namespace Detail {
+        template <bool AtomicV>
+        struct Loss
+        {
+            using T =
+                std::conditional_t<
+                    AtomicV,
+                    atomic<double>,
+                    double
+                >;
+
+            T cross_entropy_eval{0.0};
+            T cross_entropy_win{0.0};
+            T cross_entropy{0.0};
+            T entropy_eval{0.0};
+            T entropy_win{0.0};
+            T entropy{0.0};
+            T count{0.0};
+
+            template <bool OtherAtomicV>
+            Loss& operator += (const Loss<OtherAtomicV>& rhs)
+            {
+                cross_entropy_eval += rhs.cross_entropy_eval;
+                cross_entropy_win += rhs.cross_entropy_win;
+                cross_entropy += rhs.cross_entropy;
+                entropy_eval += rhs.entropy_eval;
+                entropy_win += rhs.entropy_win;
+                entropy += rhs.entropy;
+                count += rhs.count;
+
+                return *this;
+            }
+
+            void reset()
+            {
+                cross_entropy_eval = 0.0;
+                cross_entropy_win = 0.0;
+                cross_entropy = 0.0;
+                entropy_eval = 0.0;
+                entropy_win = 0.0;
+                entropy = 0.0;
+                count = 0.0;
+            }
+
+            void print(const std::string& prefix, ostream& s) const
+            {
+                s
+                    << "INFO: "
+                    << prefix << "_cross_entropy_eval = " << cross_entropy_eval / count
+                    << " , " << prefix << "_cross_entropy_win = " << cross_entropy_win / count
+                    << " , " << prefix << "_entropy_eval = " << entropy_eval / count
+                    << " , " << prefix << "_entropy_win = " << entropy_win / count
+                    << " , " << prefix << "_cross_entropy = " << cross_entropy / count
+                    << " , " << prefix << "_entropy = " << entropy / count
+                    << endl;
+            }
+        };
+    }
+
+    using Loss = Detail::Loss<false>;
+    using AtomicLoss = Detail::Loss<true>;
+
     // A function that converts the evaluation value to the winning rate [0,1]
     double winning_percentage(double value)
     {
@@ -243,16 +304,10 @@ namespace Learner
     // The individual cross entropy of the win/loss term and win
     // rate term of the elmo expression is returned
     // to the arguments cross_entropy_eval and cross_entropy_win.
-    void calc_cross_entropy(
+    Loss calc_cross_entropy(
         Value teacher_signal,
         Value shallow,
-        const PackedSfenValue& psv,
-        double& cross_entropy_eval,
-        double& cross_entropy_win,
-        double& cross_entropy,
-        double& entropy_eval,
-        double& entropy_win,
-        double& entropy)
+        const PackedSfenValue& psv)
     {
         // Teacher winning probability.
         const double q = winning_percentage(shallow, psv.gamePly);
@@ -264,19 +319,25 @@ namespace Learner
 
         const double m = (1.0 - lambda) * t + lambda * p;
 
-        cross_entropy_eval =
+        Loss loss{};
+
+        loss.cross_entropy_eval =
             (-p * std::log(q + epsilon) - (1.0 - p) * std::log(1.0 - q + epsilon));
-        cross_entropy_win =
+        loss.cross_entropy_win =
             (-t * std::log(q + epsilon) - (1.0 - t) * std::log(1.0 - q + epsilon));
-        entropy_eval =
+        loss.entropy_eval =
             (-p * std::log(p + epsilon) - (1.0 - p) * std::log(1.0 - p + epsilon));
-        entropy_win =
+        loss.entropy_win =
             (-t * std::log(t + epsilon) - (1.0 - t) * std::log(1.0 - t + epsilon));
 
-        cross_entropy =
+        loss.cross_entropy =
             (-m * std::log(q + epsilon) - (1.0 - m) * std::log(1.0 - q + epsilon));
-        entropy =
+        loss.entropy =
             (-m * std::log(m + epsilon) - (1.0 - m) * std::log(1.0 - m + epsilon));
+
+        loss.count = 1;
+
+        return loss;
     }
 
     // Other objective functions may be considered in the future...
@@ -288,12 +349,6 @@ namespace Learner
     // Sfen reader
     struct SfenReader
     {
-        // Number of phases used for calculation such as mse
-        // mini-batch size = 1M is standard, so 0.2% of that should be negligible in terms of time.
-        // Since search() is performed with depth = 1 in calculation of
-        // move match rate, simple comparison is not possible...
-        static constexpr uint64_t sfen_for_mse_size = 2000;
-
         // Number of phases buffered by each thread 0.1M phases. 4M phase at 40HT
         static constexpr size_t THREAD_BUFFER_SIZE = 10 * 1000;
 
@@ -303,11 +358,6 @@ namespace Learner
         // SFEN_READ_SIZE is a multiple of THREAD_BUFFER_SIZE.
         static constexpr const size_t SFEN_READ_SIZE = LEARN_SFEN_READ_SIZE;
 
-        // hash to limit the reading of the same situation
-        // Is there too many 64 million phases? Or Not really..
-        // It must be 2**N because it will be used as the mask to calculate hash_index.
-        static constexpr uint64_t READ_SFEN_HASH_SIZE = 64 * 1024 * 1024;
-
         // Do not use std::random_device().
         // Because it always the same integers on MinGW.
         SfenReader(int thread_num, const std::string& seed) :
@@ -315,15 +365,9 @@ namespace Learner
         {
             packed_sfens.resize(thread_num);
             total_read = 0;
-            total_done = 0;
-            last_done = 0;
-            next_update_weights = 0;
-            save_count = 0;
             end_of_files = false;
-            no_shuffle = false;
+            shuffle = true;
             stop_flag = false;
-
-            hash.resize(READ_SFEN_HASH_SIZE);
         }
 
         ~SfenReader()
@@ -333,30 +377,30 @@ namespace Learner
         }
 
         // Load the phase for calculation such as mse.
-        void read_for_mse()
+        PSVector read_for_mse(uint64_t count)
         {
-            auto th = Threads.main();
-            Position& pos = th->rootPos;
-            for (uint64_t i = 0; i < sfen_for_mse_size; ++i)
+            PSVector sfen_for_mse;
+            sfen_for_mse.reserve(count);
+
+            for (uint64_t i = 0; i < count; ++i)
             {
                 PackedSfenValue ps;
                 if (!read_to_thread_buffer(0, ps))
                 {
                     cout << "Error! read packed sfen , failed." << endl;
-                    break;
+                    return sfen_for_mse;
                 }
 
                 sfen_for_mse.push_back(ps);
-
-                // Get the hash key.
-                StateInfo si;
-                pos.set_from_packed_sfen(ps.sfen, &si, th);
-                sfen_for_mse_hash.insert(pos.key());
             }
+
+            return sfen_for_mse;
         }
 
-        void read_validation_set(const string& file_name, int eval_limit)
+        PSVector read_validation_set(const string& file_name, int eval_limit)
         {
+            PSVector sfen_for_mse;
+
             auto input = open_sfen_input_file(file_name);
 
             while(!input->eof())
@@ -379,6 +423,8 @@ namespace Learner
                     break;
                 }
             }
+
+            return sfen_for_mse;
         }
 
         // [ASYNC] Thread returns one aspect. Otherwise returns false.
@@ -465,8 +511,8 @@ namespace Learner
                         return false;
 
                     // Get the next file name.
-                    string filename = filenames.back();
-                    filenames.pop_back();
+                    string filename = filenames.front();
+                    filenames.pop_front();
 
                     sfen_input_stream = open_sfen_input_file(filename);
                     cout << "open filename = " << filename << endl;
@@ -515,7 +561,7 @@ namespace Learner
                 }
 
                 // Shuffle the read phase data.
-                if (!no_shuffle)
+                if (shuffle)
                 {
                     Algo::shuffle(sfens, prng);
                 }
@@ -553,45 +599,37 @@ namespace Learner
             }
         }
 
-        // Determine if it is a phase for calculating rmse.
-        // (The computational aspects of rmse should not be used for learning.)
-        bool is_for_rmse(Key key) const
+        void stop()
         {
-            return sfen_for_mse_hash.count(key) != 0;
+            stop_flag = true;
         }
 
-        // sfen files
-        vector<string> filenames;
+        void set_do_shuffle(bool v)
+        {
+            shuffle = v;
+        }
 
-        // number of phases read (file to memory buffer)
-        atomic<uint64_t> total_read;
-
-        // number of processed phases
-        atomic<uint64_t> total_done;
-
-        // number of cases processed so far
-        uint64_t last_done;
-
-        // If total_read exceeds this value, update_weights() and calculate mse.
-        std::atomic<uint64_t> next_update_weights;
-
-        uint64_t save_count;
-
-        // Do not shuffle when reading the phase.
-        bool no_shuffle;
-
-        std::atomic<bool> stop_flag;
-
-        vector<Key> hash;
-
-        // test phase for mse calculation
-        PSVector sfen_for_mse;
+        void add_file(const std::string& filename)
+        {
+            filenames.push_back(filename);
+        }
 
     protected:
 
         // worker thread reading file in background
         std::thread file_worker_thread;
 
+        // sfen files
+        deque<string> filenames;
+
+        std::atomic<bool> stop_flag;
+
+        // number of phases read (file to memory buffer)
+        atomic<uint64_t> total_read;
+
+        // Do not shuffle when reading the phase.
+        bool shuffle;
+
         // Random number to shuffle when reading the phase
         PRNG prng;
 
@@ -612,27 +650,25 @@ namespace Learner
         // Each worker thread fills its own packed_sfens[thread_id] from here.
         // * Lock and access the mutex.
         std::list<std::unique_ptr<PSVector>> packed_sfens_pool;
-
-        // Hold the hash key so that the mse calculation phase is not used for learning.
-        std::unordered_set<Key> sfen_for_mse_hash;
     };
 
     // Class to generate sfen with multiple threads
-    struct LearnerThink : public MultiThink
+    struct LearnerThink
     {
-        LearnerThink(SfenReader& sr_, const std::string& seed) :
-            MultiThink(seed),
-            sr(sr_),
-            stop_flag(false),
-            save_only_once(false)
-        {
-            learn_sum_cross_entropy_eval = 0.0;
-            learn_sum_cross_entropy_win = 0.0;
-            learn_sum_cross_entropy = 0.0;
-            learn_sum_entropy_eval = 0.0;
-            learn_sum_entropy_win = 0.0;
-            learn_sum_entropy = 0.0;
+        // Number of phases used for calculation such as mse
+        // mini-batch size = 1M is standard, so 0.2% of that should be negligible in terms of time.
+        // Since search() is performed with depth = 1 in calculation of
+        // move match rate, simple comparison is not possible...
+        static constexpr uint64_t sfen_for_mse_size = 2000;
 
+        LearnerThink(uint64_t thread_num, const std::string& seed) :
+            prng(seed),
+            sr(thread_num, std::to_string(prng.next_random_seed())),
+            learn_loss_sum{}
+        {
+            save_only_once = false;
+            save_count = 0;
+            loss_output_count = 0;
             newbob_decay = 1.0;
             newbob_num_trials = 2;
             auto_lr_drop = 0;
@@ -640,32 +676,27 @@ namespace Learner
             best_loss = std::numeric_limits<double>::infinity();
             latest_loss_sum = 0.0;
             latest_loss_count = 0;
+            total_done = 0;
         }
 
-        virtual void thread_worker(size_t thread_id);
-
-        // Start a thread that loads the phase file in the background.
-        void start_file_read_worker()
+        void set_do_shuffle(bool v)
         {
-            sr.start_file_read_worker();
+            sr.set_do_shuffle(v);
         }
 
-        Value get_shallow_value(Position& task_pos);
+        void add_file(const std::string& filename)
+        {
+            sr.add_file(filename);
+        }
 
-        // save merit function parameters to a file
-        bool save(bool is_final = false);
+        void learn();
 
-        // sfen reader
-        SfenReader& sr;
 
-        // Learning iteration counter
-        uint64_t epoch = 0;
+        std::string validation_set_file_name;
 
         // Mini batch size size. Be sure to set it on the side that uses this class.
         uint64_t mini_batch_size = LEARN_MINI_BATCH_SIZE;
 
-        std::atomic<bool> stop_flag;
-
         // Option to exclude early stage from learning
         int reduction_gameply;
 
@@ -677,342 +708,143 @@ namespace Learner
         // If true, do not dig the folder.
         bool save_only_once;
 
-        // --- loss calculation
-
-        // For calculation of learning data loss
-        atomic<double> learn_sum_cross_entropy_eval;
-        atomic<double> learn_sum_cross_entropy_win;
-        atomic<double> learn_sum_cross_entropy;
-        atomic<double> learn_sum_entropy_eval;
-        atomic<double> learn_sum_entropy_win;
-        atomic<double> learn_sum_entropy;
-
-        shared_timed_mutex nn_mutex;
         double newbob_decay;
         int newbob_num_trials;
         uint64_t auto_lr_drop;
-        uint64_t last_lr_drop;
-        double best_loss;
-        double latest_loss_sum;
-        uint64_t latest_loss_count;
+
         std::string best_nn_directory;
 
         uint64_t eval_save_interval;
         uint64_t loss_output_interval;
 
-        // Loss calculation.
-        // done: Number of phases targeted this time
-        void calc_loss(size_t thread_id, uint64_t done);
+    private:
+        void learn_worker(Thread& th, std::atomic<uint64_t>& counter, uint64_t limit);
 
-        // Define the loss calculation in ↑ as a task and execute it
-        TaskDispatcher task_dispatcher;
+        void update_weights(const PSVector& psv);
+
+        void calc_loss(const PSVector& psv);
+
+        void calc_loss_worker(
+            Thread& th,
+            std::atomic<uint64_t>& counter,
+            const PSVector& psv,
+            AtomicLoss& test_loss_sum,
+            atomic<double>& sum_norm,
+            atomic<int>& move_accord_count
+        );
+
+        Value get_shallow_value(Position& pos);
+
+        // save merit function parameters to a file
+        bool save(bool is_final = false);
+
+        PRNG prng;
+
+        // sfen reader
+        SfenReader sr;
+
+        uint64_t save_count;
+        uint64_t loss_output_count;
+
+        // Learning iteration counter
+        uint64_t epoch = 0;
+
+        std::atomic<bool> stop_flag;
+
+        uint64_t total_done;
+
+        uint64_t last_lr_drop;
+        double best_loss;
+        double latest_loss_sum;
+        uint64_t latest_loss_count;
+
+        // For calculation of learning data loss
+        AtomicLoss learn_loss_sum;
     };
 
-    Value LearnerThink::get_shallow_value(Position& task_pos)
+    void LearnerThink::learn()
     {
-        // Evaluation value for shallow search
-        // The value of evaluate() may be used, but when calculating loss, learn_cross_entropy and
-        // Use qsearch() because it is difficult to compare the values.
-        // EvalHash has been disabled in advance. (If not, the same value will be returned every time)
-        const auto [_, pv] = Search::qsearch(task_pos);
 
-        const auto rootColor = task_pos.side_to_move();
-
-        std::vector<StateInfo, AlignedAllocator<StateInfo>> states(pv.size());
-        for (size_t i = 0; i < pv.size(); ++i)
-        {
-            task_pos.do_move(pv[i], states[i]);
-        }
-
-        const Value shallow_value =
-            (rootColor == task_pos.side_to_move())
-            ? Eval::evaluate(task_pos)
-            : -Eval::evaluate(task_pos);
-
-        for (auto it = pv.rbegin(); it != pv.rend(); ++it)
-            task_pos.undo_move(*it);
-
-        return shallow_value;
-    }
-
-    void LearnerThink::calc_loss(size_t thread_id, uint64_t done)
-    {
-        // There is no point in hitting the replacement table,
-        // so at this timing the generation of the replacement table is updated.
-        // It doesn't matter if you have disabled the substitution table.
-        TT.new_search();
-        TimePoint elapsed = now() - Search::Limits.startTime + 1;
-
-        cout << "PROGRESS: " << now_string() << ", ";
-        cout << sr.total_done << " sfens, ";
-        cout << sr.total_done * 1000 / elapsed  << " sfens/second";
-        cout << ", iteration " << epoch;
-        cout << ", learning rate = " << global_learning_rate << ", ";
-
-        // For calculation of verification data loss
-        atomic<double> test_sum_cross_entropy_eval, test_sum_cross_entropy_win, test_sum_cross_entropy;
-        atomic<double> test_sum_entropy_eval, test_sum_entropy_win, test_sum_entropy;
-        test_sum_cross_entropy_eval = 0;
-        test_sum_cross_entropy_win = 0;
-        test_sum_cross_entropy = 0;
-        test_sum_entropy_eval = 0;
-        test_sum_entropy_win = 0;
-        test_sum_entropy = 0;
-
-        // norm for learning
-        atomic<double> sum_norm;
-        sum_norm = 0;
-
-        // The number of times the pv first move of deep
-        // search matches the pv first move of search(1).
-        atomic<int> move_accord_count;
-        move_accord_count = 0;
-
-        auto th = Threads[thread_id];
-        auto& pos = th->rootPos;
-        StateInfo si;
-        pos.set(StartFEN, false, &si, th);
-        cout << "startpos eval = " << Eval::evaluate(pos) << endl;
-
-        // It's better to parallelize here, but it's a bit
-        // troublesome because the search before slave has not finished.
-        // I created a mechanism to call task, so I will use it.
-
-        // The number of tasks to do.
-        atomic<int> task_count;
-        task_count = (int)sr.sfen_for_mse.size();
-        task_dispatcher.task_reserve(task_count);
-
-        // Create a task to search for the situation and give it to each thread.
-        for (const auto& ps : sr.sfen_for_mse)
-        {
-            // Assign work to each thread using TaskDispatcher.
-            // A task definition for that.
-            // It is not possible to capture pos used in ↑,
-            // so specify the variables you want to capture one by one.
-            auto task =
-                [
-                    this,
-                    &ps,
-                    &test_sum_cross_entropy_eval,
-                    &test_sum_cross_entropy_win,
-                    &test_sum_cross_entropy,
-                    &test_sum_entropy_eval,
-                    &test_sum_entropy_win,
-                    &test_sum_entropy,
-                    &sum_norm,
-                    &task_count,
-                    &move_accord_count
-                ](size_t task_thread_id)
-            {
-                auto task_th = Threads[task_thread_id];
-                auto& task_pos = task_th->rootPos;
-                StateInfo task_si;
-                if (task_pos.set_from_packed_sfen(ps.sfen, &task_si, task_th) != 0)
-                {
-                    // Unfortunately, as an sfen for rmse calculation, an invalid sfen was drawn.
-                    cout << "Error! : illegal packed sfen " << task_pos.fen() << endl;
-                }
-
-                const Value shallow_value = get_shallow_value(task_pos);
-
-                // Evaluation value of deep search
-                auto deep_value = (Value)ps.score;
-
-                // Note) This code does not consider when
-                //       eval_limit is specified in the learn command.
-
-                // --- calculation of cross entropy
-
-                // For the time being, regarding the win rate and loss terms only in the elmo method
-                // Calculate and display the cross entropy.
-
-                double test_cross_entropy_eval, test_cross_entropy_win, test_cross_entropy;
-                double test_entropy_eval, test_entropy_win, test_entropy;
-                calc_cross_entropy(
-                    deep_value,
-                    shallow_value,
-                    ps,
-                    test_cross_entropy_eval,
-                    test_cross_entropy_win,
-                    test_cross_entropy,
-                    test_entropy_eval,
-                    test_entropy_win,
-                    test_entropy);
-
-                // The total cross entropy need not be abs() by definition.
-                test_sum_cross_entropy_eval += test_cross_entropy_eval;
-                test_sum_cross_entropy_win += test_cross_entropy_win;
-                test_sum_cross_entropy += test_cross_entropy;
-                test_sum_entropy_eval += test_entropy_eval;
-                test_sum_entropy_win += test_entropy_win;
-                test_sum_entropy += test_entropy;
-                sum_norm += (double)abs(shallow_value);
-
-                // Determine if the teacher's move and the score of the shallow search match
-                {
-                    const auto [value, pv] = Search::search(task_pos, 1);
-                    if ((uint16_t)pv[0] == ps.move)
-                        move_accord_count.fetch_add(1, std::memory_order_relaxed);
-                }
-
-                // Reduced one task because I did it
-                --task_count;
-            };
-
-            // Throw the defined task to slave.
-            task_dispatcher.push_task_async(task);
-        }
-
-        // join yourself as a slave
-        task_dispatcher.on_idle(thread_id);
-
-        // wait for all tasks to complete
-        while (task_count)
-            sleep(1);
-
-        latest_loss_sum += test_sum_cross_entropy - test_sum_entropy;
-        latest_loss_count += sr.sfen_for_mse.size();
-
-        // learn_cross_entropy may be called train cross
-        // entropy in the world of machine learning,
-        // When omitting the acronym, it is nice to be able to
-        // distinguish it from test cross entropy(tce) by writing it as lce.
-
-        if (sr.sfen_for_mse.size() && done)
-        {
-            cout << "INFO: "
-                << "test_cross_entropy_eval = " << test_sum_cross_entropy_eval / sr.sfen_for_mse.size()
-                << " , test_cross_entropy_win = " << test_sum_cross_entropy_win / sr.sfen_for_mse.size()
-                << " , test_entropy_eval = " << test_sum_entropy_eval / sr.sfen_for_mse.size()
-                << " , test_entropy_win = " << test_sum_entropy_win / sr.sfen_for_mse.size()
-                << " , test_cross_entropy = " << test_sum_cross_entropy / sr.sfen_for_mse.size()
-                << " , test_entropy = " << test_sum_entropy / sr.sfen_for_mse.size()
-                << " , norm = " << sum_norm
-                << " , move accuracy = " << (move_accord_count * 100.0 / sr.sfen_for_mse.size()) << "%"
-                << endl;
-
-            if (done != static_cast<uint64_t>(-1))
-            {
-                cout << "INFO: "
-                    << "learn_cross_entropy_eval = " << learn_sum_cross_entropy_eval / done
-                    << " , learn_cross_entropy_win = " << learn_sum_cross_entropy_win / done
-                    << " , learn_entropy_eval = " << learn_sum_entropy_eval / done
-                    << " , learn_entropy_win = " << learn_sum_entropy_win / done
-                    << " , learn_cross_entropy = " << learn_sum_cross_entropy / done
-                    << " , learn_entropy = " << learn_sum_entropy / done
-                    << endl;
-            }
-        }
-        else
-        {
-            cout << "Error! : sr.sfen_for_mse.size() = " << sr.sfen_for_mse.size() << " ,  done = " << done << endl;
-        }
-
-        // Clear 0 for next time.
-        learn_sum_cross_entropy_eval = 0.0;
-        learn_sum_cross_entropy_win = 0.0;
-        learn_sum_cross_entropy = 0.0;
-        learn_sum_entropy_eval = 0.0;
-        learn_sum_entropy_win = 0.0;
-        learn_sum_entropy = 0.0;
-    }
-
-    void LearnerThink::thread_worker(size_t thread_id)
-    {
 #if defined(_OPENMP)
         omp_set_num_threads((int)Options["Threads"]);
 #endif
 
-        auto th = Threads[thread_id];
-        auto& pos = th->rootPos;
+        Eval::NNUE::verify_any_net_loaded();
 
-        while (true)
+        // Start a thread that loads the training data in the background
+        sr.start_file_read_worker();
+
+        const PSVector sfen_for_mse =
+            validation_set_file_name.empty()
+            ? sr.read_for_mse(sfen_for_mse_size)
+            : sr.read_validation_set(validation_set_file_name, eval_limit);
+
+        if (validation_set_file_name.empty()
+            && sfen_for_mse.size() != sfen_for_mse_size)
         {
-            // display mse (this is sometimes done only for thread 0)
-            // Immediately after being read from the file...
+            cout
+                << "Error reading sfen_for_mse. Read " << sfen_for_mse.size()
+                << " out of " << sfen_for_mse_size << '\n';
 
-            // Lock the evaluation function so that it is not used during updating.
-            shared_lock<shared_timed_mutex> read_lock(nn_mutex, defer_lock);
-            if (sr.next_update_weights <= sr.total_done ||
-                (thread_id != 0 && !read_lock.try_lock()))
-            {
-                if (thread_id != 0)
-                {
-                    // Wait except thread_id == 0.
+            sr.stop();
 
-                    if (stop_flag)
-                        break;
+            return;
+        }
 
-                    // I want to parallelize rmse calculation etc., so if task() is loaded, process it.
-                    task_dispatcher.on_idle(thread_id);
-                    continue;
-                }
-                else
-                {
-                    // Only thread_id == 0 performs the following update process.
+        if (newbob_decay != 1.0) {
 
-                    // The weight array is not updated for the first time.
-                    if (sr.next_update_weights == 0)
-                    {
-                        sr.next_update_weights += mini_batch_size;
-                        continue;
-                    }
+            calc_loss(sfen_for_mse);
 
-                    {
-                        // update parameters
+            best_loss = latest_loss_sum / latest_loss_count;
+            latest_loss_sum = 0.0;
+            latest_loss_count = 0;
 
-                        // Lock the evaluation function so that it is not used during updating.
-                        lock_guard<shared_timed_mutex> write_lock(nn_mutex);
-                        Eval::NNUE::update_parameters();
-                    }
+            cout << "initial loss: " << best_loss << endl;
+        }
 
-                    ++epoch;
+        stop_flag = false;
 
-                    // However, the elapsed time during update_weights() and calc_rmse() is ignored.
-                    if (++sr.save_count * mini_batch_size >= eval_save_interval)
-                    {
-                        sr.save_count = 0;
+        for(;;)
+        {
+            std::atomic<uint64_t> counter{0};
 
-                        // During this time, as the gradient calculation proceeds,
-                        // the value becomes too large and I feel annoyed, so stop other threads.
-                        const bool converged = save();
-                        if (converged)
-                        {
-                            stop_flag = true;
-                            sr.stop_flag = true;
-                            break;
-                        }
-                    }
+            Threads.execute_with_workers([this, &counter](auto& th){
+                learn_worker(th, counter, mini_batch_size);
+            });
 
-                    // Calculate rmse. This is done for samples of 10,000 phases.
-                    // If you do with 40 cores, update_weights every 1 million phases
-                    static uint64_t loss_output_count = 0;
-                    if (++loss_output_count * mini_batch_size >= loss_output_interval)
-                    {
-                        loss_output_count = 0;
+            total_done += mini_batch_size;
 
-                        // Number of cases processed this time
-                        uint64_t done = sr.total_done - sr.last_done;
+            Threads.wait_for_workers_finished();
 
-                        // loss calculation
-                        calc_loss(thread_id, done);
+            if (stop_flag)
+                break;
 
-                        Eval::NNUE::check_health();
+            update_weights(sfen_for_mse);
 
-                        // Make a note of how far you have totaled.
-                        sr.last_done = sr.total_done;
-                    }
+            if (stop_flag)
+                break;
+        }
 
-                    // Next time, I want you to do this series of
-                    // processing again when you process only mini_batch_size.
-                    sr.next_update_weights += mini_batch_size;
+        sr.stop();
 
-                    // Since I was waiting for the update of this
-                    // sr.next_update_weights except the main thread,
-                    // Once this value is updated, it will start moving again.
-                }
-            }
+        Eval::NNUE::finalize_net();
+
+        save(true);
+    }
+
+    void LearnerThink::learn_worker(Thread& th, std::atomic<uint64_t>& counter, uint64_t limit)
+    {
+        const auto thread_id = th.thread_idx();
+        auto& pos = th.rootPos;
+
+        Loss local_loss_sum{};
+        std::vector<StateInfo, AlignedAllocator<StateInfo>> state(MAX_PLY);
+
+        while(!stop_flag)
+        {
+            const auto iter = counter.fetch_add(1);
+            if (iter >= limit)
+                break;
 
             PackedSfenValue ps;
 
@@ -1020,16 +852,12 @@ namespace Learner
 
             if (!sr.read_to_thread_buffer(thread_id, ps))
             {
-                // ran out of thread pool for my thread.
-                // Because there are almost no phases left,
-                // Terminate all other threads.
-
+                // If we ran out of data we stop completely
+                // because there's nothing left to do.
                 stop_flag = true;
                 break;
             }
 
-            // The evaluation value exceeds the learning target value.
-            // Ignore this aspect information.
             if (eval_limit < abs(ps.score))
                 goto RETRY_READ;
 
@@ -1041,123 +869,242 @@ namespace Learner
                 goto RETRY_READ;
 
             StateInfo si;
-            if (pos.set_from_packed_sfen(ps.sfen, &si, th) != 0)
+            if (pos.set_from_packed_sfen(ps.sfen, &si, &th) != 0)
             {
-                // I got a strange sfen. Should be debugged!
-                // Since it is an illegal sfen, it may not be
-                // displayed with pos.sfen(), but it is better than not.
+                // Malformed sfen
                 cout << "Error! : illigal packed sfen = " << pos.fen() << endl;
                 goto RETRY_READ;
             }
 
-            // I can read it, so try displaying it.
-            //      cout << pos << value << endl;
-
             const auto rootColor = pos.side_to_move();
 
-            int ply = 0;
-            StateInfo state[MAX_PLY]; // PV of qsearch cannot be so long.
-
-            if (!pos.pseudo_legal((Move)ps.move) || !pos.legal((Move)ps.move))
-            {
-                goto RETRY_READ;
-            }
-
-            pos.do_move((Move)ps.move, state[ply++]);
-
-            // There is a possibility that all the pieces are blocked and stuck.
-            // Also, the declaration win phase is excluded from
-            // learning because you cannot go to leaf with PV moves.
-            // (shouldn't write out such teacher aspect itself,
-            // but may have written it out with an old generation routine)
-            // Skip the position if there are no legal moves (=checkmated or stalemate).
-            if (MoveList<LEGAL>(pos).size() == 0)
-                goto RETRY_READ;
-
-            // Evaluation value of shallow search (qsearch)
-            const auto [_, pv] = Search::qsearch(pos);
-
-            // Evaluation value of deep search
-            const auto deep_value = (Value)ps.score;
-
-            // I feel that the mini batch has a better gradient.
-            // Go to the leaf node as it is, add only to the gradient array,
-            // and later try AdaGrad at the time of rmse aggregation.
-
-
-            // If the initial PV is different, it is better not to use it for learning.
-            // If it is the result of searching a completely different place, it may become noise.
-            // It may be better not to study where the difference in evaluation values ​​is too large.
-
-
-            // A helper function that adds the gradient to the current phase.
+            // A function that adds the current `pos` and `ps`
+            // to the training set.
             auto pos_add_grad = [&]() {
-                // Use the value of evaluate in leaf as shallow_value.
-                // Using the return value of qsearch() as shallow_value,
-                // If PV is interrupted in the middle, the phase where
-                // evaluate() is called to calculate the gradient,
-                // and I don't think this is a very desirable property,
-                // as the aspect that gives that gradient will be different.
-                // I have turned off the substitution table, but since
-                // the pv array has not been updated due to one stumbling block etc...
+
+                // Evaluation value of deep search
+                const auto deep_value = (Value)ps.score;
 
                 const Value shallow_value =
                     (rootColor == pos.side_to_move())
                     ? Eval::evaluate(pos)
                     : -Eval::evaluate(pos);
 
-                // Calculate loss for training data
-                double learn_cross_entropy_eval, learn_cross_entropy_win, learn_cross_entropy;
-                double learn_entropy_eval, learn_entropy_win, learn_entropy;
-                calc_cross_entropy(
+                const auto loss = calc_cross_entropy(
                     deep_value,
                     shallow_value,
-                    ps,
-                    learn_cross_entropy_eval,
-                    learn_cross_entropy_win,
-                    learn_cross_entropy,
-                    learn_entropy_eval,
-                    learn_entropy_win,
-                    learn_entropy);
+                    ps);
 
-                learn_sum_cross_entropy_eval += learn_cross_entropy_eval;
-                learn_sum_cross_entropy_win += learn_cross_entropy_win;
-                learn_sum_cross_entropy += learn_cross_entropy;
-                learn_sum_entropy_eval += learn_entropy_eval;
-                learn_sum_entropy_win += learn_entropy_win;
-                learn_sum_entropy += learn_entropy;
+                local_loss_sum += loss;
 
                 Eval::NNUE::add_example(pos, rootColor, ps, 1.0);
-
-                // Since the processing is completed, the counter of the processed number is incremented
-                sr.total_done++;
             };
 
-            bool illegal_move = false;
-            for (auto m : pv)
-            {
-                // I shouldn't be an illegal player.
-                // An illegal move sometimes comes here...
-                if (!pos.pseudo_legal(m) || !pos.legal(m))
-                {
-                    //cout << pos << m << endl;
-                    //assert(false);
-                    illegal_move = true;
-                    break;
-                }
-
-                pos.do_move(m, state[ply++]);
-            }
-
-            if (illegal_move)
+            if (!pos.pseudo_legal((Move)ps.move) || !pos.legal((Move)ps.move))
             {
                 goto RETRY_READ;
             }
 
+            int ply = 0;
+            pos.do_move((Move)ps.move, state[ply++]);
+
+            // We want to position being trained on not to be terminal
+            if (MoveList<LEGAL>(pos).size() == 0)
+                goto RETRY_READ;
+
+            // Evaluation value of shallow search (qsearch)
+            const auto [_, pv] = Search::qsearch(pos);
+
+            for (auto m : pv)
+            {
+                pos.do_move(m, state[ply++]);
+            }
+
             // Since we have reached the end phase of PV, add the slope here.
             pos_add_grad();
         }
 
+        learn_loss_sum += local_loss_sum;
+    }
+
+    void LearnerThink::update_weights(const PSVector& psv)
+    {
+        // I'm not sure this fencing is correct. But either way there
+        // should be no real issues happening since
+        // the read/write phases are isolated.
+        atomic_thread_fence(memory_order_seq_cst);
+        Eval::NNUE::update_parameters();
+        atomic_thread_fence(memory_order_seq_cst);
+
+        ++epoch;
+
+        if (++save_count * mini_batch_size >= eval_save_interval)
+        {
+            save_count = 0;
+
+            const bool converged = save();
+            if (converged)
+            {
+                stop_flag = true;
+                return;
+            }
+        }
+
+        if (++loss_output_count * mini_batch_size >= loss_output_interval)
+        {
+            loss_output_count = 0;
+
+            // loss calculation
+            calc_loss(psv);
+
+            Eval::NNUE::check_health();
+        }
+    }
+
+    void LearnerThink::calc_loss(const PSVector& psv)
+    {
+        TT.new_search();
+        TimePoint elapsed = now() - Search::Limits.startTime + 1;
+
+        cout << "PROGRESS: " << now_string() << ", ";
+        cout << total_done << " sfens, ";
+        cout << total_done * 1000 / elapsed  << " sfens/second";
+        cout << ", iteration " << epoch;
+        cout << ", learning rate = " << global_learning_rate << ", ";
+
+        // For calculation of verification data loss
+        AtomicLoss test_loss_sum{};
+
+        // norm for learning
+        atomic<double> sum_norm{0.0};
+
+        // The number of times the pv first move of deep
+        // search matches the pv first move of search(1).
+        atomic<int> move_accord_count{0};
+
+        auto mainThread = Threads.main();
+        mainThread->execute_with_worker([](auto& th){
+            auto& pos = th.rootPos;
+            StateInfo si;
+            pos.set(StartFEN, false, &si, &th);
+            cout << "startpos eval = " << Eval::evaluate(pos) << endl;
+        });
+        mainThread->wait_for_worker_finished();
+
+        // The number of tasks to do.
+        atomic<uint64_t> counter{0};
+        Threads.execute_with_workers([&](auto& th){
+            calc_loss_worker(
+                th,
+                counter,
+                psv,
+                test_loss_sum,
+                sum_norm,
+                move_accord_count
+            );
+        });
+        Threads.wait_for_workers_finished();
+
+        latest_loss_sum += test_loss_sum.cross_entropy - test_loss_sum.entropy;
+        latest_loss_count += psv.size();
+
+        if (psv.size() && test_loss_sum.count > 0.0)
+        {
+            cout << "INFO: norm = " << sum_norm
+                << " , move accuracy = " << (move_accord_count * 100.0 / psv.size()) << "%"
+                << endl;
+
+            test_loss_sum.print("test", cout);
+
+            if (learn_loss_sum.count > 0.0)
+            {
+                learn_loss_sum.print("learn", cout);
+            }
+        }
+        else
+        {
+            cout << "Error! : psv.size() = " << psv.size() << " ,  done = " << test_loss_sum.count << endl;
+        }
+
+        learn_loss_sum.reset();
+    }
+
+    void LearnerThink::calc_loss_worker(
+        Thread& th,
+        std::atomic<uint64_t>& counter,
+        const PSVector& psv,
+        AtomicLoss& test_loss_sum,
+        atomic<double>& sum_norm,
+        atomic<int>& move_accord_count
+    )
+    {
+        Loss local_loss_sum{};
+        auto& pos = th.rootPos;
+
+        for(;;)
+        {
+            const auto task_id = counter.fetch_add(1);
+            if (task_id >= psv.size())
+            {
+                break;
+            }
+
+            const auto& ps = psv[task_id];
+
+            StateInfo si;
+            if (pos.set_from_packed_sfen(ps.sfen, &si, &th) != 0)
+            {
+                cout << "Error! : illegal packed sfen " << pos.fen() << endl;
+                continue;
+            }
+
+            const Value shallow_value = get_shallow_value(pos);
+
+            // Evaluation value of deep search
+            const auto deep_value = (Value)ps.score;
+
+            const auto loss = calc_cross_entropy(
+                deep_value,
+                shallow_value,
+                ps);
+
+            local_loss_sum += loss;
+            sum_norm += (double)abs(shallow_value);
+
+            // Determine if the teacher's move and the score of the shallow search match
+            const auto [value, pv] = Search::search(pos, 1);
+            if (pv.size() > 0 && (uint16_t)pv[0] == ps.move)
+                move_accord_count.fetch_add(1, std::memory_order_relaxed);
+        }
+
+        test_loss_sum += local_loss_sum;
+    }
+
+    Value LearnerThink::get_shallow_value(Position& pos)
+    {
+        // Evaluation value for shallow search
+        // The value of evaluate() may be used, but when calculating loss, learn_cross_entropy and
+        // Use qsearch() because it is difficult to compare the values.
+        // EvalHash has been disabled in advance. (If not, the same value will be returned every time)
+        const auto [_, pv] = Search::qsearch(pos);
+
+        const auto rootColor = pos.side_to_move();
+
+        std::vector<StateInfo, AlignedAllocator<StateInfo>> states(pv.size());
+        for (size_t i = 0; i < pv.size(); ++i)
+        {
+            pos.do_move(pv[i], states[i]);
+        }
+
+        const Value shallow_value =
+            (rootColor == pos.side_to_move())
+            ? Eval::evaluate(pos)
+            : -Eval::evaluate(pos);
+
+        for (auto it = pv.rbegin(); it != pv.rend(); ++it)
+            pos.undo_move(*it);
+
+        return shallow_value;
     }
 
     // Write evaluation function file.
@@ -1189,7 +1136,7 @@ namespace Learner
                 latest_loss_sum = 0.0;
                 latest_loss_count = 0;
                 cout << "loss: " << latest_loss;
-                auto tot = sr.total_done.load();
+                auto tot = total_done;
                 if (auto_lr_drop)
                 {
                     cout << " < best (" << best_loss << "), accepted" << endl;
@@ -1681,6 +1628,7 @@ namespace Learner
             else if (option == "seed") is >> seed;
             else if (option == "set_recommended_uci_options")
             {
+                UCI::setoption("Use NNUE", "pure");
                 UCI::setoption("MultiPV", "1");
                 UCI::setoption("Contempt", "0");
                 UCI::setoption("Skill Level", "20");
@@ -1707,8 +1655,7 @@ namespace Learner
         cout << "Warning! OpenMP disabled." << endl;
 #endif
 
-        SfenReader sr(thread_num, seed);
-        LearnerThink learn_think(sr, seed);
+        LearnerThink learn_think(thread_num, seed);
 
         // Display learning game file
         if (target_dir != "")
@@ -1807,17 +1754,6 @@ namespace Learner
         cout << "save_only_once    : " << (save_only_once ? "true" : "false") << endl;
         cout << "no_shuffle        : " << (no_shuffle ? "true" : "false") << endl;
 
-        // Insert the file name for the number of loops.
-        for (int i = 0; i < loop; ++i)
-        {
-            // sfen reader, I'll read it in reverse
-            // order so I'll reverse it here. I'm sorry.
-            for (auto it = filenames.rbegin(); it != filenames.rend(); ++it)
-            {
-                sr.filenames.push_back(Path::combine(base_dir, *it));
-            }
-        }
-
         cout << "Loss Function     : " << LOSS_FUNCTION << endl;
         cout << "mini-batch size   : " << mini_batch_size << endl;
 
@@ -1876,7 +1812,7 @@ namespace Learner
         // Reflect other option settings.
         learn_think.eval_limit = eval_limit;
         learn_think.save_only_once = save_only_once;
-        learn_think.sr.no_shuffle = no_shuffle;
+        learn_think.set_do_shuffle(!no_shuffle);
         learn_think.reduction_gameply = reduction_gameply;
 
         learn_think.newbob_decay = newbob_decay;
@@ -1886,49 +1822,20 @@ namespace Learner
         learn_think.eval_save_interval = eval_save_interval;
         learn_think.loss_output_interval = loss_output_interval;
 
-        // Start a thread that loads the phase file in the background
-        // (If this is not started, mse cannot be calculated.)
-        learn_think.start_file_read_worker();
-
         learn_think.mini_batch_size = mini_batch_size;
+        learn_think.validation_set_file_name = validation_set_file_name;
 
-        if (validation_set_file_name.empty())
+        // Insert the file name for the number of loops.
+        for (int i = 0; i < loop; ++i)
         {
-            // Get about 10,000 data for mse calculation.
-            sr.read_for_mse();
+            for(auto& file : filenames)
+            {
+                learn_think.add_file(Path::combine(base_dir, file));
+            }
         }
-        else
-        {
-            sr.read_validation_set(validation_set_file_name, eval_limit);
-        }
-
-        cout << "Forcing Use NNUE pure.\n";
-        UCI::setoption("Use NNUE", "pure");
-
-        Eval::NNUE::verify_any_net_loaded();
-
-        // Calculate rmse once at this point (timing of 0 sfen)
-        // sr.calc_rmse();
-
-        if (newbob_decay != 1.0) {
-            learn_think.calc_loss(0, -1);
-            learn_think.best_loss = learn_think.latest_loss_sum / learn_think.latest_loss_count;
-            learn_think.latest_loss_sum = 0.0;
-            learn_think.latest_loss_count = 0;
-            cout << "initial loss: " << learn_think.best_loss << endl;
-        }
-
-        // -----------------------------------
-        // start learning evaluation function parameters
-        // -----------------------------------
 
         // Start learning.
-        learn_think.go_think();
-
-        Eval::NNUE::finalize_net();
-
-        // Save once at the end.
-        learn_think.save(true);
+        learn_think.learn();
     }
 
 } // namespace Learner
diff --git a/src/misc.h b/src/misc.h
index 320eea76..dca959cd 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -128,6 +128,18 @@ public:
 
   void set_seed(uint64_t seed) { s = seed; }
 
+  uint64_t next_random_seed()
+  {
+    uint64_t seed = 0;
+    for(int i = 0; i < 64; ++i)
+    {
+      const auto off = rand64() % 64;
+      seed |= (rand64() & (uint64_t(1) << off)) >> off;
+      seed <<= 1;
+    }
+    return seed;
+  }
+
   void set_seed_from_time()
   {
       set_seed(std::chrono::system_clock::now().time_since_epoch().count());

From 8f3e64a6d5d48b5d94c7e4083914ab4c5d5b3aa0 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 18 Oct 2020 13:36:32 +0200
Subject: [PATCH 354/583] move sfen reader to separate file

---
 src/learn/learn.cpp     | 311 +-------------------------------------
 src/learn/sfen_reader.h | 326 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 329 insertions(+), 308 deletions(-)
 create mode 100644 src/learn/sfen_reader.h

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 411e0016..af53791c 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -20,7 +20,7 @@
 #include "learn.h"
 
 #include "convert.h"
-#include "sfen_stream.h"
+#include "sfen_reader.h"
 
 #include "misc.h"
 #include "position.h"
@@ -51,6 +51,7 @@
 #include <shared_mutex>
 #include <sstream>
 #include <unordered_set>
+#include <iostream>
 
 #if defined (_OPENMP)
 #include <omp.h>
@@ -346,312 +347,6 @@ namespace Learner
         return calc_grad((Value)psv.score, shallow, psv);
     }
 
-    // Sfen reader
-    struct SfenReader
-    {
-        // Number of phases buffered by each thread 0.1M phases. 4M phase at 40HT
-        static constexpr size_t THREAD_BUFFER_SIZE = 10 * 1000;
-
-        // Buffer for reading files (If this is made larger,
-        // the shuffle becomes larger and the phases may vary.
-        // If it is too large, the memory consumption will increase.
-        // SFEN_READ_SIZE is a multiple of THREAD_BUFFER_SIZE.
-        static constexpr const size_t SFEN_READ_SIZE = LEARN_SFEN_READ_SIZE;
-
-        // Do not use std::random_device().
-        // Because it always the same integers on MinGW.
-        SfenReader(int thread_num, const std::string& seed) :
-            prng(seed)
-        {
-            packed_sfens.resize(thread_num);
-            total_read = 0;
-            end_of_files = false;
-            shuffle = true;
-            stop_flag = false;
-        }
-
-        ~SfenReader()
-        {
-            if (file_worker_thread.joinable())
-                file_worker_thread.join();
-        }
-
-        // Load the phase for calculation such as mse.
-        PSVector read_for_mse(uint64_t count)
-        {
-            PSVector sfen_for_mse;
-            sfen_for_mse.reserve(count);
-
-            for (uint64_t i = 0; i < count; ++i)
-            {
-                PackedSfenValue ps;
-                if (!read_to_thread_buffer(0, ps))
-                {
-                    cout << "Error! read packed sfen , failed." << endl;
-                    return sfen_for_mse;
-                }
-
-                sfen_for_mse.push_back(ps);
-            }
-
-            return sfen_for_mse;
-        }
-
-        PSVector read_validation_set(const string& file_name, int eval_limit)
-        {
-            PSVector sfen_for_mse;
-
-            auto input = open_sfen_input_file(file_name);
-
-            while(!input->eof())
-            {
-                std::optional<PackedSfenValue> p_opt = input->next();
-                if (p_opt.has_value())
-                {
-                    auto& p = *p_opt;
-
-                    if (eval_limit < abs(p.score))
-                        continue;
-
-                    if (!use_draw_games_in_validation && p.game_result == 0)
-                        continue;
-
-                    sfen_for_mse.push_back(p);
-                }
-                else
-                {
-                    break;
-                }
-            }
-
-            return sfen_for_mse;
-        }
-
-        // [ASYNC] Thread returns one aspect. Otherwise returns false.
-        bool read_to_thread_buffer(size_t thread_id, PackedSfenValue& ps)
-        {
-            // If there are any positions left in the thread buffer
-            // then retrieve one and return it.
-            auto& thread_ps = packed_sfens[thread_id];
-
-            // Fill the read buffer if there is no remaining buffer,
-            // but if it doesn't even exist, finish.
-            // If the buffer is empty, fill it.
-            if ((thread_ps == nullptr || thread_ps->empty())
-                && !read_to_thread_buffer_impl(thread_id))
-                return false;
-
-            // read_to_thread_buffer_impl() returned true,
-            // Since the filling of the thread buffer with the
-            // phase has been completed successfully
-            // thread_ps->rbegin() is alive.
-
-            ps = thread_ps->back();
-            thread_ps->pop_back();
-
-            // If you've run out of buffers, call delete yourself to free this buffer.
-            if (thread_ps->empty())
-            {
-                thread_ps.reset();
-            }
-
-            return true;
-        }
-
-        // [ASYNC] Read some aspects into thread buffer.
-        bool read_to_thread_buffer_impl(size_t thread_id)
-        {
-            while (true)
-            {
-                {
-                    std::unique_lock<std::mutex> lk(mutex);
-                    // If you can fill from the file buffer, that's fine.
-                    if (packed_sfens_pool.size() != 0)
-                    {
-                        // It seems that filling is possible, so fill and finish.
-
-                        packed_sfens[thread_id] = std::move(packed_sfens_pool.front());
-                        packed_sfens_pool.pop_front();
-
-                        total_read += THREAD_BUFFER_SIZE;
-
-                        return true;
-                    }
-                }
-
-                // The file to read is already gone. No more use.
-                if (end_of_files)
-                    return false;
-
-                // Waiting for file worker to fill packed_sfens_pool.
-                // The mutex isn't locked, so it should fill up soon.
-                // Poor man's condition variable.
-                sleep(1);
-            }
-
-        }
-
-        // Start a thread that loads the phase file in the background.
-        void start_file_read_worker()
-        {
-            file_worker_thread = std::thread([&] {
-                this->file_read_worker();
-                });
-        }
-
-        void file_read_worker()
-        {
-            auto open_next_file = [&]() {
-                // no more
-                for(;;)
-                {
-                    sfen_input_stream.reset();
-
-                    if (filenames.empty())
-                        return false;
-
-                    // Get the next file name.
-                    string filename = filenames.front();
-                    filenames.pop_front();
-
-                    sfen_input_stream = open_sfen_input_file(filename);
-                    cout << "open filename = " << filename << endl;
-
-                    // in case the file is empty or was deleted.
-                    if (!sfen_input_stream->eof())
-                        return true;
-                }
-            };
-
-            if (sfen_input_stream == nullptr && !open_next_file())
-            {
-                cout << "..end of files." << endl;
-                end_of_files = true;
-                return;
-            }
-
-            while (true)
-            {
-                // Wait for the buffer to run out.
-                // This size() is read only, so you don't need to lock it.
-                while (!stop_flag && packed_sfens_pool.size() >= SFEN_READ_SIZE / THREAD_BUFFER_SIZE)
-                    sleep(100);
-
-                if (stop_flag)
-                    return;
-
-                PSVector sfens;
-                sfens.reserve(SFEN_READ_SIZE);
-
-                // Read from the file into the file buffer.
-                while (sfens.size() < SFEN_READ_SIZE)
-                {
-                    std::optional<PackedSfenValue> p = sfen_input_stream->next();
-                    if (p.has_value())
-                    {
-                        sfens.push_back(*p);
-                    }
-                    else if(!open_next_file())
-                    {
-                        // There was no next file. Abort.
-                        cout << "..end of files." << endl;
-                        end_of_files = true;
-                        return;
-                    }
-                }
-
-                // Shuffle the read phase data.
-                if (shuffle)
-                {
-                    Algo::shuffle(sfens, prng);
-                }
-
-                // Divide this by THREAD_BUFFER_SIZE. There should be size pieces.
-                // SFEN_READ_SIZE shall be a multiple of THREAD_BUFFER_SIZE.
-                assert((SFEN_READ_SIZE % THREAD_BUFFER_SIZE) == 0);
-
-                auto size = size_t(SFEN_READ_SIZE / THREAD_BUFFER_SIZE);
-                std::vector<std::unique_ptr<PSVector>> buffers;
-                buffers.reserve(size);
-
-                for (size_t i = 0; i < size; ++i)
-                {
-                    // Delete this pointer on the receiving side.
-                    auto buf = std::make_unique<PSVector>();
-                    buf->resize(THREAD_BUFFER_SIZE);
-                    memcpy(
-                        buf->data(),
-                        &sfens[i * THREAD_BUFFER_SIZE],
-                        sizeof(PackedSfenValue) * THREAD_BUFFER_SIZE);
-
-                    buffers.emplace_back(std::move(buf));
-                }
-
-                {
-                    std::unique_lock<std::mutex> lk(mutex);
-
-                    // The mutex lock is required because the%
-                    // contents of packed_sfens_pool are changed.
-
-                    for (auto& buf : buffers)
-                        packed_sfens_pool.emplace_back(std::move(buf));
-                }
-            }
-        }
-
-        void stop()
-        {
-            stop_flag = true;
-        }
-
-        void set_do_shuffle(bool v)
-        {
-            shuffle = v;
-        }
-
-        void add_file(const std::string& filename)
-        {
-            filenames.push_back(filename);
-        }
-
-    protected:
-
-        // worker thread reading file in background
-        std::thread file_worker_thread;
-
-        // sfen files
-        deque<string> filenames;
-
-        std::atomic<bool> stop_flag;
-
-        // number of phases read (file to memory buffer)
-        atomic<uint64_t> total_read;
-
-        // Do not shuffle when reading the phase.
-        bool shuffle;
-
-        // Random number to shuffle when reading the phase
-        PRNG prng;
-
-        // Did you read the files and reached the end?
-        atomic<bool> end_of_files;
-
-        // handle of sfen file
-        std::unique_ptr<BasicSfenInputStream> sfen_input_stream;
-
-        // sfen for each thread
-        // (When the thread is used up, the thread should call delete to release it.)
-        std::vector<std::unique_ptr<PSVector>> packed_sfens;
-
-        // Mutex when accessing packed_sfens_pool
-        std::mutex mutex;
-
-        // pool of sfen. The worker thread read from the file is added here.
-        // Each worker thread fills its own packed_sfens[thread_id] from here.
-        // * Lock and access the mutex.
-        std::list<std::unique_ptr<PSVector>> packed_sfens_pool;
-    };
-
     // Class to generate sfen with multiple threads
     struct LearnerThink
     {
@@ -777,7 +472,7 @@ namespace Learner
         const PSVector sfen_for_mse =
             validation_set_file_name.empty()
             ? sr.read_for_mse(sfen_for_mse_size)
-            : sr.read_validation_set(validation_set_file_name, eval_limit);
+            : sr.read_validation_set(validation_set_file_name, eval_limit, use_draw_games_in_validation);
 
         if (validation_set_file_name.empty()
             && sfen_for_mse.size() != sfen_for_mse_size)
diff --git a/src/learn/sfen_reader.h b/src/learn/sfen_reader.h
new file mode 100644
index 00000000..2645bb6c
--- /dev/null
+++ b/src/learn/sfen_reader.h
@@ -0,0 +1,326 @@
+#include "sfen_stream.h"
+
+#include "packed_sfen.h"
+
+#include "misc.h"
+
+#include <string>
+#include <vector>
+#include <deque>
+#include <memory>
+#include <mutex>
+#include <list>
+#include <atomic>
+#include <optional>
+#include <iostream>
+#include <cstdint>
+#include <thread>
+
+namespace Learner{
+
+    // Sfen reader
+    struct SfenReader
+    {
+        // Number of phases buffered by each thread 0.1M phases. 4M phase at 40HT
+        static constexpr size_t THREAD_BUFFER_SIZE = 10 * 1000;
+
+        // Buffer for reading files (If this is made larger,
+        // the shuffle becomes larger and the phases may vary.
+        // If it is too large, the memory consumption will increase.
+        // SFEN_READ_SIZE is a multiple of THREAD_BUFFER_SIZE.
+        static constexpr const size_t SFEN_READ_SIZE = LEARN_SFEN_READ_SIZE;
+
+        // Do not use std::random_device().
+        // Because it always the same integers on MinGW.
+        SfenReader(int thread_num, const std::string& seed) :
+            prng(seed)
+        {
+            packed_sfens.resize(thread_num);
+            total_read = 0;
+            end_of_files = false;
+            shuffle = true;
+            stop_flag = false;
+        }
+
+        ~SfenReader()
+        {
+            if (file_worker_thread.joinable())
+                file_worker_thread.join();
+        }
+
+        // Load the phase for calculation such as mse.
+        PSVector read_for_mse(uint64_t count)
+        {
+            PSVector sfen_for_mse;
+            sfen_for_mse.reserve(count);
+
+            for (uint64_t i = 0; i < count; ++i)
+            {
+                PackedSfenValue ps;
+                if (!read_to_thread_buffer(0, ps))
+                {
+                    std::cout << "Error! read packed sfen , failed." << std::endl;
+                    return sfen_for_mse;
+                }
+
+                sfen_for_mse.push_back(ps);
+            }
+
+            return sfen_for_mse;
+        }
+
+        PSVector read_validation_set(const std::string& file_name, int eval_limit, bool use_draw_games)
+        {
+            PSVector sfen_for_mse;
+
+            auto input = open_sfen_input_file(file_name);
+
+            while(!input->eof())
+            {
+                std::optional<PackedSfenValue> p_opt = input->next();
+                if (p_opt.has_value())
+                {
+                    auto& p = *p_opt;
+
+                    if (eval_limit < abs(p.score))
+                        continue;
+
+                    if (!use_draw_games && p.game_result == 0)
+                        continue;
+
+                    sfen_for_mse.push_back(p);
+                }
+                else
+                {
+                    break;
+                }
+            }
+
+            return sfen_for_mse;
+        }
+
+        // [ASYNC] Thread returns one aspect. Otherwise returns false.
+        bool read_to_thread_buffer(size_t thread_id, PackedSfenValue& ps)
+        {
+            // If there are any positions left in the thread buffer
+            // then retrieve one and return it.
+            auto& thread_ps = packed_sfens[thread_id];
+
+            // Fill the read buffer if there is no remaining buffer,
+            // but if it doesn't even exist, finish.
+            // If the buffer is empty, fill it.
+            if ((thread_ps == nullptr || thread_ps->empty())
+                && !read_to_thread_buffer_impl(thread_id))
+                return false;
+
+            // read_to_thread_buffer_impl() returned true,
+            // Since the filling of the thread buffer with the
+            // phase has been completed successfully
+            // thread_ps->rbegin() is alive.
+
+            ps = thread_ps->back();
+            thread_ps->pop_back();
+
+            // If you've run out of buffers, call delete yourself to free this buffer.
+            if (thread_ps->empty())
+            {
+                thread_ps.reset();
+            }
+
+            return true;
+        }
+
+        // [ASYNC] Read some aspects into thread buffer.
+        bool read_to_thread_buffer_impl(size_t thread_id)
+        {
+            while (true)
+            {
+                {
+                    std::unique_lock<std::mutex> lk(mutex);
+                    // If you can fill from the file buffer, that's fine.
+                    if (packed_sfens_pool.size() != 0)
+                    {
+                        // It seems that filling is possible, so fill and finish.
+
+                        packed_sfens[thread_id] = std::move(packed_sfens_pool.front());
+                        packed_sfens_pool.pop_front();
+
+                        total_read += THREAD_BUFFER_SIZE;
+
+                        return true;
+                    }
+                }
+
+                // The file to read is already gone. No more use.
+                if (end_of_files)
+                    return false;
+
+                // Waiting for file worker to fill packed_sfens_pool.
+                // The mutex isn't locked, so it should fill up soon.
+                // Poor man's condition variable.
+                sleep(1);
+            }
+
+        }
+
+        // Start a thread that loads the phase file in the background.
+        void start_file_read_worker()
+        {
+            file_worker_thread = std::thread([&] {
+                this->file_read_worker();
+                });
+        }
+
+        void file_read_worker()
+        {
+            auto open_next_file = [&]() {
+                // no more
+                for(;;)
+                {
+                    sfen_input_stream.reset();
+
+                    if (filenames.empty())
+                        return false;
+
+                    // Get the next file name.
+                    std::string filename = filenames.front();
+                    filenames.pop_front();
+
+                    sfen_input_stream = open_sfen_input_file(filename);
+                    std::cout << "open filename = " << filename << std::endl;
+
+                    // in case the file is empty or was deleted.
+                    if (!sfen_input_stream->eof())
+                        return true;
+                }
+            };
+
+            if (sfen_input_stream == nullptr && !open_next_file())
+            {
+                std::cout << "..end of files." << std::endl;
+                end_of_files = true;
+                return;
+            }
+
+            while (true)
+            {
+                // Wait for the buffer to run out.
+                // This size() is read only, so you don't need to lock it.
+                while (!stop_flag && packed_sfens_pool.size() >= SFEN_READ_SIZE / THREAD_BUFFER_SIZE)
+                    sleep(100);
+
+                if (stop_flag)
+                    return;
+
+                PSVector sfens;
+                sfens.reserve(SFEN_READ_SIZE);
+
+                // Read from the file into the file buffer.
+                while (sfens.size() < SFEN_READ_SIZE)
+                {
+                    std::optional<PackedSfenValue> p = sfen_input_stream->next();
+                    if (p.has_value())
+                    {
+                        sfens.push_back(*p);
+                    }
+                    else if(!open_next_file())
+                    {
+                        // There was no next file. Abort.
+                        std::cout << "..end of files." << std::endl;
+                        end_of_files = true;
+                        return;
+                    }
+                }
+
+                // Shuffle the read phase data.
+                if (shuffle)
+                {
+                    Algo::shuffle(sfens, prng);
+                }
+
+                // Divide this by THREAD_BUFFER_SIZE. There should be size pieces.
+                // SFEN_READ_SIZE shall be a multiple of THREAD_BUFFER_SIZE.
+                assert((SFEN_READ_SIZE % THREAD_BUFFER_SIZE) == 0);
+
+                auto size = size_t(SFEN_READ_SIZE / THREAD_BUFFER_SIZE);
+                std::vector<std::unique_ptr<PSVector>> buffers;
+                buffers.reserve(size);
+
+                for (size_t i = 0; i < size; ++i)
+                {
+                    // Delete this pointer on the receiving side.
+                    auto buf = std::make_unique<PSVector>();
+                    buf->resize(THREAD_BUFFER_SIZE);
+                    memcpy(
+                        buf->data(),
+                        &sfens[i * THREAD_BUFFER_SIZE],
+                        sizeof(PackedSfenValue) * THREAD_BUFFER_SIZE);
+
+                    buffers.emplace_back(std::move(buf));
+                }
+
+                {
+                    std::unique_lock<std::mutex> lk(mutex);
+
+                    // The mutex lock is required because the%
+                    // contents of packed_sfens_pool are changed.
+
+                    for (auto& buf : buffers)
+                        packed_sfens_pool.emplace_back(std::move(buf));
+                }
+            }
+        }
+
+        void stop()
+        {
+            stop_flag = true;
+        }
+
+        void set_do_shuffle(bool v)
+        {
+            shuffle = v;
+        }
+
+        void add_file(const std::string& filename)
+        {
+            filenames.push_back(filename);
+        }
+
+    protected:
+
+        // worker thread reading file in background
+        std::thread file_worker_thread;
+
+        // sfen files
+        std::deque<std::string> filenames;
+
+        std::atomic<bool> stop_flag;
+
+        // number of phases read (file to memory buffer)
+        std::atomic<uint64_t> total_read;
+
+        // Do not shuffle when reading the phase.
+        bool shuffle;
+
+        // Random number to shuffle when reading the phase
+        PRNG prng;
+
+        // Did you read the files and reached the end?
+        std::atomic<bool> end_of_files;
+
+        // handle of sfen file
+        std::unique_ptr<BasicSfenInputStream> sfen_input_stream;
+
+        // sfen for each thread
+        // (When the thread is used up, the thread should call delete to release it.)
+        std::vector<std::unique_ptr<PSVector>> packed_sfens;
+
+        // Mutex when accessing packed_sfens_pool
+        std::mutex mutex;
+
+        // pool of sfen. The worker thread read from the file is added here.
+        // Each worker thread fills its own packed_sfens[thread_id] from here.
+        // * Lock and access the mutex.
+        std::list<std::unique_ptr<PSVector>> packed_sfens_pool;
+    };
+}

From 11b28ad3b5c455ab7db9b6c1276a23457079a453 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 18 Oct 2020 14:25:39 +0200
Subject: [PATCH 355/583] Don't treat unknown options in learn as file names.
 Add targetfile to specify individual files.

---
 src/learn/learn.cpp | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index af53791c..f6f4b3f4 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -1245,6 +1245,12 @@ namespace Learner
 
             // Specify the folder in which the game record is stored and make it the rooting target.
             else if (option == "targetdir") is >> target_dir;
+            else if (option == "targetfile")
+            {
+                std::string filename;
+                is >> filename;
+                filenames.push_back(filename);
+            }
 
             // Specify the number of loops
             else if (option == "loop")      is >> loop;
@@ -1333,9 +1339,10 @@ namespace Learner
                 UCI::setoption("PruneAtShallowDepth", "false");
                 UCI::setoption("EnableTranspositionTable", "false");
             }
-            // Otherwise, it's a filename.
             else
-                filenames.push_back(option);
+            {
+                cout << "Unknown option: " << option << ". Ignoring.\n";
+            }
         }
 
         if (loss_output_interval == 0)

From 886467e09f815fda97bbea9090b045bb6fb803f3 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 22 Oct 2020 11:16:41 +0200
Subject: [PATCH 356/583] Fix crash when trying to read a non existing .binpack
 file.

---
 src/extra/nnue_data_binpack_format.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index 826b2959..b9e45c3e 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -6141,6 +6141,11 @@ namespace binpack
 
         [[nodiscard]] bool hasNextChunk()
         {
+            if (!m_file)
+            {
+                return false;
+            }
+
             m_file.peek();
             return !m_file.eof();
         }

From af138d19379effc9862691639d0f7c4f393ae7ff Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 22 Oct 2020 11:20:31 +0200
Subject: [PATCH 357/583] Fix crashes when trying to open a file of unknown
 type. Increase robustness of error handling.

---
 src/learn/sfen_reader.h | 22 ++++++++++++++++++----
 src/learn/sfen_stream.h |  1 -
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/src/learn/sfen_reader.h b/src/learn/sfen_reader.h
index 2645bb6c..38c2532c 100644
--- a/src/learn/sfen_reader.h
+++ b/src/learn/sfen_reader.h
@@ -187,11 +187,25 @@ namespace Learner{
                     filenames.pop_front();
 
                     sfen_input_stream = open_sfen_input_file(filename);
-                    std::cout << "open filename = " << filename << std::endl;
 
-                    // in case the file is empty or was deleted.
-                    if (!sfen_input_stream->eof())
-                        return true;
+                    if (sfen_input_stream == nullptr)
+                    {
+                        std::cout << "File does not exist: " << filename << '\n';
+                    }
+                    else
+                    {
+                        std::cout << "Opened file for reading: " << filename << '\n';
+
+                        // in case the file is empty or was deleted.
+                        if (sfen_input_stream->eof())
+                        {
+                            std::cout << "File empty, nothing to read.\n";
+                        }
+                        else
+                        {
+                            return true;
+                        }
+                    }
                 }
             };
 
diff --git a/src/learn/sfen_stream.h b/src/learn/sfen_stream.h
index 4d44901b..d25dd41d 100644
--- a/src/learn/sfen_stream.h
+++ b/src/learn/sfen_stream.h
@@ -191,7 +191,6 @@ namespace Learner {
         else if (has_extension(filename, BinpackSfenInputStream::extension))
             return std::make_unique<BinpackSfenInputStream>(filename);
 
-        assert(false);
         return nullptr;
     }
 

From 7b4a769cca7bba6971460fd96149dd1c4f29d374 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 22 Oct 2020 11:26:59 +0200
Subject: [PATCH 358/583] Fix base_dir not being applied to singular filenames.

---
 src/learn/learn.cpp | 40 +++++++++++++++++++++++++++++-----------
 1 file changed, 29 insertions(+), 11 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index f6f4b3f4..b945e06c 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -157,6 +157,32 @@ namespace Learner
     using Loss = Detail::Loss<false>;
     using AtomicLoss = Detail::Loss<true>;
 
+    static void append_files_from_dir(
+        std::vector<std::string>& filenames,
+        const std::string& base_dir,
+        const std::string& target_dir)
+    {
+        string kif_base_dir = Path::combine(base_dir, target_dir);
+
+        namespace sys = std::filesystem;
+        sys::path p(kif_base_dir); // Origin of enumeration
+        std::for_each(sys::directory_iterator(p), sys::directory_iterator(),
+            [&](const sys::path& path) {
+                if (sys::is_regular_file(path))
+                    filenames.push_back(Path::combine(target_dir, path.filename().generic_string()));
+            });
+    }
+
+    static void rebase_files(
+        std::vector<std::string>& filenames,
+        const std::string& base_dir)
+    {
+        for (auto& file : filenames)
+        {
+            file = Path::combine(base_dir, file);
+        }
+    }
+
     // A function that converts the evaluation value to the winning rate [0,1]
     double winning_percentage(double value)
     {
@@ -1359,18 +1385,10 @@ namespace Learner
 
         LearnerThink learn_think(thread_num, seed);
 
-        // Display learning game file
-        if (target_dir != "")
+        rebase_files(filenames, base_dir);
+        if (!target_dir.empty())
         {
-            string kif_base_dir = Path::combine(base_dir, target_dir);
-
-            namespace sys = std::filesystem;
-            sys::path p(kif_base_dir); // Origin of enumeration
-            std::for_each(sys::directory_iterator(p), sys::directory_iterator(),
-                [&](const sys::path& path) {
-                    if (sys::is_regular_file(path))
-                        filenames.push_back(Path::combine(target_dir, path.filename().generic_string()));
-                });
+            append_files_from_dir(filenames, base_dir, target_dir);
         }
 
         cout << "learn from ";

From 9564a52523b6001ea4d0e34fa17b8835c4a7b116 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 18 Oct 2020 14:29:12 +0200
Subject: [PATCH 359/583] Remove whole file shuffling as it does not change
 learning behaviour, only works for bin, and is considered harmful for
 binpack.

---
 src/learn/learn.cpp | 283 --------------------------------------------
 1 file changed, 283 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index b945e06c..2cab54b7 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -904,252 +904,6 @@ namespace Learner
         return false;
     }
 
-    // Shuffle_files(), shuffle_files_quick() subcontracting, writing part.
-    // output_file_name: Name of the file to write
-    // prng: random number generator
-    // sfen_file_streams: fstream of each teacher phase file
-    // sfen_count_in_file: The number of teacher positions present in each file.
-    void shuffle_write(
-        const string& output_file_name,
-        PRNG& prng,
-        vector<fstream>& sfen_file_streams,
-        vector<uint64_t>& sfen_count_in_file)
-    {
-        uint64_t total_sfen_count = 0;
-        for (auto c : sfen_count_in_file)
-            total_sfen_count += c;
-
-        // number of exported phases
-        uint64_t write_sfen_count = 0;
-
-        // Output the progress on the screen for each phase.
-        const uint64_t buffer_size = 10000000;
-
-        auto print_status = [&]()
-        {
-            // Output progress every 10M phase or when all writing is completed
-            if (((write_sfen_count % buffer_size) == 0) ||
-                (write_sfen_count == total_sfen_count))
-            {
-                cout << write_sfen_count << " / " << total_sfen_count << endl;
-            }
-        };
-
-        cout << endl << "write : " << output_file_name << endl;
-
-        fstream fs(output_file_name, ios::out | ios::binary);
-
-        // total teacher positions
-        uint64_t sfen_count_left = total_sfen_count;
-
-        while (sfen_count_left != 0)
-        {
-            auto r = prng.rand(sfen_count_left);
-
-            // Aspects stored in fs[0] file ... Aspects stored in fs[1] file ...
-            //Think of it as a series like, and determine in which file r is pointing.
-            // The contents of the file are shuffled, so you can take the next element from that file.
-            // Each file has a_count[x] phases, so this process can be written as follows.
-
-            uint64_t i = 0;
-            while (sfen_count_in_file[i] <= r)
-                r -= sfen_count_in_file[i++];
-
-            // This confirms n. Before you forget it, reduce the remaining number.
-
-            --sfen_count_in_file[i];
-            --sfen_count_left;
-
-            PackedSfenValue psv;
-            // It's better to read and write all at once until the performance is not so good...
-            if (sfen_file_streams[i].read((char*)&psv, sizeof(PackedSfenValue)))
-            {
-                fs.write((char*)&psv, sizeof(PackedSfenValue));
-                ++write_sfen_count;
-                print_status();
-            }
-        }
-
-        print_status();
-        fs.close();
-
-        cout << "done!" << endl;
-    }
-
-    // Subcontracting the teacher shuffle "learn shuffle" command.
-    // output_file_name: name of the output file where the shuffled teacher positions will be written
-    void shuffle_files(const vector<string>& filenames, const string& output_file_name, uint64_t buffer_size, const std::string& seed)
-    {
-        // The destination folder is
-        // tmp/ for temporary writing
-
-        // Temporary file is written to tmp/ folder for each buffer_size phase.
-        // For example, if buffer_size = 20M, you need a buffer of 20M*40bytes = 800MB.
-        // In a PC with a small memory, it would be better to reduce this.
-        // However, if the number of files increases too much,
-        // it will not be possible to open at the same time due to OS restrictions.
-        // There should have been a limit of 512 per process on Windows, so you can open here as 500,
-        // The current setting is 500 files x 20M = 10G = 10 billion phases.
-
-        PSVector buf(buffer_size);
-
-        // ↑ buffer, a marker that indicates how much you have used
-        uint64_t buf_write_marker = 0;
-
-        // File name to write (incremental counter because it is a serial number)
-        uint64_t write_file_count = 0;
-
-        // random number to shuffle
-        // Do not use std::random_device().  Because it always the same integers on MinGW.
-        PRNG prng(seed);
-
-        // generate the name of the temporary file
-        auto make_filename = [](uint64_t i)
-        {
-            return "tmp/" + to_string(i) + ".bin";
-        };
-
-        // Exported files in tmp/ folder, number of teacher positions stored in each
-        vector<uint64_t> a_count;
-
-        auto write_buffer = [&](uint64_t size)
-        {
-            Algo::shuffle(buf, prng);
-
-            // write to a file
-            fstream fs;
-            fs.open(make_filename(write_file_count++), ios::out | ios::binary);
-            fs.write(reinterpret_cast<char*>(buf.data()), size * sizeof(PackedSfenValue));
-            fs.close();
-            a_count.push_back(size);
-
-            buf_write_marker = 0;
-            cout << ".";
-        };
-
-        std::filesystem::create_directory("tmp");
-
-        // Shuffle and export as a 10M phase shredded file.
-        for (auto filename : filenames)
-        {
-            fstream fs(filename, ios::in | ios::binary);
-            cout << endl << "open file = " << filename;
-            while (fs.read(reinterpret_cast<char*>(&buf[buf_write_marker]), sizeof(PackedSfenValue)))
-                if (++buf_write_marker == buffer_size)
-                    write_buffer(buffer_size);
-
-            // Read in units of sizeof(PackedSfenValue),
-            // Ignore the last remaining fraction. (Fails in fs.read, so exit while)
-            // (The remaining fraction seems to be half-finished data
-            // that was created because it was stopped halfway during teacher generation.)
-        }
-
-        if (buf_write_marker != 0)
-            write_buffer(buf_write_marker);
-
-        // Only shuffled files have been written write_file_count.
-        // As a second pass, if you open all of them at the same time,
-        // select one at random and load one phase at a time
-        // Now you have shuffled.
-
-        // Original file for shirt full + tmp file + file to write
-        // requires 3 times the storage capacity of the original file.
-        // 1 billion SSD is not enough for shuffling because it is 400GB for 10 billion phases.
-        // If you want to delete (or delete by hand) the
-        // original file at this point after writing to tmp,
-        // The storage capacity is about twice that of the original file.
-        // So, maybe we should have an option to delete the original file.
-
-        // Files are opened at the same time. It is highly possible that this will exceed FOPEN_MAX.
-        // In that case, rather than adjusting buffer_size to reduce the number of files.
-
-        vector<fstream> afs;
-        for (uint64_t i = 0; i < write_file_count; ++i)
-            afs.emplace_back(fstream(make_filename(i), ios::in | ios::binary));
-
-        // Throw to the subcontract function and end.
-        shuffle_write(output_file_name, prng, afs, a_count);
-    }
-
-    // Subcontracting the teacher shuffle "learn shuffleq" command.
-    // This is written in 1 pass.
-    // output_file_name: name of the output file where the shuffled teacher positions will be written
-    void shuffle_files_quick(const vector<string>& filenames, const string& output_file_name, const std::string& seed)
-    {
-        // random number to shuffle
-        // Do not use std::random_device().  Because it always the same integers on MinGW.
-        PRNG prng(seed);
-
-        // number of files
-        const size_t file_count = filenames.size();
-
-        // Number of teacher positions stored in each file in filenames
-        vector<uint64_t> sfen_count_in_file(file_count);
-
-        // Count the number of teacher aspects in each file.
-        vector<fstream> sfen_file_streams(file_count);
-
-        for (size_t i = 0; i < file_count; ++i)
-        {
-            auto filename = filenames[i];
-            auto& fs = sfen_file_streams[i];
-
-            fs.open(filename, ios::in | ios::binary);
-            const uint64_t file_size = get_file_size(fs);
-            const uint64_t sfen_count = file_size / sizeof(PackedSfenValue);
-            sfen_count_in_file[i] = sfen_count;
-
-            // Output the number of sfen stored in each file.
-            cout << filename << " = " << sfen_count << " sfens." << endl;
-        }
-
-        // Since we know the file size of each file,
-        // open them all at once (already open),
-        // Select one at a time and load one phase at a time
-        // Now you have shuffled.
-
-        // Throw to the subcontract function and end.
-        shuffle_write(output_file_name, prng, sfen_file_streams, sfen_count_in_file);
-    }
-
-    // Subcontracting the teacher shuffle "learn shufflem" command.
-    // Read the whole memory and write it out with the specified file name.
-    void shuffle_files_on_memory(const vector<string>& filenames, const string output_file_name, const std::string& seed)
-    {
-        PSVector buf;
-
-        for (auto filename : filenames)
-        {
-            std::cout << "read : " << filename << std::endl;
-            read_file_to_memory(filename, [&buf](uint64_t size) {
-                assert((size % sizeof(PackedSfenValue)) == 0);
-                // Expand the buffer and read after the last end.
-                uint64_t last = buf.size();
-                buf.resize(last + size / sizeof(PackedSfenValue));
-                return (void*)&buf[last];
-                });
-        }
-
-        // shuffle from buf[0] to buf[size-1]
-        // Do not use std::random_device().  Because it always the same integers on MinGW.
-        PRNG prng(seed);
-        uint64_t size = (uint64_t)buf.size();
-        std::cout << "shuffle buf.size() = " << size << std::endl;
-
-        Algo::shuffle(buf, prng);
-
-        std::cout << "write : " << output_file_name << endl;
-
-        // If the file to be written exceeds 2GB, it cannot be
-        // written in one shot with fstream::write, so use wrapper.
-        write_memory_to_file(
-            output_file_name,
-            (void*)&buf[0],
-            sizeof(PackedSfenValue) * buf.size());
-
-        std::cout << "..shuffle_on_memory done." << std::endl;
-    }
-
     static void set_learning_search_limits()
     {
         // About Search::Limits
@@ -1192,13 +946,6 @@ namespace Learner
         // --- Function that only shuffles the teacher aspect
 
         // normal shuffle
-        bool shuffle_normal = false;
-        uint64_t buffer_size = 20000000;
-        // fast shuffling assuming each file is shuffled
-        bool shuffle_quick = false;
-        // A function to read the entire file in memory and shuffle it.
-        // (Requires file size memory)
-        bool shuffle_on_memory = false;
         // Conversion of packed sfen. In plain, it consists of sfen(string),
         // evaluation value (integer), move (eg 7g7f, string), result (loss-1, win 1, draw 0)
         bool use_convert_plain = false;
@@ -1318,13 +1065,6 @@ namespace Learner
 
             else if (option == "reduction_gameply") is >> reduction_gameply;
 
-            // shuffle related
-            else if (option == "shuffle")   shuffle_normal = true;
-            else if (option == "buffer_size") is >> buffer_size;
-            else if (option == "shuffleq")  shuffle_quick = true;
-            else if (option == "shufflem")  shuffle_on_memory = true;
-            else if (option == "output_file_name") is >> output_file_name;
-
             else if (option == "eval_limit") is >> eval_limit;
             else if (option == "save_only_once") save_only_once = true;
             else if (option == "no_shuffle") no_shuffle = true;
@@ -1404,29 +1144,6 @@ namespace Learner
         cout << "base dir        : " << base_dir << endl;
         cout << "target dir      : " << target_dir << endl;
 
-        // shuffle mode
-        if (shuffle_normal)
-        {
-            cout << "buffer_size     : " << buffer_size << endl;
-            cout << "shuffle mode.." << endl;
-            shuffle_files(filenames, output_file_name, buffer_size, seed);
-            return;
-        }
-
-        if (shuffle_quick)
-        {
-            cout << "quick shuffle mode.." << endl;
-            shuffle_files_quick(filenames, output_file_name, seed);
-            return;
-        }
-
-        if (shuffle_on_memory)
-        {
-            cout << "shuffle on memory.." << endl;
-            shuffle_files_on_memory(filenames, output_file_name, seed);
-            return;
-        }
-
         if (use_convert_plain)
         {
             Eval::NNUE::init();

From f7530de20def38f858bc00cf3608d0247ea8c925 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 22 Oct 2020 15:39:43 +0200
Subject: [PATCH 360/583] Fix assertion in trainer

---
 src/nnue/trainer/trainer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/nnue/trainer/trainer.h b/src/nnue/trainer/trainer.h
index 85666576..763bd5c8 100644
--- a/src/nnue/trainer/trainer.h
+++ b/src/nnue/trainer/trainer.h
@@ -38,7 +38,7 @@ namespace Eval::NNUE {
 
         TrainingFeature& operator+=(const TrainingFeature& other) {
             assert(other.get_index() == get_index());
-            assert(other.get_index() + get_count() < (1 << kCountBits));
+            assert(other.get_count() + get_count() < (1 << kCountBits));
             index_and_count_ += other.get_count();
             return *this;
         }

From c7ac3688a7bc87b0984cadb50778952ca1149ccd Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 18 Oct 2020 14:51:38 +0200
Subject: [PATCH 361/583] Move the old convert stuff from learn to their own
 commands.

---
 src/learn/convert.cpp | 212 ++++++++++++++++++++++++++++++++++++++++++
 src/learn/convert.h   |  29 ++----
 src/learn/learn.cpp   |  84 ++---------------
 src/uci.cpp           |   3 +
 4 files changed, 228 insertions(+), 100 deletions(-)

diff --git a/src/learn/convert.cpp b/src/learn/convert.cpp
index 59111dcf..a7528b02 100644
--- a/src/learn/convert.cpp
+++ b/src/learn/convert.cpp
@@ -10,6 +10,8 @@
 
 #include "extra/nnue_data_binpack_format.h"
 
+#include "nnue/evaluate_nnue.h"
+
 #include "syzygy/tbprobe.h"
 
 #include <sstream>
@@ -600,4 +602,214 @@ namespace Learner
 
         convert(args);
     }
+
+    static void append_files_from_dir(
+        std::vector<std::string>& filenames,
+        const std::string& base_dir,
+        const std::string& target_dir)
+    {
+        string kif_base_dir = Path::combine(base_dir, target_dir);
+
+        namespace sys = std::filesystem;
+        sys::path p(kif_base_dir); // Origin of enumeration
+        std::for_each(sys::directory_iterator(p), sys::directory_iterator(),
+            [&](const sys::path& path) {
+                if (sys::is_regular_file(path))
+                    filenames.push_back(Path::combine(target_dir, path.filename().generic_string()));
+            });
+    }
+
+    static void rebase_files(
+        std::vector<std::string>& filenames,
+        const std::string& base_dir)
+    {
+        for (auto& file : filenames)
+        {
+            file = Path::combine(base_dir, file);
+        }
+    }
+
+    void convert_bin_from_pgn_extract(std::istringstream& is)
+    {
+        std::vector<std::string> filenames;
+
+        string base_dir;
+        string target_dir;
+
+        bool pgn_eval_side_to_move = false;
+        bool convert_no_eval_fens_as_score_zero = false;
+
+        string output_file_name = "shuffled_sfen.bin";
+
+        while (true)
+        {
+            string option;
+            is >> option;
+
+            if (option == "")
+                break;
+
+            if (option == "targetdir") is >> target_dir;
+            else if (option == "targetfile")
+            {
+                std::string filename;
+                is >> filename;
+                filenames.push_back(filename);
+            }
+
+            else if (option == "basedir")   is >> base_dir;
+
+            else if (option == "pgn_eval_side_to_move") is >> pgn_eval_side_to_move;
+            else if (option == "convert_no_eval_fens_as_score_zero") is >> convert_no_eval_fens_as_score_zero;
+            else if (option == "output_file_name") is >> output_file_name;
+            else
+            {
+                cout << "Unknown option: " << option << ". Ignoring.\n";
+            }
+        }
+
+        if (!target_dir.empty())
+        {
+            append_files_from_dir(filenames, base_dir, target_dir);
+        }
+        rebase_files(filenames, base_dir);
+
+        Eval::NNUE::init();
+
+        cout << "convert_bin_from_pgn-extract.." << endl;
+        convert_bin_from_pgn_extract(
+            filenames,
+            output_file_name,
+            pgn_eval_side_to_move,
+            convert_no_eval_fens_as_score_zero);
+    }
+
+    void convert_bin(std::istringstream& is)
+    {
+        std::vector<std::string> filenames;
+
+        string base_dir;
+        string target_dir;
+
+        int ply_minimum = 0;
+        int ply_maximum = 114514;
+        bool interpolate_eval = 0;
+        bool check_invalid_fen = false;
+        bool check_illegal_move = false;
+
+        bool pgn_eval_side_to_move = false;
+        bool convert_no_eval_fens_as_score_zero = false;
+
+        double src_score_min_value = 0.0;
+        double src_score_max_value = 1.0;
+        double dest_score_min_value = 0.0;
+        double dest_score_max_value = 1.0;
+
+        string output_file_name = "shuffled_sfen.bin";
+
+        while (true)
+        {
+            string option;
+            is >> option;
+
+            if (option == "")
+                break;
+
+            if (option == "targetdir") is >> target_dir;
+            else if (option == "targetfile")
+            {
+                std::string filename;
+                is >> filename;
+                filenames.push_back(filename);
+            }
+
+            else if (option == "basedir")   is >> base_dir;
+
+            else if (option == "ply_minimum") is >> ply_minimum;
+            else if (option == "ply_maximum") is >> ply_maximum;
+            else if (option == "interpolate_eval") is >> interpolate_eval;
+            else if (option == "check_invalid_fen") is >> check_invalid_fen;
+            else if (option == "check_illegal_move") is >> check_illegal_move;
+            else if (option == "pgn_eval_side_to_move") is >> pgn_eval_side_to_move;
+            else if (option == "convert_no_eval_fens_as_score_zero") is >> convert_no_eval_fens_as_score_zero;
+            else if (option == "src_score_min_value") is >> src_score_min_value;
+            else if (option == "src_score_max_value") is >> src_score_max_value;
+            else if (option == "dest_score_min_value") is >> dest_score_min_value;
+            else if (option == "dest_score_max_value") is >> dest_score_max_value;
+            else if (option == "output_file_name") is >> output_file_name;
+            else
+            {
+                cout << "Unknown option: " << option << ". Ignoring.\n";
+            }
+        }
+
+        if (!target_dir.empty())
+        {
+            append_files_from_dir(filenames, base_dir, target_dir);
+        }
+        rebase_files(filenames, base_dir);
+
+        Eval::NNUE::init();
+
+        cout << "convert_bin.." << endl;
+            convert_bin(
+                filenames,
+                output_file_name,
+                ply_minimum,
+                ply_maximum,
+                interpolate_eval,
+                src_score_min_value,
+                src_score_max_value,
+                dest_score_min_value,
+                dest_score_max_value,
+                check_invalid_fen,
+                check_illegal_move
+            );
+    }
+
+    void convert_plain(std::istringstream& is)
+    {
+        std::vector<std::string> filenames;
+
+        string base_dir;
+        string target_dir;
+
+        string output_file_name = "shuffled_sfen.bin";
+
+        while (true)
+        {
+            string option;
+            is >> option;
+
+            if (option == "")
+                break;
+
+            if (option == "targetdir") is >> target_dir;
+            else if (option == "targetfile")
+            {
+                std::string filename;
+                is >> filename;
+                filenames.push_back(filename);
+            }
+
+            else if (option == "basedir")   is >> base_dir;
+
+            else if (option == "output_file_name") is >> output_file_name;
+            else
+            {
+                cout << "Unknown option: " << option << ". Ignoring.\n";
+            }
+        }
+
+        if (!target_dir.empty())
+        {
+            append_files_from_dir(filenames, base_dir, target_dir);
+        }
+        rebase_files(filenames, base_dir);
+
+        Eval::NNUE::init();
+
+        cout << "convert_plain.." << endl;
+        convert_plain(filenames, output_file_name);
+    }
 }
diff --git a/src/learn/convert.h b/src/learn/convert.h
index a41885d9..227f0799 100644
--- a/src/learn/convert.h
+++ b/src/learn/convert.h
@@ -6,30 +6,13 @@
 #include <sstream>
 
 namespace Learner {
-    void convert_bin_from_pgn_extract(
-        const std::vector<std::string>& filenames,
-        const std::string& output_file_name,
-        const bool pgn_eval_side_to_move,
-        const bool convert_no_eval_fens_as_score_zero);
-
-    void convert_bin(
-        const std::vector<std::string>& filenames,
-        const std::string& output_file_name,
-        const int ply_minimum,
-        const int ply_maximum,
-        const int interpolate_eval,
-        const int src_score_min_value,
-        const int src_score_max_value,
-        const int dest_score_min_value,
-        const int dest_score_max_value,
-        const bool check_invalid_fen,
-        const bool check_illegal_move);
-
-    void convert_plain(
-        const std::vector<std::string>& filenames,
-        const std::string& output_file_name);
-
     void convert(std::istringstream& is);
+
+    void convert_bin_from_pgn_extract(std::istringstream& is);
+
+    void convert_bin(std::istringstream& is);
+
+    void convert_plain(std::istringstream& is);
 }
 
 #endif
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 2cab54b7..32aa986f 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -19,7 +19,6 @@
 
 #include "learn.h"
 
-#include "convert.h"
 #include "sfen_reader.h"
 
 #include "misc.h"
@@ -940,29 +939,8 @@ namespace Learner
 
         // Game file storage folder (get game file with relative path from here)
         string base_dir;
-
         string target_dir;
 
-        // --- Function that only shuffles the teacher aspect
-
-        // normal shuffle
-        // Conversion of packed sfen. In plain, it consists of sfen(string),
-        // evaluation value (integer), move (eg 7g7f, string), result (loss-1, win 1, draw 0)
-        bool use_convert_plain = false;
-        // convert plain format teacher to Yaneura King's bin
-        bool use_convert_bin = false;
-        int ply_minimum = 0;
-        int ply_maximum = 114514;
-        bool interpolate_eval = 0;
-        bool check_invalid_fen = false;
-        bool check_illegal_move = false;
-        // convert teacher in pgn-extract format to Yaneura King's bin
-        bool use_convert_bin_from_pgn_extract = false;
-        bool pgn_eval_side_to_move = false;
-        bool convert_no_eval_fens_as_score_zero = false;
-        // File name to write in those cases (default is "shuffled_sfen.bin")
-        string output_file_name = "shuffled_sfen.bin";
-
         // If the absolute value of the evaluation value
         // in the deep search of the teacher phase exceeds this value,
         // that phase is discarded.
@@ -1079,19 +1057,11 @@ namespace Learner
             else if (option == "loss_output_interval") is >> loss_output_interval;
             else if (option == "validation_set_file_name") is >> validation_set_file_name;
 
-            // Rabbit convert related
-            else if (option == "convert_plain") use_convert_plain = true;
-            else if (option == "convert_bin") use_convert_bin = true;
-            else if (option == "interpolate_eval") is >> interpolate_eval;
-            else if (option == "check_invalid_fen") is >> check_invalid_fen;
-            else if (option == "check_illegal_move") is >> check_illegal_move;
-            else if (option == "convert_bin_from_pgn-extract") use_convert_bin_from_pgn_extract = true;
-            else if (option == "pgn_eval_side_to_move") is >> pgn_eval_side_to_move;
-            else if (option == "convert_no_eval_fens_as_score_zero") is >> convert_no_eval_fens_as_score_zero;
             else if (option == "src_score_min_value") is >> src_score_min_value;
             else if (option == "src_score_max_value") is >> src_score_max_value;
             else if (option == "dest_score_min_value") is >> dest_score_min_value;
             else if (option == "dest_score_max_value") is >> dest_score_max_value;
+
             else if (option == "seed") is >> seed;
             else if (option == "set_recommended_uci_options")
             {
@@ -1123,8 +1093,8 @@ namespace Learner
         cout << "Warning! OpenMP disabled." << endl;
 #endif
 
-        LearnerThink learn_think(thread_num, seed);
-
+        // Right now we only have the individual files.
+        // We need to apply base_dir here
         rebase_files(filenames, base_dir);
         if (!target_dir.empty())
         {
@@ -1144,48 +1114,6 @@ namespace Learner
         cout << "base dir        : " << base_dir << endl;
         cout << "target dir      : " << target_dir << endl;
 
-        if (use_convert_plain)
-        {
-            Eval::NNUE::init();
-            cout << "convert_plain.." << endl;
-            convert_plain(filenames, output_file_name);
-            return;
-        }
-
-        if (use_convert_bin)
-        {
-            Eval::NNUE::init();
-            cout << "convert_bin.." << endl;
-            convert_bin(
-                filenames,
-                output_file_name,
-                ply_minimum,
-                ply_maximum,
-                interpolate_eval,
-                src_score_min_value,
-                src_score_max_value,
-                dest_score_min_value,
-                dest_score_max_value,
-                check_invalid_fen,
-                check_illegal_move);
-
-            return;
-
-        }
-
-        if (use_convert_bin_from_pgn_extract)
-        {
-            Eval::NNUE::init();
-            cout << "convert_bin_from_pgn-extract.." << endl;
-            convert_bin_from_pgn_extract(
-                filenames,
-                output_file_name,
-                pgn_eval_side_to_move,
-                convert_no_eval_fens_as_score_zero);
-
-            return;
-        }
-
         cout << "loop              : " << loop << endl;
         cout << "eval_limit        : " << eval_limit << endl;
         cout << "save_only_once    : " << (save_only_once ? "true" : "false") << endl;
@@ -1226,6 +1154,8 @@ namespace Learner
 
         cout << "init.." << endl;
 
+        LearnerThink learn_think(thread_num, seed);
+
         Threads.main()->ponder = false;
 
         set_learning_search_limits();
@@ -1244,8 +1174,6 @@ namespace Learner
                 Path::combine(Options["EvalSaveDir"], "original");
         }
 
-        cout << "init done." << endl;
-
         // Reflect other option settings.
         learn_think.eval_limit = eval_limit;
         learn_think.save_only_once = save_only_once;
@@ -1271,6 +1199,8 @@ namespace Learner
             }
         }
 
+        cout << "init done." << endl;
+
         // Start learning.
         learn_think.learn();
     }
diff --git a/src/uci.cpp b/src/uci.cpp
index b05c7eeb..398fd01a 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -341,6 +341,9 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "gensfen") Learner::gen_sfen(pos, is);
       else if (token == "learn") Learner::learn(pos, is);
       else if (token == "convert") Learner::convert(is);
+      else if (token == "convert_bin") Learner::convert_bin(is);
+      else if (token == "convert_plain") Learner::convert_plain(is);
+      else if (token == "convert_bin_from_pgn_extract") Learner::convert_bin_from_pgn_extract(is);
 
       // Command to call qsearch(),search() directly for testing
       else if (token == "qsearch") qsearch_cmd(pos);

From e4e9f7e39b7bf6c2aa627584fafd3afd7c5f74d5 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 23 Oct 2020 17:14:59 +0200
Subject: [PATCH 362/583] Reduce bench depth for testing with valgrind to
 prevent timeouts in CI.

---
 tests/instrumented.sh | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/tests/instrumented.sh b/tests/instrumented.sh
index 03e9c9de..788d8741 100755
--- a/tests/instrumented.sh
+++ b/tests/instrumented.sh
@@ -16,6 +16,9 @@ case $1 in
     exeprefix='valgrind --error-exitcode=42'
     postfix='1>/dev/null'
     threads="1"
+    bench_depth=5
+    go_depth=10
+    tt_size=16
   ;;
   --valgrind-thread)
     echo "valgrind-thread testing started"
@@ -23,6 +26,9 @@ case $1 in
     exeprefix='valgrind --fair-sched=try --error-exitcode=42'
     postfix='1>/dev/null'
     threads="2"
+    bench_depth=5
+    go_depth=10
+    tt_size=16
   ;;
   --sanitizer-undefined)
     echo "sanitizer-undefined testing started"
@@ -30,6 +36,9 @@ case $1 in
     exeprefix=''
     postfix='2>&1 | grep -A50 "runtime error:"'
     threads="1"
+    bench_depth=8
+    go_depth=20
+    tt_size=128
   ;;
   --sanitizer-thread)
     echo "sanitizer-thread testing started"
@@ -37,6 +46,9 @@ case $1 in
     exeprefix=''
     postfix='2>&1 | grep -A50 "WARNING: ThreadSanitizer:"'
     threads="2"
+    bench_depth=8
+    go_depth=20
+    tt_size=128
 
 cat << EOF > tsan.supp
 race:TTEntry::move
@@ -70,7 +82,7 @@ for args in "eval" \
             "go depth 10" \
             "go movetime 1000" \
             "go wtime 8000 btime 8000 winc 500 binc 500" \
-            "bench 128 $threads 8 default depth"
+            "bench $tt_size $threads $bench_depth default depth"
 do
 
    echo "$prefix $exeprefix ./stockfish $args $postfix"
@@ -98,7 +110,7 @@ cat << EOF > game.exp
  expect "bestmove"
 
  send "position fen 5rk1/1K4p1/8/8/3B4/8/8/8 b - - 0 1\n"
- send "go depth 20\n"
+ send "go depth $go_depth\n"
  expect "bestmove"
 
  send "quit\n"
@@ -121,7 +133,7 @@ cat << EOF > syzygy.exp
  send "uci\n"
  send "setoption name SyzygyPath value ../tests/syzygy/\n"
  expect "info string Found 35 tablebases" {} timeout {exit 1}
- send "bench 128 1 8 default depth\n"
+ send "bench $tt_size 1 $bench_depth default depth\n"
  send "quit\n"
  expect eof
 

From e4a38c18dd75d5109f6c5ba93071f68d197d5ed4 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 23 Oct 2020 22:03:01 +0200
Subject: [PATCH 363/583] Don't test syzygi

---
 tests/instrumented.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/instrumented.sh b/tests/instrumented.sh
index 788d8741..07ecbb9c 100755
--- a/tests/instrumented.sh
+++ b/tests/instrumented.sh
@@ -142,7 +142,7 @@ cat << EOF > syzygy.exp
  exit \$value
 EOF
 
-for exp in game.exp syzygy.exp
+for exp in game.exp
 do
 
   echo "$prefix expect $exp $postfix"

From 0636e1256d09edde22df3bc75207d75c24b6f2fa Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 18 Oct 2020 14:51:38 +0200
Subject: [PATCH 364/583] Add cyclic mode to the sfen reader. Make sfen reader
 take all files at construction

---
 src/learn/learn.cpp     | 44 ++++++++++++++++++--------------
 src/learn/sfen_reader.h | 56 ++++++++++++++++++++++++++++++-----------
 2 files changed, 66 insertions(+), 34 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 32aa986f..57dbeb63 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -381,9 +381,13 @@ namespace Learner
         // move match rate, simple comparison is not possible...
         static constexpr uint64_t sfen_for_mse_size = 2000;
 
-        LearnerThink(uint64_t thread_num, const std::string& seed) :
+        LearnerThink(
+            const std::vector<std::string>& filenames,
+            uint64_t thread_num,
+            const std::string& seed
+        ) :
             prng(seed),
-            sr(thread_num, std::to_string(prng.next_random_seed())),
+            sr(filenames, SfenReaderMode::Sequential, thread_num, std::to_string(prng.next_random_seed())),
             learn_loss_sum{}
         {
             save_only_once = false;
@@ -404,11 +408,6 @@ namespace Learner
             sr.set_do_shuffle(v);
         }
 
-        void add_file(const std::string& filename)
-        {
-            sr.add_file(filename);
-        }
-
         void learn();
 
 
@@ -1095,11 +1094,26 @@ namespace Learner
 
         // Right now we only have the individual files.
         // We need to apply base_dir here
-        rebase_files(filenames, base_dir);
         if (!target_dir.empty())
         {
             append_files_from_dir(filenames, base_dir, target_dir);
         }
+        rebase_files(filenames, base_dir);
+
+        // Insert the file name for the number of loops.
+        {
+            std::vector<std::string> filenamesTimesLoop;
+
+            for (int i = 0; i < loop; ++i)
+            {
+                for(auto& file : filenames)
+                {
+                    filenamesTimesLoop.emplace_back(file);
+                }
+            }
+
+            filenames = std::move(filenamesTimesLoop);
+        }
 
         cout << "learn from ";
         for (auto s : filenames)
@@ -1154,8 +1168,6 @@ namespace Learner
 
         cout << "init.." << endl;
 
-        LearnerThink learn_think(thread_num, seed);
-
         Threads.main()->ponder = false;
 
         set_learning_search_limits();
@@ -1164,6 +1176,9 @@ namespace Learner
         Eval::NNUE::initialize_training(seed);
         Eval::NNUE::set_batch_size(nn_batch_size);
         Eval::NNUE::set_options(nn_options);
+
+        LearnerThink learn_think(filenames, thread_num, seed);
+
         if (newbob_decay != 1.0 && !Options["SkipLoadingEval"]) {
             // Save the current net to [EvalSaveDir]\original.
             Eval::NNUE::save_eval("original");
@@ -1190,15 +1205,6 @@ namespace Learner
         learn_think.mini_batch_size = mini_batch_size;
         learn_think.validation_set_file_name = validation_set_file_name;
 
-        // Insert the file name for the number of loops.
-        for (int i = 0; i < loop; ++i)
-        {
-            for(auto& file : filenames)
-            {
-                learn_think.add_file(Path::combine(base_dir, file));
-            }
-        }
-
         cout << "init done." << endl;
 
         // Start learning.
diff --git a/src/learn/sfen_reader.h b/src/learn/sfen_reader.h
index 38c2532c..1ba9bd3b 100644
--- a/src/learn/sfen_reader.h
+++ b/src/learn/sfen_reader.h
@@ -18,6 +18,12 @@
 
 namespace Learner{
 
+    enum struct SfenReaderMode
+    {
+        Sequential,
+        Cyclic
+    };
+
     // Sfen reader
     struct SfenReader
     {
@@ -32,7 +38,14 @@ namespace Learner{
 
         // Do not use std::random_device().
         // Because it always the same integers on MinGW.
-        SfenReader(int thread_num, const std::string& seed) :
+        SfenReader(
+            const std::vector<std::string>& filenames_,
+            SfenReaderMode mode_,
+            int thread_num,
+            const std::string& seed
+        ) :
+            filenames(filenames_.begin(), filenames_.end()),
+            mode(mode_),
             prng(seed)
         {
             packed_sfens.resize(thread_num);
@@ -173,6 +186,9 @@ namespace Learner{
 
         void file_read_worker()
         {
+            std::string currentFilename;
+            uint64_t numEntriesReadFromCurrentFile = 0;
+
             auto open_next_file = [&]() {
                 // no more
                 for(;;)
@@ -183,18 +199,20 @@ namespace Learner{
                         return false;
 
                     // Get the next file name.
-                    std::string filename = filenames.front();
+                    currentFilename = filenames.front();
                     filenames.pop_front();
 
-                    sfen_input_stream = open_sfen_input_file(filename);
+                    numEntriesReadFromCurrentFile = 0;
+
+                    sfen_input_stream = open_sfen_input_file(currentFilename);
 
                     if (sfen_input_stream == nullptr)
                     {
-                        std::cout << "File does not exist: " << filename << '\n';
+                        std::cout << "File does not exist: " << currentFilename << '\n';
                     }
                     else
                     {
-                        std::cout << "Opened file for reading: " << filename << '\n';
+                        std::cout << "Opened file for reading: " << currentFilename << '\n';
 
                         // in case the file is empty or was deleted.
                         if (sfen_input_stream->eof())
@@ -236,13 +254,24 @@ namespace Learner{
                     if (p.has_value())
                     {
                         sfens.push_back(*p);
+                        ++numEntriesReadFromCurrentFile;
                     }
-                    else if(!open_next_file())
+                    else
                     {
-                        // There was no next file. Abort.
-                        std::cout << "..end of files." << std::endl;
-                        end_of_files = true;
-                        return;
+                        if (mode == SfenReaderMode::Cyclic
+                            && numEntriesReadFromCurrentFile > 0)
+                        {
+                            // The file contained data so we add it again to the end of the queue.
+                            filenames.emplace_back(currentFilename);
+                        }
+
+                        if(!open_next_file())
+                        {
+                            // There was no next file. Abort.
+                            std::cout << "..end of files." << std::endl;
+                            end_of_files = true;
+                            return;
+                        }
                     }
                 }
 
@@ -295,11 +324,6 @@ namespace Learner{
             shuffle = v;
         }
 
-        void add_file(const std::string& filename)
-        {
-            filenames.push_back(filename);
-        }
-
     protected:
 
         // worker thread reading file in background
@@ -316,6 +340,8 @@ namespace Learner{
         // Do not shuffle when reading the phase.
         bool shuffle;
 
+        SfenReaderMode mode;
+
         // Random number to shuffle when reading the phase
         PRNG prng;
 

From c58aa9696ad3c579fe7f610505fe6b0903062182 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 22 Oct 2020 14:43:38 +0200
Subject: [PATCH 365/583] Start sfen reader worker thread in the constructor.

---
 src/learn/learn.cpp     |  3 ---
 src/learn/sfen_reader.h | 12 ++++--------
 2 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 57dbeb63..cc51b04e 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -490,9 +490,6 @@ namespace Learner
 
         Eval::NNUE::verify_any_net_loaded();
 
-        // Start a thread that loads the training data in the background
-        sr.start_file_read_worker();
-
         const PSVector sfen_for_mse =
             validation_set_file_name.empty()
             ? sr.read_for_mse(sfen_for_mse_size)
diff --git a/src/learn/sfen_reader.h b/src/learn/sfen_reader.h
index 1ba9bd3b..78bf4ee8 100644
--- a/src/learn/sfen_reader.h
+++ b/src/learn/sfen_reader.h
@@ -53,6 +53,10 @@ namespace Learner{
             end_of_files = false;
             shuffle = true;
             stop_flag = false;
+
+            file_worker_thread = std::thread([&] {
+                this->file_read_worker();
+            });
         }
 
         ~SfenReader()
@@ -176,14 +180,6 @@ namespace Learner{
 
         }
 
-        // Start a thread that loads the phase file in the background.
-        void start_file_read_worker()
-        {
-            file_worker_thread = std::thread([&] {
-                this->file_read_worker();
-                });
-        }
-
         void file_read_worker()
         {
             std::string currentFilename;

From ad3d1b42e4f5de24608053e0d27031c4c52887d9 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 22 Oct 2020 14:46:19 +0200
Subject: [PATCH 366/583] Make sfen reader only stop when it's destroyed. Now
 it is fully RAII.

---
 src/learn/learn.cpp     | 4 ----
 src/learn/sfen_reader.h | 7 ++-----
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index cc51b04e..3e4f9495 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -502,8 +502,6 @@ namespace Learner
                 << "Error reading sfen_for_mse. Read " << sfen_for_mse.size()
                 << " out of " << sfen_for_mse_size << '\n';
 
-            sr.stop();
-
             return;
         }
 
@@ -541,8 +539,6 @@ namespace Learner
                 break;
         }
 
-        sr.stop();
-
         Eval::NNUE::finalize_net();
 
         save(true);
diff --git a/src/learn/sfen_reader.h b/src/learn/sfen_reader.h
index 78bf4ee8..0ef9765b 100644
--- a/src/learn/sfen_reader.h
+++ b/src/learn/sfen_reader.h
@@ -61,6 +61,8 @@ namespace Learner{
 
         ~SfenReader()
         {
+            stop_flag = true;
+
             if (file_worker_thread.joinable())
                 file_worker_thread.join();
         }
@@ -310,11 +312,6 @@ namespace Learner{
             }
         }
 
-        void stop()
-        {
-            stop_flag = true;
-        }
-
         void set_do_shuffle(bool v)
         {
             shuffle = v;

From fc3788f630b4524bfd50ade8ed46b0f007fa1b5a Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 22 Oct 2020 15:45:33 +0200
Subject: [PATCH 367/583] Use cyclic sfen reader for learning, change loop
 option to epochs.

---
 src/learn/learn.cpp | 52 ++++++++++++++-------------------------------
 1 file changed, 16 insertions(+), 36 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 3e4f9495..66a27b28 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -387,7 +387,7 @@ namespace Learner
             const std::string& seed
         ) :
             prng(seed),
-            sr(filenames, SfenReaderMode::Sequential, thread_num, std::to_string(prng.next_random_seed())),
+            sr(filenames, SfenReaderMode::Cyclic, thread_num, std::to_string(prng.next_random_seed())),
             learn_loss_sum{}
         {
             save_only_once = false;
@@ -408,7 +408,7 @@ namespace Learner
             sr.set_do_shuffle(v);
         }
 
-        void learn();
+        void learn(uint64_t epochs);
 
 
         std::string validation_set_file_name;
@@ -439,9 +439,9 @@ namespace Learner
     private:
         void learn_worker(Thread& th, std::atomic<uint64_t>& counter, uint64_t limit);
 
-        void update_weights(const PSVector& psv);
+        void update_weights(const PSVector& psv, uint64_t epoch);
 
-        void calc_loss(const PSVector& psv);
+        void calc_loss(const PSVector& psv, uint64_t epoch);
 
         void calc_loss_worker(
             Thread& th,
@@ -465,9 +465,6 @@ namespace Learner
         uint64_t save_count;
         uint64_t loss_output_count;
 
-        // Learning iteration counter
-        uint64_t epoch = 0;
-
         std::atomic<bool> stop_flag;
 
         uint64_t total_done;
@@ -481,7 +478,7 @@ namespace Learner
         AtomicLoss learn_loss_sum;
     };
 
-    void LearnerThink::learn()
+    void LearnerThink::learn(uint64_t epochs)
     {
 
 #if defined(_OPENMP)
@@ -507,7 +504,7 @@ namespace Learner
 
         if (newbob_decay != 1.0) {
 
-            calc_loss(sfen_for_mse);
+            calc_loss(sfen_for_mse, 0);
 
             best_loss = latest_loss_sum / latest_loss_count;
             latest_loss_sum = 0.0;
@@ -518,7 +515,7 @@ namespace Learner
 
         stop_flag = false;
 
-        for(;;)
+        for(uint64_t epoch = 1; epoch <= epochs; ++epoch)
         {
             std::atomic<uint64_t> counter{0};
 
@@ -533,7 +530,7 @@ namespace Learner
             if (stop_flag)
                 break;
 
-            update_weights(sfen_for_mse);
+            update_weights(sfen_for_mse, epoch);
 
             if (stop_flag)
                 break;
@@ -639,7 +636,7 @@ namespace Learner
         learn_loss_sum += local_loss_sum;
     }
 
-    void LearnerThink::update_weights(const PSVector& psv)
+    void LearnerThink::update_weights(const PSVector& psv, uint64_t epoch)
     {
         // I'm not sure this fencing is correct. But either way there
         // should be no real issues happening since
@@ -648,8 +645,6 @@ namespace Learner
         Eval::NNUE::update_parameters();
         atomic_thread_fence(memory_order_seq_cst);
 
-        ++epoch;
-
         if (++save_count * mini_batch_size >= eval_save_interval)
         {
             save_count = 0;
@@ -667,13 +662,13 @@ namespace Learner
             loss_output_count = 0;
 
             // loss calculation
-            calc_loss(psv);
+            calc_loss(psv, epoch);
 
             Eval::NNUE::check_health();
         }
     }
 
-    void LearnerThink::calc_loss(const PSVector& psv)
+    void LearnerThink::calc_loss(const PSVector& psv, uint64_t epoch)
     {
         TT.new_search();
         TimePoint elapsed = now() - Search::Limits.startTime + 1;
@@ -926,8 +921,8 @@ namespace Learner
         // mini_batch_size 1M aspect by default. This can be increased.
         auto mini_batch_size = LEARN_MINI_BATCH_SIZE;
 
-        // Number of loops (read the game record file this number of times)
-        int loop = 1;
+        // Number of epochs
+        uint64_t epochs = 1;
 
         // Game file storage folder (get game file with relative path from here)
         string base_dir;
@@ -996,7 +991,7 @@ namespace Learner
             }
 
             // Specify the number of loops
-            else if (option == "loop")      is >> loop;
+            else if (option == "epochs")      is >> epochs;
 
             // Game file storage folder (get game file with relative path from here)
             else if (option == "basedir")   is >> base_dir;
@@ -1093,21 +1088,6 @@ namespace Learner
         }
         rebase_files(filenames, base_dir);
 
-        // Insert the file name for the number of loops.
-        {
-            std::vector<std::string> filenamesTimesLoop;
-
-            for (int i = 0; i < loop; ++i)
-            {
-                for(auto& file : filenames)
-                {
-                    filenamesTimesLoop.emplace_back(file);
-                }
-            }
-
-            filenames = std::move(filenamesTimesLoop);
-        }
-
         cout << "learn from ";
         for (auto s : filenames)
             cout << s << " , ";
@@ -1121,7 +1101,7 @@ namespace Learner
         cout << "base dir        : " << base_dir << endl;
         cout << "target dir      : " << target_dir << endl;
 
-        cout << "loop              : " << loop << endl;
+        cout << "epochs            : " << epochs << endl;
         cout << "eval_limit        : " << eval_limit << endl;
         cout << "save_only_once    : " << (save_only_once ? "true" : "false") << endl;
         cout << "no_shuffle        : " << (no_shuffle ? "true" : "false") << endl;
@@ -1201,7 +1181,7 @@ namespace Learner
         cout << "init done." << endl;
 
         // Start learning.
-        learn_think.learn();
+        learn_think.learn(epochs);
     }
 
 } // namespace Learner

From 31f94a18b3368a533b874e5e5b65970725f30597 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 22 Oct 2020 15:57:46 +0200
Subject: [PATCH 368/583] Update readme and docs after change from loop to
 epochs.

---
 README.md           | 2 +-
 docs/learn.md       | 2 +-
 src/learn/learn.cpp | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 84898792..5fa8179e 100644
--- a/README.md
+++ b/README.md
@@ -78,7 +78,7 @@ setoption name SkipLoadingEval value true
 setoption name Use NNUE value pure
 setoption name Threads value x
 isready
-learn targetdir trainingdata loop 100 batchsize 1000000 use_draw_in_training 1 use_draw_in_validation 1 lr 1 lambda 1 eval_limit 32000 nn_batch_size 1000 newbob_decay 0.5 eval_save_interval 250000000 loss_output_interval 1000000 validation_set_file_name validationdata\val.binpack
+learn targetdir trainingdata epochs 10000 batchsize 1000000 use_draw_in_training 1 use_draw_in_validation 1 lr 1 lambda 1 eval_limit 32000 nn_batch_size 1000 newbob_decay 0.5 eval_save_interval 250000000 loss_output_interval 1000000 validation_set_file_name validationdata\val.binpack
 ```
 
 This will utilize training data files in the "trainingdata" directory and validation data from file "validationdata\val.bin". Produced nets are saved in the "evalsave" folder.
diff --git a/docs/learn.md b/docs/learn.md
index 3a580134..dc55ec1f 100644
--- a/docs/learn.md
+++ b/docs/learn.md
@@ -20,7 +20,7 @@ Currently the following options are available:
 
 `targetdir` - path to the direction from which training data will be read. All files in this directory are read sequentially. If not specified then only the list of files from positional arguments will be used. If specified then files from the given directory will be used after the explicitly specified files.
 
-`loop` - the number of times to loop over all training data.
+`epochs` - the number of weight update cycles (epochs) to train the network for. One such cycle is `batchsize` positions. If not specified then the training will loop forever.
 
 `basedir` - the base directory for the paths. Default: "" (current directory)
 
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 66a27b28..328f646a 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -922,7 +922,7 @@ namespace Learner
         auto mini_batch_size = LEARN_MINI_BATCH_SIZE;
 
         // Number of epochs
-        uint64_t epochs = 1;
+        uint64_t epochs = std::numeric_limits<uint64_t>::max();
 
         // Game file storage folder (get game file with relative path from here)
         string base_dir;

From 8fb208598b496cf79b8f457b064eb9800ca59cf3 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 23 Oct 2020 22:19:50 +0200
Subject: [PATCH 369/583] pass shuffle flag in the constructor

---
 src/learn/learn.cpp     | 11 +++--------
 src/learn/sfen_reader.h |  8 ++------
 2 files changed, 5 insertions(+), 14 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 328f646a..fa447a77 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -383,11 +383,12 @@ namespace Learner
 
         LearnerThink(
             const std::vector<std::string>& filenames,
+            bool shuffle,
             uint64_t thread_num,
             const std::string& seed
         ) :
             prng(seed),
-            sr(filenames, SfenReaderMode::Cyclic, thread_num, std::to_string(prng.next_random_seed())),
+            sr(filenames, shuffle, SfenReaderMode::Cyclic, thread_num, std::to_string(prng.next_random_seed())),
             learn_loss_sum{}
         {
             save_only_once = false;
@@ -403,11 +404,6 @@ namespace Learner
             total_done = 0;
         }
 
-        void set_do_shuffle(bool v)
-        {
-            sr.set_do_shuffle(v);
-        }
-
         void learn(uint64_t epochs);
 
 
@@ -1150,7 +1146,7 @@ namespace Learner
         Eval::NNUE::set_batch_size(nn_batch_size);
         Eval::NNUE::set_options(nn_options);
 
-        LearnerThink learn_think(filenames, thread_num, seed);
+        LearnerThink learn_think(filenames, !no_shuffle, thread_num, seed);
 
         if (newbob_decay != 1.0 && !Options["SkipLoadingEval"]) {
             // Save the current net to [EvalSaveDir]\original.
@@ -1165,7 +1161,6 @@ namespace Learner
         // Reflect other option settings.
         learn_think.eval_limit = eval_limit;
         learn_think.save_only_once = save_only_once;
-        learn_think.set_do_shuffle(!no_shuffle);
         learn_think.reduction_gameply = reduction_gameply;
 
         learn_think.newbob_decay = newbob_decay;
diff --git a/src/learn/sfen_reader.h b/src/learn/sfen_reader.h
index 0ef9765b..d39fef4e 100644
--- a/src/learn/sfen_reader.h
+++ b/src/learn/sfen_reader.h
@@ -40,6 +40,7 @@ namespace Learner{
         // Because it always the same integers on MinGW.
         SfenReader(
             const std::vector<std::string>& filenames_,
+            bool do_shuffle,
             SfenReaderMode mode_,
             int thread_num,
             const std::string& seed
@@ -51,7 +52,7 @@ namespace Learner{
             packed_sfens.resize(thread_num);
             total_read = 0;
             end_of_files = false;
-            shuffle = true;
+            shuffle = do_shuffle;
             stop_flag = false;
 
             file_worker_thread = std::thread([&] {
@@ -312,11 +313,6 @@ namespace Learner{
             }
         }
 
-        void set_do_shuffle(bool v)
-        {
-            shuffle = v;
-        }
-
     protected:
 
         // worker thread reading file in background

From d31169bab5f9545b147701ad55bc68984180ed71 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 13:44:19 +0200
Subject: [PATCH 370/583] Update CI to use epochs instead of loops.

---
 tests/instrumented_learn.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/instrumented_learn.sh b/tests/instrumented_learn.sh
index 267a3bb6..ff1a8a72 100755
--- a/tests/instrumented_learn.sh
+++ b/tests/instrumented_learn.sh
@@ -127,7 +127,7 @@ cat << EOF > learn01.exp
  send "setoption name Use NNUE value true\n"
  send "setoption name Threads value $threads\n"
  send "isready\n"
- send "learn targetdir training_data loop 2 batchsize 100 use_draw_in_training 1 use_draw_in_validation 1 lr 1 eval_limit 32000 nn_batch_size 30 newbob_decay 0.5 eval_save_interval 30 loss_output_interval 10 validation_set_file_name validation_data/validation_data.bin\n"
+ send "learn targetdir training_data epochs 1 batchsize 100 use_draw_in_training 1 use_draw_in_validation 1 lr 1 eval_limit 32000 nn_batch_size 30 newbob_decay 0.5 eval_save_interval 30 loss_output_interval 10 validation_set_file_name validation_data/validation_data.bin\n"
 
  expect "save_eval() finished."
 

From 371acaa0b56391919075dcabfac5e33ca830495d Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 14:51:28 +0200
Subject: [PATCH 371/583] Allow changing sfen reader buffer sizes for the learn
 command.

---
 src/learn/learn.cpp     | 27 ++++++++++++++++++++++++---
 src/learn/learn.h       |  6 ------
 src/learn/sfen_reader.h | 35 +++++++++++++++++++++--------------
 3 files changed, 45 insertions(+), 23 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index fa447a77..7de359ef 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -385,10 +385,19 @@ namespace Learner
             const std::vector<std::string>& filenames,
             bool shuffle,
             uint64_t thread_num,
-            const std::string& seed
+            const std::string& seed,
+            size_t read_size,
+            size_t buffer_size
         ) :
             prng(seed),
-            sr(filenames, shuffle, SfenReaderMode::Cyclic, thread_num, std::to_string(prng.next_random_seed())),
+            sr(
+                filenames,
+                shuffle,
+                SfenReaderMode::Cyclic,
+                thread_num,
+                std::to_string(prng.next_random_seed()),
+                read_size,
+                buffer_size),
             learn_loss_sum{}
         {
             save_only_once = false;
@@ -958,6 +967,9 @@ namespace Learner
         uint64_t eval_save_interval = LEARN_EVAL_SAVE_INTERVAL;
         uint64_t loss_output_interval = 1'000'000;
 
+        size_t sfen_read_size = SfenReader::DEFAULT_SFEN_READ_SIZE;
+        size_t thread_buffer_size = SfenReader::DEFAULT_THREAD_BUFFER_SIZE;
+
         string validation_set_file_name;
         string seed;
 
@@ -1045,6 +1057,9 @@ namespace Learner
             else if (option == "dest_score_min_value") is >> dest_score_min_value;
             else if (option == "dest_score_max_value") is >> dest_score_max_value;
 
+            else if (option == "sfen_read_size") is >> sfen_read_size;
+            else if (option == "thread_buffer_size") is >> thread_buffer_size;
+
             else if (option == "seed") is >> seed;
             else if (option == "set_recommended_uci_options")
             {
@@ -1146,7 +1161,13 @@ namespace Learner
         Eval::NNUE::set_batch_size(nn_batch_size);
         Eval::NNUE::set_options(nn_options);
 
-        LearnerThink learn_think(filenames, !no_shuffle, thread_num, seed);
+        LearnerThink learn_think(
+            filenames,
+            !no_shuffle,
+            thread_num,
+            seed,
+            sfen_read_size,
+            thread_buffer_size);
 
         if (newbob_decay != 1.0 && !Options["SkipLoadingEval"]) {
             // Save the current net to [EvalSaveDir]\original.
diff --git a/src/learn/learn.h b/src/learn/learn.h
index 3ba75ce3..5efeb516 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -54,12 +54,6 @@ namespace Learner
 
     constexpr std::size_t LEARN_MINI_BATCH_SIZE = 1000 * 1000 * 1;
 
-    // The number of phases to read from the file at one time. After reading this much, shuffle.
-    // It is better to have a certain size, but this number x 40 bytes x 3 times as much memory is consumed. 400MB*3 is consumed in the 10M phase.
-    // Must be a multiple of THREAD_BUFFER_SIZE(=10000).
-
-    constexpr std::size_t LEARN_SFEN_READ_SIZE = 1000 * 1000 * 10;
-
     // Saving interval of evaluation function at learning. Save each time you learn this number of phases.
     // Needless to say, the longer the saving interval, the shorter the learning time.
     // Folder name is incremented for each save like 0/, 1/, 2/...
diff --git a/src/learn/sfen_reader.h b/src/learn/sfen_reader.h
index d39fef4e..71767bc6 100644
--- a/src/learn/sfen_reader.h
+++ b/src/learn/sfen_reader.h
@@ -28,13 +28,13 @@ namespace Learner{
     struct SfenReader
     {
         // Number of phases buffered by each thread 0.1M phases. 4M phase at 40HT
-        static constexpr size_t THREAD_BUFFER_SIZE = 10 * 1000;
+        static constexpr size_t DEFAULT_THREAD_BUFFER_SIZE = 10 * 1000;
 
         // Buffer for reading files (If this is made larger,
         // the shuffle becomes larger and the phases may vary.
         // If it is too large, the memory consumption will increase.
         // SFEN_READ_SIZE is a multiple of THREAD_BUFFER_SIZE.
-        static constexpr const size_t SFEN_READ_SIZE = LEARN_SFEN_READ_SIZE;
+        static constexpr const size_t DEFAULT_SFEN_READ_SIZE = 1000 * 1000 * 10;
 
         // Do not use std::random_device().
         // Because it always the same integers on MinGW.
@@ -43,10 +43,14 @@ namespace Learner{
             bool do_shuffle,
             SfenReaderMode mode_,
             int thread_num,
-            const std::string& seed
+            const std::string& seed,
+            size_t read_size = DEFAULT_SFEN_READ_SIZE,
+            size_t buffer_size = DEFAULT_THREAD_BUFFER_SIZE
         ) :
             filenames(filenames_.begin(), filenames_.end()),
             mode(mode_),
+            sfen_read_size(read_size),
+            thread_buffer_size(buffer_size),
             prng(seed)
         {
             packed_sfens.resize(thread_num);
@@ -165,7 +169,7 @@ namespace Learner{
                         packed_sfens[thread_id] = std::move(packed_sfens_pool.front());
                         packed_sfens_pool.pop_front();
 
-                        total_read += THREAD_BUFFER_SIZE;
+                        total_read += thread_buffer_size;
 
                         return true;
                     }
@@ -237,17 +241,17 @@ namespace Learner{
             {
                 // Wait for the buffer to run out.
                 // This size() is read only, so you don't need to lock it.
-                while (!stop_flag && packed_sfens_pool.size() >= SFEN_READ_SIZE / THREAD_BUFFER_SIZE)
+                while (!stop_flag && packed_sfens_pool.size() >= sfen_read_size / thread_buffer_size)
                     sleep(100);
 
                 if (stop_flag)
                     return;
 
                 PSVector sfens;
-                sfens.reserve(SFEN_READ_SIZE);
+                sfens.reserve(sfen_read_size);
 
                 // Read from the file into the file buffer.
-                while (sfens.size() < SFEN_READ_SIZE)
+                while (sfens.size() < sfen_read_size)
                 {
                     std::optional<PackedSfenValue> p = sfen_input_stream->next();
                     if (p.has_value())
@@ -280,11 +284,11 @@ namespace Learner{
                     Algo::shuffle(sfens, prng);
                 }
 
-                // Divide this by THREAD_BUFFER_SIZE. There should be size pieces.
-                // SFEN_READ_SIZE shall be a multiple of THREAD_BUFFER_SIZE.
-                assert((SFEN_READ_SIZE % THREAD_BUFFER_SIZE) == 0);
+                // Divide this by thread_buffer_size. There should be size pieces.
+                // sfen_read_size shall be a multiple of thread_buffer_size.
+                assert((sfen_read_size % thread_buffer_size) == 0);
 
-                auto size = size_t(SFEN_READ_SIZE / THREAD_BUFFER_SIZE);
+                auto size = size_t(sfen_read_size / thread_buffer_size);
                 std::vector<std::unique_ptr<PSVector>> buffers;
                 buffers.reserve(size);
 
@@ -292,11 +296,11 @@ namespace Learner{
                 {
                     // Delete this pointer on the receiving side.
                     auto buf = std::make_unique<PSVector>();
-                    buf->resize(THREAD_BUFFER_SIZE);
+                    buf->resize(thread_buffer_size);
                     memcpy(
                         buf->data(),
-                        &sfens[i * THREAD_BUFFER_SIZE],
-                        sizeof(PackedSfenValue) * THREAD_BUFFER_SIZE);
+                        &sfens[i * thread_buffer_size],
+                        sizeof(PackedSfenValue) * thread_buffer_size);
 
                     buffers.emplace_back(std::move(buf));
                 }
@@ -331,6 +335,9 @@ namespace Learner{
 
         SfenReaderMode mode;
 
+        size_t sfen_read_size;
+        size_t thread_buffer_size;
+
         // Random number to shuffle when reading the phase
         PRNG prng;
 

From 47a82bfc912516fbbad325a24134647af0a4e81d Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 14:52:50 +0200
Subject: [PATCH 372/583] Document new options.

---
 docs/learn.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/learn.md b/docs/learn.md
index dc55ec1f..f815284c 100644
--- a/docs/learn.md
+++ b/docs/learn.md
@@ -74,6 +74,10 @@ Currently the following options are available:
 
 `validation_set_file_name` - path to the file with training data to be used for validation (loss computation and move accuracy)
 
+`sfen_read_size` - the number of sfens to always keep in the buffer. Default: 10000000 (10M)
+
+`thread_buffer_size` - the number of sfens to copy at once to each thread requesting more sfens for learning. Default: 10000
+
 `seed` - seed for the PRNG. Can be either a number or a string. If it's a string then its hash will be used. If not specified then the current time will be used.
 
 ## Legacy subcommands and parameters

From 3bf397a569f958e1fbbb4d96c4f8f89af53b9a41 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 14:54:00 +0200
Subject: [PATCH 373/583] Update instrumented_learn for the current codebase.

---
 tests/instrumented_learn.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/instrumented_learn.sh b/tests/instrumented_learn.sh
index ff1a8a72..4ce3dc1c 100755
--- a/tests/instrumented_learn.sh
+++ b/tests/instrumented_learn.sh
@@ -80,7 +80,7 @@ cat << EOF > gensfen01.exp
  send "isready\n"
  send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data/training_data.bin sfen_format bin\n"
  expect "gensfen finished."
- send "learn training_data/training_data.bin convert_plain output_file_name training_data.txt\n"
+ send "convert_plain targetfile training_data/training_data.bin output_file_name training_data.txt\n"
  expect "all done"
  send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data/training_data.binpack sfen_format binpack\n"
  expect "gensfen finished."
@@ -127,7 +127,7 @@ cat << EOF > learn01.exp
  send "setoption name Use NNUE value true\n"
  send "setoption name Threads value $threads\n"
  send "isready\n"
- send "learn targetdir training_data epochs 1 batchsize 100 use_draw_in_training 1 use_draw_in_validation 1 lr 1 eval_limit 32000 nn_batch_size 30 newbob_decay 0.5 eval_save_interval 30 loss_output_interval 10 validation_set_file_name validation_data/validation_data.bin\n"
+ send "learn targetdir training_data epochs 1 sfen_read_size 100 thread_buffer_size 10 batchsize 100 use_draw_in_training 1 use_draw_in_validation 1 lr 1 eval_limit 32000 nn_batch_size 30 newbob_decay 0.5 eval_save_interval 30 loss_output_interval 10 validation_set_file_name validation_data/validation_data.bin\n"
 
  expect "save_eval() finished."
 

From be3937c37bfc6e69a26b307d2179c16409ccadc8 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 10:52:49 +0200
Subject: [PATCH 374/583] Print layers and their indices during training
 initialization.

---
 src/nnue/evaluate_nnue.cpp          |  6 ++
 src/nnue/evaluate_nnue.h            |  2 +
 src/nnue/evaluate_nnue_learner.cpp  |  7 +++
 src/nnue/layers/affine_transform.h  | 21 ++++++-
 src/nnue/layers/clipped_relu.h      | 19 +++++-
 src/nnue/layers/input_slice.h       | 93 ++++++++++++++++-------------
 src/nnue/layers/sum.h               | 38 ++++++++++--
 src/nnue/nnue_feature_transformer.h | 17 +++++-
 8 files changed, 152 insertions(+), 51 deletions(-)

diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index 67398f81..9da8b1e6 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -71,6 +71,12 @@ namespace Eval::NNUE {
             ",Network=" + Network::get_structure_string();
     }
 
+    std::string get_layers_info() {
+        return
+            FeatureTransformer::get_layers_info()
+            + '\n' + Network::get_layers_info();
+    }
+
     UseNNUEMode useNNUE;
     std::string eval_file_loaded = "None";
 
diff --git a/src/nnue/evaluate_nnue.h b/src/nnue/evaluate_nnue.h
index d0f61644..100e693c 100644
--- a/src/nnue/evaluate_nnue.h
+++ b/src/nnue/evaluate_nnue.h
@@ -81,6 +81,8 @@ namespace Eval::NNUE {
     // Get a string that represents the structure of the evaluation function
     std::string get_architecture_string();
 
+    std::string get_layers_info();
+
     // read the header
     bool read_header(std::istream& stream,
         std::uint32_t* hash_value, std::string* architecture);
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index e0236781..54525fe4 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -58,6 +58,13 @@ namespace Eval::NNUE {
         std::cout << "Initializing NN training for "
                   << get_architecture_string() << std::endl;
 
+        std::cout << std::endl;
+
+        std::cout << "Layers:\n"
+                  << get_layers_info() << std::endl;
+
+        std::cout << std::endl;
+
         assert(feature_transformer);
         assert(network);
         trainer = Trainer<Network>::create(network.get(), feature_transformer.get());
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index 6efaecbc..e734580e 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -57,6 +57,8 @@ namespace Eval::NNUE::Layers {
         static constexpr std::size_t kBufferSize =
             PreviousLayer::kBufferSize + kSelfBufferSize;
 
+        static constexpr int kLayerIndex = PreviousLayer::kLayerIndex + 1;
+
         // Hash value embedded in the evaluation file
         static constexpr std::uint32_t get_hash_value() {
             std::uint32_t hash_value = 0xCC03DAE4u;
@@ -66,14 +68,27 @@ namespace Eval::NNUE::Layers {
             return hash_value;
         }
 
-        // A string that represents the structure from the input layer to this layer
-        static std::string get_structure_string() {
+        static std::string get_name() {
             return "AffineTransform[" +
                 std::to_string(kOutputDimensions) + "<-" +
-                std::to_string(kInputDimensions) + "](" +
+                std::to_string(kInputDimensions) + "]";
+        }
+
+        // A string that represents the structure from the input layer to this layer
+        static std::string get_structure_string() {
+            return get_name() + "(" +
                 PreviousLayer::get_structure_string() + ")";
         }
 
+        static std::string get_layers_info() {
+            std::string info = PreviousLayer::get_layers_info();
+            info += '\n';
+            info += std::to_string(kLayerIndex);
+            info += ": ";
+            info += get_name();
+            return info;
+        }
+
        // Read network parameters
         bool read_parameters(std::istream& stream) {
             if (!previous_layer_.read_parameters(stream))
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h
index 889effa7..5fbd66cc 100644
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -54,6 +54,8 @@ namespace Eval::NNUE::Layers {
         static constexpr std::size_t kBufferSize =
             PreviousLayer::kBufferSize + kSelfBufferSize;
 
+        static constexpr int kLayerIndex = PreviousLayer::kLayerIndex + 1;
+
         // Hash value embedded in the evaluation file
         static constexpr std::uint32_t get_hash_value() {
             std::uint32_t hash_value = 0x538D24C7u;
@@ -61,13 +63,26 @@ namespace Eval::NNUE::Layers {
             return hash_value;
         }
 
+        static std::string get_name() {
+            return "ClippedReLU[" +
+                std::to_string(kOutputDimensions) + "]";
+        }
+
         // A string that represents the structure from the input layer to this layer
         static std::string get_structure_string() {
-            return "ClippedReLU[" +
-                std::to_string(kOutputDimensions) + "](" +
+            return get_name() + "(" +
                 PreviousLayer::get_structure_string() + ")";
         }
 
+        static std::string get_layers_info() {
+            std::string info = PreviousLayer::get_layers_info();
+            info += '\n';
+            info += std::to_string(kLayerIndex);
+            info += ": ";
+            info += get_name();
+            return info;
+        }
+
         // Read network parameters
         bool read_parameters(std::istream& stream) {
             return previous_layer_.read_parameters(stream);
diff --git a/src/nnue/layers/input_slice.h b/src/nnue/layers/input_slice.h
index b69028ab..56c738af 100644
--- a/src/nnue/layers/input_slice.h
+++ b/src/nnue/layers/input_slice.h
@@ -28,56 +28,69 @@
 
 namespace Eval::NNUE::Layers {
 
-  // Input layer
-  template <IndexType OutputDimensions, IndexType Offset = 0>
-  class InputSlice {
-  public:
-      // Need to maintain alignment
-      static_assert(Offset % kMaxSimdWidth == 0, "");
+    // Input layer
+    template <IndexType OutputDimensions, IndexType Offset = 0>
+    class InputSlice {
+    public:
+        // Need to maintain alignment
+        static_assert(Offset % kMaxSimdWidth == 0, "");
 
-      // Output type
-      using OutputType = TransformedFeatureType;
+        // Output type
+        using OutputType = TransformedFeatureType;
 
-      // Output dimensionality
-      static constexpr IndexType kOutputDimensions = OutputDimensions;
+        // Output dimensionality
+        static constexpr IndexType kOutputDimensions = OutputDimensions;
 
-      // Size of forward propagation buffer used from the input layer to this layer
-      static constexpr std::size_t kBufferSize = 0;
+        // Size of forward propagation buffer used from the input layer to this layer
+        static constexpr std::size_t kBufferSize = 0;
 
-      // Hash value embedded in the evaluation file
-      static constexpr std::uint32_t get_hash_value() {
-          std::uint32_t hash_value = 0xEC42E90Du;
-          hash_value ^= kOutputDimensions ^ (Offset << 10);
-          return hash_value;
-      }
+        static constexpr int kLayerIndex = 1;
 
-      // A string that represents the structure from the input layer to this layer
-      static std::string get_structure_string() {
-          return "InputSlice[" + std::to_string(kOutputDimensions) + "(" +
-              std::to_string(Offset) + ":" +
-              std::to_string(Offset + kOutputDimensions) + ")]";
-      }
+        // Hash value embedded in the evaluation file
+        static constexpr std::uint32_t get_hash_value() {
+            std::uint32_t hash_value = 0xEC42E90Du;
+            hash_value ^= kOutputDimensions ^ (Offset << 10);
+            return hash_value;
+        }
 
-      // Read network parameters
-      bool read_parameters(std::istream& /*stream*/) {
-          return true;
-      }
+        static std::string get_name() {
+            return "InputSlice[" + std::to_string(kOutputDimensions) + "(" +
+                std::to_string(Offset) + ":" +
+                std::to_string(Offset + kOutputDimensions) + ")]";
+        }
 
-      // write parameters
-      bool write_parameters(std::ostream& /*stream*/) const {
-          return true;
-      }
+        // A string that represents the structure from the input layer to this layer
+        static std::string get_structure_string() {
+            return get_name();
+        }
 
-      // Forward propagation
-      const OutputType* propagate(
-          const TransformedFeatureType* transformed_features,
-          char* /*buffer*/) const {
+        static std::string get_layers_info() {
+            std::string info = std::to_string(kLayerIndex);
+            info += ": ";
+            info += get_name();
+            return info;
+        }
 
-          return transformed_features + Offset;
-      }
+        // Read network parameters
+        bool read_parameters(std::istream& /*stream*/) {
+            return true;
+        }
 
-  private:
-  };
+        // write parameters
+        bool write_parameters(std::ostream& /*stream*/) const {
+            return true;
+        }
+
+        // Forward propagation
+        const OutputType* propagate(
+            const TransformedFeatureType* transformed_features,
+            char* /*buffer*/) const {
+
+            return transformed_features + Offset;
+        }
+
+    private:
+    };
 
 }  // namespace Layers
 
diff --git a/src/nnue/layers/sum.h b/src/nnue/layers/sum.h
index 64ef30f9..0f71bd61 100644
--- a/src/nnue/layers/sum.h
+++ b/src/nnue/layers/sum.h
@@ -36,6 +36,8 @@ namespace Eval::NNUE::Layers {
         static constexpr std::size_t kBufferSize =
             std::max(Head::kBufferSize + kSelfBufferSize, Tail::kBufferSize);
 
+        static constexpr int kLayerIndex = Tail::kLayerIndex + 1;
+
         // Hash value embedded in the evaluation function file
         static constexpr std::uint32_t get_hash_value() {
             std::uint32_t hash_value = 0xBCE400B4u;
@@ -46,10 +48,23 @@ namespace Eval::NNUE::Layers {
             return hash_value;
         }
 
+        static std::string get_name() {
+             return "Sum[" +
+                std::to_string(kOutputDimensions) + "]";
+        }
+
         // A string that represents the structure from the input layer to this layer
         static std::string get_structure_string() {
-            return "Sum[" +
-                std::to_string(kOutputDimensions) + "](" + get_summands_string() + ")";
+            return get_name() + "(" + get_summands_string() + ")";
+        }
+
+        static std::string get_layers_info() {
+            std::string info = Tail::get_layers_info();
+            info += '\n';
+            info += std::to_string(kLayerIndex);
+            info += ": ";
+            info += get_name();
+            return info;
         }
 
         // read parameters
@@ -117,6 +132,8 @@ namespace Eval::NNUE::Layers {
         // Size of the forward propagation buffer used from the input layer to this layer
         static constexpr std::size_t kBufferSize = PreviousLayer::kBufferSize;
 
+        static constexpr int kLayerIndex = PreviousLayer::kLayerIndex + 1;
+
         // Hash value embedded in the evaluation function file
         static constexpr std::uint32_t get_hash_value() {
             std::uint32_t hash_value = 0xBCE400B4u;
@@ -125,10 +142,23 @@ namespace Eval::NNUE::Layers {
             return hash_value;
         }
 
+        static std::string get_name() {
+             return "Sum[" +
+                std::to_string(kOutputDimensions) + "]";
+        }
+
         // A string that represents the structure from the input layer to this layer
         static std::string get_structure_string() {
-            return "Sum[" +
-                std::to_string(kOutputDimensions) + "](" + get_summands_string() + ")";
+            return get_name() + "(" + get_summands_string() + ")";
+        }
+
+        static std::string get_layers_info() {
+            std::string info = PreviousLayer::get_layers_info();
+            info += '\n';
+            info += std::to_string(kLayerIndex);
+            info += ": ";
+            info += get_name();
+            return info;
         }
 
         // read parameters
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 87b8ee58..3e18e68a 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -110,19 +110,32 @@ namespace Eval::NNUE {
         static constexpr std::size_t kBufferSize =
             kOutputDimensions * sizeof(OutputType);
 
+        static constexpr int kLayerIndex = 0;
+
         // Hash value embedded in the evaluation file
         static constexpr std::uint32_t get_hash_value() {
 
             return RawFeatures::kHashValue ^ kOutputDimensions;
         }
 
-        // a string representing the structure
-        static std::string get_structure_string() {
+        static std::string get_name() {
             return RawFeatures::get_name() + "[" +
                 std::to_string(kInputDimensions) + "->" +
                 std::to_string(kHalfDimensions) + "x2]";
         }
 
+        // a string representing the structure
+        static std::string get_structure_string() {
+            return get_name();
+        }
+
+        static std::string get_layers_info() {
+            std::string info = std::to_string(kLayerIndex);
+            info += ": ";
+            info += get_name();
+            return info;
+        }
+
         // Read network parameters
         bool read_parameters(std::istream& stream) {
 

From ec436d3dfd212b90e568864318f9cd42b55faece Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 11:15:59 +0200
Subject: [PATCH 375/583] Print some weight update stats

---
 src/learn/learn.cpp                |  9 +++----
 src/nnue/evaluate_nnue_learner.cpp | 42 ++++++++++++++++++++++++++++--
 src/nnue/evaluate_nnue_learner.h   |  1 +
 src/nnue/trainer/trainer.h         |  1 +
 4 files changed, 45 insertions(+), 8 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 7de359ef..e3d2fecf 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -599,19 +599,16 @@ namespace Learner
                 // Evaluation value of deep search
                 const auto deep_value = (Value)ps.score;
 
-                const Value shallow_value =
-                    (rootColor == pos.side_to_move())
-                    ? Eval::evaluate(pos)
-                    : -Eval::evaluate(pos);
+                const Value shallow_value = Eval::evaluate(pos);
 
                 const auto loss = calc_cross_entropy(
                     deep_value,
-                    shallow_value,
+                    (rootColor == pos.side_to_move()) ? shallow_value : -shallow_value,
                     ps);
 
                 local_loss_sum += loss;
 
-                Eval::NNUE::add_example(pos, rootColor, ps, 1.0);
+                Eval::NNUE::add_example(pos, rootColor, shallow_value, ps, 1.0);
             };
 
             if (!pos.pseudo_legal((Move)ps.move) || !pos.legal((Move)ps.move))
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 54525fe4..581e7928 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -118,8 +118,12 @@ namespace Eval::NNUE {
     }
 
     // Add 1 sample of learning data
-    void add_example(Position& pos, Color rootColor,
-                    const Learner::PackedSfenValue& psv, double weight) {
+    void add_example(
+        Position& pos,
+        Color rootColor,
+        Value discrete_nn_eval,
+        const Learner::PackedSfenValue& psv,
+        double weight) {
 
         Example example;
         if (rootColor == pos.side_to_move()) {
@@ -128,6 +132,7 @@ namespace Eval::NNUE {
             example.sign = -1;
         }
 
+        example.discrete_nn_eval = discrete_nn_eval;
         example.psv = psv;
         example.weight = weight;
 
@@ -176,6 +181,13 @@ namespace Eval::NNUE {
 
         std::lock_guard<std::mutex> lock(examples_mutex);
         std::shuffle(examples.begin(), examples.end(), rng);
+
+        double abs_eval_diff_sum = 0.0;
+        double abs_discrete_eval_sum = 0.0;
+        double gradient_norm = 0.0;
+
+        bool is_first_batch = true;
+
         while (examples.size() >= batch_size) {
             std::vector<Example> batch(examples.end() - batch_size, examples.end());
             examples.resize(examples.size() - batch_size);
@@ -186,13 +198,39 @@ namespace Eval::NNUE {
             for (std::size_t b = 0; b < batch.size(); ++b) {
                 const auto shallow = static_cast<Value>(round<std::int32_t>(
                     batch[b].sign * network_output[b] * kPonanzaConstant));
+                const auto discrete = batch[b].sign * batch[b].discrete_nn_eval;
                 const auto& psv = batch[b].psv;
                 const double gradient = batch[b].sign * Learner::calc_grad(shallow, psv);
                 gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
+
+
+                // The discrete eval will only be valid before first backpropagation,
+                // that is only for the first batch.
+                // Similarily we want only gradients from one batch.
+                if (is_first_batch)
+                {
+                    abs_eval_diff_sum += std::abs(discrete - shallow);
+                    abs_discrete_eval_sum += std::abs(discrete);
+                    gradient_norm += std::abs(gradient);
+                }
             }
 
             trainer->backpropagate(gradients.data(), learning_rate);
+
+            is_first_batch = false;
         }
+
+        const double avg_abs_eval_diff = abs_eval_diff_sum / batch_size;
+        const double avg_abs_discrete_eval = abs_discrete_eval_sum / batch_size;
+
+        std::cout << "INFO (update_weights):"
+            << " avg_abs(trainer_eval-nnue_eval) = " << avg_abs_eval_diff
+            << " , avg_abs(nnue_eval) = " << avg_abs_discrete_eval
+            << " , avg_relative_error = " << avg_abs_eval_diff / avg_abs_discrete_eval
+            << " , batch_size = " << batch_size
+            << " , grad_norm = " << gradient_norm
+            << std::endl;
+
         send_messages({{"quantize_parameters"}});
     }
 
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index 431fb02e..48ab31b9 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -22,6 +22,7 @@ namespace Eval::NNUE {
     void add_example(
         Position& pos,
         Color rootColor,
+        Value discrete_nn_eval,
     	const Learner::PackedSfenValue& psv,
         double weight);
 
diff --git a/src/nnue/trainer/trainer.h b/src/nnue/trainer/trainer.h
index 763bd5c8..973bc898 100644
--- a/src/nnue/trainer/trainer.h
+++ b/src/nnue/trainer/trainer.h
@@ -68,6 +68,7 @@ namespace Eval::NNUE {
     struct Example {
         std::vector<TrainingFeature> training_features[2];
         Learner::PackedSfenValue psv;
+        Value discrete_nn_eval;
         int sign;
         double weight;
     };

From a351c1d65e23b04bb56bac4cd74ac5dd2041b658 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 11:39:16 +0200
Subject: [PATCH 376/583] Add verbose flag to learn. Only print update
 parameters info when vebose=true

---
 src/learn/learn.cpp                |  9 ++++++++-
 src/nnue/evaluate_nnue_learner.cpp | 29 ++++++++++++++++-------------
 src/nnue/evaluate_nnue_learner.h   |  2 +-
 3 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index e3d2fecf..a56ac15f 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -432,6 +432,8 @@ namespace Learner
         // If true, do not dig the folder.
         bool save_only_once;
 
+        bool verbose;
+
         double newbob_decay;
         int newbob_num_trials;
         uint64_t auto_lr_drop;
@@ -644,7 +646,7 @@ namespace Learner
         // should be no real issues happening since
         // the read/write phases are isolated.
         atomic_thread_fence(memory_order_seq_cst);
-        Eval::NNUE::update_parameters();
+        Eval::NNUE::update_parameters(epoch, verbose);
         atomic_thread_fence(memory_order_seq_cst);
 
         if (++save_count * mini_batch_size >= eval_save_interval)
@@ -943,6 +945,8 @@ namespace Learner
         // Turn on if you want to pass a pre-shuffled file.
         bool no_shuffle = false;
 
+        bool verbose = false;
+
         global_learning_rate = 1.0;
 
         // elmo lambda
@@ -1070,6 +1074,7 @@ namespace Learner
                 UCI::setoption("PruneAtShallowDepth", "false");
                 UCI::setoption("EnableTranspositionTable", "false");
             }
+            else if (option == "verbose") verbose = true;
             else
             {
                 cout << "Unknown option: " << option << ". Ignoring.\n";
@@ -1191,6 +1196,8 @@ namespace Learner
         learn_think.mini_batch_size = mini_batch_size;
         learn_think.validation_set_file_name = validation_set_file_name;
 
+        learn_think.verbose = verbose;
+
         cout << "init done." << endl;
 
         // Start learning.
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 581e7928..e0d2351d 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -173,7 +173,7 @@ namespace Eval::NNUE {
     }
 
     // update the evaluation function parameters
-    void update_parameters() {
+    void update_parameters(uint64_t epoch, bool verbose) {
         assert(batch_size > 0);
 
         const auto learning_rate = static_cast<LearnFloatType>(
@@ -186,7 +186,7 @@ namespace Eval::NNUE {
         double abs_discrete_eval_sum = 0.0;
         double gradient_norm = 0.0;
 
-        bool is_first_batch = true;
+        bool collect_stats = verbose;
 
         while (examples.size() >= batch_size) {
             std::vector<Example> batch(examples.end() - batch_size, examples.end());
@@ -207,7 +207,7 @@ namespace Eval::NNUE {
                 // The discrete eval will only be valid before first backpropagation,
                 // that is only for the first batch.
                 // Similarily we want only gradients from one batch.
-                if (is_first_batch)
+                if (collect_stats)
                 {
                     abs_eval_diff_sum += std::abs(discrete - shallow);
                     abs_discrete_eval_sum += std::abs(discrete);
@@ -217,19 +217,22 @@ namespace Eval::NNUE {
 
             trainer->backpropagate(gradients.data(), learning_rate);
 
-            is_first_batch = false;
+            collect_stats = false;
         }
 
-        const double avg_abs_eval_diff = abs_eval_diff_sum / batch_size;
-        const double avg_abs_discrete_eval = abs_discrete_eval_sum / batch_size;
+        if (verbose) {
+            const double avg_abs_eval_diff = abs_eval_diff_sum / batch_size;
+            const double avg_abs_discrete_eval = abs_discrete_eval_sum / batch_size;
 
-        std::cout << "INFO (update_weights):"
-            << " avg_abs(trainer_eval-nnue_eval) = " << avg_abs_eval_diff
-            << " , avg_abs(nnue_eval) = " << avg_abs_discrete_eval
-            << " , avg_relative_error = " << avg_abs_eval_diff / avg_abs_discrete_eval
-            << " , batch_size = " << batch_size
-            << " , grad_norm = " << gradient_norm
-            << std::endl;
+            std::cout << "INFO (update_parameters):"
+                << " epoch = " << epoch
+                << " , avg_abs(trainer_eval-nnue_eval) = " << avg_abs_eval_diff
+                << " , avg_abs(nnue_eval) = " << avg_abs_discrete_eval
+                << " , avg_relative_error = " << avg_abs_eval_diff / avg_abs_discrete_eval
+                << " , batch_size = " << batch_size
+                << " , grad_norm = " << gradient_norm
+                << std::endl;
+        }
 
         send_messages({{"quantize_parameters"}});
     }
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index 48ab31b9..03a23c83 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -27,7 +27,7 @@ namespace Eval::NNUE {
         double weight);
 
     // update the evaluation function parameters
-    void update_parameters();
+    void update_parameters(uint64_t epoch, bool verbose);
 
     // Check if there are any problems with learning
     void check_health();

From d70408f20431a0f18283aaff7ddb09d3fc42cb51 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 11:40:26 +0200
Subject: [PATCH 377/583] Add docs entry for the verbose flag.

---
 docs/learn.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/learn.md b/docs/learn.md
index f815284c..7051a173 100644
--- a/docs/learn.md
+++ b/docs/learn.md
@@ -80,6 +80,8 @@ Currently the following options are available:
 
 `seed` - seed for the PRNG. Can be either a number or a string. If it's a string then its hash will be used. If not specified then the current time will be used.
 
+`verbose` - this is a modifier, not a parameter. When used there will be more detailed output during training.
+
 ## Legacy subcommands and parameters
 
 ### Convert

From 8ddef320e6117e6d9174b2445dec28212c4cb92f Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 11:42:30 +0200
Subject: [PATCH 378/583] Print an additional new line before calc_loss
 progress instead of after check_health in the feature transformer layer.

---
 src/learn/learn.cpp                            | 3 ++-
 src/nnue/trainer/trainer_feature_transformer.h | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index a56ac15f..6257d920 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -677,7 +677,8 @@ namespace Learner
         TT.new_search();
         TimePoint elapsed = now() - Search::Limits.startTime + 1;
 
-        cout << "PROGRESS: " << now_string() << ", ";
+        cout << "\n";
+        cout << "PROGRESS (calc_loss): " << now_string() << ", ";
         cout << total_done << " sfens, ";
         cout << total_done * 1000 / elapsed  << " sfens/second";
         cout << ", iteration " << epoch;
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index a3d6c16a..2311fc0c 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -349,7 +349,7 @@ namespace Eval::NNUE {
 
             std::cout << "INFO: largest min activation = " << largest_min_activation
                       << ", smallest max activation = " << smallest_max_activation
-                      << std::endl << std::endl;
+                      << std::endl;
 
             std::fill(std::begin(min_activations_), std::end(min_activations_),
                       std::numeric_limits<LearnFloatType>::max());

From c49ae541c42b4111767d13424bc29d65532aa1a9 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 11:58:53 +0200
Subject: [PATCH 379/583] Add layer info for check_health. Print subsequent
 infos from the same scope with "-->" instead of "INFO:" for clarity.

---
 src/learn/learn.cpp                           |  4 ++--
 src/nnue/trainer/trainer_clipped_relu.h       |  9 +++++++--
 .../trainer/trainer_feature_transformer.h     | 19 +++++++++++++------
 3 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 6257d920..c9313575 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -141,7 +141,7 @@ namespace Learner
             void print(const std::string& prefix, ostream& s) const
             {
                 s
-                    << "INFO: "
+                    << "--> "
                     << prefix << "_cross_entropy_eval = " << cross_entropy_eval / count
                     << " , " << prefix << "_cross_entropy_win = " << cross_entropy_win / count
                     << " , " << prefix << "_entropy_eval = " << entropy_eval / count
@@ -722,7 +722,7 @@ namespace Learner
 
         if (psv.size() && test_loss_sum.count > 0.0)
         {
-            cout << "INFO: norm = " << sum_norm
+            cout << "--> norm = " << sum_norm
                 << " , move accuracy = " << (move_accord_count * 100.0 / psv.size()) << "%"
                 << endl;
 
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index 35503493..d1dd738b 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -99,8 +99,13 @@ namespace Eval::NNUE {
             const auto smallest_max_activation = *std::min_element(
                 std::begin(max_activations_), std::end(max_activations_));
 
-            std::cout << "INFO: largest min activation = " << largest_min_activation
-                      << ", smallest max activation = " << smallest_max_activation
+            std::cout << "INFO (check_health):"
+                      << " layer = " << LayerType::kLayerIndex
+                      << " , name = " << LayerType::get_name()
+                      << std::endl;
+
+            std::cout << "--> largest min activation = " << largest_min_activation
+                      << " , smallest max activation = " << smallest_max_activation
                       << std::endl;
 
             std::fill(std::begin(min_activations_), std::end(min_activations_),
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 2311fc0c..dbfe18a2 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -330,25 +330,32 @@ namespace Eval::NNUE {
 
         // Check if there are any problems with learning
         void check_health() {
-            std::cout << "INFO: observed " << observed_features.count()
-                      << " (out of " << kInputDimensions << ") features" << std::endl;
+            std::cout << "INFO (check_health):"
+                      << " layer = " << LayerType::kLayerIndex
+                      << " , name = " << LayerType::get_name()
+                      << std::endl;
+
+            std::cout << "--> observed " << observed_features.count()
+                      << " (out of " << kInputDimensions << ") features"
+                      << std::endl;
 
             constexpr LearnFloatType kPreActivationLimit =
                 std::numeric_limits<typename LayerType::WeightType>::max() /
                 kWeightScale;
 
-            std::cout << "INFO: (min, max) of pre-activations = "
+            std::cout << "--> (min, max) of pre-activations = "
                       << min_pre_activation_ << ", "
                       << max_pre_activation_ << " (limit = "
-                      << kPreActivationLimit << ")" << std::endl;
+                      << kPreActivationLimit << ")"
+                      << std::endl;
 
             const auto largest_min_activation = *std::max_element(
                 std::begin(min_activations_), std::end(min_activations_));
             const auto smallest_max_activation = *std::min_element(
                 std::begin(max_activations_), std::end(max_activations_));
 
-            std::cout << "INFO: largest min activation = " << largest_min_activation
-                      << ", smallest max activation = " << smallest_max_activation
+            std::cout << "--> largest min activation = " << largest_min_activation
+                      << " , smallest max activation = " << smallest_max_activation
                       << std::endl;
 
             std::fill(std::begin(min_activations_), std::end(min_activations_),

From cf3edfed8203ad249bca9eab16336afb961d1e9e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 13:12:32 +0200
Subject: [PATCH 380/583] Improve info messages.

---
 src/learn/learn.cpp                           | 43 +++++++++----------
 src/learn/sfen_reader.h                       | 12 +++---
 src/nnue/evaluate_nnue_learner.cpp            |  4 +-
 src/nnue/trainer/trainer_clipped_relu.h       |  6 +--
 .../trainer/trainer_feature_transformer.h     | 10 ++---
 5 files changed, 36 insertions(+), 39 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index c9313575..cf26e05e 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -140,15 +140,12 @@ namespace Learner
 
             void print(const std::string& prefix, ostream& s) const
             {
-                s
-                    << "--> "
-                    << prefix << "_cross_entropy_eval = " << cross_entropy_eval / count
-                    << " , " << prefix << "_cross_entropy_win = " << cross_entropy_win / count
-                    << " , " << prefix << "_entropy_eval = " << entropy_eval / count
-                    << " , " << prefix << "_entropy_win = " << entropy_win / count
-                    << " , " << prefix << "_cross_entropy = " << cross_entropy / count
-                    << " , " << prefix << "_entropy = " << entropy / count
-                    << endl;
+                s << "==> " << prefix << "_cross_entropy_eval = " << cross_entropy_eval / count << endl;
+                s << "==> " << prefix << "_cross_entropy_win  = " << cross_entropy_win / count << endl;
+                s << "==> " << prefix << "_entropy_eval       = " << entropy_eval / count << endl;
+                s << "==> " << prefix << "_entropy_win        = " << entropy_win / count << endl;
+                s << "==> " << prefix << "_cross_entropy      = " << cross_entropy / count << endl;
+                s << "==> " << prefix << "_entropy            = " << entropy / count << endl;
             }
         };
     }
@@ -678,11 +675,13 @@ namespace Learner
         TimePoint elapsed = now() - Search::Limits.startTime + 1;
 
         cout << "\n";
-        cout << "PROGRESS (calc_loss): " << now_string() << ", ";
-        cout << total_done << " sfens, ";
-        cout << total_done * 1000 / elapsed  << " sfens/second";
-        cout << ", iteration " << epoch;
-        cout << ", learning rate = " << global_learning_rate << ", ";
+        cout << "PROGRESS (calc_loss): " << now_string()
+             << ", " << total_done << " sfens"
+             << ", " << total_done * 1000 / elapsed  << " sfens/second"
+             << ", epoch " << epoch
+             << endl;
+
+        cout << "==> learning rate = " << global_learning_rate << endl;
 
         // For calculation of verification data loss
         AtomicLoss test_loss_sum{};
@@ -699,7 +698,7 @@ namespace Learner
             auto& pos = th.rootPos;
             StateInfo si;
             pos.set(StartFEN, false, &si, &th);
-            cout << "startpos eval = " << Eval::evaluate(pos) << endl;
+            cout << "==> startpos eval = " << Eval::evaluate(pos) << endl;
         });
         mainThread->wait_for_worker_finished();
 
@@ -722,16 +721,15 @@ namespace Learner
 
         if (psv.size() && test_loss_sum.count > 0.0)
         {
-            cout << "--> norm = " << sum_norm
-                << " , move accuracy = " << (move_accord_count * 100.0 / psv.size()) << "%"
-                << endl;
-
             test_loss_sum.print("test", cout);
 
             if (learn_loss_sum.count > 0.0)
             {
                 learn_loss_sum.print("learn", cout);
             }
+
+            cout << "==> norm = " << sum_norm << endl;
+            cout << "==> move accuracy = " << (move_accord_count * 100.0 / psv.size()) << "%" << endl;
         }
         else
         {
@@ -847,7 +845,8 @@ namespace Learner
                 const double latest_loss = latest_loss_sum / latest_loss_count;
                 latest_loss_sum = 0.0;
                 latest_loss_count = 0;
-                cout << "loss: " << latest_loss;
+                cout << "INFO (learning_rate):" << endl;
+                cout << "==> loss = " << latest_loss;
                 auto tot = total_done;
                 if (auto_lr_drop)
                 {
@@ -877,7 +876,7 @@ namespace Learner
                     if (--trials > 0 && !is_final)
                     {
                         cout
-                            << "reducing learning rate from " << global_learning_rate
+                            << "==> reducing learning rate from " << global_learning_rate
                             << " to " << (global_learning_rate * newbob_decay)
                             << " (" << trials << " more trials)" << endl;
 
@@ -887,7 +886,7 @@ namespace Learner
 
                 if (trials == 0)
                 {
-                    cout << "converged" << endl;
+                    cout << "==> converged" << endl;
                     return true;
                 }
             }
diff --git a/src/learn/sfen_reader.h b/src/learn/sfen_reader.h
index 71767bc6..4d5a6d1a 100644
--- a/src/learn/sfen_reader.h
+++ b/src/learn/sfen_reader.h
@@ -83,7 +83,7 @@ namespace Learner{
                 PackedSfenValue ps;
                 if (!read_to_thread_buffer(0, ps))
                 {
-                    std::cout << "Error! read packed sfen , failed." << std::endl;
+                    std::cout << "ERROR (sfen_reader): Reading failed." << std::endl;
                     return sfen_for_mse;
                 }
 
@@ -211,16 +211,16 @@ namespace Learner{
 
                     if (sfen_input_stream == nullptr)
                     {
-                        std::cout << "File does not exist: " << currentFilename << '\n';
+                        std::cout << "INFO (sfen_reader): File does not exist: " << currentFilename << '\n';
                     }
                     else
                     {
-                        std::cout << "Opened file for reading: " << currentFilename << '\n';
+                        std::cout << "INFO (sfen_reader): Opened file for reading: " << currentFilename << '\n';
 
                         // in case the file is empty or was deleted.
                         if (sfen_input_stream->eof())
                         {
-                            std::cout << "File empty, nothing to read.\n";
+                            std::cout << "INFO (sfen_reader): File empty, nothing to read.\n";
                         }
                         else
                         {
@@ -232,7 +232,7 @@ namespace Learner{
 
             if (sfen_input_stream == nullptr && !open_next_file())
             {
-                std::cout << "..end of files." << std::endl;
+                std::cout << "INFO (sfen_reader): End of files." << std::endl;
                 end_of_files = true;
                 return;
             }
@@ -271,7 +271,7 @@ namespace Learner{
                         if(!open_next_file())
                         {
                             // There was no next file. Abort.
-                            std::cout << "..end of files." << std::endl;
+                            std::cout << "INFO (sfen_reader): End of files." << std::endl;
                             end_of_files = true;
                             return;
                         }
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index e0d2351d..64b558bd 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -245,7 +245,7 @@ namespace Eval::NNUE {
     // save merit function parameters to a file
     void save_eval(std::string dir_name) {
         auto eval_dir = Path::combine(Options["EvalSaveDir"], dir_name);
-        std::cout << "save_eval() start. folder = " << eval_dir << std::endl;
+        std::cout << "INFO (save_eval): Saving current evaluation file in " << eval_dir << std::endl;
 
         // mkdir() will fail if this folder already exists, but
         // Apart from that. If not, I just want you to make it.
@@ -261,7 +261,5 @@ namespace Eval::NNUE {
 #ifndef NDEBUG
         assert(result);
 #endif
-
-        std::cout << "save_eval() finished. folder = " << eval_dir << std::endl;
     }
 }  // namespace Eval::NNUE
\ No newline at end of file
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index d1dd738b..284b7e73 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -100,11 +100,11 @@ namespace Eval::NNUE {
                 std::begin(max_activations_), std::end(max_activations_));
 
             std::cout << "INFO (check_health):"
-                      << " layer = " << LayerType::kLayerIndex
-                      << " , name = " << LayerType::get_name()
+                      << " layer " << LayerType::kLayerIndex
+                      << " - " << LayerType::get_name()
                       << std::endl;
 
-            std::cout << "--> largest min activation = " << largest_min_activation
+            std::cout << "==> largest min activation = " << largest_min_activation
                       << " , smallest max activation = " << smallest_max_activation
                       << std::endl;
 
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index dbfe18a2..fea419c9 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -331,11 +331,11 @@ namespace Eval::NNUE {
         // Check if there are any problems with learning
         void check_health() {
             std::cout << "INFO (check_health):"
-                      << " layer = " << LayerType::kLayerIndex
-                      << " , name = " << LayerType::get_name()
+                      << " layer " << LayerType::kLayerIndex
+                      << " - " << LayerType::get_name()
                       << std::endl;
 
-            std::cout << "--> observed " << observed_features.count()
+            std::cout << "==> observed " << observed_features.count()
                       << " (out of " << kInputDimensions << ") features"
                       << std::endl;
 
@@ -343,7 +343,7 @@ namespace Eval::NNUE {
                 std::numeric_limits<typename LayerType::WeightType>::max() /
                 kWeightScale;
 
-            std::cout << "--> (min, max) of pre-activations = "
+            std::cout << "==> (min, max) of pre-activations = "
                       << min_pre_activation_ << ", "
                       << max_pre_activation_ << " (limit = "
                       << kPreActivationLimit << ")"
@@ -354,7 +354,7 @@ namespace Eval::NNUE {
             const auto smallest_max_activation = *std::min_element(
                 std::begin(max_activations_), std::end(max_activations_));
 
-            std::cout << "--> largest min activation = " << largest_min_activation
+            std::cout << "==> largest min activation = " << largest_min_activation
                       << " , smallest max activation = " << smallest_max_activation
                       << std::endl;
 

From 54dd6a240705e83c44ff0b4201d113b0868630b1 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 13:13:10 +0200
Subject: [PATCH 381/583] Add logger with synchronized regions.

---
 src/misc.cpp |   2 +
 src/misc.h   | 178 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 180 insertions(+)

diff --git a/src/misc.cpp b/src/misc.cpp
index e09b8eed..879f4462 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -61,6 +61,8 @@ typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY);
 
 using namespace std;
 
+SynchronizedRegionLogger sync_region_cout(std::cout);
+
 namespace {
 
 /// Version number. If Version is left empty, then compile date in the format
diff --git a/src/misc.h b/src/misc.h
index dca959cd..af40ab16 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -31,6 +31,7 @@
 #include <cmath>
 #include <cctype>
 #include <sstream>
+#include <deque>
 
 #include "types.h"
 
@@ -70,6 +71,183 @@ std::ostream& operator<<(std::ostream&, SyncCout);
 #define sync_cout std::cout << IO_LOCK
 #define sync_endl std::endl << IO_UNLOCK
 
+// This logger allows printing many parts in a region atomically
+// but doesn't block the threads trying to append to other regions.
+// Instead if some region tries to pring while other region holds
+// the lock the messages are queued to be printed as soon as the
+// current region releases the lock.
+struct SynchronizedRegionLogger
+{
+private:
+  using RegionId = std::uint64_t;
+
+  struct RegionLock
+  {
+    RegionLock(SynchronizedRegionLogger& log, RegionId id) :
+      logger(&log), region_id(id), is_held(true)
+    {
+    }
+
+    RegionLock(const RegionLock&) = delete;
+    RegionLock& operator=(const RegionLock&) = delete;
+
+    RegionLock(RegionLock&& other) :
+      logger(other.logger), region_id(other.region_id), is_held(other.is_held)
+    {
+      other.logger = nullptr;
+      other.is_held = false;
+    }
+
+    RegionLock& operator=(RegionLock&& other) {
+      if (is_held && logger != nullptr)
+      {
+        logger->release_region(region_id);
+      }
+
+      logger = other.logger;
+      region_id = other.region_id;
+      is_held = other.is_held;
+
+      other.is_held = false;
+
+      return *this;
+    }
+
+    ~RegionLock() { unlock(); }
+
+    void unlock() {
+      if (is_held) {
+        is_held = false;
+
+        if (logger != nullptr)
+          logger->release_region(region_id);
+      }
+    }
+
+    template <typename T>
+    RegionLock& operator << (const T& value) {
+      if (logger != nullptr)
+        logger->write(region_id, value);
+
+      return *this;
+    }
+
+  private:
+    SynchronizedRegionLogger* logger;
+    RegionId region_id;
+    bool is_held;
+  };
+
+  struct Region
+  {
+    Region(RegionId rid) : id(rid), is_held(true) {}
+
+    std::vector<std::string> pending_parts;
+    RegionId id;
+    bool is_held;
+  };
+
+  RegionId init_next_region()
+  {
+    static RegionId next_id = 0;
+
+    std::lock_guard lock(mutex);
+
+    const auto id = next_id++;
+    regions.emplace_back(id);
+
+    return id;
+  }
+
+  template <typename T>
+  void write(RegionId id, const T& value) {
+    std::lock_guard lock(mutex);
+
+    if (regions.empty())
+      return;
+
+    if (id == regions.front().id) {
+      // We can just directly print to the output because
+      // we are at the front of the region queue.
+      out << value;
+    } else {
+      // We have to schedule the print until previous regions are
+      // processed
+      auto* region = find_region_nolock(id);
+      if (region == nullptr)
+        return;
+
+      std::stringstream ss;
+      ss << value;
+      region->pending_parts.emplace_back(std::move(ss).str());
+    }
+  }
+
+  std::ostream& out;
+
+  std::deque<Region> regions;
+
+  std::mutex mutex;
+
+  Region* find_region_nolock(RegionId id) {
+    // Linear search because the amount of concurrent regions should be small.
+    auto it = std::find_if(
+      regions.begin(),
+      regions.end(),
+      [id](const Region& r) { return r.id == id; });
+
+    if (it == regions.end())
+      return nullptr;
+    else
+      return &*it;
+  }
+
+  void release_region(RegionId id) {
+    std::lock_guard lock(mutex);
+
+    auto* region = find_region_nolock(id);
+    if (region == nullptr)
+      return;
+
+    region->is_held = false;
+
+    process_backlog_nolock();
+  }
+
+  void process_backlog_nolock()
+  {
+    while(!regions.empty()) {
+      auto& region = regions.front();
+
+      for(auto& part : region.pending_parts) {
+        out << part;
+      }
+
+      // If the region is still held then we don't
+      // want to start printing stuff from the next region.
+      if (region.is_held)
+        break;
+
+      regions.pop_front();
+    }
+  }
+
+public:
+
+  SynchronizedRegionLogger(std::ostream& s) :
+    out(s)
+  {
+  }
+
+  [[nodiscard]] RegionLock new_region() {
+    const auto id = init_next_region();
+    return RegionLock(*this, id);
+  }
+
+};
+
+extern SynchronizedRegionLogger sync_region_cout;
+
 
 /// xorshift64star Pseudo-Random Number Generator
 /// This class is based on original code written and dedicated

From d824bd8ec5057d1b63b3d7721fe31f4a828b7516 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 13:31:40 +0200
Subject: [PATCH 382/583] Add an overload for io manip in the logger.

---
 src/misc.h | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/src/misc.h b/src/misc.h
index af40ab16..4c99cc2b 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -124,6 +124,13 @@ private:
       }
     }
 
+    RegionLock& operator << (std::ostream&(*pManip)(std::ostream&)) {
+      if (logger != nullptr)
+        logger->write(region_id, pManip);
+
+      return *this;
+    }
+
     template <typename T>
     RegionLock& operator << (const T& value) {
       if (logger != nullptr)
@@ -159,6 +166,29 @@ private:
     return id;
   }
 
+  void write(RegionId id, std::ostream&(*pManip)(std::ostream&)) {
+    std::lock_guard lock(mutex);
+
+    if (regions.empty())
+      return;
+
+    if (id == regions.front().id) {
+      // We can just directly print to the output because
+      // we are at the front of the region queue.
+      out << *pManip;
+    } else {
+      // We have to schedule the print until previous regions are
+      // processed
+      auto* region = find_region_nolock(id);
+      if (region == nullptr)
+        return;
+
+      std::stringstream ss;
+      ss << *pManip;
+      region->pending_parts.emplace_back(std::move(ss).str());
+    }
+  }
+
   template <typename T>
   void write(RegionId id, const T& value) {
     std::lock_guard lock(mutex);

From 4b72658409379d0f0c7de0530b49509a41631408 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 13:34:10 +0200
Subject: [PATCH 383/583] Synchronize printed info regions in the learner and
 sfen reader.

---
 src/learn/learn.cpp                           | 36 ++++++++++-------
 src/learn/sfen_reader.h                       | 13 ++++---
 src/nnue/evaluate_nnue_learner.cpp            |  4 +-
 src/nnue/trainer/trainer_clipped_relu.h       | 19 +++++----
 .../trainer/trainer_feature_transformer.h     | 39 +++++++++++--------
 5 files changed, 66 insertions(+), 45 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index cf26e05e..b0ae62f6 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -138,7 +138,8 @@ namespace Learner
                 count = 0.0;
             }
 
-            void print(const std::string& prefix, ostream& s) const
+            template <typename StreamT>
+            void print(const std::string& prefix, StreamT& s) const
             {
                 s << "==> " << prefix << "_cross_entropy_eval = " << cross_entropy_eval / count << endl;
                 s << "==> " << prefix << "_cross_entropy_win  = " << cross_entropy_win / count << endl;
@@ -499,8 +500,9 @@ namespace Learner
         if (validation_set_file_name.empty()
             && sfen_for_mse.size() != sfen_for_mse_size)
         {
-            cout
-                << "Error reading sfen_for_mse. Read " << sfen_for_mse.size()
+            auto out = sync_region_cout.new_region();
+            out
+                << "INFO (learn): Error reading sfen_for_mse. Read " << sfen_for_mse.size()
                 << " out of " << sfen_for_mse_size << '\n';
 
             return;
@@ -514,7 +516,8 @@ namespace Learner
             latest_loss_sum = 0.0;
             latest_loss_count = 0;
 
-            cout << "initial loss: " << best_loss << endl;
+            auto out = sync_region_cout.new_region();
+            out << "INFO (learn): initial loss = " << best_loss << endl;
         }
 
         stop_flag = false;
@@ -585,7 +588,8 @@ namespace Learner
             if (pos.set_from_packed_sfen(ps.sfen, &si, &th) != 0)
             {
                 // Malformed sfen
-                cout << "Error! : illigal packed sfen = " << pos.fen() << endl;
+                auto out = sync_region_cout.new_region();
+                out << "ERROR: illigal packed sfen = " << pos.fen() << endl;
                 goto RETRY_READ;
             }
 
@@ -674,14 +678,16 @@ namespace Learner
         TT.new_search();
         TimePoint elapsed = now() - Search::Limits.startTime + 1;
 
-        cout << "\n";
-        cout << "PROGRESS (calc_loss): " << now_string()
+        auto out = sync_region_cout.new_region();
+
+        out << "\n";
+        out << "PROGRESS (calc_loss): " << now_string()
              << ", " << total_done << " sfens"
              << ", " << total_done * 1000 / elapsed  << " sfens/second"
              << ", epoch " << epoch
              << endl;
 
-        cout << "==> learning rate = " << global_learning_rate << endl;
+        out << "==> learning rate = " << global_learning_rate << endl;
 
         // For calculation of verification data loss
         AtomicLoss test_loss_sum{};
@@ -694,11 +700,11 @@ namespace Learner
         atomic<int> move_accord_count{0};
 
         auto mainThread = Threads.main();
-        mainThread->execute_with_worker([](auto& th){
+        mainThread->execute_with_worker([&out](auto& th){
             auto& pos = th.rootPos;
             StateInfo si;
             pos.set(StartFEN, false, &si, &th);
-            cout << "==> startpos eval = " << Eval::evaluate(pos) << endl;
+            out << "==> startpos eval = " << Eval::evaluate(pos) << endl;
         });
         mainThread->wait_for_worker_finished();
 
@@ -721,19 +727,19 @@ namespace Learner
 
         if (psv.size() && test_loss_sum.count > 0.0)
         {
-            test_loss_sum.print("test", cout);
+            test_loss_sum.print("test", out);
 
             if (learn_loss_sum.count > 0.0)
             {
-                learn_loss_sum.print("learn", cout);
+                learn_loss_sum.print("learn", out);
             }
 
-            cout << "==> norm = " << sum_norm << endl;
-            cout << "==> move accuracy = " << (move_accord_count * 100.0 / psv.size()) << "%" << endl;
+            out << "==> norm = " << sum_norm << endl;
+            out << "==> move accuracy = " << (move_accord_count * 100.0 / psv.size()) << "%" << endl;
         }
         else
         {
-            cout << "Error! : psv.size() = " << psv.size() << " ,  done = " << test_loss_sum.count << endl;
+            out << "ERROR: psv.size() = " << psv.size() << " ,  done = " << test_loss_sum.count << endl;
         }
 
         learn_loss_sum.reset();
diff --git a/src/learn/sfen_reader.h b/src/learn/sfen_reader.h
index 4d5a6d1a..3547b6bb 100644
--- a/src/learn/sfen_reader.h
+++ b/src/learn/sfen_reader.h
@@ -209,18 +209,19 @@ namespace Learner{
 
                     sfen_input_stream = open_sfen_input_file(currentFilename);
 
+                    auto out = sync_region_cout.new_region();
                     if (sfen_input_stream == nullptr)
                     {
-                        std::cout << "INFO (sfen_reader): File does not exist: " << currentFilename << '\n';
+                        out << "INFO (sfen_reader): File does not exist: " << currentFilename << '\n';
                     }
                     else
                     {
-                        std::cout << "INFO (sfen_reader): Opened file for reading: " << currentFilename << '\n';
+                        out << "INFO (sfen_reader): Opened file for reading: " << currentFilename << '\n';
 
                         // in case the file is empty or was deleted.
                         if (sfen_input_stream->eof())
                         {
-                            std::cout << "INFO (sfen_reader): File empty, nothing to read.\n";
+                            out << "==> File empty, nothing to read.\n";
                         }
                         else
                         {
@@ -232,7 +233,8 @@ namespace Learner{
 
             if (sfen_input_stream == nullptr && !open_next_file())
             {
-                std::cout << "INFO (sfen_reader): End of files." << std::endl;
+                auto out = sync_region_cout.new_region();
+                out << "INFO (sfen_reader): End of files." << std::endl;
                 end_of_files = true;
                 return;
             }
@@ -271,7 +273,8 @@ namespace Learner{
                         if(!open_next_file())
                         {
                             // There was no next file. Abort.
-                            std::cout << "INFO (sfen_reader): End of files." << std::endl;
+                            auto out = sync_region_cout.new_region();
+                            out << "INFO (sfen_reader): End of files." << std::endl;
                             end_of_files = true;
                             return;
                         }
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 64b558bd..9e960da4 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -224,7 +224,9 @@ namespace Eval::NNUE {
             const double avg_abs_eval_diff = abs_eval_diff_sum / batch_size;
             const double avg_abs_discrete_eval = abs_discrete_eval_sum / batch_size;
 
-            std::cout << "INFO (update_parameters):"
+            auto out = sync_region_cout.new_region();
+
+            out << "INFO (update_parameters):"
                 << " epoch = " << epoch
                 << " , avg_abs(trainer_eval-nnue_eval) = " << avg_abs_eval_diff
                 << " , avg_abs(nnue_eval) = " << avg_abs_discrete_eval
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index 284b7e73..49b715db 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -94,19 +94,24 @@ namespace Eval::NNUE {
 
         // Check if there are any problems with learning
         void check_health() {
+
             const auto largest_min_activation = *std::max_element(
                 std::begin(min_activations_), std::end(min_activations_));
             const auto smallest_max_activation = *std::min_element(
                 std::begin(max_activations_), std::end(max_activations_));
 
-            std::cout << "INFO (check_health):"
-                      << " layer " << LayerType::kLayerIndex
-                      << " - " << LayerType::get_name()
-                      << std::endl;
+            auto out = sync_region_cout.new_region();
 
-            std::cout << "==> largest min activation = " << largest_min_activation
-                      << " , smallest max activation = " << smallest_max_activation
-                      << std::endl;
+            out << "INFO (check_health):"
+                << " layer " << LayerType::kLayerIndex
+                << " - " << LayerType::get_name()
+                << std::endl;
+
+            out << "==> largest min activation = " << largest_min_activation
+                << " , smallest max activation = " << smallest_max_activation
+                << std::endl;
+
+            out.unlock();
 
             std::fill(std::begin(min_activations_), std::end(min_activations_),
                       std::numeric_limits<LearnFloatType>::max());
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index fea419c9..34c423b4 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -330,33 +330,38 @@ namespace Eval::NNUE {
 
         // Check if there are any problems with learning
         void check_health() {
-            std::cout << "INFO (check_health):"
-                      << " layer " << LayerType::kLayerIndex
-                      << " - " << LayerType::get_name()
-                      << std::endl;
-
-            std::cout << "==> observed " << observed_features.count()
-                      << " (out of " << kInputDimensions << ") features"
-                      << std::endl;
 
             constexpr LearnFloatType kPreActivationLimit =
                 std::numeric_limits<typename LayerType::WeightType>::max() /
                 kWeightScale;
 
-            std::cout << "==> (min, max) of pre-activations = "
-                      << min_pre_activation_ << ", "
-                      << max_pre_activation_ << " (limit = "
-                      << kPreActivationLimit << ")"
-                      << std::endl;
-
             const auto largest_min_activation = *std::max_element(
                 std::begin(min_activations_), std::end(min_activations_));
             const auto smallest_max_activation = *std::min_element(
                 std::begin(max_activations_), std::end(max_activations_));
 
-            std::cout << "==> largest min activation = " << largest_min_activation
-                      << " , smallest max activation = " << smallest_max_activation
-                      << std::endl;
+            auto out = sync_region_cout.new_region();
+
+            out << "INFO (check_health):"
+                << " layer " << LayerType::kLayerIndex
+                << " - " << LayerType::get_name()
+                << std::endl;
+
+            out << "==> observed " << observed_features.count()
+                << " (out of " << kInputDimensions << ") features"
+                << std::endl;
+
+            out << "==> (min, max) of pre-activations = "
+                << min_pre_activation_ << ", "
+                << max_pre_activation_ << " (limit = "
+                << kPreActivationLimit << ")"
+                << std::endl;
+
+            out << "==> largest min activation = " << largest_min_activation
+                << " , smallest max activation = " << smallest_max_activation
+                << std::endl;
+
+            out.unlock();
 
             std::fill(std::begin(min_activations_), std::end(min_activations_),
                       std::numeric_limits<LearnFloatType>::max());

From b882423005f62978cec4eb7f903b76df59149f48 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 13:46:34 +0200
Subject: [PATCH 384/583] Bring back info for finished evalsave. Update tests
 with the new message.

---
 src/nnue/evaluate_nnue_learner.cpp | 6 +++++-
 tests/instrumented_learn.sh        | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 9e960da4..0151b3f8 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -247,7 +247,10 @@ namespace Eval::NNUE {
     // save merit function parameters to a file
     void save_eval(std::string dir_name) {
         auto eval_dir = Path::combine(Options["EvalSaveDir"], dir_name);
-        std::cout << "INFO (save_eval): Saving current evaluation file in " << eval_dir << std::endl;
+
+        auto out = sync_region_cout.new_region();
+
+        out << "INFO (save_eval): Saving current evaluation file in " << eval_dir << std::endl;
 
         // mkdir() will fail if this folder already exists, but
         // Apart from that. If not, I just want you to make it.
@@ -263,5 +266,6 @@ namespace Eval::NNUE {
 #ifndef NDEBUG
         assert(result);
 #endif
+        out << "INFO (save_eval): Saving current evaluation file in " << eval_dir << std::endl;
     }
 }  // namespace Eval::NNUE
\ No newline at end of file
diff --git a/tests/instrumented_learn.sh b/tests/instrumented_learn.sh
index 4ce3dc1c..50b6e4ae 100755
--- a/tests/instrumented_learn.sh
+++ b/tests/instrumented_learn.sh
@@ -129,7 +129,7 @@ cat << EOF > learn01.exp
  send "isready\n"
  send "learn targetdir training_data epochs 1 sfen_read_size 100 thread_buffer_size 10 batchsize 100 use_draw_in_training 1 use_draw_in_validation 1 lr 1 eval_limit 32000 nn_batch_size 30 newbob_decay 0.5 eval_save_interval 30 loss_output_interval 10 validation_set_file_name validation_data/validation_data.bin\n"
 
- expect "save_eval() finished."
+ expect "INFO (save_eval): Saving current evaluation file in"
 
  send "quit\n"
  expect eof

From 2c477d76ec8cf8915dc520cb35bd150d78c7ddd6 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 14:21:59 +0200
Subject: [PATCH 385/583] Cleaner and more outputs during training
 initialization.

---
 src/learn/learn.cpp                           | 124 +++++++++++-------
 src/learn/sfen_reader.h                       |   2 +-
 src/misc.h                                    |  42 +++---
 src/nnue/evaluate_nnue_learner.cpp            |  19 ++-
 src/nnue/evaluate_nnue_learner.h              |   6 +-
 src/nnue/layers/affine_transform.h            |   4 +-
 src/nnue/layers/clipped_relu.h                |   4 +-
 src/nnue/layers/input_slice.h                 |   5 +-
 src/nnue/layers/sum.h                         |   4 +-
 src/nnue/nnue_feature_transformer.h           |   5 +-
 src/nnue/trainer/trainer_clipped_relu.h       |   2 +-
 .../trainer/trainer_feature_transformer.h     |   6 +-
 12 files changed, 129 insertions(+), 94 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index b0ae62f6..3faab0ea 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -141,12 +141,12 @@ namespace Learner
             template <typename StreamT>
             void print(const std::string& prefix, StreamT& s) const
             {
-                s << "==> " << prefix << "_cross_entropy_eval = " << cross_entropy_eval / count << endl;
-                s << "==> " << prefix << "_cross_entropy_win  = " << cross_entropy_win / count << endl;
-                s << "==> " << prefix << "_entropy_eval       = " << entropy_eval / count << endl;
-                s << "==> " << prefix << "_entropy_win        = " << entropy_win / count << endl;
-                s << "==> " << prefix << "_cross_entropy      = " << cross_entropy / count << endl;
-                s << "==> " << prefix << "_entropy            = " << entropy / count << endl;
+                s << "  - " << prefix << "_cross_entropy_eval = " << cross_entropy_eval / count << endl;
+                s << "  - " << prefix << "_cross_entropy_win  = " << cross_entropy_win / count << endl;
+                s << "  - " << prefix << "_entropy_eval       = " << entropy_eval / count << endl;
+                s << "  - " << prefix << "_entropy_win        = " << entropy_win / count << endl;
+                s << "  - " << prefix << "_cross_entropy      = " << cross_entropy / count << endl;
+                s << "  - " << prefix << "_entropy            = " << entropy / count << endl;
             }
         };
     }
@@ -687,7 +687,7 @@ namespace Learner
              << ", epoch " << epoch
              << endl;
 
-        out << "==> learning rate = " << global_learning_rate << endl;
+        out << "  - learning rate = " << global_learning_rate << endl;
 
         // For calculation of verification data loss
         AtomicLoss test_loss_sum{};
@@ -704,7 +704,7 @@ namespace Learner
             auto& pos = th.rootPos;
             StateInfo si;
             pos.set(StartFEN, false, &si, &th);
-            out << "==> startpos eval = " << Eval::evaluate(pos) << endl;
+            out << "  - startpos eval = " << Eval::evaluate(pos) << endl;
         });
         mainThread->wait_for_worker_finished();
 
@@ -734,8 +734,8 @@ namespace Learner
                 learn_loss_sum.print("learn", out);
             }
 
-            out << "==> norm = " << sum_norm << endl;
-            out << "==> move accuracy = " << (move_accord_count * 100.0 / psv.size()) << "%" << endl;
+            out << "  - norm = " << sum_norm << endl;
+            out << "  - move accuracy = " << (move_accord_count * 100.0 / psv.size()) << "%" << endl;
         }
         else
         {
@@ -852,7 +852,7 @@ namespace Learner
                 latest_loss_sum = 0.0;
                 latest_loss_count = 0;
                 cout << "INFO (learning_rate):" << endl;
-                cout << "==> loss = " << latest_loss;
+                cout << "  - loss = " << latest_loss;
                 auto tot = total_done;
                 if (auto_lr_drop)
                 {
@@ -882,7 +882,7 @@ namespace Learner
                     if (--trials > 0 && !is_final)
                     {
                         cout
-                            << "==> reducing learning rate from " << global_learning_rate
+                            << "  - reducing learning rate from " << global_learning_rate
                             << " to " << (global_learning_rate * newbob_decay)
                             << " (" << trials << " more trials)" << endl;
 
@@ -892,7 +892,7 @@ namespace Learner
 
                 if (trials == 0)
                 {
-                    cout << "==> converged" << endl;
+                    cout << "  - converged" << endl;
                     return true;
                 }
             }
@@ -980,6 +980,8 @@ namespace Learner
         string validation_set_file_name;
         string seed;
 
+        auto out = sync_region_cout.new_region();
+
         // Assume the filenames are staggered.
         while (true)
         {
@@ -1083,7 +1085,7 @@ namespace Learner
             else if (option == "verbose") verbose = true;
             else
             {
-                cout << "Unknown option: " << option << ". Ignoring.\n";
+                out << "INFO: Unknown option: " << option << ". Ignoring.\n";
             }
         }
 
@@ -1092,11 +1094,14 @@ namespace Learner
             loss_output_interval = LEARN_RMSE_OUTPUT_INTERVAL * mini_batch_size;
         }
 
-        cout << "learn command , ";
+        // If reduction_gameply is set to 0, rand(0) will be divided by 0, so correct it to 1.
+        reduction_gameply = max(reduction_gameply, 1);
+
+        out << "INFO: Executing learn command\n";
 
         // Issue a warning if OpenMP is disabled.
 #if !defined(_OPENMP)
-        cout << "Warning! OpenMP disabled." << endl;
+        out << "WARNING: OpenMP disabled." << endl;
 #endif
 
         // Right now we only have the individual files.
@@ -1107,65 +1112,80 @@ namespace Learner
         }
         rebase_files(filenames, base_dir);
 
-        cout << "learn from ";
+        out << "INFO: Input files:\n";
         for (auto s : filenames)
-            cout << s << " , ";
+            out << "  - " << s << '\n';
 
-        cout << endl;
+        out << "INFO: Parameters:\n";
         if (!validation_set_file_name.empty())
         {
-            cout << "validation set  : " << validation_set_file_name << endl;
+            out << "  - validation set           : " << validation_set_file_name << endl;
         }
 
-        cout << "base dir        : " << base_dir << endl;
-        cout << "target dir      : " << target_dir << endl;
+        out << "  - epochs                   : " << epochs << endl;
+        out << "  - epochs * minibatch size  : " << epochs * mini_batch_size << endl;
+        out << "  - eval_limit               : " << eval_limit << endl;
+        out << "  - save_only_once           : " << (save_only_once ? "true" : "false") << endl;
+        out << "  - shuffle on read          : " << (no_shuffle ? "false" : "true") << endl;
 
-        cout << "epochs            : " << epochs << endl;
-        cout << "eval_limit        : " << eval_limit << endl;
-        cout << "save_only_once    : " << (save_only_once ? "true" : "false") << endl;
-        cout << "no_shuffle        : " << (no_shuffle ? "true" : "false") << endl;
+        out << "  - Loss Function            : " << LOSS_FUNCTION << endl;
+        out << "  - minibatch size           : " << mini_batch_size << endl;
 
-        cout << "Loss Function     : " << LOSS_FUNCTION << endl;
-        cout << "mini-batch size   : " << mini_batch_size << endl;
+        out << "  - nn_batch_size            : " << nn_batch_size << endl;
+        out << "  - nn_options               : " << nn_options << endl;
 
-        cout << "nn_batch_size     : " << nn_batch_size << endl;
-        cout << "nn_options        : " << nn_options << endl;
+        out << "  - learning rate            : " << global_learning_rate << endl;
+        out << "  - use draws in training    : " << use_draw_games_in_training << endl;
+        out << "  - use draws in validation  : " << use_draw_games_in_validation << endl;
+        out << "  - skip repeated positions  : " << skip_duplicated_positions_in_training << endl;
 
-        cout << "learning rate     : " << global_learning_rate << endl;
-        cout << "use_draw_games_in_training : " << use_draw_games_in_training << endl;
-        cout << "use_draw_games_in_validation : " << use_draw_games_in_validation << endl;
-        cout << "skip_duplicated_positions_in_training : " << skip_duplicated_positions_in_training << endl;
+        out << "  - winning prob coeff       : " << winning_probability_coefficient << endl;
+        out << "  - use_wdl                  : " << use_wdl << endl;
 
-        if (newbob_decay != 1.0) {
-            cout << "scheduling        : newbob with decay = " << newbob_decay
-                << ", " << newbob_num_trials << " trials" << endl;
+        out << "  - src_score_min_value      : " << src_score_min_value << endl;
+        out << "  - src_score_max_value      : " << src_score_max_value << endl;
+        out << "  - dest_score_min_value     : " << dest_score_min_value << endl;
+        out << "  - dest_score_max_value     : " << dest_score_max_value << endl;
+
+        out << "  - reduction_gameply        : " << reduction_gameply << endl;
+
+        out << "  - LAMBDA                   : " << ELMO_LAMBDA << endl;
+        out << "  - LAMBDA2                  : " << ELMO_LAMBDA2 << endl;
+        out << "  - LAMBDA_LIMIT             : " << ELMO_LAMBDA_LIMIT << endl;
+        out << "  - eval_save_interval       : " << eval_save_interval << " sfens" << endl;
+        out << "  - loss_output_interval     : " << loss_output_interval << " sfens" << endl;
+
+        out << "  - sfen_read_size           : " << sfen_read_size << endl;
+        out << "  - thread_buffer_size       : " << thread_buffer_size << endl;
+
+        out << "  - seed                     : " << seed << endl;
+        out << "  - verbose                  : " << (verbose ? "true" : "false") << endl;
+
+        if (auto_lr_drop) {
+            out << "  - learning rate scheduling : every " << auto_lr_drop << " sfens" << endl;
+        }
+        else if (newbob_decay != 1.0) {
+            out << "  - learning rate scheduling : newbob with decay" << endl;
+            out << "  - newbob_decay             : " << newbob_decay << endl;
+            out << "  - newbob_num_trials        : " << newbob_num_trials << endl;
         }
         else {
-            cout << "scheduling        : default" << endl;
+            out << "  - learning rate scheduling : fixed learning rate" << endl;
         }
 
-        // If reduction_gameply is set to 0, rand(0) will be divided by 0, so correct it to 1.
-        reduction_gameply = max(reduction_gameply, 1);
-        cout << "reduction_gameply : " << reduction_gameply << endl;
-
-        cout << "LAMBDA            : " << ELMO_LAMBDA << endl;
-        cout << "LAMBDA2           : " << ELMO_LAMBDA2 << endl;
-        cout << "LAMBDA_LIMIT      : " << ELMO_LAMBDA_LIMIT << endl;
-        cout << "eval_save_interval  : " << eval_save_interval << " sfens" << endl;
-        cout << "loss_output_interval: " << loss_output_interval << " sfens" << endl;
+        out << endl;
 
         // -----------------------------------
         // various initialization
         // -----------------------------------
 
-        cout << "init.." << endl;
+        out << "INFO: Started initialization." << endl;
 
         Threads.main()->ponder = false;
 
         set_learning_search_limits();
 
-        cout << "init_training.." << endl;
-        Eval::NNUE::initialize_training(seed);
+        Eval::NNUE::initialize_training(seed, out);
         Eval::NNUE::set_batch_size(nn_batch_size);
         Eval::NNUE::set_options(nn_options);
 
@@ -1204,7 +1224,9 @@ namespace Learner
 
         learn_think.verbose = verbose;
 
-        cout << "init done." << endl;
+        out << "Finished initialization." << endl;
+
+        out.unlock();
 
         // Start learning.
         learn_think.learn(epochs);
diff --git a/src/learn/sfen_reader.h b/src/learn/sfen_reader.h
index 3547b6bb..512f1165 100644
--- a/src/learn/sfen_reader.h
+++ b/src/learn/sfen_reader.h
@@ -221,7 +221,7 @@ namespace Learner{
                         // in case the file is empty or was deleted.
                         if (sfen_input_stream->eof())
                         {
-                            out << "==> File empty, nothing to read.\n";
+                            out << "  - File empty, nothing to read.\n";
                         }
                         else
                         {
diff --git a/src/misc.h b/src/misc.h
index 4c99cc2b..3e6dc5b0 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -78,27 +78,23 @@ std::ostream& operator<<(std::ostream&, SyncCout);
 // current region releases the lock.
 struct SynchronizedRegionLogger
 {
-private:
   using RegionId = std::uint64_t;
 
-  struct RegionLock
+  struct Region
   {
-    RegionLock(SynchronizedRegionLogger& log, RegionId id) :
-      logger(&log), region_id(id), is_held(true)
-    {
-    }
+    friend struct SynchronizedRegionLogger;
 
-    RegionLock(const RegionLock&) = delete;
-    RegionLock& operator=(const RegionLock&) = delete;
+    Region(const Region&) = delete;
+    Region& operator=(const Region&) = delete;
 
-    RegionLock(RegionLock&& other) :
+    Region(Region&& other) :
       logger(other.logger), region_id(other.region_id), is_held(other.is_held)
     {
       other.logger = nullptr;
       other.is_held = false;
     }
 
-    RegionLock& operator=(RegionLock&& other) {
+    Region& operator=(Region&& other) {
       if (is_held && logger != nullptr)
       {
         logger->release_region(region_id);
@@ -113,7 +109,7 @@ private:
       return *this;
     }
 
-    ~RegionLock() { unlock(); }
+    ~Region() { unlock(); }
 
     void unlock() {
       if (is_held) {
@@ -124,7 +120,7 @@ private:
       }
     }
 
-    RegionLock& operator << (std::ostream&(*pManip)(std::ostream&)) {
+    Region& operator << (std::ostream&(*pManip)(std::ostream&)) {
       if (logger != nullptr)
         logger->write(region_id, pManip);
 
@@ -132,7 +128,7 @@ private:
     }
 
     template <typename T>
-    RegionLock& operator << (const T& value) {
+    Region& operator << (const T& value) {
       if (logger != nullptr)
         logger->write(region_id, value);
 
@@ -143,11 +139,17 @@ private:
     SynchronizedRegionLogger* logger;
     RegionId region_id;
     bool is_held;
+
+    Region(SynchronizedRegionLogger& log, RegionId id) :
+      logger(&log), region_id(id), is_held(true)
+    {
+    }
   };
 
-  struct Region
+private:
+  struct RegionBookkeeping
   {
-    Region(RegionId rid) : id(rid), is_held(true) {}
+    RegionBookkeeping(RegionId rid) : id(rid), is_held(true) {}
 
     std::vector<std::string> pending_parts;
     RegionId id;
@@ -215,16 +217,16 @@ private:
 
   std::ostream& out;
 
-  std::deque<Region> regions;
+  std::deque<RegionBookkeeping> regions;
 
   std::mutex mutex;
 
-  Region* find_region_nolock(RegionId id) {
+  RegionBookkeeping* find_region_nolock(RegionId id) {
     // Linear search because the amount of concurrent regions should be small.
     auto it = std::find_if(
       regions.begin(),
       regions.end(),
-      [id](const Region& r) { return r.id == id; });
+      [id](const RegionBookkeeping& r) { return r.id == id; });
 
     if (it == regions.end())
       return nullptr;
@@ -269,9 +271,9 @@ public:
   {
   }
 
-  [[nodiscard]] RegionLock new_region() {
+  [[nodiscard]] Region new_region() {
     const auto id = init_next_region();
-    return RegionLock(*this, id);
+    return Region(*this, id);
   }
 
 };
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 0151b3f8..7a72ea19 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -54,23 +54,28 @@ namespace Eval::NNUE {
     }  // namespace
 
     // Initialize learning
-    void initialize_training(const std::string& seed) {
-        std::cout << "Initializing NN training for "
-                  << get_architecture_string() << std::endl;
+    void initialize_training(
+        const std::string& seed,
+        SynchronizedRegionLogger::Region& out) {
 
-        std::cout << std::endl;
+        out << "INFO (initialize_training): Initializing NN training for "
+            << get_architecture_string() << std::endl;
 
-        std::cout << "Layers:\n"
-                  << get_layers_info() << std::endl;
+        out << std::endl;
 
-        std::cout << std::endl;
+        out << "Layers:\n"
+            << get_layers_info() << std::endl;
+
+        out << std::endl;
 
         assert(feature_transformer);
         assert(network);
+
         trainer = Trainer<Network>::create(network.get(), feature_transformer.get());
         rng.seed(PRNG(seed).rand<uint64_t>());
 
         if (Options["SkipLoadingEval"]) {
+            out << "INFO (initialize_training): Performing random net initialization.\n";
             trainer->initialize(rng);
         }
     }
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index 03a23c83..91d2aa99 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -3,11 +3,15 @@
 
 #include "learn/learn.h"
 
+#include "misc.h"
+
 // Interface used for learning NNUE evaluation function
 namespace Eval::NNUE {
 
     // Initialize learning
-    void initialize_training(const std::string& seed);
+    void initialize_training(
+        const std::string& seed,
+        SynchronizedRegionLogger::Region& out);
 
     // set the number of samples in the mini-batch
     void set_batch_size(uint64_t size);
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index e734580e..1227efff 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -82,9 +82,9 @@ namespace Eval::NNUE::Layers {
 
         static std::string get_layers_info() {
             std::string info = PreviousLayer::get_layers_info();
-            info += '\n';
+            info += "\n  - ";
             info += std::to_string(kLayerIndex);
-            info += ": ";
+            info += " - ";
             info += get_name();
             return info;
         }
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h
index 5fbd66cc..40185b13 100644
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -76,9 +76,9 @@ namespace Eval::NNUE::Layers {
 
         static std::string get_layers_info() {
             std::string info = PreviousLayer::get_layers_info();
-            info += '\n';
+            info += "\n  - ";
             info += std::to_string(kLayerIndex);
-            info += ": ";
+            info += " - ";
             info += get_name();
             return info;
         }
diff --git a/src/nnue/layers/input_slice.h b/src/nnue/layers/input_slice.h
index 56c738af..3dc613b9 100644
--- a/src/nnue/layers/input_slice.h
+++ b/src/nnue/layers/input_slice.h
@@ -65,8 +65,9 @@ namespace Eval::NNUE::Layers {
         }
 
         static std::string get_layers_info() {
-            std::string info = std::to_string(kLayerIndex);
-            info += ": ";
+            std::string info = "  - ";
+            info += std::to_string(kLayerIndex);
+            info += " - ";
             info += get_name();
             return info;
         }
diff --git a/src/nnue/layers/sum.h b/src/nnue/layers/sum.h
index 0f71bd61..261dbee1 100644
--- a/src/nnue/layers/sum.h
+++ b/src/nnue/layers/sum.h
@@ -60,9 +60,9 @@ namespace Eval::NNUE::Layers {
 
         static std::string get_layers_info() {
             std::string info = Tail::get_layers_info();
-            info += '\n';
+            info += "\n  - ";
             info += std::to_string(kLayerIndex);
-            info += ": ";
+            info += " - ";
             info += get_name();
             return info;
         }
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 3e18e68a..2089ab1c 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -130,8 +130,9 @@ namespace Eval::NNUE {
         }
 
         static std::string get_layers_info() {
-            std::string info = std::to_string(kLayerIndex);
-            info += ": ";
+            std::string info = "  - ";
+            info += std::to_string(kLayerIndex);
+            info += " - ";
             info += get_name();
             return info;
         }
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index 49b715db..f9bbd833 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -107,7 +107,7 @@ namespace Eval::NNUE {
                 << " - " << LayerType::get_name()
                 << std::endl;
 
-            out << "==> largest min activation = " << largest_min_activation
+            out << "  - largest min activation = " << largest_min_activation
                 << " , smallest max activation = " << smallest_max_activation
                 << std::endl;
 
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 34c423b4..ffde6eba 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -347,17 +347,17 @@ namespace Eval::NNUE {
                 << " - " << LayerType::get_name()
                 << std::endl;
 
-            out << "==> observed " << observed_features.count()
+            out << "  - observed " << observed_features.count()
                 << " (out of " << kInputDimensions << ") features"
                 << std::endl;
 
-            out << "==> (min, max) of pre-activations = "
+            out << "  - (min, max) of pre-activations = "
                 << min_pre_activation_ << ", "
                 << max_pre_activation_ << " (limit = "
                 << kPreActivationLimit << ")"
                 << std::endl;
 
-            out << "==> largest min activation = " << largest_min_activation
+            out << "  - largest min activation = " << largest_min_activation
                 << " , smallest max activation = " << smallest_max_activation
                 << std::endl;
 

From fe766f4f4298ac8cfa62be340f3c94f45f1cd365 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 16:51:31 +0200
Subject: [PATCH 386/583] Additional output from layers during training.

---
 src/nnue/trainer/trainer_affine_transform.h   | 52 ++++++++++++++++---
 src/nnue/trainer/trainer_clipped_relu.h       | 24 ++++++---
 .../trainer/trainer_feature_transformer.h     | 40 +++++++++-----
 3 files changed, 88 insertions(+), 28 deletions(-)

diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index f6d374ef..21e54f18 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -48,6 +48,10 @@ namespace Eval::NNUE {
             if (receive_message("quantize_parameters", message)) {
                 quantize_parameters();
             }
+
+            if (receive_message("check_health", message)) {
+                check_health();
+            }
         }
 
         // Initialize the parameters with random numbers
@@ -145,16 +149,11 @@ namespace Eval::NNUE {
                           &gradients[batch_offset], 1, biases_diff_, 1);
             }
 
-            cblas_saxpy(kOutputDimensions, -local_learning_rate,
-                        biases_diff_, 1, biases_, 1);
-
             cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
                         kOutputDimensions, kInputDimensions, batch_size_, 1.0,
                         gradients, kOutputDimensions,
                         batch_input_, kInputDimensions,
                         momentum_, weights_diff_, kInputDimensions);
-            cblas_saxpy(kOutputDimensions * kInputDimensions, -local_learning_rate,
-                        weights_diff_, 1, weights_, 1);
 
 #else
             // backpropagate
@@ -196,16 +195,22 @@ namespace Eval::NNUE {
                     }
                 }
             }
+#endif
 
             for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                biases_[i] -= local_learning_rate * biases_diff_[i];
+                const double d = local_learning_rate * biases_diff_[i];
+                biases_[i] -= d;
+                abs_biases_diff_sum_ += std::abs(d);
             }
+            num_biases_diffs_ += kOutputDimensions;
 
             for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-                weights_[i] -= local_learning_rate * weights_diff_[i];
+                const double d = local_learning_rate * weights_diff_[i];
+                weights_[i] -= d;
+                abs_weights_diff_sum_ += std::abs(d);
             }
+            num_weights_diffs_ += kOutputDimensions * kInputDimensions;
 
-#endif
             previous_layer_trainer_->backpropagate(gradients_.data(), learning_rate);
         }
 
@@ -227,6 +232,30 @@ namespace Eval::NNUE {
             dequantize_parameters();
         }
 
+        void reset_stats() {
+            abs_biases_diff_sum_ = 0.0;
+            abs_weights_diff_sum_ = 0.0;
+            num_biases_diffs_ = 0;
+            num_weights_diffs_ = 0;
+        }
+
+        void check_health() {
+
+            auto out = sync_region_cout.new_region();
+
+            out << "INFO (check_health):"
+                << " layer " << LayerType::kLayerIndex
+                << " - " << LayerType::get_name()
+                << std::endl;
+
+            out << "  - avg_abs_bias_diff   = " << abs_biases_diff_sum_ / num_biases_diffs_ << std::endl;
+            out << "  - avg_abs_weight_diff = " << abs_weights_diff_sum_ / num_weights_diffs_ << std::endl;
+
+            out.unlock();
+
+            reset_stats();
+        }
+
         // Weight saturation and parameterization
         void quantize_parameters() {
             for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
@@ -270,6 +299,8 @@ namespace Eval::NNUE {
                       static_cast<LearnFloatType>(0.0));
             std::fill(std::begin(weights_diff_), std::end(weights_diff_),
                       static_cast<LearnFloatType>(0.0));
+
+            reset_stats();
         }
 
         // number of input/output dimensions
@@ -296,6 +327,11 @@ namespace Eval::NNUE {
         // number of samples in mini-batch
         IndexType batch_size_;
 
+        double abs_biases_diff_sum_;
+        double abs_weights_diff_sum_;
+        uint64_t num_biases_diffs_;
+        uint64_t num_weights_diffs_;
+
         // Input mini batch
         const LearnFloatType* batch_input_;
 
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index f9bbd833..57e9bac4 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -70,10 +70,12 @@ namespace Eval::NNUE {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType i = 0; i < kOutputDimensions; ++i) {
                     const IndexType index = batch_offset + i;
-                    gradients_[index] = gradients[index] *
-                        (output_[index] > kZero) * (output_[index] < kOne);
+                    const bool clipped = (output_[index] <= kZero) | (output_[index] >= kOne);
+                    gradients_[index] = gradients[index] * !clipped;
+                    num_clipped_ += clipped;
                 }
             }
+            num_total_ += batch_size_ * kOutputDimensions;
 
             previous_layer_trainer_->backpropagate(gradients_.data(), learning_rate);
         }
@@ -86,10 +88,17 @@ namespace Eval::NNUE {
                 &target_layer->previous_layer_, ft)),
             target_layer_(target_layer) {
 
+            reset_stats();
+        }
+
+        void reset_stats() {
             std::fill(std::begin(min_activations_), std::end(min_activations_),
                       std::numeric_limits<LearnFloatType>::max());
             std::fill(std::begin(max_activations_), std::end(max_activations_),
                       std::numeric_limits<LearnFloatType>::lowest());
+
+            num_clipped_ = 0;
+            num_total_ = 0;
         }
 
         // Check if there are any problems with learning
@@ -111,12 +120,12 @@ namespace Eval::NNUE {
                 << " , smallest max activation = " << smallest_max_activation
                 << std::endl;
 
+            out << "  - clipped " << static_cast<double>(num_clipped_) / num_total_ * 100.0 << "% of outputs"
+                << std::endl;
+
             out.unlock();
 
-            std::fill(std::begin(min_activations_), std::end(min_activations_),
-                      std::numeric_limits<LearnFloatType>::max());
-            std::fill(std::begin(max_activations_), std::end(max_activations_),
-                      std::numeric_limits<LearnFloatType>::lowest());
+            reset_stats();
         }
 
         // number of input/output dimensions
@@ -130,6 +139,9 @@ namespace Eval::NNUE {
         // number of samples in mini-batch
         IndexType batch_size_;
 
+        IndexType num_clipped_;
+        IndexType num_total_;
+
         // Trainer of the previous layer
         const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
 
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index ffde6eba..869ceb85 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -153,10 +153,12 @@ namespace Eval::NNUE {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType i = 0; i < kOutputDimensions; ++i) {
                     const IndexType index = batch_offset + i;
-                    gradients_[index] = gradients[index] *
-                        ((output_[index] > kZero) * (output_[index] < kOne));
+                    const bool clipped = (output_[index] <= kZero) | (output_[index] >= kOne);
+                    gradients_[index] = gradients[index] * !clipped;
+                    num_clipped_ += clipped;
                 }
             }
+            num_total_ += batch_->size() * kOutputDimensions;
 
             // Since the weight matrix updates only the columns corresponding to the features that appeared in the input,
             // Correct the learning rate and adjust the scale without using momentum
@@ -261,14 +263,6 @@ namespace Eval::NNUE {
             momentum_(0.2),
             learning_rate_scale_(1.0) {
 
-            min_pre_activation_ = std::numeric_limits<LearnFloatType>::max();
-            max_pre_activation_ = std::numeric_limits<LearnFloatType>::lowest();
-
-            std::fill(std::begin(min_activations_), std::end(min_activations_),
-                      std::numeric_limits<LearnFloatType>::max());
-            std::fill(std::begin(max_activations_), std::end(max_activations_),
-                      std::numeric_limits<LearnFloatType>::lowest());
-
             dequantize_parameters();
         }
 
@@ -299,6 +293,19 @@ namespace Eval::NNUE {
             }
         }
 
+        void reset_stats() {
+            min_pre_activation_ = std::numeric_limits<LearnFloatType>::max();
+            max_pre_activation_ = std::numeric_limits<LearnFloatType>::lowest();
+
+            std::fill(std::begin(min_activations_), std::end(min_activations_),
+                      std::numeric_limits<LearnFloatType>::max());
+            std::fill(std::begin(max_activations_), std::end(max_activations_),
+                      std::numeric_limits<LearnFloatType>::lowest());
+
+            num_clipped_ = 0;
+            num_total_ = 0;
+        }
+
         // read parameterized integer
         void dequantize_parameters() {
             for (IndexType i = 0; i < kHalfDimensions; ++i) {
@@ -314,6 +321,8 @@ namespace Eval::NNUE {
             }
 
             std::fill(std::begin(biases_diff_), std::end(biases_diff_), +kZero);
+
+            reset_stats();
         }
 
         // Set the weight corresponding to the feature that does not appear in the learning data to 0
@@ -361,12 +370,12 @@ namespace Eval::NNUE {
                 << " , smallest max activation = " << smallest_max_activation
                 << std::endl;
 
+            out << "  - clipped " << static_cast<double>(num_clipped_) / num_total_ * 100.0 << "% of outputs"
+                << std::endl;
+
             out.unlock();
 
-            std::fill(std::begin(min_activations_), std::end(min_activations_),
-                      std::numeric_limits<LearnFloatType>::max());
-            std::fill(std::begin(max_activations_), std::end(max_activations_),
-                      std::numeric_limits<LearnFloatType>::lowest());
+            reset_stats();
         }
 
         // number of input/output dimensions
@@ -391,6 +400,9 @@ namespace Eval::NNUE {
         // layer to learn
         LayerType* const target_layer_;
 
+        IndexType num_clipped_;
+        IndexType num_total_;
+
         // parameter
         alignas(kCacheLineSize) LearnFloatType biases_[kHalfDimensions];
         alignas(kCacheLineSize)

From 0e528995c279c773f5e6e5903bc4631586d8d27c Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 20:03:40 +0200
Subject: [PATCH 387/583] Print avg bias/weight for affine trasform and feature
 transformer during training.

---
 src/nnue/trainer/trainer_affine_transform.h    | 11 +++++++++++
 src/nnue/trainer/trainer_feature_transformer.h | 12 ++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index 21e54f18..3179aeb0 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -241,6 +241,15 @@ namespace Eval::NNUE {
 
         void check_health() {
 
+            double abs_bias_sum = 0.0;
+            double abs_weight_sum = 0.0;
+
+            for(auto b : biases_)
+                abs_bias_sum += std::abs(b);
+
+            for(auto w : weights_)
+                abs_weight_sum += std::abs(w);
+
             auto out = sync_region_cout.new_region();
 
             out << "INFO (check_health):"
@@ -248,7 +257,9 @@ namespace Eval::NNUE {
                 << " - " << LayerType::get_name()
                 << std::endl;
 
+            out << "  - avg_abs_bias        = " << abs_bias_sum / std::size(biases_) << std::endl;
             out << "  - avg_abs_bias_diff   = " << abs_biases_diff_sum_ / num_biases_diffs_ << std::endl;
+            out << "  - avg_abs_weight      = " << abs_weight_sum / std::size(weights_) << std::endl;
             out << "  - avg_abs_weight_diff = " << abs_weights_diff_sum_ / num_weights_diffs_ << std::endl;
 
             out.unlock();
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 869ceb85..97b19c46 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -349,6 +349,15 @@ namespace Eval::NNUE {
             const auto smallest_max_activation = *std::min_element(
                 std::begin(max_activations_), std::end(max_activations_));
 
+            double abs_bias_sum = 0.0;
+            double abs_weight_sum = 0.0;
+
+            for(auto b : biases_)
+                abs_bias_sum += std::abs(b);
+
+            for(auto w : weights_)
+                abs_weight_sum += std::abs(w);
+
             auto out = sync_region_cout.new_region();
 
             out << "INFO (check_health):"
@@ -370,6 +379,9 @@ namespace Eval::NNUE {
                 << " , smallest max activation = " << smallest_max_activation
                 << std::endl;
 
+            out << "  - avg_abs_bias   = " << abs_bias_sum / std::size(biases_) << std::endl;
+            out << "  - avg_abs_weight = " << abs_weight_sum / std::size(weights_) << std::endl;
+
             out << "  - clipped " << static_cast<double>(num_clipped_) / num_total_ * 100.0 << "% of outputs"
                 << std::endl;
 

From af238fe132778621125f8406052f1b55f7a6e13b Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 21:18:06 +0200
Subject: [PATCH 388/583] Rewrite gensfen to use stockfish's thread pool.

---
 src/learn/gensfen.cpp | 180 ++++++++++++++++++++----------------------
 1 file changed, 84 insertions(+), 96 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 22fddafb..b2325e40 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -1,7 +1,6 @@
 ﻿#include "gensfen.h"
 
 #include "packed_sfen.h"
-#include "multi_think.h"
 #include "sfen_stream.h"
 
 #include "misc.h"
@@ -261,7 +260,7 @@ namespace Learner
     // -----------------------------------
 
     // Class to generate sfen with multiple threads
-    struct MultiThinkGenSfen : public MultiThink
+    struct MultiThinkGenSfen
     {
         // Hash to limit the export of identical sfens
         static constexpr uint64_t GENSFEN_HASH_SIZE = 64 * 1024 * 1024;
@@ -269,7 +268,7 @@ namespace Learner
         static_assert((GENSFEN_HASH_SIZE& (GENSFEN_HASH_SIZE - 1)) == 0);
 
         MultiThinkGenSfen(int search_depth_min_, int search_depth_max_, SfenWriter& sw_, const std::string& seed) :
-            MultiThink(seed),
+            prng(seed),
             search_depth_min(search_depth_min_),
             search_depth_max(search_depth_max_),
             sfen_writer(sw_)
@@ -285,7 +284,9 @@ namespace Learner
             sfen_writer.start_file_write_worker();
         }
 
-        void thread_worker(size_t thread_id) override;
+        void gensfen(uint64_t limit);
+
+        void thread_worker(Thread& th, std::atomic<uint64_t>& counter, uint64_t limit);
 
         optional<int8_t> get_current_game_result(
             Position& pos,
@@ -293,7 +294,14 @@ namespace Learner
 
         vector<uint8_t> generate_random_move_flags();
 
-        bool commit_psv(PSVector& a_psv, size_t thread_id, int8_t lastTurnIsWin);
+        bool was_seen_before(const Position& pos);
+
+        bool commit_psv(
+            Thread& th,
+            PSVector& sfens,
+            int8_t lastTurnIsWin,
+            std::atomic<uint64_t>& counter,
+            uint64_t limit);
 
         optional<Move> choose_random_move(
             Position& pos,
@@ -301,6 +309,8 @@ namespace Learner
             int ply,
             int& random_move_c);
 
+        PRNG prng;
+
         // Min and max depths for search during gensfen
         int search_depth_min;
         int search_depth_max;
@@ -347,6 +357,15 @@ namespace Learner
         vector<Key> hash; // 64MB*sizeof(HASH_KEY) = 512MB
     };
 
+    void MultiThinkGenSfen::gensfen(uint64_t limit)
+    {
+        std::atomic<uint64_t> counter{0};
+        Threads.execute_with_workers([&counter, limit, this](Thread& th) {
+            thread_worker(th, counter, limit);
+        });
+        Threads.wait_for_workers_finished();
+    }
+
     optional<int8_t> MultiThinkGenSfen::get_current_game_result(
         Position& pos,
         const vector<int>& move_hist_scores) const
@@ -470,7 +489,12 @@ namespace Learner
     // 1 when winning. -1 when losing. Pass 0 for a draw.
     // Return value: true if the specified number of
     // sfens has already been reached and the process ends.
-    bool MultiThinkGenSfen::commit_psv(PSVector& sfens, size_t thread_id, int8_t lastTurnIsWin)
+    bool MultiThinkGenSfen::commit_psv(
+        Thread& th,
+        PSVector& sfens,
+        int8_t lastTurnIsWin,
+        std::atomic<uint64_t>& counter,
+        uint64_t limit)
     {
         if (!write_out_draw_game_in_training_data_generation && lastTurnIsWin == 0)
         {
@@ -482,34 +506,26 @@ namespace Learner
 
         // From the final stage (one step before) to the first stage, give information on the outcome of the game for each stage.
         // The phases stored in sfens are assumed to be continuous (in order).
-        bool quit = false;
-        int num_sfens_to_commit = 0;
         for (auto it = sfens.rbegin(); it != sfens.rend(); ++it)
         {
             // If is_win == 0 (draw), multiply by -1 and it will remain 0 (draw)
             is_win = -is_win;
             it->game_result = is_win;
-
-            // See how many sfens were already written and get the next id.
-            // Exit if requested number of sfens reached.
-            auto now_loop_count = get_next_loop_count();
-            if (now_loop_count == LOOP_COUNT_FINISHED)
-            {
-                quit = true;
-                break;
-            }
-
-            ++num_sfens_to_commit;
         }
 
         // Write sfens in move order to make potential compression easier
-        for (auto it = sfens.end() - num_sfens_to_commit; it != sfens.end(); ++it)
+        for (auto& sfen : sfens)
         {
+            // Return true if there is already enough data generated.
+            const auto iter = counter.fetch_add(1);
+            if (iter >= limit)
+                return true;
+
             // Write out one sfen.
-            sfen_writer.write(thread_id, *it);
+            sfen_writer.write(th.thread_idx(), sfen);
         }
 
-        return quit;
+        return false;
     }
 
     optional<Move> MultiThinkGenSfen::choose_random_move(
@@ -640,8 +656,29 @@ namespace Learner
         return random_move_flag;
     }
 
+    bool MultiThinkGenSfen::was_seen_before(const Position& pos)
+    {
+        // Look into the position hashtable to see if the same
+        // position was seen before.
+        // This is a good heuristic to exlude already seen
+        // positions without many false positives.
+        auto key = pos.key();
+        auto hash_index = (size_t)(key & (GENSFEN_HASH_SIZE - 1));
+        auto old_key = hash[hash_index];
+        if (key == old_key)
+        {
+            return true;
+        }
+        else
+        {
+            // Replace with the current key.
+            hash[hash_index] = key;
+            return false;
+        }
+    }
+
     // thread_id = 0..Threads.size()-1
-    void MultiThinkGenSfen::thread_worker(size_t thread_id)
+    void MultiThinkGenSfen::thread_worker(Thread& th, std::atomic<uint64_t>& counter, uint64_t limit)
     {
         // For the time being, it will be treated as a draw
         // at the maximum number of steps to write.
@@ -660,10 +697,8 @@ namespace Learner
             // It is necessary to set a dependent thread for Position.
             // When parallelizing, Threads (since this is a vector<Thread*>,
             // Do the same for up to Threads[0]...Threads[thread_num-1].
-            auto th = Threads[thread_id];
-
-            auto& pos = th->rootPos;
-            pos.set(StartFEN, false, &si, th);
+            auto& pos = th.rootPos;
+            pos.set(StartFEN, false, &si, &th);
 
             int resign_counter = 0;
             bool should_resign = prng.rand(10) > 1;
@@ -684,13 +719,11 @@ namespace Learner
             vector<int> move_hist_scores;
 
             auto flush_psv = [&](int8_t result) {
-                quit = commit_psv(a_psv, thread_id, result);
+                quit = commit_psv(th, a_psv, result, counter, limit);
             };
 
             for (int ply = 0; ; ++ply)
             {
-                Move next_move = MOVE_NONE;
-
                 // Current search depth
                 const int depth = search_depth_min + (int)prng.rand(search_depth_max - search_depth_min + 1);
 
@@ -715,18 +748,17 @@ namespace Learner
                         flush_psv((search_value >= eval_limit) ? 1 : -1);
                         break;
                     }
-                } else {
+                }
+                else
+                {
                     resign_counter = 0;
                 }
-                // Verification of a strange move
-                if (search_pv.size() > 0
-                    && (search_pv[0] == MOVE_NONE || search_pv[0] == MOVE_NULL))
+
+                // In case there is no PV and the game was not ended here
+                // there is nothing we can do, we can't continue the game,
+                // we don't know the result, so discard this game.
+                if (search_pv.empty())
                 {
-                    // (???)
-                    // MOVE_WIN is checking if it is the declaration victory stage before this
-                    // The declarative winning move should never come back here.
-                    // Also, when MOVE_RESIGN, search_value is a one-stop score, which should be the minimum value of eval_limit (-31998)...
-                    cout << "Error! : " << pos.fen() << next_move << search_value << endl;
                     break;
                 }
 
@@ -736,34 +768,10 @@ namespace Learner
                 // Discard stuff before write_minply is reached
                 // because it can harm training due to overfitting.
                 // Initial positions would be too common.
-                if (ply < write_minply - 1)
-                {
-                    a_psv.clear();
-                    goto SKIP_SAVE;
-                }
-
-                // Look into the position hashtable to see if the same
-                // position was seen before.
-                // This is a good heuristic to exlude already seen
-                // positions without many false positives.
-                {
-                    auto key = pos.key();
-                    auto hash_index = (size_t)(key & (GENSFEN_HASH_SIZE - 1));
-                    auto old_key = hash[hash_index];
-                    if (key == old_key)
-                    {
-                        goto SKIP_SAVE;
-                    }
-                    else
-                    {
-                        // Replace with the current key.
-                        hash[hash_index] = key;
-                    }
-                }
-
-                // Pack the current position into a packed sfen and save it into the buffer.
+                if (ply >= write_minply && !was_seen_before(pos))
                 {
                     a_psv.emplace_back(PackedSfenValue());
+
                     auto& psv = a_psv.back();
 
                     // Here we only write the position data.
@@ -771,48 +779,29 @@ namespace Learner
                     pos.sfen_pack(psv.sfen);
 
                     psv.score = search_value;
-
                     psv.gamePly = ply;
-
-                    // Take out the first PV move. This should be present unless depth 0.
-                    assert(search_pv.size() >= 1);
                     psv.move = search_pv[0];
                 }
 
-            SKIP_SAVE:;
+                // Update the next move according to best search result or random move.
+                auto random_move = choose_random_move(pos, random_move_flag, ply, actual_random_move_count);
+                const Move next_move = random_move.has_value() ? *random_move : search_pv[0];
 
-                // For some reason, We could not get PV (hit the substitution table etc. and got stuck?)
-                // so go to the next game. It's a rare case, so you can ignore it.
-                if (search_pv.size() == 0)
+                // We don't have the whole game yet, but it ended,
+                // so the writing process ends and the next game starts.
+                // This shouldn't really happen.
+                if (!is_ok(next_move))
                 {
                     break;
                 }
 
-                // Update the next move according to best search result.
-                next_move = search_pv[0];
-
-                // Random move.
-                auto random_move = choose_random_move(pos, random_move_flag, ply, actual_random_move_count);
-                if (random_move.has_value())
-                {
-                    next_move = random_move.value();
-
-                    // We don't have the whole game yet, but it ended,
-                    // so the writing process ends and the next game starts.
-                    if (!is_ok(next_move))
-                    {
-                        break;
-                    }
-                }
-
                 // Do move.
                 pos.do_move(next_move, states[ply]);
 
-            } // for (int ply = 0; ; ++ply)
+            }
+        }
 
-        } // while(!quit)
-
-        sfen_writer.finalize(thread_id);
+        sfen_writer.finalize(th.thread_idx());
     }
 
     // -----------------------------------
@@ -1029,7 +1018,6 @@ namespace Learner
 
             MultiThinkGenSfen multi_think(search_depth_min, search_depth_max, sfen_writer, seed);
             multi_think.nodes = nodes;
-            multi_think.set_loop_max(loop_max);
             multi_think.eval_limit = eval_limit;
             multi_think.random_move_minply = random_move_minply;
             multi_think.random_move_maxply = random_move_maxply;
@@ -1041,7 +1029,7 @@ namespace Learner
             multi_think.write_minply = write_minply;
             multi_think.write_maxply = write_maxply;
             multi_think.start_file_write_worker();
-            multi_think.go_think();
+            multi_think.gensfen(loop_max);
 
             // Since we are joining with the destructor of SfenWriter, please give a message that it has finished after the join
             // Enclose this in a block because it should be displayed.

From 821b655bc63515effbdebbba277f0a45cc463be3 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 21:51:22 +0200
Subject: [PATCH 389/583] Move gensfen progress reporting from sfen writer to
 gensfen

---
 src/learn/gensfen.cpp | 93 ++++++++++++++++++++++++++++---------------
 1 file changed, 60 insertions(+), 33 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index b2325e40..c661200e 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -134,20 +134,6 @@ namespace Learner
         // Dedicated thread to write to file
         void file_write_worker()
         {
-            auto startTime = now();
-
-            auto output_status = [&]()
-            {
-                // Also output the current time to console.
-                const auto nowTime = now();
-                const TimePoint elapsed = nowTime - startTime + 1;
-
-                sync_cout << endl
-                    << sfen_write_count << " sfens, "
-                    << sfen_write_count * 1000 / elapsed << " sfens/second, "
-                    << "at " << now_string() << sync_endl;
-            };
-
             while (!finished || sfen_buffers_pool.size())
             {
                 vector<std::unique_ptr<PSVector>> buffers;
@@ -190,28 +176,9 @@ namespace Learner
                             output_file_stream = create_new_sfen_output(new_filename, sfen_output_type);
                             cout << endl << "output sfen file = " << new_filename << endl;
                         }
-
-                        // Output '.' every time when writing a game record.
-                        std::cout << ".";
-
-                        // Output the number of phases processed
-                        // every STATUS_OUTPUT_PERIOD times
-                        // Finally, the remainder of the teacher phase
-                        // of each thread is written out,
-                        // so halfway numbers are displayed, but is it okay?
-                        // If you overuse the threads to the maximum number
-                        // of logical cores, the console will be clogged,
-                        // so it may be beneficial to increase that value.
-                        if ((++batch_counter % STATUS_OUTPUT_PERIOD) == 0)
-                        {
-                            output_status();
-                        }
                     }
                 }
             }
-
-            // Output the status again after whole processing is done.
-            output_status();
         }
 
         void set_save_interval(uint64_t v)
@@ -267,6 +234,10 @@ namespace Learner
         // It must be 2**N because it will be used as the mask to calculate hash_index.
         static_assert((GENSFEN_HASH_SIZE& (GENSFEN_HASH_SIZE - 1)) == 0);
 
+        static constexpr uint64_t REPORT_DOT_EVERY = 5000;
+        static constexpr uint64_t REPORT_STATS_EVERY = 200000;
+        static_assert(REPORT_STATS_EVERY % REPORT_DOT_EVERY == 0);
+
         MultiThinkGenSfen(int search_depth_min_, int search_depth_max_, SfenWriter& sw_, const std::string& seed) :
             prng(seed),
             search_depth_min(search_depth_min_),
@@ -296,6 +267,10 @@ namespace Learner
 
         bool was_seen_before(const Position& pos);
 
+        void report(uint64_t done, uint64_t new_done);
+
+        void maybe_report(uint64_t done);
+
         bool commit_psv(
             Thread& th,
             PSVector& sfens,
@@ -351,6 +326,9 @@ namespace Learner
         int write_minply;
         int write_maxply;
 
+        std::mutex stats_mutex;
+        TimePoint last_stats_report_time;
+
         // sfen exporter
         SfenWriter& sfen_writer;
 
@@ -359,11 +337,20 @@ namespace Learner
 
     void MultiThinkGenSfen::gensfen(uint64_t limit)
     {
+        last_stats_report_time = 0;
+
         std::atomic<uint64_t> counter{0};
         Threads.execute_with_workers([&counter, limit, this](Thread& th) {
             thread_worker(th, counter, limit);
         });
         Threads.wait_for_workers_finished();
+
+        if (limit % REPORT_STATS_EVERY != 0)
+        {
+            report(limit, limit % REPORT_STATS_EVERY);
+        }
+
+        std::cout << std::endl;
     }
 
     optional<int8_t> MultiThinkGenSfen::get_current_game_result(
@@ -484,6 +471,43 @@ namespace Learner
         return nullopt;
     }
 
+    void MultiThinkGenSfen::report(uint64_t done, uint64_t new_done)
+    {
+        const auto now_time = now();
+        const TimePoint elapsed = now_time - last_stats_report_time + 1;
+
+        sync_cout
+            << endl
+            << done << " sfens, "
+            << new_done * 1000 / elapsed << " sfens/second, "
+            << "at " << now_string() << sync_endl;
+
+        last_stats_report_time = now_time;
+    }
+
+    void MultiThinkGenSfen::maybe_report(uint64_t done)
+    {
+        if (done % REPORT_DOT_EVERY == 0)
+        {
+            std::lock_guard lock(stats_mutex);
+
+            if (last_stats_report_time == 0)
+            {
+                last_stats_report_time = now();
+            }
+
+            if (done != 0)
+            {
+                std::cout << '.';
+
+                if (done % REPORT_STATS_EVERY == 0)
+                {
+                    report(done, REPORT_STATS_EVERY);
+                }
+            }
+        }
+    }
+
     // Write out the phases loaded in sfens to a file.
     // lastTurnIsWin: win/loss in the next phase after the final phase in sfens
     // 1 when winning. -1 when losing. Pass 0 for a draw.
@@ -521,6 +545,9 @@ namespace Learner
             if (iter >= limit)
                 return true;
 
+            // because `iter` was done, now we do one more
+            maybe_report(iter + 1);
+
             // Write out one sfen.
             sfen_writer.write(th.thread_idx(), sfen);
         }

From 3f289546da73b96f48c913de539088cde9d64a65 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 21:53:16 +0200
Subject: [PATCH 390/583] Make some gensfen members private.

---
 src/learn/gensfen.cpp | 67 ++++++++++++++++++++++++-------------------
 1 file changed, 37 insertions(+), 30 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index c661200e..08b9c3d9 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -257,35 +257,6 @@ namespace Learner
 
         void gensfen(uint64_t limit);
 
-        void thread_worker(Thread& th, std::atomic<uint64_t>& counter, uint64_t limit);
-
-        optional<int8_t> get_current_game_result(
-            Position& pos,
-            const vector<int>& move_hist_scores) const;
-
-        vector<uint8_t> generate_random_move_flags();
-
-        bool was_seen_before(const Position& pos);
-
-        void report(uint64_t done, uint64_t new_done);
-
-        void maybe_report(uint64_t done);
-
-        bool commit_psv(
-            Thread& th,
-            PSVector& sfens,
-            int8_t lastTurnIsWin,
-            std::atomic<uint64_t>& counter,
-            uint64_t limit);
-
-        optional<Move> choose_random_move(
-            Position& pos,
-            std::vector<uint8_t>& random_move_flag,
-            int ply,
-            int& random_move_c);
-
-        PRNG prng;
-
         // Min and max depths for search during gensfen
         int search_depth_min;
         int search_depth_max;
@@ -326,6 +297,9 @@ namespace Learner
         int write_minply;
         int write_maxply;
 
+    private:
+        PRNG prng;
+
         std::mutex stats_mutex;
         TimePoint last_stats_report_time;
 
@@ -333,6 +307,36 @@ namespace Learner
         SfenWriter& sfen_writer;
 
         vector<Key> hash; // 64MB*sizeof(HASH_KEY) = 512MB
+
+        void gensfen_worker(
+            Thread& th,
+            std::atomic<uint64_t>& counter,
+            uint64_t limit);
+
+        optional<int8_t> get_current_game_result(
+            Position& pos,
+            const vector<int>& move_hist_scores) const;
+
+        vector<uint8_t> generate_random_move_flags();
+
+        bool was_seen_before(const Position& pos);
+
+        void report(uint64_t done, uint64_t new_done);
+
+        void maybe_report(uint64_t done);
+
+        bool commit_psv(
+            Thread& th,
+            PSVector& sfens,
+            int8_t lastTurnIsWin,
+            std::atomic<uint64_t>& counter,
+            uint64_t limit);
+
+        optional<Move> choose_random_move(
+            Position& pos,
+            std::vector<uint8_t>& random_move_flag,
+            int ply,
+            int& random_move_c);
     };
 
     void MultiThinkGenSfen::gensfen(uint64_t limit)
@@ -705,7 +709,10 @@ namespace Learner
     }
 
     // thread_id = 0..Threads.size()-1
-    void MultiThinkGenSfen::thread_worker(Thread& th, std::atomic<uint64_t>& counter, uint64_t limit)
+    void MultiThinkGenSfen::gensfen_worker(
+        Thread& th,
+        std::atomic<uint64_t>& counter,
+        uint64_t limit)
     {
         // For the time being, it will be treated as a draw
         // at the maximum number of steps to write.

From cb61dc9c9b92c4bfab5f5d3f82d021f4d94b69a2 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 22:00:52 +0200
Subject: [PATCH 391/583] Make sfen writer a part of gensfen.

---
 src/learn/gensfen.cpp | 51 +++++++++++++++----------------------------
 1 file changed, 17 insertions(+), 34 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 08b9c3d9..6f759db3 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -49,20 +49,19 @@ namespace Learner
         // Amount of sfens required to flush the buffer.
         static constexpr size_t SFEN_WRITE_SIZE = 5000;
 
-        // Current status is output after
-        // each (SFEN_WRITE_SIZE * STATUS_OUTPUT_PERIOD) sfens
-        static constexpr uint64_t STATUS_OUTPUT_PERIOD = 40;
-
         // File name to write and number of threads to create
-        SfenWriter(string filename_, int thread_num)
+        SfenWriter(string filename_, int thread_num, uint64_t save_count)
         {
             sfen_buffers_pool.reserve((size_t)thread_num * 10);
             sfen_buffers.resize(thread_num);
 
             output_file_stream = create_new_sfen_output(filename_, sfen_output_type);
             filename = filename_;
+            save_every = save_count;
 
             finished = false;
+
+            file_worker_thread = std::thread([&] { this->file_write_worker(); });
         }
 
         ~SfenWriter()
@@ -125,12 +124,6 @@ namespace Learner
             }
         }
 
-        // Start the write_worker thread.
-        void start_file_write_worker()
-        {
-            file_worker_thread = std::thread([&] { this->file_write_worker(); });
-        }
-
         // Dedicated thread to write to file
         void file_write_worker()
         {
@@ -181,11 +174,6 @@ namespace Learner
             }
         }
 
-        void set_save_interval(uint64_t v)
-        {
-            save_every = v;
-        }
-
     private:
 
         std::unique_ptr<BasicSfenOutputStream> output_file_stream;
@@ -202,9 +190,6 @@ namespace Learner
         // Flag that all threads have finished
         atomic<bool> finished;
 
-        // Counter for time stamp output
-        uint64_t batch_counter = 0;
-
         // buffer before writing to file
         // sfen_buffers is the buffer for each thread
         // sfen_buffers_pool is a buffer for writing.
@@ -238,11 +223,18 @@ namespace Learner
         static constexpr uint64_t REPORT_STATS_EVERY = 200000;
         static_assert(REPORT_STATS_EVERY % REPORT_DOT_EVERY == 0);
 
-        MultiThinkGenSfen(int search_depth_min_, int search_depth_max_, SfenWriter& sw_, const std::string& seed) :
-            prng(seed),
+        MultiThinkGenSfen(
+            int search_depth_min_,
+            int search_depth_max_,
+            std::string output_file_name,
+            int thread_num,
+            uint64_t save_every,
+            const std::string& seed
+        ) :
             search_depth_min(search_depth_min_),
             search_depth_max(search_depth_max_),
-            sfen_writer(sw_)
+            prng(seed),
+            sfen_writer(output_file_name, thread_num, save_every)
         {
             hash.resize(GENSFEN_HASH_SIZE);
 
@@ -250,11 +242,6 @@ namespace Learner
             std::cout << prng << std::endl;
         }
 
-        void start_file_write_worker()
-        {
-            sfen_writer.start_file_write_worker();
-        }
-
         void gensfen(uint64_t limit);
 
         // Min and max depths for search during gensfen
@@ -304,7 +291,7 @@ namespace Learner
         TimePoint last_stats_report_time;
 
         // sfen exporter
-        SfenWriter& sfen_writer;
+        SfenWriter sfen_writer;
 
         vector<Key> hash; // 64MB*sizeof(HASH_KEY) = 512MB
 
@@ -345,7 +332,7 @@ namespace Learner
 
         std::atomic<uint64_t> counter{0};
         Threads.execute_with_workers([&counter, limit, this](Thread& th) {
-            thread_worker(th, counter, limit);
+            gensfen_worker(th, counter, limit);
         });
         Threads.wait_for_workers_finished();
 
@@ -1047,10 +1034,7 @@ namespace Learner
 
         // Create and execute threads as many as Options["Threads"].
         {
-            SfenWriter sfen_writer(output_file_name, thread_num);
-            sfen_writer.set_save_interval(save_every);
-
-            MultiThinkGenSfen multi_think(search_depth_min, search_depth_max, sfen_writer, seed);
+            MultiThinkGenSfen multi_think(search_depth_min, search_depth_max, output_file_name, thread_num, save_every, seed);
             multi_think.nodes = nodes;
             multi_think.eval_limit = eval_limit;
             multi_think.random_move_minply = random_move_minply;
@@ -1062,7 +1046,6 @@ namespace Learner
             multi_think.random_multi_pv_depth = random_multi_pv_depth;
             multi_think.write_minply = write_minply;
             multi_think.write_maxply = write_maxply;
-            multi_think.start_file_write_worker();
             multi_think.gensfen(loop_max);
 
             // Since we are joining with the destructor of SfenWriter, please give a message that it has finished after the join

From 21fac7c53cca6b48ba8e3e1cb913120e5fffdb44 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 22:27:01 +0200
Subject: [PATCH 392/583] A collective struct for gensfen parameters.

---
 src/learn/gensfen.cpp | 381 +++++++++++++++++++-----------------------
 1 file changed, 171 insertions(+), 210 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 6f759db3..d69fcf53 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -37,12 +37,6 @@ using namespace std;
 
 namespace Learner
 {
-    static bool write_out_draw_game_in_training_data_generation = true;
-    static bool detect_draw_by_consecutive_low_score = true;
-    static bool detect_draw_by_insufficient_mating_material = true;
-
-    static SfenOutputType sfen_output_type = SfenOutputType::Bin;
-
     // Helper class for exporting Sfen
     struct SfenWriter
     {
@@ -50,12 +44,13 @@ namespace Learner
         static constexpr size_t SFEN_WRITE_SIZE = 5000;
 
         // File name to write and number of threads to create
-        SfenWriter(string filename_, int thread_num, uint64_t save_count)
+        SfenWriter(string filename_, int thread_num, uint64_t save_count, SfenOutputType sfen_output_type)
         {
             sfen_buffers_pool.reserve((size_t)thread_num * 10);
             sfen_buffers.resize(thread_num);
 
-            output_file_stream = create_new_sfen_output(filename_, sfen_output_type);
+            sfen_format = sfen_output_type;
+            output_file_stream = create_new_sfen_output(filename_, sfen_format);
             filename = filename_;
             save_every = save_count;
 
@@ -166,7 +161,7 @@ namespace Learner
                             // Add ios::app in consideration of overwriting.
                             // (Depending on the operation, it may not be necessary.)
                             string new_filename = filename + "_" + std::to_string(n);
-                            output_file_stream = create_new_sfen_output(new_filename, sfen_output_type);
+                            output_file_stream = create_new_sfen_output(new_filename, sfen_format);
                             cout << endl << "output sfen file = " << new_filename << endl;
                         }
                     }
@@ -190,6 +185,8 @@ namespace Learner
         // Flag that all threads have finished
         atomic<bool> finished;
 
+        SfenOutputType sfen_format;
+
         // buffer before writing to file
         // sfen_buffers is the buffer for each thread
         // sfen_buffers_pool is a buffer for writing.
@@ -214,6 +211,74 @@ namespace Learner
     // Class to generate sfen with multiple threads
     struct MultiThinkGenSfen
     {
+        struct Params
+        {
+            // Min and max depths for search during gensfen
+            int search_depth_min = 3;
+            int search_depth_max = -1;
+
+            // Number of the nodes to be searched.
+            // 0 represents no limits.
+            uint64_t nodes = 0;
+
+            // Upper limit of evaluation value of generated situation
+            int eval_limit = 3000;
+
+            // minimum ply with random move
+            // maximum ply with random move
+            // Number of random moves in one station
+            int random_move_minply = 1;
+            int random_move_maxply = 24;
+            int random_move_count = 5;
+
+            // Move kings with a probability of 1/N when randomly moving like Apery software.
+            // When you move the king again, there is a 1/N chance that it will randomly moved
+            // once in the opponent's turn.
+            // Apery has N=2. Specifying 0 here disables this function.
+            int random_move_like_apery = 0;
+
+            // For when using multi pv instead of random move.
+            // random_multi_pv is the number of candidates for MultiPV.
+            // When adopting the move of the candidate move, the difference
+            // between the evaluation value of the move of the 1st place
+            // and the evaluation value of the move of the Nth place is.
+            // Must be in the range random_multi_pv_diff.
+            // random_multi_pv_depth is the search depth for MultiPV.
+            int random_multi_pv = 0;
+            int random_multi_pv_diff = 32000;
+            int random_multi_pv_depth = -1;
+
+            // The minimum and maximum ply (number of steps from
+            // the initial phase) of the sfens to write out.
+            int write_minply = 16;
+            int write_maxply = 400;
+
+            uint64_t save_every = std::numeric_limits<uint64_t>::max();
+
+            std::string output_file_name = "generated_kifu";
+
+            SfenOutputType sfen_format = SfenOutputType::Binpack;
+
+            std::string seed;
+
+            bool write_out_draw_game_in_training_data_generation = true;
+            bool detect_draw_by_consecutive_low_score = true;
+            bool detect_draw_by_insufficient_mating_material = true;
+
+            uint64_t num_threads;
+
+            void enforce_constraints()
+            {
+                search_depth_max = std::max(search_depth_min, search_depth_max);
+                random_multi_pv_depth = std::max(search_depth_min, random_multi_pv_depth);
+
+                // Limit the maximum to a one-stop score. (Otherwise you might not end the loop)
+                eval_limit = std::min(eval_limit, (int)mate_in(2));
+
+                num_threads = Options["Threads"];
+            }
+        };
+
         // Hash to limit the export of identical sfens
         static constexpr uint64_t GENSFEN_HASH_SIZE = 64 * 1024 * 1024;
         // It must be 2**N because it will be used as the mask to calculate hash_index.
@@ -224,17 +289,11 @@ namespace Learner
         static_assert(REPORT_STATS_EVERY % REPORT_DOT_EVERY == 0);
 
         MultiThinkGenSfen(
-            int search_depth_min_,
-            int search_depth_max_,
-            std::string output_file_name,
-            int thread_num,
-            uint64_t save_every,
-            const std::string& seed
+            const Params& prm
         ) :
-            search_depth_min(search_depth_min_),
-            search_depth_max(search_depth_max_),
-            prng(seed),
-            sfen_writer(output_file_name, thread_num, save_every)
+            params(prm),
+            prng(prm.seed),
+            sfen_writer(prm.output_file_name, prm.num_threads, prm.save_every, prm.sfen_format)
         {
             hash.resize(GENSFEN_HASH_SIZE);
 
@@ -244,47 +303,9 @@ namespace Learner
 
         void gensfen(uint64_t limit);
 
-        // Min and max depths for search during gensfen
-        int search_depth_min;
-        int search_depth_max;
-
-        // Number of the nodes to be searched.
-        // 0 represents no limits.
-        uint64_t nodes;
-
-        // Upper limit of evaluation value of generated situation
-        int eval_limit;
-
-        // minimum ply with random move
-        // maximum ply with random move
-        // Number of random moves in one station
-        int random_move_minply;
-        int random_move_maxply;
-        int random_move_count;
-
-        // Move kings with a probability of 1/N when randomly moving like Apery software.
-        // When you move the king again, there is a 1/N chance that it will randomly moved
-        // once in the opponent's turn.
-        // Apery has N=2. Specifying 0 here disables this function.
-        int random_move_like_apery;
-
-        // For when using multi pv instead of random move.
-        // random_multi_pv is the number of candidates for MultiPV.
-        // When adopting the move of the candidate move, the difference
-        // between the evaluation value of the move of the 1st place
-        // and the evaluation value of the move of the Nth place is.
-        // Must be in the range random_multi_pv_diff.
-        // random_multi_pv_depth is the search depth for MultiPV.
-        int random_multi_pv;
-        int random_multi_pv_diff;
-        int random_multi_pv_depth;
-
-        // The minimum and maximum ply (number of steps from
-        // the initial phase) of the sfens to write out.
-        int write_minply;
-        int write_maxply;
-
     private:
+        Params params;
+
         PRNG prng;
 
         std::mutex stats_mutex;
@@ -365,7 +386,7 @@ namespace Learner
         const int ply = move_hist_scores.size();
 
         // has it reached the max length or is a draw
-        if (ply >= write_maxply || pos.is_draw(ply))
+        if (ply >= params.write_maxply || pos.is_draw(ply))
         {
             return 0;
         }
@@ -379,7 +400,7 @@ namespace Learner
         }
 
         // Adjudicate game to a draw if the last 4 scores of each engine is 0.
-        if (detect_draw_by_consecutive_low_score)
+        if (params.detect_draw_by_consecutive_low_score)
         {
             if (ply >= adj_draw_ply)
             {
@@ -414,7 +435,7 @@ namespace Learner
         }
 
         // Draw by insufficient mating material
-        if (detect_draw_by_insufficient_mating_material)
+        if (params.detect_draw_by_insufficient_mating_material)
         {
             if (pos.count<ALL_PIECES>() <= 4)
             {
@@ -511,7 +532,7 @@ namespace Learner
         std::atomic<uint64_t>& counter,
         uint64_t limit)
     {
-        if (!write_out_draw_game_in_training_data_generation && lastTurnIsWin == 0)
+        if (!params.write_out_draw_game_in_training_data_generation && lastTurnIsWin == 0)
         {
             // We didn't write anything so why quit.
             return false;
@@ -557,21 +578,21 @@ namespace Learner
         // Randomly choose one from legal move
         if (
             // 1. Random move of random_move_count times from random_move_minply to random_move_maxply
-            (random_move_minply != -1 && ply < (int)random_move_flag.size() && random_move_flag[ply]) ||
+            (params.random_move_minply != -1 && ply < (int)random_move_flag.size() && random_move_flag[ply]) ||
             // 2. A mode to perform random move of random_move_count times after leaving the startpos
-            (random_move_minply == -1 && random_move_c < random_move_count))
+            (params.random_move_minply == -1 && random_move_c < params.random_move_count))
         {
             ++random_move_c;
 
             // It's not a mate, so there should be one legal move...
-            if (random_multi_pv == 0)
+            if (params.random_multi_pv == 0)
             {
                 // Normal random move
                 MoveList<LEGAL> list(pos);
 
                 // I don't really know the goodness and badness of making this the Apery method.
-                if (random_move_like_apery == 0
-                    || prng.rand(random_move_like_apery) != 0)
+                if (params.random_move_like_apery == 0
+                    || prng.rand(params.random_move_like_apery) != 0)
                 {
                     // Normally one move from legal move
                     random_move = list.at((size_t)prng.rand((uint64_t)list.size()));
@@ -612,18 +633,18 @@ namespace Learner
             }
             else
             {
-                Search::search(pos, random_multi_pv_depth, random_multi_pv);
+                Search::search(pos, params.random_multi_pv_depth, params.random_multi_pv);
 
                 // Select one from the top N hands of root Moves
                 auto& rm = pos.this_thread()->rootMoves;
 
-                uint64_t s = min((uint64_t)rm.size(), (uint64_t)random_multi_pv);
+                uint64_t s = min((uint64_t)rm.size(), (uint64_t)params.random_multi_pv);
                 for (uint64_t i = 1; i < s; ++i)
                 {
                     // The difference from the evaluation value of rm[0] must
                     // be within the range of random_multi_pv_diff.
                     // It can be assumed that rm[x].score is arranged in descending order.
-                    if (rm[0].score > rm[i].score + random_multi_pv_diff)
+                    if (rm[0].score > rm[i].score + params.random_multi_pv_diff)
                     {
                         s = i;
                         break;
@@ -651,21 +672,21 @@ namespace Learner
         // to shuffle the first N pieces with Fisher-Yates.
 
         vector<int> a;
-        a.reserve((size_t)random_move_maxply);
+        a.reserve((size_t)params.random_move_maxply);
 
         // random_move_minply ,random_move_maxply is specified by 1 origin,
         // Note that we are handling 0 origin here.
-        for (int i = std::max(random_move_minply - 1, 0); i < random_move_maxply; ++i)
+        for (int i = std::max(params.random_move_minply - 1, 0); i < params.random_move_maxply; ++i)
         {
             a.push_back(i);
         }
 
         // In case of Apery random move, insert() may be called random_move_count times.
         // Reserve only the size considering it.
-        random_move_flag.resize((size_t)random_move_maxply + random_move_count);
+        random_move_flag.resize((size_t)params.random_move_maxply + params.random_move_count);
 
         // A random move that exceeds the size() of a[] cannot be applied, so limit it.
-        for (int i = 0; i < std::min(random_move_count, (int)a.size()); ++i)
+        for (int i = 0; i < std::min(params.random_move_count, (int)a.size()); ++i)
         {
             swap(a[i], a[prng.rand((uint64_t)a.size() - i) + i]);
             random_move_flag[a[i]] = true;
@@ -705,7 +726,7 @@ namespace Learner
         // at the maximum number of steps to write.
         // Maximum StateInfo + Search PV to advance to leaf buffer
         std::vector<StateInfo, AlignedAllocator<StateInfo>> states(
-            write_maxply + MAX_PLY /* == search_depth_min + α */);
+            params.write_maxply + MAX_PLY /* == search_depth_min + α */);
 
         StateInfo si;
 
@@ -725,7 +746,7 @@ namespace Learner
             bool should_resign = prng.rand(10) > 1;
             // Vector for holding the sfens in the current simulated game.
             PSVector a_psv;
-            a_psv.reserve(write_maxply + MAX_PLY);
+            a_psv.reserve(params.write_maxply + MAX_PLY);
 
             // Precomputed flags. Used internally by choose_random_move.
             vector<uint8_t> random_move_flag = generate_random_move_flags();
@@ -746,10 +767,10 @@ namespace Learner
             for (int ply = 0; ; ++ply)
             {
                 // Current search depth
-                const int depth = search_depth_min + (int)prng.rand(search_depth_max - search_depth_min + 1);
+                const int depth = params.search_depth_min + (int)prng.rand(params.search_depth_max - params.search_depth_min + 1);
 
                 // Starting search calls init_for_search
-                auto [search_value, search_pv] = Search::search(pos, depth, 1, nodes);
+                auto [search_value, search_pv] = Search::search(pos, depth, 1, params.nodes);
 
                 // This has to be performed after search because it needs to know
                 // rootMoves which are filled in init_for_search.
@@ -762,11 +783,11 @@ namespace Learner
 
                 // Always adjudivate by eval limit.
                 // Also because of this we don't have to check for TB/MATE scores
-                if (abs(search_value) >= eval_limit)
+                if (abs(search_value) >= params.eval_limit)
                 {
                     resign_counter++;
                     if ((should_resign && resign_counter >= 4) || abs(search_value) >= VALUE_KNOWN_WIN) {
-                        flush_psv((search_value >= eval_limit) ? 1 : -1);
+                        flush_psv((search_value >= params.eval_limit) ? 1 : -1);
                         break;
                     }
                 }
@@ -789,7 +810,7 @@ namespace Learner
                 // Discard stuff before write_minply is reached
                 // because it can harm training due to overfitting.
                 // Initial positions would be too common.
-                if (ply >= write_minply && !was_seen_before(pos))
+                if (ply >= params.write_minply && !was_seen_before(pos))
                 {
                     a_psv.emplace_back(PackedSfenValue());
 
@@ -825,6 +846,25 @@ namespace Learner
         sfen_writer.finalize(th.thread_idx());
     }
 
+    void set_gensfen_search_limits()
+    {
+        // About Search::Limits
+        // Be careful because this member variable is global and affects other threads.
+        auto& limits = Search::Limits;
+
+        // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done)
+        limits.infinite = true;
+
+        // Since PV is an obstacle when displayed, erase it.
+        limits.silent = true;
+
+        // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
+        limits.nodes = 0;
+
+        // depth is also processed by the one passed as an argument of Learner::search().
+        limits.depth = 0;
+    }
+
     // -----------------------------------
     // Command to generate a game record (master thread)
     // -----------------------------------
@@ -832,55 +872,16 @@ namespace Learner
     // Command to generate a game record
     void gen_sfen(Position&, istringstream& is)
     {
-        // number of threads (given by USI setoption)
-        uint32_t thread_num = (uint32_t)Options["Threads"];
-
         // Number of generated game records default = 8 billion phases (Ponanza specification)
         uint64_t loop_max = 8000000000UL;
 
-        // Stop the generation when the evaluation value reaches this value.
-        int eval_limit = 3000;
-
-        // search depth
-        int search_depth_min = 3;
-        int search_depth_max = INT_MIN;
-
-        // Number of nodes to be searched.
-        uint64_t nodes = 0;
-
-        // minimum ply, maximum ply and number of random moves
-        int random_move_minply = 1;
-        int random_move_maxply = 24;
-        int random_move_count = 5;
-
-        // A function to move the random move mainly like Apery
-        // If this is set to 3, the ball will move with a probability of 1/3.
-        int random_move_like_apery = 0;
-
-        // If you search with multipv instead of random move and choose from among them randomly, set random_multi_pv = 1 or more.
-        int random_multi_pv = 0;
-        int random_multi_pv_diff = 32000;
-        int random_multi_pv_depth = INT_MIN;
-
-        // The minimum and maximum ply (number of steps from the initial phase) of the phase to write out.
-        int write_minply = 16;
-        int write_maxply = 400;
-
-        // File name to write
-        string output_file_name = "generated_kifu";
-
-        string token;
-
-        // Save to file in this unit.
-        // File names are serialized like file_1.bin, file_2.bin.
-        uint64_t save_every = UINT64_MAX;
+        MultiThinkGenSfen::Params params;
 
         // Add a random number to the end of the file name.
         bool random_file_name = false;
-
         std::string sfen_format = "binpack";
-        std::string seed;
 
+        string token;
         while (true)
         {
             token = "";
@@ -889,55 +890,51 @@ namespace Learner
                 break;
 
             if (token == "depth")
-                is >> search_depth_min;
+                is >> params.search_depth_min;
             else if (token == "depth2")
-                is >> search_depth_max;
+                is >> params.search_depth_max;
             else if (token == "nodes")
-                is >> nodes;
+                is >> params.nodes;
             else if (token == "loop")
                 is >> loop_max;
             else if (token == "output_file_name")
-                is >> output_file_name;
+                is >> params.output_file_name;
             else if (token == "eval_limit")
-            {
-                is >> eval_limit;
-                // Limit the maximum to a one-stop score. (Otherwise you might not end the loop)
-                eval_limit = std::min(eval_limit, (int)mate_in(2));
-            }
+                is >> params.eval_limit;
             else if (token == "random_move_minply")
-                is >> random_move_minply;
+                is >> params.random_move_minply;
             else if (token == "random_move_maxply")
-                is >> random_move_maxply;
+                is >> params.random_move_maxply;
             else if (token == "random_move_count")
-                is >> random_move_count;
+                is >> params.random_move_count;
             else if (token == "random_move_like_apery")
-                is >> random_move_like_apery;
+                is >> params.random_move_like_apery;
             else if (token == "random_multi_pv")
-                is >> random_multi_pv;
+                is >> params.random_multi_pv;
             else if (token == "random_multi_pv_diff")
-                is >> random_multi_pv_diff;
+                is >> params.random_multi_pv_diff;
             else if (token == "random_multi_pv_depth")
-                is >> random_multi_pv_depth;
+                is >> params.random_multi_pv_depth;
             else if (token == "write_minply")
-                is >> write_minply;
+                is >> params.write_minply;
             else if (token == "write_maxply")
-                is >> write_maxply;
+                is >> params.write_maxply;
             else if (token == "save_every")
-                is >> save_every;
+                is >> params.save_every;
             else if (token == "random_file_name")
                 is >> random_file_name;
             // Accept also the old option name.
             else if (token == "use_draw_in_training_data_generation" || token == "write_out_draw_game_in_training_data_generation")
-                is >> write_out_draw_game_in_training_data_generation;
+                is >> params.write_out_draw_game_in_training_data_generation;
             // Accept also the old option name.
             else if (token == "use_game_draw_adjudication" || token == "detect_draw_by_consecutive_low_score")
-                is >> detect_draw_by_consecutive_low_score;
+                is >> params.detect_draw_by_consecutive_low_score;
             else if (token == "detect_draw_by_insufficient_mating_material")
-                is >> detect_draw_by_insufficient_mating_material;
+                is >> params.detect_draw_by_insufficient_mating_material;
             else if (token == "sfen_format")
                 is >> sfen_format;
             else if (token == "seed")
-                is >> seed;
+                is >> params.seed;
             else if (token == "set_recommended_uci_options")
             {
                 UCI::setoption("Contempt", "0");
@@ -955,26 +952,20 @@ namespace Learner
         if (!sfen_format.empty())
         {
             if (sfen_format == "bin")
-                sfen_output_type = SfenOutputType::Bin;
+                params.sfen_format = SfenOutputType::Bin;
             else if (sfen_format == "binpack")
-                sfen_output_type = SfenOutputType::Binpack;
+                params.sfen_format = SfenOutputType::Binpack;
             else
             {
                 cout << "Unknown sfen format `" << sfen_format << "`. Using bin\n";
             }
         }
 
-        // If search depth2 is not set, leave it the same as search depth.
-        if (search_depth_max == INT_MIN)
-            search_depth_max = search_depth_min;
-        if (random_multi_pv_depth == INT_MIN)
-            random_multi_pv_depth = search_depth_min;
-
         if (random_file_name)
         {
             // Give a random number to output_file_name at this point.
             // Do not use std::random_device().  Because it always the same integers on MinGW.
-            PRNG r(seed);
+            PRNG r(params.seed);
             // Just in case, reassign the random numbers.
             for (int i = 0; i < 10; ++i)
                 r.rand(1);
@@ -983,74 +974,44 @@ namespace Learner
                 ss << std::hex << u;
                 return ss.str();
             };
+
             // I don't want to wear 64bit numbers by accident, so I'next_move going to make a 64bit number 2 just in case.
-            output_file_name = output_file_name + "_" + to_hex(r.rand<uint64_t>()) + to_hex(r.rand<uint64_t>());
+            params.output_file_name += "_" + to_hex(r.rand<uint64_t>()) + to_hex(r.rand<uint64_t>());
         }
 
+        params.enforce_constraints();
+
         std::cout << "gensfen : " << endl
-            << "  search_depth_min = " << search_depth_min << " to " << search_depth_max << endl
-            << "  nodes = " << nodes << endl
+            << "  search_depth_min = " << params.search_depth_min << " to " << params.search_depth_max << endl
+            << "  nodes = " << params.nodes << endl
             << "  loop_max = " << loop_max << endl
-            << "  eval_limit = " << eval_limit << endl
-            << "  thread_num (set by USI setoption) = " << thread_num << endl
-            << "  random_move_minply     = " << random_move_minply << endl
-            << "  random_move_maxply     = " << random_move_maxply << endl
-            << "  random_move_count      = " << random_move_count << endl
-            << "  random_move_like_apery = " << random_move_like_apery << endl
-            << "  random_multi_pv        = " << random_multi_pv << endl
-            << "  random_multi_pv_diff   = " << random_multi_pv_diff << endl
-            << "  random_multi_pv_depth  = " << random_multi_pv_depth << endl
-            << "  write_minply           = " << write_minply << endl
-            << "  write_maxply           = " << write_maxply << endl
-            << "  output_file_name       = " << output_file_name << endl
-            << "  save_every             = " << save_every << endl
+            << "  eval_limit = " << params.eval_limit << endl
+            << "  thread_num (set by USI setoption) = " << params.num_threads << endl
+            << "  random_move_minply     = " << params.random_move_minply << endl
+            << "  random_move_maxply     = " << params.random_move_maxply << endl
+            << "  random_move_count      = " << params.random_move_count << endl
+            << "  random_move_like_apery = " << params.random_move_like_apery << endl
+            << "  random_multi_pv        = " << params.random_multi_pv << endl
+            << "  random_multi_pv_diff   = " << params.random_multi_pv_diff << endl
+            << "  random_multi_pv_depth  = " << params.random_multi_pv_depth << endl
+            << "  write_minply           = " << params.write_minply << endl
+            << "  write_maxply           = " << params.write_maxply << endl
+            << "  output_file_name       = " << params.output_file_name << endl
+            << "  save_every             = " << params.save_every << endl
             << "  random_file_name       = " << random_file_name << endl
-            << "  write_out_draw_game_in_training_data_generation = " << write_out_draw_game_in_training_data_generation << endl
-            << "  detect_draw_by_consecutive_low_score = " << detect_draw_by_consecutive_low_score << endl
-            << "  detect_draw_by_insufficient_mating_material = " << detect_draw_by_insufficient_mating_material << endl;
+            << "  write_out_draw_game_in_training_data_generation = " << params.write_out_draw_game_in_training_data_generation << endl
+            << "  detect_draw_by_consecutive_low_score = " << params.detect_draw_by_consecutive_low_score << endl
+            << "  detect_draw_by_insufficient_mating_material = " << params.detect_draw_by_insufficient_mating_material << endl;
 
         // Show if the training data generator uses NNUE.
         Eval::NNUE::verify_eval_file_loaded();
 
         Threads.main()->ponder = false;
 
-        // About Search::Limits
-        // Be careful because this member variable is global and affects other threads.
-        {
-          auto& limits = Search::Limits;
+        set_gensfen_search_limits();
 
-          // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done)
-          limits.infinite = true;
-
-          // Since PV is an obstacle when displayed, erase it.
-          limits.silent = true;
-
-          // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
-          limits.nodes = 0;
-
-          // depth is also processed by the one passed as an argument of Learner::search().
-          limits.depth = 0;
-        }
-
-        // Create and execute threads as many as Options["Threads"].
-        {
-            MultiThinkGenSfen multi_think(search_depth_min, search_depth_max, output_file_name, thread_num, save_every, seed);
-            multi_think.nodes = nodes;
-            multi_think.eval_limit = eval_limit;
-            multi_think.random_move_minply = random_move_minply;
-            multi_think.random_move_maxply = random_move_maxply;
-            multi_think.random_move_count = random_move_count;
-            multi_think.random_move_like_apery = random_move_like_apery;
-            multi_think.random_multi_pv = random_multi_pv;
-            multi_think.random_multi_pv_diff = random_multi_pv_diff;
-            multi_think.random_multi_pv_depth = random_multi_pv_depth;
-            multi_think.write_minply = write_minply;
-            multi_think.write_maxply = write_maxply;
-            multi_think.gensfen(loop_max);
-
-            // Since we are joining with the destructor of SfenWriter, please give a message that it has finished after the join
-            // Enclose this in a block because it should be displayed.
-        }
+        MultiThinkGenSfen multi_think(params);
+        multi_think.gensfen(loop_max);
 
         std::cout << "gensfen finished." << endl;
     }

From d77b3d176e6736d4729b00d1a4465943d30ea64c Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 22:30:08 +0200
Subject: [PATCH 393/583] Always flush sfen writer at the end of gensfen and
 when it is destroyed.

---
 src/learn/gensfen.cpp | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index d69fcf53..a4ce5728 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -61,6 +61,8 @@ namespace Learner
 
         ~SfenWriter()
         {
+            flush();
+
             finished = true;
             file_worker_thread.join();
             output_file_stream.reset();
@@ -105,8 +107,16 @@ namespace Learner
             }
         }
 
+        void flush()
+        {
+            for (size_t i = 0; i < sfen_buffers.size(); ++i)
+            {
+                flush(i);
+            }
+        }
+
         // Move what remains in the buffer for your thread to a buffer for writing to a file.
-        void finalize(size_t thread_id)
+        void flush(size_t thread_id)
         {
             std::unique_lock<std::mutex> lk(mutex);
 
@@ -357,6 +367,8 @@ namespace Learner
         });
         Threads.wait_for_workers_finished();
 
+        sfen_writer.flush();
+
         if (limit % REPORT_STATS_EVERY != 0)
         {
             report(limit, limit % REPORT_STATS_EVERY);
@@ -842,8 +854,6 @@ namespace Learner
 
             }
         }
-
-        sfen_writer.finalize(th.thread_idx());
     }
 
     void set_gensfen_search_limits()

From 6d4d20c4be5ce46760de0f51d19148d75aedcfd5 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 22:44:02 +0200
Subject: [PATCH 394/583] Cleaner printing and some renaming.

---
 src/learn/gensfen.cpp | 72 ++++++++++++++++++++++++++-----------------
 src/learn/gensfen.h   |  2 +-
 src/misc.h            |  5 +++
 src/uci.cpp           |  2 +-
 4 files changed, 51 insertions(+), 30 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index a4ce5728..971afd1b 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -49,6 +49,9 @@ namespace Learner
             sfen_buffers_pool.reserve((size_t)thread_num * 10);
             sfen_buffers.resize(thread_num);
 
+            auto out = sync_region_cout.new_region();
+            out << "INFO (sfen_writer): Creating new data file at " << filename_ << endl;
+
             sfen_format = sfen_output_type;
             output_file_stream = create_new_sfen_output(filename_, sfen_format);
             filename = filename_;
@@ -172,7 +175,9 @@ namespace Learner
                             // (Depending on the operation, it may not be necessary.)
                             string new_filename = filename + "_" + std::to_string(n);
                             output_file_stream = create_new_sfen_output(new_filename, sfen_format);
-                            cout << endl << "output sfen file = " << new_filename << endl;
+
+                            auto out = sync_region_cout.new_region();
+                            out << "INFO (sfen_writer): Creating new data file at " << new_filename << endl;
                         }
                     }
                 }
@@ -285,6 +290,8 @@ namespace Learner
                 // Limit the maximum to a one-stop score. (Otherwise you might not end the loop)
                 eval_limit = std::min(eval_limit, (int)mate_in(2));
 
+                save_every = std::max(save_every, REPORT_STATS_EVERY);
+
                 num_threads = Options["Threads"];
             }
         };
@@ -324,6 +331,8 @@ namespace Learner
         // sfen exporter
         SfenWriter sfen_writer;
 
+        SynchronizedRegionLogger::Region out;
+
         vector<Key> hash; // 64MB*sizeof(HASH_KEY) = 512MB
 
         void gensfen_worker(
@@ -500,13 +509,15 @@ namespace Learner
         const auto now_time = now();
         const TimePoint elapsed = now_time - last_stats_report_time + 1;
 
-        sync_cout
+        out
             << endl
             << done << " sfens, "
             << new_done * 1000 / elapsed << " sfens/second, "
             << "at " << now_string() << sync_endl;
 
         last_stats_report_time = now_time;
+
+        out = sync_region_cout.new_region();
     }
 
     void MultiThinkGenSfen::maybe_report(uint64_t done)
@@ -518,11 +529,12 @@ namespace Learner
             if (last_stats_report_time == 0)
             {
                 last_stats_report_time = now();
+                out = sync_region_cout.new_region();
             }
 
             if (done != 0)
             {
-                std::cout << '.';
+                out << '.';
 
                 if (done % REPORT_STATS_EVERY == 0)
                 {
@@ -880,7 +892,7 @@ namespace Learner
     // -----------------------------------
 
     // Command to generate a game record
-    void gen_sfen(Position&, istringstream& is)
+    void gensfen(istringstream& is)
     {
         // Number of generated game records default = 8 billion phases (Ponanza specification)
         uint64_t loop_max = 8000000000UL;
@@ -956,7 +968,7 @@ namespace Learner
                 UCI::setoption("EnableTranspositionTable", "true");
             }
             else
-                cout << "Error! : Illegal token " << token << endl;
+                cout << "ERROR: Ignoring unknown option " << token << endl;
         }
 
         if (!sfen_format.empty())
@@ -967,7 +979,7 @@ namespace Learner
                 params.sfen_format = SfenOutputType::Binpack;
             else
             {
-                cout << "Unknown sfen format `" << sfen_format << "`. Using bin\n";
+                cout << "WARNING: Unknown sfen format `" << sfen_format << "`. Using bin\n";
             }
         }
 
@@ -991,27 +1003,31 @@ namespace Learner
 
         params.enforce_constraints();
 
-        std::cout << "gensfen : " << endl
-            << "  search_depth_min = " << params.search_depth_min << " to " << params.search_depth_max << endl
-            << "  nodes = " << params.nodes << endl
-            << "  loop_max = " << loop_max << endl
-            << "  eval_limit = " << params.eval_limit << endl
-            << "  thread_num (set by USI setoption) = " << params.num_threads << endl
-            << "  random_move_minply     = " << params.random_move_minply << endl
-            << "  random_move_maxply     = " << params.random_move_maxply << endl
-            << "  random_move_count      = " << params.random_move_count << endl
-            << "  random_move_like_apery = " << params.random_move_like_apery << endl
-            << "  random_multi_pv        = " << params.random_multi_pv << endl
-            << "  random_multi_pv_diff   = " << params.random_multi_pv_diff << endl
-            << "  random_multi_pv_depth  = " << params.random_multi_pv_depth << endl
-            << "  write_minply           = " << params.write_minply << endl
-            << "  write_maxply           = " << params.write_maxply << endl
-            << "  output_file_name       = " << params.output_file_name << endl
-            << "  save_every             = " << params.save_every << endl
-            << "  random_file_name       = " << random_file_name << endl
-            << "  write_out_draw_game_in_training_data_generation = " << params.write_out_draw_game_in_training_data_generation << endl
-            << "  detect_draw_by_consecutive_low_score = " << params.detect_draw_by_consecutive_low_score << endl
-            << "  detect_draw_by_insufficient_mating_material = " << params.detect_draw_by_insufficient_mating_material << endl;
+        std::cout << "INFO: Executing gensfen command\n";
+
+        std::cout << "INFO: Parameters:\n";
+        std::cout
+            << "  - search_depth_min       = " << params.search_depth_min << endl
+            << "  - search_depth_max       = " << params.search_depth_max << endl
+            << "  - nodes                  = " << params.nodes << endl
+            << "  - num sfens to generate  = " << loop_max << endl
+            << "  - eval_limit             = " << params.eval_limit << endl
+            << "  - num threads (UCI)      = " << params.num_threads << endl
+            << "  - random_move_minply     = " << params.random_move_minply << endl
+            << "  - random_move_maxply     = " << params.random_move_maxply << endl
+            << "  - random_move_count      = " << params.random_move_count << endl
+            << "  - random_move_like_apery = " << params.random_move_like_apery << endl
+            << "  - random_multi_pv        = " << params.random_multi_pv << endl
+            << "  - random_multi_pv_diff   = " << params.random_multi_pv_diff << endl
+            << "  - random_multi_pv_depth  = " << params.random_multi_pv_depth << endl
+            << "  - write_minply           = " << params.write_minply << endl
+            << "  - write_maxply           = " << params.write_maxply << endl
+            << "  - output_file_name       = " << params.output_file_name << endl
+            << "  - save_every             = " << params.save_every << endl
+            << "  - random_file_name       = " << random_file_name << endl
+            << "  - write_drawn_games      = " << params.write_out_draw_game_in_training_data_generation << endl
+            << "  - draw by low score      = " << params.detect_draw_by_consecutive_low_score << endl
+            << "  - draw by insuff. mat.   = " << params.detect_draw_by_insufficient_mating_material << endl;
 
         // Show if the training data generator uses NNUE.
         Eval::NNUE::verify_eval_file_loaded();
@@ -1023,6 +1039,6 @@ namespace Learner
         MultiThinkGenSfen multi_think(params);
         multi_think.gensfen(loop_max);
 
-        std::cout << "gensfen finished." << endl;
+        std::cout << "INFO: Gensfen finished." << endl;
     }
 }
diff --git a/src/learn/gensfen.h b/src/learn/gensfen.h
index d39e44c9..c0a7c978 100644
--- a/src/learn/gensfen.h
+++ b/src/learn/gensfen.h
@@ -8,7 +8,7 @@
 namespace Learner {
 
     // Automatic generation of teacher position
-    void gen_sfen(Position& pos, std::istringstream& is);
+    void gensfen(std::istringstream& is);
 }
 
 #endif
\ No newline at end of file
diff --git a/src/misc.h b/src/misc.h
index 3e6dc5b0..9f250b6e 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -84,6 +84,11 @@ struct SynchronizedRegionLogger
   {
     friend struct SynchronizedRegionLogger;
 
+    Region() :
+      logger(nullptr), region_id(0), is_held(false)
+    {
+    }
+
     Region(const Region&) = delete;
     Region& operator=(const Region&) = delete;
 
diff --git a/src/uci.cpp b/src/uci.cpp
index 398fd01a..dbef05bf 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -338,7 +338,7 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "eval")     trace_eval(pos);
       else if (token == "compiler") sync_cout << compiler_info() << sync_endl;
 
-      else if (token == "gensfen") Learner::gen_sfen(pos, is);
+      else if (token == "gensfen") Learner::gensfen(is);
       else if (token == "learn") Learner::learn(pos, is);
       else if (token == "convert") Learner::convert(is);
       else if (token == "convert_bin") Learner::convert_bin(is);

From 03abfae41f912f99d0d6c86d7a237971b8266d03 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 22:53:14 +0200
Subject: [PATCH 395/583] Reorder members, renaming.

---
 src/learn/gensfen.cpp | 588 +++++++++++++++++++++---------------------
 1 file changed, 289 insertions(+), 299 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 971afd1b..1dddac5a 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -219,12 +219,8 @@ namespace Learner
         uint64_t sfen_write_count_current_file = 0;
     };
 
-    // -----------------------------------
-    // worker that creates the game record (for each thread)
-    // -----------------------------------
-
     // Class to generate sfen with multiple threads
-    struct MultiThinkGenSfen
+    struct Gensfen
     {
         struct Params
         {
@@ -305,7 +301,7 @@ namespace Learner
         static constexpr uint64_t REPORT_STATS_EVERY = 200000;
         static_assert(REPORT_STATS_EVERY % REPORT_DOT_EVERY == 0);
 
-        MultiThinkGenSfen(
+        Gensfen(
             const Params& prm
         ) :
             params(prm),
@@ -318,7 +314,7 @@ namespace Learner
             std::cout << prng << std::endl;
         }
 
-        void gensfen(uint64_t limit);
+        void generate(uint64_t limit);
 
     private:
         Params params;
@@ -335,22 +331,26 @@ namespace Learner
 
         vector<Key> hash; // 64MB*sizeof(HASH_KEY) = 512MB
 
-        void gensfen_worker(
+        static void set_gensfen_search_limits();
+
+        void generate_worker(
             Thread& th,
             std::atomic<uint64_t>& counter,
             uint64_t limit);
 
+        bool was_seen_before(const Position& pos);
+
         optional<int8_t> get_current_game_result(
             Position& pos,
             const vector<int>& move_hist_scores) const;
 
         vector<uint8_t> generate_random_move_flags();
 
-        bool was_seen_before(const Position& pos);
-
-        void report(uint64_t done, uint64_t new_done);
-
-        void maybe_report(uint64_t done);
+        optional<Move> choose_random_move(
+            Position& pos,
+            std::vector<uint8_t>& random_move_flag,
+            int ply,
+            int& random_move_c);
 
         bool commit_psv(
             Thread& th,
@@ -359,20 +359,39 @@ namespace Learner
             std::atomic<uint64_t>& counter,
             uint64_t limit);
 
-        optional<Move> choose_random_move(
-            Position& pos,
-            std::vector<uint8_t>& random_move_flag,
-            int ply,
-            int& random_move_c);
+        void report(uint64_t done, uint64_t new_done);
+
+        void maybe_report(uint64_t done);
     };
 
-    void MultiThinkGenSfen::gensfen(uint64_t limit)
+    void Gensfen::set_gensfen_search_limits()
+    {
+        // About Search::Limits
+        // Be careful because this member variable is global and affects other threads.
+        auto& limits = Search::Limits;
+
+        // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done)
+        limits.infinite = true;
+
+        // Since PV is an obstacle when displayed, erase it.
+        limits.silent = true;
+
+        // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
+        limits.nodes = 0;
+
+        // depth is also processed by the one passed as an argument of Learner::search().
+        limits.depth = 0;
+    }
+
+    void Gensfen::generate(uint64_t limit)
     {
         last_stats_report_time = 0;
 
+        set_gensfen_search_limits();
+
         std::atomic<uint64_t> counter{0};
         Threads.execute_with_workers([&counter, limit, this](Thread& th) {
-            gensfen_worker(th, counter, limit);
+            generate_worker(th, counter, limit);
         });
         Threads.wait_for_workers_finished();
 
@@ -386,7 +405,154 @@ namespace Learner
         std::cout << std::endl;
     }
 
-    optional<int8_t> MultiThinkGenSfen::get_current_game_result(
+    void Gensfen::generate_worker(
+        Thread& th,
+        std::atomic<uint64_t>& counter,
+        uint64_t limit)
+    {
+        // For the time being, it will be treated as a draw
+        // at the maximum number of steps to write.
+        // Maximum StateInfo + Search PV to advance to leaf buffer
+        std::vector<StateInfo, AlignedAllocator<StateInfo>> states(
+            params.write_maxply + MAX_PLY /* == search_depth_min + α */);
+
+        StateInfo si;
+
+        // end flag
+        bool quit = false;
+
+        // repeat until the specified number of times
+        while (!quit)
+        {
+            // It is necessary to set a dependent thread for Position.
+            // When parallelizing, Threads (since this is a vector<Thread*>,
+            // Do the same for up to Threads[0]...Threads[thread_num-1].
+            auto& pos = th.rootPos;
+            pos.set(StartFEN, false, &si, &th);
+
+            int resign_counter = 0;
+            bool should_resign = prng.rand(10) > 1;
+            // Vector for holding the sfens in the current simulated game.
+            PSVector packed_sfens;
+            packed_sfens.reserve(params.write_maxply + MAX_PLY);
+
+            // Precomputed flags. Used internally by choose_random_move.
+            vector<uint8_t> random_move_flag = generate_random_move_flags();
+
+            // A counter that keeps track of the number of random moves
+            // When random_move_minply == -1, random moves are
+            // performed continuously, so use it at this time.
+            // Used internally by choose_random_move.
+            int actual_random_move_count = 0;
+
+            // Save history of move scores for adjudication
+            vector<int> move_hist_scores;
+
+            auto flush_psv = [&](int8_t result) {
+                quit = commit_psv(th, packed_sfens, result, counter, limit);
+            };
+
+            for (int ply = 0; ; ++ply)
+            {
+                // Current search depth
+                const int depth = params.search_depth_min + (int)prng.rand(params.search_depth_max - params.search_depth_min + 1);
+
+                // Starting search calls init_for_search
+                auto [search_value, search_pv] = Search::search(pos, depth, 1, params.nodes);
+
+                // This has to be performed after search because it needs to know
+                // rootMoves which are filled in init_for_search.
+                const auto result = get_current_game_result(pos, move_hist_scores);
+                if (result.has_value())
+                {
+                    flush_psv(result.value());
+                    break;
+                }
+
+                // Always adjudivate by eval limit.
+                // Also because of this we don't have to check for TB/MATE scores
+                if (abs(search_value) >= params.eval_limit)
+                {
+                    resign_counter++;
+                    if ((should_resign && resign_counter >= 4) || abs(search_value) >= VALUE_KNOWN_WIN) {
+                        flush_psv((search_value >= params.eval_limit) ? 1 : -1);
+                        break;
+                    }
+                }
+                else
+                {
+                    resign_counter = 0;
+                }
+
+                // In case there is no PV and the game was not ended here
+                // there is nothing we can do, we can't continue the game,
+                // we don't know the result, so discard this game.
+                if (search_pv.empty())
+                {
+                    break;
+                }
+
+                // Save the move score for adjudication.
+                move_hist_scores.push_back(search_value);
+
+                // Discard stuff before write_minply is reached
+                // because it can harm training due to overfitting.
+                // Initial positions would be too common.
+                if (ply >= params.write_minply && !was_seen_before(pos))
+                {
+                    packed_sfens.emplace_back(PackedSfenValue());
+
+                    auto& psv = packed_sfens.back();
+
+                    // Here we only write the position data.
+                    // Result is added after the whole game is done.
+                    pos.sfen_pack(psv.sfen);
+
+                    psv.score = search_value;
+                    psv.gamePly = ply;
+                    psv.move = search_pv[0];
+                }
+
+                // Update the next move according to best search result or random move.
+                auto random_move = choose_random_move(pos, random_move_flag, ply, actual_random_move_count);
+                const Move next_move = random_move.has_value() ? *random_move : search_pv[0];
+
+                // We don't have the whole game yet, but it ended,
+                // so the writing process ends and the next game starts.
+                // This shouldn't really happen.
+                if (!is_ok(next_move))
+                {
+                    break;
+                }
+
+                // Do move.
+                pos.do_move(next_move, states[ply]);
+            }
+        }
+    }
+
+    bool Gensfen::was_seen_before(const Position& pos)
+    {
+        // Look into the position hashtable to see if the same
+        // position was seen before.
+        // This is a good heuristic to exlude already seen
+        // positions without many false positives.
+        auto key = pos.key();
+        auto hash_index = (size_t)(key & (GENSFEN_HASH_SIZE - 1));
+        auto old_key = hash[hash_index];
+        if (key == old_key)
+        {
+            return true;
+        }
+        else
+        {
+            // Replace with the current key.
+            hash[hash_index] = key;
+            return false;
+        }
+    }
+
+    optional<int8_t> Gensfen::get_current_game_result(
         Position& pos,
         const vector<int>& move_hist_scores) const
     {
@@ -504,94 +670,44 @@ namespace Learner
         return nullopt;
     }
 
-    void MultiThinkGenSfen::report(uint64_t done, uint64_t new_done)
+    vector<uint8_t> Gensfen::generate_random_move_flags()
     {
-        const auto now_time = now();
-        const TimePoint elapsed = now_time - last_stats_report_time + 1;
+        vector<uint8_t> random_move_flag;
 
-        out
-            << endl
-            << done << " sfens, "
-            << new_done * 1000 / elapsed << " sfens/second, "
-            << "at " << now_string() << sync_endl;
+        // Depending on random move selection parameters setup
+        // the array of flags that indicates whether a random move
+        // be taken at a given ply.
 
-        last_stats_report_time = now_time;
+        // Make an array like a[0] = 0 ,a[1] = 1, ...
+        // Fisher-Yates shuffle and take out the first N items.
+        // Actually, I only want N pieces, so I only need
+        // to shuffle the first N pieces with Fisher-Yates.
 
-        out = sync_region_cout.new_region();
+        vector<int> a;
+        a.reserve((size_t)params.random_move_maxply);
+
+        // random_move_minply ,random_move_maxply is specified by 1 origin,
+        // Note that we are handling 0 origin here.
+        for (int i = std::max(params.random_move_minply - 1, 0); i < params.random_move_maxply; ++i)
+        {
+            a.push_back(i);
+        }
+
+        // In case of Apery random move, insert() may be called random_move_count times.
+        // Reserve only the size considering it.
+        random_move_flag.resize((size_t)params.random_move_maxply + params.random_move_count);
+
+        // A random move that exceeds the size() of a[] cannot be applied, so limit it.
+        for (int i = 0; i < std::min(params.random_move_count, (int)a.size()); ++i)
+        {
+            swap(a[i], a[prng.rand((uint64_t)a.size() - i) + i]);
+            random_move_flag[a[i]] = true;
+        }
+
+        return random_move_flag;
     }
 
-    void MultiThinkGenSfen::maybe_report(uint64_t done)
-    {
-        if (done % REPORT_DOT_EVERY == 0)
-        {
-            std::lock_guard lock(stats_mutex);
-
-            if (last_stats_report_time == 0)
-            {
-                last_stats_report_time = now();
-                out = sync_region_cout.new_region();
-            }
-
-            if (done != 0)
-            {
-                out << '.';
-
-                if (done % REPORT_STATS_EVERY == 0)
-                {
-                    report(done, REPORT_STATS_EVERY);
-                }
-            }
-        }
-    }
-
-    // Write out the phases loaded in sfens to a file.
-    // lastTurnIsWin: win/loss in the next phase after the final phase in sfens
-    // 1 when winning. -1 when losing. Pass 0 for a draw.
-    // Return value: true if the specified number of
-    // sfens has already been reached and the process ends.
-    bool MultiThinkGenSfen::commit_psv(
-        Thread& th,
-        PSVector& sfens,
-        int8_t lastTurnIsWin,
-        std::atomic<uint64_t>& counter,
-        uint64_t limit)
-    {
-        if (!params.write_out_draw_game_in_training_data_generation && lastTurnIsWin == 0)
-        {
-            // We didn't write anything so why quit.
-            return false;
-        }
-
-        int8_t is_win = lastTurnIsWin;
-
-        // From the final stage (one step before) to the first stage, give information on the outcome of the game for each stage.
-        // The phases stored in sfens are assumed to be continuous (in order).
-        for (auto it = sfens.rbegin(); it != sfens.rend(); ++it)
-        {
-            // If is_win == 0 (draw), multiply by -1 and it will remain 0 (draw)
-            is_win = -is_win;
-            it->game_result = is_win;
-        }
-
-        // Write sfens in move order to make potential compression easier
-        for (auto& sfen : sfens)
-        {
-            // Return true if there is already enough data generated.
-            const auto iter = counter.fetch_add(1);
-            if (iter >= limit)
-                return true;
-
-            // because `iter` was done, now we do one more
-            maybe_report(iter + 1);
-
-            // Write out one sfen.
-            sfen_writer.write(th.thread_idx(), sfen);
-        }
-
-        return false;
-    }
-
-    optional<Move> MultiThinkGenSfen::choose_random_move(
+    optional<Move> Gensfen::choose_random_move(
         Position& pos,
         std::vector<uint8_t>& random_move_flag,
         int ply,
@@ -682,222 +798,98 @@ namespace Learner
         return random_move;
     }
 
-    vector<uint8_t> MultiThinkGenSfen::generate_random_move_flags()
-    {
-        vector<uint8_t> random_move_flag;
-
-        // Depending on random move selection parameters setup
-        // the array of flags that indicates whether a random move
-        // be taken at a given ply.
-
-        // Make an array like a[0] = 0 ,a[1] = 1, ...
-        // Fisher-Yates shuffle and take out the first N items.
-        // Actually, I only want N pieces, so I only need
-        // to shuffle the first N pieces with Fisher-Yates.
-
-        vector<int> a;
-        a.reserve((size_t)params.random_move_maxply);
-
-        // random_move_minply ,random_move_maxply is specified by 1 origin,
-        // Note that we are handling 0 origin here.
-        for (int i = std::max(params.random_move_minply - 1, 0); i < params.random_move_maxply; ++i)
-        {
-            a.push_back(i);
-        }
-
-        // In case of Apery random move, insert() may be called random_move_count times.
-        // Reserve only the size considering it.
-        random_move_flag.resize((size_t)params.random_move_maxply + params.random_move_count);
-
-        // A random move that exceeds the size() of a[] cannot be applied, so limit it.
-        for (int i = 0; i < std::min(params.random_move_count, (int)a.size()); ++i)
-        {
-            swap(a[i], a[prng.rand((uint64_t)a.size() - i) + i]);
-            random_move_flag[a[i]] = true;
-        }
-
-        return random_move_flag;
-    }
-
-    bool MultiThinkGenSfen::was_seen_before(const Position& pos)
-    {
-        // Look into the position hashtable to see if the same
-        // position was seen before.
-        // This is a good heuristic to exlude already seen
-        // positions without many false positives.
-        auto key = pos.key();
-        auto hash_index = (size_t)(key & (GENSFEN_HASH_SIZE - 1));
-        auto old_key = hash[hash_index];
-        if (key == old_key)
-        {
-            return true;
-        }
-        else
-        {
-            // Replace with the current key.
-            hash[hash_index] = key;
-            return false;
-        }
-    }
-
-    // thread_id = 0..Threads.size()-1
-    void MultiThinkGenSfen::gensfen_worker(
+    // Write out the phases loaded in sfens to a file.
+    // result: win/loss in the next phase after the final phase in sfens
+    // 1 when winning. -1 when losing. Pass 0 for a draw.
+    // Return value: true if the specified number of
+    // sfens has already been reached and the process ends.
+    bool Gensfen::commit_psv(
         Thread& th,
+        PSVector& sfens,
+        int8_t result,
         std::atomic<uint64_t>& counter,
         uint64_t limit)
     {
-        // For the time being, it will be treated as a draw
-        // at the maximum number of steps to write.
-        // Maximum StateInfo + Search PV to advance to leaf buffer
-        std::vector<StateInfo, AlignedAllocator<StateInfo>> states(
-            params.write_maxply + MAX_PLY /* == search_depth_min + α */);
-
-        StateInfo si;
-
-        // end flag
-        bool quit = false;
-
-        // repeat until the specified number of times
-        while (!quit)
+        if (!params.write_out_draw_game_in_training_data_generation && result == 0)
         {
-            // It is necessary to set a dependent thread for Position.
-            // When parallelizing, Threads (since this is a vector<Thread*>,
-            // Do the same for up to Threads[0]...Threads[thread_num-1].
-            auto& pos = th.rootPos;
-            pos.set(StartFEN, false, &si, &th);
+            // We didn't write anything so why quit.
+            return false;
+        }
 
-            int resign_counter = 0;
-            bool should_resign = prng.rand(10) > 1;
-            // Vector for holding the sfens in the current simulated game.
-            PSVector a_psv;
-            a_psv.reserve(params.write_maxply + MAX_PLY);
+        // From the final stage (one step before) to the first stage, give information on the outcome of the game for each stage.
+        // The phases stored in sfens are assumed to be continuous (in order).
+        for (auto it = sfens.rbegin(); it != sfens.rend(); ++it)
+        {
+            // If is_win == 0 (draw), multiply by -1 and it will remain 0 (draw)
+            result = -result;
+            it->game_result = result;
+        }
 
-            // Precomputed flags. Used internally by choose_random_move.
-            vector<uint8_t> random_move_flag = generate_random_move_flags();
+        // Write sfens in move order to make potential compression easier
+        for (auto& sfen : sfens)
+        {
+            // Return true if there is already enough data generated.
+            const auto iter = counter.fetch_add(1);
+            if (iter >= limit)
+                return true;
 
-            // A counter that keeps track of the number of random moves
-            // When random_move_minply == -1, random moves are
-            // performed continuously, so use it at this time.
-            // Used internally by choose_random_move.
-            int actual_random_move_count = 0;
+            // because `iter` was done, now we do one more
+            maybe_report(iter + 1);
 
-            // Save history of move scores for adjudication
-            vector<int> move_hist_scores;
+            // Write out one sfen.
+            sfen_writer.write(th.thread_idx(), sfen);
+        }
 
-            auto flush_psv = [&](int8_t result) {
-                quit = commit_psv(th, a_psv, result, counter, limit);
-            };
+        return false;
+    }
 
-            for (int ply = 0; ; ++ply)
+    void Gensfen::report(uint64_t done, uint64_t new_done)
+    {
+        const auto now_time = now();
+        const TimePoint elapsed = now_time - last_stats_report_time + 1;
+
+        out
+            << endl
+            << done << " sfens, "
+            << new_done * 1000 / elapsed << " sfens/second, "
+            << "at " << now_string() << sync_endl;
+
+        last_stats_report_time = now_time;
+
+        out = sync_region_cout.new_region();
+    }
+
+    void Gensfen::maybe_report(uint64_t done)
+    {
+        if (done % REPORT_DOT_EVERY == 0)
+        {
+            std::lock_guard lock(stats_mutex);
+
+            if (last_stats_report_time == 0)
             {
-                // Current search depth
-                const int depth = params.search_depth_min + (int)prng.rand(params.search_depth_max - params.search_depth_min + 1);
+                last_stats_report_time = now();
+                out = sync_region_cout.new_region();
+            }
 
-                // Starting search calls init_for_search
-                auto [search_value, search_pv] = Search::search(pos, depth, 1, params.nodes);
+            if (done != 0)
+            {
+                out << '.';
 
-                // This has to be performed after search because it needs to know
-                // rootMoves which are filled in init_for_search.
-                const auto result = get_current_game_result(pos, move_hist_scores);
-                if (result.has_value())
+                if (done % REPORT_STATS_EVERY == 0)
                 {
-                    flush_psv(result.value());
-                    break;
+                    report(done, REPORT_STATS_EVERY);
                 }
-
-                // Always adjudivate by eval limit.
-                // Also because of this we don't have to check for TB/MATE scores
-                if (abs(search_value) >= params.eval_limit)
-                {
-                    resign_counter++;
-                    if ((should_resign && resign_counter >= 4) || abs(search_value) >= VALUE_KNOWN_WIN) {
-                        flush_psv((search_value >= params.eval_limit) ? 1 : -1);
-                        break;
-                    }
-                }
-                else
-                {
-                    resign_counter = 0;
-                }
-
-                // In case there is no PV and the game was not ended here
-                // there is nothing we can do, we can't continue the game,
-                // we don't know the result, so discard this game.
-                if (search_pv.empty())
-                {
-                    break;
-                }
-
-                // Save the move score for adjudication.
-                move_hist_scores.push_back(search_value);
-
-                // Discard stuff before write_minply is reached
-                // because it can harm training due to overfitting.
-                // Initial positions would be too common.
-                if (ply >= params.write_minply && !was_seen_before(pos))
-                {
-                    a_psv.emplace_back(PackedSfenValue());
-
-                    auto& psv = a_psv.back();
-
-                    // Here we only write the position data.
-                    // Result is added after the whole game is done.
-                    pos.sfen_pack(psv.sfen);
-
-                    psv.score = search_value;
-                    psv.gamePly = ply;
-                    psv.move = search_pv[0];
-                }
-
-                // Update the next move according to best search result or random move.
-                auto random_move = choose_random_move(pos, random_move_flag, ply, actual_random_move_count);
-                const Move next_move = random_move.has_value() ? *random_move : search_pv[0];
-
-                // We don't have the whole game yet, but it ended,
-                // so the writing process ends and the next game starts.
-                // This shouldn't really happen.
-                if (!is_ok(next_move))
-                {
-                    break;
-                }
-
-                // Do move.
-                pos.do_move(next_move, states[ply]);
-
             }
         }
     }
 
-    void set_gensfen_search_limits()
-    {
-        // About Search::Limits
-        // Be careful because this member variable is global and affects other threads.
-        auto& limits = Search::Limits;
-
-        // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done)
-        limits.infinite = true;
-
-        // Since PV is an obstacle when displayed, erase it.
-        limits.silent = true;
-
-        // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
-        limits.nodes = 0;
-
-        // depth is also processed by the one passed as an argument of Learner::search().
-        limits.depth = 0;
-    }
-
-    // -----------------------------------
-    // Command to generate a game record (master thread)
-    // -----------------------------------
-
     // Command to generate a game record
     void gensfen(istringstream& is)
     {
         // Number of generated game records default = 8 billion phases (Ponanza specification)
         uint64_t loop_max = 8000000000UL;
 
-        MultiThinkGenSfen::Params params;
+        Gensfen::Params params;
 
         // Add a random number to the end of the file name.
         bool random_file_name = false;
@@ -978,9 +970,7 @@ namespace Learner
             else if (sfen_format == "binpack")
                 params.sfen_format = SfenOutputType::Binpack;
             else
-            {
                 cout << "WARNING: Unknown sfen format `" << sfen_format << "`. Using bin\n";
-            }
         }
 
         if (random_file_name)
@@ -988,9 +978,11 @@ namespace Learner
             // Give a random number to output_file_name at this point.
             // Do not use std::random_device().  Because it always the same integers on MinGW.
             PRNG r(params.seed);
+
             // Just in case, reassign the random numbers.
             for (int i = 0; i < 10; ++i)
                 r.rand(1);
+
             auto to_hex = [](uint64_t u) {
                 std::stringstream ss;
                 ss << std::hex << u;
@@ -1034,10 +1026,8 @@ namespace Learner
 
         Threads.main()->ponder = false;
 
-        set_gensfen_search_limits();
-
-        MultiThinkGenSfen multi_think(params);
-        multi_think.gensfen(loop_max);
+        Gensfen gensfen(params);
+        gensfen.generate(loop_max);
 
         std::cout << "INFO: Gensfen finished." << endl;
     }

From 65e443954a14bb38b7e68c7827c148313fc78176 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 25 Oct 2020 19:14:19 +0100
Subject: [PATCH 396/583] Update expected gensfen finished responses.

---
 tests/instrumented_learn.sh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/instrumented_learn.sh b/tests/instrumented_learn.sh
index 50b6e4ae..07f5f98b 100755
--- a/tests/instrumented_learn.sh
+++ b/tests/instrumented_learn.sh
@@ -79,11 +79,11 @@ cat << EOF > gensfen01.exp
  send "setoption name Use NNUE value false\n"
  send "isready\n"
  send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data/training_data.bin sfen_format bin\n"
- expect "gensfen finished."
+ expect "INFO: Gensfen finished."
  send "convert_plain targetfile training_data/training_data.bin output_file_name training_data.txt\n"
  expect "all done"
  send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data/training_data.binpack sfen_format binpack\n"
- expect "gensfen finished."
+ expect "INFO: Gensfen finished."
 
  send "quit\n"
  expect eof
@@ -105,9 +105,9 @@ cat << EOF > gensfen02.exp
  send "setoption name Use NNUE value true\n"
  send "isready\n"
  send "gensfen depth 4 loop 50 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name validation_data/validation_data.bin sfen_format bin\n"
- expect "gensfen finished."
+ expect "INFO: Gensfen finished."
  send "gensfen depth 4 loop 50 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name validation_data/validation_data.binpack sfen_format binpack\n"
- expect "gensfen finished."
+ expect "INFO: Gensfen finished."
 
  send "quit\n"
  expect eof
@@ -129,7 +129,7 @@ cat << EOF > learn01.exp
  send "isready\n"
  send "learn targetdir training_data epochs 1 sfen_read_size 100 thread_buffer_size 10 batchsize 100 use_draw_in_training 1 use_draw_in_validation 1 lr 1 eval_limit 32000 nn_batch_size 30 newbob_decay 0.5 eval_save_interval 30 loss_output_interval 10 validation_set_file_name validation_data/validation_data.bin\n"
 
- expect "INFO (save_eval): Saving current evaluation file in"
+ expect "INFO (save_eval): Finished saving evaluation file in"
 
  send "quit\n"
  expect eof

From e515f1f61f880caddef0cf7a09a767d0204d0dd4 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 22:56:42 +0200
Subject: [PATCH 397/583] Move SfenWriter to a separate file

---
 src/learn/gensfen.cpp   | 186 +-----------------------------------
 src/learn/sfen_writer.h | 206 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 208 insertions(+), 184 deletions(-)
 create mode 100644 src/learn/sfen_writer.h

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 1dddac5a..4accb882 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -1,7 +1,7 @@
 ﻿#include "gensfen.h"
 
+#include "sfen_writer.h"
 #include "packed_sfen.h"
-#include "sfen_stream.h"
 
 #include "misc.h"
 #include "position.h"
@@ -16,6 +16,7 @@
 
 #include "syzygy/tbprobe.h"
 
+#include <atomic>
 #include <chrono>
 #include <climits>
 #include <cmath>
@@ -28,7 +29,6 @@
 #include <memory>
 #include <optional>
 #include <random>
-#include <regex>
 #include <shared_mutex>
 #include <sstream>
 #include <unordered_set>
@@ -37,188 +37,6 @@ using namespace std;
 
 namespace Learner
 {
-    // Helper class for exporting Sfen
-    struct SfenWriter
-    {
-        // Amount of sfens required to flush the buffer.
-        static constexpr size_t SFEN_WRITE_SIZE = 5000;
-
-        // File name to write and number of threads to create
-        SfenWriter(string filename_, int thread_num, uint64_t save_count, SfenOutputType sfen_output_type)
-        {
-            sfen_buffers_pool.reserve((size_t)thread_num * 10);
-            sfen_buffers.resize(thread_num);
-
-            auto out = sync_region_cout.new_region();
-            out << "INFO (sfen_writer): Creating new data file at " << filename_ << endl;
-
-            sfen_format = sfen_output_type;
-            output_file_stream = create_new_sfen_output(filename_, sfen_format);
-            filename = filename_;
-            save_every = save_count;
-
-            finished = false;
-
-            file_worker_thread = std::thread([&] { this->file_write_worker(); });
-        }
-
-        ~SfenWriter()
-        {
-            flush();
-
-            finished = true;
-            file_worker_thread.join();
-            output_file_stream.reset();
-
-#if !defined(NDEBUG)
-            {
-                // All buffers should be empty since file_worker_thread
-                // should have written everything before exiting.
-                for (const auto& p : sfen_buffers) { assert(p == nullptr); (void)p ; }
-                assert(sfen_buffers_pool.empty());
-            }
-#endif
-        }
-
-        void write(size_t thread_id, const PackedSfenValue& psv)
-        {
-            // We have a buffer for each thread and add it there.
-            // If the buffer overflows, write it to a file.
-
-            // This buffer is prepared for each thread.
-            auto& buf = sfen_buffers[thread_id];
-
-            // Secure since there is no buf at the first time
-            // and immediately after writing the thread buffer.
-            if (!buf)
-            {
-                buf = std::make_unique<PSVector>();
-                buf->reserve(SFEN_WRITE_SIZE);
-            }
-
-            // Buffer is exclusive to this thread.
-            // There is no need for a critical section.
-            buf->push_back(psv);
-
-            if (buf->size() >= SFEN_WRITE_SIZE)
-            {
-                // If you load it in sfen_buffers_pool, the worker will do the rest.
-
-                // Critical section since sfen_buffers_pool is shared among threads.
-                std::unique_lock<std::mutex> lk(mutex);
-                sfen_buffers_pool.emplace_back(std::move(buf));
-            }
-        }
-
-        void flush()
-        {
-            for (size_t i = 0; i < sfen_buffers.size(); ++i)
-            {
-                flush(i);
-            }
-        }
-
-        // Move what remains in the buffer for your thread to a buffer for writing to a file.
-        void flush(size_t thread_id)
-        {
-            std::unique_lock<std::mutex> lk(mutex);
-
-            auto& buf = sfen_buffers[thread_id];
-
-            // There is a case that buf==nullptr, so that check is necessary.
-            if (buf && buf->size() != 0)
-            {
-                sfen_buffers_pool.emplace_back(std::move(buf));
-            }
-        }
-
-        // Dedicated thread to write to file
-        void file_write_worker()
-        {
-            while (!finished || sfen_buffers_pool.size())
-            {
-                vector<std::unique_ptr<PSVector>> buffers;
-                {
-                    std::unique_lock<std::mutex> lk(mutex);
-
-                    // Atomically swap take the filled buffers and
-                    // create a new buffer pool for threads to fill.
-                    buffers = std::move(sfen_buffers_pool);
-                    sfen_buffers_pool = std::vector<std::unique_ptr<PSVector>>();
-                }
-
-                if (!buffers.size())
-                {
-                    // Poor man's condition variable.
-                    sleep(100);
-                }
-                else
-                {
-                    for (auto& buf : buffers)
-                    {
-                        output_file_stream->write(*buf);
-
-                        sfen_write_count += buf->size();
-
-                        // Add the processed number here, and if it exceeds save_every,
-                        // change the file name and reset this counter.
-                        sfen_write_count_current_file += buf->size();
-                        if (sfen_write_count_current_file >= save_every)
-                        {
-                            sfen_write_count_current_file = 0;
-
-                            // Sequential number attached to the file
-                            int n = (int)(sfen_write_count / save_every);
-
-                            // Rename the file and open it again.
-                            // Add ios::app in consideration of overwriting.
-                            // (Depending on the operation, it may not be necessary.)
-                            string new_filename = filename + "_" + std::to_string(n);
-                            output_file_stream = create_new_sfen_output(new_filename, sfen_format);
-
-                            auto out = sync_region_cout.new_region();
-                            out << "INFO (sfen_writer): Creating new data file at " << new_filename << endl;
-                        }
-                    }
-                }
-            }
-        }
-
-    private:
-
-        std::unique_ptr<BasicSfenOutputStream> output_file_stream;
-
-        // A new net is saved after every save_every sfens are processed.
-        uint64_t save_every = std::numeric_limits<uint64_t>::max();
-
-        // File name passed in the constructor
-        std::string filename;
-
-        // Thread to write to the file
-        std::thread file_worker_thread;
-
-        // Flag that all threads have finished
-        atomic<bool> finished;
-
-        SfenOutputType sfen_format;
-
-        // buffer before writing to file
-        // sfen_buffers is the buffer for each thread
-        // sfen_buffers_pool is a buffer for writing.
-        // After loading the phase in the former buffer by SFEN_WRITE_SIZE,
-        // transfer it to the latter.
-        std::vector<std::unique_ptr<PSVector>> sfen_buffers;
-        std::vector<std::unique_ptr<PSVector>> sfen_buffers_pool;
-
-        // Mutex required to access sfen_buffers_pool
-        std::mutex mutex;
-
-        // Number of sfens written in total, and the
-        // number of sfens written in the current file.
-        uint64_t sfen_write_count = 0;
-        uint64_t sfen_write_count_current_file = 0;
-    };
-
     // Class to generate sfen with multiple threads
     struct Gensfen
     {
diff --git a/src/learn/sfen_writer.h b/src/learn/sfen_writer.h
new file mode 100644
index 00000000..1bbd916c
--- /dev/null
+++ b/src/learn/sfen_writer.h
@@ -0,0 +1,206 @@
+#include "packed_sfen.h"
+#include "sfen_stream.h"
+
+#include "misc.h"
+
+#include "extra/nnue_data_binpack_format.h"
+
+#include "syzygy/tbprobe.h"
+
+#include <cstring>
+#include <filesystem>
+#include <fstream>
+#include <limits>
+#include <list>
+#include <memory>
+#include <optional>
+#include <shared_mutex>
+#include <thread>
+#include <atomic>
+
+using namespace std;
+
+namespace Learner {
+
+    // Helper class for exporting Sfen
+    struct SfenWriter
+    {
+        // Amount of sfens required to flush the buffer.
+        static constexpr size_t SFEN_WRITE_SIZE = 5000;
+
+        // File name to write and number of threads to create
+        SfenWriter(string filename_, int thread_num, uint64_t save_count, SfenOutputType sfen_output_type)
+        {
+            sfen_buffers_pool.reserve((size_t)thread_num * 10);
+            sfen_buffers.resize(thread_num);
+
+            auto out = sync_region_cout.new_region();
+            out << "INFO (sfen_writer): Creating new data file at " << filename_ << endl;
+
+            sfen_format = sfen_output_type;
+            output_file_stream = create_new_sfen_output(filename_, sfen_format);
+            filename = filename_;
+            save_every = save_count;
+
+            finished = false;
+
+            file_worker_thread = std::thread([&] { this->file_write_worker(); });
+        }
+
+        ~SfenWriter()
+        {
+            flush();
+
+            finished = true;
+            file_worker_thread.join();
+            output_file_stream.reset();
+
+#if !defined(NDEBUG)
+            {
+                // All buffers should be empty since file_worker_thread
+                // should have written everything before exiting.
+                for (const auto& p : sfen_buffers) { assert(p == nullptr); (void)p ; }
+                assert(sfen_buffers_pool.empty());
+            }
+#endif
+        }
+
+        void write(size_t thread_id, const PackedSfenValue& psv)
+        {
+            // We have a buffer for each thread and add it there.
+            // If the buffer overflows, write it to a file.
+
+            // This buffer is prepared for each thread.
+            auto& buf = sfen_buffers[thread_id];
+
+            // Secure since there is no buf at the first time
+            // and immediately after writing the thread buffer.
+            if (!buf)
+            {
+                buf = std::make_unique<PSVector>();
+                buf->reserve(SFEN_WRITE_SIZE);
+            }
+
+            // Buffer is exclusive to this thread.
+            // There is no need for a critical section.
+            buf->push_back(psv);
+
+            if (buf->size() >= SFEN_WRITE_SIZE)
+            {
+                // If you load it in sfen_buffers_pool, the worker will do the rest.
+
+                // Critical section since sfen_buffers_pool is shared among threads.
+                std::unique_lock<std::mutex> lk(mutex);
+                sfen_buffers_pool.emplace_back(std::move(buf));
+            }
+        }
+
+        void flush()
+        {
+            for (size_t i = 0; i < sfen_buffers.size(); ++i)
+            {
+                flush(i);
+            }
+        }
+
+        // Move what remains in the buffer for your thread to a buffer for writing to a file.
+        void flush(size_t thread_id)
+        {
+            std::unique_lock<std::mutex> lk(mutex);
+
+            auto& buf = sfen_buffers[thread_id];
+
+            // There is a case that buf==nullptr, so that check is necessary.
+            if (buf && buf->size() != 0)
+            {
+                sfen_buffers_pool.emplace_back(std::move(buf));
+            }
+        }
+
+        // Dedicated thread to write to file
+        void file_write_worker()
+        {
+            while (!finished || sfen_buffers_pool.size())
+            {
+                vector<std::unique_ptr<PSVector>> buffers;
+                {
+                    std::unique_lock<std::mutex> lk(mutex);
+
+                    // Atomically swap take the filled buffers and
+                    // create a new buffer pool for threads to fill.
+                    buffers = std::move(sfen_buffers_pool);
+                    sfen_buffers_pool = std::vector<std::unique_ptr<PSVector>>();
+                }
+
+                if (!buffers.size())
+                {
+                    // Poor man's condition variable.
+                    sleep(100);
+                }
+                else
+                {
+                    for (auto& buf : buffers)
+                    {
+                        output_file_stream->write(*buf);
+
+                        sfen_write_count += buf->size();
+
+                        // Add the processed number here, and if it exceeds save_every,
+                        // change the file name and reset this counter.
+                        sfen_write_count_current_file += buf->size();
+                        if (sfen_write_count_current_file >= save_every)
+                        {
+                            sfen_write_count_current_file = 0;
+
+                            // Sequential number attached to the file
+                            int n = (int)(sfen_write_count / save_every);
+
+                            // Rename the file and open it again.
+                            // Add ios::app in consideration of overwriting.
+                            // (Depending on the operation, it may not be necessary.)
+                            string new_filename = filename + "_" + std::to_string(n);
+                            output_file_stream = create_new_sfen_output(new_filename, sfen_format);
+
+                            auto out = sync_region_cout.new_region();
+                            out << "INFO (sfen_writer): Creating new data file at " << new_filename << endl;
+                        }
+                    }
+                }
+            }
+        }
+
+    private:
+
+        std::unique_ptr<BasicSfenOutputStream> output_file_stream;
+
+        // A new net is saved after every save_every sfens are processed.
+        uint64_t save_every = std::numeric_limits<uint64_t>::max();
+
+        // File name passed in the constructor
+        std::string filename;
+
+        // Thread to write to the file
+        std::thread file_worker_thread;
+
+        // Flag that all threads have finished
+        atomic<bool> finished;
+
+        SfenOutputType sfen_format;
+
+        // buffer before writing to file
+        // sfen_buffers is the buffer for each thread
+        // sfen_buffers_pool is a buffer for writing.
+        // After loading the phase in the former buffer by SFEN_WRITE_SIZE,
+        // transfer it to the latter.
+        std::vector<std::unique_ptr<PSVector>> sfen_buffers;
+        std::vector<std::unique_ptr<PSVector>> sfen_buffers_pool;
+
+        // Mutex required to access sfen_buffers_pool
+        std::mutex mutex;
+
+        // Number of sfens written in total, and the
+        // number of sfens written in the current file.
+        uint64_t sfen_write_count = 0;
+        uint64_t sfen_write_count_current_file = 0;
+    };
+}

From e01397c674a843e2f90623b50d92dca6712b3f63 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 25 Oct 2020 10:43:45 +0100
Subject: [PATCH 398/583] Remove multi_think

---
 src/Makefile              |   3 +-
 src/learn/convert.cpp     |   2 -
 src/learn/multi_think.cpp |  98 ------------------------
 src/learn/multi_think.h   | 152 --------------------------------------
 4 files changed, 1 insertion(+), 254 deletions(-)
 delete mode 100644 src/learn/multi_think.cpp
 delete mode 100644 src/learn/multi_think.h

diff --git a/src/Makefile b/src/Makefile
index 0b2f99ed..f2c4d269 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -59,8 +59,7 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
 	learn/sfen_packer.cpp \
 	learn/learn.cpp \
 	learn/gensfen.cpp \
-	learn/convert.cpp \
-	learn/multi_think.cpp
+	learn/convert.cpp
 
 OBJS = $(notdir $(SRCS:.cpp=.o))
 
diff --git a/src/learn/convert.cpp b/src/learn/convert.cpp
index a7528b02..dfd30509 100644
--- a/src/learn/convert.cpp
+++ b/src/learn/convert.cpp
@@ -1,7 +1,5 @@
 #include "convert.h"
 
-#include "multi_think.h"
-
 #include "uci.h"
 #include "misc.h"
 #include "thread.h"
diff --git a/src/learn/multi_think.cpp b/src/learn/multi_think.cpp
deleted file mode 100644
index bf1ab29b..00000000
--- a/src/learn/multi_think.cpp
+++ /dev/null
@@ -1,98 +0,0 @@
-﻿#include "multi_think.h"
-
-#include "tt.h"
-#include "uci.h"
-#include "types.h"
-#include "search.h"
-
-#include "nnue/evaluate_nnue.h"
-
-#include <thread>
-
-void MultiThink::go_think()
-{
-    // Call the derived class's init().
-    init();
-
-    // The loop upper limit is set with set_loop_max().
-    loop_count = 0;
-    done_count = 0;
-
-    // Create threads as many as Options["Threads"] and start thinking.
-    std::vector<std::thread> threads;
-    auto thread_num = (size_t)Options["Threads"];
-
-    // Secure end flag of worker thread
-        threads_finished=0;
-
-    // start worker thread
-    for (size_t i = 0; i < thread_num; ++i)
-    {
-        threads.push_back(std::thread([i, this]
-        {
-            // exhaust all processor threads.
-            WinProcGroup::bindThisThread(i);
-
-            // execute the overridden process
-            this->thread_worker(i);
-
-            // Set the end flag because the thread has ended
-            this->threads_finished++;
-        }));
-    }
-
-    // wait for all threads to finish
-    // for (auto& th :threads)
-    // th.join();
-    // If you write like, the thread will rush here while it is still working,
-    // During that time, callback_func() cannot be called and you cannot save.
-    // Therefore, you need to check the end flag yourself.
-
-    // function to determine if all threads have finished
-    auto threads_done = [&]()
-    {
-        return threads_finished == thread_num;
-    };
-
-    // Call back if the callback function is set.
-    auto do_a_callback = [&]()
-    {
-        if (callback_func)
-            callback_func();
-    };
-
-
-    for (uint64_t i = 0 ; ; )
-    {
-        // If all threads have finished, exit the loop.
-        if (threads_done())
-            break;
-
-        sleep(1000);
-
-        // callback_func() is called every callback_seconds.
-        if (++i == callback_seconds)
-        {
-            do_a_callback();
-            // Since I am returning from ↑, I reset the counter, so
-            // no matter how long it takes to save() etc. in do_a_callback()
-            // The next call will take a certain amount of time.
-            i = 0;
-        }
-    }
-
-    // Last save.
-    std::cout << std::endl << "finalize..";
-
-    // do_a_callback();
-    // → It should be saved by the caller, so I feel that it is not necessary here.
-
-    // It is possible that the exit code of the thread is running but the exit code of the thread is running, so
-    // We need to wait for the end with join().
-    for (auto& th : threads)
-        th.join();
-
-    // The file writing thread etc. are still running only when all threads are finished
-    // Since the work itself may not have completed, output only that all threads have finished.
-    std::cout << "all threads are joined." << std::endl;
-}
diff --git a/src/learn/multi_think.h b/src/learn/multi_think.h
deleted file mode 100644
index 7e541909..00000000
--- a/src/learn/multi_think.h
+++ /dev/null
@@ -1,152 +0,0 @@
-﻿#ifndef _MULTI_THINK_
-#define _MULTI_THINK_
-
-#include "learn.h"
-
-#include "misc.h"
-#include "thread_win32_osx.h"
-
-#include <atomic>
-#include <limits>
-#include <functional>
-#include <mutex>
-#include <string>
-#include <cstdint>
-
-
-// Learning from a game record, when making yourself think and generating a fixed track, etc.
-// Helper class used when multiple threads want to call Search::think() individually.
-// Derive and use this class.
-struct MultiThink
-{
-    static constexpr std::uint64_t LOOP_COUNT_FINISHED = std::numeric_limits<std::uint64_t>::max();
-
-    MultiThink() : prng{}, loop_count(0) { }
-
-    MultiThink(std::uint64_t seed) : prng(seed), loop_count(0) { }
-
-    MultiThink(const std::string& seed) : prng(seed), loop_count(0) { }
-
-    // Call this function from the master thread, each thread will think,
-    // Return control when the thought ending condition is satisfied.
-    // Do something else.
-    // ・It is safe for each thread to call Learner::search(),qsearch()
-    // Separates the substitution table for each thread. (It will be restored after the end.)
-    // ・Book is not thread safe when in on the fly mode, so temporarily change this mode.
-    // Turn it off.
-    // [Requirements]
-    // 1) Override thread_worker()
-    // 2) Set the loop count with set_loop_max()
-    // 3) set a function to be called back periodically (if necessary)
-    // callback_func and callback_interval
-    void go_think();
-
-    // If there is something you want to initialize on the derived class side, override this,
-    // Called when initialization is completed with go_think().
-    // It is better to read the fixed trace at that timing.
-    virtual void init() {}
-
-    // A thread worker that is called by creating a thread when you go_think()
-    // Override and use this.
-    virtual void thread_worker(size_t thread_id) = 0;
-
-    // Called back every callback_seconds [seconds] when go_think().
-    std::function<void()> callback_func;
-    uint64_t callback_seconds = 600;
-
-    // Set the number of times worker processes (calls Search::think()).
-    void set_loop_max(uint64_t loop_max_) { loop_max = loop_max_; }
-
-    // Get the value set by set_loop_max().
-    uint64_t get_loop_max() const { return loop_max; }
-
-    // [ASYNC] Take the value of the loop counter and add the loop counter after taking it out.
-    // If the loop counter has reached loop_max, return UINT64_MAX.
-    // If you want to generate a phase, you must call this function at the time of generating the phase,
-    // Please note that the number of generated phases and the value of the counter will not match.
-    uint64_t get_next_loop_count() {
-        std::unique_lock<std::mutex> lk(loop_mutex);
-        if (loop_count >= loop_max)
-            return LOOP_COUNT_FINISHED;
-        return loop_count++;
-    }
-
-    // [ASYNC] For returning the processed number. Each time it is called, it returns a counter that is incremented.
-    uint64_t get_done_count() {
-        std::unique_lock<std::mutex> lk(loop_mutex);
-        return ++done_count;
-    }
-
-    // Mutex when worker thread accesses I/O
-    std::mutex io_mutex;
-
-protected:
-    // Random number generator body
-    AsyncPRNG prng;
-
-private:
-    // number of times worker processes (calls Search::think())
-    std::atomic<uint64_t> loop_max;
-    // number of times the worker has processed (calls Search::think())
-    std::atomic<uint64_t> loop_count;
-    // To return the number of times it has been processed.
-    std::atomic<uint64_t> done_count;
-
-    // Mutex when changing the variables in ↑
-    std::mutex loop_mutex;
-
-    // Thread end flag.
-    std::atomic<uint64_t> threads_finished;
-};
-
-// Mechanism to process task during idle time.
-// master passes the task with push_task_async() whenever you like.
-// When slave executes on_idle() in its spare time, it retrieves one task and continues execution until there is no queue.
-// Convenient to use when you want to write MultiThink thread worker in master-slave method.
-struct TaskDispatcher
-{
-    typedef std::function<void(size_t /* thread_id */)> Task;
-
-    // slave calls this function during idle.
-    void on_idle(size_t thread_id)
-    {
-        Task task;
-        while ((task = get_task_async()) != nullptr)
-            task(thread_id);
-
-        sleep(1);
-    }
-
-    // Stack [ASYNC] task.
-    void push_task_async(Task task)
-    {
-        std::unique_lock<std::mutex> lk(task_mutex);
-        tasks.push_back(task);
-    }
-
-    // Allocate size array elements for task in advance.
-    void task_reserve(size_t size)
-    {
-        tasks.reserve(size);
-    }
-
-protected:
-    // set of tasks
-    std::vector<Task> tasks;
-
-    // Take out one [ASYNC] task. Called from on_idle().
-    Task get_task_async()
-    {
-        std::unique_lock<std::mutex> lk(task_mutex);
-        if (tasks.size() == 0)
-            return nullptr;
-        Task task = *tasks.rbegin();
-        tasks.pop_back();
-        return task;
-    }
-
-    // a mutex for accessing tasks
-    std::mutex task_mutex;
-};
-
-#endif

From ba390a7f9a0a0243531a2489ab4f4303a26deca4 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 26 Oct 2020 13:52:35 +0100
Subject: [PATCH 399/583] Print the used factorizer when intializing training.

---
 src/nnue/evaluate_nnue_learner.cpp               |  5 +++++
 src/nnue/trainer/features/factorizer.h           |  8 ++++++++
 .../trainer/features/factorizer_feature_set.h    | 16 ++++++++++++++++
 src/nnue/trainer/features/factorizer_half_kp.h   |  8 ++++++++
 4 files changed, 37 insertions(+)

diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 7a72ea19..6775707d 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -68,6 +68,11 @@ namespace Eval::NNUE {
 
         out << std::endl;
 
+        out << "Factorizers:\n"
+            << Features::Factorizer<RawFeatures>::get_factorizers_string() << std::endl;
+
+        out << std::endl;
+
         assert(feature_transformer);
         assert(network);
 
diff --git a/src/nnue/trainer/features/factorizer.h b/src/nnue/trainer/features/factorizer.h
index 49a2fe26..15ce8022 100644
--- a/src/nnue/trainer/features/factorizer.h
+++ b/src/nnue/trainer/features/factorizer.h
@@ -13,6 +13,14 @@ namespace Eval::NNUE::Features {
     template <typename FeatureType>
     class Factorizer {
     public:
+        static constexpr std::string get_name() {
+            return std::string("No factorizer");
+        }
+
+        static constexpr std::string get_factorizers_string() {
+            return "  - " + get_name();
+        }
+
         // Get the dimensionality of the learning feature
         static constexpr IndexType get_dimensions() {
             return FeatureType::kDimensions;
diff --git a/src/nnue/trainer/features/factorizer_feature_set.h b/src/nnue/trainer/features/factorizer_feature_set.h
index 032a449b..f5ee3c5c 100644
--- a/src/nnue/trainer/features/factorizer_feature_set.h
+++ b/src/nnue/trainer/features/factorizer_feature_set.h
@@ -21,6 +21,14 @@ namespace Eval::NNUE::Features {
         static constexpr IndexType kBaseDimensions =
             FeatureSet<FirstFeatureType, RemainingFeatureTypes...>::kDimensions;
 
+        static constexpr std::string get_factorizers_string() {
+            std::string str = "  - ";
+            str += Head::get_name();
+            str += '\n';
+            str += Tail::get_factorizers_string();
+            return str;
+        }
+
         // Get the dimensionality of the learning feature
         static constexpr IndexType get_dimensions() {
             return Head::get_dimensions() + Tail::get_dimensions();
@@ -73,6 +81,14 @@ namespace Eval::NNUE::Features {
         // number of dimensions of original input features
         static constexpr IndexType kBaseDimensions = FeatureType::kDimensions;
 
+        static constexpr std::string get_name() {
+            return FeatureType::kName;
+        }
+
+        static constexpr std::string get_factorizers_string() {
+            return "  - " + get_name();
+        }
+
         // Get the dimensionality of the learning feature
         static constexpr IndexType get_dimensions() {
             return Factorizer<FeatureType>::get_dimensions();
diff --git a/src/nnue/trainer/features/factorizer_half_kp.h b/src/nnue/trainer/features/factorizer_half_kp.h
index 152722ac..601ddfa5 100644
--- a/src/nnue/trainer/features/factorizer_half_kp.h
+++ b/src/nnue/trainer/features/factorizer_half_kp.h
@@ -45,6 +45,14 @@ namespace Eval::NNUE::Features {
         static_assert(get_array_length(kProperties) == kNumTrainingFeatureTypes, "");
 
     public:
+        static constexpr std::string get_name() {
+            return std::string("Factorizer<") + FeatureType::kName + ">";
+        }
+
+        static constexpr std::string get_factorizers_string() {
+            return "  - " + get_name();
+        }
+
         // Get the dimensionality of the learning feature
         static constexpr IndexType get_dimensions() {
             return get_active_dimensions(kProperties);

From f7de49eb66b07a0ab65c32184d22af4abeced378 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 23:16:11 +0200
Subject: [PATCH 400/583] Create a collective parameter struct for learner.

---
 src/learn/learn.cpp | 374 +++++++++++++++++++-------------------------
 1 file changed, 162 insertions(+), 212 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 3faab0ea..e9eb1141 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -77,10 +77,6 @@ T operator -= (std::atomic<T>& x, const T rhs) { return x += -rhs; }
 
 namespace Learner
 {
-    static bool use_draw_games_in_training = true;
-    static bool use_draw_games_in_validation = true;
-    static bool skip_duplicated_positions_in_training = true;
-
     static double winning_probability_coefficient = 1.0 / PawnValueEg / 4.0 * std::log(10.0);
 
     // Score scale factors. ex) If we set src_score_min_value = 0.0,
@@ -373,37 +369,94 @@ namespace Learner
     // Class to generate sfen with multiple threads
     struct LearnerThink
     {
+        struct Params
+        {
+            // Mini batch size size. Be sure to set it on the side that uses this class.
+            uint64_t mini_batch_size = LEARN_MINI_BATCH_SIZE;
+
+            // Option to exclude early stage from learning
+            int reduction_gameply = 1;
+
+            // If the absolute value of the evaluation value of the deep search
+            // of the teacher phase exceeds this value, discard the teacher phase.
+            int eval_limit = 32000;
+
+            // Flag whether to dig a folder each time the evaluation function is saved.
+            // If true, do not dig the folder.
+            bool save_only_once = false;
+
+            bool shuffle = true;
+
+            bool verbose = false;
+
+            double newbob_decay = 0.5;
+            int newbob_num_trials = 4;
+            uint64_t auto_lr_drop = 0;
+
+            std::string best_nn_directory;
+
+            uint64_t eval_save_interval = LEARN_EVAL_SAVE_INTERVAL;
+            uint64_t loss_output_interval = 1'000'000;
+
+            size_t sfen_read_size = SfenReader::DEFAULT_SFEN_READ_SIZE;
+            size_t thread_buffer_size = SfenReader::DEFAULT_THREAD_BUFFER_SIZE;
+
+            bool use_draw_games_in_training = true;
+            bool use_draw_games_in_validation = true;
+            bool skip_duplicated_positions_in_training = true;
+
+            string validation_set_file_name;
+            string seed;
+
+            std::vector<std::string> filenames;
+
+            uint64_t num_threads;
+
+            void enforce_constraints()
+            {
+                num_threads = Options["Threads"];
+
+                if (loss_output_interval == 0)
+                {
+                    loss_output_interval = LEARN_RMSE_OUTPUT_INTERVAL * mini_batch_size;
+                }
+
+                // If reduction_gameply is set to 0, rand(0) will be divided by 0, so correct it to 1.
+                reduction_gameply = max(reduction_gameply, 1);
+
+                if (newbob_decay != 1.0 && !Options["SkipLoadingEval"]) {
+                    // Save the current net to [EvalSaveDir]\original.
+                    Eval::NNUE::save_eval("original");
+
+                    // Set the folder above to best_nn_directory so that the trainer can
+                    // resotre the network parameters from the original net file.
+                    best_nn_directory =
+                        Path::combine(Options["EvalSaveDir"], "original");
+                }
+            }
+        };
+
         // Number of phases used for calculation such as mse
         // mini-batch size = 1M is standard, so 0.2% of that should be negligible in terms of time.
         // Since search() is performed with depth = 1 in calculation of
         // move match rate, simple comparison is not possible...
         static constexpr uint64_t sfen_for_mse_size = 2000;
 
-        LearnerThink(
-            const std::vector<std::string>& filenames,
-            bool shuffle,
-            uint64_t thread_num,
-            const std::string& seed,
-            size_t read_size,
-            size_t buffer_size
-        ) :
-            prng(seed),
+        LearnerThink(const Params& prm) :
+            params(prm),
+            prng(prm.seed),
             sr(
-                filenames,
-                shuffle,
+                prm.filenames,
+                prm.shuffle,
                 SfenReaderMode::Cyclic,
-                thread_num,
+                prm.num_threads,
                 std::to_string(prng.next_random_seed()),
-                read_size,
-                buffer_size),
+                prm.sfen_read_size,
+                prm.thread_buffer_size),
             learn_loss_sum{}
         {
-            save_only_once = false;
             save_count = 0;
             loss_output_count = 0;
-            newbob_decay = 1.0;
-            newbob_num_trials = 2;
-            auto_lr_drop = 0;
             last_lr_drop = 0;
             best_loss = std::numeric_limits<double>::infinity();
             latest_loss_sum = 0.0;
@@ -413,34 +466,6 @@ namespace Learner
 
         void learn(uint64_t epochs);
 
-
-        std::string validation_set_file_name;
-
-        // Mini batch size size. Be sure to set it on the side that uses this class.
-        uint64_t mini_batch_size = LEARN_MINI_BATCH_SIZE;
-
-        // Option to exclude early stage from learning
-        int reduction_gameply;
-
-        // If the absolute value of the evaluation value of the deep search
-        // of the teacher phase exceeds this value, discard the teacher phase.
-        int eval_limit;
-
-        // Flag whether to dig a folder each time the evaluation function is saved.
-        // If true, do not dig the folder.
-        bool save_only_once;
-
-        bool verbose;
-
-        double newbob_decay;
-        int newbob_num_trials;
-        uint64_t auto_lr_drop;
-
-        std::string best_nn_directory;
-
-        uint64_t eval_save_interval;
-        uint64_t loss_output_interval;
-
     private:
         void learn_worker(Thread& th, std::atomic<uint64_t>& counter, uint64_t limit);
 
@@ -462,6 +487,8 @@ namespace Learner
         // save merit function parameters to a file
         bool save(bool is_final = false);
 
+        Params params;
+
         PRNG prng;
 
         // sfen reader
@@ -493,11 +520,14 @@ namespace Learner
         Eval::NNUE::verify_any_net_loaded();
 
         const PSVector sfen_for_mse =
-            validation_set_file_name.empty()
+            params.validation_set_file_name.empty()
             ? sr.read_for_mse(sfen_for_mse_size)
-            : sr.read_validation_set(validation_set_file_name, eval_limit, use_draw_games_in_validation);
+            : sr.read_validation_set(
+                params.validation_set_file_name,
+                params.eval_limit,
+                params.use_draw_games_in_validation);
 
-        if (validation_set_file_name.empty()
+        if (params.validation_set_file_name.empty()
             && sfen_for_mse.size() != sfen_for_mse_size)
         {
             auto out = sync_region_cout.new_region();
@@ -508,7 +538,7 @@ namespace Learner
             return;
         }
 
-        if (newbob_decay != 1.0) {
+        if (params.newbob_decay != 1.0) {
 
             calc_loss(sfen_for_mse, 0);
 
@@ -527,10 +557,10 @@ namespace Learner
             std::atomic<uint64_t> counter{0};
 
             Threads.execute_with_workers([this, &counter](auto& th){
-                learn_worker(th, counter, mini_batch_size);
+                learn_worker(th, counter, params.mini_batch_size);
             });
 
-            total_done += mini_batch_size;
+            total_done += params.mini_batch_size;
 
             Threads.wait_for_workers_finished();
 
@@ -574,14 +604,14 @@ namespace Learner
                 break;
             }
 
-            if (eval_limit < abs(ps.score))
+            if (params.eval_limit < abs(ps.score))
                 goto RETRY_READ;
 
-            if (!use_draw_games_in_training && ps.game_result == 0)
+            if (!params.use_draw_games_in_training && ps.game_result == 0)
                 goto RETRY_READ;
 
             // Skip over the opening phase
-            if (ps.gamePly < prng.rand(reduction_gameply))
+            if (ps.gamePly < prng.rand(params.reduction_gameply))
                 goto RETRY_READ;
 
             StateInfo si;
@@ -647,10 +677,10 @@ namespace Learner
         // should be no real issues happening since
         // the read/write phases are isolated.
         atomic_thread_fence(memory_order_seq_cst);
-        Eval::NNUE::update_parameters(epoch, verbose);
+        Eval::NNUE::update_parameters(epoch, params.verbose);
         atomic_thread_fence(memory_order_seq_cst);
 
-        if (++save_count * mini_batch_size >= eval_save_interval)
+        if (++save_count * params.mini_batch_size >= params.eval_save_interval)
         {
             save_count = 0;
 
@@ -662,7 +692,7 @@ namespace Learner
             }
         }
 
-        if (++loss_output_count * mini_batch_size >= loss_output_interval)
+        if (++loss_output_count * params.mini_batch_size >= params.loss_output_interval)
         {
             loss_output_count = 0;
 
@@ -829,7 +859,7 @@ namespace Learner
         // Each time you save, change the extension part of the file name like "0","1","2",..
         // (Because I want to compare the winning rate for each evaluation function parameter later)
 
-        if (save_only_once)
+        if (params.save_only_once)
         {
             // When EVAL_SAVE_ONLY_ONCE is defined,
             // Do not dig a subfolder because I want to save it only once.
@@ -846,50 +876,49 @@ namespace Learner
             const std::string dir_name = std::to_string(dir_number++);
             Eval::NNUE::save_eval(dir_name);
 
-            if (newbob_decay != 1.0 && latest_loss_count > 0) {
-                static int trials = newbob_num_trials;
+            if (params.newbob_decay != 1.0 && latest_loss_count > 0) {
+                static int trials = params.newbob_num_trials;
                 const double latest_loss = latest_loss_sum / latest_loss_count;
                 latest_loss_sum = 0.0;
                 latest_loss_count = 0;
                 cout << "INFO (learning_rate):" << endl;
                 cout << "  - loss = " << latest_loss;
                 auto tot = total_done;
-                if (auto_lr_drop)
+                if (params.auto_lr_drop)
                 {
                     cout << " < best (" << best_loss << "), accepted" << endl;
                     best_loss = latest_loss;
-                    best_nn_directory = Path::combine((std::string)Options["EvalSaveDir"], dir_name);
-                    trials = newbob_num_trials;
+                    trials = params.newbob_num_trials;
 
-                    if (tot >= last_lr_drop + auto_lr_drop)
+                    if (tot >= last_lr_drop + params.auto_lr_drop)
                     {
                         last_lr_drop = tot;
-                        global_learning_rate *= newbob_decay;
+                        global_learning_rate *= params.newbob_decay;
                     }
                 }
                 else if (latest_loss < best_loss)
                 {
                     cout << " < best (" << best_loss << "), accepted" << endl;
                     best_loss = latest_loss;
-                    best_nn_directory = Path::combine((std::string)Options["EvalSaveDir"], dir_name);
-                    trials = newbob_num_trials;
+                    trials = params.newbob_num_trials;
                 }
                 else
                 {
                     cout << " >= best (" << best_loss << "), rejected" << endl;
-                    best_nn_directory = Path::combine((std::string)Options["EvalSaveDir"], dir_name);
 
                     if (--trials > 0 && !is_final)
                     {
                         cout
                             << "  - reducing learning rate from " << global_learning_rate
-                            << " to " << (global_learning_rate * newbob_decay)
+                            << " to " << (global_learning_rate * params.newbob_decay)
                             << " (" << trials << " more trials)" << endl;
 
-                        global_learning_rate *= newbob_decay;
+                        global_learning_rate *= params.newbob_decay;
                     }
                 }
 
+                params.best_nn_directory = Path::combine((std::string)Options["EvalSaveDir"], dir_name);
+
                 if (trials == 0)
                 {
                     cout << "  - converged" << endl;
@@ -924,12 +953,7 @@ namespace Learner
     // Learning from the generated game record
     void learn(Position&, istringstream& is)
     {
-        const auto thread_num = (int)Options["Threads"];
-
-        vector<string> filenames;
-
-        // mini_batch_size 1M aspect by default. This can be increased.
-        auto mini_batch_size = LEARN_MINI_BATCH_SIZE;
+        LearnerThink::Params params;
 
         // Number of epochs
         uint64_t epochs = std::numeric_limits<uint64_t>::max();
@@ -938,21 +962,6 @@ namespace Learner
         string base_dir;
         string target_dir;
 
-        // If the absolute value of the evaluation value
-        // in the deep search of the teacher phase exceeds this value,
-        // that phase is discarded.
-        int eval_limit = 32000;
-
-        // Flag to save the evaluation function file only once near the end.
-        bool save_only_once = false;
-
-        // Shuffle about what you are pre-reading on the teacher aspect.
-        // (Shuffle of about 10 million phases)
-        // Turn on if you want to pass a pre-shuffled file.
-        bool no_shuffle = false;
-
-        bool verbose = false;
-
         global_learning_rate = 1.0;
 
         // elmo lambda
@@ -960,26 +969,9 @@ namespace Learner
         ELMO_LAMBDA2 = 1.0;
         ELMO_LAMBDA_LIMIT = 32000;
 
-        // if (gamePly <rand(reduction_gameply)) continue;
-        // An option to exclude the early stage from the learning target moderately like
-        // If set to 1, rand(1)==0, so nothing is excluded.
-        int reduction_gameply = 1;
-
         uint64_t nn_batch_size = 1000;
-        double newbob_decay = 0.5;
-        int newbob_num_trials = 4;
-        uint64_t auto_lr_drop = 0;
         string nn_options;
 
-        uint64_t eval_save_interval = LEARN_EVAL_SAVE_INTERVAL;
-        uint64_t loss_output_interval = 1'000'000;
-
-        size_t sfen_read_size = SfenReader::DEFAULT_SFEN_READ_SIZE;
-        size_t thread_buffer_size = SfenReader::DEFAULT_THREAD_BUFFER_SIZE;
-
-        string validation_set_file_name;
-        string seed;
-
         auto out = sync_region_cout.new_region();
 
         // Assume the filenames are staggered.
@@ -994,8 +986,8 @@ namespace Learner
             // specify the number of phases of mini-batch
             if (option == "bat")
             {
-                is >> mini_batch_size;
-                mini_batch_size *= 10000; // Unit is ten thousand
+                is >> params.mini_batch_size;
+                params.mini_batch_size *= 10000; // Unit is ten thousand
             }
 
             // Specify the folder in which the game record is stored and make it the rooting target.
@@ -1004,72 +996,73 @@ namespace Learner
             {
                 std::string filename;
                 is >> filename;
-                filenames.push_back(filename);
+                params.filenames.push_back(filename);
             }
 
             // Specify the number of loops
-            else if (option == "epochs")      is >> epochs;
+            else if (option == "epochs") is >> epochs;
 
             // Game file storage folder (get game file with relative path from here)
-            else if (option == "basedir")   is >> base_dir;
+            else if (option == "basedir") is >> base_dir;
 
             // Mini batch size
-            else if (option == "batchsize") is >> mini_batch_size;
+            else if (option == "batchsize") is >> params.mini_batch_size;
 
             // learning rate
-            else if (option == "lr")        is >> global_learning_rate;
+            else if (option == "lr") is >> global_learning_rate;
 
             // Accept also the old option name.
             else if (option == "use_draw_in_training"
                   || option == "use_draw_games_in_training")
-                is >> use_draw_games_in_training;
+                is >> params.use_draw_games_in_training;
 
             // Accept also the old option name.
             else if (option == "use_draw_in_validation"
                   || option == "use_draw_games_in_validation")
-                is >> use_draw_games_in_validation;
+                is >> params.use_draw_games_in_validation;
 
             // Accept also the old option name.
             else if (option == "use_hash_in_training"
                   || option == "skip_duplicated_positions_in_training")
-                is >> skip_duplicated_positions_in_training;
+                is >> params.skip_duplicated_positions_in_training;
 
-            else if (option == "winning_probability_coefficient") is >> winning_probability_coefficient;
+            else if (option == "winning_probability_coefficient")
+                is >> winning_probability_coefficient;
 
             // Using WDL with win rate model instead of sigmoid
             else if (option == "use_wdl") is >> use_wdl;
 
 
             // LAMBDA
-            else if (option == "lambda")       is >> ELMO_LAMBDA;
-            else if (option == "lambda2")      is >> ELMO_LAMBDA2;
+            else if (option == "lambda") is >> ELMO_LAMBDA;
+            else if (option == "lambda2") is >> ELMO_LAMBDA2;
             else if (option == "lambda_limit") is >> ELMO_LAMBDA_LIMIT;
 
-            else if (option == "reduction_gameply") is >> reduction_gameply;
+            else if (option == "reduction_gameply") is >> params.reduction_gameply;
 
-            else if (option == "eval_limit") is >> eval_limit;
-            else if (option == "save_only_once") save_only_once = true;
-            else if (option == "no_shuffle") no_shuffle = true;
+            else if (option == "eval_limit") is >> params.eval_limit;
+            else if (option == "save_only_once") params.save_only_once = true;
+            else if (option == "no_shuffle") params.shuffle = false;
 
             else if (option == "nn_batch_size") is >> nn_batch_size;
-            else if (option == "newbob_decay") is >> newbob_decay;
-            else if (option == "newbob_num_trials") is >> newbob_num_trials;
+            else if (option == "newbob_decay") is >> params.newbob_decay;
+            else if (option == "newbob_num_trials") is >> params.newbob_num_trials;
             else if (option == "nn_options") is >> nn_options;
-            else if (option == "auto_lr_drop") is >> auto_lr_drop;
+            else if (option == "auto_lr_drop") is >> params.auto_lr_drop;
 
-            else if (option == "eval_save_interval") is >> eval_save_interval;
-            else if (option == "loss_output_interval") is >> loss_output_interval;
-            else if (option == "validation_set_file_name") is >> validation_set_file_name;
+            else if (option == "eval_save_interval") is >> params.eval_save_interval;
+            else if (option == "loss_output_interval") is >> params.loss_output_interval;
+            else if (option == "validation_set_file_name") is >> params.validation_set_file_name;
 
             else if (option == "src_score_min_value") is >> src_score_min_value;
             else if (option == "src_score_max_value") is >> src_score_max_value;
             else if (option == "dest_score_min_value") is >> dest_score_min_value;
             else if (option == "dest_score_max_value") is >> dest_score_max_value;
 
-            else if (option == "sfen_read_size") is >> sfen_read_size;
-            else if (option == "thread_buffer_size") is >> thread_buffer_size;
+            else if (option == "sfen_read_size") is >> params.sfen_read_size;
+            else if (option == "thread_buffer_size") is >> params.thread_buffer_size;
 
-            else if (option == "seed") is >> seed;
+            else if (option == "seed") is >> params.seed;
             else if (option == "set_recommended_uci_options")
             {
                 UCI::setoption("Use NNUE", "pure");
@@ -1082,21 +1075,13 @@ namespace Learner
                 UCI::setoption("PruneAtShallowDepth", "false");
                 UCI::setoption("EnableTranspositionTable", "false");
             }
-            else if (option == "verbose") verbose = true;
+            else if (option == "verbose") params.verbose = true;
             else
             {
                 out << "INFO: Unknown option: " << option << ". Ignoring.\n";
             }
         }
 
-        if (loss_output_interval == 0)
-        {
-            loss_output_interval = LEARN_RMSE_OUTPUT_INTERVAL * mini_batch_size;
-        }
-
-        // If reduction_gameply is set to 0, rand(0) will be divided by 0, so correct it to 1.
-        reduction_gameply = max(reduction_gameply, 1);
-
         out << "INFO: Executing learn command\n";
 
         // Issue a warning if OpenMP is disabled.
@@ -1104,40 +1089,42 @@ namespace Learner
         out << "WARNING: OpenMP disabled." << endl;
 #endif
 
+        params.enforce_constraints();
+
         // Right now we only have the individual files.
         // We need to apply base_dir here
         if (!target_dir.empty())
         {
-            append_files_from_dir(filenames, base_dir, target_dir);
+            append_files_from_dir(params.filenames, base_dir, target_dir);
         }
-        rebase_files(filenames, base_dir);
+        rebase_files(params.filenames, base_dir);
 
         out << "INFO: Input files:\n";
-        for (auto s : filenames)
+        for (auto s : params.filenames)
             out << "  - " << s << '\n';
 
         out << "INFO: Parameters:\n";
-        if (!validation_set_file_name.empty())
+        if (!params.validation_set_file_name.empty())
         {
-            out << "  - validation set           : " << validation_set_file_name << endl;
+            out << "  - validation set           : " << params.validation_set_file_name << endl;
         }
 
         out << "  - epochs                   : " << epochs << endl;
-        out << "  - epochs * minibatch size  : " << epochs * mini_batch_size << endl;
-        out << "  - eval_limit               : " << eval_limit << endl;
-        out << "  - save_only_once           : " << (save_only_once ? "true" : "false") << endl;
-        out << "  - shuffle on read          : " << (no_shuffle ? "false" : "true") << endl;
+        out << "  - epochs * minibatch size  : " << epochs * params.mini_batch_size << endl;
+        out << "  - eval_limit               : " << params.eval_limit << endl;
+        out << "  - save_only_once           : " << (params.save_only_once ? "true" : "false") << endl;
+        out << "  - shuffle on read          : " << (params.shuffle ? "true" : "false") << endl;
 
         out << "  - Loss Function            : " << LOSS_FUNCTION << endl;
-        out << "  - minibatch size           : " << mini_batch_size << endl;
+        out << "  - minibatch size           : " << params.mini_batch_size << endl;
 
         out << "  - nn_batch_size            : " << nn_batch_size << endl;
         out << "  - nn_options               : " << nn_options << endl;
 
         out << "  - learning rate            : " << global_learning_rate << endl;
-        out << "  - use draws in training    : " << use_draw_games_in_training << endl;
-        out << "  - use draws in validation  : " << use_draw_games_in_validation << endl;
-        out << "  - skip repeated positions  : " << skip_duplicated_positions_in_training << endl;
+        out << "  - use draws in training    : " << params.use_draw_games_in_training << endl;
+        out << "  - use draws in validation  : " << params.use_draw_games_in_validation << endl;
+        out << "  - skip repeated positions  : " << params.skip_duplicated_positions_in_training << endl;
 
         out << "  - winning prob coeff       : " << winning_probability_coefficient << endl;
         out << "  - use_wdl                  : " << use_wdl << endl;
@@ -1147,27 +1134,27 @@ namespace Learner
         out << "  - dest_score_min_value     : " << dest_score_min_value << endl;
         out << "  - dest_score_max_value     : " << dest_score_max_value << endl;
 
-        out << "  - reduction_gameply        : " << reduction_gameply << endl;
+        out << "  - reduction_gameply        : " << params.reduction_gameply << endl;
 
         out << "  - LAMBDA                   : " << ELMO_LAMBDA << endl;
         out << "  - LAMBDA2                  : " << ELMO_LAMBDA2 << endl;
         out << "  - LAMBDA_LIMIT             : " << ELMO_LAMBDA_LIMIT << endl;
-        out << "  - eval_save_interval       : " << eval_save_interval << " sfens" << endl;
-        out << "  - loss_output_interval     : " << loss_output_interval << " sfens" << endl;
+        out << "  - eval_save_interval       : " << params.eval_save_interval << " sfens" << endl;
+        out << "  - loss_output_interval     : " << params.loss_output_interval << " sfens" << endl;
 
-        out << "  - sfen_read_size           : " << sfen_read_size << endl;
-        out << "  - thread_buffer_size       : " << thread_buffer_size << endl;
+        out << "  - sfen_read_size           : " << params.sfen_read_size << endl;
+        out << "  - thread_buffer_size       : " << params.thread_buffer_size << endl;
 
-        out << "  - seed                     : " << seed << endl;
-        out << "  - verbose                  : " << (verbose ? "true" : "false") << endl;
+        out << "  - seed                     : " << params.seed << endl;
+        out << "  - verbose                  : " << (params.verbose ? "true" : "false") << endl;
 
-        if (auto_lr_drop) {
-            out << "  - learning rate scheduling : every " << auto_lr_drop << " sfens" << endl;
+        if (params.auto_lr_drop) {
+            out << "  - learning rate scheduling : every " << params.auto_lr_drop << " sfens" << endl;
         }
-        else if (newbob_decay != 1.0) {
+        else if (params.newbob_decay != 1.0) {
             out << "  - learning rate scheduling : newbob with decay" << endl;
-            out << "  - newbob_decay             : " << newbob_decay << endl;
-            out << "  - newbob_num_trials        : " << newbob_num_trials << endl;
+            out << "  - newbob_decay             : " << params.newbob_decay << endl;
+            out << "  - newbob_num_trials        : " << params.newbob_num_trials << endl;
         }
         else {
             out << "  - learning rate scheduling : fixed learning rate" << endl;
@@ -1175,54 +1162,17 @@ namespace Learner
 
         out << endl;
 
-        // -----------------------------------
-        // various initialization
-        // -----------------------------------
-
         out << "INFO: Started initialization." << endl;
 
         Threads.main()->ponder = false;
 
         set_learning_search_limits();
 
-        Eval::NNUE::initialize_training(seed, out);
+        Eval::NNUE::initialize_training(params.seed, out);
         Eval::NNUE::set_batch_size(nn_batch_size);
         Eval::NNUE::set_options(nn_options);
 
-        LearnerThink learn_think(
-            filenames,
-            !no_shuffle,
-            thread_num,
-            seed,
-            sfen_read_size,
-            thread_buffer_size);
-
-        if (newbob_decay != 1.0 && !Options["SkipLoadingEval"]) {
-            // Save the current net to [EvalSaveDir]\original.
-            Eval::NNUE::save_eval("original");
-
-            // Set the folder above to best_nn_directory so that the trainer can
-            // resotre the network parameters from the original net file.
-            learn_think.best_nn_directory =
-                Path::combine(Options["EvalSaveDir"], "original");
-        }
-
-        // Reflect other option settings.
-        learn_think.eval_limit = eval_limit;
-        learn_think.save_only_once = save_only_once;
-        learn_think.reduction_gameply = reduction_gameply;
-
-        learn_think.newbob_decay = newbob_decay;
-        learn_think.newbob_num_trials = newbob_num_trials;
-        learn_think.auto_lr_drop = auto_lr_drop;
-
-        learn_think.eval_save_interval = eval_save_interval;
-        learn_think.loss_output_interval = loss_output_interval;
-
-        learn_think.mini_batch_size = mini_batch_size;
-        learn_think.validation_set_file_name = validation_set_file_name;
-
-        learn_think.verbose = verbose;
+        LearnerThink learn_think(params);
 
         out << "Finished initialization." << endl;
 

From a8066cd4a959ac6c5b1fb18ffd8b16c75572f6ab Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 23:22:47 +0200
Subject: [PATCH 401/583] Rename elmo lambdas

---
 src/learn/learn.cpp | 43 +++++++++++++++++++------------------------
 1 file changed, 19 insertions(+), 24 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index e9eb1141..c03e425c 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -87,6 +87,14 @@ namespace Learner
     static double dest_score_min_value = 0.0;
     static double dest_score_max_value = 1.0;
 
+    // A constant used in elmo (WCSC27). Adjustment required.
+    // Since elmo does not internally divide the expression, the value is different.
+    // You can set this value with the learn command.
+    // 0.33 is equivalent to the constant (0.5) used in elmo (WCSC27)
+    static double elmo_lambda_low = 1.0;
+    static double elmo_lambda_high = 1.0;
+    static double elmo_lambda_limit = 32000;
+
     // Using stockfish's WDL with win rate model instead of sigmoid
     static bool use_wdl = false;
 
@@ -239,14 +247,6 @@ namespace Learner
         return ((y2 - y1) / epsilon) / winning_probability_coefficient;
     }
 
-    // A constant used in elmo (WCSC27). Adjustment required.
-    // Since elmo does not internally divide the expression, the value is different.
-    // You can set this value with the learn command.
-    // 0.33 is equivalent to the constant (0.5) used in elmo (WCSC27)
-    double ELMO_LAMBDA = 0.33;
-    double ELMO_LAMBDA2 = 0.33;
-    double ELMO_LAMBDA_LIMIT = 32000;
-
     // Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
     double get_scaled_signal(double signal)
     {
@@ -274,12 +274,12 @@ namespace Learner
 
     double calculate_lambda(double teacher_signal)
     {
-        // If the evaluation value in deep search exceeds ELMO_LAMBDA_LIMIT
-        // then apply ELMO_LAMBDA2 instead of ELMO_LAMBDA.
+        // If the evaluation value in deep search exceeds elmo_lambda_limit
+        // then apply elmo_lambda_high instead of elmo_lambda_low.
         const double lambda =
-            (std::abs(teacher_signal) >= ELMO_LAMBDA_LIMIT)
-            ? ELMO_LAMBDA2
-            : ELMO_LAMBDA;
+            (std::abs(teacher_signal) >= elmo_lambda_limit)
+            ? elmo_lambda_high
+            : elmo_lambda_low;
 
         return lambda;
     }
@@ -964,11 +964,6 @@ namespace Learner
 
         global_learning_rate = 1.0;
 
-        // elmo lambda
-        ELMO_LAMBDA = 1.0;
-        ELMO_LAMBDA2 = 1.0;
-        ELMO_LAMBDA_LIMIT = 32000;
-
         uint64_t nn_batch_size = 1000;
         string nn_options;
 
@@ -1034,9 +1029,9 @@ namespace Learner
 
 
             // LAMBDA
-            else if (option == "lambda") is >> ELMO_LAMBDA;
-            else if (option == "lambda2") is >> ELMO_LAMBDA2;
-            else if (option == "lambda_limit") is >> ELMO_LAMBDA_LIMIT;
+            else if (option == "lambda") is >> elmo_lambda_low;
+            else if (option == "lambda2") is >> elmo_lambda_high;
+            else if (option == "lambda_limit") is >> elmo_lambda_limit;
 
             else if (option == "reduction_gameply") is >> params.reduction_gameply;
 
@@ -1136,9 +1131,9 @@ namespace Learner
 
         out << "  - reduction_gameply        : " << params.reduction_gameply << endl;
 
-        out << "  - LAMBDA                   : " << ELMO_LAMBDA << endl;
-        out << "  - LAMBDA2                  : " << ELMO_LAMBDA2 << endl;
-        out << "  - LAMBDA_LIMIT             : " << ELMO_LAMBDA_LIMIT << endl;
+        out << "  - elmo_lambda_low          : " << elmo_lambda_low << endl;
+        out << "  - elmo_lambda_high         : " << elmo_lambda_high << endl;
+        out << "  - elmo_lambda_limit        : " << elmo_lambda_limit << endl;
         out << "  - eval_save_interval       : " << params.eval_save_interval << " sfens" << endl;
         out << "  - loss_output_interval     : " << params.loss_output_interval << " sfens" << endl;
 

From c229929d266e1e8f4354742223e7b1121b0b8dc2 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 22:58:09 +0200
Subject: [PATCH 402/583] Remove the position parameter from learn.

---
 src/learn/learn.cpp | 2 +-
 src/learn/learn.h   | 2 +-
 src/uci.cpp         | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index c03e425c..90e6cb0f 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -951,7 +951,7 @@ namespace Learner
     }
 
     // Learning from the generated game record
-    void learn(Position&, istringstream& is)
+    void learn(istringstream& is)
     {
         LearnerThink::Params params;
 
diff --git a/src/learn/learn.h b/src/learn/learn.h
index 5efeb516..008ca7af 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -67,7 +67,7 @@ namespace Learner
     double calc_grad(Value shallow, const PackedSfenValue& psv);
 
     // Learning from the generated game record
-    void learn(Position& pos, std::istringstream& is);
+    void learn(std::istringstream& is);
 }
 
 #endif // ifndef _LEARN_H_
diff --git a/src/uci.cpp b/src/uci.cpp
index dbef05bf..e6b45c02 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -339,7 +339,7 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "compiler") sync_cout << compiler_info() << sync_endl;
 
       else if (token == "gensfen") Learner::gensfen(is);
-      else if (token == "learn") Learner::learn(pos, is);
+      else if (token == "learn") Learner::learn(is);
       else if (token == "convert") Learner::convert(is);
       else if (token == "convert_bin") Learner::convert_bin(is);
       else if (token == "convert_plain") Learner::convert_plain(is);

From e4868cb59e83baf3a3950ae043bd5d04c75acc2f Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 23:17:56 +0200
Subject: [PATCH 403/583] Move setting learn search limits to learner.

---
 src/learn/learn.cpp | 53 +++++++++++++++++++++++----------------------
 1 file changed, 27 insertions(+), 26 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 90e6cb0f..5bb41213 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -467,6 +467,8 @@ namespace Learner
         void learn(uint64_t epochs);
 
     private:
+        static void set_learning_search_limits();
+
         void learn_worker(Thread& th, std::atomic<uint64_t>& counter, uint64_t limit);
 
         void update_weights(const PSVector& psv, uint64_t epoch);
@@ -510,13 +512,37 @@ namespace Learner
         AtomicLoss learn_loss_sum;
     };
 
+    void LearnerThink::set_learning_search_limits()
+    {
+        Threads.main()->ponder = false;
+
+        // About Search::Limits
+        // Be careful because this member variable is global and affects other threads.
+        auto& limits = Search::Limits;
+
+        limits.startTime = now();
+
+        // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done)
+        limits.infinite = true;
+
+        // Since PV is an obstacle when displayed, erase it.
+        limits.silent = true;
+
+        // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
+        limits.nodes = 0;
+
+        // depth is also processed by the one passed as an argument of Learner::search().
+        limits.depth = 0;
+    }
+
     void LearnerThink::learn(uint64_t epochs)
     {
-
 #if defined(_OPENMP)
         omp_set_num_threads((int)Options["Threads"]);
 #endif
 
+        set_learning_search_limits();
+
         Eval::NNUE::verify_any_net_loaded();
 
         const PSVector sfen_for_mse =
@@ -929,27 +955,6 @@ namespace Learner
         return false;
     }
 
-    static void set_learning_search_limits()
-    {
-        // About Search::Limits
-        // Be careful because this member variable is global and affects other threads.
-        auto& limits = Search::Limits;
-
-        limits.startTime = now();
-
-        // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done)
-        limits.infinite = true;
-
-        // Since PV is an obstacle when displayed, erase it.
-        limits.silent = true;
-
-        // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
-        limits.nodes = 0;
-
-        // depth is also processed by the one passed as an argument of Learner::search().
-        limits.depth = 0;
-    }
-
     // Learning from the generated game record
     void learn(istringstream& is)
     {
@@ -1159,10 +1164,6 @@ namespace Learner
 
         out << "INFO: Started initialization." << endl;
 
-        Threads.main()->ponder = false;
-
-        set_learning_search_limits();
-
         Eval::NNUE::initialize_training(params.seed, out);
         Eval::NNUE::set_batch_size(nn_batch_size);
         Eval::NNUE::set_options(nn_options);

From cde6ec2bf26d46dedf4547580f6e45e34d8b1ab4 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 23:29:32 +0200
Subject: [PATCH 404/583] Make all grad related functions in learn static. Pass
 calc_grad as a parameter.

---
 src/learn/learn.cpp                | 40 +++++++++++++-----------------
 src/learn/learn.h                  |  4 +--
 src/nnue/evaluate_nnue_learner.cpp |  7 +++---
 src/nnue/evaluate_nnue_learner.h   |  2 +-
 4 files changed, 23 insertions(+), 30 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 5bb41213..b0f77e89 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -185,7 +185,7 @@ namespace Learner
     }
 
     // A function that converts the evaluation value to the winning rate [0,1]
-    double winning_percentage(double value)
+    static double winning_percentage(double value)
     {
         // 1/(1+10^(-Eval/4))
         // = 1/(1+e^(-Eval/4*ln(10))
@@ -194,7 +194,7 @@ namespace Learner
     }
 
     // A function that converts the evaluation value to the winning rate [0,1]
-    double winning_percentage_wdl(double value, int ply)
+    static double winning_percentage_wdl(double value, int ply)
     {
         constexpr double wdl_total = 1000.0;
         constexpr double draw_score = 0.5;
@@ -207,7 +207,7 @@ namespace Learner
     }
 
     // A function that converts the evaluation value to the winning rate [0,1]
-    double winning_percentage(double value, int ply)
+    static double winning_percentage(double value, int ply)
     {
         if (use_wdl)
         {
@@ -219,7 +219,7 @@ namespace Learner
         }
     }
 
-    double calc_cross_entropy_of_winning_percentage(
+    static double calc_cross_entropy_of_winning_percentage(
         double deep_win_rate,
         double shallow_eval,
         int ply)
@@ -229,7 +229,7 @@ namespace Learner
         return -p * std::log(q) - (1.0 - p) * std::log(1.0 - q);
     }
 
-    double calc_d_cross_entropy_of_winning_percentage(
+    static double calc_d_cross_entropy_of_winning_percentage(
         double deep_win_rate,
         double shallow_eval,
         int ply)
@@ -248,7 +248,7 @@ namespace Learner
     }
 
     // Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
-    double get_scaled_signal(double signal)
+    static double get_scaled_signal(double signal)
     {
         double scaled_signal = signal;
 
@@ -266,13 +266,13 @@ namespace Learner
     }
 
     // Teacher winning probability.
-    double calculate_p(double teacher_signal, int ply)
+    static double calculate_p(double teacher_signal, int ply)
     {
         const double scaled_teacher_signal = get_scaled_signal(teacher_signal);
         return winning_percentage(scaled_teacher_signal, ply);
     }
 
-    double calculate_lambda(double teacher_signal)
+    static double calculate_lambda(double teacher_signal)
     {
         // If the evaluation value in deep search exceeds elmo_lambda_limit
         // then apply elmo_lambda_high instead of elmo_lambda_low.
@@ -284,7 +284,7 @@ namespace Learner
         return lambda;
     }
 
-    double calculate_t(int game_result)
+    static double calculate_t(int game_result)
     {
         // Use 1 as the correction term if the expected win rate is 1,
         // 0 if you lose, and 0.5 if you draw.
@@ -294,20 +294,20 @@ namespace Learner
         return t;
     }
 
-    double calc_grad(Value teacher_signal, Value shallow, const PackedSfenValue& psv)
+    static double calc_grad(Value shallow, Value teacher_signal, int result, int ply)
     {
         // elmo (WCSC27) method
         // Correct with the actual game wins and losses.
-        const double q = winning_percentage(shallow, psv.gamePly);
-        const double p = calculate_p(teacher_signal, psv.gamePly);
-        const double t = calculate_t(psv.game_result);
+        const double q = winning_percentage(shallow, ply);
+        const double p = calculate_p(teacher_signal, ply);
+        const double t = calculate_t(result);
         const double lambda = calculate_lambda(teacher_signal);
 
         double grad;
         if (use_wdl)
         {
-            const double dce_p = calc_d_cross_entropy_of_winning_percentage(p, shallow, psv.gamePly);
-            const double dce_t = calc_d_cross_entropy_of_winning_percentage(t, shallow, psv.gamePly);
+            const double dce_p = calc_d_cross_entropy_of_winning_percentage(p, shallow, ply);
+            const double dce_t = calc_d_cross_entropy_of_winning_percentage(t, shallow, ply);
             grad = lambda * dce_p + (1.0 - lambda) * dce_t;
         }
         else
@@ -324,7 +324,7 @@ namespace Learner
     // The individual cross entropy of the win/loss term and win
     // rate term of the elmo expression is returned
     // to the arguments cross_entropy_eval and cross_entropy_win.
-    Loss calc_cross_entropy(
+    static Loss calc_cross_entropy(
         Value teacher_signal,
         Value shallow,
         const PackedSfenValue& psv)
@@ -360,12 +360,6 @@ namespace Learner
         return loss;
     }
 
-    // Other objective functions may be considered in the future...
-    double calc_grad(Value shallow, const PackedSfenValue& psv)
-    {
-        return calc_grad((Value)psv.score, shallow, psv);
-    }
-
     // Class to generate sfen with multiple threads
     struct LearnerThink
     {
@@ -703,7 +697,7 @@ namespace Learner
         // should be no real issues happening since
         // the read/write phases are isolated.
         atomic_thread_fence(memory_order_seq_cst);
-        Eval::NNUE::update_parameters(epoch, params.verbose);
+        Eval::NNUE::update_parameters(epoch, params.verbose, calc_grad);
         atomic_thread_fence(memory_order_seq_cst);
 
         if (++save_count * params.mini_batch_size >= params.eval_save_interval)
diff --git a/src/learn/learn.h b/src/learn/learn.h
index 008ca7af..6ce476e5 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -64,10 +64,10 @@ namespace Learner
     // rmse calculation is done in one thread, so it takes some time, so reducing the output is effective.
     constexpr std::size_t LEARN_RMSE_OUTPUT_INTERVAL = 1;
 
-    double calc_grad(Value shallow, const PackedSfenValue& psv);
-
     // Learning from the generated game record
     void learn(std::istringstream& is);
+
+    using CalcGradFunc = double(Value, Value, int, int);
 }
 
 #endif // ifndef _LEARN_H_
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 6775707d..3e91a7de 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -18,8 +18,6 @@
 #include "misc.h"
 #include "thread_win32_osx.h"
 
-#include "learn/learn.h"
-
 // Learning rate scale
 double global_learning_rate;
 
@@ -183,7 +181,7 @@ namespace Eval::NNUE {
     }
 
     // update the evaluation function parameters
-    void update_parameters(uint64_t epoch, bool verbose) {
+    void update_parameters(uint64_t epoch, bool verbose, Learner::CalcGradFunc calc_grad) {
         assert(batch_size > 0);
 
         const auto learning_rate = static_cast<LearnFloatType>(
@@ -210,7 +208,8 @@ namespace Eval::NNUE {
                     batch[b].sign * network_output[b] * kPonanzaConstant));
                 const auto discrete = batch[b].sign * batch[b].discrete_nn_eval;
                 const auto& psv = batch[b].psv;
-                const double gradient = batch[b].sign * Learner::calc_grad(shallow, psv);
+                const double gradient =
+                    batch[b].sign * calc_grad(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
                 gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
 
 
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index 91d2aa99..8a9786e5 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -31,7 +31,7 @@ namespace Eval::NNUE {
         double weight);
 
     // update the evaluation function parameters
-    void update_parameters(uint64_t epoch, bool verbose);
+    void update_parameters(uint64_t epoch, bool verbose, Learner::CalcGradFunc calc_grad);
 
     // Check if there are any problems with learning
     void check_health();

From f81fa3d7127a21d58853192fd59fad5a12589ec1 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 23:35:34 +0200
Subject: [PATCH 405/583] Replace global_learning_rate with learning_rate local
 to the learner and passed to update_parameters as a parameter.

---
 src/learn/learn.cpp                | 22 ++++++++++------------
 src/nnue/evaluate_nnue_learner.cpp | 13 +++++++------
 src/nnue/evaluate_nnue_learner.h   |  6 +++++-
 3 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index b0f77e89..6cd54b13 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -56,8 +56,6 @@
 #include <omp.h>
 #endif
 
-extern double global_learning_rate;
-
 using namespace std;
 
 template <typename T>
@@ -399,6 +397,8 @@ namespace Learner
             bool use_draw_games_in_validation = true;
             bool skip_duplicated_positions_in_training = true;
 
+            double learning_rate = 1.0;
+
             string validation_set_file_name;
             string seed;
 
@@ -697,7 +697,7 @@ namespace Learner
         // should be no real issues happening since
         // the read/write phases are isolated.
         atomic_thread_fence(memory_order_seq_cst);
-        Eval::NNUE::update_parameters(epoch, params.verbose, calc_grad);
+        Eval::NNUE::update_parameters(epoch, params.verbose, params.learning_rate, calc_grad);
         atomic_thread_fence(memory_order_seq_cst);
 
         if (++save_count * params.mini_batch_size >= params.eval_save_interval)
@@ -737,7 +737,7 @@ namespace Learner
              << ", epoch " << epoch
              << endl;
 
-        out << "  - learning rate = " << global_learning_rate << endl;
+        out << "  - learning rate = " << params.learning_rate << endl;
 
         // For calculation of verification data loss
         AtomicLoss test_loss_sum{};
@@ -913,7 +913,7 @@ namespace Learner
                     if (tot >= last_lr_drop + params.auto_lr_drop)
                     {
                         last_lr_drop = tot;
-                        global_learning_rate *= params.newbob_decay;
+                        params.learning_rate *= params.newbob_decay;
                     }
                 }
                 else if (latest_loss < best_loss)
@@ -929,11 +929,11 @@ namespace Learner
                     if (--trials > 0 && !is_final)
                     {
                         cout
-                            << "  - reducing learning rate from " << global_learning_rate
-                            << " to " << (global_learning_rate * params.newbob_decay)
+                            << "  - reducing learning rate from " << params.learning_rate
+                            << " to " << (params.learning_rate * params.newbob_decay)
                             << " (" << trials << " more trials)" << endl;
 
-                        global_learning_rate *= params.newbob_decay;
+                        params.learning_rate *= params.newbob_decay;
                     }
                 }
 
@@ -961,8 +961,6 @@ namespace Learner
         string base_dir;
         string target_dir;
 
-        global_learning_rate = 1.0;
-
         uint64_t nn_batch_size = 1000;
         string nn_options;
 
@@ -1003,7 +1001,7 @@ namespace Learner
             else if (option == "batchsize") is >> params.mini_batch_size;
 
             // learning rate
-            else if (option == "lr") is >> global_learning_rate;
+            else if (option == "lr") is >> params.learning_rate;
 
             // Accept also the old option name.
             else if (option == "use_draw_in_training"
@@ -1115,7 +1113,7 @@ namespace Learner
         out << "  - nn_batch_size            : " << nn_batch_size << endl;
         out << "  - nn_options               : " << nn_options << endl;
 
-        out << "  - learning rate            : " << global_learning_rate << endl;
+        out << "  - learning rate            : " << params.learning_rate << endl;
         out << "  - use draws in training    : " << params.use_draw_games_in_training << endl;
         out << "  - use draws in validation  : " << params.use_draw_games_in_validation << endl;
         out << "  - skip repeated positions  : " << params.skip_duplicated_positions_in_training << endl;
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 3e91a7de..2a1fd6cb 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -18,9 +18,6 @@
 #include "misc.h"
 #include "thread_win32_osx.h"
 
-// Learning rate scale
-double global_learning_rate;
-
 // Code for learning NNUE evaluation function
 namespace Eval::NNUE {
 
@@ -181,11 +178,15 @@ namespace Eval::NNUE {
     }
 
     // update the evaluation function parameters
-    void update_parameters(uint64_t epoch, bool verbose, Learner::CalcGradFunc calc_grad) {
+    void update_parameters(
+        uint64_t epoch,
+        bool verbose,
+        double learning_rate,
+        Learner::CalcGradFunc calc_grad)
+    {
         assert(batch_size > 0);
 
-        const auto learning_rate = static_cast<LearnFloatType>(
-            global_learning_rate / batch_size);
+        learning_rate /= batch_size;
 
         std::lock_guard<std::mutex> lock(examples_mutex);
         std::shuffle(examples.begin(), examples.end(), rng);
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index 8a9786e5..d350691b 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -31,7 +31,11 @@ namespace Eval::NNUE {
         double weight);
 
     // update the evaluation function parameters
-    void update_parameters(uint64_t epoch, bool verbose, Learner::CalcGradFunc calc_grad);
+    void update_parameters(
+        uint64_t epoch,
+        bool verbose,
+        double learning_rate,
+        Learner::CalcGradFunc calc_grad);
 
     // Check if there are any problems with learning
     void check_health();

From 680654b254dc2c6357199825c9399998d6bfd777 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Tue, 27 Oct 2020 20:58:58 +0100
Subject: [PATCH 406/583] Add dots to output every epoch for progress
 visualization.

---
 src/nnue/evaluate_nnue_learner.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 2a1fd6cb..a97b45c7 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -244,6 +244,10 @@ namespace Eval::NNUE {
                 << " , batch_size = " << batch_size
                 << " , grad_norm = " << gradient_norm
                 << std::endl;
+        } else {
+            // Display some progress but don't synchronize as
+            // we can't really decide when to release the output lock here
+            std::cout << '.';
         }
 
         send_messages({{"quantize_parameters"}});

From 317fda251602ceb5af90b9134539f28210392184 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 24 Oct 2020 23:56:29 +0200
Subject: [PATCH 407/583] Cleanup eval saving and lr scheduling.

---
 src/learn/learn.cpp                | 136 ++++++++++++++++++-----------
 src/nnue/evaluate_nnue_learner.cpp |   2 +-
 2 files changed, 87 insertions(+), 51 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 6cd54b13..93262b42 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -456,6 +456,8 @@ namespace Learner
             latest_loss_sum = 0.0;
             latest_loss_count = 0;
             total_done = 0;
+            trials = params.newbob_num_trials;
+            dir_number = 0;
         }
 
         void learn(uint64_t epochs);
@@ -480,6 +482,8 @@ namespace Learner
 
         Value get_shallow_value(Position& pos);
 
+        bool check_progress();
+
         // save merit function parameters to a file
         bool save(bool is_final = false);
 
@@ -502,6 +506,9 @@ namespace Learner
         double latest_loss_sum;
         uint64_t latest_loss_count;
 
+        int trials;
+        int dir_number;
+
         // For calculation of learning data loss
         AtomicLoss learn_loss_sum;
     };
@@ -873,12 +880,84 @@ namespace Learner
         return shallow_value;
     }
 
+    bool LearnerThink::check_progress()
+    {
+        auto out = sync_region_cout.new_region();
+
+        const double latest_loss = latest_loss_sum / latest_loss_count;
+        bool converged = false;
+        latest_loss_sum = 0.0;
+        latest_loss_count = 0;
+
+        auto drop_lr = [&]() {
+            last_lr_drop = total_done;
+
+            out
+                << "  - reducing learning rate from " << params.learning_rate
+                << " to " << (params.learning_rate * params.newbob_decay)
+                << " (" << trials << " more trials)" << endl;
+
+            params.learning_rate *= params.newbob_decay;
+        };
+
+        auto accept = [&]() {
+            out << "  - loss = " << latest_loss << " < best (" << best_loss << "), accepted" << endl;
+
+            best_loss = latest_loss;
+            trials = params.newbob_num_trials;
+        };
+
+        auto reject = [&]() {
+            out << "  - loss = " << latest_loss << " >= best (" << best_loss << "), rejected" << endl;
+
+            --trials;
+            if (trials > 0)
+            {
+                drop_lr();
+                return false;
+            }
+            else
+            {
+                return true;
+            }
+        };
+
+        out << "INFO (learning_rate):" << endl;
+
+        if (params.auto_lr_drop)
+        {
+            accept();
+
+            if (total_done >= last_lr_drop + params.auto_lr_drop)
+            {
+                drop_lr();
+            }
+        }
+        else if (latest_loss < best_loss)
+        {
+            accept();
+        }
+        else
+        {
+            converged = reject();
+        }
+
+        if (converged)
+        {
+            out << "  - converged" << endl;
+        }
+
+        return converged;
+    }
+
     // Write evaluation function file.
     bool LearnerThink::save(bool is_final)
     {
         // Each time you save, change the extension part of the file name like "0","1","2",..
         // (Because I want to compare the winning rate for each evaluation function parameter later)
 
+        bool converged = false;
+
         if (params.save_only_once)
         {
             // When EVAL_SAVE_ONLY_ONCE is defined,
@@ -888,65 +967,22 @@ namespace Learner
         else if (is_final)
         {
             Eval::NNUE::save_eval("final");
-            return true;
+            converged = true;
         }
         else
         {
-            static int dir_number = 0;
+            // TODO: consider naming the output directory by epoch.
             const std::string dir_name = std::to_string(dir_number++);
             Eval::NNUE::save_eval(dir_name);
 
-            if (params.newbob_decay != 1.0 && latest_loss_count > 0) {
-                static int trials = params.newbob_num_trials;
-                const double latest_loss = latest_loss_sum / latest_loss_count;
-                latest_loss_sum = 0.0;
-                latest_loss_count = 0;
-                cout << "INFO (learning_rate):" << endl;
-                cout << "  - loss = " << latest_loss;
-                auto tot = total_done;
-                if (params.auto_lr_drop)
-                {
-                    cout << " < best (" << best_loss << "), accepted" << endl;
-                    best_loss = latest_loss;
-                    trials = params.newbob_num_trials;
-
-                    if (tot >= last_lr_drop + params.auto_lr_drop)
-                    {
-                        last_lr_drop = tot;
-                        params.learning_rate *= params.newbob_decay;
-                    }
-                }
-                else if (latest_loss < best_loss)
-                {
-                    cout << " < best (" << best_loss << "), accepted" << endl;
-                    best_loss = latest_loss;
-                    trials = params.newbob_num_trials;
-                }
-                else
-                {
-                    cout << " >= best (" << best_loss << "), rejected" << endl;
-
-                    if (--trials > 0 && !is_final)
-                    {
-                        cout
-                            << "  - reducing learning rate from " << params.learning_rate
-                            << " to " << (params.learning_rate * params.newbob_decay)
-                            << " (" << trials << " more trials)" << endl;
-
-                        params.learning_rate *= params.newbob_decay;
-                    }
-                }
-
+            if (params.newbob_decay != 1.0 && latest_loss_count > 0)
+            {
+                converged = check_progress();
                 params.best_nn_directory = Path::combine((std::string)Options["EvalSaveDir"], dir_name);
-
-                if (trials == 0)
-                {
-                    cout << "  - converged" << endl;
-                    return true;
-                }
             }
         }
-        return false;
+
+        return converged;
     }
 
     // Learning from the generated game record
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index a97b45c7..0cd61a41 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -280,6 +280,6 @@ namespace Eval::NNUE {
 #ifndef NDEBUG
         assert(result);
 #endif
-        out << "INFO (save_eval): Saving current evaluation file in " << eval_dir << std::endl;
+        out << "INFO (save_eval): Finished saving evaluation file in " << eval_dir << std::endl;
     }
 }  // namespace Eval::NNUE
\ No newline at end of file

From ec9e49e875f06c450d1511964886cd2df17c72ca Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 28 Oct 2020 15:24:29 +0100
Subject: [PATCH 408/583] Add a HalfKA architecture (a product of K - king, and
 A - any piece) along with all required infrastructure. HalfKA doesn't
 discriminate kings compared to HalfKP. Keep old architecture as the default
 one.

---
 src/Makefile                                  |  3 +
 src/nnue/architectures/halfka_256x2-32-32.h   | 54 +++++++++++
 src/nnue/evaluate_nnue_learner.cpp            |  1 +
 src/nnue/features/a.cpp                       | 50 ++++++++++
 src/nnue/features/a.h                         | 54 +++++++++++
 src/nnue/features/half_ka.cpp                 | 89 ++++++++++++++++++
 src/nnue/features/half_ka.h                   | 75 +++++++++++++++
 src/nnue/features/half_relative_ka.cpp        | 86 +++++++++++++++++
 src/nnue/features/half_relative_ka.h          | 68 ++++++++++++++
 .../trainer/features/factorizer_half_ka.h     | 93 +++++++++++++++++++
 10 files changed, 573 insertions(+)
 create mode 100644 src/nnue/architectures/halfka_256x2-32-32.h
 create mode 100644 src/nnue/features/a.cpp
 create mode 100644 src/nnue/features/a.h
 create mode 100644 src/nnue/features/half_ka.cpp
 create mode 100644 src/nnue/features/half_ka.h
 create mode 100644 src/nnue/features/half_relative_ka.cpp
 create mode 100644 src/nnue/features/half_relative_ka.h
 create mode 100644 src/nnue/trainer/features/factorizer_half_ka.h

diff --git a/src/Makefile b/src/Makefile
index f2c4d269..45d27ef2 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -50,9 +50,12 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
 	nnue/evaluate_nnue.cpp \
 	nnue/evaluate_nnue_learner.cpp \
 	nnue/features/half_kp.cpp \
+	nnue/features/half_ka.cpp \
 	nnue/features/half_relative_kp.cpp \
+	nnue/features/half_relative_ka.cpp \
 	nnue/features/k.cpp \
 	nnue/features/p.cpp \
+	nnue/features/a.cpp \
 	nnue/features/castling_right.cpp \
 	nnue/features/enpassant.cpp \
 	nnue/nnue_test_command.cpp \
diff --git a/src/nnue/architectures/halfka_256x2-32-32.h b/src/nnue/architectures/halfka_256x2-32-32.h
new file mode 100644
index 00000000..c108ef5d
--- /dev/null
+++ b/src/nnue/architectures/halfka_256x2-32-32.h
@@ -0,0 +1,54 @@
+/*
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+// Definition of input features and network structure used in NNUE evaluation function
+
+#ifndef NNUE_HALFKA_256X2_32_32_H_INCLUDED
+#define NNUE_HALFKA_256X2_32_32_H_INCLUDED
+
+#include "nnue/features/feature_set.h"
+#include "nnue/features/half_ka.h"
+
+#include "nnue/layers/input_slice.h"
+#include "nnue/layers/affine_transform.h"
+#include "nnue/layers/clipped_relu.h"
+
+namespace Eval::NNUE {
+
+    // Input features used in evaluation function
+    using RawFeatures = Features::FeatureSet<
+        Features::HalfKA<Features::Side::kFriend>>;
+
+    // Number of input feature dimensions after conversion
+    constexpr IndexType kTransformedFeatureDimensions = 256;
+
+    namespace Layers {
+
+        // Define network structure
+        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+
+    }  // namespace Layers
+
+    using Network = Layers::OutputLayer;
+
+}  // namespace Eval::NNUE
+
+#endif // #ifndef NNUE_HALFA_256X2_32_32_H_INCLUDED
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 0cd61a41..4de939c5 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -7,6 +7,7 @@
 
 #include "trainer/features/factorizer_feature_set.h"
 #include "trainer/features/factorizer_half_kp.h"
+#include "trainer/features/factorizer_half_ka.h"
 #include "trainer/trainer_feature_transformer.h"
 #include "trainer/trainer_input_slice.h"
 #include "trainer/trainer_affine_transform.h"
diff --git a/src/nnue/features/a.cpp b/src/nnue/features/a.cpp
new file mode 100644
index 00000000..6ceb4efa
--- /dev/null
+++ b/src/nnue/features/a.cpp
@@ -0,0 +1,50 @@
+﻿#include "a.h"
+#include "index_list.h"
+
+// Definition of input feature A of NNUE evaluation function
+namespace Eval::NNUE::Features {
+
+    // Orient a square according to perspective (flip rank for black)
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * SQ_A8));
+    }
+
+    // Find the index of the feature quantity from the king position and PieceSquare
+    inline IndexType A::make_index(
+        Color perspective, Square s, Piece pc) {
+        return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
+    }
+
+    // Get a list of indices with a value of 1 among the features
+    void A::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
+
+        Bitboard bb = pos.pieces();
+        while (bb) {
+            Square s = pop_lsb(&bb);
+            active->push_back(make_index(perspective, s, pos.piece_on(s)));
+        }
+    }
+
+    // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+    void A::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {
+
+        const auto& dp = pos.state()->dirtyPiece;
+        for (int i = 0; i < dp.dirty_num; ++i) {
+            Piece pc = dp.piece[i];
+
+            if (dp.from[i] != SQ_NONE)
+              removed->push_back(make_index(perspective, dp.from[i], pc));
+
+            if (dp.to[i] != SQ_NONE)
+              added->push_back(make_index(perspective, dp.to[i], pc));
+        }
+    }
+
+}  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/a.h b/src/nnue/features/a.h
new file mode 100644
index 00000000..50a0d8be
--- /dev/null
+++ b/src/nnue/features/a.h
@@ -0,0 +1,54 @@
+﻿#ifndef _NNUE_FEATURES_A_H_
+#define _NNUE_FEATURES_A_H_
+
+#include "features_common.h"
+
+#include "evaluate.h"
+
+// Definition of input feature A of NNUE evaluation function
+// A is a union of P features and K features, so technically the
+// same effect can be achieved by including both P and K features
+// but it would result in slower index appending because
+// P would conditionally exclude K features and vice versa,
+// where A doesn't have any conditionals.
+namespace Eval::NNUE::Features {
+
+    // Feature P: PieceSquare of pieces other than balls
+    class A {
+    public:
+        // feature quantity name
+        static constexpr const char* kName = "A";
+
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue = 0x7A4C414Cu;
+
+        // number of feature dimensions
+        static constexpr IndexType kDimensions = PS_END2;
+
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions = 32;
+
+        // Timing of full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+
+        // Get a list of indices with a value of 1 among the features
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
+
+        // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
+
+    private:
+        // Index of a feature for a given piece on some square
+        static IndexType make_index(Color perspective, Square s, Piece pc);
+    };
+
+}  // namespace Eval::NNUE::Features
+
+#endif // #ifndef _NNUE_FEATURES_UNION_P_K_H_
diff --git a/src/nnue/features/half_ka.cpp b/src/nnue/features/half_ka.cpp
new file mode 100644
index 00000000..83e59067
--- /dev/null
+++ b/src/nnue/features/half_ka.cpp
@@ -0,0 +1,89 @@
+/*
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+//Definition of input features HalfKA of NNUE evaluation function
+
+#include "half_ka.h"
+#include "index_list.h"
+
+namespace Eval::NNUE::Features {
+
+    // Orient a square according to perspective (flip rank for black)
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * SQ_A8));
+    }
+
+    // Find the index of the feature quantity from the king position and PieceSquare
+    template <Side AssociatedKing>
+    inline IndexType HalfKA<AssociatedKing>::make_index(
+        Color perspective,
+        Square s,
+        Piece pc,
+        Square ksq) {
+
+        return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective] + PS_END2 * ksq);
+    }
+
+    // Get a list of indices for active features
+    template <Side AssociatedKing>
+    void HalfKA<AssociatedKing>::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
+
+        Square ksq = orient(
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));
+
+        Bitboard bb = pos.pieces();
+        while (bb) {
+            Square s = pop_lsb(&bb);
+            active->push_back(make_index(perspective, s, pos.piece_on(s), ksq));
+        }
+    }
+
+    // Get a list of indices for recently changed features
+    template <Side AssociatedKing>
+    void HalfKA<AssociatedKing>::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {
+
+        Square ksq = orient(
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));
+
+        const auto& dp = pos.state()->dirtyPiece;
+        for (int i = 0; i < dp.dirty_num; ++i) {
+            Piece pc = dp.piece[i];
+
+            if (dp.from[i] != SQ_NONE)
+                removed->push_back(make_index(perspective, dp.from[i], pc, ksq));
+
+            if (dp.to[i] != SQ_NONE)
+                added->push_back(make_index(perspective, dp.to[i], pc, ksq));
+        }
+    }
+
+    template class HalfKA<Side::kFriend>;
+    template class HalfKA<Side::kEnemy>;
+
+}  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/half_ka.h b/src/nnue/features/half_ka.h
new file mode 100644
index 00000000..2839357e
--- /dev/null
+++ b/src/nnue/features/half_ka.h
@@ -0,0 +1,75 @@
+/*
+    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+    Stockfish is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Stockfish is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef NNUE_FEATURES_HALF_KA_H_INCLUDED
+#define NNUE_FEATURES_HALF_KA_H_INCLUDED
+
+#include "features_common.h"
+
+#include "evaluate.h"
+
+//Definition of input features HalfKPK of NNUE evaluation function
+namespace Eval::NNUE::Features {
+
+    // Feature HalfKPK: Combination of the position of own king
+    // and the position of pieces other than kings
+    template <Side AssociatedKing>
+    class HalfKA {
+
+    public:
+        // Feature name
+        static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
+            "HalfKA(Friend)" : "HalfKA(Enemy)";
+
+        // Hash value embedded in the evaluation file
+        static constexpr std::uint32_t kHashValue =
+            0x5F134CB9u ^ (AssociatedKing == Side::kFriend);
+
+        // Number of feature dimensions
+        static constexpr IndexType kDimensions =
+            static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(PS_END2);
+
+        // Maximum number of simultaneously active features
+        static constexpr IndexType kMaxActiveDimensions = 32;
+
+        // Trigger for full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger =
+            (AssociatedKing == Side::kFriend) ?
+            TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
+
+        // Get a list of indices for active features
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
+
+        // Get a list of indices for recently changed features
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
+
+    private:
+        // Index of a feature for a given king position and another piece on some square
+        static IndexType make_index(Color perspective, Square s, Piece pc, Square sq_k);
+    };
+
+}  // namespace Eval::NNUE::Features
+
+#endif // #ifndef NNUE_FEATURES_HALF_KA_H_INCLUDED
diff --git a/src/nnue/features/half_relative_ka.cpp b/src/nnue/features/half_relative_ka.cpp
new file mode 100644
index 00000000..ba3edbcf
--- /dev/null
+++ b/src/nnue/features/half_relative_ka.cpp
@@ -0,0 +1,86 @@
+﻿#include "half_relative_ka.h"
+#include "index_list.h"
+
+//Definition of input features HalfRelativeKA of NNUE evaluation function
+namespace Eval::NNUE::Features {
+
+    // Orient a square according to perspective (flip rank for black)
+    inline Square orient(Color perspective, Square s) {
+        return Square(int(s) ^ (bool(perspective) * SQ_A8));
+    }
+
+    // Find the index of the feature quantity from the ball position and PieceSquare
+    template <Side AssociatedKing>
+    inline IndexType HalfRelativeKA<AssociatedKing>::make_index(
+        Color perspective,
+        Square s,
+        Piece pc,
+        Square sq_k) {
+
+        const IndexType p = IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
+        return make_index(sq_k, p);
+    }
+
+    // Find the index of the feature quantity from the ball position and PieceSquare
+    template <Side AssociatedKing>
+    inline IndexType HalfRelativeKA<AssociatedKing>::make_index(
+        Square sq_k,
+        IndexType p) {
+
+        constexpr IndexType W = kBoardWidth;
+        constexpr IndexType H = kBoardHeight;
+        const IndexType piece_index = (p - PS_W_PAWN) / SQUARE_NB;
+        const Square sq_p = static_cast<Square>((p - PS_W_PAWN) % SQUARE_NB);
+        const IndexType relative_file = file_of(sq_p) - file_of(sq_k) + (W / 2);
+        const IndexType relative_rank = rank_of(sq_p) - rank_of(sq_k) + (H / 2);
+        return H * W * piece_index + H * relative_file + relative_rank;
+    }
+
+    // Get a list of indices with a value of 1 among the features
+    template <Side AssociatedKing>
+    void HalfRelativeKA<AssociatedKing>::append_active_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* active) {
+
+        Square ksq = orient(
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));
+
+        Bitboard bb = pos.pieces();
+        while (bb) {
+            Square s = pop_lsb(&bb);
+            active->push_back(make_index(perspective, s, pos.piece_on(s), ksq));
+        }
+    }
+
+    // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+    template <Side AssociatedKing>
+    void HalfRelativeKA<AssociatedKing>::append_changed_indices(
+        const Position& pos,
+        Color perspective,
+        IndexList* removed,
+        IndexList* added) {
+
+        Square ksq = orient(
+            perspective,
+            pos.square<KING>(
+                AssociatedKing == Side::kFriend ? perspective : ~perspective));
+
+        const auto& dp = pos.state()->dirtyPiece;
+        for (int i = 0; i < dp.dirty_num; ++i) {
+            Piece pc = dp.piece[i];
+
+            if (dp.from[i] != SQ_NONE)
+                removed->push_back(make_index(perspective, dp.from[i], pc, ksq));
+
+            if (dp.to[i] != SQ_NONE)
+                added->push_back(make_index(perspective, dp.to[i], pc, ksq));
+        }
+    }
+
+    template class HalfRelativeKA<Side::kFriend>;
+    template class HalfRelativeKA<Side::kEnemy>;
+
+}  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/half_relative_ka.h b/src/nnue/features/half_relative_ka.h
new file mode 100644
index 00000000..f42661e9
--- /dev/null
+++ b/src/nnue/features/half_relative_ka.h
@@ -0,0 +1,68 @@
+﻿#ifndef _NNUE_FEATURES_HALF_RELATIVE_KA_H_
+#define _NNUE_FEATURES_HALF_RELATIVE_KA_H_
+
+#include "features_common.h"
+
+#include "evaluate.h"
+
+// Definition of input features HalfRelativeKA of NNUE evaluation function
+// K - King
+// A - Any piece
+// KA - product of K and A
+namespace Eval::NNUE::Features {
+
+    // Feature HalfRelativeKA: Relative position of each piece other than the ball based on own ball or enemy ball
+    template <Side AssociatedKing>
+    class HalfRelativeKA {
+    public:
+        // feature quantity name
+        static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
+            "HalfRelativeKA(Friend)" : "HalfRelativeKA(Enemy)";
+
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue =
+            0xA123051Fu ^ (AssociatedKing == Side::kFriend);
+
+        static constexpr IndexType kNumPieceKinds = 6 * 2;
+
+        // width of the virtual board with the ball in the center
+        static constexpr IndexType kBoardWidth = FILE_NB * 2 - 1;
+
+        // height of a virtual board with balls in the center
+        static constexpr IndexType kBoardHeight = RANK_NB * 2 - 1;
+
+        // number of feature dimensions
+        static constexpr IndexType kDimensions =
+            kNumPieceKinds * kBoardHeight * kBoardWidth;
+
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions = 32;
+
+        // Timing of full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger =
+            (AssociatedKing == Side::kFriend) ?
+            TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
+
+        // Get a list of indices with a value of 1 among the features
+        static void append_active_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* active);
+
+        // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
+        static void append_changed_indices(
+            const Position& pos,
+            Color perspective,
+            IndexList* removed,
+            IndexList* added);
+
+        // Find the index of the feature quantity from the ball position and PieceSquare
+        static IndexType make_index(Square s, IndexType p);
+
+        // Find the index of the feature quantity from the ball position and PieceSquare
+        static IndexType make_index(Color perspective, Square s, Piece pc, Square sq_k);
+    };
+
+}  // namespace Eval::NNUE::Features
+
+#endif // #ifndef _NNUE_FEATURES_HALF_RELATIVE_KA_H_
diff --git a/src/nnue/trainer/features/factorizer_half_ka.h b/src/nnue/trainer/features/factorizer_half_ka.h
new file mode 100644
index 00000000..90bd9d97
--- /dev/null
+++ b/src/nnue/trainer/features/factorizer_half_ka.h
@@ -0,0 +1,93 @@
+﻿#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KA_H_
+#define _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KA_H_
+
+#include "factorizer.h"
+
+#include "nnue/features/half_ka.h"
+#include "nnue/features/a.h"
+#include "nnue/features/half_relative_ka.h"
+
+// Specialization of NNUE evaluation function feature conversion class template for HalfKA
+namespace Eval::NNUE::Features {
+
+    // Class template that converts input features into learning features
+    // Specialization for HalfKA
+    template <Side AssociatedKing>
+    class Factorizer<HalfKA<AssociatedKing>> {
+    private:
+        using FeatureType = HalfKA<AssociatedKing>;
+
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions =
+            FeatureType::kMaxActiveDimensions;
+
+        // Type of learning feature
+        enum TrainingFeatureType {
+            kFeaturesHalfKA,
+            kFeaturesA,
+            kFeaturesHalfRelativeKA,
+            kNumTrainingFeatureTypes,
+        };
+
+        // Learning feature information
+        static constexpr FeatureProperties kProperties[] = {
+            // kFeaturesHalfKPK
+            {true, FeatureType::kDimensions},
+            // kFeaturesPK
+            {true, Factorizer<A>::get_dimensions()},
+            // kFeaturesHalfRelativeKPK
+            {true, Factorizer<HalfRelativeKA<AssociatedKing>>::get_dimensions()},
+        };
+
+        static_assert(get_array_length(kProperties) == kNumTrainingFeatureTypes, "");
+
+    public:
+        static constexpr std::string get_name() {
+            return std::string("Factorizer<") + FeatureType::kName + ">";
+        }
+
+        static constexpr std::string get_factorizers_string() {
+            return "  - " + get_name();
+        }
+
+        // Get the dimensionality of the learning feature
+        static constexpr IndexType get_dimensions() {
+            return get_active_dimensions(kProperties);
+        }
+
+        // Get index of learning feature and scale of learning rate
+        static void append_training_features(
+            IndexType base_index, std::vector<TrainingFeature>* training_features) {
+
+            // kFeaturesHalfKPK
+            IndexType index_offset = append_base_feature<FeatureType>(
+                kProperties[kFeaturesHalfKA], base_index, training_features);
+
+            const auto sq_k = static_cast<Square>(base_index / PS_END2);
+            const auto a = static_cast<IndexType>(base_index % PS_END2);
+
+            // kFeaturesPK
+            index_offset += inherit_features_if_required<A>(
+                index_offset, kProperties[kFeaturesA], a, training_features);
+
+            // kFeaturesHalfRelativeKPK
+            if (a >= PS_W_PAWN) {
+                index_offset += inherit_features_if_required<HalfRelativeKA<AssociatedKing>>(
+                    index_offset, kProperties[kFeaturesHalfRelativeKA],
+                    HalfRelativeKA<AssociatedKing>::make_index(sq_k, a),
+                    training_features);
+            }
+            else {
+                index_offset += skip_features(kProperties[kFeaturesHalfRelativeKA]);
+            }
+
+            assert(index_offset == get_dimensions());
+        }
+    };
+
+    template <Side AssociatedKing>
+    constexpr FeatureProperties Factorizer<HalfKA<AssociatedKing>>::kProperties[];
+
+}  // namespace Eval::NNUE::Features
+
+#endif // #ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KA_H_

From 8fac468259e9bcd667c9d44cad48fc736b1bb98d Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Tue, 27 Oct 2020 19:37:06 +0100
Subject: [PATCH 409/583] Add a cache line aligned allocator.

---
 src/misc.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/src/misc.h b/src/misc.h
index 9f250b6e..be9b4c38 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -563,6 +563,23 @@ public:
   void deallocate(T* p, std::size_t ) { std_aligned_free(p); }
 };
 
+template <typename T>
+class CacheLineAlignedAllocator {
+public:
+    using value_type = T;
+
+    constexpr static uint64_t cache_line_size = 64;
+
+    CacheLineAlignedAllocator() {}
+    CacheLineAlignedAllocator(const CacheLineAlignedAllocator&) {}
+    CacheLineAlignedAllocator(CacheLineAlignedAllocator&&) {}
+
+    template <typename U> CacheLineAlignedAllocator(const CacheLineAlignedAllocator<U>&) {}
+
+    T* allocate(std::size_t n) { return (T*)std_aligned_alloc(cache_line_size, n * sizeof(T)); }
+    void deallocate(T* p, std::size_t) { std_aligned_free(p); }
+};
+
 // --------------------
 //  Dependency Wrapper
 // --------------------

From f1e96cab55a7825a00ce6fdc7cae49ee77adbdd7 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Tue, 27 Oct 2020 19:37:19 +0100
Subject: [PATCH 410/583] Align trainer arrays to cache line.

---
 src/nnue/trainer/trainer_affine_transform.h    | 12 ++++++------
 src/nnue/trainer/trainer_clipped_relu.h        |  4 ++--
 src/nnue/trainer/trainer_feature_transformer.h | 10 +++++-----
 src/nnue/trainer/trainer_input_slice.h         |  6 +++---
 src/nnue/trainer/trainer_sum.h                 |  2 +-
 5 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index 3179aeb0..449a0a11 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -353,18 +353,18 @@ namespace Eval::NNUE {
         LayerType* const target_layer_;
 
         // parameter
-        LearnFloatType biases_[kOutputDimensions];
-        LearnFloatType weights_[kOutputDimensions * kInputDimensions];
+        alignas(kCacheLineSize) LearnFloatType biases_[kOutputDimensions];
+        alignas(kCacheLineSize) LearnFloatType weights_[kOutputDimensions * kInputDimensions];
 
         // Buffer used for updating parameters
-        LearnFloatType biases_diff_[kOutputDimensions];
-        LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions];
+        alignas(kCacheLineSize) LearnFloatType biases_diff_[kOutputDimensions];
+        alignas(kCacheLineSize) LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions];
 
         // Forward propagation buffer
-        std::vector<LearnFloatType> output_;
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
 
         // buffer for back propagation
-        std::vector<LearnFloatType> gradients_;
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
 
         // hyper parameter
         LearnFloatType momentum_;
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index 57e9bac4..5f2ff065 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -149,10 +149,10 @@ namespace Eval::NNUE {
         LayerType* const target_layer_;
 
         // Forward propagation buffer
-        std::vector<LearnFloatType> output_;
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
 
         // buffer for back propagation
-        std::vector<LearnFloatType> gradients_;
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
 
         // Health check statistics
         LearnFloatType min_activations_[kOutputDimensions];
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 97b19c46..9f0648d2 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -421,11 +421,11 @@ namespace Eval::NNUE {
             LearnFloatType weights_[kHalfDimensions * kInputDimensions];
 
         // Buffer used for updating parameters
-        LearnFloatType biases_diff_[kHalfDimensions];
-        std::vector<LearnFloatType> gradients_;
+        alignas(kCacheLineSize) LearnFloatType biases_diff_[kHalfDimensions];
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
 
         // Forward propagation buffer
-        std::vector<LearnFloatType> output_;
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
 
         // Features that appeared in the training data
         std::bitset<kInputDimensions> observed_features;
@@ -437,8 +437,8 @@ namespace Eval::NNUE {
         // Health check statistics
         LearnFloatType min_pre_activation_;
         LearnFloatType max_pre_activation_;
-        LearnFloatType min_activations_[kHalfDimensions];
-        LearnFloatType max_activations_[kHalfDimensions];
+        alignas(kCacheLineSize) LearnFloatType min_activations_[kHalfDimensions];
+        alignas(kCacheLineSize) LearnFloatType max_activations_[kHalfDimensions];
     };
 
 }  // namespace Eval::NNUE
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index 43968776..9b8e5e13 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -163,7 +163,7 @@ namespace Eval::NNUE {
         const LearnFloatType* output_;
 
         // buffer for back propagation
-        std::vector<LearnFloatType> gradients_;
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
     };
 
     // Learning: Input layer
@@ -256,10 +256,10 @@ namespace Eval::NNUE {
         const std::shared_ptr<SharedInputTrainer> shared_input_trainer_;
 
         // Forward propagation buffer
-        std::vector<LearnFloatType> output_;
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
 
         // buffer for back propagation
-        std::vector<LearnFloatType> gradients_;
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
     };
 
 }  // namespace Eval::NNUE
diff --git a/src/nnue/trainer/trainer_sum.h b/src/nnue/trainer/trainer_sum.h
index c2e40b1c..b35420d6 100644
--- a/src/nnue/trainer/trainer_sum.h
+++ b/src/nnue/trainer/trainer_sum.h
@@ -184,7 +184,7 @@ namespace Eval::NNUE {
         LayerType* const target_layer_;
 
         // Forward propagation buffer
-        std::vector<LearnFloatType> output_;
+        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
     };
 
 }  // namespace Eval::NNUE

From ee0917a3459ee90a27cab4b519e571ca4fc22ac1 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 26 Oct 2020 15:06:15 +0100
Subject: [PATCH 411/583] Pass ThreadPool to update_parameters, propagate, and
 backpropagate.

---
 src/learn/learn.cpp                           |  2 +-
 src/nnue/evaluate_nnue_learner.cpp            |  6 +++--
 src/nnue/evaluate_nnue_learner.h              |  3 +++
 src/nnue/trainer/trainer_affine_transform.h   | 11 ++++++----
 src/nnue/trainer/trainer_clipped_relu.h       | 11 ++++++----
 .../trainer/trainer_feature_transformer.h     | 11 ++++++++--
 src/nnue/trainer/trainer_input_slice.h        | 22 +++++++++++--------
 src/nnue/trainer/trainer_sum.h                | 15 ++++++++-----
 8 files changed, 53 insertions(+), 28 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 93262b42..66461cc5 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -704,7 +704,7 @@ namespace Learner
         // should be no real issues happening since
         // the read/write phases are isolated.
         atomic_thread_fence(memory_order_seq_cst);
-        Eval::NNUE::update_parameters(epoch, params.verbose, params.learning_rate, calc_grad);
+        Eval::NNUE::update_parameters(Threads, epoch, params.verbose, params.learning_rate, calc_grad);
         atomic_thread_fence(memory_order_seq_cst);
 
         if (++save_count * params.mini_batch_size >= params.eval_save_interval)
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 4de939c5..6294865d 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -18,6 +18,7 @@
 #include "uci.h"
 #include "misc.h"
 #include "thread_win32_osx.h"
+#include "thread.h"
 
 // Code for learning NNUE evaluation function
 namespace Eval::NNUE {
@@ -180,6 +181,7 @@ namespace Eval::NNUE {
 
     // update the evaluation function parameters
     void update_parameters(
+        ThreadPool& thread_pool,
         uint64_t epoch,
         bool verbose,
         double learning_rate,
@@ -202,7 +204,7 @@ namespace Eval::NNUE {
             std::vector<Example> batch(examples.end() - batch_size, examples.end());
             examples.resize(examples.size() - batch_size);
 
-            const auto network_output = trainer->propagate(batch);
+            const auto network_output = trainer->propagate(thread_pool, batch);
 
             std::vector<LearnFloatType> gradients(batch.size());
             for (std::size_t b = 0; b < batch.size(); ++b) {
@@ -226,7 +228,7 @@ namespace Eval::NNUE {
                 }
             }
 
-            trainer->backpropagate(gradients.data(), learning_rate);
+            trainer->backpropagate(thread_pool, gradients.data(), learning_rate);
 
             collect_stats = false;
         }
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index d350691b..8633f713 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -5,6 +5,8 @@
 
 #include "misc.h"
 
+struct ThreadPool;
+
 // Interface used for learning NNUE evaluation function
 namespace Eval::NNUE {
 
@@ -32,6 +34,7 @@ namespace Eval::NNUE {
 
     // update the evaluation function parameters
     void update_parameters(
+        ThreadPool& thread_pool,
         uint64_t epoch,
         bool verbose,
         double learning_rate,
diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index 449a0a11..5d2f29c9 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -7,6 +7,8 @@
 
 #include "nnue/layers/affine_transform.h"
 
+#include "thread.h"
+
 #include <random>
 
 // Specialization of NNUE evaluation function learning class template for AffineTransform
@@ -88,14 +90,14 @@ namespace Eval::NNUE {
         }
 
         // forward propagation
-        const LearnFloatType* propagate(const std::vector<Example>& batch) {
+        const LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
             if (output_.size() < kOutputDimensions * batch.size()) {
                 output_.resize(kOutputDimensions * batch.size());
                 gradients_.resize(kInputDimensions * batch.size());
             }
 
             batch_size_ = static_cast<IndexType>(batch.size());
-            batch_input_ = previous_layer_trainer_->propagate(batch);
+            batch_input_ = previous_layer_trainer_->propagate(thread_pool, batch);
 #if defined(USE_BLAS)
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
@@ -127,7 +129,8 @@ namespace Eval::NNUE {
         }
 
         // backpropagation
-        void backpropagate(const LearnFloatType* gradients,
+        void backpropagate(ThreadPool& thread_pool,
+                           const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {
 
             const LearnFloatType local_learning_rate =
@@ -211,7 +214,7 @@ namespace Eval::NNUE {
             }
             num_weights_diffs_ += kOutputDimensions * kInputDimensions;
 
-            previous_layer_trainer_->backpropagate(gradients_.data(), learning_rate);
+            previous_layer_trainer_->backpropagate(thread_pool, gradients_.data(), learning_rate);
         }
 
     private:
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index 5f2ff065..8e29e4a1 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -7,6 +7,8 @@
 
 #include "nnue/layers/clipped_relu.h"
 
+#include "thread.h"
+
 // Specialization of NNUE evaluation function learning class template for ClippedReLU
 namespace Eval::NNUE {
 
@@ -41,13 +43,13 @@ namespace Eval::NNUE {
         }
 
         // forward propagation
-        const LearnFloatType* propagate(const std::vector<Example>& batch) {
+        const LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
             if (output_.size() < kOutputDimensions * batch.size()) {
               output_.resize(kOutputDimensions * batch.size());
               gradients_.resize(kInputDimensions * batch.size());
             }
 
-            const auto input = previous_layer_trainer_->propagate(batch);
+            const auto input = previous_layer_trainer_->propagate(thread_pool, batch);
             batch_size_ = static_cast<IndexType>(batch.size());
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
@@ -63,7 +65,8 @@ namespace Eval::NNUE {
         }
 
         // backpropagation
-        void backpropagate(const LearnFloatType* gradients,
+        void backpropagate(ThreadPool& thread_pool,
+                           const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {
 
             for (IndexType b = 0; b < batch_size_; ++b) {
@@ -77,7 +80,7 @@ namespace Eval::NNUE {
             }
             num_total_ += batch_size_ * kOutputDimensions;
 
-            previous_layer_trainer_->backpropagate(gradients_.data(), learning_rate);
+            previous_layer_trainer_->backpropagate(thread_pool, gradients_.data(), learning_rate);
         }
 
     private:
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 9f0648d2..a778f956 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -9,6 +9,8 @@
 
 #include "nnue/nnue_feature_transformer.h"
 
+#include "thread.h"
+
 #include <array>
 #include <bitset>
 #include <numeric>
@@ -90,12 +92,14 @@ namespace Eval::NNUE {
         }
 
         // forward propagation
-        const LearnFloatType* propagate(const std::vector<Example>& batch) {
+        const LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
             if (output_.size() < kOutputDimensions * batch.size()) {
                 output_.resize(kOutputDimensions * batch.size());
                 gradients_.resize(kOutputDimensions * batch.size());
             }
 
+            (void)thread_pool;
+
             batch_ = &batch;
             // affine transform
 #pragma omp parallel for
@@ -143,9 +147,12 @@ namespace Eval::NNUE {
         }
 
         // backpropagation
-        void backpropagate(const LearnFloatType* gradients,
+        void backpropagate(ThreadPool& thread_pool,
+                           const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {
 
+            (void)thread_pool;
+
             const LearnFloatType local_learning_rate =
                 learning_rate * learning_rate_scale_;
 
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index 9b8e5e13..4bb38104 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -7,6 +7,8 @@
 
 #include "nnue/layers/input_slice.h"
 
+#include "thread.h"
+
 // Specialization of NNUE evaluation function learning class template for InputSlice
 namespace Eval::NNUE {
 
@@ -60,7 +62,7 @@ namespace Eval::NNUE {
         }
 
         // forward propagation
-        const LearnFloatType* propagate(const std::vector<Example>& batch) {
+        const LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
             if (gradients_.size() < kInputDimensions * batch.size()) {
                 gradients_.resize(kInputDimensions * batch.size());
             }
@@ -69,7 +71,7 @@ namespace Eval::NNUE {
 
             if (num_calls_ == 0) {
                 current_operation_ = Operation::kPropagate;
-                output_ = feature_transformer_trainer_->propagate(batch);
+                output_ = feature_transformer_trainer_->propagate(thread_pool, batch);
             }
 
             assert(current_operation_ == Operation::kPropagate);
@@ -83,11 +85,12 @@ namespace Eval::NNUE {
         }
 
         // backpropagation
-        void backpropagate(const LearnFloatType* gradients,
+        void backpropagate(ThreadPool& thread_pool,
+                           const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {
 
             if (num_referrers_ == 1) {
-                feature_transformer_trainer_->backpropagate(gradients, learning_rate);
+                feature_transformer_trainer_->backpropagate(thread_pool, gradients, learning_rate);
                 return;
             }
 
@@ -112,7 +115,7 @@ namespace Eval::NNUE {
 
             if (++num_calls_ == num_referrers_) {
                 feature_transformer_trainer_->backpropagate(
-                    gradients_.data(), learning_rate);
+                    thread_pool, gradients_.data(), learning_rate);
                 num_calls_ = 0;
                 current_operation_ = Operation::kNone;
             }
@@ -193,7 +196,7 @@ namespace Eval::NNUE {
         }
 
         // forward propagation
-        const LearnFloatType* propagate(const std::vector<Example>& batch) {
+        const LearnFloatType* propagate(ThreadPool& thread_pool,const std::vector<Example>& batch) {
             if (output_.size() < kOutputDimensions * batch.size()) {
               output_.resize(kOutputDimensions * batch.size());
               gradients_.resize(kInputDimensions * batch.size());
@@ -201,7 +204,7 @@ namespace Eval::NNUE {
 
             batch_size_ = static_cast<IndexType>(batch.size());
 
-            const auto input = shared_input_trainer_->propagate(batch);
+            const auto input = shared_input_trainer_->propagate(thread_pool, batch);
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType input_offset = kInputDimensions * b;
                 const IndexType output_offset = kOutputDimensions * b;
@@ -219,7 +222,8 @@ namespace Eval::NNUE {
         }
 
         // backpropagation
-        void backpropagate(const LearnFloatType* gradients,
+        void backpropagate(ThreadPool& thread_pool,
+                           const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {
 
             for (IndexType b = 0; b < batch_size_; ++b) {
@@ -233,7 +237,7 @@ namespace Eval::NNUE {
                     }
                 }
             }
-            shared_input_trainer_->backpropagate(gradients_.data(), learning_rate);
+            shared_input_trainer_->backpropagate(thread_pool, gradients_.data(), learning_rate);
         }
 
     private:
diff --git a/src/nnue/trainer/trainer_sum.h b/src/nnue/trainer/trainer_sum.h
index b35420d6..6defb95f 100644
--- a/src/nnue/trainer/trainer_sum.h
+++ b/src/nnue/trainer/trainer_sum.h
@@ -7,6 +7,8 @@
 
 #include "nnue/layers/sum.h"
 
+#include "thread.h"
+
 // Specialization of NNUE evaluation function learning class template for Sum
 namespace Eval::NNUE {
 
@@ -45,10 +47,10 @@ namespace Eval::NNUE {
         }
 
         // forward propagation
-        /*const*/ LearnFloatType* propagate(const std::vector<Example>& batch) {
+        /*const*/ LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
             batch_size_ = static_cast<IndexType>(batch.size());
-            auto output = Tail::propagate(batch);
-            const auto head_output = previous_layer_trainer_->propagate(batch);
+            auto output = Tail::propagate(thread_pool, batch);
+            const auto head_output = previous_layer_trainer_->propagate(thread_pool, batch);
 
 #if defined(USE_BLAS)
             cblas_saxpy(kOutputDimensions * batch_size_, 1.0,
@@ -66,11 +68,12 @@ namespace Eval::NNUE {
         }
 
         // backpropagation
-        void backpropagate(const LearnFloatType* gradients,
+        void backpropagate(ThreadPool& thread_pool,
+                           const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {
 
-            Tail::backpropagate(gradients, learning_rate);
-            previous_layer_trainer_->backpropagate(gradients, learning_rate);
+            Tail::backpropagate(thread_pool, gradients, learning_rate);
+            previous_layer_trainer_->backpropagate(thread_pool, gradients, learning_rate);
         }
 
     private:

From c56a4a36eb92e8fd32b8923a52896352465f93b0 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 28 Oct 2020 14:41:51 +0100
Subject: [PATCH 412/583] Add our own blas-like routines that use stockfish's
 thread pool for parallelization.

---
 src/Makefile                 |    1 +
 src/extra/stockfish_blas.cpp | 1033 ++++++++++++++++++++++++++++++++++
 src/extra/stockfish_blas.h   |  130 +++++
 src/thread.h                 |   29 +
 src/uci.cpp                  |    9 +
 5 files changed, 1202 insertions(+)
 create mode 100644 src/extra/stockfish_blas.cpp
 create mode 100644 src/extra/stockfish_blas.h

diff --git a/src/Makefile b/src/Makefile
index 45d27ef2..cba4e351 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -47,6 +47,7 @@ PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 1000 output_file_name $(PGO_TRAINING_
 SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \
 	material.cpp misc.cpp movegen.cpp movepick.cpp pawns.cpp position.cpp psqt.cpp \
 	search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
+	extra/stockfish_blas.cpp \
 	nnue/evaluate_nnue.cpp \
 	nnue/evaluate_nnue_learner.cpp \
 	nnue/features/half_kp.cpp \
diff --git a/src/extra/stockfish_blas.cpp b/src/extra/stockfish_blas.cpp
new file mode 100644
index 00000000..0ba40b49
--- /dev/null
+++ b/src/extra/stockfish_blas.cpp
@@ -0,0 +1,1033 @@
+#include "stockfish_blas.h"
+
+#include "thread.h"
+
+#include <cstring>
+#include <random>
+#include <iostream>
+#include <vector>
+#include <algorithm>
+#include <cmath>
+#include <atomic>
+#include <chrono>
+
+#if defined(USE_SSE2)
+#include <xmmintrin.h>
+#endif
+
+#if defined (USE_SSE3)
+#include <pmmintrin.h>
+#endif
+
+#if defined(USE_BLAS)
+#include <cblas.h>
+#endif
+
+namespace Blas {
+    void scopy(
+        const int N,
+        const float * SF_BLAS_RESTRICT X,
+        float * SF_BLAS_RESTRICT Y
+    )
+    {
+        std::memcpy(Y, X, sizeof(float) * N);
+    }
+
+    void scopy(
+        const int N,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    )
+    {
+        if (incX == 1 && incY == 1)
+        {
+            scopy(N, X, Y);
+        }
+        else
+        {
+            for(int i = 0; i < N; ++i)
+            {
+                *Y = *X;
+                X += incX;
+                Y += incY;
+            }
+        }
+    }
+
+    void scopy(
+        ThreadPool&,
+        const int N,
+        const float * SF_BLAS_RESTRICT X,
+        float * SF_BLAS_RESTRICT Y
+    )
+    {
+        scopy(N, X, Y);
+    }
+
+    void scopy(
+        ThreadPool&,
+        const int N,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    )
+    {
+        scopy(N, X, incX, Y, incY);
+    }
+
+    void sscal(
+        const int N,
+        const float alpha,
+        float * SF_BLAS_RESTRICT X
+    )
+    {
+#if defined (USE_SSE2)
+
+        const __m128 alpha4 = _mm_set1_ps(alpha);
+
+        int i = 0;
+        for(; i < N - 31; i += 32)
+        {
+            __m128 x0 = _mm_loadu_ps(X + i +  0);
+            __m128 x1 = _mm_loadu_ps(X + i +  4);
+            __m128 x2 = _mm_loadu_ps(X + i +  8);
+            __m128 x3 = _mm_loadu_ps(X + i + 12);
+            __m128 x4 = _mm_loadu_ps(X + i + 16);
+            __m128 x5 = _mm_loadu_ps(X + i + 20);
+            __m128 x6 = _mm_loadu_ps(X + i + 24);
+            __m128 x7 = _mm_loadu_ps(X + i + 28);
+
+            x0 = _mm_mul_ps(x0, alpha4);
+            x1 = _mm_mul_ps(x1, alpha4);
+            x2 = _mm_mul_ps(x2, alpha4);
+            x3 = _mm_mul_ps(x3, alpha4);
+            x4 = _mm_mul_ps(x4, alpha4);
+            x5 = _mm_mul_ps(x5, alpha4);
+            x6 = _mm_mul_ps(x6, alpha4);
+            x7 = _mm_mul_ps(x7, alpha4);
+
+            _mm_storeu_ps(X + i +  0, x0);
+            _mm_storeu_ps(X + i +  4, x1);
+            _mm_storeu_ps(X + i +  8, x2);
+            _mm_storeu_ps(X + i + 12, x3);
+            _mm_storeu_ps(X + i + 16, x4);
+            _mm_storeu_ps(X + i + 20, x5);
+            _mm_storeu_ps(X + i + 24, x6);
+            _mm_storeu_ps(X + i + 28, x7);
+        }
+
+        for(; i < N; ++i)
+        {
+            X[i] *= alpha;
+        }
+
+#else
+
+        for(int i = 0; i < N; ++i)
+        {
+            X[i] *= alpha;
+        }
+
+#endif
+    }
+
+    void sscal(
+        const int N,
+        const float alpha,
+        float * SF_BLAS_RESTRICT X, const int incX
+    )
+    {
+        if (incX == 1)
+        {
+            sscal(N, alpha, X);
+        }
+        else
+        {
+            for(int i = 0; i < N; ++i)
+            {
+                *X *= alpha;
+                X += incX;
+            }
+        }
+    }
+
+    void sscal(
+        ThreadPool&,
+        const int N,
+        const float alpha,
+        float * SF_BLAS_RESTRICT X
+    )
+    {
+        sscal(N, alpha, X);
+    }
+
+    void sscal(
+        ThreadPool&,
+        const int N,
+        const float alpha,
+        float *X, const int incX
+    )
+    {
+        sscal(N, alpha, X, incX);
+    }
+
+    void saxpy(
+        const int N,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT X,
+        float * SF_BLAS_RESTRICT Y
+    )
+    {
+
+#if defined (USE_SSE2)
+
+        const __m128 alpha4 = _mm_set1_ps(alpha);
+
+        int i = 0;
+        for(; i < N - 15; i += 16)
+        {
+            __m128 x0 = _mm_loadu_ps(X + i +  0);
+            __m128 x1 = _mm_loadu_ps(X + i +  4);
+            __m128 x2 = _mm_loadu_ps(X + i +  8);
+            __m128 x3 = _mm_loadu_ps(X + i + 12);
+
+            __m128 y0 = _mm_loadu_ps(Y + i +  0);
+            __m128 y1 = _mm_loadu_ps(Y + i +  4);
+            __m128 y2 = _mm_loadu_ps(Y + i +  8);
+            __m128 y3 = _mm_loadu_ps(Y + i + 12);
+
+            x0 = _mm_mul_ps(x0, alpha4);
+            x1 = _mm_mul_ps(x1, alpha4);
+            x2 = _mm_mul_ps(x2, alpha4);
+            x3 = _mm_mul_ps(x3, alpha4);
+
+            x0 = _mm_add_ps(x0, y0);
+            x1 = _mm_add_ps(x1, y1);
+            x2 = _mm_add_ps(x2, y2);
+            x3 = _mm_add_ps(x3, y3);
+
+            _mm_storeu_ps(Y + i +  0, x0);
+            _mm_storeu_ps(Y + i +  4, x1);
+            _mm_storeu_ps(Y + i +  8, x2);
+            _mm_storeu_ps(Y + i + 12, x3);
+        }
+
+        for(; i < N; ++i)
+        {
+            Y[i] += X[i] * alpha;
+        }
+
+#else
+
+        for(int i = 0; i < N; ++i)
+        {
+            Y[i] += X[i] * alpha;
+        }
+
+#endif
+
+    }
+
+    void saxpy(
+        const int N,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    )
+    {
+        if (incX == 1 && incY == 1)
+        {
+            saxpy(N, alpha, X, Y);
+        }
+        else
+        {
+            for(int i = 0; i < N; ++i)
+            {
+                *Y += *X * alpha;
+                Y += incY;
+                X += incX;
+            }
+        }
+    }
+
+    void saxpy(
+        ThreadPool&,
+        const int N,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT X,
+        float * SF_BLAS_RESTRICT Y
+    )
+    {
+        saxpy(N, alpha, X, Y);
+    }
+
+    void saxpy(
+        ThreadPool&,
+        const int N,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    )
+    {
+        saxpy(N, alpha, X, incX, Y, incY);
+    }
+
+#if defined (USE_SSE3)
+    inline __m128 m128_hadd_ps(__m128 a, __m128 b, __m128 c, __m128 d)
+    {
+        const __m128 t0 = _mm_hadd_ps(a, b);
+        const __m128 t1 = _mm_hadd_ps(c, d);
+        return _mm_hadd_ps(t0, t1);
+    }
+#endif
+
+#if defined (USE_SSE2)
+
+    inline void transpose4x4_sse2(
+        const float* SF_BLAS_RESTRICT A, const int lda,
+        float* SF_BLAS_RESTRICT B, const int ldb
+    )
+    {
+        __m128 row1 = _mm_loadu_ps(&A[0 * lda]);
+        __m128 row2 = _mm_loadu_ps(&A[1 * lda]);
+        __m128 row3 = _mm_loadu_ps(&A[2 * lda]);
+        __m128 row4 = _mm_loadu_ps(&A[3 * lda]);
+
+        _MM_TRANSPOSE4_PS(row1, row2, row3, row4);
+
+        _mm_storeu_ps(&B[0 * ldb], row1);
+        _mm_storeu_ps(&B[1 * ldb], row2);
+        _mm_storeu_ps(&B[2 * ldb], row3);
+        _mm_storeu_ps(&B[3 * ldb], row4);
+    }
+
+    void transpose_sse2(
+        const int N, const int M,
+        const float* SF_BLAS_RESTRICT A, const int lda,
+        float* SF_BLAS_RESTRICT B, const int ldb
+    )
+    {
+        static constexpr int block_size = 16;
+
+        for (int n = 0; n < N; n += block_size)
+        {
+            for (int m = 0; m < M; m += block_size)
+            {
+                const int max_n2 = n + block_size < N ? n + block_size : N;
+                const int max_m2 = m + block_size < M ? m + block_size : M;
+
+                int n2 = n;
+                for (; n2 < max_n2 - 3; n2 += 4)
+                {
+                    int m2 = m;
+                    for (; m2 < max_m2 - 3; m2 += 4)
+                    {
+                        transpose4x4_sse2(
+                            &A[n2 * lda + m2], lda,
+                            &B[m2 * ldb + n2], ldb
+                        );
+                    }
+
+                    for (; m2 < max_m2; ++m2)
+                    {
+                        B[m2 * ldb + n2 + 0] = A[(n2 + 0) * lda + m2];
+                        B[m2 * ldb + n2 + 1] = A[(n2 + 1) * lda + m2];
+                        B[m2 * ldb + n2 + 2] = A[(n2 + 2) * lda + m2];
+                        B[m2 * ldb + n2 + 3] = A[(n2 + 3) * lda + m2];
+                    }
+                }
+
+                for (; n2 < max_n2; ++n2)
+                {
+                    for (int m2 = m; m2 < max_m2; ++m2)
+                    {
+                        B[m2 * ldb + n2] = A[n2 * lda + m2];
+                    }
+                }
+            }
+        }
+    }
+#endif
+
+    void transpose(
+        const int N, const int M,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        float* SF_BLAS_RESTRICT B, const int ldb
+    )
+    {
+#if defined (USE_SSE2)
+
+        transpose_sse2(
+            N, M,
+            A, lda,
+            B, ldb
+        );
+
+#else
+
+        for(int r = 0; r < N; ++r)
+        {
+            for (int c = 0; c < M; ++c)
+            {
+                B[c*ldb + r] = A[r*lda + c];
+            }
+        }
+
+#endif
+    }
+
+    void sgemm_row_major_transpose_right(
+        ThreadPool& thread_pool,
+        const int M, const int N, const int K,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    )
+    {
+
+#if defined(USE_SSE3)
+
+        const __m128 alpha4 = _mm_set1_ps(alpha);
+        const __m128 beta4 = _mm_set1_ps(beta);
+
+        std::atomic<int> m_atomic = 0;
+        thread_pool.execute_with_workers(
+            [
+                M, N, K,
+                alpha, alpha4,
+                A, lda,
+                B, ldb,
+                beta, beta4,
+                C, ldc,
+                &m_atomic
+            ](Thread&) {
+                for (;;)
+                {
+                    const int m = m_atomic.fetch_add(2);
+                    if (m >= M - 1)
+                        break;
+
+                    int n = 0;
+                    for (; n < N - 3; n += 4)
+                    {
+                        //        mn
+                        __m128 sum00 = _mm_setzero_ps();
+                        __m128 sum01 = _mm_setzero_ps();
+                        __m128 sum02 = _mm_setzero_ps();
+                        __m128 sum03 = _mm_setzero_ps();
+                        __m128 sum10 = _mm_setzero_ps();
+                        __m128 sum11 = _mm_setzero_ps();
+                        __m128 sum12 = _mm_setzero_ps();
+                        __m128 sum13 = _mm_setzero_ps();
+
+                        // Horizontal sum of elements in sum[m][n] corresponds to
+                        // the final element in the C.
+
+                        int k = 0;
+                        for (; k < K - 3; k += 4)
+                        {
+                            const __m128 a0 = _mm_loadu_ps(&A[(m+0)*lda+k+0]);
+                            const __m128 a1 = _mm_loadu_ps(&A[(m+1)*lda+k+0]);
+
+                            const __m128 b0 = _mm_loadu_ps(&B[(n+0)*ldb+k+0]);
+                            const __m128 b1 = _mm_loadu_ps(&B[(n+1)*ldb+k+0]);
+                            const __m128 b2 = _mm_loadu_ps(&B[(n+2)*ldb+k+0]);
+                            const __m128 b3 = _mm_loadu_ps(&B[(n+3)*ldb+k+0]);
+
+                            sum00 = _mm_add_ps(sum00, _mm_mul_ps(a0, b0));
+                            sum01 = _mm_add_ps(sum01, _mm_mul_ps(a0, b1));
+                            sum02 = _mm_add_ps(sum02, _mm_mul_ps(a0, b2));
+                            sum03 = _mm_add_ps(sum03, _mm_mul_ps(a0, b3));
+                            sum10 = _mm_add_ps(sum10, _mm_mul_ps(a1, b0));
+                            sum11 = _mm_add_ps(sum11, _mm_mul_ps(a1, b1));
+                            sum12 = _mm_add_ps(sum12, _mm_mul_ps(a1, b2));
+                            sum13 = _mm_add_ps(sum13, _mm_mul_ps(a1, b3));
+                        }
+
+                        for(; k < K; k += 1)
+                        {
+                            const float a0 = A[(m+0)*lda+k+0];
+                            const float a1 = A[(m+1)*lda+k+0];
+
+                            const float b0 = B[(n+0)*ldb+k+0];
+                            const float b1 = B[(n+1)*ldb+k+0];
+                            const float b2 = B[(n+2)*ldb+k+0];
+                            const float b3 = B[(n+3)*ldb+k+0];
+
+                            // Since all will be summed vertically anyway we can
+                            // just add to the first element.
+                            // Other elements are left unmodified.
+                            sum00 = _mm_add_ss(sum00, _mm_set_ss(a0 * b0));
+                            sum01 = _mm_add_ss(sum01, _mm_set_ss(a0 * b1));
+                            sum02 = _mm_add_ss(sum02, _mm_set_ss(a0 * b2));
+                            sum03 = _mm_add_ss(sum03, _mm_set_ss(a0 * b3));
+                            sum10 = _mm_add_ss(sum10, _mm_set_ss(a1 * b0));
+                            sum11 = _mm_add_ss(sum11, _mm_set_ss(a1 * b1));
+                            sum12 = _mm_add_ss(sum12, _mm_set_ss(a1 * b2));
+                            sum13 = _mm_add_ss(sum13, _mm_set_ss(a1 * b3));
+                        }
+
+                        __m128 s0 = m128_hadd_ps(sum00, sum01, sum02, sum03);
+                        __m128 s1 = m128_hadd_ps(sum10, sum11, sum12, sum13);
+                        s0 = _mm_mul_ps(s0, alpha4);
+                        s1 = _mm_mul_ps(s1, alpha4);
+
+                        __m128 c0 = _mm_loadu_ps(&C[(m+0)*ldc+(n+0)]);
+                        __m128 c1 = _mm_loadu_ps(&C[(m+1)*ldc+(n+0)]);
+                        c0 = _mm_mul_ps(c0, beta4);
+                        c1 = _mm_mul_ps(c1, beta4);
+
+                        c0 = _mm_add_ps(c0, s0);
+                        c1 = _mm_add_ps(c1, s1);
+
+                        _mm_storeu_ps(&C[(m+0)*ldc+(n+0)], c0);
+                        _mm_storeu_ps(&C[(m+1)*ldc+(n+0)], c1);
+                    }
+
+                    for(; n < N; n += 1)
+                    {
+                        float sum0 = 0.0f;
+                        float sum1 = 0.0f;
+
+                        for (int k = 0; k < K; ++k)
+                        {
+                            const float a0 = A[(m+0)*lda+k+0];
+                            const float a1 = A[(m+1)*lda+k+0];
+
+                            const float b0 = B[(n+0)*ldb+k+0];
+
+                            sum0 += a0 * b0;
+                            sum1 += a1 * b0;
+                        }
+
+                        C[(m+0)*ldc+(n+0)] = C[(m+0)*ldc+(n+0)] * beta + sum0 * alpha;
+                        C[(m+1)*ldc+(n+0)] = C[(m+1)*ldc+(n+0)] * beta + sum1 * alpha;
+                    }
+                }
+            }
+        );
+
+        int m = M - (M % 2);
+        for (; m < M; m += 1)
+        {
+            for (int n = 0; n < N; n += 1)
+            {
+                float sum = 0.0f;
+
+                for (int k = 0; k < K; k += 1)
+                {
+                    sum += A[m*lda + k] * B[n*ldb + k];
+                }
+
+                C[m*ldc + n] = C[m*ldc + n] * beta + sum * alpha;
+            }
+        }
+
+        thread_pool.wait_for_workers_finished();
+
+#else
+
+        thread_pool.for_each_index_with_workers(
+            0, M,
+            [&](Thread&, int m) {
+                for (int n = 0; n < N; n += 1)
+                {
+                    float sum = 0.0f;
+
+                    for (int k = 0; k < K; k += 1)
+                    {
+                        sum += A[m*lda + k] * B[n*ldb + k];
+                    }
+
+                    C[m*ldc + n] = C[m*ldc + n] * beta + sum * alpha;
+                }
+            }
+        );
+        thread_pool.wait_for_workers_finished();
+
+#endif
+    }
+
+    // The pointer to the storage returned by this function
+    // is valid until the next call to this function from
+    // the same thread with the same idx.
+    // This is an unsafe function and should be used with caution
+    // and only within this translation unit.
+    // The number of buffers available is just enough to make
+    // all functions here work.
+    float* get_thread_local_temporary_storage(
+        int requested_size, int idx
+    )
+    {
+        static constexpr int MAX_NUM_BUFFERS = 2;
+
+        static thread_local int s_data_size[MAX_NUM_BUFFERS] = {0};
+        static thread_local std::unique_ptr<float[]> s_data[MAX_NUM_BUFFERS];
+
+        if (requested_size > s_data_size[idx])
+        {
+            s_data[idx] = std::make_unique<float[]>(requested_size);
+            s_data_size[idx] = requested_size;
+        }
+
+        return s_data[idx].get();
+    }
+
+    void sgemm_row_major_transpose_none(
+        ThreadPool& thread_pool,
+        const int M, const int N, const int K,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    )
+    {
+        constexpr static int temporary_buffer_index = 1;
+
+        auto B_tr = get_thread_local_temporary_storage(K * N, temporary_buffer_index);
+
+        transpose(
+            K, N,
+            B, ldb,
+            B_tr, K
+        );
+
+        sgemm_row_major_transpose_right(
+            thread_pool,
+            M, N, K,
+            alpha,
+            A, lda,
+            B_tr, K,
+            beta,
+            C, ldc
+        );
+    }
+
+    void sgemm_row_major(
+        ThreadPool& thread_pool,
+        MatrixTranspose TransA, MatrixTranspose TransB,
+        const int M, const int N, const int K,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    )
+    {
+        constexpr static int temporary_buffer_index = 0;
+
+        if (TransA == MatrixTranspose::Trans && TransB == MatrixTranspose::Trans)
+        {
+            auto A_tr = get_thread_local_temporary_storage(K * M, temporary_buffer_index);
+
+            transpose(
+                K, M,
+                A, lda,
+                A_tr, K
+            );
+
+            sgemm_row_major_transpose_right(
+                thread_pool,
+                M, N, K,
+                alpha,
+                A_tr, K,
+                B, ldb,
+                beta,
+                C, ldc
+            );
+        }
+        else if (TransA == MatrixTranspose::NoTrans && TransB == MatrixTranspose::Trans)
+        {
+            sgemm_row_major_transpose_right(
+                thread_pool,
+                M, N, K,
+                alpha,
+                A, lda,
+                B, ldb,
+                beta,
+                C, ldc
+            );
+        }
+        else if (TransA == MatrixTranspose::Trans && TransB == MatrixTranspose::NoTrans)
+        {
+            auto A_tr = get_thread_local_temporary_storage(K * M, temporary_buffer_index);
+
+            transpose(
+                K, M,
+                A, lda,
+                A_tr, K
+            );
+
+            sgemm_row_major_transpose_none(
+                thread_pool,
+                M, N, K,
+                alpha,
+                A_tr, K,
+                B, ldb,
+                beta,
+                C, ldc
+            );
+        }
+        else // no transpositions
+        {
+            sgemm_row_major_transpose_none(
+                thread_pool,
+                M, N, K,
+                alpha,
+                A, lda,
+                B, ldb,
+                beta,
+                C, ldc
+            );
+        }
+    }
+
+    void sgemm(
+        ThreadPool& thread_pool,
+        MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB,
+        const int M, const int N, const int K,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    )
+    {
+        if (layout == MatrixLayout::RowMajor)
+        {
+            sgemm_row_major(
+                thread_pool,
+                TransA, TransB,
+                M, N, K,
+                alpha,
+                A, lda,
+                B, ldb,
+                beta,
+                C, ldc
+            );
+        }
+        else
+        {
+            sgemm_row_major(
+                thread_pool,
+                TransB, TransA,
+                N, M, K,
+                alpha,
+                B, ldb,
+                A, lda,
+                beta,
+                C, ldc
+            );
+        }
+    }
+
+    std::vector<float> generate_random_matrix(int rows, int cols)
+    {
+        std::vector<float> m(rows * cols);
+
+        std::mt19937_64 rng;
+        std::uniform_real_distribution<float> d(-1.0, 1.0);
+
+        for(auto& v : m)
+        {
+            v = d(rng);
+        }
+
+        return m;
+    }
+
+    std::vector<float> generate_zero_matrix(int rows, int cols)
+    {
+        return std::vector<float>(rows * cols, 0.0f);
+    }
+
+    float matrix_relative_error(
+        const std::vector<float>& ref,
+        const std::vector<float>& our
+    )
+    {
+        double sum = 0.0;
+        double diff_sum = 0.0;
+
+        for(size_t i = 0; i < ref.size(); ++i)
+        {
+            sum += std::abs(ref[i]);
+            diff_sum += std::abs(ref[i] - our[i]);
+        }
+
+        return diff_sum / sum;
+    }
+
+    float norm(
+        const std::vector<float>& v
+    )
+    {
+        double sum = 0.0;
+
+        for(auto& e : v)
+        {
+            sum += e * e;
+        }
+
+        return std::sqrt(sum);
+    }
+
+#if defined (USE_BLAS)
+
+    CBLAS_LAYOUT matrix_layout_to_blas_layout(MatrixLayout layout)
+    {
+        if (layout == MatrixLayout::RowMajor)
+            return CblasRowMajor;
+        else if (layout == MatrixLayout::ColMajor)
+            return CblasColMajor;
+
+        return static_cast<CBLAS_LAYOUT>(-1);
+    }
+
+    const char* matrix_layout_to_string(MatrixLayout layout)
+    {
+        if (layout == MatrixLayout::RowMajor)
+            return "RowMajor";
+        else if (layout == MatrixLayout::ColMajor)
+            return "ColMajor";
+
+        return "INVALID";
+    }
+
+    CBLAS_TRANSPOSE matrix_transpose_to_blas_transpose(MatrixTranspose tr)
+    {
+        if (tr == MatrixTranspose::NoTrans)
+            return CblasNoTrans;
+        else if (tr == MatrixTranspose::Trans)
+            return CblasTrans;
+
+        return static_cast<CBLAS_TRANSPOSE>(-1);
+    }
+
+    const char* matrix_transpose_to_string(MatrixTranspose tr)
+    {
+        if (tr == MatrixTranspose::NoTrans)
+            return "NoTrans";
+        else if (tr == MatrixTranspose::Trans)
+            return "Trans";
+
+        return "INVALID";
+    }
+
+    void test_sgemm(
+        ThreadPool& thread_pool,
+        MatrixLayout layout, MatrixTranspose trA, MatrixTranspose trB,
+        int M, int N, int K
+    )
+    {
+        auto A = generate_random_matrix(M * 2, K * 2);
+        auto B = generate_random_matrix(K * 2, N * 2);
+        auto C_ref = generate_random_matrix(M * 2, N * 2);
+        auto C_our = C_ref;
+
+        std::cout
+            << matrix_layout_to_string(layout) << ' '
+            << matrix_transpose_to_string(trA) << ' '
+            << matrix_transpose_to_string(trB) << '\n';
+
+        std::cout << "A norm: " << norm(A) << '\n';
+        std::cout << "B norm: " << norm(B) << '\n';
+        std::cout << "C norm: " << norm(C_ref) << '\n';
+
+        const int lda = (trA == MatrixTranspose::NoTrans) == (layout == MatrixLayout::RowMajor) ? K * 2 : M * 2;
+        const int ldb = (trB == MatrixTranspose::NoTrans) == (layout == MatrixLayout::RowMajor) ? N * 2 : K * 2;
+        const int ldc = (layout == MatrixLayout::RowMajor) ? N * 2 : M * 2;
+
+        cblas_sgemm(
+            matrix_layout_to_blas_layout(layout),
+            matrix_transpose_to_blas_transpose(trA),
+            matrix_transpose_to_blas_transpose(trB),
+            M, N, K,
+            1.0,
+            A.data(), lda,
+            B.data(), ldb,
+            1.0,
+            C_ref.data(), ldc
+        );
+
+        sgemm(
+            thread_pool,
+            layout, trA, trB,
+            M, N, K,
+            1.0,
+            A.data(), lda,
+            B.data(), ldb,
+            1.0,
+            C_our.data(), ldc
+        );
+
+        std::cout << "C_ref norm: " << norm(C_ref) << '\n';
+        std::cout << "C_our norm: " << norm(C_our) << '\n';
+        std::cout << "Relative error: " << matrix_relative_error(C_ref, C_our) << '\n';
+
+        std::cout << '\n';
+    }
+
+    void test_sgemm(
+        ThreadPool& thread_pool
+    )
+    {
+        constexpr int M = 57;
+        constexpr int N = 127;
+        constexpr int K = 31;
+
+        std::cout << "SGEMM test:\n";
+
+        for(auto layout : { MatrixLayout::RowMajor, MatrixLayout::ColMajor })
+        {
+            for(auto trA : { MatrixTranspose::NoTrans, MatrixTranspose::Trans })
+            {
+                for(auto trB : { MatrixTranspose::NoTrans, MatrixTranspose::Trans })
+                {
+                    test_sgemm(
+                        thread_pool,
+                        layout, trA, trB,
+                        M, N, K
+                    );
+                }
+            }
+        }
+    }
+
+    void bench_sgemm(
+        ThreadPool& thread_pool,
+        MatrixLayout layout, MatrixTranspose trA, MatrixTranspose trB,
+        int M, int N, int K
+    )
+    {
+        constexpr int num_iters = 1000;
+
+        auto A = generate_random_matrix(M * 2, K * 2);
+        auto B = generate_random_matrix(K * 2, N * 2);
+        auto C_ref = generate_random_matrix(M * 2, N * 2);
+        auto C_our = C_ref;
+
+        std::cout
+            << matrix_layout_to_string(layout) << ' '
+            << matrix_transpose_to_string(trA) << ' '
+            << matrix_transpose_to_string(trB) << '\n';
+
+        std::cout << "A norm: " << norm(A) << '\n';
+        std::cout << "B norm: " << norm(B) << '\n';
+        std::cout << "C norm: " << norm(C_ref) << '\n';
+
+        const int lda = (trA == MatrixTranspose::NoTrans) == (layout == MatrixLayout::RowMajor) ? K * 2 : M * 2;
+        const int ldb = (trB == MatrixTranspose::NoTrans) == (layout == MatrixLayout::RowMajor) ? N * 2 : K * 2;
+        const int ldc = (layout == MatrixLayout::RowMajor) ? N * 2 : M * 2;
+
+        auto t0_ref = std::chrono::high_resolution_clock::now();
+        for(int i = 0; i < num_iters; ++i)
+        {
+            cblas_sgemm(
+                matrix_layout_to_blas_layout(layout),
+                matrix_transpose_to_blas_transpose(trA),
+                matrix_transpose_to_blas_transpose(trB),
+                M, N, K,
+                1.0,
+                A.data(), lda,
+                B.data(), ldb,
+                -0.5,
+                C_ref.data(), ldc
+            );
+        }
+        auto t1_ref = std::chrono::high_resolution_clock::now();
+        auto diff_ref = t1_ref - t0_ref;
+
+        auto t0_our = std::chrono::high_resolution_clock::now();
+        for(int i = 0; i < num_iters; ++i)
+        {
+            sgemm(
+                thread_pool,
+                layout, trA, trB,
+                M, N, K,
+                1.0,
+                A.data(), lda,
+                B.data(), ldb,
+                -0.5,
+                C_our.data(), ldc
+            );
+        }
+        auto t1_our = std::chrono::high_resolution_clock::now();
+        auto diff_our = t1_our - t0_our;
+
+        std::cout << "C_ref norm: " << norm(C_ref) << '\n';
+        std::cout << "C_our norm: " << norm(C_our) << '\n';
+        std::cout << "Relative error: " << matrix_relative_error(C_ref, C_our) << '\n';
+        std::cout << "Ref time: " << std::chrono::duration_cast<std::chrono::nanoseconds>(diff_ref).count() << " [ns]\n";
+        std::cout << "Our time: " << std::chrono::duration_cast<std::chrono::nanoseconds>(diff_our).count() << " [ns]\n";
+
+        std::cout << '\n';
+    }
+
+    void bench_sgemm(
+        ThreadPool& thread_pool
+    )
+    {
+        constexpr int M = 107;
+        constexpr int N = 213;
+        constexpr int K = 57;
+
+        std::cout << "SGEMM benchmark:\n";
+
+        for(auto layout : { MatrixLayout::RowMajor, MatrixLayout::ColMajor })
+        {
+            for(auto trA : { MatrixTranspose::NoTrans, MatrixTranspose::Trans })
+            {
+                for(auto trB : { MatrixTranspose::NoTrans, MatrixTranspose::Trans })
+                {
+                    bench_sgemm(
+                        thread_pool,
+                        layout, trA, trB,
+                        M, N, K
+                    );
+                }
+            }
+        }
+    }
+
+#endif
+
+    void print_arch()
+    {
+#if defined (USE_SSE3)
+        std::cout << "Using the sse3 implementation.\n";
+#elif defined (USE_SSE2)
+        std::cout << "Using the sse2 implementation.\n";
+#else
+        std::cout << "Using the base implementation.\n";
+#endif
+    }
+
+    void test(
+        ThreadPool& thread_pool
+    )
+    {
+#if defined (USE_BLAS)
+        print_arch();
+        test_sgemm(thread_pool);
+#else
+        std::cout << "Blas tests are only runnable when USE_BLAS is defined.\n";
+        (void)thread_pool;
+#endif
+    }
+
+    void bench(
+        ThreadPool& thread_pool
+    )
+    {
+#if defined (USE_BLAS)
+        print_arch();
+        bench_sgemm(thread_pool);
+#else
+        std::cout << "Blas benchmarks are only runnable when USE_BLAS is defined.\n";
+        (void)thread_pool;
+#endif
+    }
+}
\ No newline at end of file
diff --git a/src/extra/stockfish_blas.h b/src/extra/stockfish_blas.h
new file mode 100644
index 00000000..65da7e99
--- /dev/null
+++ b/src/extra/stockfish_blas.h
@@ -0,0 +1,130 @@
+#ifndef _STOCKFISH_BLAS_H_
+#define _STOCKFISH_BLAS_H_
+
+struct ThreadPool;
+
+#if defined (_MSC_VER)
+#define SF_BLAS_RESTRICT __restrict
+#elif defined (__INTEL_COMPILER)
+#define SF_BLAS_RESTRICT restrict
+#elif defined (__clang__)
+#define SF_BLAS_RESTRICT __restrict__
+#elif defined (__GNUC__)
+#define SF_BLAS_RESTRICT __restrict__
+#endif
+
+namespace Blas {
+
+    enum struct MatrixLayout {
+        RowMajor = 101,
+        ColMajor = 102
+    };
+
+    enum struct MatrixTranspose {
+        NoTrans = 111,
+        Trans = 112
+    };
+
+    void scopy(
+        const int N,
+        const float * SF_BLAS_RESTRICT X,
+        float * SF_BLAS_RESTRICT Y
+    );
+
+    void scopy(
+        const int N,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    );
+
+    void scopy(
+        ThreadPool& thread_pool,
+        const int N,
+        const float * SF_BLAS_RESTRICT X,
+        float * SF_BLAS_RESTRICT Y
+    );
+
+    void scopy(
+        ThreadPool& thread_pool,
+        const int N,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    );
+
+    void sscal(
+        const int N,
+        const float alpha,
+        float * SF_BLAS_RESTRICT X
+    );
+
+    void sscal(
+        const int N,
+        const float alpha,
+        float * SF_BLAS_RESTRICT X, const int incX
+    );
+
+    void sscal(
+        ThreadPool& thread_pool,
+        const int N,
+        const float alpha,
+        float * SF_BLAS_RESTRICT X
+    );
+
+    void sscal(
+        ThreadPool& thread_pool,
+        const int N,
+        const float alpha,
+        float * SF_BLAS_RESTRICT X, const int incX
+    );
+
+    void saxpy(
+        const int N,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT X,
+        float * SF_BLAS_RESTRICT Y
+    );
+
+    void saxpy(
+        const int N,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    );
+
+    void saxpy(
+        ThreadPool& thread_pool,
+        const int N,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT X,
+        float * SF_BLAS_RESTRICT Y
+    );
+
+    void saxpy(
+        ThreadPool& thread_pool,
+        const int N,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT X, const int incX,
+        float * SF_BLAS_RESTRICT Y, const int incY
+    );
+
+    void sgemm(
+        ThreadPool& thread_pool,
+        MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB,
+        const int M, const int N, const int K,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    );
+
+    void test(
+        ThreadPool& thread_pool
+    );
+
+    void bench(
+        ThreadPool& thread_pool
+    );
+}
+
+#endif
diff --git a/src/thread.h b/src/thread.h
index c0a01770..3bc00729 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -39,6 +39,15 @@
 /// pointer to an entry its life time is unlimited and we don't have
 /// to care about someone changing the entry under our feet.
 
+namespace Detail {
+
+  template <typename T>
+  struct TypeIdentity {
+    using Type = T;
+  };
+
+}
+
 class Thread {
 
   std::mutex mutex;
@@ -120,6 +129,26 @@ struct ThreadPool : public std::vector<Thread*> {
   // to the state of the `worker` function object.
   void execute_with_workers(const std::function<void(Thread&)>& worker);
 
+  template <typename IndexT, typename FuncT>
+  void for_each_index_with_workers(
+    IndexT begin,
+    typename Detail::TypeIdentity<IndexT>::Type end,
+    FuncT func)
+  {
+    std::atomic<IndexT> i_atomic = begin;
+
+    execute_with_workers(
+      [&i_atomic, end, func](Thread& th) mutable {
+        for(;;) {
+          const auto i = i_atomic.fetch_add(1);
+          if (i >= end)
+            break;
+
+          func(th, i);
+        }
+      });
+  }
+
   void start_thinking(Position&, StateListPtr&, const Search::LimitsType&, bool = false);
   void clear();
   void set(size_t);
diff --git a/src/uci.cpp b/src/uci.cpp
index e6b45c02..ae21a3ae 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -22,6 +22,7 @@
 #include <sstream>
 #include <string>
 
+#include "extra/stockfish_blas.h"
 #include "nnue/evaluate_nnue.h"
 #include "evaluate.h"
 #include "movegen.h"
@@ -354,6 +355,14 @@ void UCI::loop(int argc, char* argv[]) {
           std::cout << th.thread_idx() << '\n';
         });
       }
+      else if (token == "blastest")
+      {
+        Blas::test(Threads);
+      }
+      else if (token == "blasbench")
+      {
+        Blas::bench(Threads);
+      }
 
       // test command
       else if (token == "test") test_cmd(pos, is);

From a56d8124d897ae0704efe66483b9f36b3a0c9203 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 28 Oct 2020 14:52:27 +0100
Subject: [PATCH 413/583] Replace non-blas parts of trainers with our own
 blas-like routines.

---
 src/nnue/trainer/trainer_affine_transform.h   | 166 ++++++++++--------
 .../trainer/trainer_feature_transformer.h     | 120 +++++++++----
 src/nnue/trainer/trainer_input_slice.h        |  20 ++-
 src/nnue/trainer/trainer_sum.h                |  22 ++-
 4 files changed, 207 insertions(+), 121 deletions(-)

diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index 5d2f29c9..610805ca 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -3,6 +3,8 @@
 
 #include "trainer.h"
 
+#include "extra/stockfish_blas.h"
+
 #include "learn/learn.h"
 
 #include "nnue/layers/affine_transform.h"
@@ -98,32 +100,46 @@ namespace Eval::NNUE {
 
             batch_size_ = static_cast<IndexType>(batch.size());
             batch_input_ = previous_layer_trainer_->propagate(thread_pool, batch);
+
 #if defined(USE_BLAS)
+
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
-                cblas_scopy(kOutputDimensions, biases_, 1, &output_[batch_offset], 1);
+                cblas_scopy(
+                    kOutputDimensions, biases_, 1, &output_[batch_offset], 1
+                );
             }
 
-            cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
-                        kOutputDimensions, batch_size_, kInputDimensions, 1.0,
-                        weights_, kInputDimensions,
-                        batch_input_, kInputDimensions,
-                        1.0, &output_[0], kOutputDimensions);
+            cblas_sgemm(
+                CblasColMajor, CblasTrans, CblasNoTrans,
+                kOutputDimensions, batch_size_, kInputDimensions,
+                1.0,
+                weights_, kInputDimensions,
+                batch_input_, kInputDimensions,
+                1.0,
+                &output_[0], kOutputDimensions
+            );
 #else
-            for (IndexType b = 0; b < batch_size_; ++b) {
-                const IndexType input_batch_offset = kInputDimensions * b;
-                const IndexType output_batch_offset = kOutputDimensions * b;
-                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                    double sum = biases_[i];
-                    for (IndexType j = 0; j < kInputDimensions; ++j) {
-                        const IndexType index = kInputDimensions * i + j;
-                        sum += weights_[index] * batch_input_[input_batch_offset + j];
-                    }
 
-                    output_[output_batch_offset + i] = static_cast<LearnFloatType>(sum);
-                }
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                Blas::scopy(
+                    thread_pool,
+                    kOutputDimensions, biases_, 1, &output_[batch_offset], 1
+                );
             }
 
+            Blas::sgemm(
+                thread_pool,
+                Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans,
+                kOutputDimensions, batch_size_, kInputDimensions,
+                1.0,
+                weights_, kInputDimensions,
+                batch_input_, kInputDimensions,
+                1.0,
+                &output_[0], kOutputDimensions
+            );
+
 #endif
             return output_.data();
         }
@@ -137,67 +153,77 @@ namespace Eval::NNUE {
                 learning_rate * learning_rate_scale_;
 
 #if defined(USE_BLAS)
-            // backpropagate
-            cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
-                        kInputDimensions, batch_size_, kOutputDimensions, 1.0,
-                        weights_, kInputDimensions,
-                        gradients, kOutputDimensions,
-                        0.0, &gradients_[0], kInputDimensions);
+
+            cblas_sgemm(
+                CblasColMajor, CblasNoTrans, CblasNoTrans,
+                kInputDimensions, batch_size_, kOutputDimensions,
+                1.0,
+                weights_, kInputDimensions,
+                gradients, kOutputDimensions,
+                0.0,
+                &gradients_[0], kInputDimensions
+            );
 
             // update
-            cblas_sscal(kOutputDimensions, momentum_, biases_diff_, 1);
+            cblas_sscal(
+                kOutputDimensions, momentum_, biases_diff_, 1
+            );
+
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
-                cblas_saxpy(kOutputDimensions, 1.0,
+                cblas_saxpy(
+                    kOutputDimensions, 1.0,
+                    &gradients[batch_offset], 1, biases_diff_, 1
+                );
+            }
+
+            cblas_sgemm(
+                CblasRowMajor, CblasTrans, CblasNoTrans,
+                kOutputDimensions, kInputDimensions, batch_size_,
+                1.0,
+                gradients, kOutputDimensions,
+                batch_input_, kInputDimensions,
+                momentum_,
+                weights_diff_, kInputDimensions
+            );
+
+#else
+
+            // backpropagate
+            Blas::sgemm(
+                thread_pool,
+                Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::NoTrans, Blas::MatrixTranspose::NoTrans,
+                kInputDimensions, batch_size_, kOutputDimensions,
+                1.0,
+                weights_, kInputDimensions,
+                gradients, kOutputDimensions,
+                0.0,
+                &gradients_[0], kInputDimensions
+            );
+
+
+            Blas::sscal(
+                thread_pool,
+                kOutputDimensions, momentum_, biases_diff_, 1
+            );
+
+            for (IndexType b = 0; b < batch_size_; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                Blas::saxpy(thread_pool, kOutputDimensions, 1.0,
                           &gradients[batch_offset], 1, biases_diff_, 1);
             }
 
-            cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
-                        kOutputDimensions, kInputDimensions, batch_size_, 1.0,
-                        gradients, kOutputDimensions,
-                        batch_input_, kInputDimensions,
-                        momentum_, weights_diff_, kInputDimensions);
+            Blas::sgemm(
+                thread_pool,
+                Blas::MatrixLayout::RowMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans,
+                kOutputDimensions, kInputDimensions, batch_size_,
+                1.0,
+                gradients, kOutputDimensions,
+                batch_input_, kInputDimensions,
+                momentum_,
+                weights_diff_, kInputDimensions
+            );
 
-#else
-            // backpropagate
-            for (IndexType b = 0; b < batch_size_; ++b) {
-                const IndexType input_batch_offset = kInputDimensions * b;
-                const IndexType output_batch_offset = kOutputDimensions * b;
-                for (IndexType j = 0; j < kInputDimensions; ++j) {
-                    double sum = 0.0;
-                    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                        const IndexType index = kInputDimensions * i + j;
-                        sum += weights_[index] * gradients[output_batch_offset + i];
-                    }
-                    gradients_[input_batch_offset + j] = static_cast<LearnFloatType>(sum);
-                }
-            }
-
-            // update
-            for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                biases_diff_[i] *= momentum_;
-            }
-
-            for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-                weights_diff_[i] *= momentum_;
-            }
-
-            for (IndexType b = 0; b < batch_size_; ++b) {
-                const IndexType input_batch_offset = kInputDimensions * b;
-                const IndexType output_batch_offset = kOutputDimensions * b;
-
-                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                    biases_diff_[i] += gradients[output_batch_offset + i];
-                }
-
-                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                    for (IndexType j = 0; j < kInputDimensions; ++j) {
-                        const IndexType index = kInputDimensions * i + j;
-                        weights_diff_[index] += gradients[output_batch_offset + i] *
-                            batch_input_[input_batch_offset + j];
-                    }
-                }
-            }
 #endif
 
             for (IndexType i = 0; i < kOutputDimensions; ++i) {
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index a778f956..8be584e8 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -3,6 +3,8 @@
 
 #include "trainer.h"
 
+#include "extra/stockfish_blas.h"
+
 #include "features/factorizer_feature_set.h"
 
 #include "learn/learn.h"
@@ -107,24 +109,36 @@ namespace Eval::NNUE {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType c = 0; c < 2; ++c) {
                     const IndexType output_offset = batch_offset + kHalfDimensions * c;
+
 #if defined(USE_BLAS)
-                    cblas_scopy(kHalfDimensions, biases_, 1, &output_[output_offset], 1);
+
+                    cblas_scopy(
+                        kHalfDimensions, biases_, 1, &output_[output_offset], 1
+                    );
+
                     for (const auto& feature : batch[b].training_features[c]) {
                         const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                        cblas_saxpy(kHalfDimensions, (float)feature.get_count(),
-                                    &weights_[weights_offset], 1, &output_[output_offset], 1);
+                        cblas_saxpy(
+                            kHalfDimensions, (float)feature.get_count(),
+                            &weights_[weights_offset], 1, &output_[output_offset], 1
+                        );
                     }
+
 #else
-                    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                        output_[output_offset + i] = biases_[i];
-                    }
+
+                    Blas::scopy(
+                        thread_pool,
+                        kHalfDimensions, biases_, 1, &output_[output_offset], 1
+                    );
                     for (const auto& feature : batch[b].training_features[c]) {
                         const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                        for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                            output_[output_offset + i] +=
-                                feature.get_count() * weights_[weights_offset + i];
-                        }
+                        Blas::saxpy(
+                            thread_pool,
+                            kHalfDimensions, (float)feature.get_count(),
+                            &weights_[weights_offset], 1, &output_[output_offset], 1
+                        );
                     }
+
 #endif
                 }
             }
@@ -171,19 +185,27 @@ namespace Eval::NNUE {
             // Correct the learning rate and adjust the scale without using momentum
             const LearnFloatType effective_learning_rate =
                 static_cast<LearnFloatType>(local_learning_rate / (1.0 - momentum_));
+
 #if defined(USE_BLAS)
-            cblas_sscal(kHalfDimensions, momentum_, biases_diff_, 1);
+
+            cblas_sscal(
+                kHalfDimensions, momentum_, biases_diff_, 1
+            );
             for (IndexType b = 0; b < batch_->size(); ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType c = 0; c < 2; ++c) {
                     const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                    cblas_saxpy(kHalfDimensions, 1.0,
-                                &gradients_[output_offset], 1, biases_diff_, 1);
+                    cblas_saxpy(
+                        kHalfDimensions, 1.0,
+                        &gradients_[output_offset], 1, biases_diff_, 1
+                    );
                 }
             }
 
-            cblas_saxpy(kHalfDimensions, -local_learning_rate,
-                        biases_diff_, 1, biases_, 1);
+            cblas_saxpy(
+                kHalfDimensions, -local_learning_rate,
+                biases_diff_, 1, biases_, 1
+            );
 
 #pragma omp parallel
             {
@@ -205,45 +227,67 @@ namespace Eval::NNUE {
                             const auto scale = static_cast<LearnFloatType>(
                                 effective_learning_rate / feature.get_count());
 
-                            cblas_saxpy(kHalfDimensions, -scale,
-                                        &gradients_[output_offset], 1,
-                                        &weights_[weights_offset], 1);
+                            cblas_saxpy(
+                                kHalfDimensions, -scale,
+                                &gradients_[output_offset], 1,
+                                &weights_[weights_offset], 1
+                            );
                         }
                     }
                 }
             }
 
 #else
-            for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                biases_diff_[i] *= momentum_;
-            }
+
+            Blas::sscal(
+                thread_pool,
+                kHalfDimensions, momentum_, biases_diff_, 1
+            );
 
             for (IndexType b = 0; b < batch_->size(); ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType c = 0; c < 2; ++c) {
                     const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                        biases_diff_[i] += gradients_[output_offset + i];
-                    }
+                    Blas::saxpy(
+                        thread_pool,
+                        kHalfDimensions, 1.0,
+                        &gradients_[output_offset], 1, biases_diff_, 1
+                    );
                 }
             }
 
-            for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                biases_[i] -= local_learning_rate * biases_diff_[i];
-            }
+            Blas::saxpy(
+                thread_pool,
+                kHalfDimensions, -local_learning_rate,
+                biases_diff_, 1, biases_, 1
+            );
 
-            for (IndexType b = 0; b < batch_->size(); ++b) {
-                const IndexType batch_offset = kOutputDimensions * b;
-                for (IndexType c = 0; c < 2; ++c) {
-                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                    for (const auto& feature : (*batch_)[b].training_features[c]) {
-                        const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                        const auto scale = static_cast<LearnFloatType>(
-                            effective_learning_rate / feature.get_count());
+#pragma omp parallel
+            {
+#if defined(_OPENMP)
+                const IndexType num_threads = omp_get_num_threads();
+                const IndexType thread_index = omp_get_thread_num();
+#endif
+                for (IndexType b = 0; b < batch_->size(); ++b) {
+                    const IndexType batch_offset = kOutputDimensions * b;
+                    for (IndexType c = 0; c < 2; ++c) {
+                        const IndexType output_offset = batch_offset + kHalfDimensions * c;
+                        for (const auto& feature : (*batch_)[b].training_features[c]) {
+#if defined(_OPENMP)
+                            if (feature.get_index() % num_threads != thread_index)
+                                continue;
+#endif
+                            const IndexType weights_offset =
+                                kHalfDimensions * feature.get_index();
+                            const auto scale = static_cast<LearnFloatType>(
+                                effective_learning_rate / feature.get_count());
 
-                        for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                            weights_[weights_offset + i] -=
-                                scale * gradients_[output_offset + i];
+                            Blas::saxpy(
+                                thread_pool,
+                                kHalfDimensions, -scale,
+                                &gradients_[output_offset], 1,
+                                &weights_[weights_offset], 1
+                            );
                         }
                     }
                 }
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index 4bb38104..03e9fec0 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -3,6 +3,8 @@
 
 #include "trainer.h"
 
+#include "extra/stockfish_blas.h"
+
 #include "learn/learn.h"
 
 #include "nnue/layers/input_slice.h"
@@ -208,13 +210,21 @@ namespace Eval::NNUE {
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType input_offset = kInputDimensions * b;
                 const IndexType output_offset = kOutputDimensions * b;
+
 #if defined(USE_BLAS)
-                cblas_scopy(kOutputDimensions, &input[input_offset + Offset], 1,
-                            &output_[output_offset], 1);
+
+                cblas_scopy(
+                    kOutputDimensions, &input[input_offset + Offset], 1,
+                    &output_[output_offset], 1
+                );
 #else
-                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                    output_[output_offset + i] = input[input_offset + Offset + i];
-                }
+
+                Blas::scopy(
+                    thread_pool,
+                    kOutputDimensions, &input[input_offset + Offset], 1,
+                    &output_[output_offset], 1
+                );
+
 #endif
             }
 
diff --git a/src/nnue/trainer/trainer_sum.h b/src/nnue/trainer/trainer_sum.h
index 6defb95f..88ff302c 100644
--- a/src/nnue/trainer/trainer_sum.h
+++ b/src/nnue/trainer/trainer_sum.h
@@ -3,6 +3,8 @@
 
 #include "trainer.h"
 
+#include "extra/stockfish_blas.h"
+
 #include "learn/learn.h"
 
 #include "nnue/layers/sum.h"
@@ -53,15 +55,19 @@ namespace Eval::NNUE {
             const auto head_output = previous_layer_trainer_->propagate(thread_pool, batch);
 
 #if defined(USE_BLAS)
-            cblas_saxpy(kOutputDimensions * batch_size_, 1.0,
-                        head_output, 1, output, 1);
+
+            cblas_saxpy(
+                kOutputDimensions * batch_size_, 1.0,
+                head_output, 1, output, 1
+            );
+
 #else
-            for (IndexType b = 0; b < batch_size_; ++b) {
-                const IndexType batch_offset = kOutputDimensions * b;
-                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                    output[batch_offset + i] += head_output[batch_offset + i];
-                }
-            }
+
+            Blas::saxpy(
+                thread_pool,
+                kOutputDimensions * batch_size_, 1.0,
+                head_output, 1, output, 1
+            );
 
 #endif
             return output;

From 8c81bbd3db5b6f4d9927a220acc5ca0e063cdf7b Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 30 Oct 2020 10:36:39 +0100
Subject: [PATCH 414/583] Fix the counter in for_each_index_with_workers going
 out of scope before workers finish.

---
 src/thread.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/thread.h b/src/thread.h
index 3bc00729..1f0ec6a2 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -135,10 +135,15 @@ struct ThreadPool : public std::vector<Thread*> {
     typename Detail::TypeIdentity<IndexT>::Type end,
     FuncT func)
   {
-    std::atomic<IndexT> i_atomic = begin;
+    // This value must outlive the function call.
+    // It's fairly safe if we make it static
+    // because for_each_index_with_workers
+    // is not reentrant nor thread safe.
+    static std::atomic<IndexT> i_atomic;
+    i_atomic.store(begin);
 
     execute_with_workers(
-      [&i_atomic, end, func](Thread& th) mutable {
+      [end, func](Thread& th) mutable {
         for(;;) {
           const auto i = i_atomic.fetch_add(1);
           if (i >= end)

From 7bedf6c5aba05ea7e42623cd5a31eb3e1be8bf66 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 30 Oct 2020 10:37:03 +0100
Subject: [PATCH 415/583] Specify the whole evalsave message because otherwise
 the first evalsave/0 triggers it.

---
 tests/instrumented_learn.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/instrumented_learn.sh b/tests/instrumented_learn.sh
index 07f5f98b..9109e78b 100755
--- a/tests/instrumented_learn.sh
+++ b/tests/instrumented_learn.sh
@@ -129,7 +129,7 @@ cat << EOF > learn01.exp
  send "isready\n"
  send "learn targetdir training_data epochs 1 sfen_read_size 100 thread_buffer_size 10 batchsize 100 use_draw_in_training 1 use_draw_in_validation 1 lr 1 eval_limit 32000 nn_batch_size 30 newbob_decay 0.5 eval_save_interval 30 loss_output_interval 10 validation_set_file_name validation_data/validation_data.bin\n"
 
- expect "INFO (save_eval): Finished saving evaluation file in"
+ expect "INFO (save_eval): Finished saving evaluation file in evalsave/final"
 
  send "quit\n"
  expect eof

From 2c10b1babcf4c6917f83f473a1123296032274a8 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Tue, 27 Oct 2020 17:36:50 +0100
Subject: [PATCH 416/583] Optimize feature transformer clipped relu.

---
 .../trainer/trainer_feature_transformer.h     | 115 ++++++++++++++++++
 1 file changed, 115 insertions(+)

diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 8be584e8..c883b594 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -143,6 +143,119 @@ namespace Eval::NNUE {
                 }
             }
 
+#if defined (USE_SSE2)
+
+            {
+                static_assert(kHalfDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");
+
+                auto m128_hmin_ps = [](__m128 x3210) {
+                    __m128 x0032 = _mm_shuffle_ps(x3210, x3210, _MM_SHUFFLE(0, 0, 3, 2));
+                    __m128 min_x_x_13_20 = _mm_min_ps(x3210, x0032);
+                    // a = [ # , # , min(x[1], x[3]) , min(x[2], x[0]) ]
+                    __m128 min_x_x_20_13 = _mm_shuffle_ps(min_x_x_13_20, min_x_x_13_20, _MM_SHUFFLE(0, 0, 0, 1));
+                    return _mm_cvtss_f32(_mm_min_ps(min_x_x_13_20, min_x_x_20_13));
+                };
+
+                auto m128_hmax_ps = [](__m128 x3210) {
+                    __m128 x0032 = _mm_shuffle_ps(x3210, x3210, _MM_SHUFFLE(0, 0, 3, 2));
+                    __m128 max_x_x_13_20 = _mm_max_ps(x3210, x0032);
+                    // a = [ # , # , max(x[1], x[3]) , max(x[2], x[0]) ]
+                    __m128 max_x_x_20_13 = _mm_shuffle_ps(max_x_x_13_20, max_x_x_13_20, _MM_SHUFFLE(0, 0, 0, 1));
+                    return _mm_cvtss_f32(_mm_max_ps(max_x_x_13_20, max_x_x_20_13));
+                };
+
+                const int total_size = batch.size() * kOutputDimensions;
+
+                const __m128 kZero4 = _mm_set1_ps(+kZero);
+                const __m128 kOne4 = _mm_set1_ps(+kOne);
+
+                __m128 min_pre_activation0 = _mm_set1_ps(min_pre_activation_);
+                __m128 min_pre_activation1 = _mm_set1_ps(min_pre_activation_);
+                __m128 max_pre_activation0 = _mm_set1_ps(max_pre_activation_);
+                __m128 max_pre_activation1 = _mm_set1_ps(max_pre_activation_);
+
+                for (int i = 0; i < total_size; i += 16)
+                {
+                    __m128 out0 = _mm_loadu_ps(&output_[i +  0]);
+                    __m128 out1 = _mm_loadu_ps(&output_[i +  4]);
+                    __m128 out2 = _mm_loadu_ps(&output_[i +  8]);
+                    __m128 out3 = _mm_loadu_ps(&output_[i + 12]);
+
+                    __m128 min01 = _mm_min_ps(out0, out1);
+                    __m128 min23 = _mm_min_ps(out2, out3);
+
+                    __m128 max01 = _mm_max_ps(out0, out1);
+                    __m128 max23 = _mm_max_ps(out2, out3);
+
+                    min_pre_activation0 = _mm_min_ps(min_pre_activation0, min01);
+                    min_pre_activation1 = _mm_min_ps(min_pre_activation1, min23);
+                    max_pre_activation0 = _mm_max_ps(max_pre_activation0, max01);
+                    max_pre_activation1 = _mm_max_ps(max_pre_activation1, max23);
+
+                    out0 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out0));
+                    out1 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out1));
+                    out2 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out2));
+                    out3 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out3));
+
+                    _mm_storeu_ps(&output_[i +  0], out0);
+                    _mm_storeu_ps(&output_[i +  4], out1);
+                    _mm_storeu_ps(&output_[i +  8], out2);
+                    _mm_storeu_ps(&output_[i + 12], out3);
+                }
+
+                min_pre_activation_ = m128_hmin_ps(_mm_min_ps(min_pre_activation0, min_pre_activation1));
+                max_pre_activation_ = m128_hmax_ps(_mm_max_ps(max_pre_activation0, max_pre_activation1));
+
+                for (IndexType b = 0; b < batch.size(); ++b) 
+                {
+                    const IndexType batch_offset = kOutputDimensions * b;
+
+                    for (IndexType half = 0; half < 2; ++half)
+                    {
+                        const IndexType half_offset = batch_offset + half * kHalfDimensions;
+                        for (IndexType i = 0; i < kHalfDimensions; i += 16)
+                        {
+                            const __m128 out0 = _mm_loadu_ps(&output_[i +  0 + half_offset]);
+                            const __m128 out1 = _mm_loadu_ps(&output_[i +  4 + half_offset]);
+                            const __m128 out2 = _mm_loadu_ps(&output_[i +  8 + half_offset]);
+                            const __m128 out3 = _mm_loadu_ps(&output_[i + 12 + half_offset]);
+
+                            __m128 minact0 = _mm_loadu_ps(&min_activations_[i +  0]);
+                            __m128 minact1 = _mm_loadu_ps(&min_activations_[i +  4]);
+                            __m128 minact2 = _mm_loadu_ps(&min_activations_[i +  8]);
+                            __m128 minact3 = _mm_loadu_ps(&min_activations_[i + 12]);
+
+                            __m128 maxact0 = _mm_loadu_ps(&max_activations_[i +  0]);
+                            __m128 maxact1 = _mm_loadu_ps(&max_activations_[i +  4]);
+                            __m128 maxact2 = _mm_loadu_ps(&max_activations_[i +  8]);
+                            __m128 maxact3 = _mm_loadu_ps(&max_activations_[i + 12]);
+
+                            minact0 = _mm_min_ps(out0, minact0);
+                            minact1 = _mm_min_ps(out1, minact1);
+                            minact2 = _mm_min_ps(out2, minact2);
+                            minact3 = _mm_min_ps(out3, minact3);
+
+                            maxact0 = _mm_max_ps(out0, maxact0);
+                            maxact1 = _mm_max_ps(out1, maxact1);
+                            maxact2 = _mm_max_ps(out2, maxact2);
+                            maxact3 = _mm_max_ps(out3, maxact3);
+
+                            _mm_storeu_ps(&min_activations_[i +  0], minact0);
+                            _mm_storeu_ps(&min_activations_[i +  4], minact1);
+                            _mm_storeu_ps(&min_activations_[i +  8], minact2);
+                            _mm_storeu_ps(&min_activations_[i + 12], minact3);
+
+                            _mm_storeu_ps(&max_activations_[i +  0], maxact0);
+                            _mm_storeu_ps(&max_activations_[i +  4], maxact1);
+                            _mm_storeu_ps(&max_activations_[i +  8], maxact2);
+                            _mm_storeu_ps(&max_activations_[i + 12], maxact3);
+                        }
+                    }
+                }
+            }
+
+#else
+
             // clipped ReLU
             for (IndexType b = 0; b < batch.size(); ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
@@ -157,6 +270,8 @@ namespace Eval::NNUE {
                 }
             }
 
+#endif
+
             return output_.data();
         }
 

From c96743c5bd17173be2e08f00fac89e9f50746238 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 28 Oct 2020 14:59:18 +0100
Subject: [PATCH 417/583] Optimize feature transformer backpropagation stats.

---
 .../trainer/trainer_feature_transformer.h     | 52 +++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index c883b594..77edfbde 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -285,6 +285,55 @@ namespace Eval::NNUE {
             const LearnFloatType local_learning_rate =
                 learning_rate * learning_rate_scale_;
 
+#if defined (USE_SSE2)
+            
+            {
+                static_assert(kHalfDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");
+
+                const __m128 kZero4 = _mm_set1_ps(+kZero);
+                const __m128 kOne4 = _mm_set1_ps(+kOne);
+
+                const IndexType total_size = batch_->size() * kOutputDimensions;
+
+                for (IndexType i = 0; i < total_size; i += 16)
+                {
+                    __m128 out0 = _mm_loadu_ps(&output_[i + 0]);
+                    __m128 out1 = _mm_loadu_ps(&output_[i + 4]);
+                    __m128 out2 = _mm_loadu_ps(&output_[i + 8]);
+                    __m128 out3 = _mm_loadu_ps(&output_[i + 12]);
+
+                    __m128 clipped0 = _mm_or_ps(_mm_cmple_ps(out0, kZero4), _mm_cmpge_ps(out0, kOne4));
+                    __m128 clipped1 = _mm_or_ps(_mm_cmple_ps(out1, kZero4), _mm_cmpge_ps(out1, kOne4));
+                    __m128 clipped2 = _mm_or_ps(_mm_cmple_ps(out2, kZero4), _mm_cmpge_ps(out2, kOne4));
+                    __m128 clipped3 = _mm_or_ps(_mm_cmple_ps(out3, kZero4), _mm_cmpge_ps(out3, kOne4));
+
+                    __m128 grad0 = _mm_loadu_ps(&gradients[i + 0]);
+                    __m128 grad1 = _mm_loadu_ps(&gradients[i + 4]);
+                    __m128 grad2 = _mm_loadu_ps(&gradients[i + 8]);
+                    __m128 grad3 = _mm_loadu_ps(&gradients[i + 12]);
+
+                    grad0 = _mm_andnot_ps(clipped0, grad0);
+                    grad1 = _mm_andnot_ps(clipped1, grad1);
+                    grad2 = _mm_andnot_ps(clipped2, grad2);
+                    grad3 = _mm_andnot_ps(clipped3, grad3);
+
+                    _mm_storeu_ps(&gradients_[i + 0], grad0);
+                    _mm_storeu_ps(&gradients_[i + 4], grad1);
+                    _mm_storeu_ps(&gradients_[i + 8], grad2);
+                    _mm_storeu_ps(&gradients_[i + 12], grad3);
+
+                    const int clipped_mask =
+                        (_mm_movemask_ps(clipped0) << 0)
+                        | (_mm_movemask_ps(clipped1) << 4)
+                        | (_mm_movemask_ps(clipped2) << 8)
+                        | (_mm_movemask_ps(clipped3) << 12);
+
+                    num_clipped_ += popcount(clipped_mask);
+                }
+            }
+
+#else
+
             for (IndexType b = 0; b < batch_->size(); ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType i = 0; i < kOutputDimensions; ++i) {
@@ -294,6 +343,9 @@ namespace Eval::NNUE {
                     num_clipped_ += clipped;
                 }
             }
+
+#endif
+
             num_total_ += batch_->size() * kOutputDimensions;
 
             // Since the weight matrix updates only the columns corresponding to the features that appeared in the input,

From 941897ff2c24c66afa0e42e59c004839968d05d6 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 28 Oct 2020 15:03:09 +0100
Subject: [PATCH 418/583] Optimize trainer clipped relu backpropagate.

---
 src/nnue/trainer/trainer_clipped_relu.h | 52 +++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index 8e29e4a1..dd6fc701 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -69,6 +69,55 @@ namespace Eval::NNUE {
                            const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {
 
+#if defined (USE_SSE2)
+
+            {
+                static_assert(kOutputDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");
+
+                const __m128 kZero4 = _mm_set1_ps(+kZero);
+                const __m128 kOne4 = _mm_set1_ps(+kOne);
+
+                const IndexType total_size = batch_size_ * kOutputDimensions;
+
+                for (IndexType i = 0; i < total_size; i += 16)
+                {
+                    __m128 out0 = _mm_loadu_ps(&output_[i + 0]);
+                    __m128 out1 = _mm_loadu_ps(&output_[i + 4]);
+                    __m128 out2 = _mm_loadu_ps(&output_[i + 8]);
+                    __m128 out3 = _mm_loadu_ps(&output_[i + 12]);
+
+                    __m128 clipped0 = _mm_or_ps(_mm_cmple_ps(out0, kZero4), _mm_cmpge_ps(out0, kOne4));
+                    __m128 clipped1 = _mm_or_ps(_mm_cmple_ps(out1, kZero4), _mm_cmpge_ps(out1, kOne4));
+                    __m128 clipped2 = _mm_or_ps(_mm_cmple_ps(out2, kZero4), _mm_cmpge_ps(out2, kOne4));
+                    __m128 clipped3 = _mm_or_ps(_mm_cmple_ps(out3, kZero4), _mm_cmpge_ps(out3, kOne4));
+
+                    __m128 grad0 = _mm_loadu_ps(&gradients[i + 0]);
+                    __m128 grad1 = _mm_loadu_ps(&gradients[i + 4]);
+                    __m128 grad2 = _mm_loadu_ps(&gradients[i + 8]);
+                    __m128 grad3 = _mm_loadu_ps(&gradients[i + 12]);
+
+                    grad0 = _mm_andnot_ps(clipped0, grad0);
+                    grad1 = _mm_andnot_ps(clipped1, grad1);
+                    grad2 = _mm_andnot_ps(clipped2, grad2);
+                    grad3 = _mm_andnot_ps(clipped3, grad3);
+
+                    _mm_storeu_ps(&gradients_[i + 0], grad0);
+                    _mm_storeu_ps(&gradients_[i + 4], grad1);
+                    _mm_storeu_ps(&gradients_[i + 8], grad2);
+                    _mm_storeu_ps(&gradients_[i + 12], grad3);
+
+                    const int clipped_mask =
+                        (_mm_movemask_ps(clipped0) << 0)
+                        | (_mm_movemask_ps(clipped1) << 4)
+                        | (_mm_movemask_ps(clipped2) << 8)
+                        | (_mm_movemask_ps(clipped3) << 12);
+
+                    num_clipped_ += popcount(clipped_mask);
+                }
+            }
+
+#else
+
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType i = 0; i < kOutputDimensions; ++i) {
@@ -78,6 +127,9 @@ namespace Eval::NNUE {
                     num_clipped_ += clipped;
                 }
             }
+
+#endif
+
             num_total_ += batch_size_ * kOutputDimensions;
 
             previous_layer_trainer_->backpropagate(thread_pool, gradients_.data(), learning_rate);

From b5714c4084719cd089c2d70266404e4e36f0a129 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Tue, 27 Oct 2020 18:41:17 +0100
Subject: [PATCH 419/583] Parallelize input slice trainer backprop.

---
 src/nnue/trainer/trainer_input_slice.h | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index 03e9fec0..a93a3ea0 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -236,17 +236,29 @@ namespace Eval::NNUE {
                            const LearnFloatType* gradients,
                            LearnFloatType learning_rate) {
 
-            for (IndexType b = 0; b < batch_size_; ++b) {
-                const IndexType input_offset = kInputDimensions * b;
-                const IndexType output_offset = kOutputDimensions * b;
-                for (IndexType i = 0; i < kInputDimensions; ++i) {
-                    if ((int)i < (int)Offset || i >= Offset + kOutputDimensions) {
+            thread_pool.for_each_index_with_workers(
+                0, batch_size_,
+                [&](Thread&, int b) {
+                    const IndexType input_offset = kInputDimensions * b;
+                    const IndexType output_offset = kOutputDimensions * b;
+
+                    IndexType i = 0;
+                    for (; i < Offset; ++i) {
                         gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
-                    } else {
+                    }
+
+                    for (; i < Offset + kOutputDimensions; ++i) {
                         gradients_[input_offset + i] = gradients[output_offset + i - Offset];
                     }
+
+                    for (; i < kInputDimensions; ++i)
+                    {
+                        gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
+                    }
                 }
-            }
+            );
+            thread_pool.wait_for_workers_finished();
+
             shared_input_trainer_->backpropagate(thread_pool, gradients_.data(), learning_rate);
         }
 

From db1b33d4acfe02d4eb05eac5f810729da1d1ebf4 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Tue, 27 Oct 2020 18:57:47 +0100
Subject: [PATCH 420/583] Optimize trainer clipped relu propagate

---
 src/nnue/trainer/trainer_clipped_relu.h | 68 +++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index dd6fc701..124671ed 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -50,7 +50,73 @@ namespace Eval::NNUE {
             }
 
             const auto input = previous_layer_trainer_->propagate(thread_pool, batch);
+
             batch_size_ = static_cast<IndexType>(batch.size());
+
+#if defined (USE_SSE2)
+
+            {
+                static_assert(kOutputDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");
+
+                const __m128 kZero4 = _mm_set1_ps(+kZero);
+                const __m128 kOne4 = _mm_set1_ps(+kOne);
+
+                for (IndexType b = 0; b < batch.size(); ++b)
+                {
+                    const IndexType batch_offset = kOutputDimensions * b;
+
+                    for (IndexType i = 0; i < kOutputDimensions; i += 16)
+                    {
+                        __m128 out0 = _mm_loadu_ps(&input[i + 0 + batch_offset]);
+                        __m128 out1 = _mm_loadu_ps(&input[i + 4 + batch_offset]);
+                        __m128 out2 = _mm_loadu_ps(&input[i + 8 + batch_offset]);
+                        __m128 out3 = _mm_loadu_ps(&input[i + 12 + batch_offset]);
+
+                        out0 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out0));
+                        out1 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out1));
+                        out2 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out2));
+                        out3 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out3));
+
+                        _mm_storeu_ps(&output_[i + 0 + batch_offset], out0);
+                        _mm_storeu_ps(&output_[i + 4 + batch_offset], out1);
+                        _mm_storeu_ps(&output_[i + 8 + batch_offset], out2);
+                        _mm_storeu_ps(&output_[i + 12 + batch_offset], out3);
+
+                        __m128 minact0 = _mm_loadu_ps(&min_activations_[i + 0]);
+                        __m128 minact1 = _mm_loadu_ps(&min_activations_[i + 4]);
+                        __m128 minact2 = _mm_loadu_ps(&min_activations_[i + 8]);
+                        __m128 minact3 = _mm_loadu_ps(&min_activations_[i + 12]);
+
+                        __m128 maxact0 = _mm_loadu_ps(&max_activations_[i + 0]);
+                        __m128 maxact1 = _mm_loadu_ps(&max_activations_[i + 4]);
+                        __m128 maxact2 = _mm_loadu_ps(&max_activations_[i + 8]);
+                        __m128 maxact3 = _mm_loadu_ps(&max_activations_[i + 12]);
+
+                        minact0 = _mm_min_ps(out0, minact0);
+                        minact1 = _mm_min_ps(out1, minact1);
+                        minact2 = _mm_min_ps(out2, minact2);
+                        minact3 = _mm_min_ps(out3, minact3);
+
+                        maxact0 = _mm_max_ps(out0, maxact0);
+                        maxact1 = _mm_max_ps(out1, maxact1);
+                        maxact2 = _mm_max_ps(out2, maxact2);
+                        maxact3 = _mm_max_ps(out3, maxact3);
+
+                        _mm_storeu_ps(&min_activations_[i + 0], minact0);
+                        _mm_storeu_ps(&min_activations_[i + 4], minact1);
+                        _mm_storeu_ps(&min_activations_[i + 8], minact2);
+                        _mm_storeu_ps(&min_activations_[i + 12], minact3);
+
+                        _mm_storeu_ps(&max_activations_[i + 0], maxact0);
+                        _mm_storeu_ps(&max_activations_[i + 4], maxact1);
+                        _mm_storeu_ps(&max_activations_[i + 8], maxact2);
+                        _mm_storeu_ps(&max_activations_[i + 12], maxact3);
+                    }
+                }
+            }
+
+#else
+
             for (IndexType b = 0; b < batch_size_; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType i = 0; i < kOutputDimensions; ++i) {
@@ -61,6 +127,8 @@ namespace Eval::NNUE {
                 }
             }
 
+#endif
+
             return output_.data();
         }
 

From e8907bcfc456cd9c5966dbc238018b0bc961eece Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Tue, 27 Oct 2020 11:57:45 +0100
Subject: [PATCH 421/583] Replace omp in trainer_feature_transformer

---
 .../trainer/trainer_feature_transformer.h     | 182 ++++++++----------
 1 file changed, 82 insertions(+), 100 deletions(-)

diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 77edfbde..3062e432 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -19,10 +19,6 @@
 #include <random>
 #include <set>
 
-#if defined(_OPENMP)
-#include <omp.h>
-#endif
-
 // Specialization for feature transformer of learning class template of NNUE evaluation function
 namespace Eval::NNUE {
 
@@ -104,44 +100,45 @@ namespace Eval::NNUE {
 
             batch_ = &batch;
             // affine transform
-#pragma omp parallel for
-            for (IndexType b = 0; b < batch.size(); ++b) {
-                const IndexType batch_offset = kOutputDimensions * b;
-                for (IndexType c = 0; c < 2; ++c) {
-                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
+            thread_pool.for_each_index_with_workers(
+                0, batch.size(),
+                [&](Thread&, int b) {
+                    const IndexType batch_offset = kOutputDimensions * b;
+                    for (IndexType c = 0; c < 2; ++c) {
+                        const IndexType output_offset = batch_offset + kHalfDimensions * c;
 
 #if defined(USE_BLAS)
 
-                    cblas_scopy(
-                        kHalfDimensions, biases_, 1, &output_[output_offset], 1
-                    );
-
-                    for (const auto& feature : batch[b].training_features[c]) {
-                        const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                        cblas_saxpy(
-                            kHalfDimensions, (float)feature.get_count(),
-                            &weights_[weights_offset], 1, &output_[output_offset], 1
+                        cblas_scopy(
+                            kHalfDimensions, biases_, 1, &output_[output_offset], 1
                         );
-                    }
+
+                        for (const auto& feature : batch[b].training_features[c]) {
+                            const IndexType weights_offset = kHalfDimensions * feature.get_index();
+                            cblas_saxpy(
+                                kHalfDimensions, (float)feature.get_count(),
+                                &weights_[weights_offset], 1, &output_[output_offset], 1
+                            );
+                        }
 
 #else
 
-                    Blas::scopy(
-                        thread_pool,
-                        kHalfDimensions, biases_, 1, &output_[output_offset], 1
-                    );
-                    for (const auto& feature : batch[b].training_features[c]) {
-                        const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                        Blas::saxpy(
-                            thread_pool,
-                            kHalfDimensions, (float)feature.get_count(),
-                            &weights_[weights_offset], 1, &output_[output_offset], 1
+                        Blas::scopy(
+                            kHalfDimensions, biases_, 1, &output_[output_offset], 1
                         );
-                    }
+                        for (const auto& feature : batch[b].training_features[c]) {
+                            const IndexType weights_offset = kHalfDimensions * feature.get_index();
+                            Blas::saxpy(
+                                kHalfDimensions, (float)feature.get_count(),
+                                &weights_[weights_offset], 1, &output_[output_offset], 1
+                            );
+                        }
 
 #endif
+                    }
                 }
-            }
+            );
+            thread_pool.wait_for_workers_finished();
 
 #if defined (USE_SSE2)
 
@@ -358,6 +355,7 @@ namespace Eval::NNUE {
             cblas_sscal(
                 kHalfDimensions, momentum_, biases_diff_, 1
             );
+
             for (IndexType b = 0; b < batch_->size(); ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType c = 0; c < 2; ++c) {
@@ -374,36 +372,6 @@ namespace Eval::NNUE {
                 biases_diff_, 1, biases_, 1
             );
 
-#pragma omp parallel
-            {
-#if defined(_OPENMP)
-                const IndexType num_threads = omp_get_num_threads();
-                const IndexType thread_index = omp_get_thread_num();
-#endif
-                for (IndexType b = 0; b < batch_->size(); ++b) {
-                    const IndexType batch_offset = kOutputDimensions * b;
-                    for (IndexType c = 0; c < 2; ++c) {
-                        const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                        for (const auto& feature : (*batch_)[b].training_features[c]) {
-#if defined(_OPENMP)
-                            if (feature.get_index() % num_threads != thread_index)
-                                continue;
-#endif
-                            const IndexType weights_offset =
-                                kHalfDimensions * feature.get_index();
-                            const auto scale = static_cast<LearnFloatType>(
-                                effective_learning_rate / feature.get_count());
-
-                            cblas_saxpy(
-                                kHalfDimensions, -scale,
-                                &gradients_[output_offset], 1,
-                                &weights_[weights_offset], 1
-                            );
-                        }
-                    }
-                }
-            }
-
 #else
 
             Blas::sscal(
@@ -429,38 +397,47 @@ namespace Eval::NNUE {
                 biases_diff_, 1, biases_, 1
             );
 
-#pragma omp parallel
-            {
-#if defined(_OPENMP)
-                const IndexType num_threads = omp_get_num_threads();
-                const IndexType thread_index = omp_get_thread_num();
 #endif
-                for (IndexType b = 0; b < batch_->size(); ++b) {
-                    const IndexType batch_offset = kOutputDimensions * b;
-                    for (IndexType c = 0; c < 2; ++c) {
-                        const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                        for (const auto& feature : (*batch_)[b].training_features[c]) {
-#if defined(_OPENMP)
-                            if (feature.get_index() % num_threads != thread_index)
-                                continue;
-#endif
-                            const IndexType weights_offset =
-                                kHalfDimensions * feature.get_index();
-                            const auto scale = static_cast<LearnFloatType>(
-                                effective_learning_rate / feature.get_count());
 
-                            Blas::saxpy(
-                                thread_pool,
-                                kHalfDimensions, -scale,
-                                &gradients_[output_offset], 1,
-                                &weights_[weights_offset], 1
-                            );
+            thread_pool.execute_with_workers(
+                [&, num_threads = thread_pool.size()](Thread& th) {
+                    const auto thread_index = th.thread_idx();
+
+                    for (IndexType b = 0; b < batch_->size(); ++b) {
+                        const IndexType batch_offset = kOutputDimensions * b;
+                        for (IndexType c = 0; c < 2; ++c) {
+                            const IndexType output_offset = batch_offset + kHalfDimensions * c;
+                            for (const auto& feature : (*batch_)[b].training_features[c]) {
+                                if (feature.get_index() % num_threads != thread_index)
+                                    continue;
+                                const IndexType weights_offset =
+                                    kHalfDimensions * feature.get_index();
+                                const auto scale = static_cast<LearnFloatType>(
+                                    effective_learning_rate / feature.get_count());
+
+#if defined (USE_BLAS)
+
+                                cblas_saxpy(
+                                    kHalfDimensions, -scale,
+                                    &gradients_[output_offset], 1,
+                                    &weights_[weights_offset], 1
+                                );
+
+#else
+
+                                Blas::saxpy(
+                                    kHalfDimensions, -scale,
+                                    &gradients_[output_offset], 1,
+                                    &weights_[weights_offset], 1
+                                );
+
+#endif
+                            }
                         }
                     }
                 }
-            }
+            );
 
-#endif
             for (IndexType b = 0; b < batch_->size(); ++b) {
                 for (IndexType c = 0; c < 2; ++c) {
                     for (const auto& feature : (*batch_)[b].training_features[c]) {
@@ -468,6 +445,8 @@ namespace Eval::NNUE {
                     }
                 }
             }
+
+            thread_pool.wait_for_workers_finished();
         }
 
     private:
@@ -493,22 +472,25 @@ namespace Eval::NNUE {
 
             std::vector<TrainingFeature> training_features;
 
-#pragma omp parallel for private(training_features)
-            for (IndexType j = 0; j < RawFeatures::kDimensions; ++j) {
-                training_features.clear();
-                Features::Factorizer<RawFeatures>::append_training_features(
-                    j, &training_features);
+            Threads.for_each_index_with_workers(
+                0, RawFeatures::kDimensions,
+                [this, training_features](Thread&, int j) mutable {
+                    training_features.clear();
+                    Features::Factorizer<RawFeatures>::append_training_features(
+                        j, &training_features);
 
-                for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                    double sum = 0.0;
-                    for (const auto& feature : training_features) {
-                        sum += weights_[kHalfDimensions * feature.get_index() + i];
+                    for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                        double sum = 0.0;
+                        for (const auto& feature : training_features) {
+                            sum += weights_[kHalfDimensions * feature.get_index() + i];
+                        }
+
+                        target_layer_->weights_[kHalfDimensions * j + i] =
+                            round<typename LayerType::WeightType>(sum * kWeightScale);
                     }
-
-                    target_layer_->weights_[kHalfDimensions * j + i] =
-                        round<typename LayerType::WeightType>(sum * kWeightScale);
                 }
-            }
+            );
+            Threads.wait_for_workers_finished();
         }
 
         void reset_stats() {

From c53be1b23f48fef9c5f27203eefc1443ea107e5a Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Tue, 27 Oct 2020 19:19:23 +0100
Subject: [PATCH 422/583] Add specialized bitset for use in the trainer for
 observed features tracking.

---
 src/misc.h | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/src/misc.h b/src/misc.h
index be9b4c38..e564311f 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -397,6 +397,69 @@ inline uint64_t mul_hi64(uint64_t a, uint64_t b) {
 #endif
 }
 
+// This bitset can be accessed concurrently, provided
+// the concurrent accesses are performed on distinct
+// instances of underlying type. That means the cuncurrent
+// accesses need to be spaced by at least 
+// bits_per_bucket bits.
+// But at least best_concurrent_access_stride bits
+// is recommended to prevent false sharing.
+template <uint64_t N>
+struct LargeBitset
+{
+private:
+    constexpr static uint64_t cache_line_size = 64;
+
+public:
+    using UnderlyingType = uint64_t;
+
+    constexpr static uint64_t num_bits = N;
+    constexpr static uint64_t bits_per_bucket = 8 * sizeof(uint64_t);
+    constexpr static uint64_t num_buckets = (num_bits + bits_per_bucket - 1) / bits_per_bucket;
+    constexpr static uint64_t best_concurrent_access_stride = 8 * cache_line_size;
+
+    void set(uint64_t idx)
+    {
+        const uint64_t bucket = idx / bits_per_bucket;
+        const uint64_t bit = uint64_t(1) << (idx % bits_per_bucket);
+        bits[bucket] |= bit;
+    }
+
+    bool test(uint64_t idx) const
+    {
+        const uint64_t bucket = idx / bits_per_bucket;
+        const uint64_t bit = uint64_t(1) << (idx % bits_per_bucket);
+        return bits[bucket] & bit;
+    }
+
+    uint64_t count() const
+    {
+        uint64_t c = 0;
+        uint64_t i = 0;
+
+        for (; i < num_buckets - 3; i += 4)
+        {
+            uint64_t c0 = popcount(bits[i+0]);
+            uint64_t c1 = popcount(bits[i+1]);
+            uint64_t c2 = popcount(bits[i+2]);
+            uint64_t c3 = popcount(bits[i+3]);
+            c0 += c1;
+            c2 += c3;
+            c += c0 + c2;
+        }
+
+        for (; i < num_buckets; ++i)
+        {
+            c += popcount(bits[i]);
+        }
+
+        return c;
+    }
+
+private:
+    alignas(cache_line_size) UnderlyingType bits[num_buckets];
+};
+
 /// Under Windows it is not possible for a process to run on more than one
 /// logical processor group. This usually means to be limited to use max 64
 /// cores. To overcome this, some special platform specific API should be

From 987b6c98d4ddf2875d6b8bbe1e8be07de7233aea Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Tue, 27 Oct 2020 19:24:07 +0100
Subject: [PATCH 423/583] Move the observed feature collection to the threaded
 part now that it can be done safely.

---
 src/misc.h                                    |  7 +++-
 .../trainer/trainer_feature_transformer.h     | 35 ++++++++++++-------
 2 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/src/misc.h b/src/misc.h
index e564311f..020fa9b5 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -400,7 +400,7 @@ inline uint64_t mul_hi64(uint64_t a, uint64_t b) {
 // This bitset can be accessed concurrently, provided
 // the concurrent accesses are performed on distinct
 // instances of underlying type. That means the cuncurrent
-// accesses need to be spaced by at least 
+// accesses need to be spaced by at least
 // bits_per_bucket bits.
 // But at least best_concurrent_access_stride bits
 // is recommended to prevent false sharing.
@@ -418,6 +418,11 @@ public:
     constexpr static uint64_t num_buckets = (num_bits + bits_per_bucket - 1) / bits_per_bucket;
     constexpr static uint64_t best_concurrent_access_stride = 8 * cache_line_size;
 
+    LargeBitset()
+    {
+        std::fill(std::begin(bits), std::end(bits), 0);
+    }
+
     void set(uint64_t idx)
     {
         const uint64_t bucket = idx / bits_per_bucket;
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 3062e432..419cdf5e 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -203,7 +203,7 @@ namespace Eval::NNUE {
                 min_pre_activation_ = m128_hmin_ps(_mm_min_ps(min_pre_activation0, min_pre_activation1));
                 max_pre_activation_ = m128_hmax_ps(_mm_max_ps(max_pre_activation0, max_pre_activation1));
 
-                for (IndexType b = 0; b < batch.size(); ++b) 
+                for (IndexType b = 0; b < batch.size(); ++b)
                 {
                     const IndexType batch_offset = kOutputDimensions * b;
 
@@ -283,7 +283,7 @@ namespace Eval::NNUE {
                 learning_rate * learning_rate_scale_;
 
 #if defined (USE_SSE2)
-            
+
             {
                 static_assert(kHalfDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");
 
@@ -408,10 +408,26 @@ namespace Eval::NNUE {
                         for (IndexType c = 0; c < 2; ++c) {
                             const IndexType output_offset = batch_offset + kHalfDimensions * c;
                             for (const auto& feature : (*batch_)[b].training_features[c]) {
-                                if (feature.get_index() % num_threads != thread_index)
+                                const IndexType feature_index = feature.get_index();
+
+                                // We assign each bucket a continuous range of bits at least
+                                // of cache line size to prevent false sharing.
+                                // For HalfKP this is enough to saturate about 80 threads.
+                                const IndexType thread_bucket =
+                                    (feature_index / BitsetType::best_concurrent_access_stride)
+                                    % num_threads;
+
+                                if (thread_bucket != thread_index)
                                     continue;
+
+                                // This operation can be performed safely because
+                                // each thread accesses a different memory location
+                                // (even a different cache line)
+                                observed_features.set(feature_index);
+
                                 const IndexType weights_offset =
-                                    kHalfDimensions * feature.get_index();
+                                    kHalfDimensions * feature_index;
+
                                 const auto scale = static_cast<LearnFloatType>(
                                     effective_learning_rate / feature.get_count());
 
@@ -438,14 +454,6 @@ namespace Eval::NNUE {
                 }
             );
 
-            for (IndexType b = 0; b < batch_->size(); ++b) {
-                for (IndexType c = 0; c < 2; ++c) {
-                    for (const auto& feature : (*batch_)[b].training_features[c]) {
-                        observed_features.set(feature.get_index());
-                    }
-                }
-            }
-
             thread_pool.wait_for_workers_finished();
         }
 
@@ -628,7 +636,8 @@ namespace Eval::NNUE {
         std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
 
         // Features that appeared in the training data
-        std::bitset<kInputDimensions> observed_features;
+        using BitsetType = LargeBitset<kInputDimensions>;
+        BitsetType observed_features;
 
         // hyper parameter
         LearnFloatType momentum_;

From 5d88e7bce8aa2478db3c5c3ea9a0651b2339d34c Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 4 Nov 2020 20:17:58 +0100
Subject: [PATCH 424/583] Add optional move validation to training data
 conversion. No longer rely on static initialization order for magics
 initialization.

---
 src/extra/nnue_data_binpack_format.h | 770 ++++++++++++++++++++++++---
 src/learn/convert.cpp                |  16 +-
 2 files changed, 709 insertions(+), 77 deletions(-)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index b9e45c3e..ceb5c415 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -3132,7 +3132,11 @@ namespace chess
                 };
             }
 
-            static const EnumArray2<PieceType, Square, Bitboard> pseudoAttacks = generatePseudoAttacks();
+            static const EnumArray2<PieceType, Square, Bitboard>& pseudoAttacks()
+            {
+                static const EnumArray2<PieceType, Square, Bitboard> s_pseudoAttacks = generatePseudoAttacks();
+                return s_pseudoAttacks;
+            }
 
             [[nodiscard]] static Bitboard generatePositiveRayAttacks(Direction dir, Square fromSq)
             {
@@ -3187,24 +3191,29 @@ namespace chess
                 return bbs;
             }
 
-            static const std::array<EnumArray<Square, Bitboard>, 8> positiveRayAttacks = generatePositiveRayAttacks();
+
+            static const std::array<EnumArray<Square, Bitboard>, 8>& positiveRayAttacks()
+            {
+                static const std::array<EnumArray<Square, Bitboard>, 8> s_positiveRayAttacks = generatePositiveRayAttacks();
+                return s_positiveRayAttacks;
+            }
 
             template <Direction DirV>
             [[nodiscard]] static Bitboard slidingAttacks(Square sq, Bitboard occupied)
             {
                 assert(sq.isOk());
 
-                Bitboard attacks = positiveRayAttacks[DirV][sq];
+                Bitboard attacks = positiveRayAttacks()[DirV][sq];
 
                 if constexpr (DirV == NorthWest || DirV == North || DirV == NorthEast || DirV == East)
                 {
                     Bitboard blocker = (attacks & occupied) | h8; // set highest bit (H8) so msb never fails
-                    return attacks ^ positiveRayAttacks[DirV][blocker.first()];
+                    return attacks ^ positiveRayAttacks()[DirV][blocker.first()];
                 }
                 else
                 {
                     Bitboard blocker = (attacks & occupied) | a1;
-                    return attacks ^ positiveRayAttacks[DirV][blocker.last()];
+                    return attacks ^ positiveRayAttacks()[DirV][blocker.last()];
                 }
             }
 
@@ -3290,10 +3299,10 @@ namespace chess
             {
                 for (PieceType pt : { PieceType::Bishop, PieceType::Rook })
                 {
-                    const Bitboard s1Attacks = pseudoAttacks[pt][s1];
+                    const Bitboard s1Attacks = pseudoAttacks()[pt][s1];
                     if (s1Attacks.isSet(s2))
                     {
-                        const Bitboard s2Attacks = pseudoAttacks[pt][s2];
+                        const Bitboard s2Attacks = pseudoAttacks()[pt][s2];
                         return (s1Attacks & s2Attacks) | s1 | s2;
                     }
                 }
@@ -3420,14 +3429,14 @@ namespace chess
 
             assert(sq.isOk());
 
-            return detail::pseudoAttacks[PieceTypeV][sq];
+            return detail::pseudoAttacks()[PieceTypeV][sq];
         }
 
         [[nodiscard]] inline Bitboard pseudoAttacks(PieceType pt, Square sq)
         {
             assert(sq.isOk());
 
-            return detail::pseudoAttacks[pt][sq];
+            return detail::pseudoAttacks()[pt][sq];
         }
 
         [[nodiscard]] inline Bitboard pawnAttacks(Bitboard pawns, Color color)
@@ -4373,6 +4382,22 @@ namespace chess
         std::uint64_t low;
     };
 
+    struct Position;
+
+    struct MoveLegalityChecker
+    {
+        MoveLegalityChecker(const Position& position);
+
+        [[nodiscard]] bool isPseudoLegalMoveLegal(const Move& move) const;
+
+    private:
+        const Position* m_position;
+        Bitboard m_checkers;
+        Bitboard m_ourBlockersForKing;
+        Bitboard m_potentialCheckRemovals;
+        Square m_ksq;
+    };
+
     struct Position : public Board
     {
         using BaseType = Board;
@@ -4412,6 +4437,11 @@ namespace chess
 
         [[nodiscard]] inline std::string fen() const;
 
+        [[nodiscard]] MoveLegalityChecker moveLegalityChecker() const
+        {
+            return { *this };
+        }
+
         constexpr void setEpSquareUnchecked(Square sq)
         {
             m_epSquare = sq;
@@ -4498,6 +4528,8 @@ namespace chess
 
         [[nodiscard]] inline bool isCheckAfterMove(Move move) const;
 
+        [[nodiscard]] inline bool isMoveLegal(Move move) const;
+
         [[nodiscard]] inline bool isPseudoLegalMoveLegal(Move move) const;
 
         [[nodiscard]] inline bool isMovePseudoLegal(Move move) const;
@@ -4665,6 +4697,592 @@ namespace chess
         std::uint8_t m_packedState[16];
     };
 
+    namespace movegen
+    {
+        // For a pseudo-legal move the following are true:
+        //  - the moving piece has the pos.sideToMove() color
+        //  - the destination square is either empty or has a piece of the opposite color
+        //  - if it is a pawn move it is valid (but may be illegal due to discovered checks)
+        //  - if it is not a pawn move then the destination square is contained in attacks()
+        //  - if it is a castling it is legal
+        //  - a move other than castling may create a discovered attack on the king
+        //  - a king may walk into a check
+
+        template <typename FuncT>
+        inline void forEachPseudoLegalPawnMove(const Position& pos, Square from, FuncT&& f)
+        {
+            const Color sideToMove = pos.sideToMove();
+            const Square epSquare = pos.epSquare();
+            const Bitboard ourPieces = pos.piecesBB(sideToMove);
+            const Bitboard theirPieces = pos.piecesBB(!sideToMove);
+            const Bitboard occupied = ourPieces | theirPieces;
+
+            Bitboard attackTargets = theirPieces;
+            if (epSquare != Square::none())
+            {
+                attackTargets |= epSquare;
+            }
+
+            const Bitboard attacks = bb::pawnAttacks(Bitboard::square(from), sideToMove) & attackTargets;
+
+            const Rank secondToLastRank = sideToMove == Color::White ? rank7 : rank2;
+            const auto forward = sideToMove == Color::White ? FlatSquareOffset(0, 1) : FlatSquareOffset(0, -1);
+
+            // promotions
+            if (from.rank() == secondToLastRank)
+            {
+                // capture promotions
+                for (Square toSq : attacks)
+                {
+                    for (PieceType pt : { PieceType::Knight, PieceType::Bishop, PieceType::Rook, PieceType::Queen })
+                    {
+                        Move move{ from, toSq, MoveType::Promotion, Piece(pt, sideToMove) };
+                        f(move);
+                    }
+                }
+
+                // push promotions
+                const Square toSq = from + forward;
+                if (!occupied.isSet(toSq))
+                {
+                    for (PieceType pt : { PieceType::Knight, PieceType::Bishop, PieceType::Rook, PieceType::Queen })
+                    {
+                        Move move{ from, toSq, MoveType::Promotion, Piece(pt, sideToMove) };
+                        f(move);
+                    }
+                }
+            }
+            else
+            {
+                // captures
+                for (Square toSq : attacks)
+                {
+                    Move move{ from, toSq, (toSq == epSquare) ? MoveType::EnPassant : MoveType::Normal };
+                    f(move);
+                }
+
+                const Square toSq = from + forward;
+
+                // single push
+                if (!occupied.isSet(toSq))
+                {
+                    const Rank startRank = sideToMove == Color::White ? rank2 : rank7;
+                    if (from.rank() == startRank)
+                    {
+                        // double push
+                        const Square toSq2 = toSq + forward;
+                        if (!occupied.isSet(toSq2))
+                        {
+                            Move move{ from, toSq2 };
+                            f(move);
+                        }
+                    }
+
+                    Move move{ from, toSq };
+                    f(move);
+                }
+            }
+        }
+
+        template <Color SideToMoveV, typename FuncT>
+        inline void forEachPseudoLegalPawnMove(const Position& pos, FuncT&& f)
+        {
+            const Square epSquare = pos.epSquare();
+            const Bitboard ourPieces = pos.piecesBB(SideToMoveV);
+            const Bitboard theirPieces = pos.piecesBB(!SideToMoveV);
+            const Bitboard occupied = ourPieces | theirPieces;
+            const Bitboard pawns = pos.piecesBB(Piece(PieceType::Pawn, SideToMoveV));
+
+            const Bitboard secondToLastRank = SideToMoveV == Color::White ? bb::rank7 : bb::rank2;
+            const Bitboard secondRank = SideToMoveV == Color::White ? bb::rank2 : bb::rank7;
+
+            const auto singlePawnMoveDestinationOffset = SideToMoveV == Color::White ? FlatSquareOffset(0, 1) : FlatSquareOffset(0, -1);
+            const auto doublePawnMoveDestinationOffset = SideToMoveV == Color::White ? FlatSquareOffset(0, 2) : FlatSquareOffset(0, -2);
+
+            {
+                const int backward = SideToMoveV == Color::White ? -1 : 1;
+                const int backward2 = backward * 2;
+
+                const Bitboard doublePawnMoveStarts =
+                    pawns
+                    & secondRank
+                    & ~(occupied.shiftedVertically(backward) | occupied.shiftedVertically(backward2));
+
+                const Bitboard singlePawnMoveStarts =
+                    pawns
+                    & ~secondToLastRank
+                    & ~occupied.shiftedVertically(backward);
+
+                for (Square from : doublePawnMoveStarts)
+                {
+                    const Square to = from + doublePawnMoveDestinationOffset;
+                    f(Move::normal(from, to));
+                }
+
+                for (Square from : singlePawnMoveStarts)
+                {
+                    const Square to = from + singlePawnMoveDestinationOffset;
+                    f(Move::normal(from, to));
+                }
+            }
+
+            {
+                const Bitboard lastRank = SideToMoveV == Color::White ? bb::rank8 : bb::rank1;
+                const FlatSquareOffset westCaptureOffset = SideToMoveV == Color::White ? FlatSquareOffset(-1, 1) : FlatSquareOffset(-1, -1);
+                const FlatSquareOffset eastCaptureOffset = SideToMoveV == Color::White ? FlatSquareOffset(1, 1) : FlatSquareOffset(1, -1);
+
+                const Bitboard pawnsWithWestCapture = bb::eastPawnAttacks(theirPieces & ~lastRank, !SideToMoveV) & pawns;
+                const Bitboard pawnsWithEastCapture = bb::westPawnAttacks(theirPieces & ~lastRank, !SideToMoveV) & pawns;
+
+                for (Square from : pawnsWithWestCapture)
+                {
+                    f(Move::normal(from, from + westCaptureOffset));
+                }
+
+                for (Square from : pawnsWithEastCapture)
+                {
+                    f(Move::normal(from, from + eastCaptureOffset));
+                }
+            }
+
+            if (epSquare != Square::none())
+            {
+                const Bitboard pawnsThatCanCapture = bb::pawnAttacks(Bitboard::square(epSquare), !SideToMoveV) & pawns;
+                for (Square from : pawnsThatCanCapture)
+                {
+                    f(Move::enPassant(from, epSquare));
+                }
+            }
+
+            for (Square from : pawns & secondToLastRank)
+            {
+                const Bitboard attacks = bb::pawnAttacks(Bitboard::square(from), SideToMoveV) & theirPieces;
+
+                // capture promotions
+                for (Square to : attacks)
+                {
+                    for (PieceType pt : { PieceType::Knight, PieceType::Bishop, PieceType::Rook, PieceType::Queen })
+                    {
+                        Move move{ from, to, MoveType::Promotion, Piece(pt, SideToMoveV) };
+                        f(move);
+                    }
+                }
+
+                // push promotions
+                const Square to = from + singlePawnMoveDestinationOffset;
+                if (!occupied.isSet(to))
+                {
+                    for (PieceType pt : { PieceType::Knight, PieceType::Bishop, PieceType::Rook, PieceType::Queen })
+                    {
+                        Move move{ from, to, MoveType::Promotion, Piece(pt, SideToMoveV) };
+                        f(move);
+                    }
+                }
+            }
+        }
+
+        template <typename FuncT>
+        inline void forEachPseudoLegalPawnMove(const Position& pos, FuncT&& f)
+        {
+            if (pos.sideToMove() == Color::White)
+            {
+                forEachPseudoLegalPawnMove<Color::White>(pos, std::forward<FuncT>(f));
+            }
+            else
+            {
+                forEachPseudoLegalPawnMove<Color::Black>(pos, std::forward<FuncT>(f));
+            }
+        }
+
+        template <PieceType PieceTypeV, typename FuncT>
+        inline void forEachPseudoLegalPieceMove(const Position& pos, Square from, FuncT&& f)
+        {
+            static_assert(PieceTypeV != PieceType::None);
+
+            if constexpr (PieceTypeV == PieceType::Pawn)
+            {
+                forEachPseudoLegalPawnMove(pos, from, f);
+            }
+            else
+            {
+                const Color sideToMove = pos.sideToMove();
+                const Bitboard ourPieces = pos.piecesBB(sideToMove);
+                const Bitboard theirPieces = pos.piecesBB(!sideToMove);
+                const Bitboard occupied = ourPieces | theirPieces;
+                const Bitboard attacks = bb::attacks<PieceTypeV>(from, occupied) & ~ourPieces;
+
+                for (Square toSq : attacks)
+                {
+                    Move move{ from, toSq };
+                    f(move);
+                }
+            }
+        }
+
+        template <PieceType PieceTypeV, typename FuncT>
+        inline void forEachPseudoLegalPieceMove(const Position& pos, FuncT&& f)
+        {
+            static_assert(PieceTypeV != PieceType::None);
+
+            if constexpr (PieceTypeV == PieceType::Pawn)
+            {
+                forEachPseudoLegalPawnMove(pos, f);
+            }
+            else
+            {
+                const Color sideToMove = pos.sideToMove();
+                const Bitboard ourPieces = pos.piecesBB(sideToMove);
+                const Bitboard theirPieces = pos.piecesBB(!sideToMove);
+                const Bitboard occupied = ourPieces | theirPieces;
+                const Bitboard pieces = pos.piecesBB(Piece(PieceTypeV, sideToMove));
+                for (Square fromSq : pieces)
+                {
+                    const Bitboard attacks = bb::attacks<PieceTypeV>(fromSq, occupied) & ~ourPieces;
+                    for (Square toSq : attacks)
+                    {
+                        Move move{ fromSq, toSq };
+                        f(move);
+                    }
+                }
+            }
+        }
+
+        template <typename FuncT>
+        inline void forEachCastlingMove(const Position& pos, FuncT&& f)
+        {
+            CastlingRights rights = pos.castlingRights();
+            if (rights == CastlingRights::None)
+            {
+                return;
+            }
+
+            const Color sideToMove = pos.sideToMove();
+            const Bitboard ourPieces = pos.piecesBB(sideToMove);
+            const Bitboard theirPieces = pos.piecesBB(!sideToMove);
+            const Bitboard occupied = ourPieces | theirPieces;
+
+            // we first reduce the set of legal castlings by checking the paths for pieces
+            if (sideToMove == Color::White)
+            {
+                if ((CastlingTraits::castlingPath[Color::White][CastleType::Short] & occupied).any()) rights &= ~CastlingRights::WhiteKingSide;
+                if ((CastlingTraits::castlingPath[Color::White][CastleType::Long] & occupied).any()) rights &= ~CastlingRights::WhiteQueenSide;
+                rights &= ~CastlingRights::Black;
+            }
+            else
+            {
+                if ((CastlingTraits::castlingPath[Color::Black][CastleType::Short] & occupied).any()) rights &= ~CastlingRights::BlackKingSide;
+                if ((CastlingTraits::castlingPath[Color::Black][CastleType::Long] & occupied).any()) rights &= ~CastlingRights::BlackQueenSide;
+                rights &= ~CastlingRights::White;
+            }
+
+            if (rights == CastlingRights::None)
+            {
+                return;
+            }
+
+            // King must not be in check. Done here because it is quite expensive.
+            const Square ksq = pos.kingSquare(sideToMove);
+            if (pos.isSquareAttacked(ksq, !sideToMove))
+            {
+                return;
+            }
+
+            // Loop through all possible castlings.
+            for (CastleType castlingType : values<CastleType>())
+            {
+                const CastlingRights right = CastlingTraits::castlingRights[sideToMove][castlingType];
+
+                if (!contains(rights, right))
+                {
+                    continue;
+                }
+
+                // If we have this castling right
+                // we check whether the king passes an attacked square.
+                const Square passedSquare = CastlingTraits::squarePassedByKing[sideToMove][castlingType];
+                if (pos.isSquareAttacked(passedSquare, !sideToMove))
+                {
+                    continue;
+                }
+
+                // If it's a castling move then the change in square occupation
+                // cannot have an effect because otherwise there would be
+                // a slider attacker attacking the castling king.
+                if (pos.isSquareAttacked(CastlingTraits::kingDestination[sideToMove][castlingType], !sideToMove))
+                {
+                    continue;
+                }
+
+                // If not we can castle.
+                Move move = Move::castle(castlingType, sideToMove);
+                f(move);
+            }
+        }
+
+        // Calls a given function for all pseudo legal moves for the position.
+        // `pos` must be a legal chess position
+        template <typename FuncT>
+        inline void forEachPseudoLegalMove(const Position& pos, FuncT&& func)
+        {
+            forEachPseudoLegalPieceMove<PieceType::Pawn>(pos, func);
+            forEachPseudoLegalPieceMove<PieceType::Knight>(pos, func);
+            forEachPseudoLegalPieceMove<PieceType::Bishop>(pos, func);
+            forEachPseudoLegalPieceMove<PieceType::Rook>(pos, func);
+            forEachPseudoLegalPieceMove<PieceType::Queen>(pos, func);
+            forEachPseudoLegalPieceMove<PieceType::King>(pos, func);
+            forEachCastlingMove(pos, func);
+        }
+
+        // Calls a given function for all legal moves for the position.
+        // `pos` must be a legal chess position
+        template <typename FuncT>
+        inline void forEachLegalMove(const Position& pos, FuncT&& func)
+        {
+            auto funcIfLegal = [&func, checker = pos.moveLegalityChecker()](Move move) {
+                if (checker.isPseudoLegalMoveLegal(move))
+                {
+                    func(move);
+                }
+            };
+
+            forEachPseudoLegalPieceMove<PieceType::Pawn>(pos, funcIfLegal);
+            forEachPseudoLegalPieceMove<PieceType::Knight>(pos, funcIfLegal);
+            forEachPseudoLegalPieceMove<PieceType::Bishop>(pos, funcIfLegal);
+            forEachPseudoLegalPieceMove<PieceType::Rook>(pos, funcIfLegal);
+            forEachPseudoLegalPieceMove<PieceType::Queen>(pos, funcIfLegal);
+            forEachPseudoLegalPieceMove<PieceType::King>(pos, funcIfLegal);
+            forEachCastlingMove(pos, func);
+        }
+
+        // Generates all pseudo legal moves for the position.
+        // `pos` must be a legal chess position
+        [[nodiscard]] std::vector<Move> generatePseudoLegalMoves(const Position& pos);
+
+        // Generates all legal moves for the position.
+        // `pos` must be a legal chess position
+        [[nodiscard]] std::vector<Move> generateLegalMoves(const Position& pos);
+    }
+
+    [[nodiscard]] inline bool Position::isCheck() const
+    {
+        return BaseType::isSquareAttacked(kingSquare(m_sideToMove), !m_sideToMove);
+    }
+
+    [[nodiscard]] inline Bitboard Position::checkers() const
+    {
+        return BaseType::attackers(kingSquare(m_sideToMove), !m_sideToMove);
+    }
+
+    [[nodiscard]] inline bool Position::isCheckAfterMove(Move move) const
+    {
+        return BaseType::isSquareAttackedAfterMove(move, kingSquare(!m_sideToMove), m_sideToMove);
+    }
+
+    [[nodiscard]] inline bool Position::isMoveLegal(Move move) const
+    {
+        return
+            isMovePseudoLegal(move)
+            && isPseudoLegalMoveLegal(move);
+    }
+
+    [[nodiscard]] inline bool Position::isPseudoLegalMoveLegal(Move move) const
+    {
+        return
+            (move.type == MoveType::Castle)
+            || !isOwnKingAttackedAfterMove(move);
+    }
+
+    [[nodiscard]] inline bool Position::isMovePseudoLegal(Move move) const
+    {
+        if (!move.from.isOk() || !move.to.isOk())
+        {
+            return false;
+        }
+
+        if (move.from == move.to)
+        {
+            return false;
+        }
+
+        if (move.type != MoveType::Promotion && move.promotedPiece != Piece::none())
+        {
+            return false;
+        }
+
+        const Piece movedPiece = pieceAt(move.from);
+        if (movedPiece == Piece::none())
+        {
+            return false;
+        }
+
+        if (movedPiece.color() != m_sideToMove)
+        {
+            return false;
+        }
+
+        const Bitboard occupied = piecesBB();
+        const Bitboard ourPieces = piecesBB(m_sideToMove);
+        const bool isNormal = move.type == MoveType::Normal;
+
+        switch (movedPiece.type())
+        {
+        case PieceType::Pawn:
+        {
+            bool isValid = false;
+            // TODO: use iterators so we don't loop over all moves
+            //       when we can avoid it.
+            movegen::forEachPseudoLegalPawnMove(*this, move.from, [&isValid, &move](const Move& genMove) {
+                if (move == genMove)
+                {
+                    isValid = true;
+                }
+                });
+            return isValid;
+        }
+
+        case PieceType::Bishop:
+            return isNormal && (bb::attacks<PieceType::Bishop>(move.from, occupied) & ~ourPieces).isSet(move.to);
+
+        case PieceType::Knight:
+            return isNormal && (bb::pseudoAttacks<PieceType::Knight>(move.from) & ~ourPieces).isSet(move.to);
+
+        case PieceType::Rook:
+            return isNormal && (bb::attacks<PieceType::Rook>(move.from, occupied) & ~ourPieces).isSet(move.to);
+
+        case PieceType::Queen:
+            return isNormal && (bb::attacks<PieceType::Queen>(move.from, occupied) & ~ourPieces).isSet(move.to);
+
+        case PieceType::King:
+        {
+            if (move.type == MoveType::Castle)
+            {
+                bool isValid = false;
+                movegen::forEachCastlingMove(*this, [&isValid, &move](const Move& genMove) {
+                    if (move == genMove)
+                    {
+                        isValid = true;
+                    }
+                    });
+                return isValid;
+            }
+            else
+            {
+                return isNormal && (bb::pseudoAttacks<PieceType::King>(move.from) & ~ourPieces).isSet(move.to);
+            }
+        }
+
+        default:
+            return false;
+        }
+    }
+
+    [[nodiscard]] inline Bitboard Position::blockersForKing(Color color) const
+    {
+        const Color attackerColor = !color;
+
+        const Bitboard occupied = piecesBB();
+
+        const Bitboard bishops = piecesBB(Piece(PieceType::Bishop, attackerColor));
+        const Bitboard rooks = piecesBB(Piece(PieceType::Rook, attackerColor));
+        const Bitboard queens = piecesBB(Piece(PieceType::Queen, attackerColor));
+
+        const Square ksq = kingSquare(color);
+
+        const Bitboard opponentBishopLikePieces = (bishops | queens);
+        const Bitboard bishopPseudoAttacks = bb::pseudoAttacks<PieceType::Bishop>(ksq);
+
+        const Bitboard opponentRookLikePieces = (rooks | queens);
+        const Bitboard rookPseudoAttacks = bb::pseudoAttacks<PieceType::Rook>(ksq);
+
+        const Bitboard xrayers =
+            (bishopPseudoAttacks & opponentBishopLikePieces)
+            | (rookPseudoAttacks & opponentRookLikePieces);
+
+        Bitboard allBlockers = Bitboard::none();
+
+        for (Square xrayer : xrayers)
+        {
+            const Bitboard blockers = bb::between(xrayer, ksq) & occupied;
+            if (blockers.exactlyOne())
+            {
+                allBlockers |= blockers;
+            }
+        }
+
+        return allBlockers;
+    }
+
+    inline MoveLegalityChecker::MoveLegalityChecker(const Position& position) :
+        m_position(&position),
+        m_checkers(position.checkers()),
+        m_ourBlockersForKing(
+            position.blockersForKing(position.sideToMove())
+            & position.piecesBB(position.sideToMove())
+        ),
+        m_ksq(position.kingSquare(position.sideToMove()))
+    {
+        if (m_checkers.exactlyOne())
+        {
+            const Bitboard knightCheckers = m_checkers & bb::pseudoAttacks<PieceType::Knight>(m_ksq);
+            if (knightCheckers.any())
+            {
+                // We're checked by a knight, we have to remove it or move the king.
+                m_potentialCheckRemovals = knightCheckers;
+            }
+            else
+            {
+                // If we're not checked by a knight we can block it.
+                m_potentialCheckRemovals = bb::between(m_ksq, m_checkers.first()) | m_checkers;
+            }
+        }
+        else
+        {
+            // Double check, king has to move.
+            m_potentialCheckRemovals = Bitboard::none();
+        }
+    }
+
+    [[nodiscard]] inline bool MoveLegalityChecker::isPseudoLegalMoveLegal(const Move& move) const
+    {
+        if (m_checkers.any())
+        {
+            if (move.from == m_ksq || move.type == MoveType::EnPassant)
+            {
+                return m_position->isPseudoLegalMoveLegal(move);
+            }
+            else
+            {
+                // This means there's only one check and we either
+                // blocked it or removed the piece that attacked
+                // our king. So the only threat is if it's a discovered check.
+                return
+                    m_potentialCheckRemovals.isSet(move.to)
+                    && !m_ourBlockersForKing.isSet(move.from);
+            }
+        }
+        else
+        {
+            if (move.from == m_ksq)
+            {
+                return m_position->isPseudoLegalMoveLegal(move);
+            }
+            else if (move.type == MoveType::EnPassant)
+            {
+                return !m_position->createsDiscoveredAttackOnOwnKing(move);
+            }
+            else if (m_ourBlockersForKing.isSet(move.from))
+            {
+                // If it was a blocker it may have only moved in line with our king.
+                // Otherwise it's a discovered check.
+                return bb::line(m_ksq, move.from).isSet(move.to);
+            }
+            else
+            {
+                return true;
+            }
+        }
+    }
+
     static_assert(sizeof(CompressedPosition) == 24);
     static_assert(std::is_trivially_copyable_v<CompressedPosition>);
 
@@ -5483,57 +6101,6 @@ namespace chess
         return { move, captured, oldEpSquare, oldCastlingRights };
     }
 
-    [[nodiscard]] inline bool Position::isCheck() const
-    {
-        return BaseType::isSquareAttacked(kingSquare(m_sideToMove), !m_sideToMove);
-    }
-
-    [[nodiscard]] inline Bitboard Position::checkers() const
-    {
-        return BaseType::attackers(kingSquare(m_sideToMove), !m_sideToMove);
-    }
-
-    [[nodiscard]] bool Position::isCheckAfterMove(Move move) const
-    {
-        return BaseType::isSquareAttackedAfterMove(move, kingSquare(!m_sideToMove), m_sideToMove);
-    }
-
-    [[nodiscard]] inline Bitboard Position::blockersForKing(Color color) const
-    {
-        const Color attackerColor = !color;
-
-        const Bitboard occupied = piecesBB();
-
-        const Bitboard bishops = piecesBB(Piece(PieceType::Bishop, attackerColor));
-        const Bitboard rooks = piecesBB(Piece(PieceType::Rook, attackerColor));
-        const Bitboard queens = piecesBB(Piece(PieceType::Queen, attackerColor));
-
-        const Square ksq = kingSquare(color);
-
-        const Bitboard opponentBishopLikePieces = (bishops | queens);
-        const Bitboard bishopPseudoAttacks = bb::pseudoAttacks<PieceType::Bishop>(ksq);
-
-        const Bitboard opponentRookLikePieces = (rooks | queens);
-        const Bitboard rookPseudoAttacks = bb::pseudoAttacks<PieceType::Rook>(ksq);
-
-        const Bitboard xrayers =
-            (bishopPseudoAttacks & opponentBishopLikePieces)
-            | (rookPseudoAttacks & opponentRookLikePieces);
-
-        Bitboard allBlockers = Bitboard::none();
-
-        for (Square xrayer : xrayers)
-        {
-            const Bitboard blockers = bb::between(xrayer, ksq) & occupied;
-            if (blockers.exactlyOne())
-            {
-                allBlockers |= blockers;
-            }
-        }
-
-        return allBlockers;
-    }
-
     [[nodiscard]] inline Position Position::afterMove(Move move) const
     {
         Position cpy(*this);
@@ -5756,6 +6323,25 @@ namespace binpack
                 return chess::Move{from, to, type};
             }
 
+            [[nodiscard]] std::string toString() const
+            {
+                const chess::Square to = static_cast<chess::Square>((m_raw & (0b111111 << 0) >> 0));
+                const chess::Square from = static_cast<chess::Square>((m_raw & (0b111111 << 6)) >> 6);
+
+                const unsigned promotionIndex = (m_raw & (0b11 << 12)) >> 12;
+                const chess::PieceType promotionType = static_cast<chess::PieceType>(static_cast<int>(chess::PieceType::Knight) + promotionIndex);
+
+                std::string r;
+                chess::parser_bits::appendSquareToString(from, r);
+                chess::parser_bits::appendSquareToString(to, r);
+                if (promotionType != chess::PieceType::None)
+                {
+                    r += chess::EnumTraits<chess::PieceType>::toChar(promotionType, chess::Color::Black);
+                }
+
+                return r;
+            }
+
         private:
             std::uint16_t m_raw;
         };
@@ -6233,6 +6819,11 @@ namespace binpack
         std::int16_t score;
         std::uint16_t ply;
         std::int16_t result;
+
+        [[nodiscard]] bool isValid() const
+        {
+            return pos.isMoveLegal(move);
+        }
     };
 
     [[nodiscard]] inline TrainingDataEntry packedSfenValueToTrainingDataEntry(const nodchip::PackedSfenValue& psv)
@@ -6921,7 +7512,7 @@ namespace binpack
         buffer.insert(buffer.end(), data, data+sizeof(psv));
     }
 
-    inline void convertPlainToBinpack(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
+    inline void convertPlainToBinpack(std::string inputPath, std::string outputPath, std::ios_base::openmode om, bool validate)
     {
         constexpr std::size_t reportEveryNPositions = 100'000;
 
@@ -6949,6 +7540,11 @@ namespace binpack
             if (key == "e"sv)
             {
                 e.move = chess::uci::uciToMove(e.pos, move);
+                if (validate && !e.isValid())
+                {
+                    std::cerr << "Illegal move " << chess::uci::moveToUci(e.pos, e.move) << " for position " << e.pos.fen() << '\n';
+                    return;
+                }
 
                 writer.addTrainingDataEntry(e);
 
@@ -6975,7 +7571,7 @@ namespace binpack
         std::cout << "Finished. Converted " << numProcessedPositions << " positions.\n";
     }
 
-    inline void convertBinpackToPlain(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
+    inline void convertBinpackToPlain(std::string inputPath, std::string outputPath, std::ios_base::openmode om, bool validate)
     {
         constexpr std::size_t bufferSize = MiB;
 
@@ -6990,7 +7586,14 @@ namespace binpack
 
         while(reader.hasNext())
         {
-            emitPlainEntry(buffer, reader.next());
+            auto e = reader.next();
+            if (validate && !e.isValid())
+            {
+                std::cerr << "Illegal move " << chess::uci::moveToUci(e.pos, e.move) << " for position " << e.pos.fen() << '\n';
+                return;
+            }
+
+            emitPlainEntry(buffer, e);
 
             ++numProcessedPositions;
 
@@ -7016,7 +7619,7 @@ namespace binpack
     }
 
 
-    inline void convertBinToBinpack(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
+    inline void convertBinToBinpack(std::string inputPath, std::string outputPath, std::ios_base::openmode om, bool validate)
     {
         constexpr std::size_t reportEveryNPositions = 100'000;
 
@@ -7037,7 +7640,15 @@ namespace binpack
                 break;
             }
 
-            writer.addTrainingDataEntry(packedSfenValueToTrainingDataEntry(psv));
+            auto e = packedSfenValueToTrainingDataEntry(psv);
+            if (validate && !e.isValid())
+            {
+                std::cerr << "Illegal move " << chess::uci::moveToUci(e.pos, e.move) << " for position " << e.pos.fen() << '\n';
+                std::cerr << static_cast<int>(e.move.type) << '\n';
+                return;
+            }
+
+            writer.addTrainingDataEntry(e);
 
             ++numProcessedPositions;
             const auto cur = inputFile.tellg();
@@ -7050,7 +7661,7 @@ namespace binpack
         std::cout << "Finished. Converted " << numProcessedPositions << " positions.\n";
     }
 
-    inline void convertBinpackToBin(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
+    inline void convertBinpackToBin(std::string inputPath, std::string outputPath, std::ios_base::openmode om, bool validate)
     {
         constexpr std::size_t bufferSize = MiB;
 
@@ -7065,7 +7676,14 @@ namespace binpack
 
         while(reader.hasNext())
         {
-            emitBinEntry(buffer, reader.next());
+            auto e = reader.next();
+            if (validate && !e.isValid())
+            {
+                std::cerr << "Illegal move " << chess::uci::moveToUci(e.pos, e.move) << " for position " << e.pos.fen() << '\n';
+                return;
+            }
+
+            emitBinEntry(buffer, e);
 
             ++numProcessedPositions;
 
@@ -7090,7 +7708,7 @@ namespace binpack
         std::cout << "Finished. Converted " << numProcessedPositions << " positions.\n";
     }
 
-    inline void convertBinToPlain(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
+    inline void convertBinToPlain(std::string inputPath, std::string outputPath, std::ios_base::openmode om, bool validate)
     {
         constexpr std::size_t bufferSize = MiB;
 
@@ -7113,7 +7731,14 @@ namespace binpack
                 break;
             }
 
-            emitPlainEntry(buffer, packedSfenValueToTrainingDataEntry(psv));
+            auto e = packedSfenValueToTrainingDataEntry(psv);
+            if (validate && !e.isValid())
+            {
+                std::cerr << "Illegal move " << chess::uci::moveToUci(e.pos, e.move) << " for position " << e.pos.fen() << '\n';
+                return;
+            }
+
+            emitPlainEntry(buffer, e);
 
             ++numProcessedPositions;
 
@@ -7138,7 +7763,7 @@ namespace binpack
         std::cout << "Finished. Converted " << numProcessedPositions << " positions.\n";
     }
 
-    inline void convertPlainToBin(std::string inputPath, std::string outputPath, std::ios_base::openmode om)
+    inline void convertPlainToBin(std::string inputPath, std::string outputPath, std::ios_base::openmode om, bool validate)
     {
         constexpr std::size_t bufferSize = MiB;
 
@@ -7169,6 +7794,11 @@ namespace binpack
             if (key == "e"sv)
             {
                 e.move = chess::uci::uciToMove(e.pos, move);
+                if (validate && !e.isValid())
+                {
+                    std::cerr << "Illegal move " << chess::uci::moveToUci(e.pos, e.move) << " for position " << e.pos.fen() << '\n';
+                    return;
+                }
 
                 emitBinEntry(buffer, e);
 
diff --git a/src/learn/convert.cpp b/src/learn/convert.cpp
index dfd30509..5fe7ea1d 100644
--- a/src/learn/convert.cpp
+++ b/src/learn/convert.cpp
@@ -525,7 +525,7 @@ namespace Learner
             && ends_with(output_path, expected_output_extension);
     }
 
-    using ConvertFunctionType = void(std::string inputPath, std::string outputPath, std::ios_base::openmode om);
+    using ConvertFunctionType = void(std::string inputPath, std::string outputPath, std::ios_base::openmode om, bool validate);
 
     static ConvertFunctionType* get_convert_function(const std::string& input_path, const std::string& output_path)
     {
@@ -547,7 +547,7 @@ namespace Learner
         return nullptr;
     }
 
-    static void convert(const std::string& input_path, const std::string& output_path, std::ios_base::openmode om)
+    static void convert(const std::string& input_path, const std::string& output_path, std::ios_base::openmode om, bool validate)
     {
         if(!file_exists(input_path))
         {
@@ -558,7 +558,7 @@ namespace Learner
         auto func = get_convert_function(input_path, output_path);
         if (func != nullptr)
         {
-            func(input_path, output_path, om);
+            func(input_path, output_path, om, validate);
         }
         else
         {
@@ -568,20 +568,22 @@ namespace Learner
 
     static void convert(const std::vector<std::string>& args)
     {
-        if (args.size() < 2 || args.size() > 3)
+        if (args.size() < 2 || args.size() > 4)
         {
             std::cerr << "Invalid arguments.\n";
-            std::cerr << "Usage: convert from_path to_path [append]\n";
+            std::cerr << "Usage: convert from_path to_path [append] [validate]\n";
             return;
         }
 
-        const bool append = (args.size() == 3) && (args[2] == "append");
+        const bool append = std::find(args.begin() + 2, args.end(), "append") != args.end();
+        const bool validate = std::find(args.begin() + 2, args.end(), "validate") != args.end();
+
         const std::ios_base::openmode openmode =
             append
             ? std::ios_base::app
             : std::ios_base::trunc;
 
-        convert(args[0], args[1], openmode);
+        convert(args[0], args[1], openmode, validate);
     }
 
     void convert(istringstream& is)

From 8069963c56df0c0bb9fc785fe9d688f19f11c706 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 4 Nov 2020 20:23:36 +0100
Subject: [PATCH 425/583] Update convert docs.

---
 docs/convert.md | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/docs/convert.md b/docs/convert.md
index 2e07ec52..132f66e0 100644
--- a/docs/convert.md
+++ b/docs/convert.md
@@ -6,10 +6,13 @@ As all commands in stockfish `convert` can be invoked either from command line (
 
 The syntax of this command is as follows:
 ```
-convert from_path to_path [append]
+convert from_path to_path [append] [validate]
 ```
 
 `from_path` is the path to the file to convert from. The type of the data is deduced based on its extension (one of `.plain`, `.bin`, `.binpack`).
 `to_path` is the path to an output file. The type of the data is deduced from its extension. If the file does not exist it is created.
 
-The last argument is optional. If not specified then the output file will be truncated prior to any writes. If the last argument is `append` then the converted training data will be appended to the end of the output file.
\ No newline at end of file
+`append` and `validate` can come in any order and are optional.
+If `append` not specified then the output file will be truncated prior to any writes. If `append` is specified then the converted training data will be appended to the end of the output file.
+
+If `validate` is specified then the conversion will stop on the first illegal move found and a diagnostic will be shown.
\ No newline at end of file

From 2a8576b80445afa60faef6f16d024d50a49ffd05 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 9 Nov 2020 19:08:28 +0100
Subject: [PATCH 426/583] Fix compilation issues.

---
 src/extra/nnue_data_binpack_format.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index ceb5c415..440ae885 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -5629,13 +5629,13 @@ namespace chess
                     king ^= CastlingTraits::kingDestination[attackerColor][castleType];
                     rooks ^= move.to;
                     rooks ^= CastlingTraits::rookDestination[attackerColor][castleType];
-
-                    break;
                 }
                 else
                 {
                     king ^= occupiedChange;
                 }
+
+                break;
             }
             case PieceType::None:
                 assert(false);

From a71623f74c7056242ce2d152613fe90fa0aa9ff8 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 13 Nov 2020 10:56:52 +0100
Subject: [PATCH 427/583] Add explicit read head seek to the start of the
 binpack file. Otherwise on MACOS the read head is placed at the end when app
 is specified.

---
 src/extra/nnue_data_binpack_format.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index 440ae885..31c6f7bb 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -6717,6 +6717,9 @@ namespace binpack
             m_path(std::move(path)),
             m_file(m_path, std::ios_base::binary | std::ios_base::in | std::ios_base::out | om)
         {
+            // Necessary for MAC because app mode makes it put the reading
+            // head at the end.
+            m_file.seekg(0);
         }
 
         void append(const char* data, std::uint32_t size)

From 69bc3ef9be0592627908877a1c2d3b2eb2131776 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 12 Nov 2020 22:24:59 +0100
Subject: [PATCH 428/583] Output loss more often.

---
 src/learn/learn.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 66461cc5..317f6da0 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -793,6 +793,8 @@ namespace Learner
 
             out << "  - norm = " << sum_norm << endl;
             out << "  - move accuracy = " << (move_accord_count * 100.0 / psv.size()) << "%" << endl;
+            out << "  - loss (current) = " << (test_loss_sum.cross_entropy - test_loss_sum.entropy) / psv.size() << endl;
+            out << "  - loss (average) = " << latest_loss_sum / latest_loss_count << endl;
         }
         else
         {

From 4e1653d53a44affe6f5a56ae4f9cb737df861f2e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 13 Nov 2020 11:05:07 +0100
Subject: [PATCH 429/583] Fix reliance on transitive includes for factorizers
 in trainer feature transformer. Add a file that includes all factorizers.

---
 src/nnue/evaluate_nnue_learner.cpp             |  5 ++---
 src/nnue/trainer/features/all_factorizers.h    | 10 ++++++++++
 src/nnue/trainer/trainer_feature_transformer.h |  2 +-
 3 files changed, 13 insertions(+), 4 deletions(-)
 create mode 100644 src/nnue/trainer/features/all_factorizers.h

diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 6294865d..43282494 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -5,9 +5,8 @@
 #include "evaluate_nnue.h"
 #include "evaluate_nnue_learner.h"
 
-#include "trainer/features/factorizer_feature_set.h"
-#include "trainer/features/factorizer_half_kp.h"
-#include "trainer/features/factorizer_half_ka.h"
+#include "trainer/features/all_factorizers.h"
+
 #include "trainer/trainer_feature_transformer.h"
 #include "trainer/trainer_input_slice.h"
 #include "trainer/trainer_affine_transform.h"
diff --git a/src/nnue/trainer/features/all_factorizers.h b/src/nnue/trainer/features/all_factorizers.h
new file mode 100644
index 00000000..75d62ec8
--- /dev/null
+++ b/src/nnue/trainer/features/all_factorizers.h
@@ -0,0 +1,10 @@
+#ifndef _NNUE_TRAINER_FEATURES_ALL_FACTORIZERS_H_
+#define _NNUE_TRAINER_FEATURES_ALL_FACTORIZERS_H_
+
+#include "factorizer.h"
+#include "factorizer_feature_set.h"
+
+#include "factorizer_half_kp.h"
+#include "factorizer_half_ka.h"
+
+#endif
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 419cdf5e..80f914f2 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -5,7 +5,7 @@
 
 #include "extra/stockfish_blas.h"
 
-#include "features/factorizer_feature_set.h"
+#include "features/all_factorizers.h"
 
 #include "learn/learn.h"
 

From 691da3bdad9890cf7b2ae4f279a264dba7104c0a Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 13 Nov 2020 11:14:19 +0100
Subject: [PATCH 430/583] Add more information for factorizers at the start of
 training.

---
 src/nnue/trainer/features/factorizer.h             |  2 +-
 src/nnue/trainer/features/factorizer_feature_set.h |  2 +-
 src/nnue/trainer/features/factorizer_half_ka.h     | 14 +++++++-------
 src/nnue/trainer/features/factorizer_half_kp.h     |  2 +-
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/nnue/trainer/features/factorizer.h b/src/nnue/trainer/features/factorizer.h
index 15ce8022..b64b0c74 100644
--- a/src/nnue/trainer/features/factorizer.h
+++ b/src/nnue/trainer/features/factorizer.h
@@ -14,7 +14,7 @@ namespace Eval::NNUE::Features {
     class Factorizer {
     public:
         static constexpr std::string get_name() {
-            return std::string("No factorizer");
+            return "Factorizer<" + FeatureType::get_name() + "> -> " + std::string("No factorizer");
         }
 
         static constexpr std::string get_factorizers_string() {
diff --git a/src/nnue/trainer/features/factorizer_feature_set.h b/src/nnue/trainer/features/factorizer_feature_set.h
index f5ee3c5c..60f42166 100644
--- a/src/nnue/trainer/features/factorizer_feature_set.h
+++ b/src/nnue/trainer/features/factorizer_feature_set.h
@@ -82,7 +82,7 @@ namespace Eval::NNUE::Features {
         static constexpr IndexType kBaseDimensions = FeatureType::kDimensions;
 
         static constexpr std::string get_name() {
-            return FeatureType::kName;
+            return Factorizer<FeatureType>::get_name();
         }
 
         static constexpr std::string get_factorizers_string() {
diff --git a/src/nnue/trainer/features/factorizer_half_ka.h b/src/nnue/trainer/features/factorizer_half_ka.h
index 90bd9d97..36d36a2d 100644
--- a/src/nnue/trainer/features/factorizer_half_ka.h
+++ b/src/nnue/trainer/features/factorizer_half_ka.h
@@ -31,11 +31,11 @@ namespace Eval::NNUE::Features {
 
         // Learning feature information
         static constexpr FeatureProperties kProperties[] = {
-            // kFeaturesHalfKPK
+            // kFeaturesHalfA
             {true, FeatureType::kDimensions},
-            // kFeaturesPK
+            // kFeaturesA
             {true, Factorizer<A>::get_dimensions()},
-            // kFeaturesHalfRelativeKPK
+            // kFeaturesHalfRelativeKA
             {true, Factorizer<HalfRelativeKA<AssociatedKing>>::get_dimensions()},
         };
 
@@ -43,7 +43,7 @@ namespace Eval::NNUE::Features {
 
     public:
         static constexpr std::string get_name() {
-            return std::string("Factorizer<") + FeatureType::kName + ">";
+            return std::string("Factorizer<") + FeatureType::kName + "> -> " + "A, HalfRelativeKA";
         }
 
         static constexpr std::string get_factorizers_string() {
@@ -59,18 +59,18 @@ namespace Eval::NNUE::Features {
         static void append_training_features(
             IndexType base_index, std::vector<TrainingFeature>* training_features) {
 
-            // kFeaturesHalfKPK
+            // kFeaturesHalfA
             IndexType index_offset = append_base_feature<FeatureType>(
                 kProperties[kFeaturesHalfKA], base_index, training_features);
 
             const auto sq_k = static_cast<Square>(base_index / PS_END2);
             const auto a = static_cast<IndexType>(base_index % PS_END2);
 
-            // kFeaturesPK
+            // kFeaturesA
             index_offset += inherit_features_if_required<A>(
                 index_offset, kProperties[kFeaturesA], a, training_features);
 
-            // kFeaturesHalfRelativeKPK
+            // kFeaturesHalfRelativeKA
             if (a >= PS_W_PAWN) {
                 index_offset += inherit_features_if_required<HalfRelativeKA<AssociatedKing>>(
                     index_offset, kProperties[kFeaturesHalfRelativeKA],
diff --git a/src/nnue/trainer/features/factorizer_half_kp.h b/src/nnue/trainer/features/factorizer_half_kp.h
index 601ddfa5..c554f0fc 100644
--- a/src/nnue/trainer/features/factorizer_half_kp.h
+++ b/src/nnue/trainer/features/factorizer_half_kp.h
@@ -46,7 +46,7 @@ namespace Eval::NNUE::Features {
 
     public:
         static constexpr std::string get_name() {
-            return std::string("Factorizer<") + FeatureType::kName + ">";
+            return std::string("Factorizer<") + FeatureType::kName + "> -> " + "HalfK, P, HalfRelativeKP";
         }
 
         static constexpr std::string get_factorizers_string() {

From 9b930023fb42589d2169f9ef77670206c539d76e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <ts.tomeksopel@gmail.com>
Date: Sat, 14 Nov 2020 15:32:06 +0100
Subject: [PATCH 431/583] Fix default value for batchsize in learn docs.

---
 docs/learn.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/learn.md b/docs/learn.md
index 7051a173..037e149c 100644
--- a/docs/learn.md
+++ b/docs/learn.md
@@ -16,7 +16,7 @@ Currently the following options are available:
 
 `set_recommended_uci_options` - this is a modifier not a parameter, no value follows it. If specified then some UCI options are set to recommended values.
 
-`bat` - the size of a batch in multiples of 10000. This determines how many entries are read and shuffled at once during training. Default: 1000 (meaning batch size of 1000000).
+`bat` - the size of a batch in multiples of 10000. This determines how many entries are read and shuffled at once during training. Default: 100 (meaning batch size of 1000000).
 
 `targetdir` - path to the direction from which training data will be read. All files in this directory are read sequentially. If not specified then only the list of files from positional arguments will be used. If specified then files from the given directory will be used after the explicitly specified files.
 
@@ -24,7 +24,7 @@ Currently the following options are available:
 
 `basedir` - the base directory for the paths. Default: "" (current directory)
 
-`batchsize` - same as `bat` but doesn't scale by 10000
+`batchsize` - same as `bat` but doesn't scale by 10000. Default: 1000000
 
 `lr` - initial learning rate. Default: 1.
 
@@ -105,4 +105,4 @@ Currently the following options are available:
 `buffer_size`
 `shuffleq`
 `shufflem`
-`output_file_name`
\ No newline at end of file
+`output_file_name`

From 00797a3d86976e4e91e7bc76509b4c305ce23e3f Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 9 Nov 2020 19:21:27 +0100
Subject: [PATCH 432/583] add option `ensure_quiet` for gensfen that makes the
 generated position quiet

---
 src/learn/gensfen.cpp | 93 +++++++++++++++++++++++++++++++++++++++----
 1 file changed, 86 insertions(+), 7 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 4accb882..e1aec654 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -94,6 +94,8 @@ namespace Learner
             bool detect_draw_by_consecutive_low_score = true;
             bool detect_draw_by_insufficient_mating_material = true;
 
+            bool ensure_quiet = false;
+
             uint64_t num_threads;
 
             void enforce_constraints()
@@ -316,19 +318,86 @@ namespace Learner
                 // Discard stuff before write_minply is reached
                 // because it can harm training due to overfitting.
                 // Initial positions would be too common.
-                if (ply >= params.write_minply && !was_seen_before(pos))
+                if (ply >= params.write_minply)
                 {
                     packed_sfens.emplace_back(PackedSfenValue());
 
                     auto& psv = packed_sfens.back();
 
-                    // Here we only write the position data.
-                    // Result is added after the whole game is done.
-                    pos.sfen_pack(psv.sfen);
+                    if (params.ensure_quiet)
+                    {
+                        auto [qsearch_value, qsearch_pv] = Search::qsearch(pos);
+                        if (qsearch_pv.empty())
+                        {
+                            // Here we only write the position data.
+                            // Result is added after the whole game is done.
+                            pos.sfen_pack(psv.sfen);
 
-                    psv.score = search_value;
-                    psv.gamePly = ply;
-                    psv.move = search_pv[0];
+                            // Already a quiet position
+                            psv.score = search_value;
+                            psv.move = search_pv[0];
+                            psv.gamePly = ply;
+                        }
+                        else
+                        {
+                            // Navigate to a quiet
+                            int old_ply = ply;
+                            for (auto m : qsearch_pv)
+                            {
+                                pos.do_move(m, states[ply++]);
+                            }
+
+                            if (was_seen_before(pos))
+                            {
+                                // Just skip the move.
+                                packed_sfens.pop_back();
+                            }
+                            else
+                            {
+                                // Reevaluate
+                                auto [quiet_search_value, quiet_search_pv] = Search::search(pos, depth, 1, params.nodes);
+                                if (quiet_search_pv.empty())
+                                {
+                                    // Just skip the move.
+                                    packed_sfens.pop_back();
+                                }
+                                else
+                                {
+                                    // Here we only write the position data.
+                                    // Result is added after the whole game is done.
+                                    pos.sfen_pack(psv.sfen);
+
+                                    psv.score = quiet_search_value;
+                                    psv.move = quiet_search_pv[0];
+                                    psv.gamePly = ply;
+                                }
+                            }
+
+                            // Get back to the game
+                            for (auto it = qsearch_pv.rbegin(); it != qsearch_pv.rend(); ++it)
+                            {
+                                pos.undo_move(*it);
+                            }
+                            ply = old_ply;
+                        }
+                    }
+                    else
+                    {
+                        if (was_seen_before(pos))
+                        {
+                            packed_sfens.pop_back();
+                        }
+                        else
+                        {
+                            // Here we only write the position data.
+                            // Result is added after the whole game is done.
+                            pos.sfen_pack(psv.sfen);
+
+                            psv.score = search_value;
+                            psv.move = search_pv[0];
+                            psv.gamePly = ply;
+                        }
+                    }
                 }
 
                 // Update the next move according to best search result or random move.
@@ -777,6 +846,10 @@ namespace Learner
                 UCI::setoption("PruneAtShallowDepth", "false");
                 UCI::setoption("EnableTranspositionTable", "true");
             }
+            else if (token == "ensure_quiet")
+            {
+                params.ensure_quiet = true;
+            }
             else
                 cout << "ERROR: Ignoring unknown option " << token << endl;
         }
@@ -791,6 +864,12 @@ namespace Learner
                 cout << "WARNING: Unknown sfen format `" << sfen_format << "`. Using bin\n";
         }
 
+        if (params.ensure_quiet)
+        {
+            // Otherwise we can't ensure quiet positions...
+            UCI::setoption("EnableTranspositionTable", "false");
+        }
+
         if (random_file_name)
         {
             // Give a random number to output_file_name at this point.

From 00bc80c3c4d162ebfdc4176c9b01e9124c0a473a Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 9 Nov 2020 19:27:53 +0100
Subject: [PATCH 433/583] Add `assume_quiet` option to the learner.

---
 src/learn/learn.cpp | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 317f6da0..7f18ff28 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -397,6 +397,8 @@ namespace Learner
             bool use_draw_games_in_validation = true;
             bool skip_duplicated_positions_in_training = true;
 
+            bool assume_quiet = false;
+
             double learning_rate = 1.0;
 
             string validation_set_file_name;
@@ -676,19 +678,22 @@ namespace Learner
                 goto RETRY_READ;
             }
 
-            int ply = 0;
-            pos.do_move((Move)ps.move, state[ply++]);
-
-            // We want to position being trained on not to be terminal
-            if (MoveList<LEGAL>(pos).size() == 0)
-                goto RETRY_READ;
-
-            // Evaluation value of shallow search (qsearch)
-            const auto [_, pv] = Search::qsearch(pos);
-
-            for (auto m : pv)
+            if (!params.assume_quiet)
             {
-                pos.do_move(m, state[ply++]);
+                int ply = 0;
+                pos.do_move((Move)ps.move, state[ply++]);
+
+                // We want to position being trained on not to be terminal
+                if (MoveList<LEGAL>(pos).size() == 0)
+                    goto RETRY_READ;
+
+                // Evaluation value of shallow search (qsearch)
+                const auto [_, pv] = Search::qsearch(pos);
+
+                for (auto m : pv)
+                {
+                    pos.do_move(m, state[ply++]);
+                }
             }
 
             // Since we have reached the end phase of PV, add the slope here.
@@ -1106,6 +1111,7 @@ namespace Learner
                 UCI::setoption("EnableTranspositionTable", "false");
             }
             else if (option == "verbose") params.verbose = true;
+            else if (option == "assume_quiet") params.assume_quiet = true;
             else
             {
                 out << "INFO: Unknown option: " << option << ". Ignoring.\n";

From 50358e26c77a7c51ea1c17a948c36b09cb18239d Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 13 Nov 2020 22:28:28 +0100
Subject: [PATCH 434/583] Fix searching terminal nodes in gensfen.

---
 src/search.cpp | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index 436e11fd..1aa86bf3 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1976,7 +1976,7 @@ namespace Search
 
   // Initialization for learning.
   // Called from Learner::search(),Learner::qsearch().
-  static void init_for_search(Position& pos, Stack* ss)
+  static bool init_for_search(Position& pos, Stack* ss)
   {
 
     // RootNode requires ss->ply == 0.
@@ -2026,7 +2026,10 @@ namespace Search
       for (auto m: MoveList<LEGAL>(pos))
         rootMoves.push_back(Search::RootMove(m));
 
-      assert(!rootMoves.empty());
+      // Check if we're at a terminal node. Otherwise we end up returning
+      // malformed PV later on.
+      if (rootMoves.empty())
+        return false;
 
       th->UseRule50 = bool(Options["Syzygy50MoveRule"]);
       th->ProbeDepth = int(Options["SyzygyProbeDepth"]);
@@ -2042,6 +2045,8 @@ namespace Search
 
       Tablebases::rank_root_moves(pos, rootMoves);
     }
+
+    return true;
   }
 
   // Stationary search.
@@ -2061,7 +2066,9 @@ namespace Search
     Stack stack[MAX_PLY+10], *ss = stack+7;
     Move  pv[MAX_PLY+1];
 
-    init_for_search(pos, ss);
+    if (!init_for_search(pos, ss))
+      return {};
+
     ss->pv = pv; // For the time being, it must be a dummy and somewhere with a buffer.
 
     if (pos.is_draw(0)) {
@@ -2116,7 +2123,8 @@ namespace Search
     Stack stack[MAX_PLY + 10], * ss = stack + 7;
     Move pv[MAX_PLY + 1];
 
-    init_for_search(pos, ss);
+    if (!init_for_search(pos, ss))
+      return {};
 
 	ss->pv = pv; // For the time being, it must be a dummy and somewhere with a buffer.
 

From 3dbc45bdfc232b549c28269896ccc3760f937378 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 16 Nov 2020 00:41:05 +0100
Subject: [PATCH 435/583] Add gradient clipping.

---
 src/learn/learn.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 7f18ff28..3942b606 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -93,6 +93,8 @@ namespace Learner
     static double elmo_lambda_high = 1.0;
     static double elmo_lambda_limit = 32000;
 
+    static double max_grad = 1.0;
+
     // Using stockfish's WDL with win rate model instead of sigmoid
     static bool use_wdl = false;
 
@@ -315,7 +317,7 @@ namespace Learner
             grad = lambda * (q - p) + (1.0 - lambda) * (q - t);
         }
 
-        return grad;
+        return std::clamp(grad, -max_grad, max_grad);
     }
 
     // Calculate cross entropy during learning
@@ -1072,6 +1074,7 @@ namespace Learner
             else if (option == "lambda") is >> elmo_lambda_low;
             else if (option == "lambda2") is >> elmo_lambda_high;
             else if (option == "lambda_limit") is >> elmo_lambda_limit;
+            else if (option == "max_grad") is >> max_grad;
 
             else if (option == "reduction_gameply") is >> params.reduction_gameply;
 
@@ -1175,6 +1178,7 @@ namespace Learner
         out << "  - elmo_lambda_low          : " << elmo_lambda_low << endl;
         out << "  - elmo_lambda_high         : " << elmo_lambda_high << endl;
         out << "  - elmo_lambda_limit        : " << elmo_lambda_limit << endl;
+        out << "  - max_grad                 : " << max_grad << endl;
         out << "  - eval_save_interval       : " << params.eval_save_interval << " sfens" << endl;
         out << "  - loss_output_interval     : " << params.loss_output_interval << " sfens" << endl;
 

From d793663188729272cd0b8ecb8a841f7b50ea8345 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 16 Nov 2020 00:43:43 +0100
Subject: [PATCH 436/583] Add docs for max_grad option for learn

---
 docs/learn.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/learn.md b/docs/learn.md
index 037e149c..6de81521 100644
--- a/docs/learn.md
+++ b/docs/learn.md
@@ -50,6 +50,8 @@ Currently the following options are available:
 
 `lambda_limit` - the maximum absolute score value for which `lambda` is used as opposed to `lambda2`. For positions with absolute evaluation higher than `lambda_limit` `lambda2` will be used. Default: 32000 (so always `lambda`).
 
+`max_grad` - the maximum allowed loss gradient for backpropagation. Effectively a form of gradient clipping. Useful for the first iterations with a randomly generated net as with higher lr backpropagation often overshoots and kills the net. The default value is fairly conservative, values as low as 0.25 could be used with lr of 1.0 without problems. Default: 1.0.
+
 `reduction_gameply` - the minimum ply after which positions won't be skipped. Positions at plies below this value are skipped with a probability that lessens linearly with the ply (reaching 0 at `reduction_gameply`). Default: 1.
 
 `eval_limit` - positions with absolute evaluation higher than this will be skipped. Default: 32000 (nothing is skipped).

From d4350a16f32eaedf0e5bd207a4e1293a7a4e1f2c Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 16 Nov 2020 11:32:42 +0100
Subject: [PATCH 437/583] Add representation of an opening book.

---
 src/Makefile               |  1 +
 src/learn/opening_book.cpp | 43 +++++++++++++++++++++++++++++
 src/learn/opening_book.h   | 56 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 100 insertions(+)
 create mode 100644 src/learn/opening_book.cpp
 create mode 100644 src/learn/opening_book.h

diff --git a/src/Makefile b/src/Makefile
index cba4e351..51a9654a 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -63,6 +63,7 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
 	learn/sfen_packer.cpp \
 	learn/learn.cpp \
 	learn/gensfen.cpp \
+	learn/opening_book.cpp \
 	learn/convert.cpp
 
 OBJS = $(notdir $(SRCS:.cpp=.o))
diff --git a/src/learn/opening_book.cpp b/src/learn/opening_book.cpp
new file mode 100644
index 00000000..fb569bda
--- /dev/null
+++ b/src/learn/opening_book.cpp
@@ -0,0 +1,43 @@
+#include "opening_book.h"
+
+#include <fstream>
+
+namespace Learner {
+
+    EpdOpeningBook::EpdOpeningBook(const std::string& file, PRNG& prng) :
+        OpeningBook(file)
+    {
+        std::ifstream in(file);
+        if (!in)
+        {
+            return;
+        }
+
+        std::string line;
+        while (std::getline(in, line))
+        {
+            if (line.empty())
+                continue;
+
+            fens.emplace_back(line);
+        }
+
+        Algo::shuffle(fens, prng);
+    }
+
+    static bool ends_with(const std::string& lhs, const std::string& end)
+    {
+        if (end.size() > lhs.size()) return false;
+
+        return std::equal(end.rbegin(), end.rend(), lhs.rbegin());
+    }
+
+    std::unique_ptr<OpeningBook> open_opening_book(const std::string& filename, PRNG& prng)
+    {
+        if (ends_with(filename, ".epd"))
+            return std::make_unique<EpdOpeningBook>(filename, prng);
+
+        return nullptr;
+    }
+
+}
diff --git a/src/learn/opening_book.h b/src/learn/opening_book.h
new file mode 100644
index 00000000..16207f13
--- /dev/null
+++ b/src/learn/opening_book.h
@@ -0,0 +1,56 @@
+#ifndef LEARN_OPENING_BOOK_H
+#define LEARN_OPENING_BOOK_H
+
+#include "misc.h"
+#include "position.h"
+#include "thread.h"
+
+#include <vector>
+#include <random>
+#include <optional>
+#include <string>
+#include <cstdint>
+#include <memory>
+
+namespace Learner {
+
+    struct OpeningBook {
+
+        const std::string& next_fen()
+        {
+            assert(fens.size() > 0);
+
+            auto& fen = fens[current_index++];
+            if (current_index >= fens.size())
+                current_index = 0;
+
+            return fen;
+        }
+
+        std::size_t size() const { return fens.size(); }
+
+        const std::string& get_filename() const { return filename; }
+
+    protected:
+        OpeningBook(const std::string& file) :
+            filename(file),
+            current_index(0)
+        {
+        }
+
+
+        std::string filename;
+        std::vector<std::string> fens;
+        std::size_t current_index;
+    };
+
+    struct EpdOpeningBook : OpeningBook {
+
+        EpdOpeningBook(const std::string& file, PRNG& prng);
+    };
+
+    std::unique_ptr<OpeningBook> open_opening_book(const std::string& filename, PRNG& prng);
+
+}
+
+#endif

From e1dbad47cef574dc39d9d768308c9bf96de95c5b Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 16 Nov 2020 14:13:34 +0100
Subject: [PATCH 438/583] Add support for opening book to gensfen.

---
 src/learn/gensfen.cpp | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index e1aec654..b265da71 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -2,6 +2,7 @@
 
 #include "sfen_writer.h"
 #include "packed_sfen.h"
+#include "opening_book.h"
 
 #include "misc.h"
 #include "position.h"
@@ -98,6 +99,8 @@ namespace Learner
 
             uint64_t num_threads;
 
+            std::string book;
+
             void enforce_constraints()
             {
                 search_depth_max = std::max(search_depth_min, search_depth_max);
@@ -130,6 +133,15 @@ namespace Learner
         {
             hash.resize(GENSFEN_HASH_SIZE);
 
+            if (!prm.book.empty())
+            {
+                opening_book = open_opening_book(prm.book, prng);
+                if (opening_book == nullptr)
+                {
+                    std::cout << "WARNING: Failed to open opening book " << prm.book << ". Falling back to startpos.\n";
+                }
+            }
+
             // Output seed to veryfy by the user if it's not identical by chance.
             std::cout << prng << std::endl;
         }
@@ -151,6 +163,8 @@ namespace Learner
 
         vector<Key> hash; // 64MB*sizeof(HASH_KEY) = 512MB
 
+        std::unique_ptr<OpeningBook> opening_book;
+
         static void set_gensfen_search_limits();
 
         void generate_worker(
@@ -248,7 +262,15 @@ namespace Learner
             // When parallelizing, Threads (since this is a vector<Thread*>,
             // Do the same for up to Threads[0]...Threads[thread_num-1].
             auto& pos = th.rootPos;
-            pos.set(StartFEN, false, &si, &th);
+            if (opening_book != nullptr)
+            {
+                auto& fen = opening_book->next_fen();
+                pos.set(fen, false, &si, &th);
+            }
+            else
+            {
+                pos.set(StartFEN, false, &si, &th);
+            }
 
             int resign_counter = 0;
             bool should_resign = prng.rand(10) > 1;
@@ -822,6 +844,8 @@ namespace Learner
                 is >> params.write_maxply;
             else if (token == "save_every")
                 is >> params.save_every;
+            else if (token == "book")
+                is >> params.book;
             else if (token == "random_file_name")
                 is >> random_file_name;
             // Accept also the old option name.
@@ -911,6 +935,7 @@ namespace Learner
             << "  - random_multi_pv_depth  = " << params.random_multi_pv_depth << endl
             << "  - write_minply           = " << params.write_minply << endl
             << "  - write_maxply           = " << params.write_maxply << endl
+            << "  - book                   = " << params.book << endl
             << "  - output_file_name       = " << params.output_file_name << endl
             << "  - save_every             = " << params.save_every << endl
             << "  - random_file_name       = " << random_file_name << endl

From 5f18c88b3d5683321d4a84d948e94053770f6291 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 16 Nov 2020 14:14:39 +0100
Subject: [PATCH 439/583] Docs for book in gensfen.

---
 docs/gensfen.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/gensfen.md b/docs/gensfen.md
index ce0f365c..16fead59 100644
--- a/docs/gensfen.md
+++ b/docs/gensfen.md
@@ -44,6 +44,8 @@ Currently the following options are available:
 
 `write_maxply` - maximum ply for which the training data entry will be emitted. Default: 400.
 
+`book` - a path to an opening book to use for the starting positions. Currently only .epd format is supported. If not specified then the starting position is always the standard chess starting position.
+
 `save_every` - the number of training data entries per file. If not specified then there will be always one file. If specified there may be more than one file generated (each having at most `save_every` training data entries) and each file will have a unique number attached.
 
 `random_file_name` - if specified then the output filename will be chosen randomly. Overrides `output_file_name`.

From d9dcdc2b73314ec7507801e1e23562cd9d49f4b2 Mon Sep 17 00:00:00 2001
From: JWmer <26392242+NightlyKing@users.noreply.github.com>
Date: Fri, 20 Nov 2020 15:06:14 +0100
Subject: [PATCH 440/583] Delete k-p_256x2-32-32.h

---
 src/nnue/architectures/k-p_256x2-32-32.h | 35 ------------------------
 1 file changed, 35 deletions(-)
 delete mode 100644 src/nnue/architectures/k-p_256x2-32-32.h

diff --git a/src/nnue/architectures/k-p_256x2-32-32.h b/src/nnue/architectures/k-p_256x2-32-32.h
deleted file mode 100644
index 92c9efcd..00000000
--- a/src/nnue/architectures/k-p_256x2-32-32.h
+++ /dev/null
@@ -1,35 +0,0 @@
-﻿// Definition of input features and network structure used in NNUE evaluation function
-
-#ifndef K_P_256X2_32_32_H
-#define K_P_256X2_32_32_H
-
-#include "nnue/features/feature_set.h"
-#include "nnue/features/k.h"
-#include "nnue/features/p.h"
-
-#include "nnue/layers/input_slice.h"
-#include "nnue/layers/affine_transform.h"
-#include "nnue/layers/clipped_relu.h"
-
-namespace Eval::NNUE {
-
-    // Input features used in evaluation function
-    using RawFeatures = Features::FeatureSet<Features::K, Features::P>;
-
-    // Number of input feature dimensions after conversion
-    constexpr IndexType kTransformedFeatureDimensions = 256;
-
-    namespace Layers {
-
-        // define network structure
-        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
-
-    }  // namespace Layers
-
-    using Network = Layers::OutputLayer;
-
-}  // namespace Eval::NNUE
-#endif // K_P_256X2_32_32_H

From 72fee2f7a41d9c265a8d214c814dee1afb18e67c Mon Sep 17 00:00:00 2001
From: JWmer <26392242+NightlyKing@users.noreply.github.com>
Date: Fri, 20 Nov 2020 15:06:32 +0100
Subject: [PATCH 441/583] Delete k-p-cr_256x2-32-32.h

---
 src/nnue/architectures/k-p-cr_256x2-32-32.h | 37 ---------------------
 1 file changed, 37 deletions(-)
 delete mode 100644 src/nnue/architectures/k-p-cr_256x2-32-32.h

diff --git a/src/nnue/architectures/k-p-cr_256x2-32-32.h b/src/nnue/architectures/k-p-cr_256x2-32-32.h
deleted file mode 100644
index 1db34b22..00000000
--- a/src/nnue/architectures/k-p-cr_256x2-32-32.h
+++ /dev/null
@@ -1,37 +0,0 @@
-// Definition of input features and network structure used in NNUE evaluation function
-
-#ifndef K_P_CR_256X2_32_32_H
-#define K_P_CR_256X2_32_32_H
-
-#include "nnue/features/feature_set.h"
-#include "nnue/features/k.h"
-#include "nnue/features/p.h"
-#include "nnue/features/castling_right.h"
-
-#include "nnue/layers/input_slice.h"
-#include "nnue/layers/affine_transform.h"
-#include "nnue/layers/clipped_relu.h"
-
-namespace Eval::NNUE {
-
-    // Input features used in evaluation function
-    using RawFeatures = Features::FeatureSet<Features::K, Features::P,
-        Features::CastlingRight>;
-
-    // Number of input feature dimensions after conversion
-    constexpr IndexType kTransformedFeatureDimensions = 256;
-
-    namespace Layers {
-
-        // define network structure
-        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
-
-    }  // namespace Layers
-
-    using Network = Layers::OutputLayer;
-
-}  // namespace Eval::NNUE
-#endif // K_P_CR_256X2_32_32_H

From b27c51b5cf1d8440270db19a2d3c105c7950c91f Mon Sep 17 00:00:00 2001
From: JWmer <26392242+NightlyKing@users.noreply.github.com>
Date: Fri, 20 Nov 2020 15:06:45 +0100
Subject: [PATCH 442/583] Delete k-p-cr-ep_256x2-32-32.h

---
 .../architectures/k-p-cr-ep_256x2-32-32.h     | 38 -------------------
 1 file changed, 38 deletions(-)
 delete mode 100644 src/nnue/architectures/k-p-cr-ep_256x2-32-32.h

diff --git a/src/nnue/architectures/k-p-cr-ep_256x2-32-32.h b/src/nnue/architectures/k-p-cr-ep_256x2-32-32.h
deleted file mode 100644
index 14eeba54..00000000
--- a/src/nnue/architectures/k-p-cr-ep_256x2-32-32.h
+++ /dev/null
@@ -1,38 +0,0 @@
-// Definition of input features and network structure used in NNUE evaluation function
-
-#ifndef K_P_CR_EP_256X2_32_32_H
-#define K_P_CR_EP_256X2_32_32_H
-
-#include "nnue/features/feature_set.h"
-#include "nnue/features/k.h"
-#include "nnue/features/p.h"
-#include "nnue/features/castling_right.h"
-#include "nnue/features/enpassant.h"
-
-#include "nnue/layers/input_slice.h"
-#include "nnue/layers/affine_transform.h"
-#include "nnue/layers/clipped_relu.h"
-
-namespace Eval::NNUE {
-
-    // Input features used in evaluation function
-    using RawFeatures = Features::FeatureSet<Features::K, Features::P,
-        Features::CastlingRight, Features::EnPassant>;
-
-    // Number of input feature dimensions after conversion
-    constexpr IndexType kTransformedFeatureDimensions = 256;
-
-    namespace Layers {
-
-        // define network structure
-        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
-
-    }  // namespace Layers
-
-    using Network = Layers::OutputLayer;
-
-}  // namespace Eval::NNUE
-#endif // K_P_CR_EP_256X2_32_32_H

From c04c5b6658b790c0cb75076517415200a32e3bba Mon Sep 17 00:00:00 2001
From: JWmer <26392242+NightlyKing@users.noreply.github.com>
Date: Fri, 20 Nov 2020 15:11:02 +0100
Subject: [PATCH 443/583] Update nnue_common.h

---
 src/nnue/nnue_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index bd4294a3..9bce9fe9 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -71,7 +71,7 @@
 namespace Eval::NNUE {
 
     // Version of the evaluation file
-    constexpr std::uint32_t kVersion = 0x7AF32F17u;
+    constexpr std::uint32_t kVersion = 0x7AF32F16u;
 
     // Constant used in evaluation value calculation
     constexpr int FV_SCALE = 16;

From 5b3e9b0eb31b7b0ebf7031bedf7f8a11a9763483 Mon Sep 17 00:00:00 2001
From: JWmer <26392242+NightlyKing@users.noreply.github.com>
Date: Fri, 20 Nov 2020 15:15:25 +0100
Subject: [PATCH 444/583] Update p.cpp

---
 src/nnue/features/p.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/nnue/features/p.cpp b/src/nnue/features/p.cpp
index 1621e8b2..a17e304f 100644
--- a/src/nnue/features/p.cpp
+++ b/src/nnue/features/p.cpp
@@ -4,9 +4,11 @@
 //Definition of input feature P of NNUE evaluation function
 namespace Eval::NNUE::Features {
 
-    // Orient a square according to perspective (flip rank for black)
+    // Orient a square according to perspective (rotate the board 180° for black)
+    // this has to stay until we find a better arch that works with "flip".
+    // allows us to use current master net for gensfen (primarily needed for higher quality data)
     inline Square orient(Color perspective, Square s) {
-        return Square(int(s) ^ (bool(perspective) * SQ_A8));
+        return Square(int(s) ^ (bool(perspective) * 63));
     }
 
     // Find the index of the feature quantity from the king position and PieceSquare

From 36c801699f2b8f8243a587e3b37ba0f24ba86776 Mon Sep 17 00:00:00 2001
From: JWmer <26392242+NightlyKing@users.noreply.github.com>
Date: Fri, 20 Nov 2020 15:15:54 +0100
Subject: [PATCH 445/583] Update k.cpp

---
 src/nnue/features/k.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/nnue/features/k.cpp b/src/nnue/features/k.cpp
index f01a6ce0..7b62a75a 100644
--- a/src/nnue/features/k.cpp
+++ b/src/nnue/features/k.cpp
@@ -4,9 +4,11 @@
 //Definition of input feature quantity K of NNUE evaluation function
 namespace Eval::NNUE::Features {
 
-    // Orient a square according to perspective (flip rank for black)
+    // Orient a square according to perspective (rotate the board 180° for black)
+    // this has to stay until we find a better arch that works with "flip".
+    // allows us to use current master net for gensfen (primarily needed for higher quality data)
     inline Square orient(Color perspective, Square s) {
-        return Square(int(s) ^ (bool(perspective) * SQ_A8));
+        return Square(int(s) ^ (bool(perspective) * 63));
     }
 
     // Index of a feature for a given king position.

From 021f47b00eaf407854f5074ed804abdd1be620c7 Mon Sep 17 00:00:00 2001
From: JWmer <26392242+NightlyKing@users.noreply.github.com>
Date: Fri, 20 Nov 2020 15:16:37 +0100
Subject: [PATCH 446/583] Update half_relative_kp.cpp

---
 src/nnue/features/half_relative_kp.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/nnue/features/half_relative_kp.cpp b/src/nnue/features/half_relative_kp.cpp
index 240e20c0..2ebccd59 100644
--- a/src/nnue/features/half_relative_kp.cpp
+++ b/src/nnue/features/half_relative_kp.cpp
@@ -4,9 +4,11 @@
 //Definition of input features HalfRelativeKP of NNUE evaluation function
 namespace Eval::NNUE::Features {
 
-    // Orient a square according to perspective (flip rank for black)
+    // Orient a square according to perspective (rotate the board 180° for black)
+    // this has to stay until we find a better arch that works with "flip".
+    // allows us to use current master net for gensfen (primarily needed for higher quality data)
     inline Square orient(Color perspective, Square s) {
-        return Square(int(s) ^ (bool(perspective) * SQ_A8));
+        return Square(int(s) ^ (bool(perspective) * 63));
     }
 
     // Find the index of the feature quantity from the ball position and PieceSquare

From be4cd561467362de6d4648d8413044e13314678b Mon Sep 17 00:00:00 2001
From: JWmer <26392242+NightlyKing@users.noreply.github.com>
Date: Fri, 20 Nov 2020 15:17:27 +0100
Subject: [PATCH 447/583] Update half_kp.cpp

---
 src/nnue/features/half_kp.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/nnue/features/half_kp.cpp b/src/nnue/features/half_kp.cpp
index 18e82004..743a6378 100644
--- a/src/nnue/features/half_kp.cpp
+++ b/src/nnue/features/half_kp.cpp
@@ -23,9 +23,11 @@
 
 namespace Eval::NNUE::Features {
 
-    // Orient a square according to perspective (flip rank for black)
+    // Orient a square according to perspective (rotate the board 180° for black)
+    // this has to stay until we find a better arch that works with "flip".
+    // allows us to use current master net for gensfen (primarily needed for higher quality data)
     inline Square orient(Color perspective, Square s) {
-        return Square(int(s) ^ (bool(perspective) * SQ_A8));
+        return Square(int(s) ^ (bool(perspective) * 63));
     }
 
     // Find the index of the feature quantity from the king position and PieceSquare

From f832aa6b6becb9b5b88e120b78db8936ef028962 Mon Sep 17 00:00:00 2001
From: JWmer <26392242+NightlyKing@users.noreply.github.com>
Date: Fri, 20 Nov 2020 15:19:36 +0100
Subject: [PATCH 448/583] Update evaluate.h

---
 src/evaluate.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/evaluate.h b/src/evaluate.h
index fc626698..f5d3efa7 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -32,7 +32,7 @@ namespace Eval {
   // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
   // for the build process (profile-build and fishtest) to work. Do not change the
   // name of the macro, as it is used in the Makefile.
-  #define EvalFileDefaultName   "nn-98a7585c85e9.nnue"
+  #define EvalFileDefaultName   "nn-c3ca321c51c9.nnue"
 
 } // namespace Eval
 

From 777c3a08ab5ab248958b45955053d1b735c7eb42 Mon Sep 17 00:00:00 2001
From: JWmer <26392242+NightlyKing@users.noreply.github.com>
Date: Fri, 20 Nov 2020 15:30:17 +0100
Subject: [PATCH 449/583] Update README.md

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 5fa8179e..d894e649 100644
--- a/README.md
+++ b/README.md
@@ -113,6 +113,8 @@ There is a builting converted that support all 3 formats described above. Any of
 
 ## Resources
 
+- [Training NNUE for SF](https://docs.google.com/document/d/1os5GH8GGJbV0nKAfXD-qySBclFzKKtXKHbAnA-un8tA/edit) google document with important information and coding priorities
+- [Gensfen data (vondele)](https://drive.google.com/drive/folders/1mftuzYdl9o6tBaceR3d_VBQIrgKJsFpl) over 2b fens available
 - [Stockfish NNUE Wiki](https://www.qhapaq.org/shogi/shogiwiki/stockfish-nnue/)
 - [Training instructions](https://twitter.com/mktakizawa/status/1273042640280252416) from the creator of the Elmo shogi engine
 - [Original Talkchess thread](http://talkchess.com/forum3/viewtopic.php?t=74059) discussing Stockfish NNUE

From ea70e378cdf15ea44ba943cc0a1257eb9a31d55a Mon Sep 17 00:00:00 2001
From: JWmer <26392242+NightlyKing@users.noreply.github.com>
Date: Fri, 20 Nov 2020 15:47:44 +0100
Subject: [PATCH 450/583] Update a.cpp

---
 src/nnue/features/a.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/nnue/features/a.cpp b/src/nnue/features/a.cpp
index 6ceb4efa..1bfb583f 100644
--- a/src/nnue/features/a.cpp
+++ b/src/nnue/features/a.cpp
@@ -4,9 +4,13 @@
 // Definition of input feature A of NNUE evaluation function
 namespace Eval::NNUE::Features {
 
-    // Orient a square according to perspective (flip rank for black)
+    // Orient a square according to perspective (rotate the board 180° for black)
+    // Important note for "halfka": this arch was designed with "flip" in mind 
+    // although it still is untested which approach is better.
+    // this has to stay until we find a better arch that works with "flip".
+    // allows us to use current master net for gensfen (primarily needed for higher quality data)
     inline Square orient(Color perspective, Square s) {
-        return Square(int(s) ^ (bool(perspective) * SQ_A8));
+        return Square(int(s) ^ (bool(perspective) * 63));
     }
 
     // Find the index of the feature quantity from the king position and PieceSquare

From b0429237a86ea303d087e433783360f5858fb0f6 Mon Sep 17 00:00:00 2001
From: JWmer <26392242+NightlyKing@users.noreply.github.com>
Date: Fri, 20 Nov 2020 15:48:18 +0100
Subject: [PATCH 451/583] Update half_ka.cpp

---
 src/nnue/features/half_ka.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/nnue/features/half_ka.cpp b/src/nnue/features/half_ka.cpp
index 83e59067..08124b96 100644
--- a/src/nnue/features/half_ka.cpp
+++ b/src/nnue/features/half_ka.cpp
@@ -23,9 +23,13 @@
 
 namespace Eval::NNUE::Features {
 
-    // Orient a square according to perspective (flip rank for black)
+    // Orient a square according to perspective (rotate the board 180° for black)
+    // Important note for "halfka": this arch was designed with "flip" in mind 
+    // although it still is untested which approach is better.
+    // this has to stay until we find a better arch that works with "flip".
+    // allows us to use current master net for gensfen (primarily needed for higher quality data)
     inline Square orient(Color perspective, Square s) {
-        return Square(int(s) ^ (bool(perspective) * SQ_A8));
+        return Square(int(s) ^ (bool(perspective) * 63));
     }
 
     // Find the index of the feature quantity from the king position and PieceSquare

From 3975fc9c0dc1f896ae20339bb14c83572971f9a6 Mon Sep 17 00:00:00 2001
From: JWmer <26392242+NightlyKing@users.noreply.github.com>
Date: Fri, 20 Nov 2020 15:49:02 +0100
Subject: [PATCH 452/583] Update half_relative_ka.cpp

---
 src/nnue/features/half_relative_ka.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/nnue/features/half_relative_ka.cpp b/src/nnue/features/half_relative_ka.cpp
index ba3edbcf..d2ad31e6 100644
--- a/src/nnue/features/half_relative_ka.cpp
+++ b/src/nnue/features/half_relative_ka.cpp
@@ -4,9 +4,13 @@
 //Definition of input features HalfRelativeKA of NNUE evaluation function
 namespace Eval::NNUE::Features {
 
-    // Orient a square according to perspective (flip rank for black)
+    // Orient a square according to perspective (rotate the board 180° for black)
+    // Important note for "halfka": this arch was designed with "flip" in mind 
+    // although it still is untested which approach is better.
+    // this has to stay until we find a better arch that works with "flip".
+    // allows us to use current master net for gensfen (primarily needed for higher quality data)
     inline Square orient(Color perspective, Square s) {
-        return Square(int(s) ^ (bool(perspective) * SQ_A8));
+        return Square(int(s) ^ (bool(perspective) * 63));
     }
 
     // Find the index of the feature quantity from the ball position and PieceSquare

From 38d19eca143d1e0f6b2f42b8eafd27078d11f164 Mon Sep 17 00:00:00 2001
From: JWmer <26392242+NightlyKing@users.noreply.github.com>
Date: Fri, 20 Nov 2020 23:58:04 +0100
Subject: [PATCH 453/583] Update instrumented.sh

---
 tests/instrumented.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/instrumented.sh b/tests/instrumented.sh
index 07ecbb9c..dffc257a 100755
--- a/tests/instrumented.sh
+++ b/tests/instrumented.sh
@@ -24,7 +24,7 @@ case $1 in
     echo "valgrind-thread testing started"
     prefix=''
     exeprefix='valgrind --fair-sched=try --error-exitcode=42'
-    postfix='1>/dev/null'
+    postfix=''
     threads="2"
     bench_depth=5
     go_depth=10

From d43cd104b6549c2372d85c95410cfb8d16cfeb33 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <ts.tomeksopel@gmail.com>
Date: Sat, 21 Nov 2020 21:14:15 +0100
Subject: [PATCH 454/583] Fix uninitialized variable when searching from a
 terminal position.

---
 src/thread.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/thread.cpp b/src/thread.cpp
index e867048d..f035186b 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -232,6 +232,9 @@ void ThreadPool::start_thinking(Position& pos, StateListPtr& states,
       th->rootMoves = rootMoves;
       th->rootPos.set(pos.fen(), pos.is_chess960(), &th->rootState, th);
       th->rootState = setupStates->back();
+      // This is also set by rank_root_moves but we need to set it
+      // also when there is no legal moves.
+      th->rootInTB = false;
       th->UseRule50 = bool(Options["Syzygy50MoveRule"]);
       th->ProbeDepth = int(Options["SyzygyProbeDepth"]);
       th->Cardinality = int(Options["SyzygyProbeLimit"]);

From 3cee6881ee1639bd22d89ef43387c83c95f5067e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 22 Nov 2020 17:18:06 +0100
Subject: [PATCH 455/583] Move the terminal position check to after qsearch,
 otherwise qsearch may end up in a terminal position.

---
 src/learn/learn.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 3942b606..cab5a9b5 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -685,10 +685,6 @@ namespace Learner
                 int ply = 0;
                 pos.do_move((Move)ps.move, state[ply++]);
 
-                // We want to position being trained on not to be terminal
-                if (MoveList<LEGAL>(pos).size() == 0)
-                    goto RETRY_READ;
-
                 // Evaluation value of shallow search (qsearch)
                 const auto [_, pv] = Search::qsearch(pos);
 
@@ -698,6 +694,10 @@ namespace Learner
                 }
             }
 
+            // We want to position being trained on not to be terminal
+            if (MoveList<LEGAL>(pos).size() == 0)
+                goto RETRY_READ;
+
             // Since we have reached the end phase of PV, add the slope here.
             pos_add_grad();
         }

From ee13cfce67222faafca4e93f8af39fad3429d4bd Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 22 Nov 2020 17:02:33 +0100
Subject: [PATCH 456/583] Fix result assigned for a psvector when the positions
 are not continuous.

---
 src/learn/gensfen.cpp | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index b265da71..5f8bbba1 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -191,7 +191,8 @@ namespace Learner
             PSVector& sfens,
             int8_t lastTurnIsWin,
             std::atomic<uint64_t>& counter,
-            uint64_t limit);
+            uint64_t limit,
+            Color result_color);
 
         void report(uint64_t done, uint64_t new_done);
 
@@ -291,7 +292,7 @@ namespace Learner
             vector<int> move_hist_scores;
 
             auto flush_psv = [&](int8_t result) {
-                quit = commit_psv(th, packed_sfens, result, counter, limit);
+                quit = commit_psv(th, packed_sfens, result, counter, limit, pos.side_to_move());
             };
 
             for (int ply = 0; ; ++ply)
@@ -717,7 +718,8 @@ namespace Learner
         PSVector& sfens,
         int8_t result,
         std::atomic<uint64_t>& counter,
-        uint64_t limit)
+        uint64_t limit,
+        Color result_color)
     {
         if (!params.write_out_draw_game_in_training_data_generation && result == 0)
         {
@@ -725,13 +727,17 @@ namespace Learner
             return false;
         }
 
+        auto side_to_move_from_sfen = [](auto& sfen){
+            return (Color)(sfen.sfen.data[0] & 1);
+        };
+
         // From the final stage (one step before) to the first stage, give information on the outcome of the game for each stage.
         // The phases stored in sfens are assumed to be continuous (in order).
         for (auto it = sfens.rbegin(); it != sfens.rend(); ++it)
         {
-            // If is_win == 0 (draw), multiply by -1 and it will remain 0 (draw)
-            result = -result;
-            it->game_result = result;
+            // The side to move is packed as the lowest bit of the first byte
+            const Color side_to_move = side_to_move_from_sfen(*it);
+            it->game_result = side_to_move == result_color ? result : -result;
         }
 
         // Write sfens in move order to make potential compression easier

From 9030020a854f81b4441c3f0157e66ab72b5c02af Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 22 Nov 2020 18:05:59 +0100
Subject: [PATCH 457/583] Add smart_fen_skipping option to learn.

---
 src/learn/learn.cpp | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index cab5a9b5..f7358f8e 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -400,6 +400,7 @@ namespace Learner
             bool skip_duplicated_positions_in_training = true;
 
             bool assume_quiet = false;
+            bool smart_fen_skipping = false;
 
             double learning_rate = 1.0;
 
@@ -680,7 +681,8 @@ namespace Learner
                 goto RETRY_READ;
             }
 
-            if (!params.assume_quiet)
+            // We don't need to qsearch when doing smart skipping
+            if (!params.assume_quiet && !params.smart_fen_skipping)
             {
                 int ply = 0;
                 pos.do_move((Move)ps.move, state[ply++]);
@@ -694,6 +696,13 @@ namespace Learner
                 }
             }
 
+            if (params.smart_fen_skipping
+                && (pos.capture_or_promotion((Move)ps.move)
+                    || pos.checkers()))
+            {
+                goto RETRY_READ;
+            }
+            
             // We want to position being trained on not to be terminal
             if (MoveList<LEGAL>(pos).size() == 0)
                 goto RETRY_READ;
@@ -1115,6 +1124,7 @@ namespace Learner
             }
             else if (option == "verbose") params.verbose = true;
             else if (option == "assume_quiet") params.assume_quiet = true;
+            else if (option == "smart_fen_skipping") params.smart_fen_skipping = true;
             else
             {
                 out << "INFO: Unknown option: " << option << ". Ignoring.\n";

From 45e3335ee843e11b838efbc507214e5bcd7313a4 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 22 Nov 2020 18:08:14 +0100
Subject: [PATCH 458/583] Add missing docs.

---
 docs/gensfen.md | 2 ++
 docs/learn.md   | 4 ++++
 2 files changed, 6 insertions(+)

diff --git a/docs/gensfen.md b/docs/gensfen.md
index 16fead59..48f7f5e7 100644
--- a/docs/gensfen.md
+++ b/docs/gensfen.md
@@ -62,4 +62,6 @@ Currently the following options are available:
 
 `sfen_format` - format of the training data to use. Either `bin` or `binpack`. Default: `binpack`.
 
+`ensure_quiet` - this is a flag option. When specified the positions will be from the qsearch leaf.
+
 `seed` - seed for the PRNG. Can be either a number or a string. If it's a string then its hash will be used. If not specified then the current time will be used.
diff --git a/docs/learn.md b/docs/learn.md
index 6de81521..30a7c951 100644
--- a/docs/learn.md
+++ b/docs/learn.md
@@ -64,6 +64,10 @@ Currently the following options are available:
 
 `newbob_decay` - learning rate will be multiplied by this factor every time a net is rejected (so in other words it controls LR drops). Default: 0.5 (no LR drops)
 
+`assume_quiet` - this is a flag option. When specified learn will not perform qsearch to reach a quiet position.
+
+`smart_fen_skipping` - this is a flag option. When specified some position that are not good candidates for teaching are skipped. This includes positions where the best move is a capture or promotion, and position where a king is in check.
+
 `newbob_num_trials` - determines after how many subsequent rejected nets the training process will be terminated. Default: 4.
 
 `auto_lr_drop` - every time this many positions are processed the learning rate is multiplied by `newbob_decay`. In other words this value specifies for how many positions a single learning rate stage lasts. If 0 then doesn't have any effect. Default: 0.

From 89294e2e4f44fdf3b4c3e38609c6c9b4c2a3c982 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 26 Nov 2020 17:28:09 +0100
Subject: [PATCH 459/583] Add transform command. Add transform nudged_static
 subcommand.

---
 src/Makefile            |   3 +-
 src/learn/sfen_stream.h |  10 ++
 src/learn/transform.cpp | 242 ++++++++++++++++++++++++++++++++++++++++
 src/learn/transform.h   |  12 ++
 src/uci.cpp             |   2 +
 5 files changed, 268 insertions(+), 1 deletion(-)
 create mode 100644 src/learn/transform.cpp
 create mode 100644 src/learn/transform.h

diff --git a/src/Makefile b/src/Makefile
index a5f5f06f..7f00bfff 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -64,7 +64,8 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
 	learn/learn.cpp \
 	learn/gensfen.cpp \
 	learn/opening_book.cpp \
-	learn/convert.cpp
+	learn/convert.cpp \
+	learn/transform.cpp
 
 OBJS = $(notdir $(SRCS:.cpp=.o))
 
diff --git a/src/learn/sfen_stream.h b/src/learn/sfen_stream.h
index d25dd41d..da411346 100644
--- a/src/learn/sfen_stream.h
+++ b/src/learn/sfen_stream.h
@@ -207,6 +207,16 @@ namespace Learner {
         assert(false);
         return nullptr;
     }
+
+    inline std::unique_ptr<BasicSfenOutputStream> create_new_sfen_output(const std::string& filename)
+    {
+        if (has_extension(filename, BinSfenOutputStream::extension))
+            return std::make_unique<BinSfenOutputStream>(filename);
+        else if (has_extension(filename, BinpackSfenOutputStream::extension))
+            return std::make_unique<BinpackSfenOutputStream>(filename);
+
+        return nullptr;
+    }
 }
 
 #endif
\ No newline at end of file
diff --git a/src/learn/transform.cpp b/src/learn/transform.cpp
new file mode 100644
index 00000000..5687b48b
--- /dev/null
+++ b/src/learn/transform.cpp
@@ -0,0 +1,242 @@
+#include "transform.h"
+
+#include "sfen_stream.h"
+#include "packed_sfen.h"
+
+#include "thread.h"
+#include "position.h"
+#include "evaluate.h"
+
+#include "nnue/evaluate_nnue.h"
+
+#include <string>
+#include <map>
+#include <iostream>
+#include <cmath>
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+
+namespace Learner
+{
+    using CommandFunc = void(*)(std::istringstream&);
+
+    enum struct NudgedStaticMode
+    {
+        Absolute,
+        Relative,
+        Interpolate
+    };
+
+    struct NudgedStaticParams
+    {
+        std::string input_filename = "in.binpack";
+        std::string output_filename = "out.binpack";
+        NudgedStaticMode mode = NudgedStaticMode::Absolute;
+        int absolute_nudge = 5;
+        float relative_nudge = 0.1;
+        float interpolate_nudge = 0.1;
+
+        void enforce_constraints()
+        {
+            relative_nudge = std::max(relative_nudge, 0.0f);
+            absolute_nudge = std::max(absolute_nudge, 0);
+        }
+    };
+
+    [[nodiscard]] std::int16_t nudge(NudgedStaticParams& params, std::int16_t static_eval_i16, std::int16_t deep_eval_i16)
+    {
+        auto saturate_i32_to_i16 = [](int v) {
+            return static_cast<std::int16_t>(
+                std::clamp(
+                    v,
+                    (int)std::numeric_limits<std::int16_t>::min(),
+                    (int)std::numeric_limits<std::int16_t>::max()
+                )
+            );
+        };
+
+        auto saturate_f32_to_i16 = [saturate_i32_to_i16](float v) {
+            return saturate_i32_to_i16((int)v);
+        };
+
+        int static_eval = static_eval_i16;
+        int deep_eval = deep_eval_i16;
+
+        switch(params.mode)
+        {
+            case NudgedStaticMode::Absolute:
+                return saturate_i32_to_i16(
+                    static_eval + std::clamp(
+                        deep_eval - static_eval,
+                        -params.absolute_nudge,
+                        params.absolute_nudge
+                    )
+                );
+
+            case NudgedStaticMode::Relative:
+                return saturate_f32_to_i16(
+                    (float)static_eval * std::clamp(
+                        (float)deep_eval / (float)static_eval,
+                        (1.0f - params.relative_nudge),
+                        (1.0f + params.relative_nudge)
+                    )
+                );
+
+            case NudgedStaticMode::Interpolate:
+                return saturate_f32_to_i16(
+                    (float)static_eval * (1.0f - params.interpolate_nudge)
+                    + (float)deep_eval * params.interpolate_nudge
+                );
+
+            default:
+                assert(false);
+                return 0;
+        }
+    }
+
+    void do_nudged_static(NudgedStaticParams& params)
+    {
+        Thread* th = Threads.main();
+        Position& pos = th->rootPos;
+        StateInfo si;
+
+        auto in = Learner::open_sfen_input_file(params.input_filename);
+        auto out = Learner::create_new_sfen_output(params.output_filename);
+
+        if (in == nullptr)
+        {
+            std::cerr << "Invalid input file type.\n";
+            return;
+        }
+
+        if (out == nullptr)
+        {
+            std::cerr << "Invalid output file type.\n";
+            return;
+        }
+
+        PSVector buffer;
+        uint64_t batch_size = 1'000'000;
+
+        buffer.reserve(batch_size);
+
+        uint64_t num_processed = 0;
+        for (;;)
+        {
+            auto v = in->next();
+            if (!v.has_value())
+                break;
+
+            auto& ps = v.value();
+
+            pos.set_from_packed_sfen(ps.sfen, &si, th);
+            auto static_eval = Eval::evaluate(pos);
+            auto deep_eval = ps.score;
+            ps.score = nudge(params, static_eval, deep_eval);
+
+            buffer.emplace_back(ps);
+            if (buffer.size() >= batch_size)
+            {
+                num_processed += buffer.size();
+
+                out->write(buffer);
+                buffer.clear();
+
+                std::cout << "Processed " << num_processed << " positions.\n";
+            }
+        }
+
+        if (!buffer.empty())
+        {
+            num_processed += buffer.size();
+
+            out->write(buffer);
+            buffer.clear();
+
+            std::cout << "Processed " << num_processed << " positions.\n";
+        }
+
+        std::cout << "Finished.\n";
+    }
+
+    void nudged_static(std::istringstream& is)
+    {
+        NudgedStaticParams params{};
+
+        while(true)
+        {
+            std::string token;
+            is >> token;
+
+            if (token == "")
+                break;
+
+            if (token == "absolute")
+            {
+                params.mode = NudgedStaticMode::Absolute;
+                is >> params.absolute_nudge;
+            }
+            else if (token == "relative")
+            {
+                params.mode = NudgedStaticMode::Relative;
+                is >> params.relative_nudge;
+            }
+            else if (token == "interpolate")
+            {
+                params.mode = NudgedStaticMode::Interpolate;
+                is >> params.interpolate_nudge;
+            }
+            else if (token == "input_file")
+                is >> params.input_filename;
+            else if (token == "output_file")
+                is >> params.output_filename;
+        }
+
+        std::cout << "Performing transform nudged_static with parameters:\n";
+        std::cout << "input_file          : " << params.input_filename << '\n';
+        std::cout << "output_file         : " << params.output_filename << '\n';
+        std::cout << "\n";
+        if (params.mode == NudgedStaticMode::Absolute)
+        {
+            std::cout << "mode                : absolute\n";
+            std::cout << "absolute_nudge      : " << params.absolute_nudge << '\n';
+        }
+        else if (params.mode == NudgedStaticMode::Relative)
+        {
+            std::cout << "mode                : relative\n";
+            std::cout << "relative_nudge      : " << params.relative_nudge << '\n';
+        }
+        else if (params.mode == NudgedStaticMode::Interpolate)
+        {
+            std::cout << "mode                : interpolate\n";
+            std::cout << "interpolate_nudge   : " << params.interpolate_nudge << '\n';
+        }
+        std::cout << '\n';
+
+        params.enforce_constraints();
+        do_nudged_static(params);
+    }
+
+    void transform(std::istringstream& is)
+    {
+        const std::map<std::string, CommandFunc> subcommands = {
+            { "nudged_static", &nudged_static }
+        };
+
+        Eval::NNUE::init();
+
+        std::string subcommand;
+        is >> subcommand;
+
+        auto func = subcommands.find(subcommand);
+        if (func == subcommands.end())
+        {
+            std::cout << "Invalid subcommand " << subcommand << ". Exiting...\n";
+            return;
+        }
+
+        func->second(is);
+    }
+
+}
diff --git a/src/learn/transform.h b/src/learn/transform.h
new file mode 100644
index 00000000..8a6921a0
--- /dev/null
+++ b/src/learn/transform.h
@@ -0,0 +1,12 @@
+#ifndef _TRANSFORM_H_
+#define _TRANSFORM_H_
+
+#include <sstream>
+
+namespace Learner {
+
+    void transform(std::istringstream& is);
+
+}
+
+#endif
diff --git a/src/uci.cpp b/src/uci.cpp
index ae21a3ae..8e64da6b 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -38,6 +38,7 @@
 #include "learn/gensfen.h"
 #include "learn/learn.h"
 #include "learn/convert.h"
+#include "learn/transform.h"
 
 using namespace std;
 
@@ -345,6 +346,7 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "convert_bin") Learner::convert_bin(is);
       else if (token == "convert_plain") Learner::convert_plain(is);
       else if (token == "convert_bin_from_pgn_extract") Learner::convert_bin_from_pgn_extract(is);
+      else if (token == "transform") Learner::transform(is);
 
       // Command to call qsearch(),search() directly for testing
       else if (token == "qsearch") qsearch_cmd(pos);

From 92b14a5ba2310ea5285d3f5987c5bc247c715860 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 26 Nov 2020 18:06:00 +0100
Subject: [PATCH 460/583] Add docs for transform.

---
 docs/transform.md | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 docs/transform.md

diff --git a/docs/transform.md b/docs/transform.md
new file mode 100644
index 00000000..82e963fe
--- /dev/null
+++ b/docs/transform.md
@@ -0,0 +1,21 @@
+# Transform
+
+`transform` command exposes subcommands that perform some specific transformation over data. The call syntax is `transform <subcommand>`. Currently implemented subcommands are listed and described below.
+
+## `nudged_static`
+
+`transform nudged_static` takes named parameters in the form of `gensfen param_1_name param_1_value param_2_name param_2_value ...` and flag parameters which don't require values.
+
+This command goes through positions in the input files and replaces the scores with new ones - generated from static eval - but slightly adjusted based on the scores in the original input file.
+
+Currently the following options are available:
+
+`input_file` - path to the input file. Supports bin and binpack formats. Default: in.binpack.
+
+`output_file` - path to the output file. Supports bin and binpack formats. Default: out.binpack.
+
+`absolute` - states that the adjustment should be bounded by an absolute value. After this token follows the maximum absolute adjustment. Values are always adjusted towards scores in the input file. This is the default mode. Default maximum adjustement: 5.
+
+`relative` - states that the adjustment should be bounded by a value relative in magnitude to the static eval value. After this token follows the maximum relative change - a floating point value greater than 0. For example a value of 0.1 only allows changing the static eval by at most 10% towards the score from the input file.
+
+`interpolate` states that the output score should be a value interpolated between static eval and the score from the input file. After this token follows the interpolation constant `t`. `t` of 0 means that only static eval is used. `t` of 1 means that only score from the input file is used. `t` of 0.5 means that the static eval and input score are averaged. It accepts values outside of range `<0, 1>`, but the usefulness is questionable.

From 4ea8572b6d8fdbd092c94954c78a6b0a47289083 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 22 Nov 2020 18:27:51 +0100
Subject: [PATCH 461/583] Add single threaded sgemm.

---
 src/extra/stockfish_blas.cpp | 290 +++++++++++++++++++++++++++++++++++
 src/extra/stockfish_blas.h   |  10 ++
 2 files changed, 300 insertions(+)

diff --git a/src/extra/stockfish_blas.cpp b/src/extra/stockfish_blas.cpp
index 0ba40b49..109a4b44 100644
--- a/src/extra/stockfish_blas.cpp
+++ b/src/extra/stockfish_blas.cpp
@@ -546,6 +546,156 @@ namespace Blas {
         );
         thread_pool.wait_for_workers_finished();
 
+#endif
+    }
+
+    void sgemm_row_major_transpose_right(
+        const int M, const int N, const int K,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    )
+    {
+
+#if defined(USE_SSE3)
+
+        const __m128 alpha4 = _mm_set1_ps(alpha);
+        const __m128 beta4 = _mm_set1_ps(beta);
+
+        for (int m = 0; m < M - 1; m += 2)
+        {
+            int n = 0;
+            for (; n < N - 3; n += 4)
+            {
+                //        mn
+                __m128 sum00 = _mm_setzero_ps();
+                __m128 sum01 = _mm_setzero_ps();
+                __m128 sum02 = _mm_setzero_ps();
+                __m128 sum03 = _mm_setzero_ps();
+                __m128 sum10 = _mm_setzero_ps();
+                __m128 sum11 = _mm_setzero_ps();
+                __m128 sum12 = _mm_setzero_ps();
+                __m128 sum13 = _mm_setzero_ps();
+
+                // Horizontal sum of elements in sum[m][n] corresponds to
+                // the final element in the C.
+
+                int k = 0;
+                for (; k < K - 3; k += 4)
+                {
+                    const __m128 a0 = _mm_loadu_ps(&A[(m+0)*lda+k+0]);
+                    const __m128 a1 = _mm_loadu_ps(&A[(m+1)*lda+k+0]);
+
+                    const __m128 b0 = _mm_loadu_ps(&B[(n+0)*ldb+k+0]);
+                    const __m128 b1 = _mm_loadu_ps(&B[(n+1)*ldb+k+0]);
+                    const __m128 b2 = _mm_loadu_ps(&B[(n+2)*ldb+k+0]);
+                    const __m128 b3 = _mm_loadu_ps(&B[(n+3)*ldb+k+0]);
+
+                    sum00 = _mm_add_ps(sum00, _mm_mul_ps(a0, b0));
+                    sum01 = _mm_add_ps(sum01, _mm_mul_ps(a0, b1));
+                    sum02 = _mm_add_ps(sum02, _mm_mul_ps(a0, b2));
+                    sum03 = _mm_add_ps(sum03, _mm_mul_ps(a0, b3));
+                    sum10 = _mm_add_ps(sum10, _mm_mul_ps(a1, b0));
+                    sum11 = _mm_add_ps(sum11, _mm_mul_ps(a1, b1));
+                    sum12 = _mm_add_ps(sum12, _mm_mul_ps(a1, b2));
+                    sum13 = _mm_add_ps(sum13, _mm_mul_ps(a1, b3));
+                }
+
+                for(; k < K; k += 1)
+                {
+                    const float a0 = A[(m+0)*lda+k+0];
+                    const float a1 = A[(m+1)*lda+k+0];
+
+                    const float b0 = B[(n+0)*ldb+k+0];
+                    const float b1 = B[(n+1)*ldb+k+0];
+                    const float b2 = B[(n+2)*ldb+k+0];
+                    const float b3 = B[(n+3)*ldb+k+0];
+
+                    // Since all will be summed vertically anyway we can
+                    // just add to the first element.
+                    // Other elements are left unmodified.
+                    sum00 = _mm_add_ss(sum00, _mm_set_ss(a0 * b0));
+                    sum01 = _mm_add_ss(sum01, _mm_set_ss(a0 * b1));
+                    sum02 = _mm_add_ss(sum02, _mm_set_ss(a0 * b2));
+                    sum03 = _mm_add_ss(sum03, _mm_set_ss(a0 * b3));
+                    sum10 = _mm_add_ss(sum10, _mm_set_ss(a1 * b0));
+                    sum11 = _mm_add_ss(sum11, _mm_set_ss(a1 * b1));
+                    sum12 = _mm_add_ss(sum12, _mm_set_ss(a1 * b2));
+                    sum13 = _mm_add_ss(sum13, _mm_set_ss(a1 * b3));
+                }
+
+                __m128 s0 = m128_hadd_ps(sum00, sum01, sum02, sum03);
+                __m128 s1 = m128_hadd_ps(sum10, sum11, sum12, sum13);
+                s0 = _mm_mul_ps(s0, alpha4);
+                s1 = _mm_mul_ps(s1, alpha4);
+
+                __m128 c0 = _mm_loadu_ps(&C[(m+0)*ldc+(n+0)]);
+                __m128 c1 = _mm_loadu_ps(&C[(m+1)*ldc+(n+0)]);
+                c0 = _mm_mul_ps(c0, beta4);
+                c1 = _mm_mul_ps(c1, beta4);
+
+                c0 = _mm_add_ps(c0, s0);
+                c1 = _mm_add_ps(c1, s1);
+
+                _mm_storeu_ps(&C[(m+0)*ldc+(n+0)], c0);
+                _mm_storeu_ps(&C[(m+1)*ldc+(n+0)], c1);
+            }
+
+            for(; n < N; n += 1)
+            {
+                float sum0 = 0.0f;
+                float sum1 = 0.0f;
+
+                for (int k = 0; k < K; ++k)
+                {
+                    const float a0 = A[(m+0)*lda+k+0];
+                    const float a1 = A[(m+1)*lda+k+0];
+
+                    const float b0 = B[(n+0)*ldb+k+0];
+
+                    sum0 += a0 * b0;
+                    sum1 += a1 * b0;
+                }
+
+                C[(m+0)*ldc+(n+0)] = C[(m+0)*ldc+(n+0)] * beta + sum0 * alpha;
+                C[(m+1)*ldc+(n+0)] = C[(m+1)*ldc+(n+0)] * beta + sum1 * alpha;
+            }
+        }
+
+        for (; m < M; m += 1)
+        {
+            for (int n = 0; n < N; n += 1)
+            {
+                float sum = 0.0f;
+
+                for (int k = 0; k < K; k += 1)
+                {
+                    sum += A[m*lda + k] * B[n*ldb + k];
+                }
+
+                C[m*ldc + n] = C[m*ldc + n] * beta + sum * alpha;
+            }
+        }
+
+#else
+
+        for (int m = 0; m < M; m += 1)
+        {
+            for (int n = 0; n < N; n += 1)
+            {
+                float sum = 0.0f;
+
+                for (int k = 0; k < K; k += 1)
+                {
+                    sum += A[m*lda + k] * B[n*ldb + k];
+                }
+
+                C[m*ldc + n] = C[m*ldc + n] * beta + sum * alpha;
+            }
+        }
+
 #endif
     }
 
@@ -605,6 +755,35 @@ namespace Blas {
         );
     }
 
+    void sgemm_row_major_transpose_none(
+        const int M, const int N, const int K,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    )
+    {
+        constexpr static int temporary_buffer_index = 1;
+
+        auto B_tr = get_thread_local_temporary_storage(K * N, temporary_buffer_index);
+
+        transpose(
+            K, N,
+            B, ldb,
+            B_tr, K
+        );
+
+        sgemm_row_major_transpose_right(
+            M, N, K,
+            alpha,
+            A, lda,
+            B_tr, K,
+            beta,
+            C, ldc
+        );
+    }
+
     void sgemm_row_major(
         ThreadPool& thread_pool,
         MatrixTranspose TransA, MatrixTranspose TransB,
@@ -684,6 +863,80 @@ namespace Blas {
         }
     }
 
+    void sgemm_row_major(
+        MatrixTranspose TransA, MatrixTranspose TransB,
+        const int M, const int N, const int K,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    )
+    {
+        constexpr static int temporary_buffer_index = 0;
+
+        if (TransA == MatrixTranspose::Trans && TransB == MatrixTranspose::Trans)
+        {
+            auto A_tr = get_thread_local_temporary_storage(K * M, temporary_buffer_index);
+
+            transpose(
+                K, M,
+                A, lda,
+                A_tr, K
+            );
+
+            sgemm_row_major_transpose_right(
+                M, N, K,
+                alpha,
+                A_tr, K,
+                B, ldb,
+                beta,
+                C, ldc
+            );
+        }
+        else if (TransA == MatrixTranspose::NoTrans && TransB == MatrixTranspose::Trans)
+        {
+            sgemm_row_major_transpose_right(
+                M, N, K,
+                alpha,
+                A, lda,
+                B, ldb,
+                beta,
+                C, ldc
+            );
+        }
+        else if (TransA == MatrixTranspose::Trans && TransB == MatrixTranspose::NoTrans)
+        {
+            auto A_tr = get_thread_local_temporary_storage(K * M, temporary_buffer_index);
+
+            transpose(
+                K, M,
+                A, lda,
+                A_tr, K
+            );
+
+            sgemm_row_major_transpose_none(
+                M, N, K,
+                alpha,
+                A_tr, K,
+                B, ldb,
+                beta,
+                C, ldc
+            );
+        }
+        else // no transpositions
+        {
+            sgemm_row_major_transpose_none(
+                M, N, K,
+                alpha,
+                A, lda,
+                B, ldb,
+                beta,
+                C, ldc
+            );
+        }
+    }
+
     void sgemm(
         ThreadPool& thread_pool,
         MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB,
@@ -723,6 +976,43 @@ namespace Blas {
         }
     }
 
+
+    void sgemm(
+        MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB,
+        const int M, const int N, const int K,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    )
+    {
+        if (layout == MatrixLayout::RowMajor)
+        {
+            sgemm_row_major(
+                TransA, TransB,
+                M, N, K,
+                alpha,
+                A, lda,
+                B, ldb,
+                beta,
+                C, ldc
+            );
+        }
+        else
+        {
+            sgemm_row_major(
+                TransB, TransA,
+                N, M, K,
+                alpha,
+                B, ldb,
+                A, lda,
+                beta,
+                C, ldc
+            );
+        }
+    }
+
     std::vector<float> generate_random_matrix(int rows, int cols)
     {
         std::vector<float> m(rows * cols);
diff --git a/src/extra/stockfish_blas.h b/src/extra/stockfish_blas.h
index 65da7e99..f551bbf2 100644
--- a/src/extra/stockfish_blas.h
+++ b/src/extra/stockfish_blas.h
@@ -118,6 +118,16 @@ namespace Blas {
         float * SF_BLAS_RESTRICT C, const int ldc
     );
 
+    void sgemm(
+        MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB,
+        const int M, const int N, const int K,
+        const float alpha,
+        const float * SF_BLAS_RESTRICT A, const int lda,
+        const float * SF_BLAS_RESTRICT B, const int ldb,
+        const float beta,
+        float * SF_BLAS_RESTRICT C, const int ldc
+    );
+
     void test(
         ThreadPool& thread_pool
     );

From 0d4b803b08af87a1a264d196c9d3762c1acb1aeb Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 22 Nov 2020 19:23:30 +0100
Subject: [PATCH 462/583] Prepare trainer affine transform.

---
 src/nnue/trainer/trainer_affine_transform.h | 215 +++++++++++++-------
 1 file changed, 142 insertions(+), 73 deletions(-)

diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index 610805ca..f66f1a65 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -91,19 +91,52 @@ namespace Eval::NNUE {
             quantize_parameters();
         }
 
-        // forward propagation
-        const LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
-            if (output_.size() < kOutputDimensions * batch.size()) {
-                output_.resize(kOutputDimensions * batch.size());
-                gradients_.resize(kInputDimensions * batch.size());
+        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch)
+        {
+            if (output_.size() < kOutputDimensions * combined_batch.size()) {
+                output_.resize(kOutputDimensions * combined_batch.size());
+                gradients_.resize(kInputDimensions * combined_batch.size());
             }
 
-            batch_size_ = static_cast<IndexType>(batch.size());
-            batch_input_ = previous_layer_trainer_->propagate(thread_pool, batch);
+            if (thread_states_.size() < thread_pool.size())
+            {
+                thread_states_.resize(thread_pool.size());
+            }
+
+            combined_batch_size_ = static_cast<IndexType>(combined_batch.size());
+            combined_batch_input_ = previous_layer_trainer_->step_start(thread_pool, combined_batch);
+
+            auto& main_thread_state = thread_states_[0];
 
 #if defined(USE_BLAS)
 
-            for (IndexType b = 0; b < batch_size_; ++b) {
+            // update
+            cblas_sscal(
+                kOutputDimensions, momentum_, main_thread_state.biases_diff_, 1
+            );
+
+#else
+
+            Blas::sscal(
+                kOutputDimensions, momentum_, main_thread_state.biases_diff_, 1
+            );
+
+#endif
+
+            for (IndexType i = 1; i < thread_states_.size(); ++i)
+                thread_states_[i].reset_biases();
+
+            return output_.data();
+        }
+
+        // forward propagation
+        void propagate(Thread& th, const uint64_t offset, const uint64_t count) {
+
+            previous_layer_trainer_->propagate(th, offset, count);
+
+#if defined(USE_BLAS)
+
+            for (IndexType b = offset; b < offset + count; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 cblas_scopy(
                     kOutputDimensions, biases_, 1, &output_[batch_offset], 1
@@ -112,149 +145,151 @@ namespace Eval::NNUE {
 
             cblas_sgemm(
                 CblasColMajor, CblasTrans, CblasNoTrans,
-                kOutputDimensions, batch_size_, kInputDimensions,
+                kOutputDimensions, count, kInputDimensions,
                 1.0,
                 weights_, kInputDimensions,
-                batch_input_, kInputDimensions,
+                combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
                 1.0,
-                &output_[0], kOutputDimensions
+                &output_[offset * kOutputDimensions], kOutputDimensions
             );
 #else
 
-            for (IndexType b = 0; b < batch_size_; ++b) {
+            for (IndexType b = offset; b < offset + count; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 Blas::scopy(
-                    thread_pool,
                     kOutputDimensions, biases_, 1, &output_[batch_offset], 1
                 );
             }
 
             Blas::sgemm(
-                thread_pool,
                 Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans,
-                kOutputDimensions, batch_size_, kInputDimensions,
+                kOutputDimensions, count, kInputDimensions,
                 1.0,
                 weights_, kInputDimensions,
-                batch_input_, kInputDimensions,
+                combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
                 1.0,
-                &output_[0], kOutputDimensions
+                &output_[offset * kOutputDimensions], kOutputDimensions
             );
 
 #endif
-            return output_.data();
         }
 
         // backpropagation
-        void backpropagate(ThreadPool& thread_pool,
+        void backpropagate(Thread& th,
                            const LearnFloatType* gradients,
-                           LearnFloatType learning_rate) {
-
-            const LearnFloatType local_learning_rate =
-                learning_rate * learning_rate_scale_;
+                           uint64_t offset,
+                           uint64_t count) {
 
+            auto& thread_state = thread_states_[th.thread_idx()];
+            const auto momentum = th.thread_idx() == 0 ? momentum_ : 0.0f;
 #if defined(USE_BLAS)
 
             cblas_sgemm(
                 CblasColMajor, CblasNoTrans, CblasNoTrans,
-                kInputDimensions, batch_size_, kOutputDimensions,
+                kInputDimensions, count, kOutputDimensions,
                 1.0,
                 weights_, kInputDimensions,
-                gradients, kOutputDimensions,
+                gradients + offset * kOutputDimensions, kOutputDimensions,
                 0.0,
-                &gradients_[0], kInputDimensions
+                &gradients_[offset * kInputDimensions], kInputDimensions
             );
 
-            // update
-            cblas_sscal(
-                kOutputDimensions, momentum_, biases_diff_, 1
-            );
-
-            for (IndexType b = 0; b < batch_size_; ++b) {
+            for (IndexType b = offset; b < offset + count; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 cblas_saxpy(
                     kOutputDimensions, 1.0,
-                    &gradients[batch_offset], 1, biases_diff_, 1
+                    &gradients[batch_offset], 1, thread_state.biases_diff_, 1
                 );
             }
 
             cblas_sgemm(
                 CblasRowMajor, CblasTrans, CblasNoTrans,
-                kOutputDimensions, kInputDimensions, batch_size_,
+                kOutputDimensions, kInputDimensions, count,
                 1.0,
-                gradients, kOutputDimensions,
-                batch_input_, kInputDimensions,
-                momentum_,
-                weights_diff_, kInputDimensions
+                gradients + offset * kOutputDimensions, kOutputDimensions,
+                combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
+                momentum,
+                thread_state.weights_diff_, kInputDimensions
             );
 
 #else
 
             // backpropagate
             Blas::sgemm(
-                thread_pool,
                 Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::NoTrans, Blas::MatrixTranspose::NoTrans,
-                kInputDimensions, batch_size_, kOutputDimensions,
+                kInputDimensions, count, kOutputDimensions,
                 1.0,
                 weights_, kInputDimensions,
-                gradients, kOutputDimensions,
+                gradients + offset * kOutputDimensions, kOutputDimensions,
                 0.0,
-                &gradients_[0], kInputDimensions
+                &gradients_[offset * kInputDimensions], kInputDimensions
             );
 
-
-            Blas::sscal(
-                thread_pool,
-                kOutputDimensions, momentum_, biases_diff_, 1
-            );
-
-            for (IndexType b = 0; b < batch_size_; ++b) {
+            for (IndexType b = offset; b < offset + count; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
-                Blas::saxpy(thread_pool, kOutputDimensions, 1.0,
-                          &gradients[batch_offset], 1, biases_diff_, 1);
+                Blas::saxpy(kOutputDimensions, 1.0,
+                          &gradients[batch_offset], 1, thread_state.biases_diff_, 1);
             }
 
             Blas::sgemm(
-                thread_pool,
                 Blas::MatrixLayout::RowMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans,
-                kOutputDimensions, kInputDimensions, batch_size_,
+                kOutputDimensions, kInputDimensions, count,
                 1.0,
-                gradients, kOutputDimensions,
-                batch_input_, kInputDimensions,
-                momentum_,
-                weights_diff_, kInputDimensions
+                gradients + offset * kOutputDimensions, kOutputDimensions,
+                combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
+                momentum,
+                thread_state.weights_diff_, kInputDimensions
             );
 
 #endif
 
+            previous_layer_trainer_->backpropagate(th, gradients_.data(), offset, count);
+        }
+
+        void reduce_thread_state()
+        {
+            for (IndexType i = 1; i < thread_states_.size(); ++i)
+            {
+                thread_states_[0] += thread_states_[i];
+            }
+        }
+
+        void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate)
+        {
+            const LearnFloatType local_learning_rate =
+                learning_rate * learning_rate_scale_;
+
+            reduce_thread_state();
+
+            auto& main_thread_state = thread_states_[0];
+
             for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                const double d = local_learning_rate * biases_diff_[i];
+                const double d = local_learning_rate * main_thread_state.biases_diff_[i];
                 biases_[i] -= d;
                 abs_biases_diff_sum_ += std::abs(d);
             }
             num_biases_diffs_ += kOutputDimensions;
 
             for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-                const double d = local_learning_rate * weights_diff_[i];
+                const double d = local_learning_rate * main_thread_state.weights_diff_[i];
                 weights_[i] -= d;
                 abs_weights_diff_sum_ += std::abs(d);
             }
             num_weights_diffs_ += kOutputDimensions * kInputDimensions;
 
-            previous_layer_trainer_->backpropagate(thread_pool, gradients_.data(), learning_rate);
+            previous_layer_trainer_->step_end(thread_pool, learning_rate);
         }
 
     private:
         // constructor
         Trainer(LayerType* target_layer, FeatureTransformer* ft) :
-            batch_size_(0),
-            batch_input_(nullptr),
+            combined_batch_size_(0),
+            combined_batch_input_(nullptr),
             previous_layer_trainer_(Trainer<PreviousLayer>::create(
                 &target_layer->previous_layer_, ft)),
             target_layer_(target_layer),
             biases_(),
             weights_(),
-            biases_diff_(),
-            weights_diff_(),
             momentum_(0.2),
             learning_rate_scale_(1.0) {
 
@@ -335,10 +370,12 @@ namespace Eval::NNUE {
                 }
             }
 
-            std::fill(std::begin(biases_diff_), std::end(biases_diff_),
-                      static_cast<LearnFloatType>(0.0));
-            std::fill(std::begin(weights_diff_), std::end(weights_diff_),
-                      static_cast<LearnFloatType>(0.0));
+            for (auto& state : thread_states_)
+            {
+                state.reset_weights();
+                state.reset_biases();
+            }
+
 
             reset_stats();
         }
@@ -365,7 +402,7 @@ namespace Eval::NNUE {
             std::numeric_limits<typename LayerType::WeightType>::max() / kWeightScale;
 
         // number of samples in mini-batch
-        IndexType batch_size_;
+        IndexType combined_batch_size_;
 
         double abs_biases_diff_sum_;
         double abs_weights_diff_sum_;
@@ -373,7 +410,7 @@ namespace Eval::NNUE {
         uint64_t num_weights_diffs_;
 
         // Input mini batch
-        const LearnFloatType* batch_input_;
+        const LearnFloatType* combined_batch_input_;
 
         // Trainer of the previous layer
         const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
@@ -382,12 +419,44 @@ namespace Eval::NNUE {
         LayerType* const target_layer_;
 
         // parameter
+        struct alignas(kCacheLineSize) ThreadState
+        {
+            // Buffer used for updating parameters
+            alignas(kCacheLineSize) LearnFloatType biases_diff_[kOutputDimensions];
+            alignas(kCacheLineSize) LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions];
+
+            ThreadState() { reset_weights(); reset_biases(); }
+
+            ThreadState& operator+=(const ThreadState& other)
+            {
+                for (IndexType i = 0; i < kOutputDimensions; ++i)
+                {
+                    biases_diff_[i] += other.biases_diff_[i];
+                }
+
+                for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i)
+                {
+                    weights_diff_[i] += other.weights_diff_[i];
+                }
+
+                return *this;
+            }
+
+            void reset_weights()
+            {
+                std::fill(std::begin(weights_diff_), std::end(weights_diff_), 0.0f);
+            }
+
+            void reset_biases()
+            {
+                std::fill(std::begin(biases_diff_), std::end(biases_diff_), 0.0f);
+            }
+        };
+
         alignas(kCacheLineSize) LearnFloatType biases_[kOutputDimensions];
         alignas(kCacheLineSize) LearnFloatType weights_[kOutputDimensions * kInputDimensions];
 
-        // Buffer used for updating parameters
-        alignas(kCacheLineSize) LearnFloatType biases_diff_[kOutputDimensions];
-        alignas(kCacheLineSize) LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions];
+        std::vector<ThreadState, CacheLineAlignedAllocator<ThreadState>> thread_states_;
 
         // Forward propagation buffer
         std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;

From cc11375f6df3186ec6090c671ddcbed45e8bc55a Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 22 Nov 2020 19:46:43 +0100
Subject: [PATCH 463/583] Skeleton for new evaluate learner

---
 src/nnue/evaluate_nnue_learner.cpp | 69 +++++++++++++++++++++---------
 1 file changed, 49 insertions(+), 20 deletions(-)

diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 4104fef5..644ac9a4 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -54,6 +54,12 @@ namespace Eval::NNUE {
         const std::string& seed,
         SynchronizedRegionLogger::Region& out) {
 
+#if defined (OPENBLAS_VERSION)
+        openblas_set_num_threads(1);
+#elif defined (INTEL_MKL_VERSION)
+        mkl_set_num_threads(1);
+#endif
+
         out << "INFO (initialize_training): Initializing NN training for "
             << get_architecture_string() << std::endl;
 
@@ -199,39 +205,62 @@ namespace Eval::NNUE {
 
         bool collect_stats = verbose;
 
+        std::vector<double> abs_eval_diff_sum_local(thread_pool.size(), 0.0);
+        std::vector<double> abs_discrete_eval_sum_local(thread_pool.size(), 0.0);
+        std::vector<double> gradient_norm_local(thread_pool.size(), 0.0);
+
         while (examples.size() >= batch_size) {
             std::vector<Example> batch(examples.end() - batch_size, examples.end());
             examples.resize(examples.size() - batch_size);
 
-            const auto network_output = trainer->propagate(thread_pool, batch);
-
+            const auto network_output = trainer->step_start(thread_pool, batch);
             std::vector<LearnFloatType> gradients(batch.size());
-            for (std::size_t b = 0; b < batch.size(); ++b) {
-                const auto shallow = static_cast<Value>(round<std::int32_t>(
-                    batch[b].sign * network_output[b] * kPonanzaConstant));
-                const auto discrete = batch[b].sign * batch[b].discrete_nn_eval;
-                const auto& psv = batch[b].psv;
-                const double gradient =
-                    batch[b].sign * calc_grad(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
-                gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
+
+            thread_pool.for_each_index_chunk_with_workers(
+                std::size_t(0), batch.size(),
+                [&](Thread& th, std::size_t offset, std::size_t count) {
+                    const auto thread_id = th.thread_idx();
+
+                    trainer->propagate(th, offset, count);
+
+                    for (std::size_t b = offset; b < offset + count; ++b) {
+                        const auto shallow = static_cast<Value>(round<std::int32_t>(
+                            batch[b].sign * network_output[b] * kPonanzaConstant));
+                        const auto discrete = batch[b].sign * batch[b].discrete_nn_eval;
+                        const auto& psv = batch[b].psv;
+                        const double gradient =
+                            batch[b].sign * calc_grad(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
+                        gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
 
 
-                // The discrete eval will only be valid before first backpropagation,
-                // that is only for the first batch.
-                // Similarily we want only gradients from one batch.
-                if (collect_stats)
-                {
-                    abs_eval_diff_sum += std::abs(discrete - shallow);
-                    abs_discrete_eval_sum += std::abs(discrete);
-                    gradient_norm += std::abs(gradient);
+                        // The discrete eval will only be valid before first backpropagation,
+                        // that is only for the first batch.
+                        // Similarily we want only gradients from one batch.
+                        if (collect_stats)
+                        {
+                            abs_eval_diff_sum_local[thread_id] += std::abs(discrete - shallow);
+                            abs_discrete_eval_sum_local[thread_id] += std::abs(discrete);
+                            gradient_norm_local[thread_id] += std::abs(gradient);
+                        }
+                    }
+
+                    trainer->backpropagate(th, gradients.data(), offset, count);
                 }
-            }
+            );
+            thread_pool.wait_for_workers_finished();
 
-            trainer->backpropagate(thread_pool, gradients.data(), learning_rate);
+            trainer->step_end(thread_pool, learning_rate);
 
             collect_stats = false;
         }
 
+        if (verbose)
+        {
+            abs_eval_diff_sum = std::accumulate(abs_eval_diff_sum_local.begin(), abs_eval_diff_sum_local.end(), 0.0);
+            abs_discrete_eval_sum = std::accumulate(abs_discrete_eval_sum_local.begin(), abs_discrete_eval_sum_local.end(), 0.0);
+            gradient_norm = std::accumulate(gradient_norm_local.begin(), gradient_norm_local.end(), 0.0);
+        }
+
         if (verbose) {
             const double avg_abs_eval_diff = abs_eval_diff_sum / batch_size;
             const double avg_abs_discrete_eval = abs_discrete_eval_sum / batch_size;

From 774b02364121b23ae13be862324fbebc59f357af Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 22 Nov 2020 19:53:33 +0100
Subject: [PATCH 464/583] Add chunked for each with workers.

---
 src/thread.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/src/thread.h b/src/thread.h
index 0d0d7fea..83ba2f33 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -155,6 +155,31 @@ struct ThreadPool : public std::vector<Thread*> {
       });
   }
 
+  template <typename IndexT, typename FuncT>
+  void for_each_index_chunk_with_workers(
+    IndexT begin,
+    typename Detail::TypeIdentity<IndexT>::Type end,
+    FuncT func)
+  {
+    // This value must outlive the function call.
+    // It's fairly safe if we make it static
+    // because for_each_index_with_workers
+    // is not reentrant nor thread safe.
+    const IndexT size = end - begin;
+    const IndexT chunk_size = (size + this->size()) / this->size();
+
+    execute_with_workers(
+      [chunk_size, end, func](Thread& th) mutable {
+        const IndexT thread_id = th.thread_idx();
+        const IndexT offset = chunk_size * thread_id;
+        if (offset >= end)
+          return;
+
+        const IndexT count = offset + chunk_size > end ? end - offset : chunk_size;
+        func(th, offset, count);
+      });
+  }
+
   void start_thinking(Position&, StateListPtr&, const Search::LimitsType&, bool = false);
   void clear();
   void set(size_t);

From 401fc0fbab085f75a1cde793dc3a0b6ded13bafb Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 22 Nov 2020 20:16:49 +0100
Subject: [PATCH 465/583] Prepare clipped relu trainer.

---
 src/nnue/trainer/trainer_clipped_relu.h | 233 +++++++++++++++---------
 1 file changed, 150 insertions(+), 83 deletions(-)

diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index 124671ed..e4bcecaf 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -42,16 +42,31 @@ namespace Eval::NNUE {
             previous_layer_trainer_->initialize(rng);
         }
 
-        // forward propagation
-        const LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
-            if (output_.size() < kOutputDimensions * batch.size()) {
-              output_.resize(kOutputDimensions * batch.size());
-              gradients_.resize(kInputDimensions * batch.size());
+        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch)
+        {
+            if (output_.size() < kOutputDimensions * combined_batch.size()) {
+              output_.resize(kOutputDimensions * combined_batch.size());
+              gradients_.resize(kInputDimensions * combined_batch.size());
             }
 
-            const auto input = previous_layer_trainer_->propagate(thread_pool, batch);
+            if (thread_states_.size() < thread_pool.size())
+            {
+                thread_states_.resize(thread_pool.size());
+            }
 
-            batch_size_ = static_cast<IndexType>(batch.size());
+            input_ = previous_layer_trainer_->step_start(thread_pool, combined_batch);
+
+            batch_size_ = static_cast<IndexType>(combined_batch.size());
+
+            return output_.data();
+        }
+
+        // forward propagation
+        void propagate(Thread& th, const uint64_t offset, const uint64_t count) {
+
+            auto& thread_state = thread_states_[th.thread_idx()];
+
+            previous_layer_trainer_->propagate(th, offset, count);
 
 #if defined (USE_SSE2)
 
@@ -61,16 +76,16 @@ namespace Eval::NNUE {
                 const __m128 kZero4 = _mm_set1_ps(+kZero);
                 const __m128 kOne4 = _mm_set1_ps(+kOne);
 
-                for (IndexType b = 0; b < batch.size(); ++b)
+                for (IndexType b = offset; b < offset + count; ++b)
                 {
                     const IndexType batch_offset = kOutputDimensions * b;
 
                     for (IndexType i = 0; i < kOutputDimensions; i += 16)
                     {
-                        __m128 out0 = _mm_loadu_ps(&input[i + 0 + batch_offset]);
-                        __m128 out1 = _mm_loadu_ps(&input[i + 4 + batch_offset]);
-                        __m128 out2 = _mm_loadu_ps(&input[i + 8 + batch_offset]);
-                        __m128 out3 = _mm_loadu_ps(&input[i + 12 + batch_offset]);
+                        __m128 out0 = _mm_loadu_ps(&input_[i + 0 + batch_offset]);
+                        __m128 out1 = _mm_loadu_ps(&input_[i + 4 + batch_offset]);
+                        __m128 out2 = _mm_loadu_ps(&input_[i + 8 + batch_offset]);
+                        __m128 out3 = _mm_loadu_ps(&input_[i + 12 + batch_offset]);
 
                         out0 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out0));
                         out1 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out1));
@@ -82,15 +97,15 @@ namespace Eval::NNUE {
                         _mm_storeu_ps(&output_[i + 8 + batch_offset], out2);
                         _mm_storeu_ps(&output_[i + 12 + batch_offset], out3);
 
-                        __m128 minact0 = _mm_loadu_ps(&min_activations_[i + 0]);
-                        __m128 minact1 = _mm_loadu_ps(&min_activations_[i + 4]);
-                        __m128 minact2 = _mm_loadu_ps(&min_activations_[i + 8]);
-                        __m128 minact3 = _mm_loadu_ps(&min_activations_[i + 12]);
+                        __m128 minact0 = _mm_loadu_ps(&thread_state.min_activations_[i + 0]);
+                        __m128 minact1 = _mm_loadu_ps(&thread_state.min_activations_[i + 4]);
+                        __m128 minact2 = _mm_loadu_ps(&thread_state.min_activations_[i + 8]);
+                        __m128 minact3 = _mm_loadu_ps(&thread_state.min_activations_[i + 12]);
 
-                        __m128 maxact0 = _mm_loadu_ps(&max_activations_[i + 0]);
-                        __m128 maxact1 = _mm_loadu_ps(&max_activations_[i + 4]);
-                        __m128 maxact2 = _mm_loadu_ps(&max_activations_[i + 8]);
-                        __m128 maxact3 = _mm_loadu_ps(&max_activations_[i + 12]);
+                        __m128 maxact0 = _mm_loadu_ps(&thread_state.max_activations_[i + 0]);
+                        __m128 maxact1 = _mm_loadu_ps(&thread_state.max_activations_[i + 4]);
+                        __m128 maxact2 = _mm_loadu_ps(&thread_state.max_activations_[i + 8]);
+                        __m128 maxact3 = _mm_loadu_ps(&thread_state.max_activations_[i + 12]);
 
                         minact0 = _mm_min_ps(out0, minact0);
                         minact1 = _mm_min_ps(out1, minact1);
@@ -102,40 +117,41 @@ namespace Eval::NNUE {
                         maxact2 = _mm_max_ps(out2, maxact2);
                         maxact3 = _mm_max_ps(out3, maxact3);
 
-                        _mm_storeu_ps(&min_activations_[i + 0], minact0);
-                        _mm_storeu_ps(&min_activations_[i + 4], minact1);
-                        _mm_storeu_ps(&min_activations_[i + 8], minact2);
-                        _mm_storeu_ps(&min_activations_[i + 12], minact3);
+                        _mm_storeu_ps(&thread_state.min_activations_[i + 0], minact0);
+                        _mm_storeu_ps(&thread_state.min_activations_[i + 4], minact1);
+                        _mm_storeu_ps(&thread_state.min_activations_[i + 8], minact2);
+                        _mm_storeu_ps(&thread_state.min_activations_[i + 12], minact3);
 
-                        _mm_storeu_ps(&max_activations_[i + 0], maxact0);
-                        _mm_storeu_ps(&max_activations_[i + 4], maxact1);
-                        _mm_storeu_ps(&max_activations_[i + 8], maxact2);
-                        _mm_storeu_ps(&max_activations_[i + 12], maxact3);
+                        _mm_storeu_ps(&thread_state.max_activations_[i + 0], maxact0);
+                        _mm_storeu_ps(&thread_state.max_activations_[i + 4], maxact1);
+                        _mm_storeu_ps(&thread_state.max_activations_[i + 8], maxact2);
+                        _mm_storeu_ps(&thread_state.max_activations_[i + 12], maxact3);
                     }
                 }
             }
 
 #else
 
-            for (IndexType b = 0; b < batch_size_; ++b) {
+            for (IndexType b = offset; b < offset + count; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType i = 0; i < kOutputDimensions; ++i) {
                     const IndexType index = batch_offset + i;
-                    output_[index] = std::max(+kZero, std::min(+kOne, input[index]));
-                    min_activations_[i] = std::min(min_activations_[i], output_[index]);
-                    max_activations_[i] = std::max(max_activations_[i], output_[index]);
+                    output_[index] = std::max(+kZero, std::min(+kOne, input_[index]));
+                    thread_state.min_activations_[i] = std::min(thread_state.min_activations_[i], output_[index]);
+                    thread_state.max_activations_[i] = std::max(thread_state.max_activations_[i], output_[index]);
                 }
             }
 
 #endif
-
-            return output_.data();
         }
 
         // backpropagation
-        void backpropagate(ThreadPool& thread_pool,
+        void backpropagate(Thread& th,
                            const LearnFloatType* gradients,
-                           LearnFloatType learning_rate) {
+                           const uint64_t offset,
+                           const uint64_t count) {
+
+            auto& thread_state = thread_states_[th.thread_idx()];
 
 #if defined (USE_SSE2)
 
@@ -145,62 +161,78 @@ namespace Eval::NNUE {
                 const __m128 kZero4 = _mm_set1_ps(+kZero);
                 const __m128 kOne4 = _mm_set1_ps(+kOne);
 
-                const IndexType total_size = batch_size_ * kOutputDimensions;
-
-                for (IndexType i = 0; i < total_size; i += 16)
+                for (IndexType b = offset; b < offset + count; ++b)
                 {
-                    __m128 out0 = _mm_loadu_ps(&output_[i + 0]);
-                    __m128 out1 = _mm_loadu_ps(&output_[i + 4]);
-                    __m128 out2 = _mm_loadu_ps(&output_[i + 8]);
-                    __m128 out3 = _mm_loadu_ps(&output_[i + 12]);
+                    const IndexType batch_offset = kOutputDimensions * b;
 
-                    __m128 clipped0 = _mm_or_ps(_mm_cmple_ps(out0, kZero4), _mm_cmpge_ps(out0, kOne4));
-                    __m128 clipped1 = _mm_or_ps(_mm_cmple_ps(out1, kZero4), _mm_cmpge_ps(out1, kOne4));
-                    __m128 clipped2 = _mm_or_ps(_mm_cmple_ps(out2, kZero4), _mm_cmpge_ps(out2, kOne4));
-                    __m128 clipped3 = _mm_or_ps(_mm_cmple_ps(out3, kZero4), _mm_cmpge_ps(out3, kOne4));
+                    for (IndexType i = 0; i < kOutputDimensions; i += 16)
+                    {
+                        __m128 out0 = _mm_loadu_ps(&output_[batch_offset + i + 0]);
+                        __m128 out1 = _mm_loadu_ps(&output_[batch_offset + i + 4]);
+                        __m128 out2 = _mm_loadu_ps(&output_[batch_offset + i + 8]);
+                        __m128 out3 = _mm_loadu_ps(&output_[batch_offset + i + 12]);
 
-                    __m128 grad0 = _mm_loadu_ps(&gradients[i + 0]);
-                    __m128 grad1 = _mm_loadu_ps(&gradients[i + 4]);
-                    __m128 grad2 = _mm_loadu_ps(&gradients[i + 8]);
-                    __m128 grad3 = _mm_loadu_ps(&gradients[i + 12]);
+                        __m128 clipped0 = _mm_or_ps(_mm_cmple_ps(out0, kZero4), _mm_cmpge_ps(out0, kOne4));
+                        __m128 clipped1 = _mm_or_ps(_mm_cmple_ps(out1, kZero4), _mm_cmpge_ps(out1, kOne4));
+                        __m128 clipped2 = _mm_or_ps(_mm_cmple_ps(out2, kZero4), _mm_cmpge_ps(out2, kOne4));
+                        __m128 clipped3 = _mm_or_ps(_mm_cmple_ps(out3, kZero4), _mm_cmpge_ps(out3, kOne4));
 
-                    grad0 = _mm_andnot_ps(clipped0, grad0);
-                    grad1 = _mm_andnot_ps(clipped1, grad1);
-                    grad2 = _mm_andnot_ps(clipped2, grad2);
-                    grad3 = _mm_andnot_ps(clipped3, grad3);
+                        __m128 grad0 = _mm_loadu_ps(&gradients[batch_offset + i + 0]);
+                        __m128 grad1 = _mm_loadu_ps(&gradients[batch_offset + i + 4]);
+                        __m128 grad2 = _mm_loadu_ps(&gradients[batch_offset + i + 8]);
+                        __m128 grad3 = _mm_loadu_ps(&gradients[batch_offset + i + 12]);
 
-                    _mm_storeu_ps(&gradients_[i + 0], grad0);
-                    _mm_storeu_ps(&gradients_[i + 4], grad1);
-                    _mm_storeu_ps(&gradients_[i + 8], grad2);
-                    _mm_storeu_ps(&gradients_[i + 12], grad3);
+                        grad0 = _mm_andnot_ps(clipped0, grad0);
+                        grad1 = _mm_andnot_ps(clipped1, grad1);
+                        grad2 = _mm_andnot_ps(clipped2, grad2);
+                        grad3 = _mm_andnot_ps(clipped3, grad3);
 
-                    const int clipped_mask =
-                        (_mm_movemask_ps(clipped0) << 0)
-                        | (_mm_movemask_ps(clipped1) << 4)
-                        | (_mm_movemask_ps(clipped2) << 8)
-                        | (_mm_movemask_ps(clipped3) << 12);
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 0], grad0);
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 4], grad1);
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 8], grad2);
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 12], grad3);
 
-                    num_clipped_ += popcount(clipped_mask);
+                        const int clipped_mask =
+                            (_mm_movemask_ps(clipped0) << 0)
+                            | (_mm_movemask_ps(clipped1) << 4)
+                            | (_mm_movemask_ps(clipped2) << 8)
+                            | (_mm_movemask_ps(clipped3) << 12);
+
+                        thread_state.num_clipped_ += popcount(clipped_mask);
+                    }
                 }
             }
 
 #else
 
-            for (IndexType b = 0; b < batch_size_; ++b) {
+            for (IndexType b = offset; b < offset + count; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType i = 0; i < kOutputDimensions; ++i) {
                     const IndexType index = batch_offset + i;
                     const bool clipped = (output_[index] <= kZero) | (output_[index] >= kOne);
                     gradients_[index] = gradients[index] * !clipped;
-                    num_clipped_ += clipped;
+                    thread_state.num_clipped_ += clipped;
                 }
             }
 
 #endif
 
-            num_total_ += batch_size_ * kOutputDimensions;
+            thread_state.num_total_ += count * kOutputDimensions;
 
-            previous_layer_trainer_->backpropagate(thread_pool, gradients_.data(), learning_rate);
+            previous_layer_trainer_->backpropagate(th, gradients_.data(), offset, count);
+        }
+
+        void reduce_thread_state()
+        {
+            for (IndexType i = 1; i < thread_states_.size(); ++i)
+            {
+                thread_states_[0] += thread_states_[i];
+            }
+        }
+
+        void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate)
+        {
+            previous_layer_trainer_->step_end(thread_pool, learning_rate);
         }
 
     private:
@@ -215,22 +247,21 @@ namespace Eval::NNUE {
         }
 
         void reset_stats() {
-            std::fill(std::begin(min_activations_), std::end(min_activations_),
-                      std::numeric_limits<LearnFloatType>::max());
-            std::fill(std::begin(max_activations_), std::end(max_activations_),
-                      std::numeric_limits<LearnFloatType>::lowest());
-
-            num_clipped_ = 0;
-            num_total_ = 0;
+            for(auto& state : thread_states_)
+                state.reset();
         }
 
         // Check if there are any problems with learning
         void check_health() {
 
+            reduce_thread_state();
+
+            auto& main_thread_state = thread_states_[0];
+
             const auto largest_min_activation = *std::max_element(
-                std::begin(min_activations_), std::end(min_activations_));
+                std::begin(main_thread_state.min_activations_), std::end(main_thread_state.min_activations_));
             const auto smallest_max_activation = *std::min_element(
-                std::begin(max_activations_), std::end(max_activations_));
+                std::begin(main_thread_state.max_activations_), std::end(main_thread_state.max_activations_));
 
             auto out = sync_region_cout.new_region();
 
@@ -243,7 +274,7 @@ namespace Eval::NNUE {
                 << " , smallest max activation = " << smallest_max_activation
                 << std::endl;
 
-            out << "  - clipped " << static_cast<double>(num_clipped_) / num_total_ * 100.0 << "% of outputs"
+            out << "  - clipped " << static_cast<double>(main_thread_state.num_clipped_) / main_thread_state.num_total_ * 100.0 << "% of outputs"
                 << std::endl;
 
             out.unlock();
@@ -262,9 +293,10 @@ namespace Eval::NNUE {
         // number of samples in mini-batch
         IndexType batch_size_;
 
-        IndexType num_clipped_;
         IndexType num_total_;
 
+        const LearnFloatType* input_;
+
         // Trainer of the previous layer
         const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
 
@@ -277,9 +309,44 @@ namespace Eval::NNUE {
         // buffer for back propagation
         std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
 
-        // Health check statistics
-        LearnFloatType min_activations_[kOutputDimensions];
-        LearnFloatType max_activations_[kOutputDimensions];
+        struct alignas(kCacheLineSize) ThreadState
+        {
+            // Health check statistics
+            LearnFloatType min_activations_[kOutputDimensions];
+            LearnFloatType max_activations_[kOutputDimensions];
+            IndexType num_clipped_;
+            IndexType num_total_;
+
+            ThreadState() { reset(); }
+
+            ThreadState& operator+=(const ThreadState& other)
+            {
+                for (IndexType i = 0; i < kOutputDimensions; ++i)
+                {
+                    min_activations_[i] = std::min(min_activations_[i], other.min_activations_[i]);
+                }
+
+                for (IndexType i = 0; i < kOutputDimensions; ++i)
+                {
+                    max_activations_[i] = std::max(max_activations_[i], other.max_activations_[i]);
+                }
+
+                num_clipped_ += other.num_clipped_;
+                num_total_ += other.num_total_;
+
+                return *this;
+            }
+
+            void reset()
+            {
+                std::fill(std::begin(min_activations_), std::end(min_activations_), std::numeric_limits<float>::max());
+                std::fill(std::begin(max_activations_), std::end(max_activations_), std::numeric_limits<float>::lowest());
+                num_clipped_ = 0;
+                num_total_ = 0;
+            }
+        };
+
+        std::vector<ThreadState, CacheLineAlignedAllocator<ThreadState>> thread_states_;
     };
 
 }  // namespace Eval::NNUE

From a3c78691a23fd743e2a815b65594609683b87d9c Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 22 Nov 2020 20:44:26 +0100
Subject: [PATCH 466/583] Prepare input slice trainer.

---
 src/nnue/trainer/trainer_input_slice.h | 181 ++++++++++++++++---------
 1 file changed, 115 insertions(+), 66 deletions(-)

diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index a93a3ea0..54f03d42 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -34,15 +34,15 @@ namespace Eval::NNUE {
 
         // Set options such as hyperparameters
         void send_message(Message* message) {
-            if (num_calls_ == 0) {
+            if (num_calls_[0] == 0) {
                 current_operation_ = Operation::kSendMessage;
                 feature_transformer_trainer_->send_message(message);
             }
 
             assert(current_operation_ == Operation::kSendMessage);
 
-            if (++num_calls_ == num_referrers_) {
-                num_calls_ = 0;
+            if (++num_calls_[0] == num_referrers_) {
+                num_calls_[0] = 0;
                 current_operation_ = Operation::kNone;
             }
         }
@@ -50,55 +50,79 @@ namespace Eval::NNUE {
         // Initialize the parameters with random numbers
         template <typename RNG>
         void initialize(RNG& rng) {
-            if (num_calls_ == 0) {
+            if (num_calls_[0] == 0) {
                 current_operation_ = Operation::kInitialize;
                 feature_transformer_trainer_->initialize(rng);
             }
 
             assert(current_operation_ == Operation::kInitialize);
 
-            if (++num_calls_ == num_referrers_) {
-                num_calls_ = 0;
+            if (++num_calls_[0] == num_referrers_) {
+                num_calls_[0] = 0;
                 current_operation_ = Operation::kNone;
             }
         }
 
-        // forward propagation
-        const LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
-            if (gradients_.size() < kInputDimensions * batch.size()) {
-                gradients_.resize(kInputDimensions * batch.size());
+        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch) {
+            if (gradients_.size() < kInputDimensions * combined_batch.size()) {
+                gradients_.resize(kInputDimensions * combined_batch.size());
             }
 
-            batch_size_ = static_cast<IndexType>(batch.size());
-
-            if (num_calls_ == 0) {
-                current_operation_ = Operation::kPropagate;
-                output_ = feature_transformer_trainer_->propagate(thread_pool, batch);
+            if (num_calls_.size() < thread_pool.size())
+            {
+                num_calls_.resize(thread_pool.size(), 0);
             }
 
-            assert(current_operation_ == Operation::kPropagate);
+            batch_size_ = static_cast<IndexType>(combined_batch.size());
 
-            if (++num_calls_ == num_referrers_) {
-                num_calls_ = 0;
+            if (num_calls_[0] == 0) {
+                current_operation_ = Operation::kStepStart;
+                output_ = feature_transformer_trainer_->step_start(thread_pool, combined_batch);
+            }
+
+            assert(current_operation_ == Operation::kStepStart);
+
+            if (++num_calls_[0] == num_referrers_) {
+                num_calls_[0] = 0;
                 current_operation_ = Operation::kNone;
             }
 
             return output_;
         }
 
+        // forward propagation
+        void propagate(Thread& th, uint64_t offset, uint64_t count) {
+            const auto thread_id = th.thread_idx();
+
+            if (num_calls_[thread_id] == 0) {
+                current_operation_ = Operation::kPropagate;
+                feature_transformer_trainer_->propagate(th, offset, count);
+            }
+
+            assert(current_operation_ == Operation::kPropagate);
+
+            if (++num_calls_[thread_id] == num_referrers_) {
+                num_calls_[thread_id] = 0;
+                current_operation_ = Operation::kNone;
+            }
+        }
+
         // backpropagation
-        void backpropagate(ThreadPool& thread_pool,
+        void backpropagate(Thread& th,
                            const LearnFloatType* gradients,
-                           LearnFloatType learning_rate) {
+                           uint64_t offset,
+                           uint64_t count) {
+
+            const auto thread_id = th.thread_idx();
 
             if (num_referrers_ == 1) {
-                feature_transformer_trainer_->backpropagate(thread_pool, gradients, learning_rate);
+                feature_transformer_trainer_->backpropagate(th, gradients, offset, count);
                 return;
             }
 
-            if (num_calls_ == 0) {
+            if (num_calls_[thread_id] == 0) {
                 current_operation_ = Operation::kBackPropagate;
-                for (IndexType b = 0; b < batch_size_; ++b) {
+                for (IndexType b = offset; b < offset + count; ++b) {
                     const IndexType batch_offset = kInputDimensions * b;
                     for (IndexType i = 0; i < kInputDimensions; ++i) {
                         gradients_[batch_offset + i] = static_cast<LearnFloatType>(0.0);
@@ -108,17 +132,31 @@ namespace Eval::NNUE {
 
             assert(current_operation_ == Operation::kBackPropagate);
 
-            for (IndexType b = 0; b < batch_size_; ++b) {
+            for (IndexType b = offset; b < offset + count; ++b) {
                 const IndexType batch_offset = kInputDimensions * b;
                 for (IndexType i = 0; i < kInputDimensions; ++i) {
                     gradients_[batch_offset + i] += gradients[batch_offset + i];
                 }
             }
 
-            if (++num_calls_ == num_referrers_) {
+            if (++num_calls_[thread_id] == num_referrers_) {
                 feature_transformer_trainer_->backpropagate(
-                    thread_pool, gradients_.data(), learning_rate);
-                num_calls_ = 0;
+                    th, gradients_.data(), offset, count);
+                num_calls_[thread_id] = 0;
+                current_operation_ = Operation::kNone;
+            }
+        }
+
+        void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) {
+            if (num_calls_[0] == 0) {
+                current_operation_ = Operation::kStepEnd;
+                feature_transformer_trainer_->step_end(thread_pool, learning_rate);
+            }
+
+            assert(current_operation_ == Operation::kStepEnd);
+
+            if (++num_calls_[0] == num_referrers_) {
+                num_calls_[0] = 0;
                 current_operation_ = Operation::kNone;
             }
         }
@@ -128,7 +166,7 @@ namespace Eval::NNUE {
         SharedInputTrainer(FeatureTransformer* ft) :
             batch_size_(0),
             num_referrers_(0),
-            num_calls_(0),
+            num_calls_(1, 0),
             current_operation_(Operation::kNone),
             feature_transformer_trainer_(Trainer<FeatureTransformer>::create(
                 ft)),
@@ -144,8 +182,10 @@ namespace Eval::NNUE {
             kNone,
             kSendMessage,
             kInitialize,
+            kStepStart,
             kPropagate,
             kBackPropagate,
+            kStepEnd,
         };
 
         // number of samples in mini-batch
@@ -155,7 +195,7 @@ namespace Eval::NNUE {
         std::uint32_t num_referrers_;
 
         // Number of times the current process has been called
-        std::uint32_t num_calls_;
+        std::vector<std::uint32_t> num_calls_;
 
         // current processing type
         Operation current_operation_;
@@ -197,74 +237,81 @@ namespace Eval::NNUE {
             shared_input_trainer_->initialize(rng);
         }
 
-        // forward propagation
-        const LearnFloatType* propagate(ThreadPool& thread_pool,const std::vector<Example>& batch) {
-            if (output_.size() < kOutputDimensions * batch.size()) {
-              output_.resize(kOutputDimensions * batch.size());
-              gradients_.resize(kInputDimensions * batch.size());
+        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch) {
+            if (output_.size() < kOutputDimensions * combined_batch.size()) {
+              output_.resize(kOutputDimensions * combined_batch.size());
+              gradients_.resize(kInputDimensions * combined_batch.size());
             }
 
-            batch_size_ = static_cast<IndexType>(batch.size());
+            batch_size_ = static_cast<IndexType>(combined_batch.size());
 
-            const auto input = shared_input_trainer_->propagate(thread_pool, batch);
-            for (IndexType b = 0; b < batch_size_; ++b) {
+            input_ = shared_input_trainer_->step_start(thread_pool, combined_batch);
+
+            return output_.data();
+        }
+
+        // forward propagation
+        void propagate(Thread& th, uint64_t offset, uint64_t count) {
+
+            shared_input_trainer_->propagate(th, offset, count);
+
+            for (IndexType b = offset; b < offset + count; ++b) {
                 const IndexType input_offset = kInputDimensions * b;
                 const IndexType output_offset = kOutputDimensions * b;
 
 #if defined(USE_BLAS)
 
                 cblas_scopy(
-                    kOutputDimensions, &input[input_offset + Offset], 1,
+                    kOutputDimensions, &input_[input_offset + Offset], 1,
                     &output_[output_offset], 1
                 );
 #else
 
                 Blas::scopy(
-                    thread_pool,
-                    kOutputDimensions, &input[input_offset + Offset], 1,
+                    kOutputDimensions, &input_[input_offset + Offset], 1,
                     &output_[output_offset], 1
                 );
 
 #endif
             }
-
-            return output_.data();
         }
 
         // backpropagation
-        void backpropagate(ThreadPool& thread_pool,
+        void backpropagate(Thread& th,
                            const LearnFloatType* gradients,
-                           LearnFloatType learning_rate) {
+                           uint64_t offset,
+                           uint64_t count) {
 
-            thread_pool.for_each_index_with_workers(
-                0, batch_size_,
-                [&](Thread&, int b) {
-                    const IndexType input_offset = kInputDimensions * b;
-                    const IndexType output_offset = kOutputDimensions * b;
+            for (IndexType b = offset; b < offset + count; ++b)
+            {
+                const IndexType input_offset = kInputDimensions * b;
+                const IndexType output_offset = kOutputDimensions * b;
 
-                    IndexType i = 0;
-                    for (; i < Offset; ++i) {
-                        gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
-                    }
-
-                    for (; i < Offset + kOutputDimensions; ++i) {
-                        gradients_[input_offset + i] = gradients[output_offset + i - Offset];
-                    }
-
-                    for (; i < kInputDimensions; ++i)
-                    {
-                        gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
-                    }
+                IndexType i = 0;
+                for (; i < Offset; ++i) {
+                    gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
                 }
-            );
-            thread_pool.wait_for_workers_finished();
 
-            shared_input_trainer_->backpropagate(thread_pool, gradients_.data(), learning_rate);
+                for (; i < Offset + kOutputDimensions; ++i) {
+                    gradients_[input_offset + i] = gradients[output_offset + i - Offset];
+                }
+
+                for (; i < kInputDimensions; ++i)
+                {
+                    gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
+                }
+            }
+
+            shared_input_trainer_->backpropagate(th, gradients_.data(), offset, count);
+        }
+
+        void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) {
+            shared_input_trainer_->step_end(thread_pool, learning_rate);
         }
 
     private:
         // constructor
-        Trainer(FeatureTransformer* ft):
+        Trainer(FeatureTransformer* ft) :
             batch_size_(0),
             shared_input_trainer_(SharedInputTrainer::create(ft)) {
         }
@@ -278,6 +325,8 @@ namespace Eval::NNUE {
         // number of samples in mini-batch
         IndexType batch_size_;
 
+        const LearnFloatType* input_;
+
         // Trainer of shared input layer
         const std::shared_ptr<SharedInputTrainer> shared_input_trainer_;
 

From 15c528ca7b6beefa64ba2c0192c7dc3efacc665e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 22 Nov 2020 21:38:11 +0100
Subject: [PATCH 467/583] Prepare feature transformer learner.

---
 .../trainer/trainer_feature_transformer.h     | 486 +++++++++++-------
 1 file changed, 298 insertions(+), 188 deletions(-)

diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 80f914f2..9686002f 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -89,56 +89,88 @@ namespace Eval::NNUE {
             quantize_parameters();
         }
 
-        // forward propagation
-        const LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
-            if (output_.size() < kOutputDimensions * batch.size()) {
-                output_.resize(kOutputDimensions * batch.size());
-                gradients_.resize(kOutputDimensions * batch.size());
+        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch)
+        {
+            if (output_.size() < kOutputDimensions * combined_batch.size()) {
+                output_.resize(kOutputDimensions * combined_batch.size());
+                gradients_.resize(kOutputDimensions * combined_batch.size());
             }
 
-            (void)thread_pool;
+            if (thread_stat_states_.size() < thread_pool.size())
+            {
+                thread_stat_states_.resize(thread_pool.size());
+            }
 
-            batch_ = &batch;
-            // affine transform
-            thread_pool.for_each_index_with_workers(
-                0, batch.size(),
-                [&](Thread&, int b) {
-                    const IndexType batch_offset = kOutputDimensions * b;
-                    for (IndexType c = 0; c < 2; ++c) {
-                        const IndexType output_offset = batch_offset + kHalfDimensions * c;
+            if (thread_bias_states_.size() < thread_pool.size())
+            {
+                thread_bias_states_.resize(thread_pool.size());
+            }
+
+            batch_ = &combined_batch;
+
+            auto& main_thread_bias_state = thread_bias_states_[0];
 
 #if defined(USE_BLAS)
 
-                        cblas_scopy(
-                            kHalfDimensions, biases_, 1, &output_[output_offset], 1
-                        );
-
-                        for (const auto& feature : batch[b].training_features[c]) {
-                            const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                            cblas_saxpy(
-                                kHalfDimensions, (float)feature.get_count(),
-                                &weights_[weights_offset], 1, &output_[output_offset], 1
-                            );
-                        }
+            cblas_sscal(
+                kHalfDimensions, momentum_, main_thread_bias_state.biases_diff_, 1
+            );
 
 #else
 
-                        Blas::scopy(
-                            kHalfDimensions, biases_, 1, &output_[output_offset], 1
-                        );
-                        for (const auto& feature : batch[b].training_features[c]) {
-                            const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                            Blas::saxpy(
-                                kHalfDimensions, (float)feature.get_count(),
-                                &weights_[weights_offset], 1, &output_[output_offset], 1
-                            );
-                        }
+            Blas::sscal(
+                kHalfDimensions, momentum_, main_thread_bias_state.biases_diff_, 1
+            );
 
 #endif
+
+            for (IndexType i = 1; i < thread_bias_states_.size(); ++i)
+                thread_bias_states_[i].reset();
+
+            return output_.data();
+        }
+
+        // forward propagation
+        void propagate(Thread& th, uint64_t offset, uint64_t count) {
+
+            auto& thread_stat_state = thread_stat_states_[th.thread_idx()];
+
+            for (IndexType b = offset; b < offset + count; ++b)
+            {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType c = 0; c < 2; ++c) {
+                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
+
+#if defined(USE_BLAS)
+
+                    cblas_scopy(
+                        kHalfDimensions, biases_, 1, &output_[output_offset], 1
+                    );
+
+                    for (const auto& feature : (*batch_)[b].training_features[c]) {
+                        const IndexType weights_offset = kHalfDimensions * feature.get_index();
+                        cblas_saxpy(
+                            kHalfDimensions, (float)feature.get_count(),
+                            &weights_[weights_offset], 1, &output_[output_offset], 1
+                        );
                     }
+
+#else
+
+                    Blas::scopy(
+                        kHalfDimensions, biases_, 1, &output_[output_offset], 1
+                    );
+                    for (const auto& feature : (*batch_)[b].training_features[c]) {
+                        const IndexType weights_offset = kHalfDimensions * feature.get_index();
+                        Blas::saxpy(
+                            kHalfDimensions, (float)feature.get_count(),
+                            &weights_[weights_offset], 1, &output_[output_offset], 1
+                        );
+                    }
+
+#endif
                 }
-            );
-            thread_pool.wait_for_workers_finished();
+            }
 
 #if defined (USE_SSE2)
 
@@ -161,49 +193,51 @@ namespace Eval::NNUE {
                     return _mm_cvtss_f32(_mm_max_ps(max_x_x_13_20, max_x_x_20_13));
                 };
 
-                const int total_size = batch.size() * kOutputDimensions;
-
                 const __m128 kZero4 = _mm_set1_ps(+kZero);
                 const __m128 kOne4 = _mm_set1_ps(+kOne);
 
-                __m128 min_pre_activation0 = _mm_set1_ps(min_pre_activation_);
-                __m128 min_pre_activation1 = _mm_set1_ps(min_pre_activation_);
-                __m128 max_pre_activation0 = _mm_set1_ps(max_pre_activation_);
-                __m128 max_pre_activation1 = _mm_set1_ps(max_pre_activation_);
+                __m128 min_pre_activation0 = _mm_set1_ps(thread_stat_state.min_pre_activation_);
+                __m128 min_pre_activation1 = _mm_set1_ps(thread_stat_state.min_pre_activation_);
+                __m128 max_pre_activation0 = _mm_set1_ps(thread_stat_state.max_pre_activation_);
+                __m128 max_pre_activation1 = _mm_set1_ps(thread_stat_state.max_pre_activation_);
 
-                for (int i = 0; i < total_size; i += 16)
+                for (IndexType b = offset; b < offset + count; ++b)
                 {
-                    __m128 out0 = _mm_loadu_ps(&output_[i +  0]);
-                    __m128 out1 = _mm_loadu_ps(&output_[i +  4]);
-                    __m128 out2 = _mm_loadu_ps(&output_[i +  8]);
-                    __m128 out3 = _mm_loadu_ps(&output_[i + 12]);
+                    const IndexType batch_offset = kOutputDimensions * b;
+                    for (IndexType i = 0; i < kOutputDimensions; i += 16)
+                    {
+                        __m128 out0 = _mm_loadu_ps(&output_[batch_offset + i +  0]);
+                        __m128 out1 = _mm_loadu_ps(&output_[batch_offset + i +  4]);
+                        __m128 out2 = _mm_loadu_ps(&output_[batch_offset + i +  8]);
+                        __m128 out3 = _mm_loadu_ps(&output_[batch_offset + i + 12]);
 
-                    __m128 min01 = _mm_min_ps(out0, out1);
-                    __m128 min23 = _mm_min_ps(out2, out3);
+                        __m128 min01 = _mm_min_ps(out0, out1);
+                        __m128 min23 = _mm_min_ps(out2, out3);
 
-                    __m128 max01 = _mm_max_ps(out0, out1);
-                    __m128 max23 = _mm_max_ps(out2, out3);
+                        __m128 max01 = _mm_max_ps(out0, out1);
+                        __m128 max23 = _mm_max_ps(out2, out3);
 
-                    min_pre_activation0 = _mm_min_ps(min_pre_activation0, min01);
-                    min_pre_activation1 = _mm_min_ps(min_pre_activation1, min23);
-                    max_pre_activation0 = _mm_max_ps(max_pre_activation0, max01);
-                    max_pre_activation1 = _mm_max_ps(max_pre_activation1, max23);
+                        min_pre_activation0 = _mm_min_ps(min_pre_activation0, min01);
+                        min_pre_activation1 = _mm_min_ps(min_pre_activation1, min23);
+                        max_pre_activation0 = _mm_max_ps(max_pre_activation0, max01);
+                        max_pre_activation1 = _mm_max_ps(max_pre_activation1, max23);
 
-                    out0 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out0));
-                    out1 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out1));
-                    out2 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out2));
-                    out3 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out3));
+                        out0 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out0));
+                        out1 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out1));
+                        out2 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out2));
+                        out3 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out3));
 
-                    _mm_storeu_ps(&output_[i +  0], out0);
-                    _mm_storeu_ps(&output_[i +  4], out1);
-                    _mm_storeu_ps(&output_[i +  8], out2);
-                    _mm_storeu_ps(&output_[i + 12], out3);
+                        _mm_storeu_ps(&output_[batch_offset + i +  0], out0);
+                        _mm_storeu_ps(&output_[batch_offset + i +  4], out1);
+                        _mm_storeu_ps(&output_[batch_offset + i +  8], out2);
+                        _mm_storeu_ps(&output_[batch_offset + i + 12], out3);
+                    }
                 }
 
-                min_pre_activation_ = m128_hmin_ps(_mm_min_ps(min_pre_activation0, min_pre_activation1));
-                max_pre_activation_ = m128_hmax_ps(_mm_max_ps(max_pre_activation0, max_pre_activation1));
+                thread_stat_state.min_pre_activation_ = m128_hmin_ps(_mm_min_ps(min_pre_activation0, min_pre_activation1));
+                thread_stat_state.max_pre_activation_ = m128_hmax_ps(_mm_max_ps(max_pre_activation0, max_pre_activation1));
 
-                for (IndexType b = 0; b < batch.size(); ++b)
+                for (IndexType b = offset; b < offset + count; ++b)
                 {
                     const IndexType batch_offset = kOutputDimensions * b;
 
@@ -217,15 +251,15 @@ namespace Eval::NNUE {
                             const __m128 out2 = _mm_loadu_ps(&output_[i +  8 + half_offset]);
                             const __m128 out3 = _mm_loadu_ps(&output_[i + 12 + half_offset]);
 
-                            __m128 minact0 = _mm_loadu_ps(&min_activations_[i +  0]);
-                            __m128 minact1 = _mm_loadu_ps(&min_activations_[i +  4]);
-                            __m128 minact2 = _mm_loadu_ps(&min_activations_[i +  8]);
-                            __m128 minact3 = _mm_loadu_ps(&min_activations_[i + 12]);
+                            __m128 minact0 = _mm_loadu_ps(&thread_stat_state.min_activations_[i +  0]);
+                            __m128 minact1 = _mm_loadu_ps(&thread_stat_state.min_activations_[i +  4]);
+                            __m128 minact2 = _mm_loadu_ps(&thread_stat_state.min_activations_[i +  8]);
+                            __m128 minact3 = _mm_loadu_ps(&thread_stat_state.min_activations_[i + 12]);
 
-                            __m128 maxact0 = _mm_loadu_ps(&max_activations_[i +  0]);
-                            __m128 maxact1 = _mm_loadu_ps(&max_activations_[i +  4]);
-                            __m128 maxact2 = _mm_loadu_ps(&max_activations_[i +  8]);
-                            __m128 maxact3 = _mm_loadu_ps(&max_activations_[i + 12]);
+                            __m128 maxact0 = _mm_loadu_ps(&thread_stat_state.max_activations_[i +  0]);
+                            __m128 maxact1 = _mm_loadu_ps(&thread_stat_state.max_activations_[i +  4]);
+                            __m128 maxact2 = _mm_loadu_ps(&thread_stat_state.max_activations_[i +  8]);
+                            __m128 maxact3 = _mm_loadu_ps(&thread_stat_state.max_activations_[i + 12]);
 
                             minact0 = _mm_min_ps(out0, minact0);
                             minact1 = _mm_min_ps(out1, minact1);
@@ -237,15 +271,15 @@ namespace Eval::NNUE {
                             maxact2 = _mm_max_ps(out2, maxact2);
                             maxact3 = _mm_max_ps(out3, maxact3);
 
-                            _mm_storeu_ps(&min_activations_[i +  0], minact0);
-                            _mm_storeu_ps(&min_activations_[i +  4], minact1);
-                            _mm_storeu_ps(&min_activations_[i +  8], minact2);
-                            _mm_storeu_ps(&min_activations_[i + 12], minact3);
+                            _mm_storeu_ps(&thread_stat_state.min_activations_[i +  0], minact0);
+                            _mm_storeu_ps(&thread_stat_state.min_activations_[i +  4], minact1);
+                            _mm_storeu_ps(&thread_stat_state.min_activations_[i +  8], minact2);
+                            _mm_storeu_ps(&thread_stat_state.min_activations_[i + 12], minact3);
 
-                            _mm_storeu_ps(&max_activations_[i +  0], maxact0);
-                            _mm_storeu_ps(&max_activations_[i +  4], maxact1);
-                            _mm_storeu_ps(&max_activations_[i +  8], maxact2);
-                            _mm_storeu_ps(&max_activations_[i + 12], maxact3);
+                            _mm_storeu_ps(&thread_stat_state.max_activations_[i +  0], maxact0);
+                            _mm_storeu_ps(&thread_stat_state.max_activations_[i +  4], maxact1);
+                            _mm_storeu_ps(&thread_stat_state.max_activations_[i +  8], maxact2);
+                            _mm_storeu_ps(&thread_stat_state.max_activations_[i + 12], maxact3);
                         }
                     }
                 }
@@ -254,33 +288,30 @@ namespace Eval::NNUE {
 #else
 
             // clipped ReLU
-            for (IndexType b = 0; b < batch.size(); ++b) {
+            for (IndexType b = offset; b < offset + count; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType i = 0; i < kOutputDimensions; ++i) {
                     const IndexType index = batch_offset + i;
-                    min_pre_activation_ = std::min(min_pre_activation_, output_[index]);
-                    max_pre_activation_ = std::max(max_pre_activation_, output_[index]);
+                    thread_stat_state.min_pre_activation_ = std::min(thread_stat_state.min_pre_activation_, output_[index]);
+                    thread_stat_state.max_pre_activation_ = std::max(thread_stat_state.max_pre_activation_, output_[index]);
                     output_[index] = std::max(+kZero, std::min(+kOne, output_[index]));
                     const IndexType t = i % kHalfDimensions;
-                    min_activations_[t] = std::min(min_activations_[t], output_[index]);
-                    max_activations_[t] = std::max(max_activations_[t], output_[index]);
+                    thread_stat_state.min_activations_[t] = std::min(thread_stat_state.min_activations_[t], output_[index]);
+                    thread_stat_state.max_activations_[t] = std::max(thread_stat_state.max_activations_[t], output_[index]);
                 }
             }
 
 #endif
-
-            return output_.data();
         }
 
         // backpropagation
-        void backpropagate(ThreadPool& thread_pool,
+        void backpropagate(Thread& th,
                            const LearnFloatType* gradients,
-                           LearnFloatType learning_rate) {
+                           uint64_t offset,
+                           uint64_t count) {
 
-            (void)thread_pool;
-
-            const LearnFloatType local_learning_rate =
-                learning_rate * learning_rate_scale_;
+            auto& thread_stat_state = thread_stat_states_[th.thread_idx()];
+            auto& thread_bias_state = thread_bias_states_[th.thread_idx()];
 
 #if defined (USE_SSE2)
 
@@ -290,111 +321,134 @@ namespace Eval::NNUE {
                 const __m128 kZero4 = _mm_set1_ps(+kZero);
                 const __m128 kOne4 = _mm_set1_ps(+kOne);
 
-                const IndexType total_size = batch_->size() * kOutputDimensions;
-
-                for (IndexType i = 0; i < total_size; i += 16)
+                for (IndexType b = offset; b < offset + count; ++b)
                 {
-                    __m128 out0 = _mm_loadu_ps(&output_[i + 0]);
-                    __m128 out1 = _mm_loadu_ps(&output_[i + 4]);
-                    __m128 out2 = _mm_loadu_ps(&output_[i + 8]);
-                    __m128 out3 = _mm_loadu_ps(&output_[i + 12]);
+                    const IndexType batch_offset = kOutputDimensions * b;
+                    for (IndexType i = 0; i < kOutputDimensions; i += 16)
+                    {
+                        __m128 out0 = _mm_loadu_ps(&output_[batch_offset + i + 0]);
+                        __m128 out1 = _mm_loadu_ps(&output_[batch_offset + i + 4]);
+                        __m128 out2 = _mm_loadu_ps(&output_[batch_offset + i + 8]);
+                        __m128 out3 = _mm_loadu_ps(&output_[batch_offset + i + 12]);
 
-                    __m128 clipped0 = _mm_or_ps(_mm_cmple_ps(out0, kZero4), _mm_cmpge_ps(out0, kOne4));
-                    __m128 clipped1 = _mm_or_ps(_mm_cmple_ps(out1, kZero4), _mm_cmpge_ps(out1, kOne4));
-                    __m128 clipped2 = _mm_or_ps(_mm_cmple_ps(out2, kZero4), _mm_cmpge_ps(out2, kOne4));
-                    __m128 clipped3 = _mm_or_ps(_mm_cmple_ps(out3, kZero4), _mm_cmpge_ps(out3, kOne4));
+                        __m128 clipped0 = _mm_or_ps(_mm_cmple_ps(out0, kZero4), _mm_cmpge_ps(out0, kOne4));
+                        __m128 clipped1 = _mm_or_ps(_mm_cmple_ps(out1, kZero4), _mm_cmpge_ps(out1, kOne4));
+                        __m128 clipped2 = _mm_or_ps(_mm_cmple_ps(out2, kZero4), _mm_cmpge_ps(out2, kOne4));
+                        __m128 clipped3 = _mm_or_ps(_mm_cmple_ps(out3, kZero4), _mm_cmpge_ps(out3, kOne4));
 
-                    __m128 grad0 = _mm_loadu_ps(&gradients[i + 0]);
-                    __m128 grad1 = _mm_loadu_ps(&gradients[i + 4]);
-                    __m128 grad2 = _mm_loadu_ps(&gradients[i + 8]);
-                    __m128 grad3 = _mm_loadu_ps(&gradients[i + 12]);
+                        __m128 grad0 = _mm_loadu_ps(&gradients[batch_offset + i + 0]);
+                        __m128 grad1 = _mm_loadu_ps(&gradients[batch_offset + i + 4]);
+                        __m128 grad2 = _mm_loadu_ps(&gradients[batch_offset + i + 8]);
+                        __m128 grad3 = _mm_loadu_ps(&gradients[batch_offset + i + 12]);
 
-                    grad0 = _mm_andnot_ps(clipped0, grad0);
-                    grad1 = _mm_andnot_ps(clipped1, grad1);
-                    grad2 = _mm_andnot_ps(clipped2, grad2);
-                    grad3 = _mm_andnot_ps(clipped3, grad3);
+                        grad0 = _mm_andnot_ps(clipped0, grad0);
+                        grad1 = _mm_andnot_ps(clipped1, grad1);
+                        grad2 = _mm_andnot_ps(clipped2, grad2);
+                        grad3 = _mm_andnot_ps(clipped3, grad3);
 
-                    _mm_storeu_ps(&gradients_[i + 0], grad0);
-                    _mm_storeu_ps(&gradients_[i + 4], grad1);
-                    _mm_storeu_ps(&gradients_[i + 8], grad2);
-                    _mm_storeu_ps(&gradients_[i + 12], grad3);
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 0], grad0);
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 4], grad1);
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 8], grad2);
+                        _mm_storeu_ps(&gradients_[batch_offset + i + 12], grad3);
 
-                    const int clipped_mask =
-                        (_mm_movemask_ps(clipped0) << 0)
-                        | (_mm_movemask_ps(clipped1) << 4)
-                        | (_mm_movemask_ps(clipped2) << 8)
-                        | (_mm_movemask_ps(clipped3) << 12);
+                        const int clipped_mask =
+                            (_mm_movemask_ps(clipped0) << 0)
+                            | (_mm_movemask_ps(clipped1) << 4)
+                            | (_mm_movemask_ps(clipped2) << 8)
+                            | (_mm_movemask_ps(clipped3) << 12);
 
-                    num_clipped_ += popcount(clipped_mask);
+                        thread_stat_state.num_clipped_ += popcount(clipped_mask);
+                    }
                 }
             }
 
 #else
 
-            for (IndexType b = 0; b < batch_->size(); ++b) {
+            for (IndexType b = offset; b < offset + count; ++b) {
                 const IndexType batch_offset = kOutputDimensions * b;
                 for (IndexType i = 0; i < kOutputDimensions; ++i) {
                     const IndexType index = batch_offset + i;
                     const bool clipped = (output_[index] <= kZero) | (output_[index] >= kOne);
                     gradients_[index] = gradients[index] * !clipped;
-                    num_clipped_ += clipped;
+                    thread_stat_state.num_clipped_ += clipped;
                 }
             }
 
 #endif
 
-            num_total_ += batch_->size() * kOutputDimensions;
+            thread_stat_state.num_total_ += count * kOutputDimensions;
+
+#if defined(USE_BLAS)
+
+            for (IndexType b = offset; b < offset + count; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType c = 0; c < 2; ++c) {
+                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
+                    cblas_saxpy(
+                        kHalfDimensions, 1.0,
+                        &gradients_[output_offset], 1, thread_bias_state.biases_diff_, 1
+                    );
+                }
+            }
+
+#else
+
+            for (IndexType b = offset; b < offset + count; ++b) {
+                const IndexType batch_offset = kOutputDimensions * b;
+                for (IndexType c = 0; c < 2; ++c) {
+                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
+                    Blas::saxpy(
+                        kHalfDimensions, 1.0,
+                        &gradients_[output_offset], 1, thread_bias_state.biases_diff_, 1
+                    );
+                }
+            }
+
+#endif
+        }
+
+        void reduce_thread_stat_state()
+        {
+            for (IndexType i = 1; i < thread_stat_states_.size(); ++i)
+            {
+                thread_stat_states_[0] += thread_stat_states_[i];
+            }
+        }
+
+        void reduce_thread_bias_state()
+        {
+            for (IndexType i = 1; i < thread_bias_states_.size(); ++i)
+            {
+                thread_bias_states_[0] += thread_bias_states_[i];
+            }
+        }
+
+        void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) {
+
+            const LearnFloatType local_learning_rate =
+                learning_rate * learning_rate_scale_;
 
             // Since the weight matrix updates only the columns corresponding to the features that appeared in the input,
             // Correct the learning rate and adjust the scale without using momentum
             const LearnFloatType effective_learning_rate =
                 static_cast<LearnFloatType>(local_learning_rate / (1.0 - momentum_));
 
+            reduce_thread_bias_state();
+
+            auto& main_thread_state = thread_bias_states_[0];
+
 #if defined(USE_BLAS)
 
-            cblas_sscal(
-                kHalfDimensions, momentum_, biases_diff_, 1
-            );
-
-            for (IndexType b = 0; b < batch_->size(); ++b) {
-                const IndexType batch_offset = kOutputDimensions * b;
-                for (IndexType c = 0; c < 2; ++c) {
-                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                    cblas_saxpy(
-                        kHalfDimensions, 1.0,
-                        &gradients_[output_offset], 1, biases_diff_, 1
-                    );
-                }
-            }
-
             cblas_saxpy(
                 kHalfDimensions, -local_learning_rate,
-                biases_diff_, 1, biases_, 1
+                main_thread_state.biases_diff_, 1, biases_, 1
             );
 
 #else
 
-            Blas::sscal(
-                thread_pool,
-                kHalfDimensions, momentum_, biases_diff_, 1
-            );
-
-            for (IndexType b = 0; b < batch_->size(); ++b) {
-                const IndexType batch_offset = kOutputDimensions * b;
-                for (IndexType c = 0; c < 2; ++c) {
-                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                    Blas::saxpy(
-                        thread_pool,
-                        kHalfDimensions, 1.0,
-                        &gradients_[output_offset], 1, biases_diff_, 1
-                    );
-                }
-            }
-
             Blas::saxpy(
-                thread_pool,
                 kHalfDimensions, -local_learning_rate,
-                biases_diff_, 1, biases_, 1
+                main_thread_state.biases_diff_, 1, biases_, 1
             );
 
 #endif
@@ -464,7 +518,6 @@ namespace Eval::NNUE {
             target_layer_(target_layer),
             biases_(),
             weights_(),
-            biases_diff_(),
             momentum_(0.2),
             learning_rate_scale_(1.0) {
 
@@ -502,16 +555,8 @@ namespace Eval::NNUE {
         }
 
         void reset_stats() {
-            min_pre_activation_ = std::numeric_limits<LearnFloatType>::max();
-            max_pre_activation_ = std::numeric_limits<LearnFloatType>::lowest();
-
-            std::fill(std::begin(min_activations_), std::end(min_activations_),
-                      std::numeric_limits<LearnFloatType>::max());
-            std::fill(std::begin(max_activations_), std::end(max_activations_),
-                      std::numeric_limits<LearnFloatType>::lowest());
-
-            num_clipped_ = 0;
-            num_total_ = 0;
+            for (auto& state : thread_stat_states_)
+                state.reset();
         }
 
         // read parameterized integer
@@ -528,9 +573,10 @@ namespace Eval::NNUE {
                     target_layer_->weights_[i] / kWeightScale);
             }
 
-            std::fill(std::begin(biases_diff_), std::end(biases_diff_), +kZero);
-
             reset_stats();
+
+            for (auto& state : thread_bias_states_)
+                state.reset();
         }
 
         // Set the weight corresponding to the feature that does not appear in the learning data to 0
@@ -552,10 +598,14 @@ namespace Eval::NNUE {
                 std::numeric_limits<typename LayerType::WeightType>::max() /
                 kWeightScale;
 
+            reduce_thread_stat_state();
+
+            auto& main_thread_state = thread_stat_states_[0];
+
             const auto largest_min_activation = *std::max_element(
-                std::begin(min_activations_), std::end(min_activations_));
+                std::begin(main_thread_state.min_activations_), std::end(main_thread_state.min_activations_));
             const auto smallest_max_activation = *std::min_element(
-                std::begin(max_activations_), std::end(max_activations_));
+                std::begin(main_thread_state.max_activations_), std::end(main_thread_state.max_activations_));
 
             double abs_bias_sum = 0.0;
             double abs_weight_sum = 0.0;
@@ -578,8 +628,8 @@ namespace Eval::NNUE {
                 << std::endl;
 
             out << "  - (min, max) of pre-activations = "
-                << min_pre_activation_ << ", "
-                << max_pre_activation_ << " (limit = "
+                << main_thread_state.min_pre_activation_ << ", "
+                << main_thread_state.max_pre_activation_ << " (limit = "
                 << kPreActivationLimit << ")"
                 << std::endl;
 
@@ -590,7 +640,7 @@ namespace Eval::NNUE {
             out << "  - avg_abs_bias   = " << abs_bias_sum / std::size(biases_) << std::endl;
             out << "  - avg_abs_weight = " << abs_weight_sum / std::size(weights_) << std::endl;
 
-            out << "  - clipped " << static_cast<double>(num_clipped_) / num_total_ * 100.0 << "% of outputs"
+            out << "  - clipped " << static_cast<double>(main_thread_state.num_clipped_) / main_thread_state.num_total_ * 100.0 << "% of outputs"
                 << std::endl;
 
             out.unlock();
@@ -620,7 +670,6 @@ namespace Eval::NNUE {
         // layer to learn
         LayerType* const target_layer_;
 
-        IndexType num_clipped_;
         IndexType num_total_;
 
         // parameter
@@ -629,7 +678,6 @@ namespace Eval::NNUE {
             LearnFloatType weights_[kHalfDimensions * kInputDimensions];
 
         // Buffer used for updating parameters
-        alignas(kCacheLineSize) LearnFloatType biases_diff_[kHalfDimensions];
         std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
 
         // Forward propagation buffer
@@ -643,11 +691,73 @@ namespace Eval::NNUE {
         LearnFloatType momentum_;
         LearnFloatType learning_rate_scale_;
 
-        // Health check statistics
-        LearnFloatType min_pre_activation_;
-        LearnFloatType max_pre_activation_;
-        alignas(kCacheLineSize) LearnFloatType min_activations_[kHalfDimensions];
-        alignas(kCacheLineSize) LearnFloatType max_activations_[kHalfDimensions];
+        struct alignas(kCacheLineSize) ThreadStatState
+        {
+            alignas(kCacheLineSize) LearnFloatType min_activations_[kHalfDimensions];
+            alignas(kCacheLineSize) LearnFloatType max_activations_[kHalfDimensions];
+            LearnFloatType min_pre_activation_;
+            LearnFloatType max_pre_activation_;
+            IndexType num_clipped_;
+            IndexType num_total_;
+
+            ThreadStatState() { reset(); }
+
+            ThreadStatState& operator+=(const ThreadStatState& other)
+            {
+                for (IndexType i = 0; i < kHalfDimensions; ++i)
+                {
+                    min_activations_[i] = std::min(min_activations_[i], other.min_activations_[i]);
+                }
+
+                for (IndexType i = 0; i < kHalfDimensions; ++i)
+                {
+                    max_activations_[i] = std::max(max_activations_[i], other.max_activations_[i]);
+                }
+
+                min_pre_activation_ = std::min(min_pre_activation_, other.min_pre_activation_);
+                max_pre_activation_ = std::max(max_pre_activation_, other.max_pre_activation_);
+
+                num_clipped_ += other.num_clipped_;
+                num_total_ += other.num_total_;
+
+                return *this;
+            }
+
+            void reset()
+            {
+                std::fill(std::begin(min_activations_), std::end(min_activations_), std::numeric_limits<float>::max());
+                std::fill(std::begin(max_activations_), std::end(max_activations_), std::numeric_limits<float>::lowest());
+                min_pre_activation_ = std::numeric_limits<float>::max();
+                max_pre_activation_ = std::numeric_limits<float>::lowest();
+                num_clipped_ = 0;
+                num_total_ = 0;
+            }
+        };
+
+        struct alignas(kCacheLineSize) ThreadBiasState
+        {
+            alignas(kCacheLineSize) LearnFloatType biases_diff_[kHalfDimensions];
+
+            ThreadBiasState() { reset(); }
+
+            ThreadBiasState& operator+=(const ThreadBiasState& other)
+            {
+                for (IndexType i = 0; i < kHalfDimensions; ++i)
+                {
+                    biases_diff_[i] += other.biases_diff_[i];
+                }
+
+                return *this;
+            }
+
+            void reset()
+            {
+                std::fill(std::begin(biases_diff_), std::end(biases_diff_), 0.0f);
+            }
+        };
+
+        std::vector<ThreadStatState, CacheLineAlignedAllocator<ThreadStatState>> thread_stat_states_;
+        std::vector<ThreadBiasState, CacheLineAlignedAllocator<ThreadBiasState>> thread_bias_states_;
     };
 
 }  // namespace Eval::NNUE

From 1c8495b54b7b5c52d33492f458b829f18fe61460 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 25 Nov 2020 20:37:38 +0100
Subject: [PATCH 468/583] Remove handwritten saxpy because compilers optimize
 the second look anyway.

---
 src/extra/stockfish_blas.cpp                  | 45 +------------------
 .../trainer/trainer_feature_transformer.h     |  6 +--
 2 files changed, 5 insertions(+), 46 deletions(-)

diff --git a/src/extra/stockfish_blas.cpp b/src/extra/stockfish_blas.cpp
index 109a4b44..2bf28b8f 100644
--- a/src/extra/stockfish_blas.cpp
+++ b/src/extra/stockfish_blas.cpp
@@ -178,53 +178,11 @@ namespace Blas {
     )
     {
 
-#if defined (USE_SSE2)
-
-        const __m128 alpha4 = _mm_set1_ps(alpha);
-
-        int i = 0;
-        for(; i < N - 15; i += 16)
-        {
-            __m128 x0 = _mm_loadu_ps(X + i +  0);
-            __m128 x1 = _mm_loadu_ps(X + i +  4);
-            __m128 x2 = _mm_loadu_ps(X + i +  8);
-            __m128 x3 = _mm_loadu_ps(X + i + 12);
-
-            __m128 y0 = _mm_loadu_ps(Y + i +  0);
-            __m128 y1 = _mm_loadu_ps(Y + i +  4);
-            __m128 y2 = _mm_loadu_ps(Y + i +  8);
-            __m128 y3 = _mm_loadu_ps(Y + i + 12);
-
-            x0 = _mm_mul_ps(x0, alpha4);
-            x1 = _mm_mul_ps(x1, alpha4);
-            x2 = _mm_mul_ps(x2, alpha4);
-            x3 = _mm_mul_ps(x3, alpha4);
-
-            x0 = _mm_add_ps(x0, y0);
-            x1 = _mm_add_ps(x1, y1);
-            x2 = _mm_add_ps(x2, y2);
-            x3 = _mm_add_ps(x3, y3);
-
-            _mm_storeu_ps(Y + i +  0, x0);
-            _mm_storeu_ps(Y + i +  4, x1);
-            _mm_storeu_ps(Y + i +  8, x2);
-            _mm_storeu_ps(Y + i + 12, x3);
-        }
-
-        for(; i < N; ++i)
-        {
-            Y[i] += X[i] * alpha;
-        }
-
-#else
-
         for(int i = 0; i < N; ++i)
         {
             Y[i] += X[i] * alpha;
         }
 
-#endif
-
     }
 
     void saxpy(
@@ -564,7 +522,8 @@ namespace Blas {
         const __m128 alpha4 = _mm_set1_ps(alpha);
         const __m128 beta4 = _mm_set1_ps(beta);
 
-        for (int m = 0; m < M - 1; m += 2)
+        int m = 0;
+        for (; m < M - 1; m += 2)
         {
             int n = 0;
             for (; n < N - 3; n += 4)
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 9686002f..78729064 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -164,7 +164,7 @@ namespace Eval::NNUE {
                         const IndexType weights_offset = kHalfDimensions * feature.get_index();
                         Blas::saxpy(
                             kHalfDimensions, (float)feature.get_count(),
-                            &weights_[weights_offset], 1, &output_[output_offset], 1
+                            &weights_[weights_offset], &output_[output_offset]
                         );
                     }
 
@@ -497,8 +497,8 @@ namespace Eval::NNUE {
 
                                 Blas::saxpy(
                                     kHalfDimensions, -scale,
-                                    &gradients_[output_offset], 1,
-                                    &weights_[weights_offset], 1
+                                    &gradients_[output_offset],
+                                    &weights_[weights_offset]
                                 );
 
 #endif

From 49b2dcb1f3db8ac8c7f9cfcf1abfcb64194ff700 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 25 Nov 2020 21:53:53 +0100
Subject: [PATCH 469/583] Preallocate memory for unique_features. Keep the
 training_features temporary buffer as a thread_local so we reuse the storage.

---
 src/nnue/evaluate_nnue_learner.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 644ac9a4..2f0a2122 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -157,8 +157,12 @@ namespace Eval::NNUE {
             active_indices[0].swap(active_indices[1]);
         }
 
+        static thread_local std::vector<TrainingFeature> s_training_features;
+        auto& training_features = s_training_features;
+
         for (const auto color : Colors) {
-            std::vector<TrainingFeature> training_features;
+            training_features.clear();
+
             for (const auto base_index : active_indices[color]) {
                 static_assert(Features::Factorizer<RawFeatures>::get_dimensions() <
                               (1 << TrainingFeature::kIndexBits), "");
@@ -169,6 +173,7 @@ namespace Eval::NNUE {
             std::sort(training_features.begin(), training_features.end());
 
             auto& unique_features = example.training_features[color];
+            unique_features.reserve(training_features.size());
             for (const auto& feature : training_features) {
                 if (!unique_features.empty() &&
                     feature.get_index() == unique_features.back().get_index()) {

From 8009973381f1064ea72e9533808166dd54a5445b Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 25 Nov 2020 22:13:11 +0100
Subject: [PATCH 470/583] Special case for alpha=1 in saxpy, slight performance
 increase.

---
 src/extra/stockfish_blas.cpp | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/extra/stockfish_blas.cpp b/src/extra/stockfish_blas.cpp
index 2bf28b8f..70b258bc 100644
--- a/src/extra/stockfish_blas.cpp
+++ b/src/extra/stockfish_blas.cpp
@@ -177,10 +177,19 @@ namespace Blas {
         float * SF_BLAS_RESTRICT Y
     )
     {
-
-        for(int i = 0; i < N; ++i)
+        if (alpha == 1.0f)
         {
-            Y[i] += X[i] * alpha;
+            for (int i = 0; i < N; ++i)
+            {
+                Y[i] += X[i];
+            }
+        }
+        else
+        {
+            for (int i = 0; i < N; ++i)
+            {
+                Y[i] += X[i] * alpha;
+            }
         }
 
     }

From e954b14196e129a6df6edf184006bbc4dff2177f Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 25 Nov 2020 22:43:42 +0100
Subject: [PATCH 471/583] Prefetch weights for feature transformer backprop to
 shared cache.

---
 src/nnue/trainer/trainer_feature_transformer.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 78729064..fa0859ed 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -138,6 +138,7 @@ namespace Eval::NNUE {
             for (IndexType b = offset; b < offset + count; ++b)
             {
                 const IndexType batch_offset = kOutputDimensions * b;
+
                 for (IndexType c = 0; c < 2; ++c) {
                     const IndexType output_offset = batch_offset + kHalfDimensions * c;
 
@@ -459,10 +460,16 @@ namespace Eval::NNUE {
 
                     for (IndexType b = 0; b < batch_->size(); ++b) {
                         const IndexType batch_offset = kOutputDimensions * b;
+
                         for (IndexType c = 0; c < 2; ++c) {
                             const IndexType output_offset = batch_offset + kHalfDimensions * c;
                             for (const auto& feature : (*batch_)[b].training_features[c]) {
                                 const IndexType feature_index = feature.get_index();
+                                const IndexType weights_offset =
+                                    kHalfDimensions * feature_index;
+#if defined (USE_SSE2)
+                                _mm_prefetch(reinterpret_cast<const char*>(&weights_[weights_offset]), _MM_HINT_T2);
+#endif
 
                                 // We assign each bucket a continuous range of bits at least
                                 // of cache line size to prevent false sharing.
@@ -479,9 +486,6 @@ namespace Eval::NNUE {
                                 // (even a different cache line)
                                 observed_features.set(feature_index);
 
-                                const IndexType weights_offset =
-                                    kHalfDimensions * feature_index;
-
                                 const auto scale = static_cast<LearnFloatType>(
                                     effective_learning_rate / feature.get_count());
 

From 0bee8fef64f955f662386fc28cdde9da7536fd8e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 25 Nov 2020 22:59:34 +0100
Subject: [PATCH 472/583] Don't unnecessarily copy the batch part.

---
 src/nnue/evaluate_nnue_learner.cpp            | 25 +++++++++--------
 src/nnue/trainer/trainer_affine_transform.h   | 14 ++++++----
 src/nnue/trainer/trainer_clipped_relu.h       | 14 ++++++----
 .../trainer/trainer_feature_transformer.h     | 23 +++++++++------
 src/nnue/trainer/trainer_input_slice.h        | 28 +++++++++++--------
 5 files changed, 61 insertions(+), 43 deletions(-)

diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 2f0a2122..24ad2732 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -215,27 +215,28 @@ namespace Eval::NNUE {
         std::vector<double> gradient_norm_local(thread_pool.size(), 0.0);
 
         while (examples.size() >= batch_size) {
-            std::vector<Example> batch(examples.end() - batch_size, examples.end());
-            examples.resize(examples.size() - batch_size);
-
-            const auto network_output = trainer->step_start(thread_pool, batch);
-            std::vector<LearnFloatType> gradients(batch.size());
+            auto batch_begin = examples.end() - batch_size;
+            auto batch_end = examples.end();
+            auto size = batch_end - batch_begin;
+            const auto network_output = trainer->step_start(thread_pool, batch_begin, batch_end);
+            std::vector<LearnFloatType> gradients(size);
 
             thread_pool.for_each_index_chunk_with_workers(
-                std::size_t(0), batch.size(),
+                std::size_t(0), size,
                 [&](Thread& th, std::size_t offset, std::size_t count) {
                     const auto thread_id = th.thread_idx();
 
                     trainer->propagate(th, offset, count);
 
                     for (std::size_t b = offset; b < offset + count; ++b) {
+                        const auto& e = *(batch_begin + b);
                         const auto shallow = static_cast<Value>(round<std::int32_t>(
-                            batch[b].sign * network_output[b] * kPonanzaConstant));
-                        const auto discrete = batch[b].sign * batch[b].discrete_nn_eval;
-                        const auto& psv = batch[b].psv;
+                            e.sign * network_output[b] * kPonanzaConstant));
+                        const auto discrete = e.sign * e.discrete_nn_eval;
+                        const auto& psv = e.psv;
                         const double gradient =
-                            batch[b].sign * calc_grad(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
-                        gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
+                            e.sign * calc_grad(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
+                        gradients[b] = static_cast<LearnFloatType>(gradient * e.weight);
 
 
                         // The discrete eval will only be valid before first backpropagation,
@@ -256,6 +257,8 @@ namespace Eval::NNUE {
 
             trainer->step_end(thread_pool, learning_rate);
 
+            examples.resize(examples.size() - size);
+
             collect_stats = false;
         }
 
diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index f66f1a65..b6d70aa4 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -91,11 +91,13 @@ namespace Eval::NNUE {
             quantize_parameters();
         }
 
-        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch)
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
         {
-            if (output_.size() < kOutputDimensions * combined_batch.size()) {
-                output_.resize(kOutputDimensions * combined_batch.size());
-                gradients_.resize(kInputDimensions * combined_batch.size());
+            const auto size = batch_end - batch_begin;
+
+            if (output_.size() < kOutputDimensions * size) {
+                output_.resize(kOutputDimensions * size);
+                gradients_.resize(kInputDimensions * size);
             }
 
             if (thread_states_.size() < thread_pool.size())
@@ -103,8 +105,8 @@ namespace Eval::NNUE {
                 thread_states_.resize(thread_pool.size());
             }
 
-            combined_batch_size_ = static_cast<IndexType>(combined_batch.size());
-            combined_batch_input_ = previous_layer_trainer_->step_start(thread_pool, combined_batch);
+            combined_batch_size_ = size;
+            combined_batch_input_ = previous_layer_trainer_->step_start(thread_pool, batch_begin, batch_end);
 
             auto& main_thread_state = thread_states_[0];
 
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index e4bcecaf..eae35df6 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -42,11 +42,13 @@ namespace Eval::NNUE {
             previous_layer_trainer_->initialize(rng);
         }
 
-        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch)
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
         {
-            if (output_.size() < kOutputDimensions * combined_batch.size()) {
-              output_.resize(kOutputDimensions * combined_batch.size());
-              gradients_.resize(kInputDimensions * combined_batch.size());
+            const auto size = batch_end - batch_begin;
+
+            if (output_.size() < kOutputDimensions * size) {
+              output_.resize(kOutputDimensions * size);
+              gradients_.resize(kInputDimensions * size);
             }
 
             if (thread_states_.size() < thread_pool.size())
@@ -54,9 +56,9 @@ namespace Eval::NNUE {
                 thread_states_.resize(thread_pool.size());
             }
 
-            input_ = previous_layer_trainer_->step_start(thread_pool, combined_batch);
+            input_ = previous_layer_trainer_->step_start(thread_pool, batch_begin, batch_end);
 
-            batch_size_ = static_cast<IndexType>(combined_batch.size());
+            batch_size_ = size;
 
             return output_.data();
         }
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index fa0859ed..65766b05 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -89,11 +89,13 @@ namespace Eval::NNUE {
             quantize_parameters();
         }
 
-        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch)
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
         {
-            if (output_.size() < kOutputDimensions * combined_batch.size()) {
-                output_.resize(kOutputDimensions * combined_batch.size());
-                gradients_.resize(kOutputDimensions * combined_batch.size());
+            const auto size = batch_end - batch_begin;
+
+            if (output_.size() < kOutputDimensions * size) {
+                output_.resize(kOutputDimensions * size);
+                gradients_.resize(kOutputDimensions * size);
             }
 
             if (thread_stat_states_.size() < thread_pool.size())
@@ -106,7 +108,8 @@ namespace Eval::NNUE {
                 thread_bias_states_.resize(thread_pool.size());
             }
 
-            batch_ = &combined_batch;
+            batch_ = &*batch_begin;
+            batch_size_ = size;
 
             auto& main_thread_bias_state = thread_bias_states_[0];
 
@@ -161,7 +164,7 @@ namespace Eval::NNUE {
                     Blas::scopy(
                         kHalfDimensions, biases_, 1, &output_[output_offset], 1
                     );
-                    for (const auto& feature : (*batch_)[b].training_features[c]) {
+                    for (const auto& feature : batch_[b].training_features[c]) {
                         const IndexType weights_offset = kHalfDimensions * feature.get_index();
                         Blas::saxpy(
                             kHalfDimensions, (float)feature.get_count(),
@@ -458,12 +461,12 @@ namespace Eval::NNUE {
                 [&, num_threads = thread_pool.size()](Thread& th) {
                     const auto thread_index = th.thread_idx();
 
-                    for (IndexType b = 0; b < batch_->size(); ++b) {
+                    for (IndexType b = 0; b < batch_size_; ++b) {
                         const IndexType batch_offset = kOutputDimensions * b;
 
                         for (IndexType c = 0; c < 2; ++c) {
                             const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                            for (const auto& feature : (*batch_)[b].training_features[c]) {
+                            for (const auto& feature : batch_[b].training_features[c]) {
                                 const IndexType feature_index = feature.get_index();
                                 const IndexType weights_offset =
                                     kHalfDimensions * feature_index;
@@ -519,6 +522,7 @@ namespace Eval::NNUE {
         // constructor
         Trainer(LayerType* target_layer) :
             batch_(nullptr),
+            batch_size_(0),
             target_layer_(target_layer),
             biases_(),
             weights_(),
@@ -669,7 +673,8 @@ namespace Eval::NNUE {
         static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
 
         // mini batch
-        const std::vector<Example>* batch_;
+        const Example* batch_;
+        IndexType batch_size_;
 
         // layer to learn
         LayerType* const target_layer_;
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index 54f03d42..ad681d57 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -63,9 +63,12 @@ namespace Eval::NNUE {
             }
         }
 
-        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch) {
-            if (gradients_.size() < kInputDimensions * combined_batch.size()) {
-                gradients_.resize(kInputDimensions * combined_batch.size());
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
+        {
+            const auto size = batch_end - batch_begin;
+            
+            if (gradients_.size() < kInputDimensions * size) {
+                gradients_.resize(kInputDimensions * size);
             }
 
             if (num_calls_.size() < thread_pool.size())
@@ -73,11 +76,11 @@ namespace Eval::NNUE {
                 num_calls_.resize(thread_pool.size(), 0);
             }
 
-            batch_size_ = static_cast<IndexType>(combined_batch.size());
+            batch_size_ = size;
 
             if (num_calls_[0] == 0) {
                 current_operation_ = Operation::kStepStart;
-                output_ = feature_transformer_trainer_->step_start(thread_pool, combined_batch);
+                output_ = feature_transformer_trainer_->step_start(thread_pool, batch_begin, batch_end);
             }
 
             assert(current_operation_ == Operation::kStepStart);
@@ -237,15 +240,18 @@ namespace Eval::NNUE {
             shared_input_trainer_->initialize(rng);
         }
 
-        const LearnFloatType* step_start(ThreadPool& thread_pool, const std::vector<Example>& combined_batch) {
-            if (output_.size() < kOutputDimensions * combined_batch.size()) {
-              output_.resize(kOutputDimensions * combined_batch.size());
-              gradients_.resize(kInputDimensions * combined_batch.size());
+        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
+        {
+            const auto size = batch_end - batch_begin;
+
+            if (output_.size() < kOutputDimensions * size) {
+              output_.resize(kOutputDimensions * size);
+              gradients_.resize(kInputDimensions * size);
             }
 
-            batch_size_ = static_cast<IndexType>(combined_batch.size());
+            batch_size_ = size;
 
-            input_ = shared_input_trainer_->step_start(thread_pool, combined_batch);
+            input_ = shared_input_trainer_->step_start(thread_pool, batch_begin, batch_end);
 
             return output_.data();
         }

From 34510dd08a611762b9d826f9cd72cda72ad0ee13 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 25 Nov 2020 23:12:02 +0100
Subject: [PATCH 473/583] Remove used examples asyncronously.

---
 src/nnue/evaluate_nnue_learner.cpp | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 24ad2732..4a1a163d 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -214,9 +214,10 @@ namespace Eval::NNUE {
         std::vector<double> abs_discrete_eval_sum_local(thread_pool.size(), 0.0);
         std::vector<double> gradient_norm_local(thread_pool.size(), 0.0);
 
-        while (examples.size() >= batch_size) {
-            auto batch_begin = examples.end() - batch_size;
-            auto batch_end = examples.end();
+        auto prev_batch_begin = examples.end();
+        while (prev_batch_begin - examples.begin() >= batch_size) {
+            auto batch_begin = prev_batch_begin - batch_size;
+            auto batch_end = prev_batch_begin;
             auto size = batch_end - batch_begin;
             const auto network_output = trainer->step_start(thread_pool, batch_begin, batch_end);
             std::vector<LearnFloatType> gradients(size);
@@ -253,14 +254,20 @@ namespace Eval::NNUE {
                     trainer->backpropagate(th, gradients.data(), offset, count);
                 }
             );
+
+            // We can asyncronously erase the examples that we used in the previous
+            // step. This can be done safely because we're no longer using these
+            // examples and erase won't invalidate iterators.
+            examples.erase(prev_batch_begin, examples.end());
+            prev_batch_begin = batch_begin;
+
             thread_pool.wait_for_workers_finished();
 
             trainer->step_end(thread_pool, learning_rate);
 
-            examples.resize(examples.size() - size);
-
             collect_stats = false;
         }
+        examples.erase(prev_batch_begin, examples.end());
 
         if (verbose)
         {

From 622e0b14c280e89dd338064bce5d4e9c56eb0875 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 25 Nov 2020 23:13:06 +0100
Subject: [PATCH 474/583] Remove superfluous example shuffling. Shuffling now
 only happens on reading.

---
 src/nnue/evaluate_nnue_learner.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 4a1a163d..78446af2 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -202,7 +202,6 @@ namespace Eval::NNUE {
         learning_rate /= batch_size;
 
         std::lock_guard<std::mutex> lock(examples_mutex);
-        std::shuffle(examples.begin(), examples.end(), rng);
 
         double abs_eval_diff_sum = 0.0;
         double abs_discrete_eval_sum = 0.0;

From a97b65eaef85fc524e5455d2d78d36ca9675b08f Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 29 Nov 2020 11:04:10 +0100
Subject: [PATCH 475/583] Fix compilation error with USE_BLAS

---
 src/nnue/trainer/trainer_feature_transformer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 65766b05..877a74bc 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -151,7 +151,7 @@ namespace Eval::NNUE {
                         kHalfDimensions, biases_, 1, &output_[output_offset], 1
                     );
 
-                    for (const auto& feature : (*batch_)[b].training_features[c]) {
+                    for (const auto& feature : batch_[b].training_features[c]) {
                         const IndexType weights_offset = kHalfDimensions * feature.get_index();
                         cblas_saxpy(
                             kHalfDimensions, (float)feature.get_count(),

From 2aa7f5290e89db930dfbd038a6848da3ce43352d Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 29 Nov 2020 17:31:57 +0100
Subject: [PATCH 476/583] Fix comparison of integers with different signedness.

---
 src/nnue/evaluate_nnue_learner.cpp             | 2 +-
 src/nnue/trainer/trainer_affine_transform.h    | 2 +-
 src/nnue/trainer/trainer_clipped_relu.h        | 2 +-
 src/nnue/trainer/trainer_feature_transformer.h | 2 +-
 src/nnue/trainer/trainer_input_slice.h         | 6 +++---
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 78446af2..6e0572dd 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -214,7 +214,7 @@ namespace Eval::NNUE {
         std::vector<double> gradient_norm_local(thread_pool.size(), 0.0);
 
         auto prev_batch_begin = examples.end();
-        while (prev_batch_begin - examples.begin() >= batch_size) {
+        while ((long)(prev_batch_begin - examples.begin()) >= (long)batch_size) {
             auto batch_begin = prev_batch_begin - batch_size;
             auto batch_end = prev_batch_begin;
             auto size = batch_end - batch_begin;
diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
index b6d70aa4..53e8f904 100644
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ b/src/nnue/trainer/trainer_affine_transform.h
@@ -95,7 +95,7 @@ namespace Eval::NNUE {
         {
             const auto size = batch_end - batch_begin;
 
-            if (output_.size() < kOutputDimensions * size) {
+            if ((long)output_.size() < (long)kOutputDimensions * size) {
                 output_.resize(kOutputDimensions * size);
                 gradients_.resize(kInputDimensions * size);
             }
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index eae35df6..ff883afc 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -46,7 +46,7 @@ namespace Eval::NNUE {
         {
             const auto size = batch_end - batch_begin;
 
-            if (output_.size() < kOutputDimensions * size) {
+            if ((long)output_.size() < (long)kOutputDimensions * size) {
               output_.resize(kOutputDimensions * size);
               gradients_.resize(kInputDimensions * size);
             }
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 877a74bc..9afda728 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -93,7 +93,7 @@ namespace Eval::NNUE {
         {
             const auto size = batch_end - batch_begin;
 
-            if (output_.size() < kOutputDimensions * size) {
+            if ((long)output_.size() < (long)kOutputDimensions * size) {
                 output_.resize(kOutputDimensions * size);
                 gradients_.resize(kOutputDimensions * size);
             }
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index ad681d57..a94cae93 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -66,8 +66,8 @@ namespace Eval::NNUE {
         const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
         {
             const auto size = batch_end - batch_begin;
-            
-            if (gradients_.size() < kInputDimensions * size) {
+
+            if ((long)gradients_.size() < (long)kInputDimensions * size) {
                 gradients_.resize(kInputDimensions * size);
             }
 
@@ -244,7 +244,7 @@ namespace Eval::NNUE {
         {
             const auto size = batch_end - batch_begin;
 
-            if (output_.size() < kOutputDimensions * size) {
+            if ((long)output_.size() < (long)kOutputDimensions * size) {
               output_.resize(kOutputDimensions * size);
               gradients_.resize(kInputDimensions * size);
             }

From 1322a9a5fd5bc0d085c584237d0f4b70b7b4d56e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 29 Nov 2020 21:26:06 +0100
Subject: [PATCH 477/583] Prevent false sharing of num_calls counter in the
 shared input trainer. Fix current_operation not being local to the executing
 thread.

---
 src/nnue/trainer/trainer_input_slice.h | 117 +++++++++++++++----------
 1 file changed, 73 insertions(+), 44 deletions(-)

diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index a94cae93..62a761a7 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -15,6 +15,19 @@
 namespace Eval::NNUE {
 
     // Learning: Input layer
+    // This is tricky. It exists because when there's more than one trainer
+    // on top of a single feature transformer we want to only call propagate/backpropagate
+    // on the feature transformer once. This is straightforward in the old
+    // multithreading case, because propagate/backpropagate is called just once from the
+    // main thread. But with the current implementation of coarser multithreading
+    // we end up calling each method from each thread. Therefore we have to keep
+    // the num_calls and current_operation per thread basis, each thread must work
+    // on its designated batch slice, and the only synchronization points are
+    // step_start and step_end - for which we use state of the first thread.
+    // Each thread requires their own bookkeeping because it's possible that
+    // one thread is still in propagate of some batch slice while the other thread
+    // is doing backpropagate of some other slice. We also ensure the thread state
+    // isn't suspectible to false sharing by using a full cache line for the state.
     class SharedInputTrainer {
     public:
         // factory function
@@ -34,32 +47,36 @@ namespace Eval::NNUE {
 
         // Set options such as hyperparameters
         void send_message(Message* message) {
-            if (num_calls_[0] == 0) {
-                current_operation_ = Operation::kSendMessage;
+            auto& thread_state = thread_states_[0];
+
+            if (thread_state.num_calls == 0) {
+                thread_state.current_operation = Operation::kSendMessage;
                 feature_transformer_trainer_->send_message(message);
             }
 
-            assert(current_operation_ == Operation::kSendMessage);
+            assert(thread_state.current_operation == Operation::kSendMessage);
 
-            if (++num_calls_[0] == num_referrers_) {
-                num_calls_[0] = 0;
-                current_operation_ = Operation::kNone;
+            if (++thread_state.num_calls == num_referrers_) {
+                thread_state.num_calls = 0;
+                thread_state.current_operation = Operation::kNone;
             }
         }
 
         // Initialize the parameters with random numbers
         template <typename RNG>
         void initialize(RNG& rng) {
-            if (num_calls_[0] == 0) {
-                current_operation_ = Operation::kInitialize;
+            auto& thread_state = thread_states_[0];
+
+            if (thread_state.num_calls == 0) {
+                thread_state.current_operation = Operation::kInitialize;
                 feature_transformer_trainer_->initialize(rng);
             }
 
-            assert(current_operation_ == Operation::kInitialize);
+            assert(thread_state.current_operation == Operation::kInitialize);
 
-            if (++num_calls_[0] == num_referrers_) {
-                num_calls_[0] = 0;
-                current_operation_ = Operation::kNone;
+            if (++thread_state.num_calls == num_referrers_) {
+                thread_state.num_calls = 0;
+                thread_state.current_operation = Operation::kNone;
             }
         }
 
@@ -71,23 +88,25 @@ namespace Eval::NNUE {
                 gradients_.resize(kInputDimensions * size);
             }
 
-            if (num_calls_.size() < thread_pool.size())
+            if (thread_states_.size() < thread_pool.size())
             {
-                num_calls_.resize(thread_pool.size(), 0);
+                thread_states_.resize(thread_pool.size());
             }
 
             batch_size_ = size;
 
-            if (num_calls_[0] == 0) {
-                current_operation_ = Operation::kStepStart;
+            auto& thread_state = thread_states_[0];
+
+            if (thread_state.num_calls == 0) {
+                thread_state.current_operation = Operation::kStepStart;
                 output_ = feature_transformer_trainer_->step_start(thread_pool, batch_begin, batch_end);
             }
 
-            assert(current_operation_ == Operation::kStepStart);
+            assert(thread_state.current_operation == Operation::kStepStart);
 
-            if (++num_calls_[0] == num_referrers_) {
-                num_calls_[0] = 0;
-                current_operation_ = Operation::kNone;
+            if (++thread_state.num_calls == num_referrers_) {
+                thread_state.num_calls = 0;
+                thread_state.current_operation = Operation::kNone;
             }
 
             return output_;
@@ -97,16 +116,18 @@ namespace Eval::NNUE {
         void propagate(Thread& th, uint64_t offset, uint64_t count) {
             const auto thread_id = th.thread_idx();
 
-            if (num_calls_[thread_id] == 0) {
-                current_operation_ = Operation::kPropagate;
+            auto& thread_state = thread_states_[thread_id];
+
+            if (thread_state.num_calls == 0) {
+                thread_state.current_operation = Operation::kPropagate;
                 feature_transformer_trainer_->propagate(th, offset, count);
             }
 
-            assert(current_operation_ == Operation::kPropagate);
+            assert(thread_state.current_operation == Operation::kPropagate);
 
-            if (++num_calls_[thread_id] == num_referrers_) {
-                num_calls_[thread_id] = 0;
-                current_operation_ = Operation::kNone;
+            if (++thread_state.num_calls == num_referrers_) {
+                thread_state.num_calls = 0;
+                thread_state.current_operation = Operation::kNone;
             }
         }
 
@@ -118,13 +139,15 @@ namespace Eval::NNUE {
 
             const auto thread_id = th.thread_idx();
 
+            auto& thread_state = thread_states_[thread_id];
+
             if (num_referrers_ == 1) {
                 feature_transformer_trainer_->backpropagate(th, gradients, offset, count);
                 return;
             }
 
-            if (num_calls_[thread_id] == 0) {
-                current_operation_ = Operation::kBackPropagate;
+            if (thread_state.num_calls == 0) {
+                thread_state.current_operation = Operation::kBackPropagate;
                 for (IndexType b = offset; b < offset + count; ++b) {
                     const IndexType batch_offset = kInputDimensions * b;
                     for (IndexType i = 0; i < kInputDimensions; ++i) {
@@ -133,7 +156,7 @@ namespace Eval::NNUE {
                 }
             }
 
-            assert(current_operation_ == Operation::kBackPropagate);
+            assert(thread_state.current_operation == Operation::kBackPropagate);
 
             for (IndexType b = offset; b < offset + count; ++b) {
                 const IndexType batch_offset = kInputDimensions * b;
@@ -142,25 +165,27 @@ namespace Eval::NNUE {
                 }
             }
 
-            if (++num_calls_[thread_id] == num_referrers_) {
+            if (++thread_state.num_calls == num_referrers_) {
                 feature_transformer_trainer_->backpropagate(
                     th, gradients_.data(), offset, count);
-                num_calls_[thread_id] = 0;
-                current_operation_ = Operation::kNone;
+                thread_state.num_calls = 0;
+                thread_state.current_operation = Operation::kNone;
             }
         }
 
         void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) {
-            if (num_calls_[0] == 0) {
-                current_operation_ = Operation::kStepEnd;
+            auto& thread_state = thread_states_[0];
+
+            if (thread_state.num_calls == 0) {
+                thread_state.current_operation = Operation::kStepEnd;
                 feature_transformer_trainer_->step_end(thread_pool, learning_rate);
             }
 
-            assert(current_operation_ == Operation::kStepEnd);
+            assert(thread_state.current_operation == Operation::kStepEnd);
 
-            if (++num_calls_[0] == num_referrers_) {
-                num_calls_[0] = 0;
-                current_operation_ = Operation::kNone;
+            if (++thread_state.num_calls == num_referrers_) {
+                thread_state.num_calls = 0;
+                thread_state.current_operation = Operation::kNone;
             }
         }
 
@@ -169,8 +194,7 @@ namespace Eval::NNUE {
         SharedInputTrainer(FeatureTransformer* ft) :
             batch_size_(0),
             num_referrers_(0),
-            num_calls_(1, 0),
-            current_operation_(Operation::kNone),
+            thread_states_(1),
             feature_transformer_trainer_(Trainer<FeatureTransformer>::create(
                 ft)),
             output_(nullptr) {
@@ -197,11 +221,16 @@ namespace Eval::NNUE {
         // number of layers sharing this layer as input
         std::uint32_t num_referrers_;
 
-        // Number of times the current process has been called
-        std::vector<std::uint32_t> num_calls_;
+        struct alignas(kCacheLineSize) ThreadState
+        {
+            std::uint32_t num_calls{0};
 
-        // current processing type
-        Operation current_operation_;
+            // current processing type
+            Operation current_operation = Operation::kNone;
+        };
+
+        // Number of times the current process has been called
+        std::vector<ThreadState, CacheLineAlignedAllocator<ThreadState>> thread_states_;
 
         // Trainer of input feature converter
         const std::shared_ptr<Trainer<FeatureTransformer>>

From 6ce0245787c1111aa2d014b2d26fc8220da3adae Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 29 Nov 2020 01:37:07 +0100
Subject: [PATCH 478/583] Basic autograd

---
 src/learn/autograd.h | 350 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 350 insertions(+)
 create mode 100644 src/learn/autograd.h

diff --git a/src/learn/autograd.h b/src/learn/autograd.h
new file mode 100644
index 00000000..8a4df2ab
--- /dev/null
+++ b/src/learn/autograd.h
@@ -0,0 +1,350 @@
+#ifndef LEARNER_AUTOGRAD_H
+#define LEARNER_AUTOGRAD_H
+
+#include <cmath>
+#include <utility>
+#include <type_traits>
+#include <memory>
+#include <tuple>
+
+namespace Learner::Autograd::UnivariateStatic
+{
+
+    template <typename T>
+    struct Identity
+    {
+        using type = T;
+    };
+
+    template <typename T>
+    using Id = typename Identity<T>::type;
+
+    template <typename T, int I>
+    struct VariableParameter
+    {
+        using ValueType = T;
+
+        VariableParameter()
+        {
+        }
+
+        template <typename... ArgsTs>
+        T value(const std::tuple<ArgsTs...>& args) const
+        {
+            return std::get<I>(args);
+        }
+
+        template <typename... ArgsTs>
+        T grad(const std::tuple<ArgsTs...>&) const
+        {
+            return T(1.0);
+        }
+    };
+
+    template <typename T, int I>
+    struct ConstantParameter
+    {
+        using ValueType = T;
+
+        ConstantParameter()
+        {
+        }
+
+        template <typename... ArgsTs>
+        T value(const std::tuple<ArgsTs...>& args) const
+        {
+            return std::get<I>(args);
+        }
+
+        template <typename... ArgsTs>
+        T grad(const std::tuple<ArgsTs...>&) const
+        {
+            return T(0.0);
+        }
+    };
+
+    template <typename T>
+    struct Constant
+    {
+        using ValueType = T;
+
+        Constant(T x) :
+            m_x(std::move(x))
+        {
+        }
+
+        template <typename... ArgsTs>
+        T value(const std::tuple<ArgsTs...>&) const
+        {
+            return m_x;
+        }
+
+        template <typename... ArgsTs>
+        T grad(const std::tuple<ArgsTs...>&) const
+        {
+            return T(0.0);
+        }
+
+    private:
+        T m_x;
+    };
+
+    template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
+    struct Sum
+    {
+        using ValueType = T;
+
+        Sum(LhsT lhs, RhsT rhs) :
+            m_lhs(std::move(lhs)),
+            m_rhs(std::move(rhs))
+        {
+        }
+
+        template <typename... ArgsTs>
+        T value(const std::tuple<ArgsTs...>& args) const
+        {
+            return m_lhs.value(args) + m_rhs.value(args);
+        }
+
+        template <typename... ArgsTs>
+        T grad(const std::tuple<ArgsTs...>& args) const
+        {
+            return m_lhs.grad(args) + m_rhs.grad(args);
+        }
+
+    private:
+        LhsT m_lhs;
+        RhsT m_rhs;
+    };
+
+    template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
+    auto operator+(LhsT lhs, RhsT rhs)
+    {
+        return Sum(std::move(lhs), std::move(rhs));
+    }
+
+    template <typename LhsT, typename T = typename LhsT::ValueType>
+    auto operator+(LhsT lhs, Id<T> rhs)
+    {
+        return Sum(std::move(lhs), Constant(rhs));
+    }
+
+    template <typename RhsT, typename T = typename RhsT::ValueType>
+    auto operator+(Id<T> lhs, RhsT rhs)
+    {
+        return Sum(Constant(lhs), std::move(rhs));
+    }
+
+    template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
+    struct Difference
+    {
+        using ValueType = T;
+
+        Difference(LhsT lhs, RhsT rhs) :
+            m_lhs(std::move(lhs)),
+            m_rhs(std::move(rhs))
+        {
+        }
+
+        template <typename... ArgsTs>
+        T value(const std::tuple<ArgsTs...>& args) const
+        {
+            return m_lhs.value(args) - m_rhs.value(args);
+        }
+
+        template <typename... ArgsTs>
+        T grad(const std::tuple<ArgsTs...>& args) const
+        {
+            return m_lhs.grad(args) - m_rhs.grad(args);
+        }
+
+    private:
+        LhsT m_lhs;
+        RhsT m_rhs;
+    };
+
+    template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
+    auto operator-(LhsT lhs, RhsT rhs)
+    {
+        return Difference(std::move(lhs), std::move(rhs));
+    }
+
+    template <typename LhsT, typename T = typename LhsT::ValueType>
+    auto operator-(LhsT lhs, Id<T> rhs)
+    {
+        return Difference(std::move(lhs), Constant(rhs));
+    }
+
+    template <typename RhsT, typename T = typename RhsT::ValueType>
+    auto operator-(Id<T> lhs, RhsT rhs)
+    {
+        return Difference(Constant(lhs), std::move(rhs));
+    }
+
+    template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
+    struct Product
+    {
+        using ValueType = T;
+
+        Product(LhsT lhs, RhsT rhs) :
+            m_lhs(std::move(lhs)),
+            m_rhs(std::move(rhs))
+        {
+        }
+
+        template <typename... ArgsTs>
+        T value(const std::tuple<ArgsTs...>& args) const
+        {
+            return m_lhs.value(args) * m_rhs.value(args);
+        }
+
+        template <typename... ArgsTs>
+        T grad(const std::tuple<ArgsTs...>& args) const
+        {
+            return m_lhs.grad(args) * m_rhs.value(args) + m_lhs.value(args) * m_rhs.grad(args);
+        }
+
+    private:
+        LhsT m_lhs;
+        RhsT m_rhs;
+    };
+
+    template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
+    auto operator*(LhsT lhs, RhsT rhs)
+    {
+        return Product(std::move(lhs), std::move(rhs));
+    }
+
+    template <typename LhsT, typename T = typename LhsT::ValueType>
+    auto operator*(LhsT lhs, Id<T> rhs)
+    {
+        return Product(std::move(lhs), Constant(rhs));
+    }
+
+    template <typename RhsT, typename T = typename RhsT::ValueType>
+    auto operator*(Id<T> lhs, RhsT rhs)
+    {
+        return Product(Constant(lhs), std::move(rhs));
+    }
+
+    template <typename ArgT, typename T = typename ArgT::ValueType>
+    struct Sigmoid
+    {
+        using ValueType = T;
+
+        explicit Sigmoid(ArgT x) :
+            m_x(std::move(x))
+        {
+        }
+
+        template <typename... ArgsTs>
+        T value(const std::tuple<ArgsTs...>& args) const
+        {
+            return value_(m_x.value(args));
+        }
+
+        template <typename... ArgsTs>
+        T grad(const std::tuple<ArgsTs...>& args) const
+        {
+            return m_x.grad(args) * grad_(m_x.value(args));
+        }
+
+    private:
+        ArgT m_x;
+
+        T value_(T x) const
+        {
+            return 1.0 / (1.0 + std::exp(-x));
+        }
+
+        T grad_(T x) const
+        {
+            return value_(x) * (1.0 - value_(x));
+        }
+    };
+
+    template <typename ArgT>
+    auto sigmoid(ArgT x)
+    {
+        return Sigmoid(std::move(x));
+    }
+
+    template <typename ArgT, typename T = typename ArgT::ValueType>
+    struct Pow
+    {
+        using ValueType = T;
+
+        explicit Pow(ArgT x, Id<T> exponent) :
+            m_x(std::move(x)),
+            m_exponent(std::move(exponent))
+        {
+        }
+
+        template <typename... ArgsTs>
+        T value(const std::tuple<ArgsTs...>& args) const
+        {
+            return std::pow(m_x.value(args), m_exponent);
+        }
+
+        template <typename... ArgsTs>
+        T grad(const std::tuple<ArgsTs...>& args) const
+        {
+            return m_exponent * std::pow(m_x.value(args), m_exponent - T(1.0)) * m_x.grad(args);
+        }
+
+    private:
+        ArgT m_x;
+        T m_exponent;
+    };
+
+    template <typename ArgT, typename T = typename ArgT::ValueType>
+    auto pow(ArgT x, Id<T> exp)
+    {
+        return Pow(std::move(x), std::move(exp));
+    }
+
+    template <typename ArgT, typename T = typename ArgT::ValueType>
+    struct Log
+    {
+        using ValueType = T;
+
+        explicit Log(ArgT x) :
+            m_x(std::move(x))
+        {
+        }
+
+        template <typename... ArgsTs>
+        T value(const std::tuple<ArgsTs...>& args) const
+        {
+            return value_(m_x.value(args));
+        }
+
+        template <typename... ArgsTs>
+        T grad(const std::tuple<ArgsTs...>& args) const
+        {
+            return m_x.grad(args) * grad_(m_x.value(args));
+        }
+
+    private:
+        ArgT m_x;
+
+        T value_(T x) const
+        {
+            return std::log(x);
+        }
+
+        T grad_(T x) const
+        {
+            return 1.0 / x;
+        }
+    };
+
+    template <typename ArgT>
+    auto log(ArgT x)
+    {
+        return Log(std::move(x));
+    }
+
+}
+
+#endif
\ No newline at end of file

From 541fb8177abfafdcbe23f0f98431a56e49dbae98 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 29 Nov 2020 11:33:35 +0100
Subject: [PATCH 479/583] More utility in autograd.

---
 src/learn/autograd.h | 68 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 59 insertions(+), 9 deletions(-)

diff --git a/src/learn/autograd.h b/src/learn/autograd.h
index 8a4df2ab..0b894cc4 100644
--- a/src/learn/autograd.h
+++ b/src/learn/autograd.h
@@ -7,6 +7,44 @@
 #include <memory>
 #include <tuple>
 
+namespace Learner
+{
+    template <typename T>
+    struct ValueWithGrad
+    {
+        T value;
+        T grad;
+
+        ValueWithGrad& operator+=(const ValueWithGrad<T>& rhs)
+        {
+            value += rhs.value;
+            grad += rhs.grad;
+            return *this;
+        }
+
+        ValueWithGrad& operator-=(const ValueWithGrad<T>& rhs)
+        {
+            value -= rhs.value;
+            grad -= rhs.grad;
+            return *this;
+        }
+
+        ValueWithGrad& operator*=(T rhs)
+        {
+            value *= rhs;
+            grad *= rhs;
+            return *this;
+        }
+
+        ValueWithGrad& operator/=(T rhs)
+        {
+            value /= rhs;
+            grad /= rhs;
+            return *this;
+        }
+    };
+}
+
 namespace Learner::Autograd::UnivariateStatic
 {
 
@@ -19,8 +57,20 @@ namespace Learner::Autograd::UnivariateStatic
     template <typename T>
     using Id = typename Identity<T>::type;
 
+    template <typename T>
+    struct Evaluable
+    {
+        template <typename... ArgsTs>
+        auto eval(const std::tuple<ArgsTs...>& args) const
+        {
+            using ValueType = typename T::ValueType;
+            const T* this_ = static_cast<const T*>(this);
+            return ValueWithGrad<ValueType>{ this_->value(args), this_->grad(args) };
+        }
+    };
+
     template <typename T, int I>
-    struct VariableParameter
+    struct VariableParameter : Evaluable<VariableParameter<T, I>>
     {
         using ValueType = T;
 
@@ -42,7 +92,7 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename T, int I>
-    struct ConstantParameter
+    struct ConstantParameter : Evaluable<ConstantParameter<T, I>>
     {
         using ValueType = T;
 
@@ -64,7 +114,7 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename T>
-    struct Constant
+    struct Constant : Evaluable<Constant<T>>
     {
         using ValueType = T;
 
@@ -90,7 +140,7 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
-    struct Sum
+    struct Sum : Evaluable<Sum<LhsT, RhsT, T>>
     {
         using ValueType = T;
 
@@ -136,7 +186,7 @@ namespace Learner::Autograd::UnivariateStatic
     }
 
     template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
-    struct Difference
+    struct Difference : Evaluable<Difference<LhsT, RhsT, T>>
     {
         using ValueType = T;
 
@@ -182,7 +232,7 @@ namespace Learner::Autograd::UnivariateStatic
     }
 
     template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
-    struct Product
+    struct Product : Evaluable<Product<LhsT, RhsT, T>>
     {
         using ValueType = T;
 
@@ -228,7 +278,7 @@ namespace Learner::Autograd::UnivariateStatic
     }
 
     template <typename ArgT, typename T = typename ArgT::ValueType>
-    struct Sigmoid
+    struct Sigmoid : Evaluable<Sigmoid<ArgT, T>>
     {
         using ValueType = T;
 
@@ -270,7 +320,7 @@ namespace Learner::Autograd::UnivariateStatic
     }
 
     template <typename ArgT, typename T = typename ArgT::ValueType>
-    struct Pow
+    struct Pow : Evaluable<Pow<ArgT, T>>
     {
         using ValueType = T;
 
@@ -304,7 +354,7 @@ namespace Learner::Autograd::UnivariateStatic
     }
 
     template <typename ArgT, typename T = typename ArgT::ValueType>
-    struct Log
+    struct Log : Evaluable<Log<ArgT, T>>
     {
         using ValueType = T;
 

From 5a58eb803a2c1b11808c96a1d8eb9c58a01d4791 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 29 Nov 2020 11:55:00 +0100
Subject: [PATCH 480/583] Loss func with autograd

---
 src/learn/learn.cpp | 19 +++++++++++++++++--
 src/learn/learn.h   |  2 ++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index f7358f8e..411cee08 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -19,6 +19,7 @@
 
 #include "learn.h"
 
+#include "autograd.h"
 #include "sfen_reader.h"
 
 #include "misc.h"
@@ -320,6 +321,20 @@ namespace Learner
         return std::clamp(grad, -max_grad, max_grad);
     }
 
+    static ValueWithGrad<double> get_loss(Value shallow, Value teacher_signal, int result, int ply)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        auto q_ = sigmoid(VariableParameter<double, 0>{} * winning_probability_coefficient);
+        auto p_ = sigmoid(ConstantParameter<double, 1>{} * winning_probability_coefficient);
+        auto t_ = (ConstantParameter<double, 2>{} + 1.0) * 0.5;
+        auto lambda_ = ConstantParameter<double, 3>{};
+        auto loss_ = pow(lambda_ * (q_ - p_) + (1.0 - lambda_) * (q_ - t_), 2.0);
+
+        auto args = std::tuple((double)shallow, (double)teacher_signal, (double)result, calculate_lambda(teacher_signal));
+        return loss_.eval(args);
+    }
+
     // Calculate cross entropy during learning
     // The individual cross entropy of the win/loss term and win
     // rate term of the elmo expression is returned
@@ -702,7 +717,7 @@ namespace Learner
             {
                 goto RETRY_READ;
             }
-            
+
             // We want to position being trained on not to be terminal
             if (MoveList<LEGAL>(pos).size() == 0)
                 goto RETRY_READ;
@@ -720,7 +735,7 @@ namespace Learner
         // should be no real issues happening since
         // the read/write phases are isolated.
         atomic_thread_fence(memory_order_seq_cst);
-        Eval::NNUE::update_parameters(Threads, epoch, params.verbose, params.learning_rate, calc_grad);
+        Eval::NNUE::update_parameters(Threads, epoch, params.verbose, params.learning_rate, calc_grad, get_loss);
         atomic_thread_fence(memory_order_seq_cst);
 
         if (++save_count * params.mini_batch_size >= params.eval_save_interval)
diff --git a/src/learn/learn.h b/src/learn/learn.h
index 6ce476e5..f74fd4e3 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -33,6 +33,7 @@ using LearnFloatType = float;
 // Definition of struct used in Learner
 // ----------------------
 
+#include "autograd.h"
 #include "packed_sfen.h"
 
 #include "position.h"
@@ -68,6 +69,7 @@ namespace Learner
     void learn(std::istringstream& is);
 
     using CalcGradFunc = double(Value, Value, int, int);
+    using CalcLossFunc = ValueWithGrad<double>(Value, Value, int, int);
 }
 
 #endif // ifndef _LEARN_H_

From b71d1e86205505997106348afa7e359b9f6593c1 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 29 Nov 2020 11:55:15 +0100
Subject: [PATCH 481/583] Pass the new loss function to update_parameters

---
 src/nnue/evaluate_nnue_learner.cpp | 5 ++++-
 src/nnue/evaluate_nnue_learner.h   | 3 ++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 6e0572dd..822c56b4 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -195,8 +195,11 @@ namespace Eval::NNUE {
         uint64_t epoch,
         bool verbose,
         double learning_rate,
-        Learner::CalcGradFunc calc_grad)
+        Learner::CalcGradFunc calc_grad,
+        Learner::CalcLossFunc calc_loss)
     {
+        using namespace Learner::Autograd::UnivariateStatic;
+
         assert(batch_size > 0);
 
         learning_rate /= batch_size;
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index 8633f713..0fe8afce 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -38,7 +38,8 @@ namespace Eval::NNUE {
         uint64_t epoch,
         bool verbose,
         double learning_rate,
-        Learner::CalcGradFunc calc_grad);
+        Learner::CalcGradFunc calc_grad,
+        Learner::CalcLossFunc calc_loss);
 
     // Check if there are any problems with learning
     void check_health();

From 539bd2d1c8fdbe74cff0efc30a994f1fed7a08fe Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 29 Nov 2020 12:18:02 +0100
Subject: [PATCH 482/583] Replace the old loss/grad calculation completely.

---
 src/learn/autograd.h               |   5 +
 src/learn/learn.cpp                | 295 +++++++----------------------
 src/learn/learn.h                  |   1 -
 src/nnue/evaluate_nnue_learner.cpp |   7 +-
 src/nnue/evaluate_nnue_learner.h   |   1 -
 5 files changed, 79 insertions(+), 230 deletions(-)

diff --git a/src/learn/autograd.h b/src/learn/autograd.h
index 0b894cc4..f83d4d72 100644
--- a/src/learn/autograd.h
+++ b/src/learn/autograd.h
@@ -42,6 +42,11 @@ namespace Learner
             grad /= rhs;
             return *this;
         }
+
+        ValueWithGrad abs() const
+        {
+            return { std::abs(value), std::abs(grad) };
+        }
     };
 }
 
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 411cee08..e558b56a 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -52,6 +52,7 @@
 #include <sstream>
 #include <unordered_set>
 #include <iostream>
+#include <mutex>
 
 #if defined (_OPENMP)
 #include <omp.h>
@@ -99,65 +100,64 @@ namespace Learner
     // Using stockfish's WDL with win rate model instead of sigmoid
     static bool use_wdl = false;
 
-    namespace Detail {
-        template <bool AtomicV>
-        struct Loss
+    struct Loss
+    {
+        double value() const
         {
-            using T =
-                std::conditional_t<
-                    AtomicV,
-                    atomic<double>,
-                    double
-                >;
+            return m_loss.value;
+        }
 
-            T cross_entropy_eval{0.0};
-            T cross_entropy_win{0.0};
-            T cross_entropy{0.0};
-            T entropy_eval{0.0};
-            T entropy_win{0.0};
-            T entropy{0.0};
-            T count{0.0};
+        double grad() const
+        {
+            return m_loss.grad;
+        }
 
-            template <bool OtherAtomicV>
-            Loss& operator += (const Loss<OtherAtomicV>& rhs)
-            {
-                cross_entropy_eval += rhs.cross_entropy_eval;
-                cross_entropy_win += rhs.cross_entropy_win;
-                cross_entropy += rhs.cross_entropy;
-                entropy_eval += rhs.entropy_eval;
-                entropy_win += rhs.entropy_win;
-                entropy += rhs.entropy;
-                count += rhs.count;
+        uint64_t count() const
+        {
+            return m_count;
+        }
 
-                return *this;
-            }
+        Loss& operator += (const ValueWithGrad<double>& rhs)
+        {
+            std::unique_lock lock(m_mutex);
 
-            void reset()
-            {
-                cross_entropy_eval = 0.0;
-                cross_entropy_win = 0.0;
-                cross_entropy = 0.0;
-                entropy_eval = 0.0;
-                entropy_win = 0.0;
-                entropy = 0.0;
-                count = 0.0;
-            }
+            m_loss += rhs.abs();
+            m_count += 1;
 
-            template <typename StreamT>
-            void print(const std::string& prefix, StreamT& s) const
-            {
-                s << "  - " << prefix << "_cross_entropy_eval = " << cross_entropy_eval / count << endl;
-                s << "  - " << prefix << "_cross_entropy_win  = " << cross_entropy_win / count << endl;
-                s << "  - " << prefix << "_entropy_eval       = " << entropy_eval / count << endl;
-                s << "  - " << prefix << "_entropy_win        = " << entropy_win / count << endl;
-                s << "  - " << prefix << "_cross_entropy      = " << cross_entropy / count << endl;
-                s << "  - " << prefix << "_entropy            = " << entropy / count << endl;
-            }
-        };
-    }
+            return *this;
+        }
 
-    using Loss = Detail::Loss<false>;
-    using AtomicLoss = Detail::Loss<true>;
+        Loss& operator += (const Loss& rhs)
+        {
+            std::unique_lock lock(m_mutex);
+
+            m_loss += rhs.m_loss.abs();
+            m_count += rhs.m_count;
+
+            return *this;
+        }
+
+        void reset()
+        {
+            std::unique_lock lock(m_mutex);
+
+            m_loss = ValueWithGrad<double>{ 0.0, 0.0 };
+            m_count = 0;
+        }
+
+        template <typename StreamT>
+        void print(const std::string& prefix, StreamT& s) const
+        {
+            s << "  - " << prefix << "_loss       = " << m_loss.value / (double)m_count << endl;
+            s << "  - " << prefix << "_grad_norm  = " << m_loss.grad / (double)m_count << endl;
+        }
+
+    private:
+        ValueWithGrad<double> m_loss{ 0.0, 0.0 };
+        uint64_t m_count{0};
+        std::mutex m_mutex;
+
+    };
 
     static void append_files_from_dir(
         std::vector<std::string>& filenames,
@@ -185,94 +185,6 @@ namespace Learner
         }
     }
 
-    // A function that converts the evaluation value to the winning rate [0,1]
-    static double winning_percentage(double value)
-    {
-        // 1/(1+10^(-Eval/4))
-        // = 1/(1+e^(-Eval/4*ln(10))
-        // = sigmoid(Eval/4*ln(10))
-        return Math::sigmoid(value * winning_probability_coefficient);
-    }
-
-    // A function that converts the evaluation value to the winning rate [0,1]
-    static double winning_percentage_wdl(double value, int ply)
-    {
-        constexpr double wdl_total = 1000.0;
-        constexpr double draw_score = 0.5;
-
-        const double wdl_w = UCI::win_rate_model_double(value, ply);
-        const double wdl_l = UCI::win_rate_model_double(-value, ply);
-        const double wdl_d = wdl_total - wdl_w - wdl_l;
-
-        return (wdl_w + wdl_d * draw_score) / wdl_total;
-    }
-
-    // A function that converts the evaluation value to the winning rate [0,1]
-    static double winning_percentage(double value, int ply)
-    {
-        if (use_wdl)
-        {
-            return winning_percentage_wdl(value, ply);
-        }
-        else
-        {
-            return winning_percentage(value);
-        }
-    }
-
-    static double calc_cross_entropy_of_winning_percentage(
-        double deep_win_rate,
-        double shallow_eval,
-        int ply)
-    {
-        const double p = deep_win_rate;
-        const double q = winning_percentage(shallow_eval, ply);
-        return -p * std::log(q) - (1.0 - p) * std::log(1.0 - q);
-    }
-
-    static double calc_d_cross_entropy_of_winning_percentage(
-        double deep_win_rate,
-        double shallow_eval,
-        int ply)
-    {
-        constexpr double epsilon = 0.000001;
-
-        const double y1 = calc_cross_entropy_of_winning_percentage(
-            deep_win_rate, shallow_eval, ply);
-
-        const double y2 = calc_cross_entropy_of_winning_percentage(
-            deep_win_rate, shallow_eval + epsilon, ply);
-
-        // Divide by the winning_probability_coefficient to
-        // match scale with the sigmoidal win rate
-        return ((y2 - y1) / epsilon) / winning_probability_coefficient;
-    }
-
-    // Training Formula · Issue #71 · nodchip/Stockfish https://github.com/nodchip/Stockfish/issues/71
-    static double get_scaled_signal(double signal)
-    {
-        double scaled_signal = signal;
-
-        // Normalize to [0.0, 1.0].
-        scaled_signal =
-            (scaled_signal - src_score_min_value)
-            / (src_score_max_value - src_score_min_value);
-
-        // Scale to [dest_score_min_value, dest_score_max_value].
-        scaled_signal =
-            scaled_signal * (dest_score_max_value - dest_score_min_value)
-            + dest_score_min_value;
-
-        return scaled_signal;
-    }
-
-    // Teacher winning probability.
-    static double calculate_p(double teacher_signal, int ply)
-    {
-        const double scaled_teacher_signal = get_scaled_signal(teacher_signal);
-        return winning_percentage(scaled_teacher_signal, ply);
-    }
-
     static double calculate_lambda(double teacher_signal)
     {
         // If the evaluation value in deep search exceeds elmo_lambda_limit
@@ -285,94 +197,31 @@ namespace Learner
         return lambda;
     }
 
-    static double calculate_t(int game_result)
-    {
-        // Use 1 as the correction term if the expected win rate is 1,
-        // 0 if you lose, and 0.5 if you draw.
-        // game_result = 1,0,-1 so add 1 and divide by 2.
-        const double t = double(game_result + 1) * 0.5;
-
-        return t;
-    }
-
-    static double calc_grad(Value shallow, Value teacher_signal, int result, int ply)
-    {
-        // elmo (WCSC27) method
-        // Correct with the actual game wins and losses.
-        const double q = winning_percentage(shallow, ply);
-        const double p = calculate_p(teacher_signal, ply);
-        const double t = calculate_t(result);
-        const double lambda = calculate_lambda(teacher_signal);
-
-        double grad;
-        if (use_wdl)
-        {
-            const double dce_p = calc_d_cross_entropy_of_winning_percentage(p, shallow, ply);
-            const double dce_t = calc_d_cross_entropy_of_winning_percentage(t, shallow, ply);
-            grad = lambda * dce_p + (1.0 - lambda) * dce_t;
-        }
-        else
-        {
-            // Use the actual win rate as a correction term.
-            // This is the idea of ​​elmo (WCSC27), modern O-parts.
-            grad = lambda * (q - p) + (1.0 - lambda) * (q - t);
-        }
-
-        return std::clamp(grad, -max_grad, max_grad);
-    }
-
     static ValueWithGrad<double> get_loss(Value shallow, Value teacher_signal, int result, int ply)
     {
         using namespace Learner::Autograd::UnivariateStatic;
-
         auto q_ = sigmoid(VariableParameter<double, 0>{} * winning_probability_coefficient);
         auto p_ = sigmoid(ConstantParameter<double, 1>{} * winning_probability_coefficient);
         auto t_ = (ConstantParameter<double, 2>{} + 1.0) * 0.5;
         auto lambda_ = ConstantParameter<double, 3>{};
         auto loss_ = pow(lambda_ * (q_ - p_) + (1.0 - lambda_) * (q_ - t_), 2.0);
 
+        /*
+        auto q_ = VariableParameter<double, 0>{};
+        auto p_ = ConstantParameter<double, 1>{};
+        auto loss_ = pow(q_ - p_, 2.0) * (1.0 / (2400.0 * 2.0 * 600.0));
+        */
+
         auto args = std::tuple((double)shallow, (double)teacher_signal, (double)result, calculate_lambda(teacher_signal));
         return loss_.eval(args);
     }
 
-    // Calculate cross entropy during learning
-    // The individual cross entropy of the win/loss term and win
-    // rate term of the elmo expression is returned
-    // to the arguments cross_entropy_eval and cross_entropy_win.
-    static Loss calc_cross_entropy(
+    static auto get_loss(
         Value teacher_signal,
         Value shallow,
         const PackedSfenValue& psv)
     {
-        // Teacher winning probability.
-        const double q = winning_percentage(shallow, psv.gamePly);
-        const double p = calculate_p(teacher_signal, psv.gamePly);
-        const double t = calculate_t(psv.game_result);
-        const double lambda = calculate_lambda(teacher_signal);
-
-        constexpr double epsilon = 0.000001;
-
-        const double m = (1.0 - lambda) * t + lambda * p;
-
-        Loss loss{};
-
-        loss.cross_entropy_eval =
-            (-p * std::log(q + epsilon) - (1.0 - p) * std::log(1.0 - q + epsilon));
-        loss.cross_entropy_win =
-            (-t * std::log(q + epsilon) - (1.0 - t) * std::log(1.0 - q + epsilon));
-        loss.entropy_eval =
-            (-p * std::log(p + epsilon) - (1.0 - p) * std::log(1.0 - p + epsilon));
-        loss.entropy_win =
-            (-t * std::log(t + epsilon) - (1.0 - t) * std::log(1.0 - t + epsilon));
-
-        loss.cross_entropy =
-            (-m * std::log(q + epsilon) - (1.0 - m) * std::log(1.0 - q + epsilon));
-        loss.entropy =
-            (-m * std::log(m + epsilon) - (1.0 - m) * std::log(1.0 - m + epsilon));
-
-        loss.count = 1;
-
-        return loss;
+        return get_loss(shallow, teacher_signal, psv.game_result, psv.gamePly);
     }
 
     // Class to generate sfen with multiple threads
@@ -495,7 +344,7 @@ namespace Learner
             Thread& th,
             std::atomic<uint64_t>& counter,
             const PSVector& psv,
-            AtomicLoss& test_loss_sum,
+            Loss& test_loss_sum,
             atomic<double>& sum_norm,
             atomic<int>& move_accord_count
         );
@@ -530,7 +379,7 @@ namespace Learner
         int dir_number;
 
         // For calculation of learning data loss
-        AtomicLoss learn_loss_sum;
+        Loss learn_loss_sum;
     };
 
     void LearnerThink::set_learning_search_limits()
@@ -681,7 +530,7 @@ namespace Learner
 
                 const Value shallow_value = Eval::evaluate(pos);
 
-                const auto loss = calc_cross_entropy(
+                const auto loss = get_loss(
                     deep_value,
                     (rootColor == pos.side_to_move()) ? shallow_value : -shallow_value,
                     ps);
@@ -735,7 +584,7 @@ namespace Learner
         // should be no real issues happening since
         // the read/write phases are isolated.
         atomic_thread_fence(memory_order_seq_cst);
-        Eval::NNUE::update_parameters(Threads, epoch, params.verbose, params.learning_rate, calc_grad, get_loss);
+        Eval::NNUE::update_parameters(Threads, epoch, params.verbose, params.learning_rate, get_loss);
         atomic_thread_fence(memory_order_seq_cst);
 
         if (++save_count * params.mini_batch_size >= params.eval_save_interval)
@@ -778,7 +627,7 @@ namespace Learner
         out << "  - learning rate = " << params.learning_rate << endl;
 
         // For calculation of verification data loss
-        AtomicLoss test_loss_sum{};
+        Loss test_loss_sum{};
 
         // norm for learning
         atomic<double> sum_norm{0.0};
@@ -810,26 +659,24 @@ namespace Learner
         });
         Threads.wait_for_workers_finished();
 
-        latest_loss_sum += test_loss_sum.cross_entropy - test_loss_sum.entropy;
+        latest_loss_sum += test_loss_sum.value();
         latest_loss_count += psv.size();
 
-        if (psv.size() && test_loss_sum.count > 0.0)
+        if (psv.size() && test_loss_sum.count() > 0)
         {
             test_loss_sum.print("test", out);
 
-            if (learn_loss_sum.count > 0.0)
+            if (learn_loss_sum.count() > 0)
             {
                 learn_loss_sum.print("learn", out);
             }
 
             out << "  - norm = " << sum_norm << endl;
             out << "  - move accuracy = " << (move_accord_count * 100.0 / psv.size()) << "%" << endl;
-            out << "  - loss (current) = " << (test_loss_sum.cross_entropy - test_loss_sum.entropy) / psv.size() << endl;
-            out << "  - loss (average) = " << latest_loss_sum / latest_loss_count << endl;
         }
         else
         {
-            out << "ERROR: psv.size() = " << psv.size() << " ,  done = " << test_loss_sum.count << endl;
+            out << "ERROR: psv.size() = " << psv.size() << " ,  done = " << test_loss_sum.count() << endl;
         }
 
         learn_loss_sum.reset();
@@ -839,7 +686,7 @@ namespace Learner
         Thread& th,
         std::atomic<uint64_t>& counter,
         const PSVector& psv,
-        AtomicLoss& test_loss_sum,
+        Loss& test_loss_sum,
         atomic<double>& sum_norm,
         atomic<int>& move_accord_count
     )
@@ -869,7 +716,7 @@ namespace Learner
             // Evaluation value of deep search
             const auto deep_value = (Value)ps.score;
 
-            const auto loss = calc_cross_entropy(
+            const auto loss = get_loss(
                 deep_value,
                 shallow_value,
                 ps);
diff --git a/src/learn/learn.h b/src/learn/learn.h
index f74fd4e3..4e8d8a02 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -68,7 +68,6 @@ namespace Learner
     // Learning from the generated game record
     void learn(std::istringstream& is);
 
-    using CalcGradFunc = double(Value, Value, int, int);
     using CalcLossFunc = ValueWithGrad<double>(Value, Value, int, int);
 }
 
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 822c56b4..038a462c 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -195,7 +195,6 @@ namespace Eval::NNUE {
         uint64_t epoch,
         bool verbose,
         double learning_rate,
-        Learner::CalcGradFunc calc_grad,
         Learner::CalcLossFunc calc_loss)
     {
         using namespace Learner::Autograd::UnivariateStatic;
@@ -237,8 +236,8 @@ namespace Eval::NNUE {
                             e.sign * network_output[b] * kPonanzaConstant));
                         const auto discrete = e.sign * e.discrete_nn_eval;
                         const auto& psv = e.psv;
-                        const double gradient =
-                            e.sign * calc_grad(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
+                        const auto loss = calc_loss(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
+                        const double gradient = loss.grad * e.sign * kPonanzaConstant;
                         gradients[b] = static_cast<LearnFloatType>(gradient * e.weight);
 
 
@@ -330,4 +329,4 @@ namespace Eval::NNUE {
 #endif
         out << "INFO (save_eval): Finished saving evaluation file in " << eval_dir << std::endl;
     }
-}  // namespace Eval::NNUE
\ No newline at end of file
+}  // namespace Eval::NNUE
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index 0fe8afce..7f7daa5b 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -38,7 +38,6 @@ namespace Eval::NNUE {
         uint64_t epoch,
         bool verbose,
         double learning_rate,
-        Learner::CalcGradFunc calc_grad,
         Learner::CalcLossFunc calc_loss);
 
     // Check if there are any problems with learning

From aa55692b97df056298ca016a5be4771902baafd9 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 29 Nov 2020 17:08:22 +0100
Subject: [PATCH 483/583] Cross entropy loss.

---
 src/learn/autograd.h | 36 ++++++++++++++++++++++++++++++++++--
 src/learn/learn.cpp  | 17 +++++++++++++++++
 2 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/src/learn/autograd.h b/src/learn/autograd.h
index f83d4d72..a4ad8b7f 100644
--- a/src/learn/autograd.h
+++ b/src/learn/autograd.h
@@ -282,6 +282,38 @@ namespace Learner::Autograd::UnivariateStatic
         return Product(Constant(lhs), std::move(rhs));
     }
 
+    template <typename ArgT, typename T = typename ArgT::ValueType>
+    struct Negation : Evaluable<Negation<ArgT, T>>
+    {
+        using ValueType = T;
+
+        explicit Negation(ArgT x) :
+            m_x(std::move(x))
+        {
+        }
+
+        template <typename... ArgsTs>
+        T value(const std::tuple<ArgsTs...>& args) const
+        {
+            return -m_x.value(args);
+        }
+
+        template <typename... ArgsTs>
+        T grad(const std::tuple<ArgsTs...>& args) const
+        {
+            return -m_x.grad(args);
+        }
+
+    private:
+        ArgT m_x;
+    };
+
+    template <typename ArgT, typename T = typename ArgT::ValueType>
+    auto operator-(ArgT x)
+    {
+        return Negation(std::move(x));
+    }
+
     template <typename ArgT, typename T = typename ArgT::ValueType>
     struct Sigmoid : Evaluable<Sigmoid<ArgT, T>>
     {
@@ -318,7 +350,7 @@ namespace Learner::Autograd::UnivariateStatic
         }
     };
 
-    template <typename ArgT>
+    template <typename ArgT, typename T = typename ArgT::ValueType>
     auto sigmoid(ArgT x)
     {
         return Sigmoid(std::move(x));
@@ -394,7 +426,7 @@ namespace Learner::Autograd::UnivariateStatic
         }
     };
 
-    template <typename ArgT>
+    template <typename ArgT, typename T = typename ArgT::ValueType>
     auto log(ArgT x)
     {
         return Log(std::move(x));
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index e558b56a..83229c61 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -200,11 +200,14 @@ namespace Learner
     static ValueWithGrad<double> get_loss(Value shallow, Value teacher_signal, int result, int ply)
     {
         using namespace Learner::Autograd::UnivariateStatic;
+
+        /*
         auto q_ = sigmoid(VariableParameter<double, 0>{} * winning_probability_coefficient);
         auto p_ = sigmoid(ConstantParameter<double, 1>{} * winning_probability_coefficient);
         auto t_ = (ConstantParameter<double, 2>{} + 1.0) * 0.5;
         auto lambda_ = ConstantParameter<double, 3>{};
         auto loss_ = pow(lambda_ * (q_ - p_) + (1.0 - lambda_) * (q_ - t_), 2.0);
+        */
 
         /*
         auto q_ = VariableParameter<double, 0>{};
@@ -212,6 +215,20 @@ namespace Learner
         auto loss_ = pow(q_ - p_, 2.0) * (1.0 / (2400.0 * 2.0 * 600.0));
         */
 
+        const double epsilon = 1e-12;
+
+        auto q_ = sigmoid(VariableParameter<double, 0>{} * winning_probability_coefficient);
+        auto p_ = sigmoid(ConstantParameter<double, 1>{} * winning_probability_coefficient);
+        auto t_ = (ConstantParameter<double, 2>{} + 1.0) * 0.5;
+        auto lambda_ = ConstantParameter<double, 3>{};
+        auto teacher_entropy_ = -(p_ * log(p_ + epsilon) + (1.0 - p_) * log(1.0 - p_ + epsilon));
+        auto outcome_entropy_ = -(t_ * log(t_ + epsilon) + (1.0 - t_) * log(1.0 - t_ + epsilon));
+        auto teacher_loss_ = -(p_ * log(q_) + (1.0 - p_) * log(1.0 - q_));
+        auto outcome_loss_ = -(t_ * log(q_) + (1.0 - t_) * log(1.0 - q_));
+        auto result_ = lambda_ * teacher_loss_ + (1.0 - lambda_) * outcome_loss_;
+        auto entropy_ = lambda_ * teacher_entropy_ + (1.0 - lambda_) * outcome_entropy_;
+        auto loss_ = result_ - entropy_;
+
         auto args = std::tuple((double)shallow, (double)teacher_signal, (double)result, calculate_lambda(teacher_signal));
         return loss_.eval(args);
     }

From d103867558d7c57a9eae5e2a061394d937881b13 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 29 Nov 2020 17:27:45 +0100
Subject: [PATCH 484/583] Add memoization to the autograd expression evaluator.

---
 src/learn/autograd.h | 97 ++++++++++++++++++++++++++++----------------
 1 file changed, 63 insertions(+), 34 deletions(-)

diff --git a/src/learn/autograd.h b/src/learn/autograd.h
index a4ad8b7f..2b0eee3a 100644
--- a/src/learn/autograd.h
+++ b/src/learn/autograd.h
@@ -6,6 +6,7 @@
 #include <type_traits>
 #include <memory>
 #include <tuple>
+#include <optional>
 
 namespace Learner
 {
@@ -62,20 +63,48 @@ namespace Learner::Autograd::UnivariateStatic
     template <typename T>
     using Id = typename Identity<T>::type;
 
-    template <typename T>
+    template <typename T, typename ChildT>
     struct Evaluable
     {
         template <typename... ArgsTs>
         auto eval(const std::tuple<ArgsTs...>& args) const
         {
-            using ValueType = typename T::ValueType;
-            const T* this_ = static_cast<const T*>(this);
-            return ValueWithGrad<ValueType>{ this_->value(args), this_->grad(args) };
+            return ValueWithGrad<T>{ value(args), grad(args) };
         }
+
+        template <typename... ArgsTs>
+        auto value(const std::tuple<ArgsTs...>& args) const
+        {
+            const ChildT* this_ = static_cast<const ChildT*>(this);
+
+            if (!value_cache.has_value())
+            {
+                value_cache = this_->calculate_value(args);
+            }
+
+            return *value_cache;
+        }
+
+        template <typename... ArgsTs>
+        auto grad(const std::tuple<ArgsTs...>& args) const
+        {
+            const ChildT* this_ = static_cast<const ChildT*>(this);
+
+            if (!grad_cache.has_value())
+            {
+                grad_cache = this_->calculate_grad(args);
+            }
+
+            return *grad_cache;
+        }
+
+    private:
+        mutable std::optional<T> value_cache;
+        mutable std::optional<T> grad_cache;
     };
 
     template <typename T, int I>
-    struct VariableParameter : Evaluable<VariableParameter<T, I>>
+    struct VariableParameter : Evaluable<T, VariableParameter<T, I>>
     {
         using ValueType = T;
 
@@ -84,20 +113,20 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T value(const std::tuple<ArgsTs...>& args) const
+        T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return std::get<I>(args);
         }
 
         template <typename... ArgsTs>
-        T grad(const std::tuple<ArgsTs...>&) const
+        T calculate_grad(const std::tuple<ArgsTs...>&) const
         {
             return T(1.0);
         }
     };
 
     template <typename T, int I>
-    struct ConstantParameter : Evaluable<ConstantParameter<T, I>>
+    struct ConstantParameter : Evaluable<T, ConstantParameter<T, I>>
     {
         using ValueType = T;
 
@@ -106,20 +135,20 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T value(const std::tuple<ArgsTs...>& args) const
+        T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return std::get<I>(args);
         }
 
         template <typename... ArgsTs>
-        T grad(const std::tuple<ArgsTs...>&) const
+        T calculate_grad(const std::tuple<ArgsTs...>&) const
         {
             return T(0.0);
         }
     };
 
     template <typename T>
-    struct Constant : Evaluable<Constant<T>>
+    struct Constant : Evaluable<T, Constant<T>>
     {
         using ValueType = T;
 
@@ -129,13 +158,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T value(const std::tuple<ArgsTs...>&) const
+        T calculate_value(const std::tuple<ArgsTs...>&) const
         {
             return m_x;
         }
 
         template <typename... ArgsTs>
-        T grad(const std::tuple<ArgsTs...>&) const
+        T calculate_grad(const std::tuple<ArgsTs...>&) const
         {
             return T(0.0);
         }
@@ -145,7 +174,7 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
-    struct Sum : Evaluable<Sum<LhsT, RhsT, T>>
+    struct Sum : Evaluable<T, Sum<LhsT, RhsT, T>>
     {
         using ValueType = T;
 
@@ -156,13 +185,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T value(const std::tuple<ArgsTs...>& args) const
+        T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return m_lhs.value(args) + m_rhs.value(args);
         }
 
         template <typename... ArgsTs>
-        T grad(const std::tuple<ArgsTs...>& args) const
+        T calculate_grad(const std::tuple<ArgsTs...>& args) const
         {
             return m_lhs.grad(args) + m_rhs.grad(args);
         }
@@ -191,7 +220,7 @@ namespace Learner::Autograd::UnivariateStatic
     }
 
     template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
-    struct Difference : Evaluable<Difference<LhsT, RhsT, T>>
+    struct Difference : Evaluable<T, Difference<LhsT, RhsT, T>>
     {
         using ValueType = T;
 
@@ -202,13 +231,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T value(const std::tuple<ArgsTs...>& args) const
+        T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return m_lhs.value(args) - m_rhs.value(args);
         }
 
         template <typename... ArgsTs>
-        T grad(const std::tuple<ArgsTs...>& args) const
+        T calculate_grad(const std::tuple<ArgsTs...>& args) const
         {
             return m_lhs.grad(args) - m_rhs.grad(args);
         }
@@ -237,7 +266,7 @@ namespace Learner::Autograd::UnivariateStatic
     }
 
     template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
-    struct Product : Evaluable<Product<LhsT, RhsT, T>>
+    struct Product : Evaluable<T, Product<LhsT, RhsT, T>>
     {
         using ValueType = T;
 
@@ -248,13 +277,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T value(const std::tuple<ArgsTs...>& args) const
+        T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return m_lhs.value(args) * m_rhs.value(args);
         }
 
         template <typename... ArgsTs>
-        T grad(const std::tuple<ArgsTs...>& args) const
+        T calculate_grad(const std::tuple<ArgsTs...>& args) const
         {
             return m_lhs.grad(args) * m_rhs.value(args) + m_lhs.value(args) * m_rhs.grad(args);
         }
@@ -283,7 +312,7 @@ namespace Learner::Autograd::UnivariateStatic
     }
 
     template <typename ArgT, typename T = typename ArgT::ValueType>
-    struct Negation : Evaluable<Negation<ArgT, T>>
+    struct Negation : Evaluable<T, Negation<ArgT, T>>
     {
         using ValueType = T;
 
@@ -293,13 +322,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T value(const std::tuple<ArgsTs...>& args) const
+        T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return -m_x.value(args);
         }
 
         template <typename... ArgsTs>
-        T grad(const std::tuple<ArgsTs...>& args) const
+        T calculate_grad(const std::tuple<ArgsTs...>& args) const
         {
             return -m_x.grad(args);
         }
@@ -315,7 +344,7 @@ namespace Learner::Autograd::UnivariateStatic
     }
 
     template <typename ArgT, typename T = typename ArgT::ValueType>
-    struct Sigmoid : Evaluable<Sigmoid<ArgT, T>>
+    struct Sigmoid : Evaluable<T, Sigmoid<ArgT, T>>
     {
         using ValueType = T;
 
@@ -325,13 +354,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T value(const std::tuple<ArgsTs...>& args) const
+        T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return value_(m_x.value(args));
         }
 
         template <typename... ArgsTs>
-        T grad(const std::tuple<ArgsTs...>& args) const
+        T calculate_grad(const std::tuple<ArgsTs...>& args) const
         {
             return m_x.grad(args) * grad_(m_x.value(args));
         }
@@ -357,7 +386,7 @@ namespace Learner::Autograd::UnivariateStatic
     }
 
     template <typename ArgT, typename T = typename ArgT::ValueType>
-    struct Pow : Evaluable<Pow<ArgT, T>>
+    struct Pow : Evaluable<T, Pow<ArgT, T>>
     {
         using ValueType = T;
 
@@ -368,13 +397,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T value(const std::tuple<ArgsTs...>& args) const
+        T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return std::pow(m_x.value(args), m_exponent);
         }
 
         template <typename... ArgsTs>
-        T grad(const std::tuple<ArgsTs...>& args) const
+        T calculate_grad(const std::tuple<ArgsTs...>& args) const
         {
             return m_exponent * std::pow(m_x.value(args), m_exponent - T(1.0)) * m_x.grad(args);
         }
@@ -391,7 +420,7 @@ namespace Learner::Autograd::UnivariateStatic
     }
 
     template <typename ArgT, typename T = typename ArgT::ValueType>
-    struct Log : Evaluable<Log<ArgT, T>>
+    struct Log : Evaluable<T, Log<ArgT, T>>
     {
         using ValueType = T;
 
@@ -401,13 +430,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T value(const std::tuple<ArgsTs...>& args) const
+        T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return value_(m_x.value(args));
         }
 
         template <typename... ArgsTs>
-        T grad(const std::tuple<ArgsTs...>& args) const
+        T calculate_grad(const std::tuple<ArgsTs...>& args) const
         {
             return m_x.grad(args) * grad_(m_x.value(args));
         }

From a5c20bee5b49a9643ce7cc23aeee08f9f374ac19 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 29 Nov 2020 17:57:06 +0100
Subject: [PATCH 485/583] Apply gradient clipping.

---
 src/learn/autograd.h | 6 ++++++
 src/learn/learn.cpp  | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/learn/autograd.h b/src/learn/autograd.h
index 2b0eee3a..afbcc41b 100644
--- a/src/learn/autograd.h
+++ b/src/learn/autograd.h
@@ -7,6 +7,7 @@
 #include <memory>
 #include <tuple>
 #include <optional>
+#include <algorithm>
 
 namespace Learner
 {
@@ -48,6 +49,11 @@ namespace Learner
         {
             return { std::abs(value), std::abs(grad) };
         }
+
+        ValueWithGrad clamp_grad(T max) const
+        {
+            return { value, std::clamp(grad, -max, max) };
+        }
     };
 }
 
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 83229c61..0b04d034 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -230,7 +230,7 @@ namespace Learner
         auto loss_ = result_ - entropy_;
 
         auto args = std::tuple((double)shallow, (double)teacher_signal, (double)result, calculate_lambda(teacher_signal));
-        return loss_.eval(args);
+        return loss_.eval(args).clamp_grad(max_grad);
     }
 
     static auto get_loss(

From aec6017195fedf7dac6a891a0cb89a06f457ade4 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 29 Nov 2020 19:06:31 +0100
Subject: [PATCH 486/583] When forming an autograd expression only copy parts
 that are rvalue references, store references to lvalues.

---
 src/learn/autograd.h | 153 ++++++++++++++++++++++---------------------
 1 file changed, 80 insertions(+), 73 deletions(-)

diff --git a/src/learn/autograd.h b/src/learn/autograd.h
index afbcc41b..714f741a 100644
--- a/src/learn/autograd.h
+++ b/src/learn/autograd.h
@@ -69,6 +69,13 @@ namespace Learner::Autograd::UnivariateStatic
     template <typename T>
     using Id = typename Identity<T>::type;
 
+    template <typename T>
+    using StoreValueOrRef = std::conditional_t<
+            std::is_rvalue_reference_v<T>,
+            std::remove_reference_t<T>,
+            const std::remove_reference_t<T>&
+        >;
+
     template <typename T, typename ChildT>
     struct Evaluable
     {
@@ -179,14 +186,14 @@ namespace Learner::Autograd::UnivariateStatic
         T m_x;
     };
 
-    template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
     struct Sum : Evaluable<T, Sum<LhsT, RhsT, T>>
     {
         using ValueType = T;
 
-        Sum(LhsT lhs, RhsT rhs) :
-            m_lhs(std::move(lhs)),
-            m_rhs(std::move(rhs))
+        Sum(LhsT&& lhs, RhsT&& rhs) :
+            m_lhs(std::forward<LhsT>(lhs)),
+            m_rhs(std::forward<RhsT>(rhs))
         {
         }
 
@@ -203,36 +210,36 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
     private:
-        LhsT m_lhs;
-        RhsT m_rhs;
+        StoreValueOrRef<LhsT> m_lhs;
+        StoreValueOrRef<RhsT> m_rhs;
     };
 
-    template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
-    auto operator+(LhsT lhs, RhsT rhs)
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    auto operator+(LhsT&& lhs, RhsT&& rhs)
     {
-        return Sum(std::move(lhs), std::move(rhs));
+        return Sum<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
     }
 
-    template <typename LhsT, typename T = typename LhsT::ValueType>
-    auto operator+(LhsT lhs, Id<T> rhs)
+    template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    auto operator+(LhsT&& lhs, Id<T> rhs)
     {
-        return Sum(std::move(lhs), Constant(rhs));
+        return Sum<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
     }
 
-    template <typename RhsT, typename T = typename RhsT::ValueType>
-    auto operator+(Id<T> lhs, RhsT rhs)
+    template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
+    auto operator+(Id<T> lhs, RhsT&& rhs)
     {
-        return Sum(Constant(lhs), std::move(rhs));
+        return Sum<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
     }
 
-    template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
     struct Difference : Evaluable<T, Difference<LhsT, RhsT, T>>
     {
         using ValueType = T;
 
-        Difference(LhsT lhs, RhsT rhs) :
-            m_lhs(std::move(lhs)),
-            m_rhs(std::move(rhs))
+        Difference(LhsT&& lhs, RhsT&& rhs) :
+            m_lhs(std::forward<LhsT>(lhs)),
+            m_rhs(std::forward<RhsT>(rhs))
         {
         }
 
@@ -249,36 +256,36 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
     private:
-        LhsT m_lhs;
-        RhsT m_rhs;
+        StoreValueOrRef<LhsT> m_lhs;
+        StoreValueOrRef<RhsT> m_rhs;
     };
 
-    template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
-    auto operator-(LhsT lhs, RhsT rhs)
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    auto operator-(LhsT&& lhs, RhsT&& rhs)
     {
-        return Difference(std::move(lhs), std::move(rhs));
+        return Difference<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
     }
 
-    template <typename LhsT, typename T = typename LhsT::ValueType>
-    auto operator-(LhsT lhs, Id<T> rhs)
+    template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    auto operator-(LhsT&& lhs, Id<T> rhs)
     {
-        return Difference(std::move(lhs), Constant(rhs));
+        return Difference<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
     }
 
-    template <typename RhsT, typename T = typename RhsT::ValueType>
-    auto operator-(Id<T> lhs, RhsT rhs)
+    template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
+    auto operator-(Id<T> lhs, RhsT&& rhs)
     {
-        return Difference(Constant(lhs), std::move(rhs));
+        return Difference<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
     }
 
-    template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
     struct Product : Evaluable<T, Product<LhsT, RhsT, T>>
     {
         using ValueType = T;
 
-        Product(LhsT lhs, RhsT rhs) :
-            m_lhs(std::move(lhs)),
-            m_rhs(std::move(rhs))
+        Product(LhsT&& lhs, RhsT&& rhs) :
+            m_lhs(std::forward<LhsT>(lhs)),
+            m_rhs(std::forward<RhsT>(rhs))
         {
         }
 
@@ -295,35 +302,35 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
     private:
-        LhsT m_lhs;
-        RhsT m_rhs;
+        StoreValueOrRef<LhsT> m_lhs;
+        StoreValueOrRef<RhsT> m_rhs;
     };
 
-    template <typename LhsT, typename RhsT, typename T = typename LhsT::ValueType>
-    auto operator*(LhsT lhs, RhsT rhs)
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    auto operator*(LhsT&& lhs, RhsT&& rhs)
     {
-        return Product(std::move(lhs), std::move(rhs));
+        return Product<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
     }
 
-    template <typename LhsT, typename T = typename LhsT::ValueType>
-    auto operator*(LhsT lhs, Id<T> rhs)
+    template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    auto operator*(LhsT&& lhs, Id<T> rhs)
     {
-        return Product(std::move(lhs), Constant(rhs));
+        return Product<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
     }
 
-    template <typename RhsT, typename T = typename RhsT::ValueType>
-    auto operator*(Id<T> lhs, RhsT rhs)
+    template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
+    auto operator*(Id<T> lhs, RhsT&& rhs)
     {
-        return Product(Constant(lhs), std::move(rhs));
+        return Product<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
     }
 
-    template <typename ArgT, typename T = typename ArgT::ValueType>
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
     struct Negation : Evaluable<T, Negation<ArgT, T>>
     {
         using ValueType = T;
 
-        explicit Negation(ArgT x) :
-            m_x(std::move(x))
+        explicit Negation(ArgT&& x) :
+            m_x(std::forward<ArgT>(x))
         {
         }
 
@@ -340,22 +347,22 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
     private:
-        ArgT m_x;
+        StoreValueOrRef<ArgT> m_x;
     };
 
-    template <typename ArgT, typename T = typename ArgT::ValueType>
-    auto operator-(ArgT x)
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    auto operator-(ArgT&& x)
     {
-        return Negation(std::move(x));
+        return Negation<ArgT&&>(std::forward<ArgT>(x));
     }
 
-    template <typename ArgT, typename T = typename ArgT::ValueType>
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
     struct Sigmoid : Evaluable<T, Sigmoid<ArgT, T>>
     {
         using ValueType = T;
 
-        explicit Sigmoid(ArgT x) :
-            m_x(std::move(x))
+        explicit Sigmoid(ArgT&& x) :
+            m_x(std::forward<ArgT>(x))
         {
         }
 
@@ -372,7 +379,7 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
     private:
-        ArgT m_x;
+        StoreValueOrRef<ArgT> m_x;
 
         T value_(T x) const
         {
@@ -385,19 +392,19 @@ namespace Learner::Autograd::UnivariateStatic
         }
     };
 
-    template <typename ArgT, typename T = typename ArgT::ValueType>
-    auto sigmoid(ArgT x)
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    auto sigmoid(ArgT&& x)
     {
-        return Sigmoid(std::move(x));
+        return Sigmoid<ArgT&&>(std::forward<ArgT>(x));
     }
 
-    template <typename ArgT, typename T = typename ArgT::ValueType>
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
     struct Pow : Evaluable<T, Pow<ArgT, T>>
     {
         using ValueType = T;
 
-        explicit Pow(ArgT x, Id<T> exponent) :
-            m_x(std::move(x)),
+        explicit Pow(ArgT&& x, Id<T> exponent) :
+            m_x(std::forward<ArgT>(x)),
             m_exponent(std::move(exponent))
         {
         }
@@ -415,23 +422,23 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
     private:
-        ArgT m_x;
+        StoreValueOrRef<ArgT> m_x;
         T m_exponent;
     };
 
-    template <typename ArgT, typename T = typename ArgT::ValueType>
-    auto pow(ArgT x, Id<T> exp)
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    auto pow(ArgT&& x, Id<T> exp)
     {
-        return Pow(std::move(x), std::move(exp));
+        return Pow<ArgT&&>(std::forward<ArgT>(x), std::move(exp));
     }
 
-    template <typename ArgT, typename T = typename ArgT::ValueType>
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
     struct Log : Evaluable<T, Log<ArgT, T>>
     {
         using ValueType = T;
 
-        explicit Log(ArgT x) :
-            m_x(std::move(x))
+        explicit Log(ArgT&& x) :
+            m_x(std::forward<ArgT>(x))
         {
         }
 
@@ -448,7 +455,7 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
     private:
-        ArgT m_x;
+        StoreValueOrRef<ArgT> m_x;
 
         T value_(T x) const
         {
@@ -461,10 +468,10 @@ namespace Learner::Autograd::UnivariateStatic
         }
     };
 
-    template <typename ArgT, typename T = typename ArgT::ValueType>
-    auto log(ArgT x)
+    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
+    auto log(ArgT&& x)
     {
-        return Log(std::move(x));
+        return Log<ArgT&&>(std::forward<ArgT>(x));
     }
 
 }

From 26f19e1429312e5e0d6fcbc3db325f9923d76d54 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 29 Nov 2020 22:50:49 +0100
Subject: [PATCH 487/583] Make automatic differentiation node types constexpr.

---
 src/learn/autograd.h | 48 +++++++++++++++++++++++---------------------
 1 file changed, 25 insertions(+), 23 deletions(-)

diff --git a/src/learn/autograd.h b/src/learn/autograd.h
index 714f741a..4edf0e4c 100644
--- a/src/learn/autograd.h
+++ b/src/learn/autograd.h
@@ -79,6 +79,8 @@ namespace Learner::Autograd::UnivariateStatic
     template <typename T, typename ChildT>
     struct Evaluable
     {
+        constexpr Evaluable() = default;
+
         template <typename... ArgsTs>
         auto eval(const std::tuple<ArgsTs...>& args) const
         {
@@ -121,7 +123,7 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
-        VariableParameter()
+        constexpr VariableParameter()
         {
         }
 
@@ -143,7 +145,7 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
-        ConstantParameter()
+        constexpr ConstantParameter()
         {
         }
 
@@ -165,7 +167,7 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
-        Constant(T x) :
+        constexpr Constant(T x) :
             m_x(std::move(x))
         {
         }
@@ -191,7 +193,7 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
-        Sum(LhsT&& lhs, RhsT&& rhs) :
+        constexpr Sum(LhsT&& lhs, RhsT&& rhs) :
             m_lhs(std::forward<LhsT>(lhs)),
             m_rhs(std::forward<RhsT>(rhs))
         {
@@ -215,19 +217,19 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    auto operator+(LhsT&& lhs, RhsT&& rhs)
+    constexpr auto operator+(LhsT&& lhs, RhsT&& rhs)
     {
         return Sum<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
     }
 
     template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    auto operator+(LhsT&& lhs, Id<T> rhs)
+    constexpr auto operator+(LhsT&& lhs, Id<T> rhs)
     {
         return Sum<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
     }
 
     template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
-    auto operator+(Id<T> lhs, RhsT&& rhs)
+    constexpr auto operator+(Id<T> lhs, RhsT&& rhs)
     {
         return Sum<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
     }
@@ -237,7 +239,7 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
-        Difference(LhsT&& lhs, RhsT&& rhs) :
+        constexpr Difference(LhsT&& lhs, RhsT&& rhs) :
             m_lhs(std::forward<LhsT>(lhs)),
             m_rhs(std::forward<RhsT>(rhs))
         {
@@ -261,19 +263,19 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    auto operator-(LhsT&& lhs, RhsT&& rhs)
+    constexpr auto operator-(LhsT&& lhs, RhsT&& rhs)
     {
         return Difference<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
     }
 
     template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    auto operator-(LhsT&& lhs, Id<T> rhs)
+    constexpr auto operator-(LhsT&& lhs, Id<T> rhs)
     {
         return Difference<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
     }
 
     template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
-    auto operator-(Id<T> lhs, RhsT&& rhs)
+    constexpr auto operator-(Id<T> lhs, RhsT&& rhs)
     {
         return Difference<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
     }
@@ -283,7 +285,7 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
-        Product(LhsT&& lhs, RhsT&& rhs) :
+        constexpr Product(LhsT&& lhs, RhsT&& rhs) :
             m_lhs(std::forward<LhsT>(lhs)),
             m_rhs(std::forward<RhsT>(rhs))
         {
@@ -307,19 +309,19 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    auto operator*(LhsT&& lhs, RhsT&& rhs)
+    constexpr auto operator*(LhsT&& lhs, RhsT&& rhs)
     {
         return Product<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
     }
 
     template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    auto operator*(LhsT&& lhs, Id<T> rhs)
+    constexpr auto operator*(LhsT&& lhs, Id<T> rhs)
     {
         return Product<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
     }
 
     template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
-    auto operator*(Id<T> lhs, RhsT&& rhs)
+    constexpr auto operator*(Id<T> lhs, RhsT&& rhs)
     {
         return Product<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
     }
@@ -329,7 +331,7 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
-        explicit Negation(ArgT&& x) :
+        constexpr explicit Negation(ArgT&& x) :
             m_x(std::forward<ArgT>(x))
         {
         }
@@ -351,7 +353,7 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
-    auto operator-(ArgT&& x)
+    constexpr auto operator-(ArgT&& x)
     {
         return Negation<ArgT&&>(std::forward<ArgT>(x));
     }
@@ -361,7 +363,7 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
-        explicit Sigmoid(ArgT&& x) :
+        constexpr explicit Sigmoid(ArgT&& x) :
             m_x(std::forward<ArgT>(x))
         {
         }
@@ -393,7 +395,7 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
-    auto sigmoid(ArgT&& x)
+    constexpr auto sigmoid(ArgT&& x)
     {
         return Sigmoid<ArgT&&>(std::forward<ArgT>(x));
     }
@@ -403,7 +405,7 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
-        explicit Pow(ArgT&& x, Id<T> exponent) :
+        constexpr explicit Pow(ArgT&& x, Id<T> exponent) :
             m_x(std::forward<ArgT>(x)),
             m_exponent(std::move(exponent))
         {
@@ -427,7 +429,7 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
-    auto pow(ArgT&& x, Id<T> exp)
+    constexpr auto pow(ArgT&& x, Id<T> exp)
     {
         return Pow<ArgT&&>(std::forward<ArgT>(x), std::move(exp));
     }
@@ -437,7 +439,7 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
-        explicit Log(ArgT&& x) :
+        constexpr explicit Log(ArgT&& x) :
             m_x(std::forward<ArgT>(x))
         {
         }
@@ -469,7 +471,7 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
-    auto log(ArgT&& x)
+    constexpr auto log(ArgT&& x)
     {
         return Log<ArgT&&>(std::forward<ArgT>(x));
     }

From cb812c742c25e2737808cf7ec349e4eeffb0d911 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 29 Nov 2020 22:52:21 +0100
Subject: [PATCH 488/583] Add [[nodiscard]] attributes to autograd functions.

---
 src/learn/autograd.h | 80 ++++++++++++++++++++++----------------------
 1 file changed, 40 insertions(+), 40 deletions(-)

diff --git a/src/learn/autograd.h b/src/learn/autograd.h
index 4edf0e4c..5c573c0f 100644
--- a/src/learn/autograd.h
+++ b/src/learn/autograd.h
@@ -45,12 +45,12 @@ namespace Learner
             return *this;
         }
 
-        ValueWithGrad abs() const
+        [[nodiscard]] ValueWithGrad abs() const
         {
             return { std::abs(value), std::abs(grad) };
         }
 
-        ValueWithGrad clamp_grad(T max) const
+        [[nodiscard]] ValueWithGrad clamp_grad(T max) const
         {
             return { value, std::clamp(grad, -max, max) };
         }
@@ -82,13 +82,13 @@ namespace Learner::Autograd::UnivariateStatic
         constexpr Evaluable() = default;
 
         template <typename... ArgsTs>
-        auto eval(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] auto eval(const std::tuple<ArgsTs...>& args) const
         {
             return ValueWithGrad<T>{ value(args), grad(args) };
         }
 
         template <typename... ArgsTs>
-        auto value(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] auto value(const std::tuple<ArgsTs...>& args) const
         {
             const ChildT* this_ = static_cast<const ChildT*>(this);
 
@@ -101,7 +101,7 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        auto grad(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] auto grad(const std::tuple<ArgsTs...>& args) const
         {
             const ChildT* this_ = static_cast<const ChildT*>(this);
 
@@ -128,13 +128,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T calculate_value(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return std::get<I>(args);
         }
 
         template <typename... ArgsTs>
-        T calculate_grad(const std::tuple<ArgsTs...>&) const
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>&) const
         {
             return T(1.0);
         }
@@ -150,13 +150,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T calculate_value(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return std::get<I>(args);
         }
 
         template <typename... ArgsTs>
-        T calculate_grad(const std::tuple<ArgsTs...>&) const
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>&) const
         {
             return T(0.0);
         }
@@ -173,13 +173,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T calculate_value(const std::tuple<ArgsTs...>&) const
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>&) const
         {
             return m_x;
         }
 
         template <typename... ArgsTs>
-        T calculate_grad(const std::tuple<ArgsTs...>&) const
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>&) const
         {
             return T(0.0);
         }
@@ -200,13 +200,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T calculate_value(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return m_lhs.value(args) + m_rhs.value(args);
         }
 
         template <typename... ArgsTs>
-        T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
         {
             return m_lhs.grad(args) + m_rhs.grad(args);
         }
@@ -217,19 +217,19 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    constexpr auto operator+(LhsT&& lhs, RhsT&& rhs)
+    [[nodiscard]] constexpr auto operator+(LhsT&& lhs, RhsT&& rhs)
     {
         return Sum<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
     }
 
     template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    constexpr auto operator+(LhsT&& lhs, Id<T> rhs)
+    [[nodiscard]] constexpr auto operator+(LhsT&& lhs, Id<T> rhs)
     {
         return Sum<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
     }
 
     template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
-    constexpr auto operator+(Id<T> lhs, RhsT&& rhs)
+    [[nodiscard]] constexpr auto operator+(Id<T> lhs, RhsT&& rhs)
     {
         return Sum<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
     }
@@ -246,13 +246,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T calculate_value(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return m_lhs.value(args) - m_rhs.value(args);
         }
 
         template <typename... ArgsTs>
-        T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
         {
             return m_lhs.grad(args) - m_rhs.grad(args);
         }
@@ -263,19 +263,19 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    constexpr auto operator-(LhsT&& lhs, RhsT&& rhs)
+    [[nodiscard]] constexpr auto operator-(LhsT&& lhs, RhsT&& rhs)
     {
         return Difference<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
     }
 
     template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    constexpr auto operator-(LhsT&& lhs, Id<T> rhs)
+    [[nodiscard]] constexpr auto operator-(LhsT&& lhs, Id<T> rhs)
     {
         return Difference<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
     }
 
     template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
-    constexpr auto operator-(Id<T> lhs, RhsT&& rhs)
+    [[nodiscard]] constexpr auto operator-(Id<T> lhs, RhsT&& rhs)
     {
         return Difference<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
     }
@@ -292,13 +292,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T calculate_value(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return m_lhs.value(args) * m_rhs.value(args);
         }
 
         template <typename... ArgsTs>
-        T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
         {
             return m_lhs.grad(args) * m_rhs.value(args) + m_lhs.value(args) * m_rhs.grad(args);
         }
@@ -309,19 +309,19 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    constexpr auto operator*(LhsT&& lhs, RhsT&& rhs)
+    [[nodiscard]] constexpr auto operator*(LhsT&& lhs, RhsT&& rhs)
     {
         return Product<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
     }
 
     template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    constexpr auto operator*(LhsT&& lhs, Id<T> rhs)
+    [[nodiscard]] constexpr auto operator*(LhsT&& lhs, Id<T> rhs)
     {
         return Product<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
     }
 
     template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
-    constexpr auto operator*(Id<T> lhs, RhsT&& rhs)
+    [[nodiscard]] constexpr auto operator*(Id<T> lhs, RhsT&& rhs)
     {
         return Product<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
     }
@@ -337,13 +337,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T calculate_value(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return -m_x.value(args);
         }
 
         template <typename... ArgsTs>
-        T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
         {
             return -m_x.grad(args);
         }
@@ -353,7 +353,7 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
-    constexpr auto operator-(ArgT&& x)
+    [[nodiscard]] constexpr auto operator-(ArgT&& x)
     {
         return Negation<ArgT&&>(std::forward<ArgT>(x));
     }
@@ -369,13 +369,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T calculate_value(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return value_(m_x.value(args));
         }
 
         template <typename... ArgsTs>
-        T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
         {
             return m_x.grad(args) * grad_(m_x.value(args));
         }
@@ -383,19 +383,19 @@ namespace Learner::Autograd::UnivariateStatic
     private:
         StoreValueOrRef<ArgT> m_x;
 
-        T value_(T x) const
+        [[nodiscard]] T value_(T x) const
         {
             return 1.0 / (1.0 + std::exp(-x));
         }
 
-        T grad_(T x) const
+        [[nodiscard]] T grad_(T x) const
         {
             return value_(x) * (1.0 - value_(x));
         }
     };
 
     template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
-    constexpr auto sigmoid(ArgT&& x)
+    [[nodiscard]] constexpr auto sigmoid(ArgT&& x)
     {
         return Sigmoid<ArgT&&>(std::forward<ArgT>(x));
     }
@@ -412,13 +412,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T calculate_value(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return std::pow(m_x.value(args), m_exponent);
         }
 
         template <typename... ArgsTs>
-        T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
         {
             return m_exponent * std::pow(m_x.value(args), m_exponent - T(1.0)) * m_x.grad(args);
         }
@@ -429,7 +429,7 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
-    constexpr auto pow(ArgT&& x, Id<T> exp)
+    [[nodiscard]] constexpr auto pow(ArgT&& x, Id<T> exp)
     {
         return Pow<ArgT&&>(std::forward<ArgT>(x), std::move(exp));
     }
@@ -445,13 +445,13 @@ namespace Learner::Autograd::UnivariateStatic
         }
 
         template <typename... ArgsTs>
-        T calculate_value(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
         {
             return value_(m_x.value(args));
         }
 
         template <typename... ArgsTs>
-        T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
         {
             return m_x.grad(args) * grad_(m_x.value(args));
         }
@@ -471,7 +471,7 @@ namespace Learner::Autograd::UnivariateStatic
     };
 
     template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
-    constexpr auto log(ArgT&& x)
+    [[nodiscard]] constexpr auto log(ArgT&& x)
     {
         return Log<ArgT&&>(std::forward<ArgT>(x));
     }

From 8adf00ae6e43b5f27ad48deb39b87a7c05b2fe5e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 30 Nov 2020 14:01:31 +0100
Subject: [PATCH 489/583] Identify a single evalation chain by ID in autograd
 to prevent cache reuse for subsequent evaluations of the same expression
 tree.

---
 src/learn/autograd.h | 87 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 82 insertions(+), 5 deletions(-)

diff --git a/src/learn/autograd.h b/src/learn/autograd.h
index 5c573c0f..7006121a 100644
--- a/src/learn/autograd.h
+++ b/src/learn/autograd.h
@@ -8,6 +8,7 @@
 #include <tuple>
 #include <optional>
 #include <algorithm>
+#include <cstdint>
 
 namespace Learner
 {
@@ -76,46 +77,122 @@ namespace Learner::Autograd::UnivariateStatic
             const std::remove_reference_t<T>&
         >;
 
+    namespace Detail
+    {
+        using CallIdType = std::uint32_t;
+
+        struct CallId
+        {
+            CallIdType call_id{};
+
+            constexpr CallId() :
+                call_id(0)
+            {
+            }
+
+            constexpr CallId(CallIdType id) :
+                call_id(id)
+            {
+            }
+
+            [[nodiscard]] bool operator==(CallId rhs) const noexcept
+            {
+                return call_id == rhs.call_id;
+            }
+
+            [[nodiscard]] bool operator!=(CallId rhs) const noexcept
+            {
+                return call_id != rhs.call_id;
+            }
+        };
+
+        [[nodiscard]] inline CallId next_call_id()
+        {
+            static thread_local CallIdType s_call_id = 0;
+            return CallId{ s_call_id++ };
+        }
+
+        template <typename T, typename Tuple>
+        struct TupleContains;
+
+        template <typename T, typename... Us>
+        struct TupleContains<T, std::tuple<Us...>> : std::disjunction<std::is_same<T, Us>...> {};
+
+        template <typename T, typename Tuple>
+        constexpr bool TupleContainsV = TupleContains<T, Tuple>::value;
+    }
+
     template <typename T, typename ChildT>
     struct Evaluable
     {
         constexpr Evaluable() = default;
 
+        // We append a unique call id so that we can invalidate the cache when
+        // the next computation starts. A single evaluation should see
+        // the same call_id at every node.
         template <typename... ArgsTs>
         [[nodiscard]] auto eval(const std::tuple<ArgsTs...>& args) const
         {
-            return ValueWithGrad<T>{ value(args), grad(args) };
+            const auto call_id = Detail::next_call_id();
+            const auto new_args = std::tuple_cat(args, std::tuple(call_id));
+            return ValueWithGrad<T>{ value(new_args), grad(new_args) };
         }
 
-        template <typename... ArgsTs>
+        template <typename... ArgsTs,
+            typename SFINAE = std::enable_if_t<Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
         [[nodiscard]] auto value(const std::tuple<ArgsTs...>& args) const
         {
             const ChildT* this_ = static_cast<const ChildT*>(this);
 
-            if (!value_cache.has_value())
+            const auto call_id = std::get<Detail::CallId>(args);
+            if (!value_cache.has_value() || value_cache_call_id != call_id)
             {
+                value_cache_call_id = call_id;
                 value_cache = this_->calculate_value(args);
             }
 
             return *value_cache;
         }
 
-        template <typename... ArgsTs>
+        template <typename... ArgsTs,
+            typename SFINAE = std::enable_if_t<!Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
+        [[nodiscard]] auto value(const std::tuple<ArgsTs...>& args, ...) const
+        {
+            const auto call_id = Detail::next_call_id();
+            const auto new_args = std::tuple_cat(args, std::tuple(call_id));
+            return value(new_args);
+        }
+
+        template <typename... ArgsTs,
+            typename SFINAE = std::enable_if_t<Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
         [[nodiscard]] auto grad(const std::tuple<ArgsTs...>& args) const
         {
             const ChildT* this_ = static_cast<const ChildT*>(this);
 
-            if (!grad_cache.has_value())
+            const auto call_id = std::get<Detail::CallId>(args);
+            if (!grad_cache.has_value() || grad_cache_call_id != call_id)
             {
+                grad_cache_call_id = call_id;
                 grad_cache = this_->calculate_grad(args);
             }
 
             return *grad_cache;
         }
 
+        template <typename... ArgsTs,
+            typename SFINAE = std::enable_if_t<!Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
+        [[nodiscard]] auto grad(const std::tuple<ArgsTs...>& args, ...) const
+        {
+            const auto call_id = Detail::next_call_id();
+            const auto new_args = std::tuple_cat(args, std::tuple(call_id));
+            return grad(new_args);
+        }
+
     private:
         mutable std::optional<T> value_cache;
         mutable std::optional<T> grad_cache;
+        mutable Detail::CallId value_cache_call_id{};
+        mutable Detail::CallId grad_cache_call_id{};
     };
 
     template <typename T, int I>

From 891abf55115fca95ee40103bb1157cf341ba57d9 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 30 Nov 2020 15:14:26 +0100
Subject: [PATCH 490/583] Make the autograd loss expression chain thread_local.

---
 src/learn/learn.cpp | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 0b04d034..af867d42 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -215,21 +215,28 @@ namespace Learner
         auto loss_ = pow(q_ - p_, 2.0) * (1.0 / (2400.0 * 2.0 * 600.0));
         */
 
-        const double epsilon = 1e-12;
+        constexpr double epsilon = 1e-12;
 
-        auto q_ = sigmoid(VariableParameter<double, 0>{} * winning_probability_coefficient);
-        auto p_ = sigmoid(ConstantParameter<double, 1>{} * winning_probability_coefficient);
-        auto t_ = (ConstantParameter<double, 2>{} + 1.0) * 0.5;
-        auto lambda_ = ConstantParameter<double, 3>{};
-        auto teacher_entropy_ = -(p_ * log(p_ + epsilon) + (1.0 - p_) * log(1.0 - p_ + epsilon));
-        auto outcome_entropy_ = -(t_ * log(t_ + epsilon) + (1.0 - t_) * log(1.0 - t_ + epsilon));
-        auto teacher_loss_ = -(p_ * log(q_) + (1.0 - p_) * log(1.0 - q_));
-        auto outcome_loss_ = -(t_ * log(q_) + (1.0 - t_) * log(1.0 - q_));
-        auto result_ = lambda_ * teacher_loss_ + (1.0 - lambda_) * outcome_loss_;
-        auto entropy_ = lambda_ * teacher_entropy_ + (1.0 - lambda_) * outcome_entropy_;
-        auto loss_ = result_ - entropy_;
+        static thread_local auto q_ = sigmoid(VariableParameter<double, 0>{} * ConstantParameter<double, 4>{});
+        static thread_local auto p_ = sigmoid(ConstantParameter<double, 1>{} * ConstantParameter<double, 4>{});
+        static thread_local auto t_ = (ConstantParameter<double, 2>{} + 1.0) * 0.5;
+        static thread_local auto lambda_ = ConstantParameter<double, 3>{};
+        static thread_local auto teacher_entropy_ = -(p_ * log(p_ + epsilon) + (1.0 - p_) * log(1.0 - p_ + epsilon));
+        static thread_local auto outcome_entropy_ = -(t_ * log(t_ + epsilon) + (1.0 - t_) * log(1.0 - t_ + epsilon));
+        static thread_local auto teacher_loss_ = -(p_ * log(q_) + (1.0 - p_) * log(1.0 - q_));
+        static thread_local auto outcome_loss_ = -(t_ * log(q_) + (1.0 - t_) * log(1.0 - q_));
+        static thread_local auto result_ = lambda_ * teacher_loss_ + (1.0 - lambda_) * outcome_loss_;
+        static thread_local auto entropy_ = lambda_ * teacher_entropy_ + (1.0 - lambda_) * outcome_entropy_;
+        static thread_local auto loss_ = result_ - entropy_;
+
+        auto args = std::tuple(
+            (double)shallow, 
+            (double)teacher_signal, 
+            (double)result, 
+            calculate_lambda(teacher_signal), 
+            winning_probability_coefficient
+        );
 
-        auto args = std::tuple((double)shallow, (double)teacher_signal, (double)result, calculate_lambda(teacher_signal));
         return loss_.eval(args).clamp_grad(max_grad);
     }
 

From e975889132bd2303915a2e2eb587b2633487c358 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 30 Nov 2020 15:21:39 +0100
Subject: [PATCH 491/583] Move cross_entropy calculation to a separate
 function.

---
 src/learn/learn.cpp | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index af867d42..dd893d9d 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -197,6 +197,29 @@ namespace Learner
         return lambda;
     }
 
+    template <typename ShallowT, typename TeacherT, typename ResultT, typename LambdaT>
+    static auto& cross_entropy_(
+        ShallowT& q_,
+        TeacherT& p_,
+        ResultT& t_,
+        LambdaT& lambda_
+    )
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        constexpr double epsilon = 1e-12;
+
+        static thread_local auto teacher_entropy_ = -(p_ * log(p_ + epsilon) + (1.0 - p_) * log(1.0 - p_ + epsilon));
+        static thread_local auto outcome_entropy_ = -(t_ * log(t_ + epsilon) + (1.0 - t_) * log(1.0 - t_ + epsilon));
+        static thread_local auto teacher_loss_ = -(p_ * log(q_) + (1.0 - p_) * log(1.0 - q_));
+        static thread_local auto outcome_loss_ = -(t_ * log(q_) + (1.0 - t_) * log(1.0 - q_));
+        static thread_local auto result_ = lambda_ * teacher_loss_ + (1.0 - lambda_) * outcome_loss_;
+        static thread_local auto entropy_ = lambda_ * teacher_entropy_ + (1.0 - lambda_) * outcome_entropy_;
+        static thread_local auto loss_ = result_ - entropy_;
+
+        return loss_;
+    }
+
     static ValueWithGrad<double> get_loss(Value shallow, Value teacher_signal, int result, int ply)
     {
         using namespace Learner::Autograd::UnivariateStatic;
@@ -215,19 +238,11 @@ namespace Learner
         auto loss_ = pow(q_ - p_, 2.0) * (1.0 / (2400.0 * 2.0 * 600.0));
         */
 
-        constexpr double epsilon = 1e-12;
-
         static thread_local auto q_ = sigmoid(VariableParameter<double, 0>{} * ConstantParameter<double, 4>{});
         static thread_local auto p_ = sigmoid(ConstantParameter<double, 1>{} * ConstantParameter<double, 4>{});
         static thread_local auto t_ = (ConstantParameter<double, 2>{} + 1.0) * 0.5;
         static thread_local auto lambda_ = ConstantParameter<double, 3>{};
-        static thread_local auto teacher_entropy_ = -(p_ * log(p_ + epsilon) + (1.0 - p_) * log(1.0 - p_ + epsilon));
-        static thread_local auto outcome_entropy_ = -(t_ * log(t_ + epsilon) + (1.0 - t_) * log(1.0 - t_ + epsilon));
-        static thread_local auto teacher_loss_ = -(p_ * log(q_) + (1.0 - p_) * log(1.0 - q_));
-        static thread_local auto outcome_loss_ = -(t_ * log(q_) + (1.0 - t_) * log(1.0 - q_));
-        static thread_local auto result_ = lambda_ * teacher_loss_ + (1.0 - lambda_) * outcome_loss_;
-        static thread_local auto entropy_ = lambda_ * teacher_entropy_ + (1.0 - lambda_) * outcome_entropy_;
-        static thread_local auto loss_ = result_ - entropy_;
+        static thread_local auto loss_ = cross_entropy_(q_, p_, t_, lambda_);
 
         auto args = std::tuple(
             (double)shallow, 

From cbd973fdaaec0685717441a0c5418a95f5527acc Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 30 Nov 2020 16:50:51 +0100
Subject: [PATCH 492/583] Detect constant expressions in autograd and return 0
 grad early.

---
 src/learn/autograd.h | 44 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 37 insertions(+), 7 deletions(-)

diff --git a/src/learn/autograd.h b/src/learn/autograd.h
index 7006121a..45bee469 100644
--- a/src/learn/autograd.h
+++ b/src/learn/autograd.h
@@ -120,6 +120,9 @@ namespace Learner::Autograd::UnivariateStatic
 
         template <typename T, typename Tuple>
         constexpr bool TupleContainsV = TupleContains<T, Tuple>::value;
+
+        template <typename... Ts>
+        constexpr bool AreAllConstantV = (std::remove_reference_t<Ts>::is_constant && ...);
     }
 
     template <typename T, typename ChildT>
@@ -167,16 +170,23 @@ namespace Learner::Autograd::UnivariateStatic
             typename SFINAE = std::enable_if_t<Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
         [[nodiscard]] auto grad(const std::tuple<ArgsTs...>& args) const
         {
-            const ChildT* this_ = static_cast<const ChildT*>(this);
-
-            const auto call_id = std::get<Detail::CallId>(args);
-            if (!grad_cache.has_value() || grad_cache_call_id != call_id)
+            if constexpr (ChildT::is_constant)
             {
-                grad_cache_call_id = call_id;
-                grad_cache = this_->calculate_grad(args);
+                return T(0.0);
             }
+            else
+            {
+                const ChildT* this_ = static_cast<const ChildT*>(this);
 
-            return *grad_cache;
+                const auto call_id = std::get<Detail::CallId>(args);
+                if (!grad_cache.has_value() || grad_cache_call_id != call_id)
+                {
+                    grad_cache_call_id = call_id;
+                    grad_cache = this_->calculate_grad(args);
+                }
+
+                return *grad_cache;
+            }
         }
 
         template <typename... ArgsTs,
@@ -199,6 +209,8 @@ namespace Learner::Autograd::UnivariateStatic
     struct VariableParameter : Evaluable<T, VariableParameter<T, I>>
     {
         using ValueType = T;
+        
+        static constexpr bool is_constant = false;
 
         constexpr VariableParameter()
         {
@@ -222,6 +234,8 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
+        static constexpr bool is_constant = true;
+
         constexpr ConstantParameter()
         {
         }
@@ -244,6 +258,8 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
+        static constexpr bool is_constant = true;
+
         constexpr Constant(T x) :
             m_x(std::move(x))
         {
@@ -270,6 +286,8 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
+        static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
+
         constexpr Sum(LhsT&& lhs, RhsT&& rhs) :
             m_lhs(std::forward<LhsT>(lhs)),
             m_rhs(std::forward<RhsT>(rhs))
@@ -316,6 +334,8 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
+        static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
+
         constexpr Difference(LhsT&& lhs, RhsT&& rhs) :
             m_lhs(std::forward<LhsT>(lhs)),
             m_rhs(std::forward<RhsT>(rhs))
@@ -362,6 +382,8 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
+        static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
+
         constexpr Product(LhsT&& lhs, RhsT&& rhs) :
             m_lhs(std::forward<LhsT>(lhs)),
             m_rhs(std::forward<RhsT>(rhs))
@@ -408,6 +430,8 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
+        static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
+
         constexpr explicit Negation(ArgT&& x) :
             m_x(std::forward<ArgT>(x))
         {
@@ -440,6 +464,8 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
+        static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
+
         constexpr explicit Sigmoid(ArgT&& x) :
             m_x(std::forward<ArgT>(x))
         {
@@ -482,6 +508,8 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
+        static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
+
         constexpr explicit Pow(ArgT&& x, Id<T> exponent) :
             m_x(std::forward<ArgT>(x)),
             m_exponent(std::move(exponent))
@@ -516,6 +544,8 @@ namespace Learner::Autograd::UnivariateStatic
     {
         using ValueType = T;
 
+        static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
+
         constexpr explicit Log(ArgT&& x) :
             m_x(std::forward<ArgT>(x))
         {

From 01ae7b1e2c7f5a8e4c5bf3552b2f3378409efeb2 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 30 Nov 2020 20:26:02 +0100
Subject: [PATCH 493/583] Simplify passing constants that may vary between
 calls.

---
 src/learn/autograd.h | 32 +++++++++++++++++++++++++++++++-
 src/learn/learn.cpp  | 23 ++++++++++++++++-------
 2 files changed, 47 insertions(+), 8 deletions(-)

diff --git a/src/learn/autograd.h b/src/learn/autograd.h
index 45bee469..4383dfab 100644
--- a/src/learn/autograd.h
+++ b/src/learn/autograd.h
@@ -209,7 +209,7 @@ namespace Learner::Autograd::UnivariateStatic
     struct VariableParameter : Evaluable<T, VariableParameter<T, I>>
     {
         using ValueType = T;
-        
+
         static constexpr bool is_constant = false;
 
         constexpr VariableParameter()
@@ -281,6 +281,36 @@ namespace Learner::Autograd::UnivariateStatic
         T m_x;
     };
 
+    // The "constant" may change between executions, but is assumed to be
+    // constant during a single evaluation.
+    template <typename T>
+    struct ConstantRef : Evaluable<T, ConstantRef<T>>
+    {
+        using ValueType = T;
+
+        static constexpr bool is_constant = true;
+
+        constexpr ConstantRef(const T& x) :
+            m_x(x)
+        {
+        }
+
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>&) const
+        {
+            return m_x;
+        }
+
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>&) const
+        {
+            return T(0.0);
+        }
+
+    private:
+        const T& m_x;
+    };
+
     template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
     struct Sum : Evaluable<T, Sum<LhsT, RhsT, T>>
     {
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index dd893d9d..8e32836b 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -220,6 +220,16 @@ namespace Learner
         return loss_;
     }
 
+    template <typename ValueT>
+    static auto& expected_perf_(ValueT&& v_)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        static thread_local auto perf_ = sigmoid(std::forward<ValueT>(v_) * ConstantRef<double>(winning_probability_coefficient));
+
+        return perf_;
+    }
+
     static ValueWithGrad<double> get_loss(Value shallow, Value teacher_signal, int result, int ply)
     {
         using namespace Learner::Autograd::UnivariateStatic;
@@ -238,18 +248,17 @@ namespace Learner
         auto loss_ = pow(q_ - p_, 2.0) * (1.0 / (2400.0 * 2.0 * 600.0));
         */
 
-        static thread_local auto q_ = sigmoid(VariableParameter<double, 0>{} * ConstantParameter<double, 4>{});
-        static thread_local auto p_ = sigmoid(ConstantParameter<double, 1>{} * ConstantParameter<double, 4>{});
+        static thread_local auto q_ = expected_perf_(VariableParameter<double, 0>{});
+        static thread_local auto p_ = expected_perf_(ConstantParameter<double, 1>{});
         static thread_local auto t_ = (ConstantParameter<double, 2>{} + 1.0) * 0.5;
         static thread_local auto lambda_ = ConstantParameter<double, 3>{};
         static thread_local auto loss_ = cross_entropy_(q_, p_, t_, lambda_);
 
         auto args = std::tuple(
-            (double)shallow, 
-            (double)teacher_signal, 
-            (double)result, 
-            calculate_lambda(teacher_signal), 
-            winning_probability_coefficient
+            (double)shallow,
+            (double)teacher_signal,
+            (double)result,
+            calculate_lambda(teacher_signal)
         );
 
         return loss_.eval(args).clamp_grad(max_grad);

From de675e3503dde2a93cbfecb75260285c91665a14 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 30 Nov 2020 20:32:53 +0100
Subject: [PATCH 494/583] Reintroduce optional scaling of the teacher signal.

---
 src/learn/autograd.h | 49 ++++++++++++++++++++++++++++++++++++++++++++
 src/learn/learn.cpp  | 21 ++++++++++++++++++-
 2 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/src/learn/autograd.h b/src/learn/autograd.h
index 4383dfab..7b2853df 100644
--- a/src/learn/autograd.h
+++ b/src/learn/autograd.h
@@ -455,6 +455,55 @@ namespace Learner::Autograd::UnivariateStatic
         return Product<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
     }
 
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    struct Quotient : Evaluable<T, Quotient<LhsT, RhsT, T>>
+    {
+        using ValueType = T;
+
+        static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
+
+        constexpr Quotient(LhsT&& lhs, RhsT&& rhs) :
+            m_lhs(std::forward<LhsT>(lhs)),
+            m_rhs(std::forward<RhsT>(rhs))
+        {
+        }
+
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
+        {
+            return m_lhs.value(args) / m_rhs.value(args);
+        }
+
+        template <typename... ArgsTs>
+        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
+        {
+            auto g = m_rhs.value(args);
+            return (m_lhs.grad(args) * g - m_lhs.value(args) * m_rhs.grad(args)) / (g * g);
+        }
+
+    private:
+        StoreValueOrRef<LhsT> m_lhs;
+        StoreValueOrRef<RhsT> m_rhs;
+    };
+
+    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator/(LhsT&& lhs, RhsT&& rhs)
+    {
+        return Quotient<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
+    }
+
+    template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator/(LhsT&& lhs, Id<T> rhs)
+    {
+        return Quotient<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
+    }
+
+    template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
+    [[nodiscard]] constexpr auto operator/(Id<T> lhs, RhsT&& rhs)
+    {
+        return Quotient<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
+    }
+
     template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
     struct Negation : Evaluable<T, Negation<ArgT, T>>
     {
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 8e32836b..07e5bd4a 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -220,6 +220,25 @@ namespace Learner
         return loss_;
     }
 
+    template <typename ValueT>
+    static auto& scale_score_(ValueT&& v_)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        // Normalize to [0.0, 1.0].
+        static thread_local auto normalized_ =
+            (std::forward<ValueT>(v_) - ConstantRef<double>(src_score_min_value))
+            / (ConstantRef<double>(src_score_max_value) - ConstantRef<double>(src_score_min_value));
+
+        // Scale to [dest_score_min_value, dest_score_max_value].
+        static thread_local auto scaled_ =
+            normalized_
+            * (ConstantRef<double>(dest_score_max_value) - ConstantRef<double>(dest_score_min_value))
+            + ConstantRef<double>(dest_score_min_value);
+
+        return scaled_;
+    }
+
     template <typename ValueT>
     static auto& expected_perf_(ValueT&& v_)
     {
@@ -249,7 +268,7 @@ namespace Learner
         */
 
         static thread_local auto q_ = expected_perf_(VariableParameter<double, 0>{});
-        static thread_local auto p_ = expected_perf_(ConstantParameter<double, 1>{});
+        static thread_local auto p_ = expected_perf_(scale_score_(ConstantParameter<double, 1>{}));
         static thread_local auto t_ = (ConstantParameter<double, 2>{} + 1.0) * 0.5;
         static thread_local auto lambda_ = ConstantParameter<double, 3>{};
         static thread_local auto loss_ = cross_entropy_(q_, p_, t_, lambda_);

From 256c4b55ec3b7e6ffbbb19e2cc15fa1ecbfb48b0 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 30 Nov 2020 20:39:46 +0100
Subject: [PATCH 495/583] Properly apply gradient norm clipping after it's
 scaled in the update_parameters.

---
 src/learn/learn.cpp                | 11 +++++------
 src/nnue/evaluate_nnue_learner.cpp |  6 ++++--
 src/nnue/evaluate_nnue_learner.h   |  1 +
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 07e5bd4a..109a43ea 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -95,8 +95,6 @@ namespace Learner
     static double elmo_lambda_high = 1.0;
     static double elmo_lambda_limit = 32000;
 
-    static double max_grad = 1.0;
-
     // Using stockfish's WDL with win rate model instead of sigmoid
     static bool use_wdl = false;
 
@@ -280,7 +278,7 @@ namespace Learner
             calculate_lambda(teacher_signal)
         );
 
-        return loss_.eval(args).clamp_grad(max_grad);
+        return loss_.eval(args);
     }
 
     static auto get_loss(
@@ -334,6 +332,7 @@ namespace Learner
             bool smart_fen_skipping = false;
 
             double learning_rate = 1.0;
+            double max_grad = 1.0;
 
             string validation_set_file_name;
             string seed;
@@ -651,7 +650,7 @@ namespace Learner
         // should be no real issues happening since
         // the read/write phases are isolated.
         atomic_thread_fence(memory_order_seq_cst);
-        Eval::NNUE::update_parameters(Threads, epoch, params.verbose, params.learning_rate, get_loss);
+        Eval::NNUE::update_parameters(Threads, epoch, params.verbose, params.learning_rate, params.max_grad, get_loss);
         atomic_thread_fence(memory_order_seq_cst);
 
         if (++save_count * params.mini_batch_size >= params.eval_save_interval)
@@ -985,6 +984,7 @@ namespace Learner
 
             // learning rate
             else if (option == "lr") is >> params.learning_rate;
+            else if (option == "max_grad") is >> params.max_grad;
 
             // Accept also the old option name.
             else if (option == "use_draw_in_training"
@@ -1012,7 +1012,6 @@ namespace Learner
             else if (option == "lambda") is >> elmo_lambda_low;
             else if (option == "lambda2") is >> elmo_lambda_high;
             else if (option == "lambda_limit") is >> elmo_lambda_limit;
-            else if (option == "max_grad") is >> max_grad;
 
             else if (option == "reduction_gameply") is >> params.reduction_gameply;
 
@@ -1100,6 +1099,7 @@ namespace Learner
         out << "  - nn_options               : " << nn_options << endl;
 
         out << "  - learning rate            : " << params.learning_rate << endl;
+        out << "  - max_grad                 : " << params.max_grad << endl;
         out << "  - use draws in training    : " << params.use_draw_games_in_training << endl;
         out << "  - use draws in validation  : " << params.use_draw_games_in_validation << endl;
         out << "  - skip repeated positions  : " << params.skip_duplicated_positions_in_training << endl;
@@ -1117,7 +1117,6 @@ namespace Learner
         out << "  - elmo_lambda_low          : " << elmo_lambda_low << endl;
         out << "  - elmo_lambda_high         : " << elmo_lambda_high << endl;
         out << "  - elmo_lambda_limit        : " << elmo_lambda_limit << endl;
-        out << "  - max_grad                 : " << max_grad << endl;
         out << "  - eval_save_interval       : " << params.eval_save_interval << " sfens" << endl;
         out << "  - loss_output_interval     : " << params.loss_output_interval << " sfens" << endl;
 
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 038a462c..8c28e4f4 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -195,6 +195,7 @@ namespace Eval::NNUE {
         uint64_t epoch,
         bool verbose,
         double learning_rate,
+        double max_grad,
         Learner::CalcLossFunc calc_loss)
     {
         using namespace Learner::Autograd::UnivariateStatic;
@@ -237,8 +238,9 @@ namespace Eval::NNUE {
                         const auto discrete = e.sign * e.discrete_nn_eval;
                         const auto& psv = e.psv;
                         const auto loss = calc_loss(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
-                        const double gradient = loss.grad * e.sign * kPonanzaConstant;
-                        gradients[b] = static_cast<LearnFloatType>(gradient * e.weight);
+                        const double gradient = std::clamp(
+                            loss.grad * e.sign * kPonanzaConstant * e.weight, -max_grad, max_grad);
+                        gradients[b] = static_cast<LearnFloatType>(gradient);
 
 
                         // The discrete eval will only be valid before first backpropagation,
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index 7f7daa5b..5beca0a7 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -38,6 +38,7 @@ namespace Eval::NNUE {
         uint64_t epoch,
         bool verbose,
         double learning_rate,
+        double max_grad,
         Learner::CalcLossFunc calc_loss);
 
     // Check if there are any problems with learning

From cf6bc7ecaf006bbbb0325f117a41a26b98e0a50e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 30 Nov 2020 20:42:54 +0100
Subject: [PATCH 496/583] Cleanup around get_loss

---
 src/learn/learn.cpp | 41 ++++++++++++++++++++++++++---------------
 1 file changed, 26 insertions(+), 15 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 109a43ea..e3bfe3a4 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -213,9 +213,9 @@ namespace Learner
         static thread_local auto outcome_loss_ = -(t_ * log(q_) + (1.0 - t_) * log(1.0 - q_));
         static thread_local auto result_ = lambda_ * teacher_loss_ + (1.0 - lambda_) * outcome_loss_;
         static thread_local auto entropy_ = lambda_ * teacher_entropy_ + (1.0 - lambda_) * outcome_entropy_;
-        static thread_local auto loss_ = result_ - entropy_;
+        static thread_local auto cross_entropy_ = result_ - entropy_;
 
-        return loss_;
+        return cross_entropy_;
     }
 
     template <typename ValueT>
@@ -247,23 +247,27 @@ namespace Learner
         return perf_;
     }
 
-    static ValueWithGrad<double> get_loss(Value shallow, Value teacher_signal, int result, int ply)
+    static ValueWithGrad<double> get_loss_noob(Value shallow, Value teacher_signal, int result, int /* ply */)
     {
         using namespace Learner::Autograd::UnivariateStatic;
 
-        /*
-        auto q_ = sigmoid(VariableParameter<double, 0>{} * winning_probability_coefficient);
-        auto p_ = sigmoid(ConstantParameter<double, 1>{} * winning_probability_coefficient);
-        auto t_ = (ConstantParameter<double, 2>{} + 1.0) * 0.5;
-        auto lambda_ = ConstantParameter<double, 3>{};
-        auto loss_ = pow(lambda_ * (q_ - p_) + (1.0 - lambda_) * (q_ - t_), 2.0);
-        */
+        static thread_local auto q_ = VariableParameter<double, 0>{};
+        static thread_local auto p_ = ConstantParameter<double, 1>{};
+        static thread_local auto loss_ = pow(q_ - p_, 2.0) * (1.0 / (2400.0 * 2.0 * 600.0));
 
-        /*
-        auto q_ = VariableParameter<double, 0>{};
-        auto p_ = ConstantParameter<double, 1>{};
-        auto loss_ = pow(q_ - p_, 2.0) * (1.0 / (2400.0 * 2.0 * 600.0));
-        */
+        auto args = std::tuple(
+            (double)shallow,
+            (double)teacher_signal,
+            (double)result,
+            calculate_lambda(teacher_signal)
+        );
+
+        return loss_.eval(args);
+    }
+
+    static ValueWithGrad<double> get_loss_cross_entropy(Value shallow, Value teacher_signal, int result, int /* ply */)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
 
         static thread_local auto q_ = expected_perf_(VariableParameter<double, 0>{});
         static thread_local auto p_ = expected_perf_(scale_score_(ConstantParameter<double, 1>{}));
@@ -281,6 +285,13 @@ namespace Learner
         return loss_.eval(args);
     }
 
+    static auto get_loss(Value shallow, Value teacher_signal, int result, int ply)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        return get_loss_cross_entropy(shallow, teacher_signal, result, ply);
+    }
+
     static auto get_loss(
         Value teacher_signal,
         Value shallow,

From 99cb869db32cb309b9e5f9168706591e3fc97805 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 30 Nov 2020 21:17:44 +0100
Subject: [PATCH 497/583] Reintroduce use_wdl.

---
 src/learn/learn.cpp | 120 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 118 insertions(+), 2 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index e3bfe3a4..18f84114 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -237,6 +237,22 @@ namespace Learner
         return scaled_;
     }
 
+    static Value scale_score(Value v)
+    {
+        // Normalize to [0.0, 1.0].
+        auto normalized =
+            ((double)v - src_score_min_value)
+            / (src_score_max_value - src_score_min_value);
+
+        // Scale to [dest_score_min_value, dest_score_max_value].
+        auto scaled =
+            normalized
+            * (dest_score_max_value - dest_score_min_value)
+            + dest_score_min_value;
+
+        return Value(scaled);
+    }
+
     template <typename ValueT>
     static auto& expected_perf_(ValueT&& v_)
     {
@@ -247,7 +263,72 @@ namespace Learner
         return perf_;
     }
 
-    static ValueWithGrad<double> get_loss_noob(Value shallow, Value teacher_signal, int result, int /* ply */)
+    template <typename ValueT, typename PlyT, typename T = typename ValueT::ValueType>
+    static auto& expected_perf_use_wdl_(
+        ValueT& v_,
+        PlyT&& ply_
+    )
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        // Coefficients of a 3rd order polynomial fit based on fishtest data
+        // for two parameters needed to transform eval to the argument of a
+        // logistic function.
+        static constexpr T as[] = { -8.24404295, 64.23892342, -95.73056462, 153.86478679 };
+        static constexpr T bs[] = { -3.37154371, 28.44489198, -56.67657741,  72.05858751 };
+
+        // The model captures only up to 240 plies, so limit input (and rescale)
+        static thread_local auto m_ = std::forward<PlyT>(ply_) / 64.0;
+         
+        static thread_local auto a_ = (((as[0] * m_ + as[1]) * m_ + as[2]) * m_) + as[3];
+        static thread_local auto b_ = (((bs[0] * m_ + bs[1]) * m_ + bs[2]) * m_) + bs[3];
+
+        // Return win rate in per mille
+        static thread_local auto sv_ = (v_ - a_) / b_;
+        static thread_local auto svn_ = (-v_ - a_) / b_;
+
+        static thread_local auto win_pct_ = sigmoid(sv_);
+        static thread_local auto loss_pct_ = sigmoid(svn_);
+
+        static thread_local auto draw_pct_ = 1.0 - win_pct_ - loss_pct_;
+
+        static thread_local auto perf_ = win_pct_ + draw_pct_ * 0.5;
+
+        return perf_;
+    }
+
+    static double expected_perf_use_wdl(
+        Value v,
+        int ply
+    )
+    {
+        // Coefficients of a 3rd order polynomial fit based on fishtest data
+        // for two parameters needed to transform eval to the argument of a
+        // logistic function.
+        static constexpr double as[] = { -8.24404295, 64.23892342, -95.73056462, 153.86478679 };
+        static constexpr double bs[] = { -3.37154371, 28.44489198, -56.67657741,  72.05858751 };
+
+        // The model captures only up to 240 plies, so limit input (and rescale)
+        auto m = ply / 64.0;
+
+        auto a = (((as[0] * m + as[1]) * m + as[2]) * m) + as[3];
+        auto b = (((bs[0] * m + bs[1]) * m + bs[2]) * m) + bs[3];
+
+        // Return win rate in per mille
+        auto sv = ((double)v - a) / b;
+        auto svn = ((double)-v - a) / b;
+
+        auto win_pct = Math::sigmoid(sv);
+        auto loss_pct = Math::sigmoid(svn);
+
+        auto draw_pct = 1.0 - win_pct - loss_pct;
+
+        auto perf = win_pct + draw_pct * 0.5;
+
+        return perf;
+    }
+
+    [[maybe_unused]] static ValueWithGrad<double> get_loss_noob(Value shallow, Value teacher_signal, int result, int /* ply */)
     {
         using namespace Learner::Autograd::UnivariateStatic;
 
@@ -285,11 +366,46 @@ namespace Learner
         return loss_.eval(args);
     }
 
+    static ValueWithGrad<double> get_loss_cross_entropy_use_wdl(
+        Value shallow, Value teacher_signal, int result, int ply)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        static thread_local auto ply_ = ConstantParameter<double, 4>{};
+        static thread_local auto shallow_ = VariableParameter<double, 0>{};
+        static thread_local auto q_ = expected_perf_use_wdl_(shallow_, ply_);
+        // We could do just this but MSVC crashes with an internal compiler error :(
+        // static thread_local auto scaled_teacher_ = scale_score_(ConstantParameter<double, 1>{});
+        // static thread_local auto p_ = expected_perf_use_wdl_(scaled_teacher_, ply_);
+        static thread_local auto p_ = ConstantParameter<double, 1>{};
+        static thread_local auto t_ = (ConstantParameter<double, 2>{} + 1.0) * 0.5;
+        static thread_local auto lambda_ = ConstantParameter<double, 3>{};
+        static thread_local auto loss_ = cross_entropy_(q_, p_, t_, lambda_);
+
+        auto args = std::tuple(
+            (double)shallow,
+            // This is required because otherwise MSVC crashes :(
+            expected_perf_use_wdl(scale_score(teacher_signal), ply),
+            (double)result,
+            calculate_lambda(teacher_signal),
+            (double)std::min(240, ply)
+        );
+
+        return loss_.eval(args);
+    }
+
     static auto get_loss(Value shallow, Value teacher_signal, int result, int ply)
     {
         using namespace Learner::Autograd::UnivariateStatic;
 
-        return get_loss_cross_entropy(shallow, teacher_signal, result, ply);
+        if (use_wdl)
+        {
+            return get_loss_cross_entropy_use_wdl(shallow, teacher_signal, result, ply);
+        }
+        else
+        {
+            return get_loss_cross_entropy(shallow, teacher_signal, result, ply);
+        }
     }
 
     static auto get_loss(

From 6cd0b030980ca577f699b94884916f277d7f4da1 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 30 Nov 2020 21:34:43 +0100
Subject: [PATCH 498/583] Add some comments regarding the current state of
 autograd loss computation.

---
 src/learn/learn.cpp | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 18f84114..d3316bf0 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -195,6 +195,48 @@ namespace Learner
         return lambda;
     }
 
+    // We use our own simple static autograd for automatic
+    // differentiation of the loss function. While it works it has it's caveats.
+    // To work fast enough it requires memoization and reference semantics.
+    // Memoization is mostly opaque to the user and is only per eval basis.
+    // As for reference semantics, we cannot copy every node, 
+    // because we need a way to reuse computation.
+    // But we can't really use shared_ptr because of the overhead. That means
+    // that we have to ensure all parts of a loss expression are not destroyed
+    // before use. When lvalue references are used to construct a node it will
+    // store just a reference, it only perform a copy of the rvalue reference arguments.
+    // This means that we need some storage for the whole computation tree
+    // that keeps the values after function returns and never moves them to
+    // a different memory location. This means that we cannot use local
+    // variables and just return by value - because there may be dangling references left.
+    // We also cannot create a struct with this tree on demand because one cannot
+    // use `auto` as a struct members. This is a big issue, and the only way
+    // to solve it as of now is to use static thread_local variables and rely on the
+    // following assumptions:
+    // 1. the expression node must not change for the duration of the program
+    //    within a single instance of a function. This is usually not a problem
+    //    because almost all information is carried by the type. There is an
+    //    exception though, we have ConstantRef and Constant nodes that
+    //    do not encode the constants in the type, so it's possible
+    //    that these nodes are different on the first call to the function
+    //    then later. We MUST ensure that one function is only ever used
+    //    for one specific expression.
+    // 2. thread_local variables are not expensive. Usually after creation
+    //    it only requires a single unsynchronized boolean check and that's
+    //    how most compilers implement it.
+    //
+    // So the general way to do things right now is to use static thread_local
+    // variables for all named autograd nodes. Results being nodes should be
+    // returned by reference, so that there's no need to copy the returned objects.
+    // Parameters being nodes should be taken by lvalue reference if they are
+    // used more than once (to enable reference semantics to reuse computation),
+    // but they can be rvalues and forward on first use if there's only one use
+    // of the node in the scope.
+    // We must keep in mind that the node tree created by such a function
+    // is never going to change as thread_local variables are initialized
+    // on first call. This means that one cannot use one function as a factory
+    // for different autograd expression trees.
+
     template <typename ShallowT, typename TeacherT, typename ResultT, typename LambdaT>
     static auto& cross_entropy_(
         ShallowT& q_,

From 4eb0e77a2a42c86b56c95b960b84da397bfa7587 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 30 Nov 2020 21:43:24 +0100
Subject: [PATCH 499/583] Store references instead of copying the results of
 intermediate autograd computations.

---
 src/learn/learn.cpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index d3316bf0..4900ff79 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -199,7 +199,7 @@ namespace Learner
     // differentiation of the loss function. While it works it has it's caveats.
     // To work fast enough it requires memoization and reference semantics.
     // Memoization is mostly opaque to the user and is only per eval basis.
-    // As for reference semantics, we cannot copy every node, 
+    // As for reference semantics, we cannot copy every node,
     // because we need a way to reuse computation.
     // But we can't really use shared_ptr because of the overhead. That means
     // that we have to ensure all parts of a loss expression are not destroyed
@@ -321,7 +321,7 @@ namespace Learner
 
         // The model captures only up to 240 plies, so limit input (and rescale)
         static thread_local auto m_ = std::forward<PlyT>(ply_) / 64.0;
-         
+
         static thread_local auto a_ = (((as[0] * m_ + as[1]) * m_ + as[2]) * m_) + as[3];
         static thread_local auto b_ = (((bs[0] * m_ + bs[1]) * m_ + bs[2]) * m_) + bs[3];
 
@@ -392,11 +392,11 @@ namespace Learner
     {
         using namespace Learner::Autograd::UnivariateStatic;
 
-        static thread_local auto q_ = expected_perf_(VariableParameter<double, 0>{});
-        static thread_local auto p_ = expected_perf_(scale_score_(ConstantParameter<double, 1>{}));
+        static thread_local auto& q_ = expected_perf_(VariableParameter<double, 0>{});
+        static thread_local auto& p_ = expected_perf_(scale_score_(ConstantParameter<double, 1>{}));
         static thread_local auto t_ = (ConstantParameter<double, 2>{} + 1.0) * 0.5;
         static thread_local auto lambda_ = ConstantParameter<double, 3>{};
-        static thread_local auto loss_ = cross_entropy_(q_, p_, t_, lambda_);
+        static thread_local auto& loss_ = cross_entropy_(q_, p_, t_, lambda_);
 
         auto args = std::tuple(
             (double)shallow,
@@ -415,14 +415,14 @@ namespace Learner
 
         static thread_local auto ply_ = ConstantParameter<double, 4>{};
         static thread_local auto shallow_ = VariableParameter<double, 0>{};
-        static thread_local auto q_ = expected_perf_use_wdl_(shallow_, ply_);
+        static thread_local auto& q_ = expected_perf_use_wdl_(shallow_, ply_);
         // We could do just this but MSVC crashes with an internal compiler error :(
-        // static thread_local auto scaled_teacher_ = scale_score_(ConstantParameter<double, 1>{});
-        // static thread_local auto p_ = expected_perf_use_wdl_(scaled_teacher_, ply_);
+        // static thread_local auto& scaled_teacher_ = scale_score_(ConstantParameter<double, 1>{});
+        // static thread_local auto& p_ = expected_perf_use_wdl_(scaled_teacher_, ply_);
         static thread_local auto p_ = ConstantParameter<double, 1>{};
         static thread_local auto t_ = (ConstantParameter<double, 2>{} + 1.0) * 0.5;
         static thread_local auto lambda_ = ConstantParameter<double, 3>{};
-        static thread_local auto loss_ = cross_entropy_(q_, p_, t_, lambda_);
+        static thread_local auto& loss_ = cross_entropy_(q_, p_, t_, lambda_);
 
         auto args = std::tuple(
             (double)shallow,

From fafb9557a874befe5cb1bcf8d7ab00f5d02ec3dc Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 30 Nov 2020 22:17:44 +0100
Subject: [PATCH 500/583] Get train loss from update_parameters.

---
 src/learn/learn.cpp                | 79 ++----------------------------
 src/learn/learn.h                  | 68 +++++++++++++++++++++++++
 src/nnue/evaluate_nnue_learner.cpp | 24 ++++++---
 src/nnue/evaluate_nnue_learner.h   |  2 +-
 4 files changed, 89 insertions(+), 84 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 4900ff79..450a80c6 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -52,7 +52,6 @@
 #include <sstream>
 #include <unordered_set>
 #include <iostream>
-#include <mutex>
 
 #if defined (_OPENMP)
 #include <omp.h>
@@ -98,65 +97,6 @@ namespace Learner
     // Using stockfish's WDL with win rate model instead of sigmoid
     static bool use_wdl = false;
 
-    struct Loss
-    {
-        double value() const
-        {
-            return m_loss.value;
-        }
-
-        double grad() const
-        {
-            return m_loss.grad;
-        }
-
-        uint64_t count() const
-        {
-            return m_count;
-        }
-
-        Loss& operator += (const ValueWithGrad<double>& rhs)
-        {
-            std::unique_lock lock(m_mutex);
-
-            m_loss += rhs.abs();
-            m_count += 1;
-
-            return *this;
-        }
-
-        Loss& operator += (const Loss& rhs)
-        {
-            std::unique_lock lock(m_mutex);
-
-            m_loss += rhs.m_loss.abs();
-            m_count += rhs.m_count;
-
-            return *this;
-        }
-
-        void reset()
-        {
-            std::unique_lock lock(m_mutex);
-
-            m_loss = ValueWithGrad<double>{ 0.0, 0.0 };
-            m_count = 0;
-        }
-
-        template <typename StreamT>
-        void print(const std::string& prefix, StreamT& s) const
-        {
-            s << "  - " << prefix << "_loss       = " << m_loss.value / (double)m_count << endl;
-            s << "  - " << prefix << "_grad_norm  = " << m_loss.grad / (double)m_count << endl;
-        }
-
-    private:
-        ValueWithGrad<double> m_loss{ 0.0, 0.0 };
-        uint64_t m_count{0};
-        std::mutex m_mutex;
-
-    };
-
     static void append_files_from_dir(
         std::vector<std::string>& filenames,
         const std::string& base_dir,
@@ -714,7 +654,6 @@ namespace Learner
         const auto thread_id = th.thread_idx();
         auto& pos = th.rootPos;
 
-        Loss local_loss_sum{};
         std::vector<StateInfo, AlignedAllocator<StateInfo>> state(MAX_PLY);
 
         while(!stop_flag)
@@ -761,17 +700,8 @@ namespace Learner
             auto pos_add_grad = [&]() {
 
                 // Evaluation value of deep search
-                const auto deep_value = (Value)ps.score;
-
                 const Value shallow_value = Eval::evaluate(pos);
 
-                const auto loss = get_loss(
-                    deep_value,
-                    (rootColor == pos.side_to_move()) ? shallow_value : -shallow_value,
-                    ps);
-
-                local_loss_sum += loss;
-
                 Eval::NNUE::add_example(pos, rootColor, shallow_value, ps, 1.0);
             };
 
@@ -809,8 +739,6 @@ namespace Learner
             // Since we have reached the end phase of PV, add the slope here.
             pos_add_grad();
         }
-
-        learn_loss_sum += local_loss_sum;
     }
 
     void LearnerThink::update_weights(const PSVector& psv, uint64_t epoch)
@@ -819,7 +747,8 @@ namespace Learner
         // should be no real issues happening since
         // the read/write phases are isolated.
         atomic_thread_fence(memory_order_seq_cst);
-        Eval::NNUE::update_parameters(Threads, epoch, params.verbose, params.learning_rate, params.max_grad, get_loss);
+        learn_loss_sum += Eval::NNUE::update_parameters(
+            Threads, epoch, params.verbose, params.learning_rate, params.max_grad, get_loss);
         atomic_thread_fence(memory_order_seq_cst);
 
         if (++save_count * params.mini_batch_size >= params.eval_save_interval)
@@ -899,11 +828,11 @@ namespace Learner
 
         if (psv.size() && test_loss_sum.count() > 0)
         {
-            test_loss_sum.print("test", out);
+            test_loss_sum.print("val", out);
 
             if (learn_loss_sum.count() > 0)
             {
-                learn_loss_sum.print("learn", out);
+                learn_loss_sum.print("train", out);
             }
 
             out << "  - norm = " << sum_norm << endl;
diff --git a/src/learn/learn.h b/src/learn/learn.h
index 4e8d8a02..552096b2 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -40,6 +40,8 @@ using LearnFloatType = float;
 
 #include <sstream>
 #include <vector>
+#include <mutex>
+#include <string>
 
 namespace Learner
 {
@@ -69,6 +71,72 @@ namespace Learner
     void learn(std::istringstream& is);
 
     using CalcLossFunc = ValueWithGrad<double>(Value, Value, int, int);
+
+    struct Loss
+    {
+        double value() const
+        {
+            return m_loss.value;
+        }
+
+        double grad() const
+        {
+            return m_loss.grad;
+        }
+
+        uint64_t count() const
+        {
+            return m_count;
+        }
+
+        Loss() = default;
+
+        Loss(const Loss& other) :
+            m_loss(other.m_loss),
+            m_count(other.m_count)
+        {
+        }
+
+        Loss& operator += (const ValueWithGrad<double>& rhs)
+        {
+            std::unique_lock lock(m_mutex);
+
+            m_loss += rhs.abs();
+            m_count += 1;
+
+            return *this;
+        }
+
+        Loss& operator += (const Loss& rhs)
+        {
+            std::unique_lock lock(m_mutex);
+
+            m_loss += rhs.m_loss.abs();
+            m_count += rhs.m_count;
+
+            return *this;
+        }
+
+        void reset()
+        {
+            std::unique_lock lock(m_mutex);
+
+            m_loss = ValueWithGrad<double>{ 0.0, 0.0 };
+            m_count = 0;
+        }
+
+        template <typename StreamT>
+        void print(const std::string& prefix, StreamT& s) const
+        {
+            s << "  - " << prefix << "_loss       = " << m_loss.value / (double)m_count << std::endl;
+            s << "  - " << prefix << "_grad_norm  = " << m_loss.grad / (double)m_count << std::endl;
+        }
+
+    private:
+        ValueWithGrad<double> m_loss{ 0.0, 0.0 };
+        uint64_t m_count{0};
+        std::mutex m_mutex;
+    };
 }
 
 #endif // ifndef _LEARN_H_
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 8c28e4f4..3061a4f4 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -190,7 +190,7 @@ namespace Eval::NNUE {
     }
 
     // update the evaluation function parameters
-    void update_parameters(
+    Learner::Loss update_parameters(
         ThreadPool& thread_pool,
         uint64_t epoch,
         bool verbose,
@@ -212,9 +212,12 @@ namespace Eval::NNUE {
 
         bool collect_stats = verbose;
 
+        Learner::Loss loss_sum{};
+
         std::vector<double> abs_eval_diff_sum_local(thread_pool.size(), 0.0);
         std::vector<double> abs_discrete_eval_sum_local(thread_pool.size(), 0.0);
         std::vector<double> gradient_norm_local(thread_pool.size(), 0.0);
+        std::vector<Learner::Loss> loss_sum_local(thread_pool.size());
 
         auto prev_batch_begin = examples.end();
         while ((long)(prev_batch_begin - examples.begin()) >= (long)batch_size) {
@@ -237,11 +240,11 @@ namespace Eval::NNUE {
                             e.sign * network_output[b] * kPonanzaConstant));
                         const auto discrete = e.sign * e.discrete_nn_eval;
                         const auto& psv = e.psv;
-                        const auto loss = calc_loss(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
-                        const double gradient = std::clamp(
+                        auto loss = calc_loss(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
+                        loss.grad = std::clamp(
                             loss.grad * e.sign * kPonanzaConstant * e.weight, -max_grad, max_grad);
-                        gradients[b] = static_cast<LearnFloatType>(gradient);
-
+                        gradients[b] = static_cast<LearnFloatType>(loss.grad);
+                        loss_sum_local[thread_id] += loss;
 
                         // The discrete eval will only be valid before first backpropagation,
                         // that is only for the first batch.
@@ -250,7 +253,7 @@ namespace Eval::NNUE {
                         {
                             abs_eval_diff_sum_local[thread_id] += std::abs(discrete - shallow);
                             abs_discrete_eval_sum_local[thread_id] += std::abs(discrete);
-                            gradient_norm_local[thread_id] += std::abs(gradient);
+                            gradient_norm_local[thread_id] += std::abs(loss.grad);
                         }
                     }
 
@@ -277,9 +280,7 @@ namespace Eval::NNUE {
             abs_eval_diff_sum = std::accumulate(abs_eval_diff_sum_local.begin(), abs_eval_diff_sum_local.end(), 0.0);
             abs_discrete_eval_sum = std::accumulate(abs_discrete_eval_sum_local.begin(), abs_discrete_eval_sum_local.end(), 0.0);
             gradient_norm = std::accumulate(gradient_norm_local.begin(), gradient_norm_local.end(), 0.0);
-        }
 
-        if (verbose) {
             const double avg_abs_eval_diff = abs_eval_diff_sum / batch_size;
             const double avg_abs_discrete_eval = abs_discrete_eval_sum / batch_size;
 
@@ -300,6 +301,13 @@ namespace Eval::NNUE {
         }
 
         send_messages({{"quantize_parameters"}});
+
+        for(auto& loss : loss_sum_local)
+        {
+            loss_sum += loss;
+        }
+
+        return loss_sum;
     }
 
     // Check if there are any problems with learning
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
index 5beca0a7..3d9f5b31 100644
--- a/src/nnue/evaluate_nnue_learner.h
+++ b/src/nnue/evaluate_nnue_learner.h
@@ -33,7 +33,7 @@ namespace Eval::NNUE {
         double weight);
 
     // update the evaluation function parameters
-    void update_parameters(
+    Learner::Loss update_parameters(
         ThreadPool& thread_pool,
         uint64_t epoch,
         bool verbose,

From 28d6d7cb0316dcb8544a9390b4ad91d132106ce5 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 30 Nov 2020 22:25:05 +0100
Subject: [PATCH 501/583] Avoid computing gradient for validation loss.

---
 src/learn/learn.cpp | 98 ++++++++++++++++++++++++++++++++++++++++-----
 src/learn/learn.h   |  8 +++-
 2 files changed, 95 insertions(+), 11 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 450a80c6..449542a7 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -310,7 +310,8 @@ namespace Learner
         return perf;
     }
 
-    [[maybe_unused]] static ValueWithGrad<double> get_loss_noob(Value shallow, Value teacher_signal, int result, int /* ply */)
+    [[maybe_unused]] static ValueWithGrad<double> get_loss_noob(
+        Value shallow, Value teacher_signal, int result, int /* ply */)
     {
         using namespace Learner::Autograd::UnivariateStatic;
 
@@ -328,7 +329,7 @@ namespace Learner
         return loss_.eval(args);
     }
 
-    static ValueWithGrad<double> get_loss_cross_entropy(Value shallow, Value teacher_signal, int result, int /* ply */)
+    static auto& get_loss_cross_entropy_()
     {
         using namespace Learner::Autograd::UnivariateStatic;
 
@@ -338,18 +339,45 @@ namespace Learner
         static thread_local auto lambda_ = ConstantParameter<double, 3>{};
         static thread_local auto& loss_ = cross_entropy_(q_, p_, t_, lambda_);
 
-        auto args = std::tuple(
+        return loss_;
+    }
+
+    static auto get_loss_cross_entropy_args(
+        Value shallow, Value teacher_signal, int result)
+    {
+        return std::tuple(
             (double)shallow,
             (double)teacher_signal,
             (double)result,
             calculate_lambda(teacher_signal)
         );
+    }
+
+    static ValueWithGrad<double> get_loss_cross_entropy(
+        Value shallow, Value teacher_signal, int result, int /* ply */)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        static thread_local auto& loss_ = get_loss_cross_entropy_();
+
+        auto args = get_loss_cross_entropy_args(shallow, teacher_signal, result);
 
         return loss_.eval(args);
     }
 
-    static ValueWithGrad<double> get_loss_cross_entropy_use_wdl(
-        Value shallow, Value teacher_signal, int result, int ply)
+    static ValueWithGrad<double> get_loss_cross_entropy_no_grad(
+        Value shallow, Value teacher_signal, int result, int /* ply */)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        static thread_local auto& loss_ = get_loss_cross_entropy_();
+
+        auto args = get_loss_cross_entropy_args(shallow, teacher_signal, result);
+
+        return { loss_.value(args), 0.0 };
+    }
+
+    static auto& get_loss_cross_entropy_use_wdl_()
     {
         using namespace Learner::Autograd::UnivariateStatic;
 
@@ -364,7 +392,13 @@ namespace Learner
         static thread_local auto lambda_ = ConstantParameter<double, 3>{};
         static thread_local auto& loss_ = cross_entropy_(q_, p_, t_, lambda_);
 
-        auto args = std::tuple(
+        return loss_;
+    }
+
+    static auto get_loss_cross_entropy_use_wdl_args(
+        Value shallow, Value teacher_signal, int result, int ply)
+    {
+        return std::tuple(
             (double)shallow,
             // This is required because otherwise MSVC crashes :(
             expected_perf_use_wdl(scale_score(teacher_signal), ply),
@@ -372,10 +406,32 @@ namespace Learner
             calculate_lambda(teacher_signal),
             (double)std::min(240, ply)
         );
+    }
+
+    static ValueWithGrad<double> get_loss_cross_entropy_use_wdl(
+        Value shallow, Value teacher_signal, int result, int ply)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        static thread_local auto& loss_ = get_loss_cross_entropy_use_wdl_();
+
+        auto args = get_loss_cross_entropy_use_wdl_args(shallow, teacher_signal, result, ply);
 
         return loss_.eval(args);
     }
 
+    static ValueWithGrad<double> get_loss_cross_entropy_use_wdl_no_grad(
+        Value shallow, Value teacher_signal, int result, int ply)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        static thread_local auto& loss_ = get_loss_cross_entropy_use_wdl_();
+
+        auto args = get_loss_cross_entropy_use_wdl_args(shallow, teacher_signal, result, ply);
+
+        return { loss_.value(args), 0.0 };
+    }
+
     static auto get_loss(Value shallow, Value teacher_signal, int result, int ply)
     {
         using namespace Learner::Autograd::UnivariateStatic;
@@ -390,7 +446,21 @@ namespace Learner
         }
     }
 
-    static auto get_loss(
+    static auto get_loss_no_grad(Value shallow, Value teacher_signal, int result, int ply)
+    {
+        using namespace Learner::Autograd::UnivariateStatic;
+
+        if (use_wdl)
+        {
+            return get_loss_cross_entropy_use_wdl_no_grad(shallow, teacher_signal, result, ply);
+        }
+        else
+        {
+            return get_loss_cross_entropy_no_grad(shallow, teacher_signal, result, ply);
+        }
+    }
+
+    [[maybe_unused]] static auto get_loss(
         Value teacher_signal,
         Value shallow,
         const PackedSfenValue& psv)
@@ -398,6 +468,14 @@ namespace Learner
         return get_loss(shallow, teacher_signal, psv.game_result, psv.gamePly);
     }
 
+    static auto get_loss_no_grad(
+        Value teacher_signal,
+        Value shallow,
+        const PackedSfenValue& psv)
+    {
+        return get_loss_no_grad(shallow, teacher_signal, psv.game_result, psv.gamePly);
+    }
+
     // Class to generate sfen with multiple threads
     struct LearnerThink
     {
@@ -828,11 +906,11 @@ namespace Learner
 
         if (psv.size() && test_loss_sum.count() > 0)
         {
-            test_loss_sum.print("val", out);
+            test_loss_sum.print_only_loss("val", out);
 
             if (learn_loss_sum.count() > 0)
             {
-                learn_loss_sum.print("train", out);
+                learn_loss_sum.print_with_grad("train", out);
             }
 
             out << "  - norm = " << sum_norm << endl;
@@ -880,7 +958,7 @@ namespace Learner
             // Evaluation value of deep search
             const auto deep_value = (Value)ps.score;
 
-            const auto loss = get_loss(
+            const auto loss = get_loss_no_grad(
                 deep_value,
                 shallow_value,
                 ps);
diff --git a/src/learn/learn.h b/src/learn/learn.h
index 552096b2..842ffad0 100644
--- a/src/learn/learn.h
+++ b/src/learn/learn.h
@@ -126,12 +126,18 @@ namespace Learner
         }
 
         template <typename StreamT>
-        void print(const std::string& prefix, StreamT& s) const
+        void print_with_grad(const std::string& prefix, StreamT& s) const
         {
             s << "  - " << prefix << "_loss       = " << m_loss.value / (double)m_count << std::endl;
             s << "  - " << prefix << "_grad_norm  = " << m_loss.grad / (double)m_count << std::endl;
         }
 
+        template <typename StreamT>
+        void print_only_loss(const std::string& prefix, StreamT& s) const
+        {
+            s << "  - " << prefix << "_loss       = " << m_loss.value / (double)m_count << std::endl;
+        }
+
     private:
         ValueWithGrad<double> m_loss{ 0.0, 0.0 };
         uint64_t m_count{0};

From 3a1bd1185f87cf133321a90b0c0616bf68cc16c6 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 5 Dec 2020 15:00:02 +0100
Subject: [PATCH 502/583] Add binpack coarse shuffle tool.

---
 script/shuffle_binpack.py | 69 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 script/shuffle_binpack.py

diff --git a/script/shuffle_binpack.py b/script/shuffle_binpack.py
new file mode 100644
index 00000000..409d4907
--- /dev/null
+++ b/script/shuffle_binpack.py
@@ -0,0 +1,69 @@
+import struct
+import sys
+import os
+import random
+from pathlib import Path
+
+def index_binpack(file):
+    print('Indexing...')
+    index = []
+    offset = 0
+    report_every = 100
+    prev_mib = -report_every
+    while file.peek():
+        chunk_header = file.read(8)
+        assert chunk_header[0:4] == b'BINP'
+        size = struct.unpack('<I', chunk_header[4:])[0]
+        file.seek(size, os.SEEK_CUR)
+        index.append((offset, size + 8))
+        offset += size + 8
+
+        mib = offset // 1024 // 1024
+        if mib // 100 != prev_mib // 100:
+            print('Indexed {} MiB'.format(mib))
+            prev_mib = mib
+
+    return index
+
+def copy_binpack_indexed(in_file, index, out_file):
+    print('Copying...')
+    total_size = 0
+    report_every = 100
+    prev_mib = -report_every
+    for offset, size in index:
+        in_file.seek(offset, os.SEEK_SET)
+        data = in_file.read(size)
+        assert len(data) == size
+        out_file.write(data)
+
+        total_size += size
+        mib = total_size // 1024 // 1024
+        if mib // 100 != prev_mib // 100:
+            print('Copied {} MiB'.format(mib))
+            prev_mib = mib
+
+def main():
+    if len(sys.argv) < 3:
+        print('Usage: python shuffle_binpack.py infile outfile')
+        return
+
+    in_filename = sys.argv[1]
+    out_filename = sys.argv[2]
+
+    if (Path(out_filename).exists()):
+        print('Output path already exists. Please specify a path to a file that does not exist.')
+        return
+
+    in_file = open(in_filename, 'rb')
+    out_file = open(out_filename, 'wb')
+
+    index = index_binpack(in_file)
+    print('Shuffling...')
+    random.shuffle(index)
+
+    copy_binpack_indexed(in_file, index, out_file)
+
+    in_file.close()
+    out_file.close()
+
+main()

From bb26ce5aa1e11de2865f419a0a2228b43df23ead Mon Sep 17 00:00:00 2001
From: kennyfrc <fxkennyfrc@gmail.com>
Date: Tue, 8 Dec 2020 22:14:18 +0800
Subject: [PATCH 503/583] mac specific makefile with compilation instructions

---
 README.md    | 18 ++++++++++++++++++
 src/Makefile | 12 ++++++------
 2 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 6d28a998..56ce7d3e 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,24 @@
 ## Overview
 Stockfish NNUE is a port of a shogi neural network named NNUE (efficiently updateable neural network backwards) to Stockfish 11. To learn more about the Stockfish chess engine, look [here](stockfish.md) for an overview and [here](https://github.com/official-stockfish/Stockfish) for the official repository.
 
+## Compilation Instructions for Mac
+
+1. Ensure that you have OpenBlas Installed
+```
+brew install openblas
+```
+2. Go to src then build using the makefile
+```
+cd src
+make learn ARCH=x86-64 COMP=gcc
+```
+or
+```
+cd src
+make profile-learn ARCH=x86-64 COMP=gcc
+```
+
+
 ## Training Guide
 ### Generating Training Data
 To generate training data from the classic eval, use the gensfen command with the setting "Use NNUE" set to "false". The given example is generation in its simplest form. There are more commands. 
diff --git a/src/Makefile b/src/Makefile
index 9372b915..e4c5a836 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -888,16 +888,16 @@ icc-profile-use:
 
 learn: config-sanity
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
-	EXTRACXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp ' \
-	EXTRALDFLAGS=' -lopenblas -fopenmp -Wl,-s ' \
+	EXTRACXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS -I/usr/local/opt/openblas/include' \
+	EXTRALDFLAGS=' -L/usr/local/opt/openblas/lib -Wl,-s -lcblas' \
 	all
 	
 profile-learn: config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 1/4. Building instrumented executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) \
-	LEARNCXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp ' \
-	LEARNLDFLAGS=' -lopenblas -fopenmp -Wl,-s '
+	LEARNCXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS -I/usr/local/opt/openblas/include' \
+	LEARNLDFLAGS=' -L/usr/local/opt/openblas/lib -Wl,-s  -lcblas'
 	@echo ""
 	@echo "Step 2/4. Running benchmark for pgo-build ..."
 	$(PGOGENSFEN) 
@@ -905,8 +905,8 @@ profile-learn: config-sanity objclean profileclean
 	@echo "Step 3/4. Building optimized executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) objclean
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_use) \
-	LEARNCXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS -I/mingw64/include/OpenBLAS -fopenmp ' \
-	LEARNLDFLAGS=' -lopenblas -fopenmp -Wl,-s '
+	LEARNCXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS -I/usr/local/opt/openblas/include' \
+	LEARNLDFLAGS=' -L/usr/local/opt/openblas/lib -Wl,-s  -lcblas'
 	@echo ""
 	@echo "Step 4/4. Deleting profile data ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) profileclean

From b49fd3ab30b721c0dd29c015819ae6485bd6f32d Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Tue, 8 Dec 2020 18:05:31 +0100
Subject: [PATCH 504/583] Add -lstdc++fs to the link line of gcc

older versions of gcc (<8.1) need this, even if they accept -std=c++17

with this patch, the code can be run on fishtest again,
at least by the majority of workers (fishtest doesn't require c++17 to be available)

See e.g.
https://tests.stockfishchess.org/tests/view/5fcfbf801ac1691201888235

Bench: 3820648
---
 src/Makefile                       | 2 +-
 src/learn/convert.cpp              | 2 --
 src/learn/gensfen.cpp              | 1 -
 src/learn/learn.cpp                | 2 --
 src/learn/sfen_writer.h            | 1 -
 src/nnue/evaluate_nnue_learner.cpp | 3 +--
 src/nnue/nnue_common.h             | 7 +++++++
 7 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 6f72809a..5ec747a7 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -328,7 +328,7 @@ endif
 ifeq ($(COMP),gcc)
 	comp=gcc
 	CXX=g++
-	CXXFLAGS += -pedantic -Wextra -Wshadow
+	CXXFLAGS += -pedantic -Wextra -Wshadow -lstdc++fs
 
 	ifeq ($(arch),$(filter $(arch),armv7 armv8))
 		ifeq ($(OS),Android)
diff --git a/src/learn/convert.cpp b/src/learn/convert.cpp
index 5fe7ea1d..47f56f02 100644
--- a/src/learn/convert.cpp
+++ b/src/learn/convert.cpp
@@ -25,7 +25,6 @@
 #include <chrono>
 #include <random>
 #include <regex>
-#include <filesystem>
 
 using namespace std;
 
@@ -610,7 +609,6 @@ namespace Learner
     {
         string kif_base_dir = Path::combine(base_dir, target_dir);
 
-        namespace sys = std::filesystem;
         sys::path p(kif_base_dir); // Origin of enumeration
         std::for_each(sys::directory_iterator(p), sys::directory_iterator(),
             [&](const sys::path& path) {
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 5f8bbba1..1cc9055c 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -22,7 +22,6 @@
 #include <climits>
 #include <cmath>
 #include <cstring>
-#include <filesystem>
 #include <fstream>
 #include <iomanip>
 #include <limits>
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 449542a7..6651e096 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -39,7 +39,6 @@
 #include <climits>
 #include <cmath>    // std::exp(),std::pow(),std::log()
 #include <cstring>  // memcpy()
-#include <filesystem>
 #include <fstream>
 #include <iomanip>
 #include <limits>
@@ -104,7 +103,6 @@ namespace Learner
     {
         string kif_base_dir = Path::combine(base_dir, target_dir);
 
-        namespace sys = std::filesystem;
         sys::path p(kif_base_dir); // Origin of enumeration
         std::for_each(sys::directory_iterator(p), sys::directory_iterator(),
             [&](const sys::path& path) {
diff --git a/src/learn/sfen_writer.h b/src/learn/sfen_writer.h
index 1bbd916c..b1c3ed5f 100644
--- a/src/learn/sfen_writer.h
+++ b/src/learn/sfen_writer.h
@@ -8,7 +8,6 @@
 #include "syzygy/tbprobe.h"
 
 #include <cstring>
-#include <filesystem>
 #include <fstream>
 #include <limits>
 #include <list>
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
index 3061a4f4..8d95221c 100644
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ b/src/nnue/evaluate_nnue_learner.cpp
@@ -1,6 +1,5 @@
 ﻿#include <random>
 #include <fstream>
-#include <filesystem>
 
 #include "evaluate_nnue.h"
 #include "evaluate_nnue_learner.h"
@@ -326,7 +325,7 @@ namespace Eval::NNUE {
         // mkdir() will fail if this folder already exists, but
         // Apart from that. If not, I just want you to make it.
         // Also, assume that the folders up to EvalSaveDir have been dug.
-        std::filesystem::create_directories(eval_dir);
+        sys::create_directories(eval_dir);
 
         const std::string file_name = Path::combine(eval_dir, NNUE::savedfileName);
         std::ofstream stream(file_name, std::ios::binary);
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index 58bfd146..e72168f8 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -25,6 +25,13 @@
 
 #include <cstring>
 #include <iostream>
+#if defined(__GNUC__ ) && (__GNUC__ < 8)
+#include <experimental/filesystem>
+namespace sys = std::experimental::filesystem;
+#else
+#include <filesystem>
+namespace sys = std::filesystem;
+#endif
 
 #if defined(USE_AVX2)
 #include <immintrin.h>

From d99ba07b819f1b0fef84c28ab0ae406384831659 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Tue, 8 Dec 2020 23:44:26 +0100
Subject: [PATCH 505/583] Fix incorrect enpassant flag for moves read from uci
 format in the binpack lib

---
 src/extra/nnue_data_binpack_format.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index 31c6f7bb..77cf8e0a 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -6238,7 +6238,7 @@ namespace chess
 
                     return Move::castle(castleType, pos.sideToMove());
                 }
-                else if (pos.epSquare() == to)
+                else if (pos.pieceAt(from).type() == PieceType::Pawn && pos.epSquare() == to)
                 {
                     return Move::enPassant(from, to);
                 }

From 9c65e868f950899d8671a362c422cd887083e3de Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Tue, 8 Dec 2020 23:57:54 +0100
Subject: [PATCH 506/583] Enhance pgn_to_plain.py

in case a score can be parsed from the comment field in the pgn, add it to the output.
This form works for the fishtest pgns, and is quite common (cutechess-cli among others).
---
 script/pgn_to_plain.py | 44 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 40 insertions(+), 4 deletions(-)

diff --git a/script/pgn_to_plain.py b/script/pgn_to_plain.py
index 5f9300cb..596df5f3 100644
--- a/script/pgn_to_plain.py
+++ b/script/pgn_to_plain.py
@@ -1,11 +1,15 @@
 import chess.pgn
 import argparse
 import glob
+import re
 from typing import List
 
 # todo close in c++ tools using pgn-extract
 # https://www.cs.kent.ac.uk/people/staff/djb/pgn-extract/help.html#-w
 
+commentRe = re.compile("([+-]*M*[0-9.]*)/([0-9]*)")
+mateRe = re.compile("([+-])M([0-9]*)")
+
 def parse_result(result_str:str, board:chess.Board) -> int:
     if result_str == "1/2-1/2":
         return 0
@@ -20,7 +24,7 @@ def parse_result(result_str:str, board:chess.Board) -> int:
         else:
             return -1
     else:
-        print("illeagal result", result_str)
+        print("illegal result", result_str)
         raise ValueError
 
 def game_sanity_check(game: chess.pgn.Game) -> bool:
@@ -28,21 +32,52 @@ def game_sanity_check(game: chess.pgn.Game) -> bool:
         print("invalid result", game.headers["Result"])
         return False
     return True
+
+def parse_comment_for_score(comment_str: str, board: chess.Board) -> int:
+    global commentRe
+    global mateRe
+
+    try:
+      m = commentRe.search(comment_str)
+      if m:
+         score = m.group(1)
+         # depth = int(m.group(2))
+         m = mateRe.search(score)
+         if m:
+            if m.group(1) == "+":
+               score =  32000 - int(m.group(2))
+            else:
+               score = -32000 + int(m.group(2))
+         else:
+            score = int(float(score) * 208) # pawn to SF PawnValueEg
+
+         if board.turn == chess.BLACK:
+            score = -score
+      else:
+         score = 0
+    except:
+      score = 0
+
+    return score
     
 def parse_game(game: chess.pgn.Game, writer, start_play: int=1)->None:
     board: chess.Board = game.board()
     if not game_sanity_check(game):
         return
+
     result: str = game.headers["Result"]
-    for ply, move in enumerate(game.mainline_moves()):
+    ply = 0
+    for node in game.mainline():
+        move = node.move
         if ply >= start_play:
+            comment: str = node.comment
             writer.write("fen " + board.fen() + "\n")
             writer.write("move " + str(move) + "\n")
-            writer.write("score 0\n")
+            writer.write("score " + str(parse_comment_for_score(comment, board)) + "\n")
             writer.write("ply " + str(ply)+"\n")
             writer.write("result " + str(parse_result(result, board)) +"\n")
             writer.write("e\n")
-
+        ply += 1
         board.push(move)
 
 def main():
@@ -53,6 +88,7 @@ def main():
     args = parser.parse_args()
 
     pgn_files: List[str] = glob.glob(args.pgn)
+    pgn_files = sorted(pgn_files, key=lambda x:float(re.findall("-(\d+).pgn",x)[0] if re.findall("-(\d+).pgn",x) else 0.0))
     f = open(args.output, 'w')
     for pgn_file in pgn_files:
         print("parse", pgn_file)

From f4b4430380d0f1765cbbe4e4272b4148f3a7fc7a Mon Sep 17 00:00:00 2001
From: kennyfrc <fxkennyfrc@gmail.com>
Date: Sat, 12 Dec 2020 23:39:42 +0800
Subject: [PATCH 507/583] remove unnecessary makefile commands and fix blas on
 mac

---
 README.md    |  4 ++--
 src/Makefile | 33 +++++----------------------------
 2 files changed, 7 insertions(+), 30 deletions(-)

diff --git a/README.md b/README.md
index 99168e3f..fe6ce27c 100644
--- a/README.md
+++ b/README.md
@@ -44,12 +44,12 @@ brew install openblas
 2. Go to src then build using the makefile
 ```
 cd src
-make learn ARCH=x86-64 COMP=gcc
+make build ARCH=x86-64 COMP=gcc blas=yes
 ```
 or
 ```
 cd src
-make profile-learn ARCH=x86-64 COMP=gcc
+make profile-build ARCH=x86-64 COMP=gcc blas=yes
 ```
 
 ## Training Guide
diff --git a/src/Makefile b/src/Makefile
index 5ec747a7..1ae9cd5f 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -403,8 +403,8 @@ ifeq ($(COMP),clang)
 endif
 
 ifeq ($(KERNEL),Darwin)
-	CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.14
-	LDFLAGS += -arch $(arch) -mmacosx-version-min=10.14
+	CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.15
+	LDFLAGS += -arch $(arch) -mmacosx-version-min=10.15
 	XCRUN = xcrun
 endif
 
@@ -477,6 +477,9 @@ ifeq ($(blas), yes)
 
 	ifeq ($(KERNEL),Linux)
 		LDFLAGS +=
+	else ifeq ($(KERNEL), Darwin)
+		CXXFLAGS += -I/usr/local/opt/openblas/include
+		LDFLAGS += -L/usr/local/opt/openblas/lib -lcblas
 	else
 		CXXFLAGS += -I/mingw64/include/OpenBLAS
 
@@ -920,32 +923,6 @@ icc-profile-use:
 	EXTRACXXFLAGS='-prof_use -prof_dir ./profdir' \
 	all
 
-learn: config-sanity
-	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
-	EXTRACXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS -I/usr/local/opt/openblas/include' \
-	EXTRALDFLAGS=' -L/usr/local/opt/openblas/lib -Wl,-s -lcblas' \
-	all
-	
-profile-learn: config-sanity objclean profileclean
-	@echo ""
-	@echo "Step 1/4. Building instrumented executable ..."
-	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) \
-	LEARNCXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS -I/usr/local/opt/openblas/include' \
-	LEARNLDFLAGS=' -L/usr/local/opt/openblas/lib -Wl,-s  -lcblas'
-	@echo ""
-	@echo "Step 2/4. Running benchmark for pgo-build ..."
-	$(PGOGENSFEN) 
-	@echo ""
-	@echo "Step 3/4. Building optimized executable ..."
-	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) objclean
-	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_use) \
-	LEARNCXXFLAGS=' -DEVAL_LEARN -DEVAL_NNUE -DENABLE_TEST_CMD -DUSE_BLAS -I/usr/local/opt/openblas/include' \
-	LEARNLDFLAGS=' -L/usr/local/opt/openblas/lib -Wl,-s  -lcblas'
-	@echo ""
-	@echo "Step 4/4. Deleting profile data ..."
-	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) profileclean
-	rm generated_kifu.bin
-
 .depend:
 	-@$(CXX) $(DEPENDFLAGS) -MM $(SRCS) > $@
 

From 76fbc5e3d043264ccca742754391a99813ce96ac Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Sat, 12 Dec 2020 20:01:42 +0100
Subject: [PATCH 508/583] Make score sign flip optional

Bug fix: flipping score is not needed for fishtest, make this optional.
---
 script/pgn_to_plain.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/script/pgn_to_plain.py b/script/pgn_to_plain.py
index 596df5f3..c551c136 100644
--- a/script/pgn_to_plain.py
+++ b/script/pgn_to_plain.py
@@ -9,6 +9,7 @@ from typing import List
 
 commentRe = re.compile("([+-]*M*[0-9.]*)/([0-9]*)")
 mateRe = re.compile("([+-])M([0-9]*)")
+flip_black = False
 
 def parse_result(result_str:str, board:chess.Board) -> int:
     if result_str == "1/2-1/2":
@@ -36,6 +37,7 @@ def game_sanity_check(game: chess.pgn.Game) -> bool:
 def parse_comment_for_score(comment_str: str, board: chess.Board) -> int:
     global commentRe
     global mateRe
+    global flip_black
 
     try:
       m = commentRe.search(comment_str)
@@ -51,7 +53,7 @@ def parse_comment_for_score(comment_str: str, board: chess.Board) -> int:
          else:
             score = int(float(score) * 208) # pawn to SF PawnValueEg
 
-         if board.turn == chess.BLACK:
+         if flip_black and board.turn == chess.BLACK:
             score = -score
       else:
          score = 0
@@ -85,8 +87,12 @@ def main():
     parser.add_argument("--pgn", type=str, required=True)
     parser.add_argument("--start_ply", type=int, default=1)
     parser.add_argument("--output", type=str, default="plain.txt")
+    parser.add_argument("--flip_black_score", action='store_true', dest='flip_black_score', help="Flip black score. Default: False")
     args = parser.parse_args()
 
+    global flip_black
+    flip_black = args.flip_black_score
+
     pgn_files: List[str] = glob.glob(args.pgn)
     pgn_files = sorted(pgn_files, key=lambda x:float(re.findall("-(\d+).pgn",x)[0] if re.findall("-(\d+).pgn",x) else 0.0))
     f = open(args.output, 'w')

From a7378f3249968077a6c9ec90f0169af8240ebd62 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 13 Dec 2020 17:45:16 +0100
Subject: [PATCH 509/583] Make next_fen in opening_book a critical section

---
 src/learn/opening_book.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/learn/opening_book.h b/src/learn/opening_book.h
index 16207f13..d07fc58b 100644
--- a/src/learn/opening_book.h
+++ b/src/learn/opening_book.h
@@ -11,6 +11,7 @@
 #include <string>
 #include <cstdint>
 #include <memory>
+#include <mutex>
 
 namespace Learner {
 
@@ -20,6 +21,8 @@ namespace Learner {
         {
             assert(fens.size() > 0);
 
+            std::unique_lock lock(mutex);
+
             auto& fen = fens[current_index++];
             if (current_index >= fens.size())
                 current_index = 0;
@@ -39,6 +42,7 @@ namespace Learner {
         }
 
 
+        std::mutex mutex;
         std::string filename;
         std::vector<std::string> fens;
         std::size_t current_index;

From f56613ebf685a381f4d4b93e2a1081b47a140ee2 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 19 Dec 2020 23:46:18 +0100
Subject: [PATCH 510/583] Add 'validation_count' option for 'learn' that
 specifies how many positions to use for validation

---
 docs/learn.md           |  2 ++
 src/learn/learn.cpp     | 51 ++++++++++++++++++++++++-----------------
 src/learn/sfen_reader.h | 48 ++++++++++----------------------------
 3 files changed, 44 insertions(+), 57 deletions(-)

diff --git a/docs/learn.md b/docs/learn.md
index 30a7c951..e88de089 100644
--- a/docs/learn.md
+++ b/docs/learn.md
@@ -80,6 +80,8 @@ Currently the following options are available:
 
 `validation_set_file_name` - path to the file with training data to be used for validation (loss computation and move accuracy)
 
+`validation_count` - the number of positions to use for validation. Default: 2000.
+
 `sfen_read_size` - the number of sfens to always keep in the buffer. Default: 10000000 (10M)
 
 `thread_buffer_size` - the number of sfens to copy at once to each thread requesting more sfens for learning. Default: 10000
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 6651e096..90f629e1 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -482,6 +482,12 @@ namespace Learner
             // Mini batch size size. Be sure to set it on the side that uses this class.
             uint64_t mini_batch_size = LEARN_MINI_BATCH_SIZE;
 
+            // Number of phases used for calculation such as mse
+            // mini-batch size = 1M is standard, so 0.2% of that should be negligible in terms of time.
+            // Since search() is performed with depth = 1 in calculation of
+            // move match rate, simple comparison is not possible...
+            uint64_t validation_count = 2000;
+
             // Option to exclude early stage from learning
             int reduction_gameply = 1;
 
@@ -550,16 +556,10 @@ namespace Learner
             }
         };
 
-        // Number of phases used for calculation such as mse
-        // mini-batch size = 1M is standard, so 0.2% of that should be negligible in terms of time.
-        // Since search() is performed with depth = 1 in calculation of
-        // move match rate, simple comparison is not possible...
-        static constexpr uint64_t sfen_for_mse_size = 2000;
-
         LearnerThink(const Params& prm) :
             params(prm),
             prng(prm.seed),
-            sr(
+            train_sr(
                 prm.filenames,
                 prm.shuffle,
                 SfenReaderMode::Cyclic,
@@ -567,6 +567,14 @@ namespace Learner
                 std::to_string(prng.next_random_seed()),
                 prm.sfen_read_size,
                 prm.thread_buffer_size),
+            validation_sr(
+                prm.validation_set_file_name.empty() ? prm.filenames : std::vector<std::string>{ prm.validation_set_file_name },
+                prm.shuffle,
+                SfenReaderMode::Cyclic,
+                1,
+                std::to_string(prng.next_random_seed()),
+                prm.sfen_read_size,
+                prm.thread_buffer_size),
             learn_loss_sum{}
         {
             save_count = 0;
@@ -612,7 +620,8 @@ namespace Learner
         PRNG prng;
 
         // sfen reader
-        SfenReader sr;
+        SfenReader train_sr;
+        SfenReader validation_sr;
 
         uint64_t save_count;
         uint64_t loss_output_count;
@@ -666,28 +675,26 @@ namespace Learner
 
         Eval::NNUE::verify_any_net_loaded();
 
-        const PSVector sfen_for_mse =
-            params.validation_set_file_name.empty()
-            ? sr.read_for_mse(sfen_for_mse_size)
-            : sr.read_validation_set(
-                params.validation_set_file_name,
+        const PSVector validation_data =
+            validation_sr.read_some(
+                params.validation_count,
                 params.eval_limit,
-                params.use_draw_games_in_validation);
+                params.use_draw_games_in_validation
+            );
 
-        if (params.validation_set_file_name.empty()
-            && sfen_for_mse.size() != sfen_for_mse_size)
+        if (validation_data.size() != params.validation_count)
         {
             auto out = sync_region_cout.new_region();
             out
-                << "INFO (learn): Error reading sfen_for_mse. Read " << sfen_for_mse.size()
-                << " out of " << sfen_for_mse_size << '\n';
+                << "INFO (learn): Error reading validation data. Read " << validation_data.size()
+                << " out of " << params.validation_count << '\n';
 
             return;
         }
 
         if (params.newbob_decay != 1.0) {
 
-            calc_loss(sfen_for_mse, 0);
+            calc_loss(validation_data, 0);
 
             best_loss = latest_loss_sum / latest_loss_count;
             latest_loss_sum = 0.0;
@@ -714,7 +721,7 @@ namespace Learner
             if (stop_flag)
                 break;
 
-            update_weights(sfen_for_mse, epoch);
+            update_weights(validation_data, epoch);
 
             if (stop_flag)
                 break;
@@ -742,7 +749,7 @@ namespace Learner
 
         RETRY_READ:;
 
-            if (!sr.read_to_thread_buffer(thread_id, ps))
+            if (!train_sr.read_to_thread_buffer(thread_id, ps))
             {
                 // If we ran out of data we stop completely
                 // because there's nothing left to do.
@@ -1146,6 +1153,7 @@ namespace Learner
                 is >> filename;
                 params.filenames.push_back(filename);
             }
+            else if (option == "validation_count") is >> params.validation_count;
 
             // Specify the number of loops
             else if (option == "epochs") is >> epochs;
@@ -1260,6 +1268,7 @@ namespace Learner
             out << "  - validation set           : " << params.validation_set_file_name << endl;
         }
 
+        out << "  - validation count         : " << params.validation_count << endl;
         out << "  - epochs                   : " << epochs << endl;
         out << "  - epochs * minibatch size  : " << epochs * params.mini_batch_size << endl;
         out << "  - eval_limit               : " << params.eval_limit << endl;
diff --git a/src/learn/sfen_reader.h b/src/learn/sfen_reader.h
index 512f1165..206ed2bd 100644
--- a/src/learn/sfen_reader.h
+++ b/src/learn/sfen_reader.h
@@ -73,10 +73,10 @@ namespace Learner{
         }
 
         // Load the phase for calculation such as mse.
-        PSVector read_for_mse(uint64_t count)
+        PSVector read_some(uint64_t count, int eval_limit, bool use_draw_games)
         {
-            PSVector sfen_for_mse;
-            sfen_for_mse.reserve(count);
+            PSVector psv;
+            psv.reserve(count);
 
             for (uint64_t i = 0; i < count; ++i)
             {
@@ -84,43 +84,19 @@ namespace Learner{
                 if (!read_to_thread_buffer(0, ps))
                 {
                     std::cout << "ERROR (sfen_reader): Reading failed." << std::endl;
-                    return sfen_for_mse;
+                    return psv;
                 }
 
-                sfen_for_mse.push_back(ps);
+                if (eval_limit < abs(ps.score))
+                    continue;
+
+                if (!use_draw_games && ps.game_result == 0)
+                    continue;
+
+                psv.push_back(ps);
             }
 
-            return sfen_for_mse;
-        }
-
-        PSVector read_validation_set(const std::string& file_name, int eval_limit, bool use_draw_games)
-        {
-            PSVector sfen_for_mse;
-
-            auto input = open_sfen_input_file(file_name);
-
-            while(!input->eof())
-            {
-                std::optional<PackedSfenValue> p_opt = input->next();
-                if (p_opt.has_value())
-                {
-                    auto& p = *p_opt;
-
-                    if (eval_limit < abs(p.score))
-                        continue;
-
-                    if (!use_draw_games && p.game_result == 0)
-                        continue;
-
-                    sfen_for_mse.push_back(p);
-                }
-                else
-                {
-                    break;
-                }
-            }
-
-            return sfen_for_mse;
+            return psv;
         }
 
         // [ASYNC] Thread returns one aspect. Otherwise returns false.

From a9cfaa4d98d343ea7d7ce5cf0bf96201ad00ab77 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 13 Dec 2020 23:20:32 +0100
Subject: [PATCH 511/583] Add a tool for rescoring fens from an epd file with
 fixed depth search

---
 src/learn/transform.cpp | 148 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 147 insertions(+), 1 deletion(-)

diff --git a/src/learn/transform.cpp b/src/learn/transform.cpp
index 5687b48b..3e302e21 100644
--- a/src/learn/transform.cpp
+++ b/src/learn/transform.cpp
@@ -6,6 +6,7 @@
 #include "thread.h"
 #include "position.h"
 #include "evaluate.h"
+#include "search.h"
 
 #include "nnue/evaluate_nnue.h"
 
@@ -16,6 +17,8 @@
 #include <algorithm>
 #include <cstdint>
 #include <limits>
+#include <mutex>
+#include <optional>
 
 namespace Learner
 {
@@ -44,6 +47,18 @@ namespace Learner
         }
     };
 
+    struct RescoreFenParams
+    {
+        std::string input_filename = "in.epd";
+        std::string output_filename = "out.binpack";
+        int depth = 3;
+
+        void enforce_constraints()
+        {
+            depth = std::max(1, depth);
+        }
+    };
+
     [[nodiscard]] std::int16_t nudge(NudgedStaticParams& params, std::int16_t static_eval_i16, std::int16_t deep_eval_i16)
     {
         auto saturate_i32_to_i16 = [](int v) {
@@ -218,10 +233,141 @@ namespace Learner
         do_nudged_static(params);
     }
 
+    void do_rescore_fen(RescoreFenParams& params)
+    {
+        std::ifstream fens_file(params.input_filename);
+
+        auto next_fen = [&fens_file]() -> std::optional<std::string>{
+            static std::mutex mutex;
+
+            std::string fen;
+
+            std::unique_lock lock(mutex);
+
+            if (std::getline(fens_file, fen) && fen.size() >= 10)
+            {
+                return fen;
+            }
+            else
+            {
+                return std::nullopt;
+            }
+        };
+
+        PSVector buffer;
+        uint64_t batch_size = 10'000;
+
+        buffer.reserve(batch_size);
+
+        auto out = Learner::create_new_sfen_output(params.output_filename);
+
+        std::mutex mutex;
+        uint64_t num_processed = 0;
+
+        // About Search::Limits
+        // Be careful because this member variable is global and affects other threads.
+        auto& limits = Search::Limits;
+
+        // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done)
+        limits.infinite = true;
+
+        // Since PV is an obstacle when displayed, erase it.
+        limits.silent = true;
+
+        // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
+        limits.nodes = 0;
+
+        // depth is also processed by the one passed as an argument of Learner::search().
+        limits.depth = 0;
+
+        Threads.execute_with_workers([&](auto& th){
+            Position& pos = th.rootPos;
+            StateInfo si;
+
+            for(;;)
+            {
+                auto fen = next_fen();
+                if (!fen.has_value())
+                    return;
+
+                pos.set(*fen, false, &si, &th);
+                pos.state()->rule50 = 0;
+
+                auto [search_value, search_pv] = Search::search(pos, params.depth, 1);
+                if (search_pv.empty())
+                    continue;
+
+                PackedSfenValue ps;
+                pos.sfen_pack(ps.sfen);
+                ps.score = search_value;
+                ps.move = search_pv[0];
+                ps.gamePly = 1;
+                ps.game_result = 0;
+                ps.padding = 0;
+
+                std::unique_lock lock(mutex);
+                buffer.emplace_back(ps);
+                if (buffer.size() >= batch_size)
+                {
+                    num_processed += buffer.size();
+
+                    out->write(buffer);
+                    buffer.clear();
+
+                    std::cout << "Processed " << num_processed << " positions.\n";
+                }
+            }
+        });
+        Threads.wait_for_workers_finished();
+
+        if (!buffer.empty())
+        {
+            num_processed += buffer.size();
+
+            out->write(buffer);
+            buffer.clear();
+
+            std::cout << "Processed " << num_processed << " positions.\n";
+        }
+
+        std::cout << "Finished.\n";
+    }
+
+    void rescore_fen(std::istringstream& is)
+    {
+        RescoreFenParams params{};
+
+        while(true)
+        {
+            std::string token;
+            is >> token;
+
+            if (token == "")
+                break;
+
+            if (token == "depth")
+                is >> params.depth;
+            else if (token == "input_file")
+                is >> params.input_filename;
+            else if (token == "output_file")
+                is >> params.output_filename;
+        }
+
+        std::cout << "Performing transform rescore_fen with parameters:\n";
+        std::cout << "depth               : " << params.depth << '\n';
+        std::cout << "input_file          : " << params.input_filename << '\n';
+        std::cout << "output_file         : " << params.output_filename << '\n';
+        std::cout << '\n';
+
+        params.enforce_constraints();
+        do_rescore_fen(params);
+    }
+
     void transform(std::istringstream& is)
     {
         const std::map<std::string, CommandFunc> subcommands = {
-            { "nudged_static", &nudged_static }
+            { "nudged_static", &nudged_static },
+            { "rescore_fen", &rescore_fen }
         };
 
         Eval::NNUE::init();

From ffae19b5a1baca1d6cc0e1b4b43f47eb3d478510 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 20 Dec 2020 23:06:19 +0100
Subject: [PATCH 512/583] Add docs for rescore_fen

---
 docs/transform.md | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/docs/transform.md b/docs/transform.md
index 82e963fe..35ef16a1 100644
--- a/docs/transform.md
+++ b/docs/transform.md
@@ -4,7 +4,7 @@
 
 ## `nudged_static`
 
-`transform nudged_static` takes named parameters in the form of `gensfen param_1_name param_1_value param_2_name param_2_value ...` and flag parameters which don't require values.
+`transform nudged_static` takes named parameters in the form of `transform nudged_static param_1_name param_1_value param_2_name param_2_value ...` and flag parameters which don't require values.
 
 This command goes through positions in the input files and replaces the scores with new ones - generated from static eval - but slightly adjusted based on the scores in the original input file.
 
@@ -19,3 +19,18 @@ Currently the following options are available:
 `relative` - states that the adjustment should be bounded by a value relative in magnitude to the static eval value. After this token follows the maximum relative change - a floating point value greater than 0. For example a value of 0.1 only allows changing the static eval by at most 10% towards the score from the input file.
 
 `interpolate` states that the output score should be a value interpolated between static eval and the score from the input file. After this token follows the interpolation constant `t`. `t` of 0 means that only static eval is used. `t` of 1 means that only score from the input file is used. `t` of 0.5 means that the static eval and input score are averaged. It accepts values outside of range `<0, 1>`, but the usefulness is questionable.
+
+## `rescore_fen`
+
+`transform rescore_fen` takes named parameters in the form of `transform rescore_fen param_1_name param_1_value param_2_name param_2_value ...` and flag parameters which don't require values.
+
+This command takes a path to the input file which contains one FEN per line and outputs a .bin or .binpack file with these positions rescored with specified depth search.
+
+Currently the following options are available:
+
+`input_file` - path to the input .epd file. Default: in.binpack.
+
+`output_file` - path to the output .bin or .binpack file. The file is opened in append mode. Default: out.binpack.
+
+`depth` - the search depth to use for rescoring. Default: 3.
+

From 994eb5e183e76765af1ccd33c37bdfe192fe2141 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 21 Dec 2020 00:01:34 +0100
Subject: [PATCH 513/583] rescore_fen -> rescore. Make it work on .bin and
 .binpack inputs.

---
 docs/transform.md       |  12 ++--
 src/learn/transform.cpp | 134 ++++++++++++++++++++++++++++++++++++----
 2 files changed, 131 insertions(+), 15 deletions(-)

diff --git a/docs/transform.md b/docs/transform.md
index 35ef16a1..a0a7ad75 100644
--- a/docs/transform.md
+++ b/docs/transform.md
@@ -20,17 +20,21 @@ Currently the following options are available:
 
 `interpolate` states that the output score should be a value interpolated between static eval and the score from the input file. After this token follows the interpolation constant `t`. `t` of 0 means that only static eval is used. `t` of 1 means that only score from the input file is used. `t` of 0.5 means that the static eval and input score are averaged. It accepts values outside of range `<0, 1>`, but the usefulness is questionable.
 
-## `rescore_fen`
+## `rescore`
 
-`transform rescore_fen` takes named parameters in the form of `transform rescore_fen param_1_name param_1_value param_2_name param_2_value ...` and flag parameters which don't require values.
+`transform rescore` takes named parameters in the form of `transform rescore param_1_name param_1_value param_2_name param_2_value ...` and flag parameters which don't require values.
 
-This command takes a path to the input file which contains one FEN per line and outputs a .bin or .binpack file with these positions rescored with specified depth search.
+This tool respects the UCI option `Threads` and uses all available threads.
+
+This command takes a path to the input file that is either a .epd file which contains one FEN per line or a .bin or .binpack file and outputs a .bin or .binpack file with these positions rescored with specified depth search.
 
 Currently the following options are available:
 
-`input_file` - path to the input .epd file. Default: in.binpack.
+`input_file` - path to the input file. Default: in.binpack.
 
 `output_file` - path to the output .bin or .binpack file. The file is opened in append mode. Default: out.binpack.
 
 `depth` - the search depth to use for rescoring. Default: 3.
 
+`keep_moves` - whether to keep moves from the input file if available. Allows to keep compression in .binpack. Default: 1.
+
diff --git a/src/learn/transform.cpp b/src/learn/transform.cpp
index 3e302e21..77671c65 100644
--- a/src/learn/transform.cpp
+++ b/src/learn/transform.cpp
@@ -2,6 +2,7 @@
 
 #include "sfen_stream.h"
 #include "packed_sfen.h"
+#include "sfen_writer.h"
 
 #include "thread.h"
 #include "position.h"
@@ -47,11 +48,12 @@ namespace Learner
         }
     };
 
-    struct RescoreFenParams
+    struct RescoreParams
     {
         std::string input_filename = "in.epd";
         std::string output_filename = "out.binpack";
         int depth = 3;
+        bool keep_moves = true;
 
         void enforce_constraints()
         {
@@ -233,13 +235,11 @@ namespace Learner
         do_nudged_static(params);
     }
 
-    void do_rescore_fen(RescoreFenParams& params)
+    void do_rescore_epd(RescoreParams& params)
     {
         std::ifstream fens_file(params.input_filename);
 
-        auto next_fen = [&fens_file]() -> std::optional<std::string>{
-            static std::mutex mutex;
-
+        auto next_fen = [&fens_file, mutex = std::mutex{}]() mutable -> std::optional<std::string>{
             std::string fen;
 
             std::unique_lock lock(mutex);
@@ -333,9 +333,117 @@ namespace Learner
         std::cout << "Finished.\n";
     }
 
-    void rescore_fen(std::istringstream& is)
+    void do_rescore_data(RescoreParams& params)
     {
-        RescoreFenParams params{};
+        // TODO: Use SfenReader once it works correctly in sequential mode. See issue #271
+        auto in = Learner::open_sfen_input_file(params.input_filename);
+        auto readsome = [&in, mutex = std::mutex{}](int n) mutable -> PSVector {
+
+            PSVector psv;
+            psv.reserve(n);
+
+            std::unique_lock lock(mutex);
+
+            for (int i = 0; i < n; ++i)
+            {
+                auto ps_opt = in->next();
+                if (ps_opt.has_value())
+                {
+                    psv.emplace_back(*ps_opt);
+                }
+                else
+                {
+                    break;
+                }
+            }
+
+            return psv;
+        };
+
+        auto sfen_format = ends_with(params.output_filename, ".binpack") ? SfenOutputType::Binpack : SfenOutputType::Bin;
+
+        auto out = SfenWriter(
+            params.output_filename,
+            Threads.size(),
+            std::numeric_limits<std::uint64_t>::max(),
+            sfen_format);
+
+        // About Search::Limits
+        // Be careful because this member variable is global and affects other threads.
+        auto& limits = Search::Limits;
+
+        // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done)
+        limits.infinite = true;
+
+        // Since PV is an obstacle when displayed, erase it.
+        limits.silent = true;
+
+        // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
+        limits.nodes = 0;
+
+        // depth is also processed by the one passed as an argument of Learner::search().
+        limits.depth = 0;
+
+        std::atomic<std::uint64_t> num_processed = 0;
+
+        Threads.execute_with_workers([&](auto& th){
+            Position& pos = th.rootPos;
+            StateInfo si;
+
+            for (;;)
+            {
+                PSVector psv = readsome(5000);
+                if (psv.empty())
+                    break;
+
+                for(auto& ps : psv)
+                {
+                    pos.set_from_packed_sfen(ps.sfen, &si, &th);
+
+                    auto [search_value, search_pv] = Search::search(pos, params.depth, 1);
+                    if (search_pv.empty())
+                        continue;
+
+                    pos.sfen_pack(ps.sfen);
+                    ps.score = search_value;
+                    if (!params.keep_moves)
+                        ps.move = search_pv[0];
+                    ps.padding = 0;
+
+                    out.write(th.thread_idx(), ps);
+
+                    auto p = num_processed.fetch_add(1) + 1;
+                    if (p % 10000 == 0)
+                    {
+                        std::cout << "Processed " << p << " positions.\n";
+                    }
+                }
+            }
+        });
+        Threads.wait_for_workers_finished();
+
+        std::cout << "Finished.\n";
+    }
+
+    void do_rescore(RescoreParams& params)
+    {
+        if (ends_with(params.input_filename, ".epd"))
+        {
+            do_rescore_epd(params);
+        }
+        else if (ends_with(params.input_filename, ".bin") || ends_with(params.input_filename, ".binpack"))
+        {
+            do_rescore_data(params);
+        }
+        else
+        {
+            std::cerr << "Invalid input file type.\n";
+        }
+    }
+
+    void rescore(std::istringstream& is)
+    {
+        RescoreParams params{};
 
         while(true)
         {
@@ -351,23 +459,27 @@ namespace Learner
                 is >> params.input_filename;
             else if (token == "output_file")
                 is >> params.output_filename;
+            else if (token == "keep_moves")
+                is >> params.keep_moves;
         }
 
-        std::cout << "Performing transform rescore_fen with parameters:\n";
+        params.enforce_constraints();
+
+        std::cout << "Performing transform rescore with parameters:\n";
         std::cout << "depth               : " << params.depth << '\n';
         std::cout << "input_file          : " << params.input_filename << '\n';
         std::cout << "output_file         : " << params.output_filename << '\n';
+        std::cout << "keep_moves          : " << params.keep_moves << '\n';
         std::cout << '\n';
 
-        params.enforce_constraints();
-        do_rescore_fen(params);
+        do_rescore(params);
     }
 
     void transform(std::istringstream& is)
     {
         const std::map<std::string, CommandFunc> subcommands = {
             { "nudged_static", &nudged_static },
-            { "rescore_fen", &rescore_fen }
+            { "rescore", &rescore }
         };
 
         Eval::NNUE::init();

From 50df3a7389514f2500fbd11772e168ba7fd83769 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 21 Dec 2020 22:20:39 +0100
Subject: [PATCH 514/583] fix annoying warning

---
 src/nnue/trainer/trainer_input_slice.h | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
index 62a761a7..ff1265dc 100644
--- a/src/nnue/trainer/trainer_input_slice.h
+++ b/src/nnue/trainer/trainer_input_slice.h
@@ -323,17 +323,23 @@ namespace Eval::NNUE {
                 const IndexType output_offset = kOutputDimensions * b;
 
                 IndexType i = 0;
-                for (; i < Offset; ++i) {
-                    gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
+                if constexpr (Offset > 0)
+                {
+                    for (; i < Offset; ++i) {
+                        gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
+                    }
                 }
 
                 for (; i < Offset + kOutputDimensions; ++i) {
                     gradients_[input_offset + i] = gradients[output_offset + i - Offset];
                 }
 
-                for (; i < kInputDimensions; ++i)
+                if constexpr (Offset + kOutputDimensions < kInputDimensions)
                 {
-                    gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
+                    for (; i < kInputDimensions; ++i)
+                    {
+                        gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
+                    }
                 }
             }
 

From 6853b4aac2055321944c07aa0aa354f5d5803a17 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 21 Dec 2020 20:57:51 +0100
Subject: [PATCH 515/583] Simple filtering for validation data.

---
 docs/learn.md           |  4 ++-
 src/learn/learn.cpp     | 56 +++++++++++++++++++++++++++++++++++------
 src/learn/sfen_reader.h | 17 +++++++------
 3 files changed, 61 insertions(+), 16 deletions(-)

diff --git a/docs/learn.md b/docs/learn.md
index e88de089..fe88e7e8 100644
--- a/docs/learn.md
+++ b/docs/learn.md
@@ -66,7 +66,9 @@ Currently the following options are available:
 
 `assume_quiet` - this is a flag option. When specified learn will not perform qsearch to reach a quiet position.
 
-`smart_fen_skipping` - this is a flag option. When specified some position that are not good candidates for teaching are skipped. This includes positions where the best move is a capture or promotion, and position where a king is in check.
+`smart_fen_skipping` - this is a flag option. When specified some position that are not good candidates for teaching are skipped. This includes positions where the best move is a capture or promotion, and position where a king is in check. Default: 1.
+
+`smart_fen_skipping_for_validation` - same as `smart_fen_skipping` but applies to validation data set. Default: 0.
 
 `newbob_num_trials` - determines after how many subsequent rejected nets the training process will be terminated. Default: 4.
 
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 90f629e1..c3499283 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -521,6 +521,7 @@ namespace Learner
 
             bool assume_quiet = false;
             bool smart_fen_skipping = false;
+            bool smart_fen_skipping_for_validation = false;
 
             double learning_rate = 1.0;
             double max_grad = 1.0;
@@ -593,6 +594,8 @@ namespace Learner
     private:
         static void set_learning_search_limits();
 
+        PSVector fetch_next_validation_set();
+
         void learn_worker(Thread& th, std::atomic<uint64_t>& counter, uint64_t limit);
 
         void update_weights(const PSVector& psv, uint64_t epoch);
@@ -665,6 +668,44 @@ namespace Learner
         limits.depth = 0;
     }
 
+    PSVector LearnerThink::fetch_next_validation_set()
+    {
+        PSVector validation_data;
+
+        auto mainThread = Threads.main();
+        mainThread->execute_with_worker([&validation_data, this](auto& th){
+            auto do_include_predicate = [&th, this](const PackedSfenValue& ps) -> bool {
+                if (params.eval_limit < abs(ps.score))
+                    return false;
+
+                if (!params.use_draw_games_in_validation && ps.game_result == 0)
+                    return false;
+
+                if (params.smart_fen_skipping_for_validation)
+                {
+                    StateInfo si;
+                    auto& pos = th.rootPos;
+                    if (pos.set_from_packed_sfen(ps.sfen, &si, &th) != 0)
+                        return false;
+
+                    if (pos.capture_or_promotion((Move)ps.move) || pos.checkers())
+                        return false;
+                }
+
+                return true;
+            };
+
+            validation_data = validation_sr.read_some(
+                params.validation_count,
+                params.validation_count * 100, // to have a reasonable bound on the running time.
+                do_include_predicate
+            );
+        });
+        mainThread->wait_for_worker_finished();
+
+        return validation_data;
+    }
+
     void LearnerThink::learn(uint64_t epochs)
     {
 #if defined(_OPENMP)
@@ -675,19 +716,16 @@ namespace Learner
 
         Eval::NNUE::verify_any_net_loaded();
 
-        const PSVector validation_data =
-            validation_sr.read_some(
-                params.validation_count,
-                params.eval_limit,
-                params.use_draw_games_in_validation
-            );
+        const PSVector validation_data = fetch_next_validation_set();
 
         if (validation_data.size() != params.validation_count)
         {
             auto out = sync_region_cout.new_region();
             out
                 << "INFO (learn): Error reading validation data. Read " << validation_data.size()
-                << " out of " << params.validation_count << '\n';
+                << " out of " << params.validation_count << '\n'
+                << "INFO (learn): This either means that less than 1% of the validation data passed the filter"
+                << " or the file is empty\n";
 
             return;
         }
@@ -1235,6 +1273,7 @@ namespace Learner
             else if (option == "verbose") params.verbose = true;
             else if (option == "assume_quiet") params.assume_quiet = true;
             else if (option == "smart_fen_skipping") params.smart_fen_skipping = true;
+            else if (option == "smart_fen_skipping_for_validation") params.smart_fen_skipping_for_validation = true;
             else
             {
                 out << "INFO: Unknown option: " << option << ". Ignoring.\n";
@@ -1306,6 +1345,9 @@ namespace Learner
         out << "  - sfen_read_size           : " << params.sfen_read_size << endl;
         out << "  - thread_buffer_size       : " << params.thread_buffer_size << endl;
 
+        out << "  - smart_fen_skipping       : " << params.smart_fen_skipping << endl;
+        out << "  - smart_fen_skipping_val   : " << params.smart_fen_skipping_for_validation << endl;
+
         out << "  - seed                     : " << params.seed << endl;
         out << "  - verbose                  : " << (params.verbose ? "true" : "false") << endl;
 
diff --git a/src/learn/sfen_reader.h b/src/learn/sfen_reader.h
index 206ed2bd..1574f63a 100644
--- a/src/learn/sfen_reader.h
+++ b/src/learn/sfen_reader.h
@@ -15,6 +15,7 @@
 #include <iostream>
 #include <cstdint>
 #include <thread>
+#include <functional>
 
 namespace Learner{
 
@@ -73,12 +74,12 @@ namespace Learner{
         }
 
         // Load the phase for calculation such as mse.
-        PSVector read_some(uint64_t count, int eval_limit, bool use_draw_games)
+        PSVector read_some(uint64_t count, uint64_t count_tries, std::function<bool(const PackedSfenValue&)> do_take)
         {
             PSVector psv;
             psv.reserve(count);
 
-            for (uint64_t i = 0; i < count; ++i)
+            for (uint64_t i = 0; i < count_tries; ++i)
             {
                 PackedSfenValue ps;
                 if (!read_to_thread_buffer(0, ps))
@@ -87,13 +88,13 @@ namespace Learner{
                     return psv;
                 }
 
-                if (eval_limit < abs(ps.score))
-                    continue;
+                if (do_take(ps))
+                {
+                    psv.push_back(ps);
 
-                if (!use_draw_games && ps.game_result == 0)
-                    continue;
-
-                psv.push_back(ps);
+                    if (psv.size() >= count)
+                        break;
+                }
             }
 
             return psv;

From 8ca82646a981febdaff046fb47c16630b685c4b4 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 21 Dec 2020 20:35:23 +0100
Subject: [PATCH 516/583] Use plain nnue eval for validation loss calculation
 instead of first performing qsearch

---
 src/learn/learn.cpp         | 31 +------------------------------
 tests/instrumented_learn.sh |  2 +-
 2 files changed, 2 insertions(+), 31 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index c3499283..8326ab24 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -611,8 +611,6 @@ namespace Learner
             atomic<int>& move_accord_count
         );
 
-        Value get_shallow_value(Position& pos);
-
         bool check_progress();
 
         // save merit function parameters to a file
@@ -996,7 +994,7 @@ namespace Learner
                 continue;
             }
 
-            const Value shallow_value = get_shallow_value(pos);
+            const Value shallow_value = Eval::evaluate(pos);
 
             // Evaluation value of deep search
             const auto deep_value = (Value)ps.score;
@@ -1018,33 +1016,6 @@ namespace Learner
         test_loss_sum += local_loss_sum;
     }
 
-    Value LearnerThink::get_shallow_value(Position& pos)
-    {
-        // Evaluation value for shallow search
-        // The value of evaluate() may be used, but when calculating loss, learn_cross_entropy and
-        // Use qsearch() because it is difficult to compare the values.
-        // EvalHash has been disabled in advance. (If not, the same value will be returned every time)
-        const auto [_, pv] = Search::qsearch(pos);
-
-        const auto rootColor = pos.side_to_move();
-
-        std::vector<StateInfo, AlignedAllocator<StateInfo>> states(pv.size());
-        for (size_t i = 0; i < pv.size(); ++i)
-        {
-            pos.do_move(pv[i], states[i]);
-        }
-
-        const Value shallow_value =
-            (rootColor == pos.side_to_move())
-            ? Eval::evaluate(pos)
-            : -Eval::evaluate(pos);
-
-        for (auto it = pv.rbegin(); it != pv.rend(); ++it)
-            pos.undo_move(*it);
-
-        return shallow_value;
-    }
-
     bool LearnerThink::check_progress()
     {
         auto out = sync_region_cout.new_region();
diff --git a/tests/instrumented_learn.sh b/tests/instrumented_learn.sh
index 9109e78b..af8c8b17 100755
--- a/tests/instrumented_learn.sh
+++ b/tests/instrumented_learn.sh
@@ -124,7 +124,7 @@ cat << EOF > learn01.exp
 
  send "uci\n"
  send "setoption name SkipLoadingEval value true\n"
- send "setoption name Use NNUE value true\n"
+ send "setoption name Use NNUE value pure\n"
  send "setoption name Threads value $threads\n"
  send "isready\n"
  send "learn targetdir training_data epochs 1 sfen_read_size 100 thread_buffer_size 10 batchsize 100 use_draw_in_training 1 use_draw_in_validation 1 lr 1 eval_limit 32000 nn_batch_size 30 newbob_decay 0.5 eval_save_interval 30 loss_output_interval 10 validation_set_file_name validation_data/validation_data.bin\n"

From b50dcd7ddee7c5f59a2a23daeec09e5cc378fcaf Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Wed, 23 Dec 2020 22:42:42 +0100
Subject: [PATCH 517/583] allow for repeated searches in rescoring

allows for repeating a depth N search K times.
Repeated searches improve the quality of eval, but don't bring in higher depth info.
Might allow for removing some of the noise in low depth scoring.
---
 docs/transform.md       |  2 ++
 src/learn/transform.cpp | 14 ++++++++++++++
 2 files changed, 16 insertions(+)

diff --git a/docs/transform.md b/docs/transform.md
index a0a7ad75..486fd34e 100644
--- a/docs/transform.md
+++ b/docs/transform.md
@@ -38,3 +38,5 @@ Currently the following options are available:
 
 `keep_moves` - whether to keep moves from the input file if available. Allows to keep compression in .binpack. Default: 1.
 
+`research_count` - number of additional searches of depth N done on the same position before using the eval. Default: 0.
+
diff --git a/src/learn/transform.cpp b/src/learn/transform.cpp
index 77671c65..8991b9f1 100644
--- a/src/learn/transform.cpp
+++ b/src/learn/transform.cpp
@@ -53,11 +53,13 @@ namespace Learner
         std::string input_filename = "in.epd";
         std::string output_filename = "out.binpack";
         int depth = 3;
+        int research_count = 0;
         bool keep_moves = true;
 
         void enforce_constraints()
         {
             depth = std::max(1, depth);
+            research_count = std::max(0, research_count);
         }
     };
 
@@ -293,7 +295,12 @@ namespace Learner
                 pos.set(*fen, false, &si, &th);
                 pos.state()->rule50 = 0;
 
+
+                for (int cnt = 0; cnt < params.research_count; ++cnt)
+                    Search::search(pos, params.depth, 1);
+
                 auto [search_value, search_pv] = Search::search(pos, params.depth, 1);
+
                 if (search_pv.empty())
                     continue;
 
@@ -400,7 +407,11 @@ namespace Learner
                 {
                     pos.set_from_packed_sfen(ps.sfen, &si, &th);
 
+                    for (int cnt = 0; cnt < params.research_count; ++cnt)
+                        Search::search(pos, params.depth, 1);
+
                     auto [search_value, search_pv] = Search::search(pos, params.depth, 1);
+
                     if (search_pv.empty())
                         continue;
 
@@ -461,6 +472,8 @@ namespace Learner
                 is >> params.output_filename;
             else if (token == "keep_moves")
                 is >> params.keep_moves;
+            else if (token == "research_count")
+                is >> params.research_count;
         }
 
         params.enforce_constraints();
@@ -470,6 +483,7 @@ namespace Learner
         std::cout << "input_file          : " << params.input_filename << '\n';
         std::cout << "output_file         : " << params.output_filename << '\n';
         std::cout << "keep_moves          : " << params.keep_moves << '\n';
+        std::cout << "research_count      : " << params.research_count << '\n';
         std::cout << '\n';
 
         do_rescore(params);

From 3f73c40412b82969da319482355bc2f9bef4688b Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 21 Dec 2020 21:50:30 +0100
Subject: [PATCH 518/583] More deterministic move accuracy validation.

---
 src/learn/learn.cpp | 48 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 45 insertions(+), 3 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 8326ab24..8265a66f 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -51,6 +51,8 @@
 #include <sstream>
 #include <unordered_set>
 #include <iostream>
+#include <map>
+#include <algorithm>
 
 #if defined (_OPENMP)
 #include <omp.h>
@@ -611,6 +613,8 @@ namespace Learner
             atomic<int>& move_accord_count
         );
 
+        bool has_depth1_move_agreement(Position& pos, Move pvmove);
+
         bool check_progress();
 
         // save merit function parameters to a file
@@ -1007,15 +1011,53 @@ namespace Learner
             local_loss_sum += loss;
             sum_norm += (double)abs(shallow_value);
 
-            // Determine if the teacher's move and the score of the shallow search match
-            const auto [value, pv] = Search::search(pos, 1);
-            if (pv.size() > 0 && (uint16_t)pv[0] == ps.move)
+            // Threat all moves with equal scores as first. This is up to move ordering.
+            if (has_depth1_move_agreement(pos, (Move)ps.move))
                 move_accord_count.fetch_add(1, std::memory_order_relaxed);
         }
 
         test_loss_sum += local_loss_sum;
     }
 
+    bool LearnerThink::has_depth1_move_agreement(Position& pos, Move pvmove)
+    {
+        // Determine if the depth 1 search pv matches the move from the dataset.
+        // Do a manual depth 1 search so we're not affected by previous searches.
+        std::vector<std::pair<Move, Value>> child_scores;
+
+        // Call evaluate once for the rootpos so that the evals
+        // for children moves use incremental feature transformer updates.
+        (void)Eval::evaluate(pos);
+
+        // Just to get guaranteed alignment.
+        std::vector<StateInfo, AlignedAllocator<StateInfo>> states(1);
+        auto legal_moves = MoveList<LEGAL>(pos);
+        for (auto m : legal_moves)
+        {
+            pos.do_move(m, states[0]);
+            // We don't care if the king is in check or stuff like that.
+            // not a big issue and nnue should digest all.
+            auto value = -Eval::evaluate(pos);
+            child_scores.emplace_back(m, value);
+            pos.undo_move(m);
+        }
+
+        if (child_scores.empty())
+            return false;
+
+        std::sort(
+            child_scores.begin(),
+            child_scores.end(),
+            [](auto& lhs, auto& rhs) { return lhs.second > rhs.second; }
+        );
+
+        // Require the best move to have strictly higher score than the next one.
+        return
+            child_scores[0].first == pvmove
+            && (child_scores.size() == 1
+                || child_scores[1].second != child_scores[0].second);
+    }
+
     bool LearnerThink::check_progress()
     {
         auto out = sync_region_cout.new_region();

From 96b377a90ab700fd1b52278a4c2032a22d9ea8cb Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 13 Dec 2020 15:33:35 +0100
Subject: [PATCH 519/583] Add gensfen_nonpv

---
 src/Makefile                |   1 +
 src/evaluate.cpp            |   2 +
 src/learn/gensfen_nonpv.cpp | 474 ++++++++++++++++++++++++++++++++++++
 src/learn/gensfen_nonpv.h   |  12 +
 src/thread.h                |   8 +
 src/uci.cpp                 |   2 +
 6 files changed, 499 insertions(+)
 create mode 100644 src/learn/gensfen_nonpv.cpp
 create mode 100644 src/learn/gensfen_nonpv.h

diff --git a/src/Makefile b/src/Makefile
index 1ae9cd5f..586656d3 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -63,6 +63,7 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
 	learn/sfen_packer.cpp \
 	learn/learn.cpp \
 	learn/gensfen.cpp \
+	learn/gensfen_nonpv.cpp \
 	learn/opening_book.cpp \
 	learn/convert.cpp \
 	learn/transform.cpp
diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index dd204a52..709a50ff 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -914,6 +914,8 @@ make_v:
 
 Value Eval::evaluate(const Position& pos) {
 
+  pos.this_thread()->on_eval();
+
   Value v;
 
   if (NNUE::useNNUE == NNUE::UseNNUEMode::Pure) {
diff --git a/src/learn/gensfen_nonpv.cpp b/src/learn/gensfen_nonpv.cpp
new file mode 100644
index 00000000..a5c667b5
--- /dev/null
+++ b/src/learn/gensfen_nonpv.cpp
@@ -0,0 +1,474 @@
+﻿#include "gensfen_nonpv.h"
+
+#include "sfen_writer.h"
+#include "packed_sfen.h"
+#include "opening_book.h"
+
+#include "misc.h"
+#include "position.h"
+#include "thread.h"
+#include "tt.h"
+#include "uci.h"
+
+#include "extra/nnue_data_binpack_format.h"
+
+#include "nnue/evaluate_nnue.h"
+#include "nnue/evaluate_nnue_learner.h"
+
+#include "syzygy/tbprobe.h"
+
+#include <atomic>
+#include <chrono>
+#include <climits>
+#include <cmath>
+#include <cstring>
+#include <fstream>
+#include <iomanip>
+#include <limits>
+#include <list>
+#include <memory>
+#include <optional>
+#include <random>
+#include <shared_mutex>
+#include <sstream>
+#include <unordered_set>
+
+using namespace std;
+
+namespace Learner
+{
+    // Class to generate sfen with multiple threads
+    struct GensfenNonPv
+    {
+        struct Params
+        {
+            // The depth for search on the fens gathered during exploration
+            int search_depth = 3;
+
+            // the min/max number of nodes to use for exploration per ply
+            int exploration_min_nodes = 5000;
+            int exploration_max_nodes = 15000;
+
+            // The pct of positions explored that are saved for rescoring
+            float exploration_save_rate = 0.01;
+
+            // Upper limit of evaluation value of generated situation
+            int eval_limit = 4000;
+
+            // the upper limit on evaluation during exploration selfplay
+            int exploration_eval_limit = 4000;
+
+            int exploration_max_ply = 200;
+
+            int exploration_min_pieces = 8;
+
+            std::string output_file_name = "generated_gensfen_nonpv";
+
+            SfenOutputType sfen_format = SfenOutputType::Binpack;
+
+            std::string seed;
+
+            int num_threads;
+
+            std::string book;
+
+            void enforce_constraints()
+            {
+                // Limit the maximum to a one-stop score. (Otherwise you might not end the loop)
+                eval_limit = std::min(eval_limit, (int)mate_in(2));
+                exploration_eval_limit = std::min(eval_limit, (int)mate_in(2));
+                exploration_min_nodes = std::max(100, exploration_min_nodes);
+                exploration_max_nodes = std::max(exploration_min_nodes, exploration_max_nodes);
+
+                num_threads = Options["Threads"];
+            }
+        };
+
+        static constexpr uint64_t REPORT_DOT_EVERY = 5000;
+        static constexpr uint64_t REPORT_STATS_EVERY = 200000;
+        static_assert(REPORT_STATS_EVERY % REPORT_DOT_EVERY == 0);
+
+        GensfenNonPv(
+            const Params& prm
+        ) :
+            params(prm),
+            prng(prm.seed),
+            sfen_writer(prm.output_file_name, prm.num_threads, std::numeric_limits<uint64_t>::max(), prm.sfen_format)
+        {
+            if (!prm.book.empty())
+            {
+                opening_book = open_opening_book(prm.book, prng);
+                if (opening_book == nullptr)
+                {
+                    std::cout << "WARNING: Failed to open opening book " << prm.book << ". Falling back to startpos.\n";
+                }
+            }
+
+            // Output seed to veryfy by the user if it's not identical by chance.
+            std::cout << prng << std::endl;
+        }
+
+        void generate(uint64_t limit);
+
+    private:
+        Params params;
+
+        PRNG prng;
+
+        std::mutex stats_mutex;
+        TimePoint last_stats_report_time;
+
+        // sfen exporter
+        SfenWriter sfen_writer;
+
+        SynchronizedRegionLogger::Region out;
+
+        std::unique_ptr<OpeningBook> opening_book;
+
+        static void set_gensfen_search_limits();
+
+        void generate_worker(
+            Thread& th,
+            std::atomic<uint64_t>& counter,
+            uint64_t limit);
+
+        bool commit_psv(
+            Thread& th,
+            PSVector& sfens,
+            std::atomic<uint64_t>& counter,
+            uint64_t limit);
+
+        PSVector do_exploration(
+            Thread& th,
+            int count);
+
+        void report(uint64_t done, uint64_t new_done);
+
+        void maybe_report(uint64_t done);
+    };
+
+    void GensfenNonPv::set_gensfen_search_limits()
+    {
+        // About Search::Limits
+        // Be careful because this member variable is global and affects other threads.
+        auto& limits = Search::Limits;
+
+        // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done)
+        limits.infinite = true;
+
+        // Since PV is an obstacle when displayed, erase it.
+        limits.silent = true;
+
+        // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
+        limits.nodes = 0;
+
+        // depth is also processed by the one passed as an argument of Learner::search().
+        limits.depth = 0;
+    }
+
+    void GensfenNonPv::generate(uint64_t limit)
+    {
+        last_stats_report_time = 0;
+
+        set_gensfen_search_limits();
+
+        std::atomic<uint64_t> counter{0};
+        Threads.execute_with_workers([&counter, limit, this](Thread& th) {
+            generate_worker(th, counter, limit);
+        });
+        Threads.wait_for_workers_finished();
+
+        sfen_writer.flush();
+
+        if (limit % REPORT_STATS_EVERY != 0)
+        {
+            report(limit, limit % REPORT_STATS_EVERY);
+        }
+
+        std::cout << std::endl;
+    }
+
+    PSVector GensfenNonPv::do_exploration(
+        Thread& th,
+        int count)
+    {
+        constexpr int max_depth = 30;
+
+        PSVector psv;
+
+        std::vector<StateInfo, AlignedAllocator<StateInfo>> states(
+            max_depth + MAX_PLY /* == search_depth_min + α */);
+
+        th.set_eval_callback([this, &psv](Position& pos) {
+            if ((double)prng.rand<uint64_t>() / std::numeric_limits<uint64_t>::max() < params.exploration_save_rate)
+            {
+                psv.emplace_back();
+                pos.sfen_pack(psv.back().sfen);
+            }
+        });
+
+        auto& pos = th.rootPos;
+        StateInfo si;
+
+        for (int i = 0; i < count; ++i)
+        {
+            if (opening_book != nullptr)
+            {
+                auto& fen = opening_book->next_fen();
+                pos.set(fen, false, &si, &th);
+            }
+            else
+            {
+                pos.set(StartFEN, false, &si, &th);
+            }
+
+            for(int ply = 0; ply < params.exploration_max_ply; ++ply)
+            {
+                auto nodes = prng.rand(params.exploration_max_nodes - params.exploration_min_nodes + 1) + params.exploration_min_nodes;
+
+                auto [search_value, search_pv] = Search::search(pos, max_depth, 1, nodes);
+
+                if (search_pv.empty())
+                {
+                    break;
+                }
+
+                if (std::abs(search_value) > params.exploration_eval_limit)
+                {
+                    break;
+                }
+
+                pos.do_move(search_pv[0], states[ply]);
+
+                if (popcount(pos.pieces()) < params.exploration_min_pieces)
+                {
+                    break;
+                }
+            }
+        }
+
+        th.clear_eval_callback();
+
+        return psv;
+    }
+
+    void GensfenNonPv::generate_worker(
+        Thread& th,
+        std::atomic<uint64_t>& counter,
+        uint64_t limit)
+    {
+        constexpr int exploration_batch_size = 1;
+
+        StateInfo si;
+
+        PSVector psv;
+
+        // end flag
+        bool quit = false;
+
+        // repeat until the specified number of times
+        while (!quit)
+        {
+            // It is necessary to set a dependent thread for Position.
+            // When parallelizing, Threads (since this is a vector<Thread*>,
+            // Do the same for up to Threads[0]...Threads[thread_num-1].
+            auto& pos = th.rootPos;
+
+            auto packed_sfens = do_exploration(th, exploration_batch_size);
+            psv.clear();
+
+            for (auto& ps : packed_sfens)
+            {
+                pos.set_from_packed_sfen(ps.sfen, &si, &th);
+                pos.state()->rule50 = 0;
+                auto [search_value, search_pv] = Search::search(pos, params.search_depth, 1);
+
+                if (search_pv.empty())
+                {
+                    continue;
+                }
+
+                if (std::abs(search_value) > params.eval_limit)
+                {
+                    continue;
+                }
+
+                auto& new_ps = psv.emplace_back();
+                pos.sfen_pack(new_ps.sfen);
+                new_ps.score = search_value;
+                new_ps.move = search_pv[0];
+                new_ps.gamePly = 1;
+                new_ps.game_result = 0;
+                new_ps.padding = 0;
+            }
+
+            quit = commit_psv(th, psv, counter, limit);
+        }
+    }
+
+    // Write out the phases loaded in sfens to a file.
+    // result: win/loss in the next phase after the final phase in sfens
+    // 1 when winning. -1 when losing. Pass 0 for a draw.
+    // Return value: true if the specified number of
+    // sfens has already been reached and the process ends.
+    bool GensfenNonPv::commit_psv(
+        Thread& th,
+        PSVector& sfens,
+        std::atomic<uint64_t>& counter,
+        uint64_t limit)
+    {
+        // Write sfens in move order to make potential compression easier
+        for (auto& sfen : sfens)
+        {
+            // Return true if there is already enough data generated.
+            const auto iter = counter.fetch_add(1);
+            if (iter >= limit)
+                return true;
+
+            // because `iter` was done, now we do one more
+            maybe_report(iter + 1);
+
+            // Write out one sfen.
+            sfen_writer.write(th.thread_idx(), sfen);
+        }
+
+        return false;
+    }
+
+    void GensfenNonPv::report(uint64_t done, uint64_t new_done)
+    {
+        const auto now_time = now();
+        const TimePoint elapsed = now_time - last_stats_report_time + 1;
+
+        out
+            << endl
+            << done << " sfens, "
+            << new_done * 1000 / elapsed << " sfens/second, "
+            << "at " << now_string() << sync_endl;
+
+        last_stats_report_time = now_time;
+
+        out = sync_region_cout.new_region();
+    }
+
+    void GensfenNonPv::maybe_report(uint64_t done)
+    {
+        if (done % REPORT_DOT_EVERY == 0)
+        {
+            std::lock_guard lock(stats_mutex);
+
+            if (last_stats_report_time == 0)
+            {
+                last_stats_report_time = now();
+                out = sync_region_cout.new_region();
+            }
+
+            if (done != 0)
+            {
+                out << '.';
+
+                if (done % REPORT_STATS_EVERY == 0)
+                {
+                    report(done, REPORT_STATS_EVERY);
+                }
+            }
+        }
+    }
+
+    // Command to generate a game record
+    void gensfen_nonpv(istringstream& is)
+    {
+        // Number of generated game records default = 8 billion phases (Ponanza specification)
+        GensfenNonPv::Params params;
+
+        uint64_t count = 1'000'000;
+
+        // Add a random number to the end of the file name.
+        std::string sfen_format = "binpack";
+
+        string token;
+        while (true)
+        {
+            token = "";
+            is >> token;
+            if (token == "")
+                break;
+
+            if (token == "depth")
+                is >> params.search_depth;
+            else if (token == "count")
+                is >> count;
+            else if (token == "output_file")
+                is >> params.output_file_name;
+            else if (token == "exploration_eval_limit")
+                is >> params.exploration_eval_limit;
+            else if (token == "eval_limit")
+                is >> params.eval_limit;
+            else if (token == "exploration_min_nodes")
+                is >> params.exploration_min_nodes;
+            else if (token == "exploration_max_nodes")
+                is >> params.exploration_max_nodes;
+            else if (token == "exploration_min_pieces")
+                is >> params.exploration_min_pieces;
+            else if (token == "exploration_save_rate")
+                is >> params.exploration_save_rate;
+            else if (token == "book")
+                is >> params.book;
+            else if (token == "sfen_format")
+                is >> sfen_format;
+            else if (token == "seed")
+                is >> params.seed;
+            else if (token == "set_recommended_uci_options")
+            {
+                UCI::setoption("Contempt", "0");
+                UCI::setoption("Skill Level", "20");
+                UCI::setoption("UCI_Chess960", "false");
+                UCI::setoption("UCI_AnalyseMode", "false");
+                UCI::setoption("UCI_LimitStrength", "false");
+                UCI::setoption("PruneAtShallowDepth", "false");
+                UCI::setoption("EnableTranspositionTable", "true");
+            }
+            else
+                cout << "ERROR: Ignoring unknown option " << token << endl;
+        }
+
+        if (!sfen_format.empty())
+        {
+            if (sfen_format == "bin")
+                params.sfen_format = SfenOutputType::Bin;
+            else if (sfen_format == "binpack")
+                params.sfen_format = SfenOutputType::Binpack;
+            else
+                cout << "WARNING: Unknown sfen format `" << sfen_format << "`. Using bin\n";
+        }
+
+        params.enforce_constraints();
+
+        std::cout << "INFO: Executing gensfen_nonpv command\n";
+
+        std::cout << "INFO: Parameters:\n";
+        std::cout
+            << "  - search_depth           = " << params.search_depth << endl
+            << "  - output_file            = " << params.output_file_name << endl
+            << "  - exploration_eval_limit = " << params.exploration_eval_limit << endl
+            << "  - eval_limit             = " << params.eval_limit << endl
+            << "  - exploration_min_nodes  = " << params.exploration_min_nodes << endl
+            << "  - exploration_max_nodes  = " << params.exploration_max_nodes << endl
+            << "  - exploration_min_pieces = " << params.exploration_min_pieces << endl
+            << "  - exploration_save_rate  = " << params.exploration_save_rate << endl
+            << "  - book                   = " << params.book << endl
+            << "  - sfen_format            = " << sfen_format << endl
+            << "  - seed                   = " << params.seed << endl
+            << "  - count                  = " << count << endl;
+
+        // Show if the training data generator uses NNUE.
+        Eval::NNUE::verify_eval_file_loaded();
+
+        Threads.main()->ponder = false;
+
+        GensfenNonPv gensfen(params);
+        gensfen.generate(count);
+
+        std::cout << "INFO: gensfen_nonpv finished." << endl;
+    }
+}
diff --git a/src/learn/gensfen_nonpv.h b/src/learn/gensfen_nonpv.h
new file mode 100644
index 00000000..38ccaa60
--- /dev/null
+++ b/src/learn/gensfen_nonpv.h
@@ -0,0 +1,12 @@
+#ifndef _GENSFEN_NONPV_H_
+#define _GENSFEN_NONPV_H_
+
+#include <sstream>
+
+namespace Learner {
+
+    // Automatic generation of teacher position
+    void gensfen_nonpv(std::istringstream& is);
+}
+
+#endif
\ No newline at end of file
diff --git a/src/thread.h b/src/thread.h
index 83ba2f33..6eb38136 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -55,6 +55,7 @@ class Thread {
   size_t idx;
   bool exit = false, searching = true; // Set before starting std::thread
   std::function<void(Thread&)> worker;
+  std::function<void(Position&)> on_eval_callback;
   NativeThread stdThread;
 
 public:
@@ -75,6 +76,13 @@ public:
   void wait_for_worker_finished();
   size_t thread_idx() const { return idx; }
 
+  template <typename FuncT>
+  void set_eval_callback(FuncT&& f) { on_eval_callback = std::forward<FuncT>(f); }
+
+  void clear_eval_callback() { on_eval_callback = nullptr; }
+
+  void on_eval() { if (on_eval_callback) on_eval_callback(rootPos); }
+
   Pawns::Table pawnsTable;
   Material::Table materialTable;
   size_t pvIdx, pvLast;
diff --git a/src/uci.cpp b/src/uci.cpp
index 8e64da6b..55fccea7 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -36,6 +36,7 @@
 #include "uci.h"
 
 #include "learn/gensfen.h"
+#include "learn/gensfen_nonpv.h"
 #include "learn/learn.h"
 #include "learn/convert.h"
 #include "learn/transform.h"
@@ -341,6 +342,7 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "compiler") sync_cout << compiler_info() << sync_endl;
 
       else if (token == "gensfen") Learner::gensfen(is);
+      else if (token == "gensfen_nonpv") Learner::gensfen_nonpv(is);
       else if (token == "learn") Learner::learn(is);
       else if (token == "convert") Learner::convert(is);
       else if (token == "convert_bin") Learner::convert_bin(is);

From 868b4e942173758021c5c344c1c998e67c497d8b Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 16 Dec 2020 21:11:09 +0100
Subject: [PATCH 520/583] add gensfen_nonpv docs

---
 docs/gensfen_nonpv.md | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 docs/gensfen_nonpv.md

diff --git a/docs/gensfen_nonpv.md b/docs/gensfen_nonpv.md
new file mode 100644
index 00000000..29dc1f17
--- /dev/null
+++ b/docs/gensfen_nonpv.md
@@ -0,0 +1,39 @@
+# Gensfen NonPV
+
+`gensfen_nonpv` command allows generation of training data from self-play in a manner that suits training better than traditional games. It plays fixed nodes self play games for exploration and records [some of] the evaluated positions. Then rescores them with fixed depth search.
+
+As all commands in stockfish `gensfen_nonpv` can be invoked either from command line (as `stockfish.exe gensfen_nonpv ...`, but this is not recommended because it's not possible to specify UCI options before `gensfen_nonpv` executes) or in the interactive prompt.
+
+It is recommended to set the `PruneAtShallowDepth` UCI option to `false` as it will increase the quality of fixed depth searches.
+
+It is recommended to keep the `EnableTranspositionTable` UCI option at the default `true` value as it will make the generation process faster without noticably harming the uniformity of the data.
+
+`gensfen_nonpv` takes named parameters in the form of `gensfen_nonpv param_1_name param_1_value param_2_name param_2_value ...`.
+
+Currently the following options are available:
+
+`depth` - the search depth to use for rescoring. Default: 3.
+
+`count` - the number of training data entries to generate. 1 entry == 1 position. Default: 1000000 (1M).
+
+`exploration_min_nodes` - the min number of nodes to use for exploraton during selfplay. Default: 5000.
+
+`exploration_max_nodes` - the min number of nodes to use for exploraton during selfplay. The number of nodes is chosen from a uniform distribution between min and max. Default: 15000.
+
+`exploration_save_rate` - the ratio of positions seen during exploration self play games that are saved for later rescoring. Default: 0.01 (meaning 1 in 100 positions seen during search get saved for rescoring).
+
+`output_file` - the name of the file to output to. If the extension is not present or doesn't match the selected training data format the right extension will be appened. Default: generated_gensfen_nonpv
+
+`eval_limit` - evaluations with higher absolute value than this will not be written and will terminate a self-play game. Should not exceed 10000 which is VALUE_KNOWN_WIN, but is only hardcapped at mate in 2 (\~30000). Default: 4000
+
+`exploration_eval_limit` - same as `eval_limit` but used during exploration with a value from fixed depth search.
+
+`exploration_min_pieces` - the min number of pieces in the self play games to start the fixed depth search. Note that even if there's N pieces on the board the fixed nodes search usually reaches positions with less pieces and they are saved too. Default: 8.
+
+`exploration_max_ply` the max ply for the exploration self play. Default: 200.
+
+`book` - a path to an opening book to use for the starting positions. Currently only .epd format is supported. If not specified then the starting position is always the standard chess starting position.
+
+`sfen_format` - format of the training data to use. Either `bin` or `binpack`. Default: `binpack`.
+
+`seed` - seed for the PRNG. Can be either a number or a string. If it's a string then its hash will be used. If not specified then the current time will be used.

From 2061be473047b8d6703c70495b52eb34ea70c441 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 17 Dec 2020 16:12:26 +0100
Subject: [PATCH 521/583] smart_fen_skipping at gensfen_nonpv level

---
 docs/gensfen_nonpv.md       |  2 ++
 src/learn/gensfen_nonpv.cpp | 15 +++++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/docs/gensfen_nonpv.md b/docs/gensfen_nonpv.md
index 29dc1f17..0459d607 100644
--- a/docs/gensfen_nonpv.md
+++ b/docs/gensfen_nonpv.md
@@ -32,6 +32,8 @@ Currently the following options are available:
 
 `exploration_max_ply` the max ply for the exploration self play. Default: 200.
 
+`smart_fen_skipping` - this is a flag option. When specified some position that are not good candidates for teaching are removed from the output. This includes positions where the best move is a capture or promotion, and position where a king is in check.
+
 `book` - a path to an opening book to use for the starting positions. Currently only .epd format is supported. If not specified then the starting position is always the standard chess starting position.
 
 `sfen_format` - format of the training data to use. Either `bin` or `binpack`. Default: `binpack`.
diff --git a/src/learn/gensfen_nonpv.cpp b/src/learn/gensfen_nonpv.cpp
index a5c667b5..ca365034 100644
--- a/src/learn/gensfen_nonpv.cpp
+++ b/src/learn/gensfen_nonpv.cpp
@@ -72,6 +72,8 @@ namespace Learner
 
             std::string book;
 
+            bool smart_fen_skipping = false;
+
             void enforce_constraints()
             {
                 // Limit the maximum to a one-stop score. (Otherwise you might not end the loop)
@@ -281,6 +283,12 @@ namespace Learner
             {
                 pos.set_from_packed_sfen(ps.sfen, &si, &th);
                 pos.state()->rule50 = 0;
+
+                if (params.smart_fen_skipping && pos.checkers())
+                {
+                    continue;
+                }
+
                 auto [search_value, search_pv] = Search::search(pos, params.search_depth, 1);
 
                 if (search_pv.empty())
@@ -293,6 +301,11 @@ namespace Learner
                     continue;
                 }
 
+                if (params.smart_fen_skipping && pos.capture_or_promotion(search_pv[0]))
+                {
+                    continue;
+                }
+
                 auto& new_ps = psv.emplace_back();
                 pos.sfen_pack(new_ps.sfen);
                 new_ps.score = search_value;
@@ -418,6 +431,8 @@ namespace Learner
                 is >> sfen_format;
             else if (token == "seed")
                 is >> params.seed;
+            else if (token == "smart_fen_skipping")
+                params.smart_fen_skipping = true;
             else if (token == "set_recommended_uci_options")
             {
                 UCI::setoption("Contempt", "0");

From 7636bcccd1f84276b48383e80b8beb9d923bd32f Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 24 Dec 2020 13:49:44 +0100
Subject: [PATCH 522/583] Correctly account for factors when computing the
 average absolute weight of the feature transformer.

---
 src/nnue/trainer/trainer_feature_transformer.h | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 9afda728..0915ccca 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -621,8 +621,19 @@ namespace Eval::NNUE {
             for(auto b : biases_)
                 abs_bias_sum += std::abs(b);
 
-            for(auto w : weights_)
-                abs_weight_sum += std::abs(w);
+            std::vector<TrainingFeature> training_features;
+            for (IndexType j = 0; j < RawFeatures::kDimensions; ++j)
+            {
+                training_features.clear();
+                Features::Factorizer<RawFeatures>::append_training_features(
+                    j, &training_features);
+
+                for (const auto& feature : training_features) {
+                    for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                        abs_weight_sum += std::abs(weights_[kHalfDimensions * feature.get_index() + i]);
+                    }
+                }
+            }
 
             auto out = sync_region_cout.new_region();
 

From 4f6fdca31f8baaebab761f0d356addffbf592fc4 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 24 Dec 2020 14:09:37 +0100
Subject: [PATCH 523/583] Reduce the amount of sfens buffered for the
 validation step.

Used to be 10M, now we bound it by a multiple of validation_count, and at most 1M. This reduces the RAM usage greatly.
---
 src/learn/learn.cpp     | 2 +-
 src/learn/sfen_reader.h | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 8265a66f..4e70f61c 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -576,7 +576,7 @@ namespace Learner
                 SfenReaderMode::Cyclic,
                 1,
                 std::to_string(prng.next_random_seed()),
-                prm.sfen_read_size,
+                std::min<size_t>(prm.validation_count * 10, 1000000),
                 prm.thread_buffer_size),
             learn_loss_sum{}
         {
diff --git a/src/learn/sfen_reader.h b/src/learn/sfen_reader.h
index 1574f63a..10fb8404 100644
--- a/src/learn/sfen_reader.h
+++ b/src/learn/sfen_reader.h
@@ -50,7 +50,9 @@ namespace Learner{
         ) :
             filenames(filenames_.begin(), filenames_.end()),
             mode(mode_),
-            sfen_read_size(read_size),
+            // Due to the implementation of waiting for buffer empty a bit
+            // the read size must be at least twice the buffer size.
+            sfen_read_size(std::max(read_size, buffer_size * 2)),
             thread_buffer_size(buffer_size),
             prng(seed)
         {

From bb6188430d6468c1a11be4c3abaa90d985754fd6 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Thu, 24 Dec 2020 14:34:26 +0100
Subject: [PATCH 524/583] Add split_count argument to shuffle_binpack.py

this optional argument allows for splitting the input binpack in multiple output binpacks while shuffling.
---
 script/shuffle_binpack.py | 42 +++++++++++++++++++++++++++++----------
 1 file changed, 31 insertions(+), 11 deletions(-)

diff --git a/script/shuffle_binpack.py b/script/shuffle_binpack.py
index 409d4907..ca3b0b8e 100644
--- a/script/shuffle_binpack.py
+++ b/script/shuffle_binpack.py
@@ -25,16 +25,18 @@ def index_binpack(file):
 
     return index
 
-def copy_binpack_indexed(in_file, index, out_file):
+def copy_binpack_indexed(in_file, index, out_files):
     print('Copying...')
     total_size = 0
     report_every = 100
     prev_mib = -report_every
+    nextfile = 0
     for offset, size in index:
         in_file.seek(offset, os.SEEK_SET)
         data = in_file.read(size)
         assert len(data) == size
-        out_file.write(data)
+        out_files[nextfile].write(data)
+        nextfile = (nextfile + 1) % len(out_files)
 
         total_size += size
         mib = total_size // 1024 // 1024
@@ -44,26 +46,44 @@ def copy_binpack_indexed(in_file, index, out_file):
 
 def main():
     if len(sys.argv) < 3:
-        print('Usage: python shuffle_binpack.py infile outfile')
+        print('Usage: python shuffle_binpack.py infile outfile [split_count]')
         return
 
     in_filename = sys.argv[1]
-    out_filename = sys.argv[2]
 
-    if (Path(out_filename).exists()):
-        print('Output path already exists. Please specify a path to a file that does not exist.')
-        return
+    if len(sys.argv) > 3:
+       # split the infile in split_count pieces, creating new outfile names based on the provided name
+       basefile = sys.argv[2]
+       split_count = int(sys.argv[3])
+       base=os.path.splitext(basefile)[0]
+       ext=os.path.splitext(basefile)[1]
+       out_filenames = []
+       for i in range(split_count):
+           out_filenames.append(base+"_{}".format(i)+ext)
+    else:
+       out_filenames = [sys.argv[2]]
+
+    for out_filename in out_filenames:
+      if (Path(out_filename).exists()):
+          print('Output path {} already exists. Please specify a path to a file that does not exist.'.format(out_filename))
+          return
+
+    print(out_filenames)
 
     in_file = open(in_filename, 'rb')
-    out_file = open(out_filename, 'wb')
-
     index = index_binpack(in_file)
+
     print('Shuffling...')
     random.shuffle(index)
 
-    copy_binpack_indexed(in_file, index, out_file)
+    out_files = []
+    for out_filename in out_filenames:
+        out_files.append(open(out_filename, 'wb'))
+
+    copy_binpack_indexed(in_file, index, out_files)
 
     in_file.close()
-    out_file.close()
+    for out_file in out_files:
+        out_file.close()
 
 main()

From c1e69f450e3446ea75c22101553bd751554cf4c3 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 24 Dec 2020 13:25:48 +0100
Subject: [PATCH 525/583] Prevent q_ in loss calculation from reaching values
 that would produce NaN

---
 src/learn/learn.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 4e70f61c..22578ff3 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -191,8 +191,8 @@ namespace Learner
 
         static thread_local auto teacher_entropy_ = -(p_ * log(p_ + epsilon) + (1.0 - p_) * log(1.0 - p_ + epsilon));
         static thread_local auto outcome_entropy_ = -(t_ * log(t_ + epsilon) + (1.0 - t_) * log(1.0 - t_ + epsilon));
-        static thread_local auto teacher_loss_ = -(p_ * log(q_) + (1.0 - p_) * log(1.0 - q_));
-        static thread_local auto outcome_loss_ = -(t_ * log(q_) + (1.0 - t_) * log(1.0 - q_));
+        static thread_local auto teacher_loss_ = -(p_ * log(q_ + epsilon) + (1.0 - p_) * log(1.0 - q_ + epsilon));
+        static thread_local auto outcome_loss_ = -(t_ * log(q_ + epsilon) + (1.0 - t_) * log(1.0 - q_ + epsilon));
         static thread_local auto result_ = lambda_ * teacher_loss_ + (1.0 - lambda_) * outcome_loss_;
         static thread_local auto entropy_ = lambda_ * teacher_entropy_ + (1.0 - lambda_) * outcome_entropy_;
         static thread_local auto cross_entropy_ = result_ - entropy_;

From 6d28d97a915bf9409e9c43f77d8a22f2a80576d6 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 24 Dec 2020 17:01:56 +0100
Subject: [PATCH 526/583] Don't unload evalfile on set nnue false

---
 src/nnue/evaluate_nnue.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index c7bd681f..569df292 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -253,12 +253,18 @@ void init() {
 
   useNNUE = nnue_mode_from_option(Options["Use NNUE"]);
 
-  if (Options["SkipLoadingEval"] || useNNUE == UseNNUEMode::False)
+  if (Options["SkipLoadingEval"])
   {
     eval_file_loaded.clear();
     return;
   }
 
+  if (useNNUE == UseNNUEMode::False)
+  {
+    // Keep the eval file loaded. Useful for mixed bench.
+    return;
+  }
+
   std::string eval_file = std::string(Options["EvalFile"]);
 
 #if defined(DEFAULT_NNUE_DIRECTORY)

From 1b560efabdcf05df979f3284077f9f41f264e77d Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 24 Dec 2020 17:06:12 +0100
Subject: [PATCH 527/583] Correctly handle the last batch of data in
 sfen_reader

---
 src/learn/sfen_reader.h | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/src/learn/sfen_reader.h b/src/learn/sfen_reader.h
index 10fb8404..8fa8cf9b 100644
--- a/src/learn/sfen_reader.h
+++ b/src/learn/sfen_reader.h
@@ -218,7 +218,9 @@ namespace Learner{
                 return;
             }
 
-            while (true)
+            // We want to set the `end_of_files` only after we read everything AND copy to the buffer pool.
+            bool local_end_of_files = false;
+            while (!local_end_of_files)
             {
                 // Wait for the buffer to run out.
                 // This size() is read only, so you don't need to lock it.
@@ -254,8 +256,8 @@ namespace Learner{
                             // There was no next file. Abort.
                             auto out = sync_region_cout.new_region();
                             out << "INFO (sfen_reader): End of files." << std::endl;
-                            end_of_files = true;
-                            return;
+                            local_end_of_files = true;
+                            break;
                         }
                     }
                 }
@@ -266,23 +268,21 @@ namespace Learner{
                     Algo::shuffle(sfens, prng);
                 }
 
-                // Divide this by thread_buffer_size. There should be size pieces.
-                // sfen_read_size shall be a multiple of thread_buffer_size.
-                assert((sfen_read_size % thread_buffer_size) == 0);
-
-                auto size = size_t(sfen_read_size / thread_buffer_size);
                 std::vector<std::unique_ptr<PSVector>> buffers;
-                buffers.reserve(size);
-
-                for (size_t i = 0; i < size; ++i)
+                for (size_t offset = 0; offset < sfens.size(); offset += thread_buffer_size)
                 {
+                    const size_t count =
+                        offset + thread_buffer_size > sfens.size()
+                        ? sfens.size() - offset
+                        : thread_buffer_size;
+
                     // Delete this pointer on the receiving side.
                     auto buf = std::make_unique<PSVector>();
-                    buf->resize(thread_buffer_size);
+                    buf->resize(count);
                     memcpy(
                         buf->data(),
-                        &sfens[i * thread_buffer_size],
-                        sizeof(PackedSfenValue) * thread_buffer_size);
+                        &sfens[offset],
+                        sizeof(PackedSfenValue) * count);
 
                     buffers.emplace_back(std::move(buf));
                 }
@@ -297,6 +297,8 @@ namespace Learner{
                         packed_sfens_pool.emplace_back(std::move(buf));
                 }
             }
+
+            end_of_files = true;
         }
 
     protected:

From acf95c7c98ceb67ac8511d6058813721277d6866 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 24 Dec 2020 13:30:04 +0100
Subject: [PATCH 528/583] Accumulate clipping statistics to a 64 bit integer to
 prevent overflow for larger batch sizes.

---
 src/nnue/trainer/trainer_clipped_relu.h        | 6 ++----
 src/nnue/trainer/trainer_feature_transformer.h | 6 ++----
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
index ff883afc..48dec8be 100644
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ b/src/nnue/trainer/trainer_clipped_relu.h
@@ -295,8 +295,6 @@ namespace Eval::NNUE {
         // number of samples in mini-batch
         IndexType batch_size_;
 
-        IndexType num_total_;
-
         const LearnFloatType* input_;
 
         // Trainer of the previous layer
@@ -316,8 +314,8 @@ namespace Eval::NNUE {
             // Health check statistics
             LearnFloatType min_activations_[kOutputDimensions];
             LearnFloatType max_activations_[kOutputDimensions];
-            IndexType num_clipped_;
-            IndexType num_total_;
+            uint64_t num_clipped_;
+            uint64_t num_total_;
 
             ThreadState() { reset(); }
 
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
index 0915ccca..b0e0ebba 100644
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ b/src/nnue/trainer/trainer_feature_transformer.h
@@ -690,8 +690,6 @@ namespace Eval::NNUE {
         // layer to learn
         LayerType* const target_layer_;
 
-        IndexType num_total_;
-
         // parameter
         alignas(kCacheLineSize) LearnFloatType biases_[kHalfDimensions];
         alignas(kCacheLineSize)
@@ -717,8 +715,8 @@ namespace Eval::NNUE {
             alignas(kCacheLineSize) LearnFloatType max_activations_[kHalfDimensions];
             LearnFloatType min_pre_activation_;
             LearnFloatType max_pre_activation_;
-            IndexType num_clipped_;
-            IndexType num_total_;
+            uint64_t num_clipped_;
+            uint64_t num_total_;
 
             ThreadStatState() { reset(); }
 

From 1f7e5d386170ade5feab28704fde3a1b03c17bca Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 24 Dec 2020 13:35:48 +0100
Subject: [PATCH 529/583] Add thread sanitized run for instrumented_learn and
 fix races.

---
 .travis.yml             |  1 +
 src/learn/gensfen.cpp   | 26 ++++++++++++++++++--------
 src/learn/learn.cpp     | 16 ++++++++++++----
 src/learn/sfen_reader.h |  8 +++++++-
 4 files changed, 38 insertions(+), 13 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 9dad6b1d..3a04de58 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -101,3 +101,4 @@ script:
   - export CXXFLAGS="-O1 -fno-inline"
   - make clean && make -j2 ARCH=x86-64-modern debug=no optimize=no build > /dev/null && ../tests/instrumented_learn.sh --valgrind
   - make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented_learn.sh --sanitizer-undefined
+  - make clean && make -j2 ARCH=x86-64-modern sanitize=thread    optimize=no debug=yes build > /dev/null && ../tests/instrumented_learn.sh --sanitizer-thread
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 1cc9055c..f26a619c 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -127,14 +127,20 @@ namespace Learner
             const Params& prm
         ) :
             params(prm),
-            prng(prm.seed),
             sfen_writer(prm.output_file_name, prm.num_threads, prm.save_every, prm.sfen_format)
         {
             hash.resize(GENSFEN_HASH_SIZE);
+            prngs.reserve(prm.num_threads);
+            auto seed = prm.seed;
+            for (uint64_t i = 0; i < prm.num_threads; ++i)
+            {
+                prngs.emplace_back(seed);
+                seed = prngs.back().next_random_seed();
+            }
 
             if (!prm.book.empty())
             {
-                opening_book = open_opening_book(prm.book, prng);
+                opening_book = open_opening_book(prm.book, prngs[0]);
                 if (opening_book == nullptr)
                 {
                     std::cout << "WARNING: Failed to open opening book " << prm.book << ". Falling back to startpos.\n";
@@ -142,7 +148,7 @@ namespace Learner
             }
 
             // Output seed to veryfy by the user if it's not identical by chance.
-            std::cout << prng << std::endl;
+            std::cout << prngs[0] << std::endl;
         }
 
         void generate(uint64_t limit);
@@ -150,7 +156,7 @@ namespace Learner
     private:
         Params params;
 
-        PRNG prng;
+        std::vector<PRNG> prngs;
 
         std::mutex stats_mutex;
         TimePoint last_stats_report_time;
@@ -177,9 +183,10 @@ namespace Learner
             Position& pos,
             const vector<int>& move_hist_scores) const;
 
-        vector<uint8_t> generate_random_move_flags();
+        vector<uint8_t> generate_random_move_flags(PRNG& prng);
 
         optional<Move> choose_random_move(
+            PRNG& prng,
             Position& pos,
             std::vector<uint8_t>& random_move_flag,
             int ply,
@@ -252,6 +259,8 @@ namespace Learner
 
         StateInfo si;
 
+        auto& prng = prngs[th.thread_idx()];
+
         // end flag
         bool quit = false;
 
@@ -279,7 +288,7 @@ namespace Learner
             packed_sfens.reserve(params.write_maxply + MAX_PLY);
 
             // Precomputed flags. Used internally by choose_random_move.
-            vector<uint8_t> random_move_flag = generate_random_move_flags();
+            vector<uint8_t> random_move_flag = generate_random_move_flags(prng);
 
             // A counter that keeps track of the number of random moves
             // When random_move_minply == -1, random moves are
@@ -423,7 +432,7 @@ namespace Learner
                 }
 
                 // Update the next move according to best search result or random move.
-                auto random_move = choose_random_move(pos, random_move_flag, ply, actual_random_move_count);
+                auto random_move = choose_random_move(prng, pos, random_move_flag, ply, actual_random_move_count);
                 const Move next_move = random_move.has_value() ? *random_move : search_pv[0];
 
                 // We don't have the whole game yet, but it ended,
@@ -579,7 +588,7 @@ namespace Learner
         return nullopt;
     }
 
-    vector<uint8_t> Gensfen::generate_random_move_flags()
+    vector<uint8_t> Gensfen::generate_random_move_flags(PRNG& prng)
     {
         vector<uint8_t> random_move_flag;
 
@@ -617,6 +626,7 @@ namespace Learner
     }
 
     optional<Move> Gensfen::choose_random_move(
+        PRNG& prng,
         Position& pos,
         std::vector<uint8_t>& random_move_flag,
         int ply,
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 22578ff3..60204e71 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -561,13 +561,13 @@ namespace Learner
 
         LearnerThink(const Params& prm) :
             params(prm),
-            prng(prm.seed),
+            init_prng(prm.seed),
             train_sr(
                 prm.filenames,
                 prm.shuffle,
                 SfenReaderMode::Cyclic,
                 prm.num_threads,
-                std::to_string(prng.next_random_seed()),
+                std::to_string(init_prng.next_random_seed()),
                 prm.sfen_read_size,
                 prm.thread_buffer_size),
             validation_sr(
@@ -575,7 +575,7 @@ namespace Learner
                 prm.shuffle,
                 SfenReaderMode::Cyclic,
                 1,
-                std::to_string(prng.next_random_seed()),
+                std::to_string(init_prng.next_random_seed()),
                 std::min<size_t>(prm.validation_count * 10, 1000000),
                 prm.thread_buffer_size),
             learn_loss_sum{}
@@ -589,6 +589,12 @@ namespace Learner
             total_done = 0;
             trials = params.newbob_num_trials;
             dir_number = 0;
+
+            prngs.reserve(prm.num_threads);
+            for (uint64_t i = 0; i < prm.num_threads; ++i)
+            {
+                prngs.emplace_back(init_prng.next_random_seed());
+            }
         }
 
         void learn(uint64_t epochs);
@@ -622,7 +628,8 @@ namespace Learner
 
         Params params;
 
-        PRNG prng;
+        PRNG init_prng;
+        std::vector<PRNG> prngs;
 
         // sfen reader
         SfenReader train_sr;
@@ -776,6 +783,7 @@ namespace Learner
     {
         const auto thread_id = th.thread_idx();
         auto& pos = th.rootPos;
+        auto& prng = prngs[th.thread_idx()];
 
         std::vector<StateInfo, AlignedAllocator<StateInfo>> state(MAX_PLY);
 
diff --git a/src/learn/sfen_reader.h b/src/learn/sfen_reader.h
index 8fa8cf9b..e36efcc6 100644
--- a/src/learn/sfen_reader.h
+++ b/src/learn/sfen_reader.h
@@ -61,6 +61,7 @@ namespace Learner{
             end_of_files = false;
             shuffle = do_shuffle;
             stop_flag = false;
+            num_buffers_in_pool.store(0);
 
             file_worker_thread = std::thread([&] {
                 this->file_read_worker();
@@ -147,6 +148,7 @@ namespace Learner{
 
                         packed_sfens[thread_id] = std::move(packed_sfens_pool.front());
                         packed_sfens_pool.pop_front();
+                        num_buffers_in_pool.fetch_sub(1);
 
                         total_read += thread_buffer_size;
 
@@ -224,7 +226,7 @@ namespace Learner{
             {
                 // Wait for the buffer to run out.
                 // This size() is read only, so you don't need to lock it.
-                while (!stop_flag && packed_sfens_pool.size() >= sfen_read_size / thread_buffer_size)
+                while (!stop_flag && num_buffers_in_pool.load() >= sfen_read_size / thread_buffer_size)
                     sleep(100);
 
                 if (stop_flag)
@@ -294,7 +296,10 @@ namespace Learner{
                     // contents of packed_sfens_pool are changed.
 
                     for (auto& buf : buffers)
+                    {
+                        num_buffers_in_pool.fetch_add(1);
                         packed_sfens_pool.emplace_back(std::move(buf));
+                    }
                 }
             }
 
@@ -342,5 +347,6 @@ namespace Learner{
         // Each worker thread fills its own packed_sfens[thread_id] from here.
         // * Lock and access the mutex.
         std::list<std::unique_ptr<PSVector>> packed_sfens_pool;
+        std::atomic<size_t> num_buffers_in_pool;
     };
 }

From 74774c36e19d43f56d533e16d778582239de8934 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 25 Jan 2021 11:54:12 +0100
Subject: [PATCH 530/583] Fix wrong multipv depth range. Fixes #291

---
 src/learn/gensfen.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index f26a619c..06e6295b 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -103,7 +103,6 @@ namespace Learner
             void enforce_constraints()
             {
                 search_depth_max = std::max(search_depth_min, search_depth_max);
-                random_multi_pv_depth = std::max(search_depth_min, random_multi_pv_depth);
 
                 // Limit the maximum to a one-stop score. (Otherwise you might not end the loop)
                 eval_limit = std::min(eval_limit, (int)mate_in(2));

From b68cd36708551a78c84db5febd06d5f08df0b107 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <ts.tomeksopel@gmail.com>
Date: Sun, 28 Feb 2021 12:51:45 +0100
Subject: [PATCH 531/583] 
 http://talkchess.com/forum3/viewtopic.php?f=2&t=76736&p=885254#p885254

---
 src/extra/nnue_data_binpack_format.h | 32 +++++++++++-----------------
 1 file changed, 12 insertions(+), 20 deletions(-)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index 77cf8e0a..038cb536 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -1,27 +1,19 @@
 /*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
 
-Copyright 2020 Tomasz Sobczyk
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
 
-Permission is hereby granted, free of charge,
-to any person obtaining a copy of this software
-and associated documentation files (the "Software"),
-to deal in the Software without restriction,
-including without limitation the rights to use, copy,
-modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the
-Software is furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall
-be included in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
-DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH
-THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
 
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 #pragma once

From 0ddad45ab2a6cf87edd55e04ba4ae80db02f293a Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 28 Feb 2021 15:21:49 +0100
Subject: [PATCH 532/583] Add `gather_statistics` command that allows gathering
 statistics from a .bin or .binpack file. Initially only support position
 count.

---
 docs/stats.md       |  15 ++++
 src/Makefile        |   3 +-
 src/learn/stats.cpp | 209 ++++++++++++++++++++++++++++++++++++++++++++
 src/learn/stats.h   |  12 +++
 src/uci.cpp         |   2 +
 5 files changed, 240 insertions(+), 1 deletion(-)
 create mode 100644 docs/stats.md
 create mode 100644 src/learn/stats.cpp
 create mode 100644 src/learn/stats.h

diff --git a/docs/stats.md b/docs/stats.md
new file mode 100644
index 00000000..d5a76b61
--- /dev/null
+++ b/docs/stats.md
@@ -0,0 +1,15 @@
+# Stats
+
+`gather_statistics` command allows gathering various statistics from a .bin or a .binpack file. The syntax is `gather_statistics (GROUP)* input_file FILENAME`. There can be many groups specified. Any statistic gatherer that belongs to at least one of the specified groups will be used.
+
+Simplest usage: `stockfish.exe gather_statistics all input_file a.binpack`
+
+## Groups
+
+`all`
+
+ - A special group designating all statistics gatherers available.
+
+`position_count`
+
+ - `struct PositionCounter` - the total number of positions in the file.
diff --git a/src/Makefile b/src/Makefile
index 586656d3..a4ced5f0 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -66,7 +66,8 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
 	learn/gensfen_nonpv.cpp \
 	learn/opening_book.cpp \
 	learn/convert.cpp \
-	learn/transform.cpp
+	learn/transform.cpp \
+	learn/stats.cpp
 
 OBJS = $(notdir $(SRCS:.cpp=.o))
 
diff --git a/src/learn/stats.cpp b/src/learn/stats.cpp
new file mode 100644
index 00000000..9d9589c4
--- /dev/null
+++ b/src/learn/stats.cpp
@@ -0,0 +1,209 @@
+#include "stats.h"
+
+#include "sfen_stream.h"
+#include "packed_sfen.h"
+#include "sfen_writer.h"
+
+#include "thread.h"
+#include "position.h"
+#include "evaluate.h"
+#include "search.h"
+
+#include "nnue/evaluate_nnue.h"
+
+#include <string>
+#include <map>
+#include <iostream>
+#include <cmath>
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+#include <mutex>
+#include <optional>
+
+namespace Learner::Stats
+{
+    struct StatisticGathererBase
+    {
+        virtual void on_position(const Position&) {}
+        virtual void on_move(const Move&) {}
+        virtual void reset() = 0;
+        [[nodiscard]] virtual std::map<std::string, std::string> get_formatted_stats() const = 0;
+    };
+
+    struct PositionCounter : StatisticGathererBase
+    {
+        PositionCounter() :
+            m_num_positions(0)
+        {
+        }
+
+        void on_position(const Position&) override
+        {
+            m_num_positions += 1;
+        }
+
+        void reset() override
+        {
+            m_num_positions = 0;
+        }
+
+        [[nodiscard]] std::map<std::string, std::string> get_formatted_stats() const override
+        {
+            return {
+                { "Number of positions", std::to_string(m_num_positions) }
+            };
+        }
+
+    private:
+        std::uint64_t m_num_positions;
+    };
+
+    struct StatisticGathererFactoryBase
+    {
+        [[nodiscard]] virtual std::unique_ptr<StatisticGathererBase> create() const = 0;
+    };
+
+    template <typename T>
+    struct StatisticGathererFactory : StatisticGathererFactoryBase
+    {
+        [[nodiscard]] std::unique_ptr<StatisticGathererBase> create() const override
+        {
+            return std::make_unique<T>();
+        }
+    };
+
+    struct StatisticGathererRegistry
+    {
+        void add_statistic_gatherers_by_group(
+            std::vector<std::unique_ptr<StatisticGathererBase>>& gatherers,
+            const std::string& group) const
+        {
+            auto it = m_gatherers_by_group.find(group);
+            if (it != m_gatherers_by_group.end())
+            {
+                for (auto& factory : it->second)
+                {
+                    gatherers.emplace_back(factory->create());
+                }
+            }
+        }
+
+        template <typename T>
+        void add(const std::string& group)
+        {
+            m_gatherers_by_group[group].emplace_back(std::make_unique<StatisticGathererFactory<T>>());
+
+            // Always add to the special group "all".
+            m_gatherers_by_group["all"].emplace_back(std::make_unique<StatisticGathererFactory<T>>());
+        }
+
+    private:
+        std::map<std::string, std::vector<std::unique_ptr<StatisticGathererFactoryBase>>> m_gatherers_by_group;
+    };
+
+    const auto& get_statistics_gatherers_registry()
+    {
+        static StatisticGathererRegistry s_reg = [](){
+            StatisticGathererRegistry reg;
+
+            reg.add<PositionCounter>("position_count");
+
+            return reg;
+        }();
+
+        return s_reg;
+    }
+
+    void do_gather_statistics(
+        const std::string& filename,
+        std::vector<std::unique_ptr<StatisticGathererBase>>& statistic_gatherers)
+    {
+        Thread* th = Threads.main();
+        Position& pos = th->rootPos;
+        StateInfo si;
+
+        auto in = Learner::open_sfen_input_file(filename);
+
+        auto on_move = [&](Move move) {
+            for (auto&& s : statistic_gatherers)
+            {
+                s->on_move(move);
+            }
+        };
+
+        auto on_position = [&](const Position& position) {
+            for (auto&& s : statistic_gatherers)
+            {
+                s->on_position(position);
+            }
+        };
+
+        if (in == nullptr)
+        {
+            std::cerr << "Invalid input file type.\n";
+            return;
+        }
+
+        uint64_t num_processed = 0;
+        for (;;)
+        {
+            auto v = in->next();
+            if (!v.has_value())
+                break;
+
+            auto& ps = v.value();
+
+            pos.set_from_packed_sfen(ps.sfen, &si, th);
+
+            on_position(pos);
+            on_move((Move)ps.move);
+
+            num_processed += 1;
+            if (num_processed % 1'000'000 == 0)
+            {
+                std::cout << "Processed " << num_processed << " positions.\n";
+            }
+        }
+
+        std::cout << "Finished gathering statistics.\n\n";
+        std::cout << "Results:\n\n";
+
+        for (auto&& s : statistic_gatherers)
+        {
+            for (auto&& [name, value] : s->get_formatted_stats())
+            {
+                std::cout << name << ": " << value << '\n';
+            }
+            std::cout << '\n';
+        }
+    }
+
+    void gather_statistics(std::istringstream& is)
+    {
+        Eval::NNUE::init();
+
+        auto& registry = get_statistics_gatherers_registry();
+
+        std::vector<std::unique_ptr<StatisticGathererBase>> statistic_gatherers;
+
+        std::string input_file;
+
+        while(true)
+        {
+            std::string token;
+            is >> token;
+
+            if (token == "")
+                break;
+
+            if (token == "input_file")
+                is >> input_file;
+            else
+                registry.add_statistic_gatherers_by_group(statistic_gatherers, token);
+        }
+
+        do_gather_statistics(input_file, statistic_gatherers);
+    }
+
+}
diff --git a/src/learn/stats.h b/src/learn/stats.h
new file mode 100644
index 00000000..c9a71e5a
--- /dev/null
+++ b/src/learn/stats.h
@@ -0,0 +1,12 @@
+#ifndef _STATS_H_
+#define _STATS_H_
+
+#include <sstream>
+
+namespace Learner::Stats {
+
+    void gather_statistics(std::istringstream& is);
+
+}
+
+#endif
diff --git a/src/uci.cpp b/src/uci.cpp
index 55fccea7..7da2881f 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -40,6 +40,7 @@
 #include "learn/learn.h"
 #include "learn/convert.h"
 #include "learn/transform.h"
+#include "learn/stats.h"
 
 using namespace std;
 
@@ -349,6 +350,7 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "convert_plain") Learner::convert_plain(is);
       else if (token == "convert_bin_from_pgn_extract") Learner::convert_bin_from_pgn_extract(is);
       else if (token == "transform") Learner::transform(is);
+      else if (token == "gather_statistics") Learner::Stats::gather_statistics(is);
 
       // Command to call qsearch(),search() directly for testing
       else if (token == "qsearch") qsearch_cmd(pos);

From 03b888e118190e3db7888bc2d80b9889c86c4be5 Mon Sep 17 00:00:00 2001
From: QuackQuackBlah <77398316+QuackQuackBlah@users.noreply.github.com>
Date: Mon, 8 Mar 2021 20:29:33 -0800
Subject: [PATCH 533/583] Update gensfen_nonpv.md

Fixes typo.
---
 docs/gensfen_nonpv.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/gensfen_nonpv.md b/docs/gensfen_nonpv.md
index 0459d607..0814bd60 100644
--- a/docs/gensfen_nonpv.md
+++ b/docs/gensfen_nonpv.md
@@ -18,7 +18,7 @@ Currently the following options are available:
 
 `exploration_min_nodes` - the min number of nodes to use for exploraton during selfplay. Default: 5000.
 
-`exploration_max_nodes` - the min number of nodes to use for exploraton during selfplay. The number of nodes is chosen from a uniform distribution between min and max. Default: 15000.
+`exploration_max_nodes` - the max number of nodes to use for exploraton during selfplay. The number of nodes is chosen from a uniform distribution between min and max. Default: 15000.
 
 `exploration_save_rate` - the ratio of positions seen during exploration self play games that are saved for later rescoring. Default: 0.01 (meaning 1 in 100 positions seen during search get saved for rescoring).
 

From 591609c262a2d43ebc08ed35a177191f7e534cee Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 14 Mar 2021 13:11:26 +0100
Subject: [PATCH 534/583] Fix relation between halfmove and fullmove clocks.

---
 src/extra/nnue_data_binpack_format.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index 038cb536..dce53b83 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -4506,12 +4506,12 @@ namespace chess
 
         [[nodiscard]] inline std::uint16_t fullMove() const
         {
-            return (m_ply + 1) / 2;
+            return m_ply / 2 + 1;
         }
 
         inline void setFullMove(std::uint16_t hm)
         {
-            m_ply = 2 * hm - 1 + (m_sideToMove == Color::Black);
+            m_ply = 2 * (hm - 1) + (m_sideToMove == Color::Black);
         }
 
         [[nodiscard]] inline bool isCheck() const;
@@ -5979,7 +5979,7 @@ namespace chess
             const auto fullMove = nextPart();
             if (!fullMove.empty())
             {
-                m_ply = std::stoi(fullMove.data()) * 2 - (m_sideToMove == Color::White);
+                m_ply = 2 * (std::stoi(fullMove.data()) - 1) + (m_sideToMove == Color::Black);
             }
             else
             {

From 5fdb48a7cb4c8b34232c99530b862a3e1a672309 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 14 Mar 2021 14:12:35 +0100
Subject: [PATCH 535/583] Change some `learn` parameter naming. Update docs.

---
 docs/learn.md       | 40 +++++++++++++++++++++++++---------------
 src/learn/learn.cpp | 16 ++++++++++++----
 2 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/docs/learn.md b/docs/learn.md
index fe88e7e8..d7ae329d 100644
--- a/docs/learn.md
+++ b/docs/learn.md
@@ -16,30 +16,22 @@ Currently the following options are available:
 
 `set_recommended_uci_options` - this is a modifier not a parameter, no value follows it. If specified then some UCI options are set to recommended values.
 
-`bat` - the size of a batch in multiples of 10000. This determines how many entries are read and shuffled at once during training. Default: 100 (meaning batch size of 1000000).
-
 `targetdir` - path to the direction from which training data will be read. All files in this directory are read sequentially. If not specified then only the list of files from positional arguments will be used. If specified then files from the given directory will be used after the explicitly specified files.
 
-`epochs` - the number of weight update cycles (epochs) to train the network for. One such cycle is `batchsize` positions. If not specified then the training will loop forever.
+`epochs` - the number of weight update cycles (epochs) to train the network for. One such cycle is `epoch_size` positions. If not specified then the training will loop forever.
+
+`epoch_size` - The number of positions per epoch. Should be kept lowish as the current implementation loads all into memory before processing. Default is already high enough. The epoch size is not tied to validation nor net serialization, there are more specific options for that. Default: 1000000
 
 `basedir` - the base directory for the paths. Default: "" (current directory)
 
-`batchsize` - same as `bat` but doesn't scale by 10000. Default: 1000000
-
 `lr` - initial learning rate. Default: 1.
 
 `use_draw_games_in_training` - either 0 or 1. If 1 then draws will be used in training too. Default: 1.
 
-`use_draw_in_training` - deprecated, alias for `use_draw_games_in_training`
-
 `use_draw_games_in_validation` - either 0 or 1. If 1 then draws will be used in validation too. Default: 1.
 
-`use_draw_in_validation` - deprecated, alias for `use_draw_games_in_validation`
-
 `skip_duplicated_positions_in_training` - either 0 or 1. If 1 then a small hashtable will be used to try to eliminate duplicated position from training. Default: 0.
 
-`use_hash_in_training` - deprecated, alias for `skip_duplicated_positions_in_training`
-
 `winning_probability_coefficient` - some magic value for winning probability. If you need to read this then don't touch it. Default: 1.0 / PawnValueEg / 4.0 * std::log(10.0)
 
 `use_wdl` - either 0 or 1. If 1 then the evaluations will be converted to win/draw/loss percentages prior to learning on them. (Slightly changes the gradient because eval has a different derivative than wdl). Default: 0.
@@ -60,17 +52,17 @@ Currently the following options are available:
 
 `no_shuffle` - this is a modifier not a parameter, no value follows it. If specified then data within a batch won't be shuffled.
 
-`nn_batch_size` - minibatch size used for learning. Should be smaller than batch size. Default: 1000.
+`batch_size` - the number of positions per one learning step. Default: 1000
 
-`newbob_decay` - learning rate will be multiplied by this factor every time a net is rejected (so in other words it controls LR drops). Default: 0.5 (no LR drops)
+`lr_step` - learning rate will be multiplied by this factor every time a net is rejected (so in other words it controls LR drops). Default: 0.5 (no LR drops)
 
 `assume_quiet` - this is a flag option. When specified learn will not perform qsearch to reach a quiet position.
 
-`smart_fen_skipping` - this is a flag option. When specified some position that are not good candidates for teaching are skipped. This includes positions where the best move is a capture or promotion, and position where a king is in check. Default: 1.
+`smart_fen_skipping` - this is a flag option. When specified some position that are not good candidates for teaching are skipped. This includes positions where the best move is a capture or promotion, and position where a king is in check. Default: 0.
 
 `smart_fen_skipping_for_validation` - same as `smart_fen_skipping` but applies to validation data set. Default: 0.
 
-`newbob_num_trials` - determines after how many subsequent rejected nets the training process will be terminated. Default: 4.
+`max_consecutive_rejections` - determines after how many subsequent rejected nets the training process will be terminated. Default: 4.
 
 `auto_lr_drop` - every time this many positions are processed the learning rate is multiplied by `newbob_decay`. In other words this value specifies for how many positions a single learning rate stage lasts. If 0 then doesn't have any effect. Default: 0.
 
@@ -92,6 +84,24 @@ Currently the following options are available:
 
 `verbose` - this is a modifier, not a parameter. When used there will be more detailed output during training.
 
+### Deprecated options
+
+`bat` (deprecated) - the size of a batch in multiples of 10000. This determines how many entries are read and shuffled at once during training. Default: 100 (meaning batch size of 1000000).
+
+`newbob_num_trials` (deprecated) - same as `max_consecutive_rejections`
+
+`newbob_decay` (deprecated) - same as `lr_step`
+
+`nn_batch_size` (deprecated) - same as `batch_size`
+
+`use_hash_in_training` (deprecated) - alias for `skip_duplicated_positions_in_training`
+
+`batchsize` (deprecated) - same as `epoch_size`
+
+`use_draw_in_training` (deprecated) - alias for `use_draw_games_in_training`
+
+`use_draw_in_validation` (deprecated) - alias for `use_draw_games_in_validation`
+
 ## Legacy subcommands and parameters
 
 ### Convert
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index 60204e71..e17537ff 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -1221,7 +1221,9 @@ namespace Learner
             else if (option == "basedir") is >> base_dir;
 
             // Mini batch size
-            else if (option == "batchsize") is >> params.mini_batch_size;
+            else if (option == "batchsize"
+                  || option == "epoch_size")
+                is >> params.mini_batch_size;
 
             // learning rate
             else if (option == "lr") is >> params.learning_rate;
@@ -1260,9 +1262,15 @@ namespace Learner
             else if (option == "save_only_once") params.save_only_once = true;
             else if (option == "no_shuffle") params.shuffle = false;
 
-            else if (option == "nn_batch_size") is >> nn_batch_size;
-            else if (option == "newbob_decay") is >> params.newbob_decay;
-            else if (option == "newbob_num_trials") is >> params.newbob_num_trials;
+            else if (option == "nn_batch_size"
+                  || option == "batch_size")
+                is >> nn_batch_size;
+            else if (option == "newbob_decay"
+                  || option == "lr_step")
+                is >> params.newbob_decay;
+            else if (option == "newbob_num_trials"
+                  || option == "max_consecutive_rejections")
+                is >> params.newbob_num_trials;
             else if (option == "nn_options") is >> nn_options;
             else if (option == "auto_lr_drop") is >> params.auto_lr_drop;
 

From bbe338b9fcab3ee7f071303ca35956ad667cc6b2 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 25 Mar 2021 14:00:00 +0100
Subject: [PATCH 536/583] Add random move accuracy for comparison.

---
 src/learn/learn.cpp | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index e17537ff..cf19bcc2 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -616,7 +616,8 @@ namespace Learner
             const PSVector& psv,
             Loss& test_loss_sum,
             atomic<double>& sum_norm,
-            atomic<int>& move_accord_count
+            atomic<int>& move_accord_count,
+            atomic<double>& sum_one_over_move_count
         );
 
         bool has_depth1_move_agreement(Position& pos, Move pvmove);
@@ -931,6 +932,12 @@ namespace Learner
         // search matches the pv first move of search(1).
         atomic<int> move_accord_count{0};
 
+        // If there is 10 legal moves then 0.1 will be added.
+        // This happens for each position tested.
+        // Effectively at the end we have the random move accuracy
+        // multiplied by the number of positions, which is psv.size()
+        atomic<double> sum_one_over_move_count{0.0};
+
         auto mainThread = Threads.main();
         mainThread->execute_with_worker([&out](auto& th){
             auto& pos = th.rootPos;
@@ -949,7 +956,8 @@ namespace Learner
                 psv,
                 test_loss_sum,
                 sum_norm,
-                move_accord_count
+                move_accord_count,
+                sum_one_over_move_count
             );
         });
         Threads.wait_for_workers_finished();
@@ -968,6 +976,7 @@ namespace Learner
 
             out << "  - norm = " << sum_norm << endl;
             out << "  - move accuracy = " << (move_accord_count * 100.0 / psv.size()) << "%" << endl;
+            out << "  - random move accuracy = " << (sum_one_over_move_count * 100.0 / psv.size()) << "%" << endl;
         }
         else
         {
@@ -983,10 +992,12 @@ namespace Learner
         const PSVector& psv,
         Loss& test_loss_sum,
         atomic<double>& sum_norm,
-        atomic<int>& move_accord_count
+        atomic<int>& move_accord_count,
+        atomic<double>& sum_one_over_move_count
     )
     {
         Loss local_loss_sum{};
+        double local_sum_one_over_move_count = 0.0;
         auto& pos = th.rootPos;
 
         for(;;)
@@ -1022,8 +1033,11 @@ namespace Learner
             // Threat all moves with equal scores as first. This is up to move ordering.
             if (has_depth1_move_agreement(pos, (Move)ps.move))
                 move_accord_count.fetch_add(1, std::memory_order_relaxed);
+
+            local_sum_one_over_move_count += 1.0 / static_cast<double>(MoveList<LEGAL>(pos).size());
         }
 
+        sum_one_over_move_count += local_sum_one_over_move_count;
         test_loss_sum += local_loss_sum;
     }
 

From 876902070d1dd0e1493ad276a0aaae1b4d5c4793 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 25 Mar 2021 14:41:24 +0100
Subject: [PATCH 537/583] Add optional warmup step for training.

Specified with `warmup_epochs`, uses `warmup_lr`.
The purpose is to put the net into a somewhat stable state so that the gradients are not as high during the early stages of the training and don't "accidentally" break the net.
---
 docs/learn.md       |  4 +++
 src/learn/learn.cpp | 59 ++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 57 insertions(+), 6 deletions(-)

diff --git a/docs/learn.md b/docs/learn.md
index d7ae329d..ab2b1861 100644
--- a/docs/learn.md
+++ b/docs/learn.md
@@ -20,12 +20,16 @@ Currently the following options are available:
 
 `epochs` - the number of weight update cycles (epochs) to train the network for. One such cycle is `epoch_size` positions. If not specified then the training will loop forever.
 
+`warmup_epochs` - the number of epochs to "pretrain" the net for with `warmup_lr` learning rate. Default: 0.
+
 `epoch_size` - The number of positions per epoch. Should be kept lowish as the current implementation loads all into memory before processing. Default is already high enough. The epoch size is not tied to validation nor net serialization, there are more specific options for that. Default: 1000000
 
 `basedir` - the base directory for the paths. Default: "" (current directory)
 
 `lr` - initial learning rate. Default: 1.
 
+`warmup_lr` - the learning rate to use during warmup epochs. Default: 0.1.
+
 `use_draw_games_in_training` - either 0 or 1. If 1 then draws will be used in training too. Default: 1.
 
 `use_draw_games_in_validation` - either 0 or 1. If 1 then draws will be used in validation too. Default: 1.
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
index cf19bcc2..9c4546a6 100644
--- a/src/learn/learn.cpp
+++ b/src/learn/learn.cpp
@@ -526,6 +526,7 @@ namespace Learner
             bool smart_fen_skipping_for_validation = false;
 
             double learning_rate = 1.0;
+            double warmup_learning_rate = 0.1;
             double max_grad = 1.0;
 
             string validation_set_file_name;
@@ -597,7 +598,7 @@ namespace Learner
             }
         }
 
-        void learn(uint64_t epochs);
+        void learn(uint64_t epochs, uint64_t warmup_epochs = 0);
 
     private:
         static void set_learning_search_limits();
@@ -607,6 +608,7 @@ namespace Learner
         void learn_worker(Thread& th, std::atomic<uint64_t>& counter, uint64_t limit);
 
         void update_weights(const PSVector& psv, uint64_t epoch);
+        void update_weights_warmup(uint64_t warmup_epoch);
 
         void calc_loss(const PSVector& psv, uint64_t epoch);
 
@@ -716,7 +718,7 @@ namespace Learner
         return validation_data;
     }
 
-    void LearnerThink::learn(uint64_t epochs)
+    void LearnerThink::learn(uint64_t epochs, uint64_t warmup_epochs)
     {
 #if defined(_OPENMP)
         omp_set_num_threads((int)Options["Threads"]);
@@ -740,6 +742,36 @@ namespace Learner
             return;
         }
 
+        stop_flag = false;
+
+        if (warmup_epochs > 0)
+        {
+            cout << "Doing " << warmup_epochs << " warmup epochs." << endl;
+        }
+
+        for(uint64_t warmup_epoch = 1; warmup_epoch <= warmup_epochs; ++warmup_epoch)
+        {
+            std::atomic<uint64_t> counter{0};
+
+            Threads.execute_with_workers([this, &counter](auto& th){
+                learn_worker(th, counter, params.mini_batch_size);
+            });
+
+            total_done += params.mini_batch_size;
+
+            Threads.wait_for_workers_finished();
+
+            if (stop_flag)
+                break;
+
+            update_weights_warmup(warmup_epoch);
+
+            if (stop_flag)
+                break;
+
+            cout << "Finished " << warmup_epoch << " out of " << warmup_epochs << " warmup epochs." << endl;
+        }
+
         if (params.newbob_decay != 1.0) {
 
             calc_loss(validation_data, 0);
@@ -752,8 +784,6 @@ namespace Learner
             out << "INFO (learn): initial loss = " << best_loss << endl;
         }
 
-        stop_flag = false;
-
         for(uint64_t epoch = 1; epoch <= epochs; ++epoch)
         {
             std::atomic<uint64_t> counter{0};
@@ -873,6 +903,17 @@ namespace Learner
         }
     }
 
+    void LearnerThink::update_weights_warmup(uint64_t warmup_epoch)
+    {
+        // I'm not sure this fencing is correct. But either way there
+        // should be no real issues happening since
+        // the read/write phases are isolated.
+        atomic_thread_fence(memory_order_seq_cst);
+        Eval::NNUE::update_parameters(
+            Threads, warmup_epoch, params.verbose, params.warmup_learning_rate, params.max_grad, get_loss);
+        atomic_thread_fence(memory_order_seq_cst);
+    }
+
     void LearnerThink::update_weights(const PSVector& psv, uint64_t epoch)
     {
         // I'm not sure this fencing is correct. But either way there
@@ -1192,6 +1233,7 @@ namespace Learner
 
         // Number of epochs
         uint64_t epochs = std::numeric_limits<uint64_t>::max();
+        uint64_t warmup_epochs = 0;
 
         // Game file storage folder (get game file with relative path from here)
         string base_dir;
@@ -1230,6 +1272,7 @@ namespace Learner
 
             // Specify the number of loops
             else if (option == "epochs") is >> epochs;
+            else if (option == "warmup_epochs") is >> warmup_epochs;
 
             // Game file storage folder (get game file with relative path from here)
             else if (option == "basedir") is >> base_dir;
@@ -1241,6 +1284,7 @@ namespace Learner
 
             // learning rate
             else if (option == "lr") is >> params.learning_rate;
+            else if (option == "warmup_lr") is >> params.warmup_learning_rate;
             else if (option == "max_grad") is >> params.max_grad;
 
             // Accept also the old option name.
@@ -1352,7 +1396,9 @@ namespace Learner
 
         out << "  - validation count         : " << params.validation_count << endl;
         out << "  - epochs                   : " << epochs << endl;
-        out << "  - epochs * minibatch size  : " << epochs * params.mini_batch_size << endl;
+        out << "  - positions                : " << epochs * params.mini_batch_size << endl;
+        out << "  - warmup epochs            : " << warmup_epochs << endl;
+        out << "  - warmup positions         : " << warmup_epochs * params.mini_batch_size << endl;
         out << "  - eval_limit               : " << params.eval_limit << endl;
         out << "  - save_only_once           : " << (params.save_only_once ? "true" : "false") << endl;
         out << "  - shuffle on read          : " << (params.shuffle ? "true" : "false") << endl;
@@ -1364,6 +1410,7 @@ namespace Learner
         out << "  - nn_options               : " << nn_options << endl;
 
         out << "  - learning rate            : " << params.learning_rate << endl;
+        out << "  - warmup learning rate     : " << params.warmup_learning_rate << endl;
         out << "  - max_grad                 : " << params.max_grad << endl;
         out << "  - use draws in training    : " << params.use_draw_games_in_training << endl;
         out << "  - use draws in validation  : " << params.use_draw_games_in_validation << endl;
@@ -1421,7 +1468,7 @@ namespace Learner
         out.unlock();
 
         // Start learning.
-        learn_think.learn(epochs);
+        learn_think.learn(epochs, warmup_epochs);
     }
 
 } // namespace Learner

From 6afcdaa928c4b7a9711b3df53d4cbb70616975f5 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 3 Apr 2021 16:15:51 +0200
Subject: [PATCH 538/583] Add additional checks for en-passant possiblity when
 fixing the erroneus ep flag from a fen.

---
 src/extra/nnue_data_binpack_format.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index dce53b83..b957fae6 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -6115,6 +6115,26 @@ namespace chess
             return false;
         }
 
+        if (pieceAt(epSquare) != Piece::none())
+        {
+            return false;
+        }
+
+        const auto forward =
+            sideToMove == chess::Color::White
+            ? FlatSquareOffset(0, 1)
+            : FlatSquareOffset(0, -1);
+
+        if (pieceAt(epSquare + forward) != Piece::none())
+        {
+            return false;
+        }
+
+        if (pieceAt(epSquare + -forward) != Piece(PieceType::Pawn, !sideToMove))
+        {
+            return false;
+        }
+
         return isEpPossibleColdPath(epSquare, pawnsAttackingEpSquare, sideToMove);
     }
 

From 5bb6cdf7ba732f2f108451214a3eed243fc14024 Mon Sep 17 00:00:00 2001
From: fsmosca <fsmosca@users.noreply.github.com>
Date: Mon, 5 Apr 2021 13:29:49 +0800
Subject: [PATCH 539/583] Update gensfen.cpp

* Terminate game by 3-fold repetition.
* Fix segmentation fault by properly initializing the random_multi_pv_depth.
---
 src/learn/gensfen.cpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index 06e6295b..b28afa13 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -110,6 +110,8 @@ namespace Learner
                 save_every = std::max(save_every, REPORT_STATS_EVERY);
 
                 num_threads = Options["Threads"];
+
+                random_multi_pv_depth = std::max(search_depth_max, random_multi_pv_depth);
             }
         };
 
@@ -489,8 +491,11 @@ namespace Learner
         // draw at the maximum number of steps to write.
         const int ply = move_hist_scores.size();
 
-        // has it reached the max length or is a draw
-        if (ply >= params.write_maxply || pos.is_draw(ply))
+        // has it reached the max length or is a draw by fifty-move rule
+        // or by 3-fold repetition
+        if (ply >= params.write_maxply 
+            || pos.is_fifty_move_draw() 
+            || pos.is_three_fold_repetition())
         {
             return 0;
         }

From f57af4d203fd86a35f088786ba06531539c2190b Mon Sep 17 00:00:00 2001
From: fsmosca <fsmosca@users.noreply.github.com>
Date: Mon, 5 Apr 2021 13:31:21 +0800
Subject: [PATCH 540/583] Update position.cpp

* Add is_fifty_move_draw() and is_three_fold_repetition for gensfen()
---
 src/position.cpp | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/position.cpp b/src/position.cpp
index 934c1403..1b5ff222 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -1180,6 +1180,22 @@ bool Position::is_draw(int ply) const {
 }
 
 
+/// Position::is_fifty_move_draw() returns true if a game can be claimed
+/// by a fifty-move draw rule.
+
+bool Position::is_fifty_move_draw() const {
+
+  return (st->rule50 > 99 && (!checkers() || MoveList<LEGAL>(*this).size()));
+}
+
+
+/// Position::is_three_fold_repetition() returns true if there is 3-fold repetition.
+bool Position::is_three_fold_repetition() const {
+
+  return st->repetition < 0;
+}
+
+
 // Position::has_repeated() tests whether there has been at least one repetition
 // of positions since the last capture or pawn move.
 

From 560daefb01639fe04c185756051b9b9bdb0b57ef Mon Sep 17 00:00:00 2001
From: fsmosca <fsmosca@users.noreply.github.com>
Date: Mon, 5 Apr 2021 13:31:49 +0800
Subject: [PATCH 541/583] Update position.h

* Add is_fifty_move_draw() and is_three_fold_repetition for gensfen()
---
 src/position.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/position.h b/src/position.h
index e7513eb1..fe5be374 100644
--- a/src/position.h
+++ b/src/position.h
@@ -161,6 +161,8 @@ public:
   bool is_chess960() const;
   Thread* this_thread() const;
   bool is_draw(int ply) const;
+  bool is_fifty_move_draw() const;
+  bool is_three_fold_repetition() const;
   bool has_game_cycle(int ply) const;
   bool has_repeated() const;
   int rule50_count() const;

From 83651099725b6c9ddfc45941e02e8aea7c383944 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 5 Apr 2021 12:37:11 +0200
Subject: [PATCH 542/583] Revert "Add additional checks for en-passant
 possiblity when fixing the erroneus ep flag from a fen."

This reverts commit 6afcdaa928c4b7a9711b3df53d4cbb70616975f5.
---
 src/extra/nnue_data_binpack_format.h | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index b957fae6..dce53b83 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -6115,26 +6115,6 @@ namespace chess
             return false;
         }
 
-        if (pieceAt(epSquare) != Piece::none())
-        {
-            return false;
-        }
-
-        const auto forward =
-            sideToMove == chess::Color::White
-            ? FlatSquareOffset(0, 1)
-            : FlatSquareOffset(0, -1);
-
-        if (pieceAt(epSquare + forward) != Piece::none())
-        {
-            return false;
-        }
-
-        if (pieceAt(epSquare + -forward) != Piece(PieceType::Pawn, !sideToMove))
-        {
-            return false;
-        }
-
         return isEpPossibleColdPath(epSquare, pawnsAttackingEpSquare, sideToMove);
     }
 

From f85dbc3fe316b7c9c09d2d10098b0caa14b6552a Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Tue, 30 Mar 2021 14:56:29 +0200
Subject: [PATCH 543/583] Reorder code and add important comments.

---
 src/learn/stats.cpp | 64 +++++++++++++++++++++++++--------------------
 1 file changed, 36 insertions(+), 28 deletions(-)

diff --git a/src/learn/stats.cpp b/src/learn/stats.cpp
index 9d9589c4..d8224760 100644
--- a/src/learn/stats.cpp
+++ b/src/learn/stats.cpp
@@ -31,34 +31,6 @@ namespace Learner::Stats
         [[nodiscard]] virtual std::map<std::string, std::string> get_formatted_stats() const = 0;
     };
 
-    struct PositionCounter : StatisticGathererBase
-    {
-        PositionCounter() :
-            m_num_positions(0)
-        {
-        }
-
-        void on_position(const Position&) override
-        {
-            m_num_positions += 1;
-        }
-
-        void reset() override
-        {
-            m_num_positions = 0;
-        }
-
-        [[nodiscard]] std::map<std::string, std::string> get_formatted_stats() const override
-        {
-            return {
-                { "Number of positions", std::to_string(m_num_positions) }
-            };
-        }
-
-    private:
-        std::uint64_t m_num_positions;
-    };
-
     struct StatisticGathererFactoryBase
     {
         [[nodiscard]] virtual std::unique_ptr<StatisticGathererBase> create() const = 0;
@@ -102,6 +74,42 @@ namespace Learner::Stats
         std::map<std::string, std::vector<std::unique_ptr<StatisticGathererFactoryBase>>> m_gatherers_by_group;
     };
 
+    /*
+        Definitions for specific statistic gatherers follow:
+    */
+
+    struct PositionCounter : StatisticGathererBase
+    {
+        PositionCounter() :
+            m_num_positions(0)
+        {
+        }
+
+        void on_position(const Position&) override
+        {
+            m_num_positions += 1;
+        }
+
+        void reset() override
+        {
+            m_num_positions = 0;
+        }
+
+        [[nodiscard]] std::map<std::string, std::string> get_formatted_stats() const override
+        {
+            return {
+                { "Number of positions", std::to_string(m_num_positions) }
+            };
+        }
+
+    private:
+        std::uint64_t m_num_positions;
+    };
+
+    /*
+        This function provides factories for all possible statistic gatherers.
+        Each new statistic gatherer needs to be added there.
+    */
     const auto& get_statistics_gatherers_registry()
     {
         static StatisticGathererRegistry s_reg = [](){

From 7d74185d0be2665f9d18e959c2b77be21b1605a9 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Tue, 30 Mar 2021 15:00:34 +0200
Subject: [PATCH 544/583] Add max_count parameter to limit the number of
 positions read.

---
 docs/stats.md       |  8 ++++++++
 src/learn/stats.cpp | 10 +++++++---
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/docs/stats.md b/docs/stats.md
index d5a76b61..2a2d3a07 100644
--- a/docs/stats.md
+++ b/docs/stats.md
@@ -4,6 +4,14 @@
 
 Simplest usage: `stockfish.exe gather_statistics all input_file a.binpack`
 
+Any name that doesn't designate an argument name or is not an argument will be interpreted as a group name.
+
+## Parameters
+
+`input_file` - the path to the .bin or .binpack input file to read
+
+`max_count` - the maximum number of positions to process. Default: no limit.
+
 ## Groups
 
 `all`
diff --git a/src/learn/stats.cpp b/src/learn/stats.cpp
index d8224760..8efbeb6f 100644
--- a/src/learn/stats.cpp
+++ b/src/learn/stats.cpp
@@ -125,7 +125,8 @@ namespace Learner::Stats
 
     void do_gather_statistics(
         const std::string& filename,
-        std::vector<std::unique_ptr<StatisticGathererBase>>& statistic_gatherers)
+        std::vector<std::unique_ptr<StatisticGathererBase>>& statistic_gatherers,
+        std::uint64_t max_count)
     {
         Thread* th = Threads.main();
         Position& pos = th->rootPos;
@@ -154,7 +155,7 @@ namespace Learner::Stats
         }
 
         uint64_t num_processed = 0;
-        for (;;)
+        while (num_processed < max_count)
         {
             auto v = in->next();
             if (!v.has_value())
@@ -196,6 +197,7 @@ namespace Learner::Stats
         std::vector<std::unique_ptr<StatisticGathererBase>> statistic_gatherers;
 
         std::string input_file;
+        std::uint64_t max_count = std::numeric_limits<std::uint64_t>::max();
 
         while(true)
         {
@@ -207,11 +209,13 @@ namespace Learner::Stats
 
             if (token == "input_file")
                 is >> input_file;
+            else if (token == "max_count")
+                is >> max_count;
             else
                 registry.add_statistic_gatherers_by_group(statistic_gatherers, token);
         }
 
-        do_gather_statistics(input_file, statistic_gatherers);
+        do_gather_statistics(input_file, statistic_gatherers, max_count);
     }
 
 }

From 570a0f6f3c6b012e89744657e08f34e9855b1c5f Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 5 Apr 2021 16:12:47 +0200
Subject: [PATCH 545/583] Per square stats utility

---
 src/learn/stats.cpp | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/src/learn/stats.cpp b/src/learn/stats.cpp
index 8efbeb6f..ce85caa0 100644
--- a/src/learn/stats.cpp
+++ b/src/learn/stats.cpp
@@ -11,12 +11,15 @@
 
 #include "nnue/evaluate_nnue.h"
 
+#include <array>
 #include <string>
 #include <map>
 #include <iostream>
 #include <cmath>
 #include <algorithm>
 #include <cstdint>
+#include <sstream>
+#include <iomanip>
 #include <limits>
 #include <mutex>
 #include <optional>
@@ -74,6 +77,44 @@ namespace Learner::Stats
         std::map<std::string, std::vector<std::unique_ptr<StatisticGathererFactoryBase>>> m_gatherers_by_group;
     };
 
+    /*
+        Statistic gatherer helpers
+    */
+
+    template <typename T>
+    struct StatPerSquare
+    {
+        StatPerSquare()
+        {
+            for (int i = 0; i < SQUARE_NB; ++i)
+                m_squares[i] = 0;
+        }
+
+        [[nodiscard]] T& operator[](Square sq)
+        {
+            return m_squares[sq];
+        }
+
+        [[nodiscard]] const T& operator[](Square sq) const
+        {
+            return m_squares[sq];
+        }
+
+        [[nodiscard]] std::string get_formatted_stats() const
+        {
+            std::stringstream ss;
+            for (int i = 0; i < SQUARE_NB; ++i)
+            {
+                ss << std::setw(8) << m_squares[i] << ' ';
+                if ((i + 1) % 8 == 0)
+                    ss << '\n';
+            }
+        }
+
+    private:
+        std::array<T, SQUARE_NB> m_squares;
+    };
+
     /*
         Definitions for specific statistic gatherers follow:
     */

From eda51f19a2c1c5feeb755b6877132738ce1b1771 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 5 Apr 2021 16:15:37 +0200
Subject: [PATCH 546/583] Add king square counter

---
 src/learn/stats.cpp | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/src/learn/stats.cpp b/src/learn/stats.cpp
index ce85caa0..419108d9 100644
--- a/src/learn/stats.cpp
+++ b/src/learn/stats.cpp
@@ -147,6 +147,32 @@ namespace Learner::Stats
         std::uint64_t m_num_positions;
     };
 
+    struct KingSquareCounter : StatisticGathererBase
+    {
+        KingSquareCounter() :
+            m_white{},
+            m_black{}
+        {
+
+        }
+
+        void on_position(const Position& pos) override
+        {
+            m_white[pos.square<KING>(WHITE)] += 1;
+            m_black[pos.square<KING>(BLACK)] += 1;
+        }
+
+        void reset() override
+        {
+            m_white = StatPerSquare<std::uint64_t>{};
+            m_black = StatPerSquare<std::uint64_t>{};
+        }
+
+    private:
+        StatPerSquare<std::uint64_t> m_white;
+        StatPerSquare<std::uint64_t> m_black;
+    };
+
     /*
         This function provides factories for all possible statistic gatherers.
         Each new statistic gatherer needs to be added there.
@@ -158,6 +184,9 @@ namespace Learner::Stats
 
             reg.add<PositionCounter>("position_count");
 
+            reg.add<KingSquareCounter>("king");
+            reg.add<KingSquareCounter>("king_square_count");
+
             return reg;
         }();
 

From b2a5bf4171c943a6037eeabb49710cf726861e72 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 5 Apr 2021 16:36:27 +0200
Subject: [PATCH 547/583] Deduplicate statistic gatherers. Fix King square
 counter compilation errors.

---
 src/learn/stats.cpp | 124 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 106 insertions(+), 18 deletions(-)

diff --git a/src/learn/stats.cpp b/src/learn/stats.cpp
index 419108d9..0a93cc5e 100644
--- a/src/learn/stats.cpp
+++ b/src/learn/stats.cpp
@@ -14,6 +14,7 @@
 #include <array>
 #include <string>
 #include <map>
+#include <set>
 #include <iostream>
 #include <cmath>
 #include <algorithm>
@@ -31,12 +32,14 @@ namespace Learner::Stats
         virtual void on_position(const Position&) {}
         virtual void on_move(const Move&) {}
         virtual void reset() = 0;
+        [[nodiscard]] virtual const std::string& get_name() const = 0;
         [[nodiscard]] virtual std::map<std::string, std::string> get_formatted_stats() const = 0;
     };
 
     struct StatisticGathererFactoryBase
     {
         [[nodiscard]] virtual std::unique_ptr<StatisticGathererBase> create() const = 0;
+        [[nodiscard]] virtual const std::string& get_name() const = 0;
     };
 
     template <typename T>
@@ -46,12 +49,84 @@ namespace Learner::Stats
         {
             return std::make_unique<T>();
         }
+
+        [[nodiscard]] const std::string& get_name() const override
+        {
+            return T::name;
+        }
+    };
+
+    struct StatisticGathererSet : StatisticGathererBase
+    {
+        void add(const StatisticGathererFactoryBase& factory)
+        {
+            const std::string name = factory.get_name();
+            if (m_gatherers_names.count(name) == 0)
+            {
+                m_gatherers_names.insert(name);
+                m_gatherers.emplace_back(factory.create());
+            }
+        }
+
+        void add(std::unique_ptr<StatisticGathererBase>&& gatherer)
+        {
+            const std::string name = gatherer->get_name();
+            if (m_gatherers_names.count(name) == 0)
+            {
+                m_gatherers_names.insert(name);
+                m_gatherers.emplace_back(std::move(gatherer));
+            }
+        }
+
+        void on_position(const Position& position) override
+        {
+            for (auto& g : m_gatherers)
+            {
+                g->on_position(position);
+            }
+        }
+
+        void on_move(const Move& move) override
+        {
+            for (auto& g : m_gatherers)
+            {
+                g->on_move(move);
+            }
+        }
+
+        void reset() override
+        {
+            for (auto& g : m_gatherers)
+            {
+                g->reset();
+            }
+        }
+
+        [[nodiscard]] virtual const std::string& get_name() const override
+        {
+            static std::string name = "SET";
+            return name;
+        }
+
+        [[nodiscard]] virtual std::map<std::string, std::string> get_formatted_stats() const override
+        {
+            std::map<std::string, std::string> parts;
+            for (auto&& s : m_gatherers)
+            {
+                parts.merge(s->get_formatted_stats());
+            }
+            return parts;
+        }
+
+    private:
+        std::vector<std::unique_ptr<StatisticGathererBase>> m_gatherers;
+        std::set<std::string> m_gatherers_names;
     };
 
     struct StatisticGathererRegistry
     {
         void add_statistic_gatherers_by_group(
-            std::vector<std::unique_ptr<StatisticGathererBase>>& gatherers,
+            StatisticGathererSet& gatherers,
             const std::string& group) const
         {
             auto it = m_gatherers_by_group.find(group);
@@ -59,7 +134,7 @@ namespace Learner::Stats
             {
                 for (auto& factory : it->second)
                 {
-                    gatherers.emplace_back(factory->create());
+                    gatherers.add(*factory);
                 }
             }
         }
@@ -109,6 +184,7 @@ namespace Learner::Stats
                 if ((i + 1) % 8 == 0)
                     ss << '\n';
             }
+            return ss.str();
         }
 
     private:
@@ -121,6 +197,8 @@ namespace Learner::Stats
 
     struct PositionCounter : StatisticGathererBase
     {
+        static inline std::string name = "PositionCounter";
+
         PositionCounter() :
             m_num_positions(0)
         {
@@ -136,6 +214,11 @@ namespace Learner::Stats
             m_num_positions = 0;
         }
 
+        [[nodiscard]] const std::string& get_name() const override
+        {
+            return name;
+        }
+
         [[nodiscard]] std::map<std::string, std::string> get_formatted_stats() const override
         {
             return {
@@ -149,6 +232,8 @@ namespace Learner::Stats
 
     struct KingSquareCounter : StatisticGathererBase
     {
+        static inline std::string name = "KingSquareCounter";
+
         KingSquareCounter() :
             m_white{},
             m_black{}
@@ -168,6 +253,19 @@ namespace Learner::Stats
             m_black = StatPerSquare<std::uint64_t>{};
         }
 
+        [[nodiscard]] const std::string& get_name() const override
+        {
+            return name;
+        }
+
+        [[nodiscard]] std::map<std::string, std::string> get_formatted_stats() const override
+        {
+            return {
+                { "White king squares", m_white.get_formatted_stats() },
+                { "Black king squares", m_black.get_formatted_stats() }
+            };
+        }
+
     private:
         StatPerSquare<std::uint64_t> m_white;
         StatPerSquare<std::uint64_t> m_black;
@@ -195,7 +293,7 @@ namespace Learner::Stats
 
     void do_gather_statistics(
         const std::string& filename,
-        std::vector<std::unique_ptr<StatisticGathererBase>>& statistic_gatherers,
+        StatisticGathererSet& statistic_gatherers,
         std::uint64_t max_count)
     {
         Thread* th = Threads.main();
@@ -205,17 +303,11 @@ namespace Learner::Stats
         auto in = Learner::open_sfen_input_file(filename);
 
         auto on_move = [&](Move move) {
-            for (auto&& s : statistic_gatherers)
-            {
-                s->on_move(move);
-            }
+            statistic_gatherers.on_move(move);
         };
 
         auto on_position = [&](const Position& position) {
-            for (auto&& s : statistic_gatherers)
-            {
-                s->on_position(position);
-            }
+            statistic_gatherers.on_position(position);
         };
 
         if (in == nullptr)
@@ -248,13 +340,9 @@ namespace Learner::Stats
         std::cout << "Finished gathering statistics.\n\n";
         std::cout << "Results:\n\n";
 
-        for (auto&& s : statistic_gatherers)
+        for (auto&& [name, value] : statistic_gatherers.get_formatted_stats())
         {
-            for (auto&& [name, value] : s->get_formatted_stats())
-            {
-                std::cout << name << ": " << value << '\n';
-            }
-            std::cout << '\n';
+            std::cout << name << ": " << value << '\n';
         }
     }
 
@@ -264,7 +352,7 @@ namespace Learner::Stats
 
         auto& registry = get_statistics_gatherers_registry();
 
-        std::vector<std::unique_ptr<StatisticGathererBase>> statistic_gatherers;
+        StatisticGathererSet statistic_gatherers;
 
         std::string input_file;
         std::uint64_t max_count = std::numeric_limits<std::uint64_t>::max();

From fcd53684b672a185d823338d757ae360307b21a9 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 5 Apr 2021 16:43:25 +0200
Subject: [PATCH 548/583] To/from move stats

---
 src/learn/stats.cpp | 106 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 98 insertions(+), 8 deletions(-)

diff --git a/src/learn/stats.cpp b/src/learn/stats.cpp
index 0a93cc5e..c77b61b9 100644
--- a/src/learn/stats.cpp
+++ b/src/learn/stats.cpp
@@ -30,7 +30,7 @@ namespace Learner::Stats
     struct StatisticGathererBase
     {
         virtual void on_position(const Position&) {}
-        virtual void on_move(const Move&) {}
+        virtual void on_move(const Position&, const Move&) {}
         virtual void reset() = 0;
         [[nodiscard]] virtual const std::string& get_name() const = 0;
         [[nodiscard]] virtual std::map<std::string, std::string> get_formatted_stats() const = 0;
@@ -86,11 +86,11 @@ namespace Learner::Stats
             }
         }
 
-        void on_move(const Move& move) override
+        void on_move(const Position& pos, const Move& move) override
         {
             for (auto& g : m_gatherers)
             {
-                g->on_move(move);
+                g->on_move(pos, move);
             }
         }
 
@@ -261,8 +261,94 @@ namespace Learner::Stats
         [[nodiscard]] std::map<std::string, std::string> get_formatted_stats() const override
         {
             return {
-                { "White king squares", m_white.get_formatted_stats() },
-                { "Black king squares", m_black.get_formatted_stats() }
+                { "White king squares", '\n' + m_white.get_formatted_stats() },
+                { "Black king squares", '\n' + m_black.get_formatted_stats() }
+            };
+        }
+
+    private:
+        StatPerSquare<std::uint64_t> m_white;
+        StatPerSquare<std::uint64_t> m_black;
+    };
+
+    struct MoveFromCounter : StatisticGathererBase
+    {
+        static inline std::string name = "MoveFromCounter";
+
+        MoveFromCounter() :
+            m_white{},
+            m_black{}
+        {
+
+        }
+
+        void on_move(const Position& pos, const Move& move) override
+        {
+            if (pos.side_to_move() == WHITE)
+                m_white[from_sq(move)] += 1;
+            else
+                m_black[from_sq(move)] += 1;
+        }
+
+        void reset() override
+        {
+            m_white = StatPerSquare<std::uint64_t>{};
+            m_black = StatPerSquare<std::uint64_t>{};
+        }
+
+        [[nodiscard]] const std::string& get_name() const override
+        {
+            return name;
+        }
+
+        [[nodiscard]] std::map<std::string, std::string> get_formatted_stats() const override
+        {
+            return {
+                { "White move from squares", '\n' + m_white.get_formatted_stats() },
+                { "Black move from squares", '\n' + m_black.get_formatted_stats() }
+            };
+        }
+
+    private:
+        StatPerSquare<std::uint64_t> m_white;
+        StatPerSquare<std::uint64_t> m_black;
+    };
+
+    struct MoveToCounter : StatisticGathererBase
+    {
+        static inline std::string name = "MoveToCounter";
+
+        MoveToCounter() :
+            m_white{},
+            m_black{}
+        {
+
+        }
+
+        void on_move(const Position& pos, const Move& move) override
+        {
+            if (pos.side_to_move() == WHITE)
+                m_white[to_sq(move)] += 1;
+            else
+                m_black[to_sq(move)] += 1;
+        }
+
+        void reset() override
+        {
+            m_white = StatPerSquare<std::uint64_t>{};
+            m_black = StatPerSquare<std::uint64_t>{};
+        }
+
+        [[nodiscard]] const std::string& get_name() const override
+        {
+            return name;
+        }
+
+        [[nodiscard]] std::map<std::string, std::string> get_formatted_stats() const override
+        {
+            return {
+                { "White move to squares", '\n' + m_white.get_formatted_stats() },
+                { "Black move to squares", '\n' + m_black.get_formatted_stats() }
             };
         }
 
@@ -285,6 +371,10 @@ namespace Learner::Stats
             reg.add<KingSquareCounter>("king");
             reg.add<KingSquareCounter>("king_square_count");
 
+            reg.add<MoveFromCounter>("move");
+            reg.add<MoveFromCounter>("move_from_count");
+            reg.add<MoveToCounter>("move_to_count");
+
             return reg;
         }();
 
@@ -302,8 +392,8 @@ namespace Learner::Stats
 
         auto in = Learner::open_sfen_input_file(filename);
 
-        auto on_move = [&](Move move) {
-            statistic_gatherers.on_move(move);
+        auto on_move = [&](const Position& position, const Move& move) {
+            statistic_gatherers.on_move(position, move);
         };
 
         auto on_position = [&](const Position& position) {
@@ -328,7 +418,7 @@ namespace Learner::Stats
             pos.set_from_packed_sfen(ps.sfen, &si, th);
 
             on_position(pos);
-            on_move((Move)ps.move);
+            on_move(pos, (Move)ps.move);
 
             num_processed += 1;
             if (num_processed % 1'000'000 == 0)

From e7b3803fd0249e21eb53736254ee1a76b3d9a50b Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 5 Apr 2021 17:00:27 +0200
Subject: [PATCH 549/583] Add more counters

---
 src/learn/stats.cpp | 163 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 163 insertions(+)

diff --git a/src/learn/stats.cpp b/src/learn/stats.cpp
index c77b61b9..3a8b4454 100644
--- a/src/learn/stats.cpp
+++ b/src/learn/stats.cpp
@@ -357,6 +357,165 @@ namespace Learner::Stats
         StatPerSquare<std::uint64_t> m_black;
     };
 
+    struct MoveTypeCounter : StatisticGathererBase
+    {
+        static inline std::string name = "MoveTypeCounter";
+
+        MoveTypeCounter() :
+            m_total(0),
+            m_normal(0),
+            m_capture(0),
+            m_promotion(0),
+            m_castling(0),
+            m_enpassant(0)
+        {
+
+        }
+
+        void on_move(const Position& pos, const Move& move) override
+        {
+            m_total += 1;
+
+            if (!pos.empty(to_sq(move)))
+                m_capture += 1;
+
+            if (type_of(move) == CASTLING)
+                m_castling += 1;
+            else if (type_of(move) == PROMOTION)
+                m_promotion += 1;
+            else if (type_of(move) == ENPASSANT)
+                m_enpassant += 1;
+            else if (type_of(move) == NORMAL)
+                m_normal += 1;
+        }
+
+        void reset() override
+        {
+            m_total = 0;
+            m_normal = 0;
+            m_capture = 0;
+            m_promotion = 0;
+            m_castling = 0;
+            m_enpassant = 0;
+        }
+
+        [[nodiscard]] const std::string& get_name() const override
+        {
+            return name;
+        }
+
+        [[nodiscard]] std::map<std::string, std::string> get_formatted_stats() const override
+        {
+            return {
+                { "Total moves", std::to_string(m_total) },
+                { "Normal moves", std::to_string(m_normal) },
+                { "Capture moves", std::to_string(m_capture) },
+                { "Promotion moves", std::to_string(m_promotion) },
+                { "Castling moves", std::to_string(m_castling) },
+                { "En-passant moves", std::to_string(m_enpassant) }
+            };
+        }
+
+    private:
+        std::uint64_t m_total;
+        std::uint64_t m_normal;
+        std::uint64_t m_capture;
+        std::uint64_t m_promotion;
+        std::uint64_t m_castling;
+        std::uint64_t m_enpassant;
+    };
+
+    struct PieceCountCounter : StatisticGathererBase
+    {
+        static inline std::string name = "PieceCountCounter";
+
+        PieceCountCounter()
+        {
+            reset();
+        }
+
+        void on_position(const Position& pos) override
+        {
+            m_piece_count_hist[popcount(pos.pieces())] += 1;
+        }
+
+        void reset() override
+        {
+            for (int i = 0; i < SQUARE_NB; ++i)
+                m_num_pieces[i] = 0;
+        }
+
+        [[nodiscard]] const std::string& get_name() const override
+        {
+            return name;
+        }
+
+        [[nodiscard]] std::map<std::string, std::string> get_formatted_stats() const override
+        {
+            std::map<std::string, std::string> result;
+            bool do_write = false;
+            for (int i = SQUARE_NB; i >= 0; --i)
+            {
+                if (m_piece_count_hist[i] != 0)
+                    do_write = true;
+
+                // Start writing when the first non-zero number pops up.
+                if (do_write)
+                {
+                    result.try_emplace(
+                        std::string("Number of positions with ") + std::to_string(i) + " pieces",
+                        std::to_string(m_piece_count_hist[i])
+                    );
+                }
+            }
+            return result;
+        }
+
+    private:
+        std::uint64_t m_piece_count_hist[SQUARE_NB];
+    };
+
+    struct MovedPieceTypeCounter : StatisticGathererBase
+    {
+        static inline std::string name = "MovedPieceTypeCounter";
+
+        MovedPieceTypeCounter()
+        {
+            reset();
+        }
+
+        void on_move(const Position& pos, const Move& move) override
+        {
+            m_moved_piece_type_hist[type_of(pos.piece_on(from_sq(move)))] += 1;
+        }
+
+        void reset() override
+        {
+            for (int i = 0; i < PIECE_TYPE_NB; ++i)
+                m_moved_piece_type_hist[i] = 0;
+        }
+
+        [[nodiscard]] const std::string& get_name() const override
+        {
+            return name;
+        }
+
+        [[nodiscard]] std::map<std::string, std::string> get_formatted_stats() const override
+        {
+            return {
+                { "Pawn moves", std::to_string(m_moved_piece_type_hist[PAWN]) },
+                { "Knight moves", std::to_string(m_moved_piece_type_hist[KNIGHT]) },
+                { "Bishop moves", std::to_string(m_moved_piece_type_hist[BISHOP]) },
+                { "Rook moves", std::to_string(m_moved_piece_type_hist[ROOK]) },
+                { "Queen moves", std::to_string(m_moved_piece_type_hist[QUEEN]) },
+                { "King moves", std::to_string(m_moved_piece_type_hist[KING]) }
+            };
+        }
+
+    private:
+        std::uint64_t m_moved_piece_type_hist[PIECE_TYPE_NB];
+    };
+
     /*
         This function provides factories for all possible statistic gatherers.
         Each new statistic gatherer needs to be added there.
@@ -374,6 +533,10 @@ namespace Learner::Stats
             reg.add<MoveFromCounter>("move");
             reg.add<MoveFromCounter>("move_from_count");
             reg.add<MoveToCounter>("move_to_count");
+            reg.add<MoveTypeCounter>("move_type");
+            reg.add<MovedPieceTypeCounter>("moved_piece_type");
+
+            reg.add<PieceCountCounter>("piece_count")
 
             return reg;
         }();

From e371d133a7cbb08c0ce2cb83788f8f9145769890 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 5 Apr 2021 17:10:16 +0200
Subject: [PATCH 550/583] Fix grouping and do dedup in registry.

---
 src/learn/stats.cpp | 44 ++++++++++++++++++++++++++++----------------
 1 file changed, 28 insertions(+), 16 deletions(-)

diff --git a/src/learn/stats.cpp b/src/learn/stats.cpp
index 3a8b4454..d899090e 100644
--- a/src/learn/stats.cpp
+++ b/src/learn/stats.cpp
@@ -45,6 +45,8 @@ namespace Learner::Stats
     template <typename T>
     struct StatisticGathererFactory : StatisticGathererFactoryBase
     {
+        static inline std::string name = T::name;
+
         [[nodiscard]] std::unique_ptr<StatisticGathererBase> create() const override
         {
             return std::make_unique<T>();
@@ -52,7 +54,7 @@ namespace Learner::Stats
 
         [[nodiscard]] const std::string& get_name() const override
         {
-            return T::name;
+            return name;
         }
     };
 
@@ -139,17 +141,29 @@ namespace Learner::Stats
             }
         }
 
-        template <typename T>
-        void add(const std::string& group)
+        template <typename T, typename... ArgsTs>
+        void add(const ArgsTs&... group)
         {
-            m_gatherers_by_group[group].emplace_back(std::make_unique<StatisticGathererFactory<T>>());
-
-            // Always add to the special group "all".
-            m_gatherers_by_group["all"].emplace_back(std::make_unique<StatisticGathererFactory<T>>());
+            auto dummy = {(add_single<T>(group), 0)...};
+            (void)dummy;
+            add_single<T>("all");
         }
 
     private:
         std::map<std::string, std::vector<std::unique_ptr<StatisticGathererFactoryBase>>> m_gatherers_by_group;
+        std::map<std::string, std::set<std::string>> m_gatherers_names_by_group;
+
+        template <typename T, typename ArgT>
+        void add_single(const ArgT& group)
+        {
+            using FactoryT = StatisticGathererFactory<T>;
+
+            if (m_gatherers_names_by_group[group].count(FactoryT::name) == 0)
+            {
+                m_gatherers_by_group[group].emplace_back(std::make_unique<FactoryT>());
+                m_gatherers_names_by_group[group].insert(FactoryT::name);
+            }
+        }
     };
 
     /*
@@ -442,7 +456,7 @@ namespace Learner::Stats
         void reset() override
         {
             for (int i = 0; i < SQUARE_NB; ++i)
-                m_num_pieces[i] = 0;
+                m_piece_count_hist[i] = 0;
         }
 
         [[nodiscard]] const std::string& get_name() const override
@@ -527,16 +541,14 @@ namespace Learner::Stats
 
             reg.add<PositionCounter>("position_count");
 
-            reg.add<KingSquareCounter>("king");
-            reg.add<KingSquareCounter>("king_square_count");
+            reg.add<KingSquareCounter>("king", "king_square_count");
 
-            reg.add<MoveFromCounter>("move");
-            reg.add<MoveFromCounter>("move_from_count");
-            reg.add<MoveToCounter>("move_to_count");
-            reg.add<MoveTypeCounter>("move_type");
-            reg.add<MovedPieceTypeCounter>("moved_piece_type");
+            reg.add<MoveFromCounter>("move", "move_from_count");
+            reg.add<MoveToCounter>("move", "move_to_count");
+            reg.add<MoveTypeCounter>("move", "move_type");
+            reg.add<MovedPieceTypeCounter>("move", "moved_piece_type");
 
-            reg.add<PieceCountCounter>("piece_count")
+            reg.add<PieceCountCounter>("piece_count");
 
             return reg;
         }();

From 1786be5553d3ee2ee4353b107e579c15709fb1fa Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 5 Apr 2021 17:25:24 +0200
Subject: [PATCH 551/583] Minor fixes

---
 src/learn/stats.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/learn/stats.cpp b/src/learn/stats.cpp
index d899090e..4ee123da 100644
--- a/src/learn/stats.cpp
+++ b/src/learn/stats.cpp
@@ -194,7 +194,7 @@ namespace Learner::Stats
             std::stringstream ss;
             for (int i = 0; i < SQUARE_NB; ++i)
             {
-                ss << std::setw(8) << m_squares[i] << ' ';
+                ss << std::setw(8) << m_squares[i ^ (int)SQ_A8] << ' ';
                 if ((i + 1) % 8 == 0)
                     ss << '\n';
             }

From f8d9836ca363c45e7d137a5f261e23ffdb53a2a5 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 5 Apr 2021 17:27:57 +0200
Subject: [PATCH 552/583] Use an ordered container for the results.

---
 src/learn/stats.cpp | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/src/learn/stats.cpp b/src/learn/stats.cpp
index 4ee123da..c0e2c0a1 100644
--- a/src/learn/stats.cpp
+++ b/src/learn/stats.cpp
@@ -33,7 +33,7 @@ namespace Learner::Stats
         virtual void on_move(const Position&, const Move&) {}
         virtual void reset() = 0;
         [[nodiscard]] virtual const std::string& get_name() const = 0;
-        [[nodiscard]] virtual std::map<std::string, std::string> get_formatted_stats() const = 0;
+        [[nodiscard]] virtual std::vector<std::pair<std::string, std::string>> get_formatted_stats() const = 0;
     };
 
     struct StatisticGathererFactoryBase
@@ -110,12 +110,13 @@ namespace Learner::Stats
             return name;
         }
 
-        [[nodiscard]] virtual std::map<std::string, std::string> get_formatted_stats() const override
+        [[nodiscard]] virtual std::vector<std::pair<std::string, std::string>> get_formatted_stats() const override
         {
-            std::map<std::string, std::string> parts;
+            std::vector<std::pair<std::string, std::string>> parts;
             for (auto&& s : m_gatherers)
             {
-                parts.merge(s->get_formatted_stats());
+                auto part = s->get_formatted_stats();
+                parts.insert(parts.end(), part.begin(), part.end());
             }
             return parts;
         }
@@ -233,7 +234,7 @@ namespace Learner::Stats
             return name;
         }
 
-        [[nodiscard]] std::map<std::string, std::string> get_formatted_stats() const override
+        [[nodiscard]] std::vector<std::pair<std::string, std::string>> get_formatted_stats() const override
         {
             return {
                 { "Number of positions", std::to_string(m_num_positions) }
@@ -272,7 +273,7 @@ namespace Learner::Stats
             return name;
         }
 
-        [[nodiscard]] std::map<std::string, std::string> get_formatted_stats() const override
+        [[nodiscard]] std::vector<std::pair<std::string, std::string>> get_formatted_stats() const override
         {
             return {
                 { "White king squares", '\n' + m_white.get_formatted_stats() },
@@ -315,7 +316,7 @@ namespace Learner::Stats
             return name;
         }
 
-        [[nodiscard]] std::map<std::string, std::string> get_formatted_stats() const override
+        [[nodiscard]] std::vector<std::pair<std::string, std::string>> get_formatted_stats() const override
         {
             return {
                 { "White move from squares", '\n' + m_white.get_formatted_stats() },
@@ -358,7 +359,7 @@ namespace Learner::Stats
             return name;
         }
 
-        [[nodiscard]] std::map<std::string, std::string> get_formatted_stats() const override
+        [[nodiscard]] std::vector<std::pair<std::string, std::string>> get_formatted_stats() const override
         {
             return {
                 { "White move to squares", '\n' + m_white.get_formatted_stats() },
@@ -418,7 +419,7 @@ namespace Learner::Stats
             return name;
         }
 
-        [[nodiscard]] std::map<std::string, std::string> get_formatted_stats() const override
+        [[nodiscard]] std::vector<std::pair<std::string, std::string>> get_formatted_stats() const override
         {
             return {
                 { "Total moves", std::to_string(m_total) },
@@ -464,11 +465,11 @@ namespace Learner::Stats
             return name;
         }
 
-        [[nodiscard]] std::map<std::string, std::string> get_formatted_stats() const override
+        [[nodiscard]] std::vector<std::pair<std::string, std::string>> get_formatted_stats() const override
         {
-            std::map<std::string, std::string> result;
+            std::vector<std::pair<std::string, std::string>> result;
             bool do_write = false;
-            for (int i = SQUARE_NB; i >= 0; --i)
+            for (int i = SQUARE_NB - 1; i >= 0; --i)
             {
                 if (m_piece_count_hist[i] != 0)
                     do_write = true;
@@ -476,7 +477,7 @@ namespace Learner::Stats
                 // Start writing when the first non-zero number pops up.
                 if (do_write)
                 {
-                    result.try_emplace(
+                    result.emplace_back(
                         std::string("Number of positions with ") + std::to_string(i) + " pieces",
                         std::to_string(m_piece_count_hist[i])
                     );
@@ -514,7 +515,7 @@ namespace Learner::Stats
             return name;
         }
 
-        [[nodiscard]] std::map<std::string, std::string> get_formatted_stats() const override
+        [[nodiscard]] std::vector<std::pair<std::string, std::string>> get_formatted_stats() const override
         {
             return {
                 { "Pawn moves", std::to_string(m_moved_piece_type_hist[PAWN]) },

From 9dac979ce8d4452881dd154793611fc2522d04e8 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 5 Apr 2021 17:37:15 +0200
Subject: [PATCH 553/583] Update docs

---
 docs/stats.md | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/docs/stats.md b/docs/stats.md
index 2a2d3a07..78fe9051 100644
--- a/docs/stats.md
+++ b/docs/stats.md
@@ -21,3 +21,25 @@ Any name that doesn't designate an argument name or is not an argument will be i
 `position_count`
 
  - `struct PositionCounter` - the total number of positions in the file.
+
+
+            reg.add<KingSquareCounter>("king", "king_square_count");
+
+            reg.add<MoveFromCounter>("move", "move_from_count");
+            reg.add<MoveToCounter>("move", "move_to_count");
+            reg.add<MoveTypeCounter>("move", "move_type");
+            reg.add<MovedPieceTypeCounter>("move", "moved_piece_type");
+
+            reg.add<PieceCountCounter>("piece_count");
+
+`king`, `king_square_count` - the number of times a king was on each square. Output is layed out as a chessboard, with the 8th rank being the topmost. Separate values for white and black kings.
+
+`move`, `move_from_count` - same as `king_square_count` but for from_sq(move)
+
+`move`, `move_to_count` - same as `king_square_count` but for to_sq(move)
+
+`move`, `move_type` - the number of moves with each type. Includes normal, captures, castling, promotions, enpassant. The groups are not disjoint.
+
+`move`, `moved_piece_type` - the number of times a piece of each type was moved
+
+`piece_count` - the histogram of the number of pieces on the board

From a93777c4edc8db00e7550b3fdd548dbbec98599d Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 5 Apr 2021 18:54:31 +0200
Subject: [PATCH 554/583] Fix stats.md docs.

---
 docs/stats.md | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

diff --git a/docs/stats.md b/docs/stats.md
index 78fe9051..54b4e4e0 100644
--- a/docs/stats.md
+++ b/docs/stats.md
@@ -14,23 +14,9 @@ Any name that doesn't designate an argument name or is not an argument will be i
 
 ## Groups
 
-`all`
+`all` - a special group designating all statistics gatherers available.
 
- - A special group designating all statistics gatherers available.
-
-`position_count`
-
- - `struct PositionCounter` - the total number of positions in the file.
-
-
-            reg.add<KingSquareCounter>("king", "king_square_count");
-
-            reg.add<MoveFromCounter>("move", "move_from_count");
-            reg.add<MoveToCounter>("move", "move_to_count");
-            reg.add<MoveTypeCounter>("move", "move_type");
-            reg.add<MovedPieceTypeCounter>("move", "moved_piece_type");
-
-            reg.add<PieceCountCounter>("piece_count");
+`position_count` - the total number of positions in the file.
 
 `king`, `king_square_count` - the number of times a king was on each square. Output is layed out as a chessboard, with the 8th rank being the topmost. Separate values for white and black kings.
 

From dfa53e40620f981c5435b39c4531640cf109a887 Mon Sep 17 00:00:00 2001
From: fsmosca <fsmosca@users.noreply.github.com>
Date: Fri, 9 Apr 2021 21:52:24 +0800
Subject: [PATCH 555/583] Fix some include paths in tbprobe

---
 src/syzygy/tbprobe.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/syzygy/tbprobe.cpp b/src/syzygy/tbprobe.cpp
index 191986da..f4b9447f 100644
--- a/src/syzygy/tbprobe.cpp
+++ b/src/syzygy/tbprobe.cpp
@@ -28,12 +28,12 @@
 #include <type_traits>
 #include <mutex>
 
-#include "bitboard.h"
-#include "movegen.h"
-#include "position.h"
-#include "search.h"
-#include "types.h"
-#include "uci.h"
+#include "../bitboard.h"
+#include "../movegen.h"
+#include "../position.h"
+#include "../search.h"
+#include "../types.h"
+#include "../uci.h"
 
 #include "tbprobe.h"
 

From 8748fd49b3c3a501e8e6b02abd0b0b7c58dbccf0 Mon Sep 17 00:00:00 2001
From: fsmosca <fsmosca@users.noreply.github.com>
Date: Fri, 9 Apr 2021 21:52:55 +0800
Subject: [PATCH 556/583] Fix include path in tbprobe

---
 src/syzygy/tbprobe.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/syzygy/tbprobe.h b/src/syzygy/tbprobe.h
index efc4b6b7..b998989b 100644
--- a/src/syzygy/tbprobe.h
+++ b/src/syzygy/tbprobe.h
@@ -21,7 +21,7 @@
 
 #include <ostream>
 
-#include "search.h"
+#include "../search.h"
 
 namespace Tablebases {
 

From 44f4d6f617b3f82216e10e5171e885b380b764f5 Mon Sep 17 00:00:00 2001
From: fsmosca <fsmosca@users.noreply.github.com>
Date: Tue, 13 Apr 2021 18:54:19 +0800
Subject: [PATCH 557/583] Fix ranking of root moves by TB

---
 src/search.cpp | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index 30384868..8fe35000 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -309,6 +309,9 @@ void Thread::search() {
   bestValue = delta = alpha = -VALUE_INFINITE;
   beta = VALUE_INFINITE;
 
+  if (!this->rootMoves.empty())
+    Tablebases::rank_root_moves(this->rootPos, this->rootMoves);
+      
   if (mainThread)
   {
       if (mainThread->bestPreviousScore == VALUE_INFINITE)
@@ -1934,10 +1937,14 @@ bool RootMove::extract_ponder_from_tt(Position& pos) {
 
 void Tablebases::rank_root_moves(Position& pos, Search::RootMoves& rootMoves) {
 
-    auto& rootInTB = pos.this_thread()->rootInTB;
+    pos.this_thread()->Cardinality = int(Options["SyzygyProbeLimit"]);
+    pos.this_thread()->ProbeDepth = int(Options["SyzygyProbeDepth"]);
+    pos.this_thread()->UseRule50 = bool(Options["Syzygy50MoveRule"]);
+    pos.this_thread()->rootInTB = false;
+
     auto& cardinality = pos.this_thread()->Cardinality;
     auto& probeDepth = pos.this_thread()->ProbeDepth;
-    rootInTB = false;
+    auto& rootInTB = pos.this_thread()->rootInTB;
     bool dtz_available = true;
 
     // Tables with fewer pieces than SyzygyProbeLimit are searched with
@@ -2044,18 +2051,6 @@ namespace Search
       if (rootMoves.empty())
         return false;
 
-      th->UseRule50 = bool(Options["Syzygy50MoveRule"]);
-      th->ProbeDepth = int(Options["SyzygyProbeDepth"]);
-      th->Cardinality = int(Options["SyzygyProbeLimit"]);
-
-      // Tables with fewer pieces than SyzygyProbeLimit are searched with
-      // ProbeDepth == DEPTH_ZERO
-      if (th->Cardinality > Tablebases::MaxCardinality)
-      {
-          th->Cardinality = Tablebases::MaxCardinality;
-          th->ProbeDepth = 0;
-      }
-
       Tablebases::rank_root_moves(pos, rootMoves);
     }
 

From 744533c2cfd523fde5a21db00c83efbb470be3a8 Mon Sep 17 00:00:00 2001
From: fsmosca <fsmosca@users.noreply.github.com>
Date: Tue, 13 Apr 2021 18:54:54 +0800
Subject: [PATCH 558/583] Fix ranking of root moves by TB

---
 src/thread.cpp | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/src/thread.cpp b/src/thread.cpp
index f035186b..8f727ea8 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -232,24 +232,6 @@ void ThreadPool::start_thinking(Position& pos, StateListPtr& states,
       th->rootMoves = rootMoves;
       th->rootPos.set(pos.fen(), pos.is_chess960(), &th->rootState, th);
       th->rootState = setupStates->back();
-      // This is also set by rank_root_moves but we need to set it
-      // also when there is no legal moves.
-      th->rootInTB = false;
-      th->UseRule50 = bool(Options["Syzygy50MoveRule"]);
-      th->ProbeDepth = int(Options["SyzygyProbeDepth"]);
-      th->Cardinality = int(Options["SyzygyProbeLimit"]);
-
-      // Tables with fewer pieces than SyzygyProbeLimit are searched with
-      // ProbeDepth == DEPTH_ZERO
-      if (th->Cardinality > Tablebases::MaxCardinality)
-      {
-          th->Cardinality = Tablebases::MaxCardinality;
-          th->ProbeDepth = 0;
-      }
-
-      if (!rootMoves.empty())
-          Tablebases::rank_root_moves(pos, rootMoves);
-
   }
 
   main()->start_searching();

From 3101ae7973b94f6eea176bb302813210eb3feeb3 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 18 Apr 2021 19:04:14 +0200
Subject: [PATCH 559/583] remove learn

---
 src/Makefile                                  |    4 -
 src/extra/stockfish_blas.cpp                  | 1291 ---------------
 src/extra/stockfish_blas.h                    |  140 --
 src/learn/autograd.h                          |  667 --------
 src/learn/gensfen.cpp                         |    5 +-
 src/learn/gensfen_nonpv.cpp                   |    1 -
 src/learn/half_float.h                        |  133 --
 src/learn/learn.cpp                           | 1474 -----------------
 src/learn/learn.h                             |  148 --
 src/nnue/evaluate_nnue_learner.cpp            |  341 ----
 src/nnue/evaluate_nnue_learner.h              |   52 -
 src/nnue/nnue_test_command.cpp                |  215 ---
 src/nnue/nnue_test_command.h                  |   12 -
 src/nnue/trainer/features/all_factorizers.h   |   10 -
 src/nnue/trainer/features/factorizer.h        |  117 --
 .../trainer/features/factorizer_feature_set.h |  121 --
 .../trainer/features/factorizer_half_ka.h     |   93 --
 .../trainer/features/factorizer_half_kp.h     |  104 --
 src/nnue/trainer/trainer.h                    |  122 --
 src/nnue/trainer/trainer_affine_transform.h   |  476 ------
 src/nnue/trainer/trainer_clipped_relu.h       |  354 ----
 .../trainer/trainer_feature_transformer.h     |  783 ---------
 src/nnue/trainer/trainer_input_slice.h        |  383 -----
 src/nnue/trainer/trainer_sum.h                |  201 ---
 src/uci.cpp                                   |   25 -
 25 files changed, 2 insertions(+), 7270 deletions(-)
 delete mode 100644 src/extra/stockfish_blas.cpp
 delete mode 100644 src/extra/stockfish_blas.h
 delete mode 100644 src/learn/autograd.h
 delete mode 100644 src/learn/half_float.h
 delete mode 100644 src/learn/learn.cpp
 delete mode 100644 src/learn/learn.h
 delete mode 100644 src/nnue/evaluate_nnue_learner.cpp
 delete mode 100644 src/nnue/evaluate_nnue_learner.h
 delete mode 100644 src/nnue/nnue_test_command.cpp
 delete mode 100644 src/nnue/nnue_test_command.h
 delete mode 100644 src/nnue/trainer/features/all_factorizers.h
 delete mode 100644 src/nnue/trainer/features/factorizer.h
 delete mode 100644 src/nnue/trainer/features/factorizer_feature_set.h
 delete mode 100644 src/nnue/trainer/features/factorizer_half_ka.h
 delete mode 100644 src/nnue/trainer/features/factorizer_half_kp.h
 delete mode 100644 src/nnue/trainer/trainer.h
 delete mode 100644 src/nnue/trainer/trainer_affine_transform.h
 delete mode 100644 src/nnue/trainer/trainer_clipped_relu.h
 delete mode 100644 src/nnue/trainer/trainer_feature_transformer.h
 delete mode 100644 src/nnue/trainer/trainer_input_slice.h
 delete mode 100644 src/nnue/trainer/trainer_sum.h

diff --git a/src/Makefile b/src/Makefile
index a4ced5f0..19927ce5 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -47,9 +47,7 @@ PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 1000 sfen_format bin output_file_name
 SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \
 	material.cpp misc.cpp movegen.cpp movepick.cpp pawns.cpp position.cpp psqt.cpp \
 	search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
-	extra/stockfish_blas.cpp \
 	nnue/evaluate_nnue.cpp \
-	nnue/evaluate_nnue_learner.cpp \
 	nnue/features/half_kp.cpp \
 	nnue/features/half_ka.cpp \
 	nnue/features/half_relative_kp.cpp \
@@ -59,9 +57,7 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
 	nnue/features/a.cpp \
 	nnue/features/castling_right.cpp \
 	nnue/features/enpassant.cpp \
-	nnue/nnue_test_command.cpp \
 	learn/sfen_packer.cpp \
-	learn/learn.cpp \
 	learn/gensfen.cpp \
 	learn/gensfen_nonpv.cpp \
 	learn/opening_book.cpp \
diff --git a/src/extra/stockfish_blas.cpp b/src/extra/stockfish_blas.cpp
deleted file mode 100644
index 70b258bc..00000000
--- a/src/extra/stockfish_blas.cpp
+++ /dev/null
@@ -1,1291 +0,0 @@
-#include "stockfish_blas.h"
-
-#include "thread.h"
-
-#include <cstring>
-#include <random>
-#include <iostream>
-#include <vector>
-#include <algorithm>
-#include <cmath>
-#include <atomic>
-#include <chrono>
-
-#if defined(USE_SSE2)
-#include <xmmintrin.h>
-#endif
-
-#if defined (USE_SSE3)
-#include <pmmintrin.h>
-#endif
-
-#if defined(USE_BLAS)
-#include <cblas.h>
-#endif
-
-namespace Blas {
-    void scopy(
-        const int N,
-        const float * SF_BLAS_RESTRICT X,
-        float * SF_BLAS_RESTRICT Y
-    )
-    {
-        std::memcpy(Y, X, sizeof(float) * N);
-    }
-
-    void scopy(
-        const int N,
-        const float * SF_BLAS_RESTRICT X, const int incX,
-        float * SF_BLAS_RESTRICT Y, const int incY
-    )
-    {
-        if (incX == 1 && incY == 1)
-        {
-            scopy(N, X, Y);
-        }
-        else
-        {
-            for(int i = 0; i < N; ++i)
-            {
-                *Y = *X;
-                X += incX;
-                Y += incY;
-            }
-        }
-    }
-
-    void scopy(
-        ThreadPool&,
-        const int N,
-        const float * SF_BLAS_RESTRICT X,
-        float * SF_BLAS_RESTRICT Y
-    )
-    {
-        scopy(N, X, Y);
-    }
-
-    void scopy(
-        ThreadPool&,
-        const int N,
-        const float * SF_BLAS_RESTRICT X, const int incX,
-        float * SF_BLAS_RESTRICT Y, const int incY
-    )
-    {
-        scopy(N, X, incX, Y, incY);
-    }
-
-    void sscal(
-        const int N,
-        const float alpha,
-        float * SF_BLAS_RESTRICT X
-    )
-    {
-#if defined (USE_SSE2)
-
-        const __m128 alpha4 = _mm_set1_ps(alpha);
-
-        int i = 0;
-        for(; i < N - 31; i += 32)
-        {
-            __m128 x0 = _mm_loadu_ps(X + i +  0);
-            __m128 x1 = _mm_loadu_ps(X + i +  4);
-            __m128 x2 = _mm_loadu_ps(X + i +  8);
-            __m128 x3 = _mm_loadu_ps(X + i + 12);
-            __m128 x4 = _mm_loadu_ps(X + i + 16);
-            __m128 x5 = _mm_loadu_ps(X + i + 20);
-            __m128 x6 = _mm_loadu_ps(X + i + 24);
-            __m128 x7 = _mm_loadu_ps(X + i + 28);
-
-            x0 = _mm_mul_ps(x0, alpha4);
-            x1 = _mm_mul_ps(x1, alpha4);
-            x2 = _mm_mul_ps(x2, alpha4);
-            x3 = _mm_mul_ps(x3, alpha4);
-            x4 = _mm_mul_ps(x4, alpha4);
-            x5 = _mm_mul_ps(x5, alpha4);
-            x6 = _mm_mul_ps(x6, alpha4);
-            x7 = _mm_mul_ps(x7, alpha4);
-
-            _mm_storeu_ps(X + i +  0, x0);
-            _mm_storeu_ps(X + i +  4, x1);
-            _mm_storeu_ps(X + i +  8, x2);
-            _mm_storeu_ps(X + i + 12, x3);
-            _mm_storeu_ps(X + i + 16, x4);
-            _mm_storeu_ps(X + i + 20, x5);
-            _mm_storeu_ps(X + i + 24, x6);
-            _mm_storeu_ps(X + i + 28, x7);
-        }
-
-        for(; i < N; ++i)
-        {
-            X[i] *= alpha;
-        }
-
-#else
-
-        for(int i = 0; i < N; ++i)
-        {
-            X[i] *= alpha;
-        }
-
-#endif
-    }
-
-    void sscal(
-        const int N,
-        const float alpha,
-        float * SF_BLAS_RESTRICT X, const int incX
-    )
-    {
-        if (incX == 1)
-        {
-            sscal(N, alpha, X);
-        }
-        else
-        {
-            for(int i = 0; i < N; ++i)
-            {
-                *X *= alpha;
-                X += incX;
-            }
-        }
-    }
-
-    void sscal(
-        ThreadPool&,
-        const int N,
-        const float alpha,
-        float * SF_BLAS_RESTRICT X
-    )
-    {
-        sscal(N, alpha, X);
-    }
-
-    void sscal(
-        ThreadPool&,
-        const int N,
-        const float alpha,
-        float *X, const int incX
-    )
-    {
-        sscal(N, alpha, X, incX);
-    }
-
-    void saxpy(
-        const int N,
-        const float alpha,
-        const float * SF_BLAS_RESTRICT X,
-        float * SF_BLAS_RESTRICT Y
-    )
-    {
-        if (alpha == 1.0f)
-        {
-            for (int i = 0; i < N; ++i)
-            {
-                Y[i] += X[i];
-            }
-        }
-        else
-        {
-            for (int i = 0; i < N; ++i)
-            {
-                Y[i] += X[i] * alpha;
-            }
-        }
-
-    }
-
-    void saxpy(
-        const int N,
-        const float alpha,
-        const float * SF_BLAS_RESTRICT X, const int incX,
-        float * SF_BLAS_RESTRICT Y, const int incY
-    )
-    {
-        if (incX == 1 && incY == 1)
-        {
-            saxpy(N, alpha, X, Y);
-        }
-        else
-        {
-            for(int i = 0; i < N; ++i)
-            {
-                *Y += *X * alpha;
-                Y += incY;
-                X += incX;
-            }
-        }
-    }
-
-    void saxpy(
-        ThreadPool&,
-        const int N,
-        const float alpha,
-        const float * SF_BLAS_RESTRICT X,
-        float * SF_BLAS_RESTRICT Y
-    )
-    {
-        saxpy(N, alpha, X, Y);
-    }
-
-    void saxpy(
-        ThreadPool&,
-        const int N,
-        const float alpha,
-        const float * SF_BLAS_RESTRICT X, const int incX,
-        float * SF_BLAS_RESTRICT Y, const int incY
-    )
-    {
-        saxpy(N, alpha, X, incX, Y, incY);
-    }
-
-#if defined (USE_SSE3)
-    inline __m128 m128_hadd_ps(__m128 a, __m128 b, __m128 c, __m128 d)
-    {
-        const __m128 t0 = _mm_hadd_ps(a, b);
-        const __m128 t1 = _mm_hadd_ps(c, d);
-        return _mm_hadd_ps(t0, t1);
-    }
-#endif
-
-#if defined (USE_SSE2)
-
-    inline void transpose4x4_sse2(
-        const float* SF_BLAS_RESTRICT A, const int lda,
-        float* SF_BLAS_RESTRICT B, const int ldb
-    )
-    {
-        __m128 row1 = _mm_loadu_ps(&A[0 * lda]);
-        __m128 row2 = _mm_loadu_ps(&A[1 * lda]);
-        __m128 row3 = _mm_loadu_ps(&A[2 * lda]);
-        __m128 row4 = _mm_loadu_ps(&A[3 * lda]);
-
-        _MM_TRANSPOSE4_PS(row1, row2, row3, row4);
-
-        _mm_storeu_ps(&B[0 * ldb], row1);
-        _mm_storeu_ps(&B[1 * ldb], row2);
-        _mm_storeu_ps(&B[2 * ldb], row3);
-        _mm_storeu_ps(&B[3 * ldb], row4);
-    }
-
-    void transpose_sse2(
-        const int N, const int M,
-        const float* SF_BLAS_RESTRICT A, const int lda,
-        float* SF_BLAS_RESTRICT B, const int ldb
-    )
-    {
-        static constexpr int block_size = 16;
-
-        for (int n = 0; n < N; n += block_size)
-        {
-            for (int m = 0; m < M; m += block_size)
-            {
-                const int max_n2 = n + block_size < N ? n + block_size : N;
-                const int max_m2 = m + block_size < M ? m + block_size : M;
-
-                int n2 = n;
-                for (; n2 < max_n2 - 3; n2 += 4)
-                {
-                    int m2 = m;
-                    for (; m2 < max_m2 - 3; m2 += 4)
-                    {
-                        transpose4x4_sse2(
-                            &A[n2 * lda + m2], lda,
-                            &B[m2 * ldb + n2], ldb
-                        );
-                    }
-
-                    for (; m2 < max_m2; ++m2)
-                    {
-                        B[m2 * ldb + n2 + 0] = A[(n2 + 0) * lda + m2];
-                        B[m2 * ldb + n2 + 1] = A[(n2 + 1) * lda + m2];
-                        B[m2 * ldb + n2 + 2] = A[(n2 + 2) * lda + m2];
-                        B[m2 * ldb + n2 + 3] = A[(n2 + 3) * lda + m2];
-                    }
-                }
-
-                for (; n2 < max_n2; ++n2)
-                {
-                    for (int m2 = m; m2 < max_m2; ++m2)
-                    {
-                        B[m2 * ldb + n2] = A[n2 * lda + m2];
-                    }
-                }
-            }
-        }
-    }
-#endif
-
-    void transpose(
-        const int N, const int M,
-        const float * SF_BLAS_RESTRICT A, const int lda,
-        float* SF_BLAS_RESTRICT B, const int ldb
-    )
-    {
-#if defined (USE_SSE2)
-
-        transpose_sse2(
-            N, M,
-            A, lda,
-            B, ldb
-        );
-
-#else
-
-        for(int r = 0; r < N; ++r)
-        {
-            for (int c = 0; c < M; ++c)
-            {
-                B[c*ldb + r] = A[r*lda + c];
-            }
-        }
-
-#endif
-    }
-
-    void sgemm_row_major_transpose_right(
-        ThreadPool& thread_pool,
-        const int M, const int N, const int K,
-        const float alpha,
-        const float * SF_BLAS_RESTRICT A, const int lda,
-        const float * SF_BLAS_RESTRICT B, const int ldb,
-        const float beta,
-        float * SF_BLAS_RESTRICT C, const int ldc
-    )
-    {
-
-#if defined(USE_SSE3)
-
-        const __m128 alpha4 = _mm_set1_ps(alpha);
-        const __m128 beta4 = _mm_set1_ps(beta);
-
-        std::atomic<int> m_atomic = 0;
-        thread_pool.execute_with_workers(
-            [
-                M, N, K,
-                alpha, alpha4,
-                A, lda,
-                B, ldb,
-                beta, beta4,
-                C, ldc,
-                &m_atomic
-            ](Thread&) {
-                for (;;)
-                {
-                    const int m = m_atomic.fetch_add(2);
-                    if (m >= M - 1)
-                        break;
-
-                    int n = 0;
-                    for (; n < N - 3; n += 4)
-                    {
-                        //        mn
-                        __m128 sum00 = _mm_setzero_ps();
-                        __m128 sum01 = _mm_setzero_ps();
-                        __m128 sum02 = _mm_setzero_ps();
-                        __m128 sum03 = _mm_setzero_ps();
-                        __m128 sum10 = _mm_setzero_ps();
-                        __m128 sum11 = _mm_setzero_ps();
-                        __m128 sum12 = _mm_setzero_ps();
-                        __m128 sum13 = _mm_setzero_ps();
-
-                        // Horizontal sum of elements in sum[m][n] corresponds to
-                        // the final element in the C.
-
-                        int k = 0;
-                        for (; k < K - 3; k += 4)
-                        {
-                            const __m128 a0 = _mm_loadu_ps(&A[(m+0)*lda+k+0]);
-                            const __m128 a1 = _mm_loadu_ps(&A[(m+1)*lda+k+0]);
-
-                            const __m128 b0 = _mm_loadu_ps(&B[(n+0)*ldb+k+0]);
-                            const __m128 b1 = _mm_loadu_ps(&B[(n+1)*ldb+k+0]);
-                            const __m128 b2 = _mm_loadu_ps(&B[(n+2)*ldb+k+0]);
-                            const __m128 b3 = _mm_loadu_ps(&B[(n+3)*ldb+k+0]);
-
-                            sum00 = _mm_add_ps(sum00, _mm_mul_ps(a0, b0));
-                            sum01 = _mm_add_ps(sum01, _mm_mul_ps(a0, b1));
-                            sum02 = _mm_add_ps(sum02, _mm_mul_ps(a0, b2));
-                            sum03 = _mm_add_ps(sum03, _mm_mul_ps(a0, b3));
-                            sum10 = _mm_add_ps(sum10, _mm_mul_ps(a1, b0));
-                            sum11 = _mm_add_ps(sum11, _mm_mul_ps(a1, b1));
-                            sum12 = _mm_add_ps(sum12, _mm_mul_ps(a1, b2));
-                            sum13 = _mm_add_ps(sum13, _mm_mul_ps(a1, b3));
-                        }
-
-                        for(; k < K; k += 1)
-                        {
-                            const float a0 = A[(m+0)*lda+k+0];
-                            const float a1 = A[(m+1)*lda+k+0];
-
-                            const float b0 = B[(n+0)*ldb+k+0];
-                            const float b1 = B[(n+1)*ldb+k+0];
-                            const float b2 = B[(n+2)*ldb+k+0];
-                            const float b3 = B[(n+3)*ldb+k+0];
-
-                            // Since all will be summed vertically anyway we can
-                            // just add to the first element.
-                            // Other elements are left unmodified.
-                            sum00 = _mm_add_ss(sum00, _mm_set_ss(a0 * b0));
-                            sum01 = _mm_add_ss(sum01, _mm_set_ss(a0 * b1));
-                            sum02 = _mm_add_ss(sum02, _mm_set_ss(a0 * b2));
-                            sum03 = _mm_add_ss(sum03, _mm_set_ss(a0 * b3));
-                            sum10 = _mm_add_ss(sum10, _mm_set_ss(a1 * b0));
-                            sum11 = _mm_add_ss(sum11, _mm_set_ss(a1 * b1));
-                            sum12 = _mm_add_ss(sum12, _mm_set_ss(a1 * b2));
-                            sum13 = _mm_add_ss(sum13, _mm_set_ss(a1 * b3));
-                        }
-
-                        __m128 s0 = m128_hadd_ps(sum00, sum01, sum02, sum03);
-                        __m128 s1 = m128_hadd_ps(sum10, sum11, sum12, sum13);
-                        s0 = _mm_mul_ps(s0, alpha4);
-                        s1 = _mm_mul_ps(s1, alpha4);
-
-                        __m128 c0 = _mm_loadu_ps(&C[(m+0)*ldc+(n+0)]);
-                        __m128 c1 = _mm_loadu_ps(&C[(m+1)*ldc+(n+0)]);
-                        c0 = _mm_mul_ps(c0, beta4);
-                        c1 = _mm_mul_ps(c1, beta4);
-
-                        c0 = _mm_add_ps(c0, s0);
-                        c1 = _mm_add_ps(c1, s1);
-
-                        _mm_storeu_ps(&C[(m+0)*ldc+(n+0)], c0);
-                        _mm_storeu_ps(&C[(m+1)*ldc+(n+0)], c1);
-                    }
-
-                    for(; n < N; n += 1)
-                    {
-                        float sum0 = 0.0f;
-                        float sum1 = 0.0f;
-
-                        for (int k = 0; k < K; ++k)
-                        {
-                            const float a0 = A[(m+0)*lda+k+0];
-                            const float a1 = A[(m+1)*lda+k+0];
-
-                            const float b0 = B[(n+0)*ldb+k+0];
-
-                            sum0 += a0 * b0;
-                            sum1 += a1 * b0;
-                        }
-
-                        C[(m+0)*ldc+(n+0)] = C[(m+0)*ldc+(n+0)] * beta + sum0 * alpha;
-                        C[(m+1)*ldc+(n+0)] = C[(m+1)*ldc+(n+0)] * beta + sum1 * alpha;
-                    }
-                }
-            }
-        );
-
-        int m = M - (M % 2);
-        for (; m < M; m += 1)
-        {
-            for (int n = 0; n < N; n += 1)
-            {
-                float sum = 0.0f;
-
-                for (int k = 0; k < K; k += 1)
-                {
-                    sum += A[m*lda + k] * B[n*ldb + k];
-                }
-
-                C[m*ldc + n] = C[m*ldc + n] * beta + sum * alpha;
-            }
-        }
-
-        thread_pool.wait_for_workers_finished();
-
-#else
-
-        thread_pool.for_each_index_with_workers(
-            0, M,
-            [&](Thread&, int m) {
-                for (int n = 0; n < N; n += 1)
-                {
-                    float sum = 0.0f;
-
-                    for (int k = 0; k < K; k += 1)
-                    {
-                        sum += A[m*lda + k] * B[n*ldb + k];
-                    }
-
-                    C[m*ldc + n] = C[m*ldc + n] * beta + sum * alpha;
-                }
-            }
-        );
-        thread_pool.wait_for_workers_finished();
-
-#endif
-    }
-
-    void sgemm_row_major_transpose_right(
-        const int M, const int N, const int K,
-        const float alpha,
-        const float * SF_BLAS_RESTRICT A, const int lda,
-        const float * SF_BLAS_RESTRICT B, const int ldb,
-        const float beta,
-        float * SF_BLAS_RESTRICT C, const int ldc
-    )
-    {
-
-#if defined(USE_SSE3)
-
-        const __m128 alpha4 = _mm_set1_ps(alpha);
-        const __m128 beta4 = _mm_set1_ps(beta);
-
-        int m = 0;
-        for (; m < M - 1; m += 2)
-        {
-            int n = 0;
-            for (; n < N - 3; n += 4)
-            {
-                //        mn
-                __m128 sum00 = _mm_setzero_ps();
-                __m128 sum01 = _mm_setzero_ps();
-                __m128 sum02 = _mm_setzero_ps();
-                __m128 sum03 = _mm_setzero_ps();
-                __m128 sum10 = _mm_setzero_ps();
-                __m128 sum11 = _mm_setzero_ps();
-                __m128 sum12 = _mm_setzero_ps();
-                __m128 sum13 = _mm_setzero_ps();
-
-                // Horizontal sum of elements in sum[m][n] corresponds to
-                // the final element in the C.
-
-                int k = 0;
-                for (; k < K - 3; k += 4)
-                {
-                    const __m128 a0 = _mm_loadu_ps(&A[(m+0)*lda+k+0]);
-                    const __m128 a1 = _mm_loadu_ps(&A[(m+1)*lda+k+0]);
-
-                    const __m128 b0 = _mm_loadu_ps(&B[(n+0)*ldb+k+0]);
-                    const __m128 b1 = _mm_loadu_ps(&B[(n+1)*ldb+k+0]);
-                    const __m128 b2 = _mm_loadu_ps(&B[(n+2)*ldb+k+0]);
-                    const __m128 b3 = _mm_loadu_ps(&B[(n+3)*ldb+k+0]);
-
-                    sum00 = _mm_add_ps(sum00, _mm_mul_ps(a0, b0));
-                    sum01 = _mm_add_ps(sum01, _mm_mul_ps(a0, b1));
-                    sum02 = _mm_add_ps(sum02, _mm_mul_ps(a0, b2));
-                    sum03 = _mm_add_ps(sum03, _mm_mul_ps(a0, b3));
-                    sum10 = _mm_add_ps(sum10, _mm_mul_ps(a1, b0));
-                    sum11 = _mm_add_ps(sum11, _mm_mul_ps(a1, b1));
-                    sum12 = _mm_add_ps(sum12, _mm_mul_ps(a1, b2));
-                    sum13 = _mm_add_ps(sum13, _mm_mul_ps(a1, b3));
-                }
-
-                for(; k < K; k += 1)
-                {
-                    const float a0 = A[(m+0)*lda+k+0];
-                    const float a1 = A[(m+1)*lda+k+0];
-
-                    const float b0 = B[(n+0)*ldb+k+0];
-                    const float b1 = B[(n+1)*ldb+k+0];
-                    const float b2 = B[(n+2)*ldb+k+0];
-                    const float b3 = B[(n+3)*ldb+k+0];
-
-                    // Since all will be summed vertically anyway we can
-                    // just add to the first element.
-                    // Other elements are left unmodified.
-                    sum00 = _mm_add_ss(sum00, _mm_set_ss(a0 * b0));
-                    sum01 = _mm_add_ss(sum01, _mm_set_ss(a0 * b1));
-                    sum02 = _mm_add_ss(sum02, _mm_set_ss(a0 * b2));
-                    sum03 = _mm_add_ss(sum03, _mm_set_ss(a0 * b3));
-                    sum10 = _mm_add_ss(sum10, _mm_set_ss(a1 * b0));
-                    sum11 = _mm_add_ss(sum11, _mm_set_ss(a1 * b1));
-                    sum12 = _mm_add_ss(sum12, _mm_set_ss(a1 * b2));
-                    sum13 = _mm_add_ss(sum13, _mm_set_ss(a1 * b3));
-                }
-
-                __m128 s0 = m128_hadd_ps(sum00, sum01, sum02, sum03);
-                __m128 s1 = m128_hadd_ps(sum10, sum11, sum12, sum13);
-                s0 = _mm_mul_ps(s0, alpha4);
-                s1 = _mm_mul_ps(s1, alpha4);
-
-                __m128 c0 = _mm_loadu_ps(&C[(m+0)*ldc+(n+0)]);
-                __m128 c1 = _mm_loadu_ps(&C[(m+1)*ldc+(n+0)]);
-                c0 = _mm_mul_ps(c0, beta4);
-                c1 = _mm_mul_ps(c1, beta4);
-
-                c0 = _mm_add_ps(c0, s0);
-                c1 = _mm_add_ps(c1, s1);
-
-                _mm_storeu_ps(&C[(m+0)*ldc+(n+0)], c0);
-                _mm_storeu_ps(&C[(m+1)*ldc+(n+0)], c1);
-            }
-
-            for(; n < N; n += 1)
-            {
-                float sum0 = 0.0f;
-                float sum1 = 0.0f;
-
-                for (int k = 0; k < K; ++k)
-                {
-                    const float a0 = A[(m+0)*lda+k+0];
-                    const float a1 = A[(m+1)*lda+k+0];
-
-                    const float b0 = B[(n+0)*ldb+k+0];
-
-                    sum0 += a0 * b0;
-                    sum1 += a1 * b0;
-                }
-
-                C[(m+0)*ldc+(n+0)] = C[(m+0)*ldc+(n+0)] * beta + sum0 * alpha;
-                C[(m+1)*ldc+(n+0)] = C[(m+1)*ldc+(n+0)] * beta + sum1 * alpha;
-            }
-        }
-
-        for (; m < M; m += 1)
-        {
-            for (int n = 0; n < N; n += 1)
-            {
-                float sum = 0.0f;
-
-                for (int k = 0; k < K; k += 1)
-                {
-                    sum += A[m*lda + k] * B[n*ldb + k];
-                }
-
-                C[m*ldc + n] = C[m*ldc + n] * beta + sum * alpha;
-            }
-        }
-
-#else
-
-        for (int m = 0; m < M; m += 1)
-        {
-            for (int n = 0; n < N; n += 1)
-            {
-                float sum = 0.0f;
-
-                for (int k = 0; k < K; k += 1)
-                {
-                    sum += A[m*lda + k] * B[n*ldb + k];
-                }
-
-                C[m*ldc + n] = C[m*ldc + n] * beta + sum * alpha;
-            }
-        }
-
-#endif
-    }
-
-    // The pointer to the storage returned by this function
-    // is valid until the next call to this function from
-    // the same thread with the same idx.
-    // This is an unsafe function and should be used with caution
-    // and only within this translation unit.
-    // The number of buffers available is just enough to make
-    // all functions here work.
-    float* get_thread_local_temporary_storage(
-        int requested_size, int idx
-    )
-    {
-        static constexpr int MAX_NUM_BUFFERS = 2;
-
-        static thread_local int s_data_size[MAX_NUM_BUFFERS] = {0};
-        static thread_local std::unique_ptr<float[]> s_data[MAX_NUM_BUFFERS];
-
-        if (requested_size > s_data_size[idx])
-        {
-            s_data[idx] = std::make_unique<float[]>(requested_size);
-            s_data_size[idx] = requested_size;
-        }
-
-        return s_data[idx].get();
-    }
-
-    void sgemm_row_major_transpose_none(
-        ThreadPool& thread_pool,
-        const int M, const int N, const int K,
-        const float alpha,
-        const float * SF_BLAS_RESTRICT A, const int lda,
-        const float * SF_BLAS_RESTRICT B, const int ldb,
-        const float beta,
-        float * SF_BLAS_RESTRICT C, const int ldc
-    )
-    {
-        constexpr static int temporary_buffer_index = 1;
-
-        auto B_tr = get_thread_local_temporary_storage(K * N, temporary_buffer_index);
-
-        transpose(
-            K, N,
-            B, ldb,
-            B_tr, K
-        );
-
-        sgemm_row_major_transpose_right(
-            thread_pool,
-            M, N, K,
-            alpha,
-            A, lda,
-            B_tr, K,
-            beta,
-            C, ldc
-        );
-    }
-
-    void sgemm_row_major_transpose_none(
-        const int M, const int N, const int K,
-        const float alpha,
-        const float * SF_BLAS_RESTRICT A, const int lda,
-        const float * SF_BLAS_RESTRICT B, const int ldb,
-        const float beta,
-        float * SF_BLAS_RESTRICT C, const int ldc
-    )
-    {
-        constexpr static int temporary_buffer_index = 1;
-
-        auto B_tr = get_thread_local_temporary_storage(K * N, temporary_buffer_index);
-
-        transpose(
-            K, N,
-            B, ldb,
-            B_tr, K
-        );
-
-        sgemm_row_major_transpose_right(
-            M, N, K,
-            alpha,
-            A, lda,
-            B_tr, K,
-            beta,
-            C, ldc
-        );
-    }
-
-    void sgemm_row_major(
-        ThreadPool& thread_pool,
-        MatrixTranspose TransA, MatrixTranspose TransB,
-        const int M, const int N, const int K,
-        const float alpha,
-        const float * SF_BLAS_RESTRICT A, const int lda,
-        const float * SF_BLAS_RESTRICT B, const int ldb,
-        const float beta,
-        float * SF_BLAS_RESTRICT C, const int ldc
-    )
-    {
-        constexpr static int temporary_buffer_index = 0;
-
-        if (TransA == MatrixTranspose::Trans && TransB == MatrixTranspose::Trans)
-        {
-            auto A_tr = get_thread_local_temporary_storage(K * M, temporary_buffer_index);
-
-            transpose(
-                K, M,
-                A, lda,
-                A_tr, K
-            );
-
-            sgemm_row_major_transpose_right(
-                thread_pool,
-                M, N, K,
-                alpha,
-                A_tr, K,
-                B, ldb,
-                beta,
-                C, ldc
-            );
-        }
-        else if (TransA == MatrixTranspose::NoTrans && TransB == MatrixTranspose::Trans)
-        {
-            sgemm_row_major_transpose_right(
-                thread_pool,
-                M, N, K,
-                alpha,
-                A, lda,
-                B, ldb,
-                beta,
-                C, ldc
-            );
-        }
-        else if (TransA == MatrixTranspose::Trans && TransB == MatrixTranspose::NoTrans)
-        {
-            auto A_tr = get_thread_local_temporary_storage(K * M, temporary_buffer_index);
-
-            transpose(
-                K, M,
-                A, lda,
-                A_tr, K
-            );
-
-            sgemm_row_major_transpose_none(
-                thread_pool,
-                M, N, K,
-                alpha,
-                A_tr, K,
-                B, ldb,
-                beta,
-                C, ldc
-            );
-        }
-        else // no transpositions
-        {
-            sgemm_row_major_transpose_none(
-                thread_pool,
-                M, N, K,
-                alpha,
-                A, lda,
-                B, ldb,
-                beta,
-                C, ldc
-            );
-        }
-    }
-
-    void sgemm_row_major(
-        MatrixTranspose TransA, MatrixTranspose TransB,
-        const int M, const int N, const int K,
-        const float alpha,
-        const float * SF_BLAS_RESTRICT A, const int lda,
-        const float * SF_BLAS_RESTRICT B, const int ldb,
-        const float beta,
-        float * SF_BLAS_RESTRICT C, const int ldc
-    )
-    {
-        constexpr static int temporary_buffer_index = 0;
-
-        if (TransA == MatrixTranspose::Trans && TransB == MatrixTranspose::Trans)
-        {
-            auto A_tr = get_thread_local_temporary_storage(K * M, temporary_buffer_index);
-
-            transpose(
-                K, M,
-                A, lda,
-                A_tr, K
-            );
-
-            sgemm_row_major_transpose_right(
-                M, N, K,
-                alpha,
-                A_tr, K,
-                B, ldb,
-                beta,
-                C, ldc
-            );
-        }
-        else if (TransA == MatrixTranspose::NoTrans && TransB == MatrixTranspose::Trans)
-        {
-            sgemm_row_major_transpose_right(
-                M, N, K,
-                alpha,
-                A, lda,
-                B, ldb,
-                beta,
-                C, ldc
-            );
-        }
-        else if (TransA == MatrixTranspose::Trans && TransB == MatrixTranspose::NoTrans)
-        {
-            auto A_tr = get_thread_local_temporary_storage(K * M, temporary_buffer_index);
-
-            transpose(
-                K, M,
-                A, lda,
-                A_tr, K
-            );
-
-            sgemm_row_major_transpose_none(
-                M, N, K,
-                alpha,
-                A_tr, K,
-                B, ldb,
-                beta,
-                C, ldc
-            );
-        }
-        else // no transpositions
-        {
-            sgemm_row_major_transpose_none(
-                M, N, K,
-                alpha,
-                A, lda,
-                B, ldb,
-                beta,
-                C, ldc
-            );
-        }
-    }
-
-    void sgemm(
-        ThreadPool& thread_pool,
-        MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB,
-        const int M, const int N, const int K,
-        const float alpha,
-        const float * SF_BLAS_RESTRICT A, const int lda,
-        const float * SF_BLAS_RESTRICT B, const int ldb,
-        const float beta,
-        float * SF_BLAS_RESTRICT C, const int ldc
-    )
-    {
-        if (layout == MatrixLayout::RowMajor)
-        {
-            sgemm_row_major(
-                thread_pool,
-                TransA, TransB,
-                M, N, K,
-                alpha,
-                A, lda,
-                B, ldb,
-                beta,
-                C, ldc
-            );
-        }
-        else
-        {
-            sgemm_row_major(
-                thread_pool,
-                TransB, TransA,
-                N, M, K,
-                alpha,
-                B, ldb,
-                A, lda,
-                beta,
-                C, ldc
-            );
-        }
-    }
-
-
-    void sgemm(
-        MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB,
-        const int M, const int N, const int K,
-        const float alpha,
-        const float * SF_BLAS_RESTRICT A, const int lda,
-        const float * SF_BLAS_RESTRICT B, const int ldb,
-        const float beta,
-        float * SF_BLAS_RESTRICT C, const int ldc
-    )
-    {
-        if (layout == MatrixLayout::RowMajor)
-        {
-            sgemm_row_major(
-                TransA, TransB,
-                M, N, K,
-                alpha,
-                A, lda,
-                B, ldb,
-                beta,
-                C, ldc
-            );
-        }
-        else
-        {
-            sgemm_row_major(
-                TransB, TransA,
-                N, M, K,
-                alpha,
-                B, ldb,
-                A, lda,
-                beta,
-                C, ldc
-            );
-        }
-    }
-
-    std::vector<float> generate_random_matrix(int rows, int cols)
-    {
-        std::vector<float> m(rows * cols);
-
-        std::mt19937_64 rng;
-        std::uniform_real_distribution<float> d(-1.0, 1.0);
-
-        for(auto& v : m)
-        {
-            v = d(rng);
-        }
-
-        return m;
-    }
-
-    std::vector<float> generate_zero_matrix(int rows, int cols)
-    {
-        return std::vector<float>(rows * cols, 0.0f);
-    }
-
-    float matrix_relative_error(
-        const std::vector<float>& ref,
-        const std::vector<float>& our
-    )
-    {
-        double sum = 0.0;
-        double diff_sum = 0.0;
-
-        for(size_t i = 0; i < ref.size(); ++i)
-        {
-            sum += std::abs(ref[i]);
-            diff_sum += std::abs(ref[i] - our[i]);
-        }
-
-        return diff_sum / sum;
-    }
-
-    float norm(
-        const std::vector<float>& v
-    )
-    {
-        double sum = 0.0;
-
-        for(auto& e : v)
-        {
-            sum += e * e;
-        }
-
-        return std::sqrt(sum);
-    }
-
-#if defined (USE_BLAS)
-
-    CBLAS_LAYOUT matrix_layout_to_blas_layout(MatrixLayout layout)
-    {
-        if (layout == MatrixLayout::RowMajor)
-            return CblasRowMajor;
-        else if (layout == MatrixLayout::ColMajor)
-            return CblasColMajor;
-
-        return static_cast<CBLAS_LAYOUT>(-1);
-    }
-
-    const char* matrix_layout_to_string(MatrixLayout layout)
-    {
-        if (layout == MatrixLayout::RowMajor)
-            return "RowMajor";
-        else if (layout == MatrixLayout::ColMajor)
-            return "ColMajor";
-
-        return "INVALID";
-    }
-
-    CBLAS_TRANSPOSE matrix_transpose_to_blas_transpose(MatrixTranspose tr)
-    {
-        if (tr == MatrixTranspose::NoTrans)
-            return CblasNoTrans;
-        else if (tr == MatrixTranspose::Trans)
-            return CblasTrans;
-
-        return static_cast<CBLAS_TRANSPOSE>(-1);
-    }
-
-    const char* matrix_transpose_to_string(MatrixTranspose tr)
-    {
-        if (tr == MatrixTranspose::NoTrans)
-            return "NoTrans";
-        else if (tr == MatrixTranspose::Trans)
-            return "Trans";
-
-        return "INVALID";
-    }
-
-    void test_sgemm(
-        ThreadPool& thread_pool,
-        MatrixLayout layout, MatrixTranspose trA, MatrixTranspose trB,
-        int M, int N, int K
-    )
-    {
-        auto A = generate_random_matrix(M * 2, K * 2);
-        auto B = generate_random_matrix(K * 2, N * 2);
-        auto C_ref = generate_random_matrix(M * 2, N * 2);
-        auto C_our = C_ref;
-
-        std::cout
-            << matrix_layout_to_string(layout) << ' '
-            << matrix_transpose_to_string(trA) << ' '
-            << matrix_transpose_to_string(trB) << '\n';
-
-        std::cout << "A norm: " << norm(A) << '\n';
-        std::cout << "B norm: " << norm(B) << '\n';
-        std::cout << "C norm: " << norm(C_ref) << '\n';
-
-        const int lda = (trA == MatrixTranspose::NoTrans) == (layout == MatrixLayout::RowMajor) ? K * 2 : M * 2;
-        const int ldb = (trB == MatrixTranspose::NoTrans) == (layout == MatrixLayout::RowMajor) ? N * 2 : K * 2;
-        const int ldc = (layout == MatrixLayout::RowMajor) ? N * 2 : M * 2;
-
-        cblas_sgemm(
-            matrix_layout_to_blas_layout(layout),
-            matrix_transpose_to_blas_transpose(trA),
-            matrix_transpose_to_blas_transpose(trB),
-            M, N, K,
-            1.0,
-            A.data(), lda,
-            B.data(), ldb,
-            1.0,
-            C_ref.data(), ldc
-        );
-
-        sgemm(
-            thread_pool,
-            layout, trA, trB,
-            M, N, K,
-            1.0,
-            A.data(), lda,
-            B.data(), ldb,
-            1.0,
-            C_our.data(), ldc
-        );
-
-        std::cout << "C_ref norm: " << norm(C_ref) << '\n';
-        std::cout << "C_our norm: " << norm(C_our) << '\n';
-        std::cout << "Relative error: " << matrix_relative_error(C_ref, C_our) << '\n';
-
-        std::cout << '\n';
-    }
-
-    void test_sgemm(
-        ThreadPool& thread_pool
-    )
-    {
-        constexpr int M = 57;
-        constexpr int N = 127;
-        constexpr int K = 31;
-
-        std::cout << "SGEMM test:\n";
-
-        for(auto layout : { MatrixLayout::RowMajor, MatrixLayout::ColMajor })
-        {
-            for(auto trA : { MatrixTranspose::NoTrans, MatrixTranspose::Trans })
-            {
-                for(auto trB : { MatrixTranspose::NoTrans, MatrixTranspose::Trans })
-                {
-                    test_sgemm(
-                        thread_pool,
-                        layout, trA, trB,
-                        M, N, K
-                    );
-                }
-            }
-        }
-    }
-
-    void bench_sgemm(
-        ThreadPool& thread_pool,
-        MatrixLayout layout, MatrixTranspose trA, MatrixTranspose trB,
-        int M, int N, int K
-    )
-    {
-        constexpr int num_iters = 1000;
-
-        auto A = generate_random_matrix(M * 2, K * 2);
-        auto B = generate_random_matrix(K * 2, N * 2);
-        auto C_ref = generate_random_matrix(M * 2, N * 2);
-        auto C_our = C_ref;
-
-        std::cout
-            << matrix_layout_to_string(layout) << ' '
-            << matrix_transpose_to_string(trA) << ' '
-            << matrix_transpose_to_string(trB) << '\n';
-
-        std::cout << "A norm: " << norm(A) << '\n';
-        std::cout << "B norm: " << norm(B) << '\n';
-        std::cout << "C norm: " << norm(C_ref) << '\n';
-
-        const int lda = (trA == MatrixTranspose::NoTrans) == (layout == MatrixLayout::RowMajor) ? K * 2 : M * 2;
-        const int ldb = (trB == MatrixTranspose::NoTrans) == (layout == MatrixLayout::RowMajor) ? N * 2 : K * 2;
-        const int ldc = (layout == MatrixLayout::RowMajor) ? N * 2 : M * 2;
-
-        auto t0_ref = std::chrono::high_resolution_clock::now();
-        for(int i = 0; i < num_iters; ++i)
-        {
-            cblas_sgemm(
-                matrix_layout_to_blas_layout(layout),
-                matrix_transpose_to_blas_transpose(trA),
-                matrix_transpose_to_blas_transpose(trB),
-                M, N, K,
-                1.0,
-                A.data(), lda,
-                B.data(), ldb,
-                -0.5,
-                C_ref.data(), ldc
-            );
-        }
-        auto t1_ref = std::chrono::high_resolution_clock::now();
-        auto diff_ref = t1_ref - t0_ref;
-
-        auto t0_our = std::chrono::high_resolution_clock::now();
-        for(int i = 0; i < num_iters; ++i)
-        {
-            sgemm(
-                thread_pool,
-                layout, trA, trB,
-                M, N, K,
-                1.0,
-                A.data(), lda,
-                B.data(), ldb,
-                -0.5,
-                C_our.data(), ldc
-            );
-        }
-        auto t1_our = std::chrono::high_resolution_clock::now();
-        auto diff_our = t1_our - t0_our;
-
-        std::cout << "C_ref norm: " << norm(C_ref) << '\n';
-        std::cout << "C_our norm: " << norm(C_our) << '\n';
-        std::cout << "Relative error: " << matrix_relative_error(C_ref, C_our) << '\n';
-        std::cout << "Ref time: " << std::chrono::duration_cast<std::chrono::nanoseconds>(diff_ref).count() << " [ns]\n";
-        std::cout << "Our time: " << std::chrono::duration_cast<std::chrono::nanoseconds>(diff_our).count() << " [ns]\n";
-
-        std::cout << '\n';
-    }
-
-    void bench_sgemm(
-        ThreadPool& thread_pool
-    )
-    {
-        constexpr int M = 107;
-        constexpr int N = 213;
-        constexpr int K = 57;
-
-        std::cout << "SGEMM benchmark:\n";
-
-        for(auto layout : { MatrixLayout::RowMajor, MatrixLayout::ColMajor })
-        {
-            for(auto trA : { MatrixTranspose::NoTrans, MatrixTranspose::Trans })
-            {
-                for(auto trB : { MatrixTranspose::NoTrans, MatrixTranspose::Trans })
-                {
-                    bench_sgemm(
-                        thread_pool,
-                        layout, trA, trB,
-                        M, N, K
-                    );
-                }
-            }
-        }
-    }
-
-#endif
-
-    void print_arch()
-    {
-#if defined (USE_SSE3)
-        std::cout << "Using the sse3 implementation.\n";
-#elif defined (USE_SSE2)
-        std::cout << "Using the sse2 implementation.\n";
-#else
-        std::cout << "Using the base implementation.\n";
-#endif
-    }
-
-    void test(
-        ThreadPool& thread_pool
-    )
-    {
-#if defined (USE_BLAS)
-        print_arch();
-        test_sgemm(thread_pool);
-#else
-        std::cout << "Blas tests are only runnable when USE_BLAS is defined.\n";
-        (void)thread_pool;
-#endif
-    }
-
-    void bench(
-        ThreadPool& thread_pool
-    )
-    {
-#if defined (USE_BLAS)
-        print_arch();
-        bench_sgemm(thread_pool);
-#else
-        std::cout << "Blas benchmarks are only runnable when USE_BLAS is defined.\n";
-        (void)thread_pool;
-#endif
-    }
-}
\ No newline at end of file
diff --git a/src/extra/stockfish_blas.h b/src/extra/stockfish_blas.h
deleted file mode 100644
index f551bbf2..00000000
--- a/src/extra/stockfish_blas.h
+++ /dev/null
@@ -1,140 +0,0 @@
-#ifndef _STOCKFISH_BLAS_H_
-#define _STOCKFISH_BLAS_H_
-
-struct ThreadPool;
-
-#if defined (_MSC_VER)
-#define SF_BLAS_RESTRICT __restrict
-#elif defined (__INTEL_COMPILER)
-#define SF_BLAS_RESTRICT restrict
-#elif defined (__clang__)
-#define SF_BLAS_RESTRICT __restrict__
-#elif defined (__GNUC__)
-#define SF_BLAS_RESTRICT __restrict__
-#endif
-
-namespace Blas {
-
-    enum struct MatrixLayout {
-        RowMajor = 101,
-        ColMajor = 102
-    };
-
-    enum struct MatrixTranspose {
-        NoTrans = 111,
-        Trans = 112
-    };
-
-    void scopy(
-        const int N,
-        const float * SF_BLAS_RESTRICT X,
-        float * SF_BLAS_RESTRICT Y
-    );
-
-    void scopy(
-        const int N,
-        const float * SF_BLAS_RESTRICT X, const int incX,
-        float * SF_BLAS_RESTRICT Y, const int incY
-    );
-
-    void scopy(
-        ThreadPool& thread_pool,
-        const int N,
-        const float * SF_BLAS_RESTRICT X,
-        float * SF_BLAS_RESTRICT Y
-    );
-
-    void scopy(
-        ThreadPool& thread_pool,
-        const int N,
-        const float * SF_BLAS_RESTRICT X, const int incX,
-        float * SF_BLAS_RESTRICT Y, const int incY
-    );
-
-    void sscal(
-        const int N,
-        const float alpha,
-        float * SF_BLAS_RESTRICT X
-    );
-
-    void sscal(
-        const int N,
-        const float alpha,
-        float * SF_BLAS_RESTRICT X, const int incX
-    );
-
-    void sscal(
-        ThreadPool& thread_pool,
-        const int N,
-        const float alpha,
-        float * SF_BLAS_RESTRICT X
-    );
-
-    void sscal(
-        ThreadPool& thread_pool,
-        const int N,
-        const float alpha,
-        float * SF_BLAS_RESTRICT X, const int incX
-    );
-
-    void saxpy(
-        const int N,
-        const float alpha,
-        const float * SF_BLAS_RESTRICT X,
-        float * SF_BLAS_RESTRICT Y
-    );
-
-    void saxpy(
-        const int N,
-        const float alpha,
-        const float * SF_BLAS_RESTRICT X, const int incX,
-        float * SF_BLAS_RESTRICT Y, const int incY
-    );
-
-    void saxpy(
-        ThreadPool& thread_pool,
-        const int N,
-        const float alpha,
-        const float * SF_BLAS_RESTRICT X,
-        float * SF_BLAS_RESTRICT Y
-    );
-
-    void saxpy(
-        ThreadPool& thread_pool,
-        const int N,
-        const float alpha,
-        const float * SF_BLAS_RESTRICT X, const int incX,
-        float * SF_BLAS_RESTRICT Y, const int incY
-    );
-
-    void sgemm(
-        ThreadPool& thread_pool,
-        MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB,
-        const int M, const int N, const int K,
-        const float alpha,
-        const float * SF_BLAS_RESTRICT A, const int lda,
-        const float * SF_BLAS_RESTRICT B, const int ldb,
-        const float beta,
-        float * SF_BLAS_RESTRICT C, const int ldc
-    );
-
-    void sgemm(
-        MatrixLayout layout, MatrixTranspose TransA, MatrixTranspose TransB,
-        const int M, const int N, const int K,
-        const float alpha,
-        const float * SF_BLAS_RESTRICT A, const int lda,
-        const float * SF_BLAS_RESTRICT B, const int ldb,
-        const float beta,
-        float * SF_BLAS_RESTRICT C, const int ldc
-    );
-
-    void test(
-        ThreadPool& thread_pool
-    );
-
-    void bench(
-        ThreadPool& thread_pool
-    );
-}
-
-#endif
diff --git a/src/learn/autograd.h b/src/learn/autograd.h
deleted file mode 100644
index 7b2853df..00000000
--- a/src/learn/autograd.h
+++ /dev/null
@@ -1,667 +0,0 @@
-#ifndef LEARNER_AUTOGRAD_H
-#define LEARNER_AUTOGRAD_H
-
-#include <cmath>
-#include <utility>
-#include <type_traits>
-#include <memory>
-#include <tuple>
-#include <optional>
-#include <algorithm>
-#include <cstdint>
-
-namespace Learner
-{
-    template <typename T>
-    struct ValueWithGrad
-    {
-        T value;
-        T grad;
-
-        ValueWithGrad& operator+=(const ValueWithGrad<T>& rhs)
-        {
-            value += rhs.value;
-            grad += rhs.grad;
-            return *this;
-        }
-
-        ValueWithGrad& operator-=(const ValueWithGrad<T>& rhs)
-        {
-            value -= rhs.value;
-            grad -= rhs.grad;
-            return *this;
-        }
-
-        ValueWithGrad& operator*=(T rhs)
-        {
-            value *= rhs;
-            grad *= rhs;
-            return *this;
-        }
-
-        ValueWithGrad& operator/=(T rhs)
-        {
-            value /= rhs;
-            grad /= rhs;
-            return *this;
-        }
-
-        [[nodiscard]] ValueWithGrad abs() const
-        {
-            return { std::abs(value), std::abs(grad) };
-        }
-
-        [[nodiscard]] ValueWithGrad clamp_grad(T max) const
-        {
-            return { value, std::clamp(grad, -max, max) };
-        }
-    };
-}
-
-namespace Learner::Autograd::UnivariateStatic
-{
-
-    template <typename T>
-    struct Identity
-    {
-        using type = T;
-    };
-
-    template <typename T>
-    using Id = typename Identity<T>::type;
-
-    template <typename T>
-    using StoreValueOrRef = std::conditional_t<
-            std::is_rvalue_reference_v<T>,
-            std::remove_reference_t<T>,
-            const std::remove_reference_t<T>&
-        >;
-
-    namespace Detail
-    {
-        using CallIdType = std::uint32_t;
-
-        struct CallId
-        {
-            CallIdType call_id{};
-
-            constexpr CallId() :
-                call_id(0)
-            {
-            }
-
-            constexpr CallId(CallIdType id) :
-                call_id(id)
-            {
-            }
-
-            [[nodiscard]] bool operator==(CallId rhs) const noexcept
-            {
-                return call_id == rhs.call_id;
-            }
-
-            [[nodiscard]] bool operator!=(CallId rhs) const noexcept
-            {
-                return call_id != rhs.call_id;
-            }
-        };
-
-        [[nodiscard]] inline CallId next_call_id()
-        {
-            static thread_local CallIdType s_call_id = 0;
-            return CallId{ s_call_id++ };
-        }
-
-        template <typename T, typename Tuple>
-        struct TupleContains;
-
-        template <typename T, typename... Us>
-        struct TupleContains<T, std::tuple<Us...>> : std::disjunction<std::is_same<T, Us>...> {};
-
-        template <typename T, typename Tuple>
-        constexpr bool TupleContainsV = TupleContains<T, Tuple>::value;
-
-        template <typename... Ts>
-        constexpr bool AreAllConstantV = (std::remove_reference_t<Ts>::is_constant && ...);
-    }
-
-    template <typename T, typename ChildT>
-    struct Evaluable
-    {
-        constexpr Evaluable() = default;
-
-        // We append a unique call id so that we can invalidate the cache when
-        // the next computation starts. A single evaluation should see
-        // the same call_id at every node.
-        template <typename... ArgsTs>
-        [[nodiscard]] auto eval(const std::tuple<ArgsTs...>& args) const
-        {
-            const auto call_id = Detail::next_call_id();
-            const auto new_args = std::tuple_cat(args, std::tuple(call_id));
-            return ValueWithGrad<T>{ value(new_args), grad(new_args) };
-        }
-
-        template <typename... ArgsTs,
-            typename SFINAE = std::enable_if_t<Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
-        [[nodiscard]] auto value(const std::tuple<ArgsTs...>& args) const
-        {
-            const ChildT* this_ = static_cast<const ChildT*>(this);
-
-            const auto call_id = std::get<Detail::CallId>(args);
-            if (!value_cache.has_value() || value_cache_call_id != call_id)
-            {
-                value_cache_call_id = call_id;
-                value_cache = this_->calculate_value(args);
-            }
-
-            return *value_cache;
-        }
-
-        template <typename... ArgsTs,
-            typename SFINAE = std::enable_if_t<!Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
-        [[nodiscard]] auto value(const std::tuple<ArgsTs...>& args, ...) const
-        {
-            const auto call_id = Detail::next_call_id();
-            const auto new_args = std::tuple_cat(args, std::tuple(call_id));
-            return value(new_args);
-        }
-
-        template <typename... ArgsTs,
-            typename SFINAE = std::enable_if_t<Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
-        [[nodiscard]] auto grad(const std::tuple<ArgsTs...>& args) const
-        {
-            if constexpr (ChildT::is_constant)
-            {
-                return T(0.0);
-            }
-            else
-            {
-                const ChildT* this_ = static_cast<const ChildT*>(this);
-
-                const auto call_id = std::get<Detail::CallId>(args);
-                if (!grad_cache.has_value() || grad_cache_call_id != call_id)
-                {
-                    grad_cache_call_id = call_id;
-                    grad_cache = this_->calculate_grad(args);
-                }
-
-                return *grad_cache;
-            }
-        }
-
-        template <typename... ArgsTs,
-            typename SFINAE = std::enable_if_t<!Detail::TupleContainsV<Detail::CallId, std::tuple<ArgsTs...>>>>
-        [[nodiscard]] auto grad(const std::tuple<ArgsTs...>& args, ...) const
-        {
-            const auto call_id = Detail::next_call_id();
-            const auto new_args = std::tuple_cat(args, std::tuple(call_id));
-            return grad(new_args);
-        }
-
-    private:
-        mutable std::optional<T> value_cache;
-        mutable std::optional<T> grad_cache;
-        mutable Detail::CallId value_cache_call_id{};
-        mutable Detail::CallId grad_cache_call_id{};
-    };
-
-    template <typename T, int I>
-    struct VariableParameter : Evaluable<T, VariableParameter<T, I>>
-    {
-        using ValueType = T;
-
-        static constexpr bool is_constant = false;
-
-        constexpr VariableParameter()
-        {
-        }
-
-        template <typename... ArgsTs>
-        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
-        {
-            return std::get<I>(args);
-        }
-
-        template <typename... ArgsTs>
-        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>&) const
-        {
-            return T(1.0);
-        }
-    };
-
-    template <typename T, int I>
-    struct ConstantParameter : Evaluable<T, ConstantParameter<T, I>>
-    {
-        using ValueType = T;
-
-        static constexpr bool is_constant = true;
-
-        constexpr ConstantParameter()
-        {
-        }
-
-        template <typename... ArgsTs>
-        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
-        {
-            return std::get<I>(args);
-        }
-
-        template <typename... ArgsTs>
-        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>&) const
-        {
-            return T(0.0);
-        }
-    };
-
-    template <typename T>
-    struct Constant : Evaluable<T, Constant<T>>
-    {
-        using ValueType = T;
-
-        static constexpr bool is_constant = true;
-
-        constexpr Constant(T x) :
-            m_x(std::move(x))
-        {
-        }
-
-        template <typename... ArgsTs>
-        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>&) const
-        {
-            return m_x;
-        }
-
-        template <typename... ArgsTs>
-        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>&) const
-        {
-            return T(0.0);
-        }
-
-    private:
-        T m_x;
-    };
-
-    // The "constant" may change between executions, but is assumed to be
-    // constant during a single evaluation.
-    template <typename T>
-    struct ConstantRef : Evaluable<T, ConstantRef<T>>
-    {
-        using ValueType = T;
-
-        static constexpr bool is_constant = true;
-
-        constexpr ConstantRef(const T& x) :
-            m_x(x)
-        {
-        }
-
-        template <typename... ArgsTs>
-        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>&) const
-        {
-            return m_x;
-        }
-
-        template <typename... ArgsTs>
-        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>&) const
-        {
-            return T(0.0);
-        }
-
-    private:
-        const T& m_x;
-    };
-
-    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    struct Sum : Evaluable<T, Sum<LhsT, RhsT, T>>
-    {
-        using ValueType = T;
-
-        static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
-
-        constexpr Sum(LhsT&& lhs, RhsT&& rhs) :
-            m_lhs(std::forward<LhsT>(lhs)),
-            m_rhs(std::forward<RhsT>(rhs))
-        {
-        }
-
-        template <typename... ArgsTs>
-        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
-        {
-            return m_lhs.value(args) + m_rhs.value(args);
-        }
-
-        template <typename... ArgsTs>
-        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
-        {
-            return m_lhs.grad(args) + m_rhs.grad(args);
-        }
-
-    private:
-        StoreValueOrRef<LhsT> m_lhs;
-        StoreValueOrRef<RhsT> m_rhs;
-    };
-
-    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    [[nodiscard]] constexpr auto operator+(LhsT&& lhs, RhsT&& rhs)
-    {
-        return Sum<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
-    }
-
-    template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    [[nodiscard]] constexpr auto operator+(LhsT&& lhs, Id<T> rhs)
-    {
-        return Sum<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
-    }
-
-    template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
-    [[nodiscard]] constexpr auto operator+(Id<T> lhs, RhsT&& rhs)
-    {
-        return Sum<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
-    }
-
-    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    struct Difference : Evaluable<T, Difference<LhsT, RhsT, T>>
-    {
-        using ValueType = T;
-
-        static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
-
-        constexpr Difference(LhsT&& lhs, RhsT&& rhs) :
-            m_lhs(std::forward<LhsT>(lhs)),
-            m_rhs(std::forward<RhsT>(rhs))
-        {
-        }
-
-        template <typename... ArgsTs>
-        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
-        {
-            return m_lhs.value(args) - m_rhs.value(args);
-        }
-
-        template <typename... ArgsTs>
-        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
-        {
-            return m_lhs.grad(args) - m_rhs.grad(args);
-        }
-
-    private:
-        StoreValueOrRef<LhsT> m_lhs;
-        StoreValueOrRef<RhsT> m_rhs;
-    };
-
-    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    [[nodiscard]] constexpr auto operator-(LhsT&& lhs, RhsT&& rhs)
-    {
-        return Difference<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
-    }
-
-    template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    [[nodiscard]] constexpr auto operator-(LhsT&& lhs, Id<T> rhs)
-    {
-        return Difference<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
-    }
-
-    template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
-    [[nodiscard]] constexpr auto operator-(Id<T> lhs, RhsT&& rhs)
-    {
-        return Difference<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
-    }
-
-    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    struct Product : Evaluable<T, Product<LhsT, RhsT, T>>
-    {
-        using ValueType = T;
-
-        static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
-
-        constexpr Product(LhsT&& lhs, RhsT&& rhs) :
-            m_lhs(std::forward<LhsT>(lhs)),
-            m_rhs(std::forward<RhsT>(rhs))
-        {
-        }
-
-        template <typename... ArgsTs>
-        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
-        {
-            return m_lhs.value(args) * m_rhs.value(args);
-        }
-
-        template <typename... ArgsTs>
-        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
-        {
-            return m_lhs.grad(args) * m_rhs.value(args) + m_lhs.value(args) * m_rhs.grad(args);
-        }
-
-    private:
-        StoreValueOrRef<LhsT> m_lhs;
-        StoreValueOrRef<RhsT> m_rhs;
-    };
-
-    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    [[nodiscard]] constexpr auto operator*(LhsT&& lhs, RhsT&& rhs)
-    {
-        return Product<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
-    }
-
-    template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    [[nodiscard]] constexpr auto operator*(LhsT&& lhs, Id<T> rhs)
-    {
-        return Product<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
-    }
-
-    template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
-    [[nodiscard]] constexpr auto operator*(Id<T> lhs, RhsT&& rhs)
-    {
-        return Product<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
-    }
-
-    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    struct Quotient : Evaluable<T, Quotient<LhsT, RhsT, T>>
-    {
-        using ValueType = T;
-
-        static constexpr bool is_constant = Detail::AreAllConstantV<LhsT, RhsT>;
-
-        constexpr Quotient(LhsT&& lhs, RhsT&& rhs) :
-            m_lhs(std::forward<LhsT>(lhs)),
-            m_rhs(std::forward<RhsT>(rhs))
-        {
-        }
-
-        template <typename... ArgsTs>
-        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
-        {
-            return m_lhs.value(args) / m_rhs.value(args);
-        }
-
-        template <typename... ArgsTs>
-        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
-        {
-            auto g = m_rhs.value(args);
-            return (m_lhs.grad(args) * g - m_lhs.value(args) * m_rhs.grad(args)) / (g * g);
-        }
-
-    private:
-        StoreValueOrRef<LhsT> m_lhs;
-        StoreValueOrRef<RhsT> m_rhs;
-    };
-
-    template <typename LhsT, typename RhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    [[nodiscard]] constexpr auto operator/(LhsT&& lhs, RhsT&& rhs)
-    {
-        return Quotient<LhsT&&, RhsT&&>(std::forward<LhsT>(lhs), std::forward<RhsT>(rhs));
-    }
-
-    template <typename LhsT, typename T = typename std::remove_reference_t<LhsT>::ValueType>
-    [[nodiscard]] constexpr auto operator/(LhsT&& lhs, Id<T> rhs)
-    {
-        return Quotient<LhsT&&, Constant<T>&&>(std::forward<LhsT>(lhs), Constant(rhs));
-    }
-
-    template <typename RhsT, typename T = typename std::remove_reference_t<RhsT>::ValueType>
-    [[nodiscard]] constexpr auto operator/(Id<T> lhs, RhsT&& rhs)
-    {
-        return Quotient<Constant<T>&&, RhsT&&>(Constant(lhs), std::forward<RhsT>(rhs));
-    }
-
-    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
-    struct Negation : Evaluable<T, Negation<ArgT, T>>
-    {
-        using ValueType = T;
-
-        static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
-
-        constexpr explicit Negation(ArgT&& x) :
-            m_x(std::forward<ArgT>(x))
-        {
-        }
-
-        template <typename... ArgsTs>
-        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
-        {
-            return -m_x.value(args);
-        }
-
-        template <typename... ArgsTs>
-        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
-        {
-            return -m_x.grad(args);
-        }
-
-    private:
-        StoreValueOrRef<ArgT> m_x;
-    };
-
-    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
-    [[nodiscard]] constexpr auto operator-(ArgT&& x)
-    {
-        return Negation<ArgT&&>(std::forward<ArgT>(x));
-    }
-
-    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
-    struct Sigmoid : Evaluable<T, Sigmoid<ArgT, T>>
-    {
-        using ValueType = T;
-
-        static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
-
-        constexpr explicit Sigmoid(ArgT&& x) :
-            m_x(std::forward<ArgT>(x))
-        {
-        }
-
-        template <typename... ArgsTs>
-        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
-        {
-            return value_(m_x.value(args));
-        }
-
-        template <typename... ArgsTs>
-        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
-        {
-            return m_x.grad(args) * grad_(m_x.value(args));
-        }
-
-    private:
-        StoreValueOrRef<ArgT> m_x;
-
-        [[nodiscard]] T value_(T x) const
-        {
-            return 1.0 / (1.0 + std::exp(-x));
-        }
-
-        [[nodiscard]] T grad_(T x) const
-        {
-            return value_(x) * (1.0 - value_(x));
-        }
-    };
-
-    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
-    [[nodiscard]] constexpr auto sigmoid(ArgT&& x)
-    {
-        return Sigmoid<ArgT&&>(std::forward<ArgT>(x));
-    }
-
-    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
-    struct Pow : Evaluable<T, Pow<ArgT, T>>
-    {
-        using ValueType = T;
-
-        static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
-
-        constexpr explicit Pow(ArgT&& x, Id<T> exponent) :
-            m_x(std::forward<ArgT>(x)),
-            m_exponent(std::move(exponent))
-        {
-        }
-
-        template <typename... ArgsTs>
-        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
-        {
-            return std::pow(m_x.value(args), m_exponent);
-        }
-
-        template <typename... ArgsTs>
-        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
-        {
-            return m_exponent * std::pow(m_x.value(args), m_exponent - T(1.0)) * m_x.grad(args);
-        }
-
-    private:
-        StoreValueOrRef<ArgT> m_x;
-        T m_exponent;
-    };
-
-    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
-    [[nodiscard]] constexpr auto pow(ArgT&& x, Id<T> exp)
-    {
-        return Pow<ArgT&&>(std::forward<ArgT>(x), std::move(exp));
-    }
-
-    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
-    struct Log : Evaluable<T, Log<ArgT, T>>
-    {
-        using ValueType = T;
-
-        static constexpr bool is_constant = Detail::AreAllConstantV<ArgT>;
-
-        constexpr explicit Log(ArgT&& x) :
-            m_x(std::forward<ArgT>(x))
-        {
-        }
-
-        template <typename... ArgsTs>
-        [[nodiscard]] T calculate_value(const std::tuple<ArgsTs...>& args) const
-        {
-            return value_(m_x.value(args));
-        }
-
-        template <typename... ArgsTs>
-        [[nodiscard]] T calculate_grad(const std::tuple<ArgsTs...>& args) const
-        {
-            return m_x.grad(args) * grad_(m_x.value(args));
-        }
-
-    private:
-        StoreValueOrRef<ArgT> m_x;
-
-        T value_(T x) const
-        {
-            return std::log(x);
-        }
-
-        T grad_(T x) const
-        {
-            return 1.0 / x;
-        }
-    };
-
-    template <typename ArgT, typename T = typename std::remove_reference_t<ArgT>::ValueType>
-    [[nodiscard]] constexpr auto log(ArgT&& x)
-    {
-        return Log<ArgT&&>(std::forward<ArgT>(x));
-    }
-
-}
-
-#endif
\ No newline at end of file
diff --git a/src/learn/gensfen.cpp b/src/learn/gensfen.cpp
index b28afa13..e5ddd6aa 100644
--- a/src/learn/gensfen.cpp
+++ b/src/learn/gensfen.cpp
@@ -13,7 +13,6 @@
 #include "extra/nnue_data_binpack_format.h"
 
 #include "nnue/evaluate_nnue.h"
-#include "nnue/evaluate_nnue_learner.h"
 
 #include "syzygy/tbprobe.h"
 
@@ -493,8 +492,8 @@ namespace Learner
 
         // has it reached the max length or is a draw by fifty-move rule
         // or by 3-fold repetition
-        if (ply >= params.write_maxply 
-            || pos.is_fifty_move_draw() 
+        if (ply >= params.write_maxply
+            || pos.is_fifty_move_draw()
             || pos.is_three_fold_repetition())
         {
             return 0;
diff --git a/src/learn/gensfen_nonpv.cpp b/src/learn/gensfen_nonpv.cpp
index ca365034..098511fe 100644
--- a/src/learn/gensfen_nonpv.cpp
+++ b/src/learn/gensfen_nonpv.cpp
@@ -13,7 +13,6 @@
 #include "extra/nnue_data_binpack_format.h"
 
 #include "nnue/evaluate_nnue.h"
-#include "nnue/evaluate_nnue_learner.h"
 
 #include "syzygy/tbprobe.h"
 
diff --git a/src/learn/half_float.h b/src/learn/half_float.h
deleted file mode 100644
index 5808a786..00000000
--- a/src/learn/half_float.h
+++ /dev/null
@@ -1,133 +0,0 @@
-﻿#ifndef __HALF_FLOAT_H__
-#define __HALF_FLOAT_H__
-
-// Half Float Library by yaneurao
-// (16-bit float)
-
-// Floating point operation by 16bit type
-// Assume that the float type code generated by the compiler is in IEEE 754 format and use it.
-
-#include "types.h"
-
-namespace HalfFloat
-{
-    // IEEE 754 float 32 format is :
-    //   sign(1bit) + exponent(8bits) + fraction(23bits) = 32bits
-    //
-    // Our float16 format is :
-    //   sign(1bit) + exponent(5bits) + fraction(10bits) = 16bits
-    union float32_converter
-    {
-        int32_t n;
-        float f;
-    };
-
-
-    // 16-bit float
-    struct float16
-    {
-        // --- constructors
-
-        float16() {}
-        float16(int16_t n) { from_float((float)n);  }
-        float16(int32_t n) { from_float((float)n); }
-        float16(float n) { from_float(n); }
-        float16(double n) { from_float((float)n); }
-
-        // build from a float
-        void from_float(float f) { *this = to_float16(f); }
-
-        // --- implicit converters
-
-        operator int32_t() const { return (int32_t)to_float(*this); }
-        operator float() const { return to_float(*this); }
-        operator double() const { return double(to_float(*this)); }
-
-        // --- operators
-
-        float16 operator += (float16 rhs) { from_float(to_float(*this) + to_float(rhs)); return *this; }
-        float16 operator -= (float16 rhs) { from_float(to_float(*this) - to_float(rhs)); return *this; }
-        float16 operator *= (float16 rhs) { from_float(to_float(*this) * to_float(rhs)); return *this; }
-        float16 operator /= (float16 rhs) { from_float(to_float(*this) / to_float(rhs)); return *this; }
-        float16 operator + (float16 rhs) const { return float16(*this) += rhs; }
-        float16 operator - (float16 rhs) const { return float16(*this) -= rhs; }
-        float16 operator * (float16 rhs) const { return float16(*this) *= rhs; }
-        float16 operator / (float16 rhs) const { return float16(*this) /= rhs; }
-        float16 operator - () const { return float16(-to_float(*this)); }
-        bool operator == (float16 rhs) const { return this->v_ == rhs.v_; }
-        bool operator != (float16 rhs) const { return !(*this == rhs); }
-
-        static void UnitTest() { unit_test(); }
-
-    private:
-
-        // --- entity
-
-        uint16_t v_;
-
-        // --- conversion between float and float16
-
-        static float16 to_float16(float f)
-        {
-            float32_converter c;
-            c.f = f;
-            u32 n = c.n;
-
-            // The sign bit is MSB in common.
-            uint16_t sign_bit = (n >> 16) & 0x8000;
-
-            // The exponent of IEEE 754's float 32 is biased +127 , so we change this bias into +15 and limited to 5-bit.
-            uint16_t exponent = (((n >> 23) - 127 + 15) & 0x1f) << 10;
-
-            // The fraction is limited to 10-bit.
-            uint16_t fraction = (n >> (23-10)) & 0x3ff;
-
-            float16 f_;
-            f_.v_ = sign_bit | exponent | fraction;
-
-            return f_;
-        }
-
-        static float to_float(float16 v)
-        {
-            u32 sign_bit = (v.v_ & 0x8000) << 16;
-            u32 exponent = ((((v.v_ >> 10) & 0x1f) - 15 + 127) & 0xff) << 23;
-            u32 fraction = (v.v_ & 0x3ff) << (23 - 10);
-
-            float32_converter c;
-            c.n = sign_bit | exponent | fraction;
-            return c.f;
-        }
-
-        // It is not a unit test, but I confirmed that it can be calculated. I'll fix the code later (maybe).
-        static void unit_test()
-        {
-            float16 a, b, c, d;
-            a = 1;
-            std::cout << (float)a << std::endl;
-            b = -118.625;
-            std::cout << (float)b << std::endl;
-            c = 2.5;
-            std::cout << (float)c << std::endl;
-            d = a + c;
-            std::cout << (float)d << std::endl;
-
-            c *= 1.5;
-            std::cout << (float)c << std::endl;
-
-            b /= 3;
-            std::cout << (float)b << std::endl;
-
-            float f1 = 1.5;
-            a += f1;
-            std::cout << (float)a << std::endl;
-
-            a += f1 * (float)a;
-            std::cout << (float)a << std::endl;
-        }
-
-    };
-
-}
-
-#endif // __HALF_FLOAT_H__
diff --git a/src/learn/learn.cpp b/src/learn/learn.cpp
deleted file mode 100644
index 9c4546a6..00000000
--- a/src/learn/learn.cpp
+++ /dev/null
@@ -1,1474 +0,0 @@
-﻿// Learning routines:
-//
-// 1) Automatic generation of game records in .bin format
-// → "gensfen" command
-//
-// 2) Learning evaluation function parameters from the generated .bin files
-// → "learn" command
-//
-// → Shuffle in the teacher phase is also an extension of this command.
-// Example) "learn shuffle"
-//
-// 3) Automatic generation of fixed traces
-// → "makebook think" command
-// → implemented in extra/book/book.cpp
-//
-// 4) Post-station automatic review mode
-// → I will not be involved in the engine because it is a problem that the GUI should assist.
-// etc..
-
-#include "learn.h"
-
-#include "autograd.h"
-#include "sfen_reader.h"
-
-#include "misc.h"
-#include "position.h"
-#include "thread.h"
-#include "tt.h"
-#include "uci.h"
-#include "search.h"
-#include "timeman.h"
-
-#include "nnue/evaluate_nnue.h"
-#include "nnue/evaluate_nnue_learner.h"
-
-#include "syzygy/tbprobe.h"
-
-#include <chrono>
-#include <climits>
-#include <cmath>    // std::exp(),std::pow(),std::log()
-#include <cstring>  // memcpy()
-#include <fstream>
-#include <iomanip>
-#include <limits>
-#include <list>
-#include <memory>
-#include <optional>
-#include <random>
-#include <regex>
-#include <shared_mutex>
-#include <sstream>
-#include <unordered_set>
-#include <iostream>
-#include <map>
-#include <algorithm>
-
-#if defined (_OPENMP)
-#include <omp.h>
-#endif
-
-using namespace std;
-
-template <typename T>
-T operator +=(std::atomic<T>& x, const T rhs)
-{
-    T old = x.load(std::memory_order_consume);
-
-    // It is allowed that the value is rewritten from other thread at this timing.
-    // The idea that the value is not destroyed is good.
-    T desired = old + rhs;
-    while (!x.compare_exchange_weak(old, desired, std::memory_order_release, std::memory_order_consume))
-        desired = old + rhs;
-    return desired;
-}
-template <typename T>
-T operator -= (std::atomic<T>& x, const T rhs) { return x += -rhs; }
-
-namespace Learner
-{
-    static double winning_probability_coefficient = 1.0 / PawnValueEg / 4.0 * std::log(10.0);
-
-    // Score scale factors. ex) If we set src_score_min_value = 0.0,
-    // src_score_max_value = 1.0, dest_score_min_value = 0.0,
-    // dest_score_max_value = 10000.0, [0.0, 1.0] will be scaled to [0, 10000].
-    static double src_score_min_value = 0.0;
-    static double src_score_max_value = 1.0;
-    static double dest_score_min_value = 0.0;
-    static double dest_score_max_value = 1.0;
-
-    // A constant used in elmo (WCSC27). Adjustment required.
-    // Since elmo does not internally divide the expression, the value is different.
-    // You can set this value with the learn command.
-    // 0.33 is equivalent to the constant (0.5) used in elmo (WCSC27)
-    static double elmo_lambda_low = 1.0;
-    static double elmo_lambda_high = 1.0;
-    static double elmo_lambda_limit = 32000;
-
-    // Using stockfish's WDL with win rate model instead of sigmoid
-    static bool use_wdl = false;
-
-    static void append_files_from_dir(
-        std::vector<std::string>& filenames,
-        const std::string& base_dir,
-        const std::string& target_dir)
-    {
-        string kif_base_dir = Path::combine(base_dir, target_dir);
-
-        sys::path p(kif_base_dir); // Origin of enumeration
-        std::for_each(sys::directory_iterator(p), sys::directory_iterator(),
-            [&](const sys::path& path) {
-                if (sys::is_regular_file(path))
-                    filenames.push_back(Path::combine(target_dir, path.filename().generic_string()));
-            });
-    }
-
-    static void rebase_files(
-        std::vector<std::string>& filenames,
-        const std::string& base_dir)
-    {
-        for (auto& file : filenames)
-        {
-            file = Path::combine(base_dir, file);
-        }
-    }
-
-    static double calculate_lambda(double teacher_signal)
-    {
-        // If the evaluation value in deep search exceeds elmo_lambda_limit
-        // then apply elmo_lambda_high instead of elmo_lambda_low.
-        const double lambda =
-            (std::abs(teacher_signal) >= elmo_lambda_limit)
-            ? elmo_lambda_high
-            : elmo_lambda_low;
-
-        return lambda;
-    }
-
-    // We use our own simple static autograd for automatic
-    // differentiation of the loss function. While it works it has it's caveats.
-    // To work fast enough it requires memoization and reference semantics.
-    // Memoization is mostly opaque to the user and is only per eval basis.
-    // As for reference semantics, we cannot copy every node,
-    // because we need a way to reuse computation.
-    // But we can't really use shared_ptr because of the overhead. That means
-    // that we have to ensure all parts of a loss expression are not destroyed
-    // before use. When lvalue references are used to construct a node it will
-    // store just a reference, it only perform a copy of the rvalue reference arguments.
-    // This means that we need some storage for the whole computation tree
-    // that keeps the values after function returns and never moves them to
-    // a different memory location. This means that we cannot use local
-    // variables and just return by value - because there may be dangling references left.
-    // We also cannot create a struct with this tree on demand because one cannot
-    // use `auto` as a struct members. This is a big issue, and the only way
-    // to solve it as of now is to use static thread_local variables and rely on the
-    // following assumptions:
-    // 1. the expression node must not change for the duration of the program
-    //    within a single instance of a function. This is usually not a problem
-    //    because almost all information is carried by the type. There is an
-    //    exception though, we have ConstantRef and Constant nodes that
-    //    do not encode the constants in the type, so it's possible
-    //    that these nodes are different on the first call to the function
-    //    then later. We MUST ensure that one function is only ever used
-    //    for one specific expression.
-    // 2. thread_local variables are not expensive. Usually after creation
-    //    it only requires a single unsynchronized boolean check and that's
-    //    how most compilers implement it.
-    //
-    // So the general way to do things right now is to use static thread_local
-    // variables for all named autograd nodes. Results being nodes should be
-    // returned by reference, so that there's no need to copy the returned objects.
-    // Parameters being nodes should be taken by lvalue reference if they are
-    // used more than once (to enable reference semantics to reuse computation),
-    // but they can be rvalues and forward on first use if there's only one use
-    // of the node in the scope.
-    // We must keep in mind that the node tree created by such a function
-    // is never going to change as thread_local variables are initialized
-    // on first call. This means that one cannot use one function as a factory
-    // for different autograd expression trees.
-
-    template <typename ShallowT, typename TeacherT, typename ResultT, typename LambdaT>
-    static auto& cross_entropy_(
-        ShallowT& q_,
-        TeacherT& p_,
-        ResultT& t_,
-        LambdaT& lambda_
-    )
-    {
-        using namespace Learner::Autograd::UnivariateStatic;
-
-        constexpr double epsilon = 1e-12;
-
-        static thread_local auto teacher_entropy_ = -(p_ * log(p_ + epsilon) + (1.0 - p_) * log(1.0 - p_ + epsilon));
-        static thread_local auto outcome_entropy_ = -(t_ * log(t_ + epsilon) + (1.0 - t_) * log(1.0 - t_ + epsilon));
-        static thread_local auto teacher_loss_ = -(p_ * log(q_ + epsilon) + (1.0 - p_) * log(1.0 - q_ + epsilon));
-        static thread_local auto outcome_loss_ = -(t_ * log(q_ + epsilon) + (1.0 - t_) * log(1.0 - q_ + epsilon));
-        static thread_local auto result_ = lambda_ * teacher_loss_ + (1.0 - lambda_) * outcome_loss_;
-        static thread_local auto entropy_ = lambda_ * teacher_entropy_ + (1.0 - lambda_) * outcome_entropy_;
-        static thread_local auto cross_entropy_ = result_ - entropy_;
-
-        return cross_entropy_;
-    }
-
-    template <typename ValueT>
-    static auto& scale_score_(ValueT&& v_)
-    {
-        using namespace Learner::Autograd::UnivariateStatic;
-
-        // Normalize to [0.0, 1.0].
-        static thread_local auto normalized_ =
-            (std::forward<ValueT>(v_) - ConstantRef<double>(src_score_min_value))
-            / (ConstantRef<double>(src_score_max_value) - ConstantRef<double>(src_score_min_value));
-
-        // Scale to [dest_score_min_value, dest_score_max_value].
-        static thread_local auto scaled_ =
-            normalized_
-            * (ConstantRef<double>(dest_score_max_value) - ConstantRef<double>(dest_score_min_value))
-            + ConstantRef<double>(dest_score_min_value);
-
-        return scaled_;
-    }
-
-    static Value scale_score(Value v)
-    {
-        // Normalize to [0.0, 1.0].
-        auto normalized =
-            ((double)v - src_score_min_value)
-            / (src_score_max_value - src_score_min_value);
-
-        // Scale to [dest_score_min_value, dest_score_max_value].
-        auto scaled =
-            normalized
-            * (dest_score_max_value - dest_score_min_value)
-            + dest_score_min_value;
-
-        return Value(scaled);
-    }
-
-    template <typename ValueT>
-    static auto& expected_perf_(ValueT&& v_)
-    {
-        using namespace Learner::Autograd::UnivariateStatic;
-
-        static thread_local auto perf_ = sigmoid(std::forward<ValueT>(v_) * ConstantRef<double>(winning_probability_coefficient));
-
-        return perf_;
-    }
-
-    template <typename ValueT, typename PlyT, typename T = typename ValueT::ValueType>
-    static auto& expected_perf_use_wdl_(
-        ValueT& v_,
-        PlyT&& ply_
-    )
-    {
-        using namespace Learner::Autograd::UnivariateStatic;
-
-        // Coefficients of a 3rd order polynomial fit based on fishtest data
-        // for two parameters needed to transform eval to the argument of a
-        // logistic function.
-        static constexpr T as[] = { -8.24404295, 64.23892342, -95.73056462, 153.86478679 };
-        static constexpr T bs[] = { -3.37154371, 28.44489198, -56.67657741,  72.05858751 };
-
-        // The model captures only up to 240 plies, so limit input (and rescale)
-        static thread_local auto m_ = std::forward<PlyT>(ply_) / 64.0;
-
-        static thread_local auto a_ = (((as[0] * m_ + as[1]) * m_ + as[2]) * m_) + as[3];
-        static thread_local auto b_ = (((bs[0] * m_ + bs[1]) * m_ + bs[2]) * m_) + bs[3];
-
-        // Return win rate in per mille
-        static thread_local auto sv_ = (v_ - a_) / b_;
-        static thread_local auto svn_ = (-v_ - a_) / b_;
-
-        static thread_local auto win_pct_ = sigmoid(sv_);
-        static thread_local auto loss_pct_ = sigmoid(svn_);
-
-        static thread_local auto draw_pct_ = 1.0 - win_pct_ - loss_pct_;
-
-        static thread_local auto perf_ = win_pct_ + draw_pct_ * 0.5;
-
-        return perf_;
-    }
-
-    static double expected_perf_use_wdl(
-        Value v,
-        int ply
-    )
-    {
-        // Coefficients of a 3rd order polynomial fit based on fishtest data
-        // for two parameters needed to transform eval to the argument of a
-        // logistic function.
-        static constexpr double as[] = { -8.24404295, 64.23892342, -95.73056462, 153.86478679 };
-        static constexpr double bs[] = { -3.37154371, 28.44489198, -56.67657741,  72.05858751 };
-
-        // The model captures only up to 240 plies, so limit input (and rescale)
-        auto m = ply / 64.0;
-
-        auto a = (((as[0] * m + as[1]) * m + as[2]) * m) + as[3];
-        auto b = (((bs[0] * m + bs[1]) * m + bs[2]) * m) + bs[3];
-
-        // Return win rate in per mille
-        auto sv = ((double)v - a) / b;
-        auto svn = ((double)-v - a) / b;
-
-        auto win_pct = Math::sigmoid(sv);
-        auto loss_pct = Math::sigmoid(svn);
-
-        auto draw_pct = 1.0 - win_pct - loss_pct;
-
-        auto perf = win_pct + draw_pct * 0.5;
-
-        return perf;
-    }
-
-    [[maybe_unused]] static ValueWithGrad<double> get_loss_noob(
-        Value shallow, Value teacher_signal, int result, int /* ply */)
-    {
-        using namespace Learner::Autograd::UnivariateStatic;
-
-        static thread_local auto q_ = VariableParameter<double, 0>{};
-        static thread_local auto p_ = ConstantParameter<double, 1>{};
-        static thread_local auto loss_ = pow(q_ - p_, 2.0) * (1.0 / (2400.0 * 2.0 * 600.0));
-
-        auto args = std::tuple(
-            (double)shallow,
-            (double)teacher_signal,
-            (double)result,
-            calculate_lambda(teacher_signal)
-        );
-
-        return loss_.eval(args);
-    }
-
-    static auto& get_loss_cross_entropy_()
-    {
-        using namespace Learner::Autograd::UnivariateStatic;
-
-        static thread_local auto& q_ = expected_perf_(VariableParameter<double, 0>{});
-        static thread_local auto& p_ = expected_perf_(scale_score_(ConstantParameter<double, 1>{}));
-        static thread_local auto t_ = (ConstantParameter<double, 2>{} + 1.0) * 0.5;
-        static thread_local auto lambda_ = ConstantParameter<double, 3>{};
-        static thread_local auto& loss_ = cross_entropy_(q_, p_, t_, lambda_);
-
-        return loss_;
-    }
-
-    static auto get_loss_cross_entropy_args(
-        Value shallow, Value teacher_signal, int result)
-    {
-        return std::tuple(
-            (double)shallow,
-            (double)teacher_signal,
-            (double)result,
-            calculate_lambda(teacher_signal)
-        );
-    }
-
-    static ValueWithGrad<double> get_loss_cross_entropy(
-        Value shallow, Value teacher_signal, int result, int /* ply */)
-    {
-        using namespace Learner::Autograd::UnivariateStatic;
-
-        static thread_local auto& loss_ = get_loss_cross_entropy_();
-
-        auto args = get_loss_cross_entropy_args(shallow, teacher_signal, result);
-
-        return loss_.eval(args);
-    }
-
-    static ValueWithGrad<double> get_loss_cross_entropy_no_grad(
-        Value shallow, Value teacher_signal, int result, int /* ply */)
-    {
-        using namespace Learner::Autograd::UnivariateStatic;
-
-        static thread_local auto& loss_ = get_loss_cross_entropy_();
-
-        auto args = get_loss_cross_entropy_args(shallow, teacher_signal, result);
-
-        return { loss_.value(args), 0.0 };
-    }
-
-    static auto& get_loss_cross_entropy_use_wdl_()
-    {
-        using namespace Learner::Autograd::UnivariateStatic;
-
-        static thread_local auto ply_ = ConstantParameter<double, 4>{};
-        static thread_local auto shallow_ = VariableParameter<double, 0>{};
-        static thread_local auto& q_ = expected_perf_use_wdl_(shallow_, ply_);
-        // We could do just this but MSVC crashes with an internal compiler error :(
-        // static thread_local auto& scaled_teacher_ = scale_score_(ConstantParameter<double, 1>{});
-        // static thread_local auto& p_ = expected_perf_use_wdl_(scaled_teacher_, ply_);
-        static thread_local auto p_ = ConstantParameter<double, 1>{};
-        static thread_local auto t_ = (ConstantParameter<double, 2>{} + 1.0) * 0.5;
-        static thread_local auto lambda_ = ConstantParameter<double, 3>{};
-        static thread_local auto& loss_ = cross_entropy_(q_, p_, t_, lambda_);
-
-        return loss_;
-    }
-
-    static auto get_loss_cross_entropy_use_wdl_args(
-        Value shallow, Value teacher_signal, int result, int ply)
-    {
-        return std::tuple(
-            (double)shallow,
-            // This is required because otherwise MSVC crashes :(
-            expected_perf_use_wdl(scale_score(teacher_signal), ply),
-            (double)result,
-            calculate_lambda(teacher_signal),
-            (double)std::min(240, ply)
-        );
-    }
-
-    static ValueWithGrad<double> get_loss_cross_entropy_use_wdl(
-        Value shallow, Value teacher_signal, int result, int ply)
-    {
-        using namespace Learner::Autograd::UnivariateStatic;
-
-        static thread_local auto& loss_ = get_loss_cross_entropy_use_wdl_();
-
-        auto args = get_loss_cross_entropy_use_wdl_args(shallow, teacher_signal, result, ply);
-
-        return loss_.eval(args);
-    }
-
-    static ValueWithGrad<double> get_loss_cross_entropy_use_wdl_no_grad(
-        Value shallow, Value teacher_signal, int result, int ply)
-    {
-        using namespace Learner::Autograd::UnivariateStatic;
-
-        static thread_local auto& loss_ = get_loss_cross_entropy_use_wdl_();
-
-        auto args = get_loss_cross_entropy_use_wdl_args(shallow, teacher_signal, result, ply);
-
-        return { loss_.value(args), 0.0 };
-    }
-
-    static auto get_loss(Value shallow, Value teacher_signal, int result, int ply)
-    {
-        using namespace Learner::Autograd::UnivariateStatic;
-
-        if (use_wdl)
-        {
-            return get_loss_cross_entropy_use_wdl(shallow, teacher_signal, result, ply);
-        }
-        else
-        {
-            return get_loss_cross_entropy(shallow, teacher_signal, result, ply);
-        }
-    }
-
-    static auto get_loss_no_grad(Value shallow, Value teacher_signal, int result, int ply)
-    {
-        using namespace Learner::Autograd::UnivariateStatic;
-
-        if (use_wdl)
-        {
-            return get_loss_cross_entropy_use_wdl_no_grad(shallow, teacher_signal, result, ply);
-        }
-        else
-        {
-            return get_loss_cross_entropy_no_grad(shallow, teacher_signal, result, ply);
-        }
-    }
-
-    [[maybe_unused]] static auto get_loss(
-        Value teacher_signal,
-        Value shallow,
-        const PackedSfenValue& psv)
-    {
-        return get_loss(shallow, teacher_signal, psv.game_result, psv.gamePly);
-    }
-
-    static auto get_loss_no_grad(
-        Value teacher_signal,
-        Value shallow,
-        const PackedSfenValue& psv)
-    {
-        return get_loss_no_grad(shallow, teacher_signal, psv.game_result, psv.gamePly);
-    }
-
-    // Class to generate sfen with multiple threads
-    struct LearnerThink
-    {
-        struct Params
-        {
-            // Mini batch size size. Be sure to set it on the side that uses this class.
-            uint64_t mini_batch_size = LEARN_MINI_BATCH_SIZE;
-
-            // Number of phases used for calculation such as mse
-            // mini-batch size = 1M is standard, so 0.2% of that should be negligible in terms of time.
-            // Since search() is performed with depth = 1 in calculation of
-            // move match rate, simple comparison is not possible...
-            uint64_t validation_count = 2000;
-
-            // Option to exclude early stage from learning
-            int reduction_gameply = 1;
-
-            // If the absolute value of the evaluation value of the deep search
-            // of the teacher phase exceeds this value, discard the teacher phase.
-            int eval_limit = 32000;
-
-            // Flag whether to dig a folder each time the evaluation function is saved.
-            // If true, do not dig the folder.
-            bool save_only_once = false;
-
-            bool shuffle = true;
-
-            bool verbose = false;
-
-            double newbob_decay = 0.5;
-            int newbob_num_trials = 4;
-            uint64_t auto_lr_drop = 0;
-
-            std::string best_nn_directory;
-
-            uint64_t eval_save_interval = LEARN_EVAL_SAVE_INTERVAL;
-            uint64_t loss_output_interval = 1'000'000;
-
-            size_t sfen_read_size = SfenReader::DEFAULT_SFEN_READ_SIZE;
-            size_t thread_buffer_size = SfenReader::DEFAULT_THREAD_BUFFER_SIZE;
-
-            bool use_draw_games_in_training = true;
-            bool use_draw_games_in_validation = true;
-            bool skip_duplicated_positions_in_training = true;
-
-            bool assume_quiet = false;
-            bool smart_fen_skipping = false;
-            bool smart_fen_skipping_for_validation = false;
-
-            double learning_rate = 1.0;
-            double warmup_learning_rate = 0.1;
-            double max_grad = 1.0;
-
-            string validation_set_file_name;
-            string seed;
-
-            std::vector<std::string> filenames;
-
-            uint64_t num_threads;
-
-            void enforce_constraints()
-            {
-                num_threads = Options["Threads"];
-
-                if (loss_output_interval == 0)
-                {
-                    loss_output_interval = LEARN_RMSE_OUTPUT_INTERVAL * mini_batch_size;
-                }
-
-                // If reduction_gameply is set to 0, rand(0) will be divided by 0, so correct it to 1.
-                reduction_gameply = max(reduction_gameply, 1);
-
-                if (newbob_decay != 1.0 && !Options["SkipLoadingEval"]) {
-                    // Save the current net to [EvalSaveDir]\original.
-                    Eval::NNUE::save_eval("original");
-
-                    // Set the folder above to best_nn_directory so that the trainer can
-                    // resotre the network parameters from the original net file.
-                    best_nn_directory =
-                        Path::combine(Options["EvalSaveDir"], "original");
-                }
-            }
-        };
-
-        LearnerThink(const Params& prm) :
-            params(prm),
-            init_prng(prm.seed),
-            train_sr(
-                prm.filenames,
-                prm.shuffle,
-                SfenReaderMode::Cyclic,
-                prm.num_threads,
-                std::to_string(init_prng.next_random_seed()),
-                prm.sfen_read_size,
-                prm.thread_buffer_size),
-            validation_sr(
-                prm.validation_set_file_name.empty() ? prm.filenames : std::vector<std::string>{ prm.validation_set_file_name },
-                prm.shuffle,
-                SfenReaderMode::Cyclic,
-                1,
-                std::to_string(init_prng.next_random_seed()),
-                std::min<size_t>(prm.validation_count * 10, 1000000),
-                prm.thread_buffer_size),
-            learn_loss_sum{}
-        {
-            save_count = 0;
-            loss_output_count = 0;
-            last_lr_drop = 0;
-            best_loss = std::numeric_limits<double>::infinity();
-            latest_loss_sum = 0.0;
-            latest_loss_count = 0;
-            total_done = 0;
-            trials = params.newbob_num_trials;
-            dir_number = 0;
-
-            prngs.reserve(prm.num_threads);
-            for (uint64_t i = 0; i < prm.num_threads; ++i)
-            {
-                prngs.emplace_back(init_prng.next_random_seed());
-            }
-        }
-
-        void learn(uint64_t epochs, uint64_t warmup_epochs = 0);
-
-    private:
-        static void set_learning_search_limits();
-
-        PSVector fetch_next_validation_set();
-
-        void learn_worker(Thread& th, std::atomic<uint64_t>& counter, uint64_t limit);
-
-        void update_weights(const PSVector& psv, uint64_t epoch);
-        void update_weights_warmup(uint64_t warmup_epoch);
-
-        void calc_loss(const PSVector& psv, uint64_t epoch);
-
-        void calc_loss_worker(
-            Thread& th,
-            std::atomic<uint64_t>& counter,
-            const PSVector& psv,
-            Loss& test_loss_sum,
-            atomic<double>& sum_norm,
-            atomic<int>& move_accord_count,
-            atomic<double>& sum_one_over_move_count
-        );
-
-        bool has_depth1_move_agreement(Position& pos, Move pvmove);
-
-        bool check_progress();
-
-        // save merit function parameters to a file
-        bool save(bool is_final = false);
-
-        Params params;
-
-        PRNG init_prng;
-        std::vector<PRNG> prngs;
-
-        // sfen reader
-        SfenReader train_sr;
-        SfenReader validation_sr;
-
-        uint64_t save_count;
-        uint64_t loss_output_count;
-
-        std::atomic<bool> stop_flag;
-
-        uint64_t total_done;
-
-        uint64_t last_lr_drop;
-        double best_loss;
-        double latest_loss_sum;
-        uint64_t latest_loss_count;
-
-        int trials;
-        int dir_number;
-
-        // For calculation of learning data loss
-        Loss learn_loss_sum;
-    };
-
-    void LearnerThink::set_learning_search_limits()
-    {
-        Threads.main()->ponder = false;
-
-        // About Search::Limits
-        // Be careful because this member variable is global and affects other threads.
-        auto& limits = Search::Limits;
-
-        limits.startTime = now();
-
-        // Make the search equivalent to the "go infinite" command. (Because it is troublesome if time management is done)
-        limits.infinite = true;
-
-        // Since PV is an obstacle when displayed, erase it.
-        limits.silent = true;
-
-        // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
-        limits.nodes = 0;
-
-        // depth is also processed by the one passed as an argument of Learner::search().
-        limits.depth = 0;
-    }
-
-    PSVector LearnerThink::fetch_next_validation_set()
-    {
-        PSVector validation_data;
-
-        auto mainThread = Threads.main();
-        mainThread->execute_with_worker([&validation_data, this](auto& th){
-            auto do_include_predicate = [&th, this](const PackedSfenValue& ps) -> bool {
-                if (params.eval_limit < abs(ps.score))
-                    return false;
-
-                if (!params.use_draw_games_in_validation && ps.game_result == 0)
-                    return false;
-
-                if (params.smart_fen_skipping_for_validation)
-                {
-                    StateInfo si;
-                    auto& pos = th.rootPos;
-                    if (pos.set_from_packed_sfen(ps.sfen, &si, &th) != 0)
-                        return false;
-
-                    if (pos.capture_or_promotion((Move)ps.move) || pos.checkers())
-                        return false;
-                }
-
-                return true;
-            };
-
-            validation_data = validation_sr.read_some(
-                params.validation_count,
-                params.validation_count * 100, // to have a reasonable bound on the running time.
-                do_include_predicate
-            );
-        });
-        mainThread->wait_for_worker_finished();
-
-        return validation_data;
-    }
-
-    void LearnerThink::learn(uint64_t epochs, uint64_t warmup_epochs)
-    {
-#if defined(_OPENMP)
-        omp_set_num_threads((int)Options["Threads"]);
-#endif
-
-        set_learning_search_limits();
-
-        Eval::NNUE::verify_any_net_loaded();
-
-        const PSVector validation_data = fetch_next_validation_set();
-
-        if (validation_data.size() != params.validation_count)
-        {
-            auto out = sync_region_cout.new_region();
-            out
-                << "INFO (learn): Error reading validation data. Read " << validation_data.size()
-                << " out of " << params.validation_count << '\n'
-                << "INFO (learn): This either means that less than 1% of the validation data passed the filter"
-                << " or the file is empty\n";
-
-            return;
-        }
-
-        stop_flag = false;
-
-        if (warmup_epochs > 0)
-        {
-            cout << "Doing " << warmup_epochs << " warmup epochs." << endl;
-        }
-
-        for(uint64_t warmup_epoch = 1; warmup_epoch <= warmup_epochs; ++warmup_epoch)
-        {
-            std::atomic<uint64_t> counter{0};
-
-            Threads.execute_with_workers([this, &counter](auto& th){
-                learn_worker(th, counter, params.mini_batch_size);
-            });
-
-            total_done += params.mini_batch_size;
-
-            Threads.wait_for_workers_finished();
-
-            if (stop_flag)
-                break;
-
-            update_weights_warmup(warmup_epoch);
-
-            if (stop_flag)
-                break;
-
-            cout << "Finished " << warmup_epoch << " out of " << warmup_epochs << " warmup epochs." << endl;
-        }
-
-        if (params.newbob_decay != 1.0) {
-
-            calc_loss(validation_data, 0);
-
-            best_loss = latest_loss_sum / latest_loss_count;
-            latest_loss_sum = 0.0;
-            latest_loss_count = 0;
-
-            auto out = sync_region_cout.new_region();
-            out << "INFO (learn): initial loss = " << best_loss << endl;
-        }
-
-        for(uint64_t epoch = 1; epoch <= epochs; ++epoch)
-        {
-            std::atomic<uint64_t> counter{0};
-
-            Threads.execute_with_workers([this, &counter](auto& th){
-                learn_worker(th, counter, params.mini_batch_size);
-            });
-
-            total_done += params.mini_batch_size;
-
-            Threads.wait_for_workers_finished();
-
-            if (stop_flag)
-                break;
-
-            update_weights(validation_data, epoch);
-
-            if (stop_flag)
-                break;
-        }
-
-        Eval::NNUE::finalize_net();
-
-        save(true);
-    }
-
-    void LearnerThink::learn_worker(Thread& th, std::atomic<uint64_t>& counter, uint64_t limit)
-    {
-        const auto thread_id = th.thread_idx();
-        auto& pos = th.rootPos;
-        auto& prng = prngs[th.thread_idx()];
-
-        std::vector<StateInfo, AlignedAllocator<StateInfo>> state(MAX_PLY);
-
-        while(!stop_flag)
-        {
-            const auto iter = counter.fetch_add(1);
-            if (iter >= limit)
-                break;
-
-            PackedSfenValue ps;
-
-        RETRY_READ:;
-
-            if (!train_sr.read_to_thread_buffer(thread_id, ps))
-            {
-                // If we ran out of data we stop completely
-                // because there's nothing left to do.
-                stop_flag = true;
-                break;
-            }
-
-            if (params.eval_limit < abs(ps.score))
-                goto RETRY_READ;
-
-            if (!params.use_draw_games_in_training && ps.game_result == 0)
-                goto RETRY_READ;
-
-            // Skip over the opening phase
-            if (ps.gamePly < prng.rand(params.reduction_gameply))
-                goto RETRY_READ;
-
-            StateInfo si;
-            if (pos.set_from_packed_sfen(ps.sfen, &si, &th) != 0)
-            {
-                // Malformed sfen
-                auto out = sync_region_cout.new_region();
-                out << "ERROR: illigal packed sfen = " << pos.fen() << endl;
-                goto RETRY_READ;
-            }
-
-            const auto rootColor = pos.side_to_move();
-
-            // A function that adds the current `pos` and `ps`
-            // to the training set.
-            auto pos_add_grad = [&]() {
-
-                // Evaluation value of deep search
-                const Value shallow_value = Eval::evaluate(pos);
-
-                Eval::NNUE::add_example(pos, rootColor, shallow_value, ps, 1.0);
-            };
-
-            if (!pos.pseudo_legal((Move)ps.move) || !pos.legal((Move)ps.move))
-            {
-                goto RETRY_READ;
-            }
-
-            // We don't need to qsearch when doing smart skipping
-            if (!params.assume_quiet && !params.smart_fen_skipping)
-            {
-                int ply = 0;
-                pos.do_move((Move)ps.move, state[ply++]);
-
-                // Evaluation value of shallow search (qsearch)
-                const auto [_, pv] = Search::qsearch(pos);
-
-                for (auto m : pv)
-                {
-                    pos.do_move(m, state[ply++]);
-                }
-            }
-
-            if (params.smart_fen_skipping
-                && (pos.capture_or_promotion((Move)ps.move)
-                    || pos.checkers()))
-            {
-                goto RETRY_READ;
-            }
-
-            // We want to position being trained on not to be terminal
-            if (MoveList<LEGAL>(pos).size() == 0)
-                goto RETRY_READ;
-
-            // Since we have reached the end phase of PV, add the slope here.
-            pos_add_grad();
-        }
-    }
-
-    void LearnerThink::update_weights_warmup(uint64_t warmup_epoch)
-    {
-        // I'm not sure this fencing is correct. But either way there
-        // should be no real issues happening since
-        // the read/write phases are isolated.
-        atomic_thread_fence(memory_order_seq_cst);
-        Eval::NNUE::update_parameters(
-            Threads, warmup_epoch, params.verbose, params.warmup_learning_rate, params.max_grad, get_loss);
-        atomic_thread_fence(memory_order_seq_cst);
-    }
-
-    void LearnerThink::update_weights(const PSVector& psv, uint64_t epoch)
-    {
-        // I'm not sure this fencing is correct. But either way there
-        // should be no real issues happening since
-        // the read/write phases are isolated.
-        atomic_thread_fence(memory_order_seq_cst);
-        learn_loss_sum += Eval::NNUE::update_parameters(
-            Threads, epoch, params.verbose, params.learning_rate, params.max_grad, get_loss);
-        atomic_thread_fence(memory_order_seq_cst);
-
-        if (++save_count * params.mini_batch_size >= params.eval_save_interval)
-        {
-            save_count = 0;
-
-            const bool converged = save();
-            if (converged)
-            {
-                stop_flag = true;
-                return;
-            }
-        }
-
-        if (++loss_output_count * params.mini_batch_size >= params.loss_output_interval)
-        {
-            loss_output_count = 0;
-
-            // loss calculation
-            calc_loss(psv, epoch);
-
-            Eval::NNUE::check_health();
-        }
-    }
-
-    void LearnerThink::calc_loss(const PSVector& psv, uint64_t epoch)
-    {
-        TT.new_search();
-        TimePoint elapsed = now() - Search::Limits.startTime + 1;
-
-        auto out = sync_region_cout.new_region();
-
-        out << "\n";
-        out << "PROGRESS (calc_loss): " << now_string()
-             << ", " << total_done << " sfens"
-             << ", " << total_done * 1000 / elapsed  << " sfens/second"
-             << ", epoch " << epoch
-             << endl;
-
-        out << "  - learning rate = " << params.learning_rate << endl;
-
-        // For calculation of verification data loss
-        Loss test_loss_sum{};
-
-        // norm for learning
-        atomic<double> sum_norm{0.0};
-
-        // The number of times the pv first move of deep
-        // search matches the pv first move of search(1).
-        atomic<int> move_accord_count{0};
-
-        // If there is 10 legal moves then 0.1 will be added.
-        // This happens for each position tested.
-        // Effectively at the end we have the random move accuracy
-        // multiplied by the number of positions, which is psv.size()
-        atomic<double> sum_one_over_move_count{0.0};
-
-        auto mainThread = Threads.main();
-        mainThread->execute_with_worker([&out](auto& th){
-            auto& pos = th.rootPos;
-            StateInfo si;
-            pos.set(StartFEN, false, &si, &th);
-            out << "  - startpos eval = " << Eval::evaluate(pos) << endl;
-        });
-        mainThread->wait_for_worker_finished();
-
-        // The number of tasks to do.
-        atomic<uint64_t> counter{0};
-        Threads.execute_with_workers([&](auto& th){
-            calc_loss_worker(
-                th,
-                counter,
-                psv,
-                test_loss_sum,
-                sum_norm,
-                move_accord_count,
-                sum_one_over_move_count
-            );
-        });
-        Threads.wait_for_workers_finished();
-
-        latest_loss_sum += test_loss_sum.value();
-        latest_loss_count += psv.size();
-
-        if (psv.size() && test_loss_sum.count() > 0)
-        {
-            test_loss_sum.print_only_loss("val", out);
-
-            if (learn_loss_sum.count() > 0)
-            {
-                learn_loss_sum.print_with_grad("train", out);
-            }
-
-            out << "  - norm = " << sum_norm << endl;
-            out << "  - move accuracy = " << (move_accord_count * 100.0 / psv.size()) << "%" << endl;
-            out << "  - random move accuracy = " << (sum_one_over_move_count * 100.0 / psv.size()) << "%" << endl;
-        }
-        else
-        {
-            out << "ERROR: psv.size() = " << psv.size() << " ,  done = " << test_loss_sum.count() << endl;
-        }
-
-        learn_loss_sum.reset();
-    }
-
-    void LearnerThink::calc_loss_worker(
-        Thread& th,
-        std::atomic<uint64_t>& counter,
-        const PSVector& psv,
-        Loss& test_loss_sum,
-        atomic<double>& sum_norm,
-        atomic<int>& move_accord_count,
-        atomic<double>& sum_one_over_move_count
-    )
-    {
-        Loss local_loss_sum{};
-        double local_sum_one_over_move_count = 0.0;
-        auto& pos = th.rootPos;
-
-        for(;;)
-        {
-            const auto task_id = counter.fetch_add(1);
-            if (task_id >= psv.size())
-            {
-                break;
-            }
-
-            const auto& ps = psv[task_id];
-
-            StateInfo si;
-            if (pos.set_from_packed_sfen(ps.sfen, &si, &th) != 0)
-            {
-                cout << "Error! : illegal packed sfen " << pos.fen() << endl;
-                continue;
-            }
-
-            const Value shallow_value = Eval::evaluate(pos);
-
-            // Evaluation value of deep search
-            const auto deep_value = (Value)ps.score;
-
-            const auto loss = get_loss_no_grad(
-                deep_value,
-                shallow_value,
-                ps);
-
-            local_loss_sum += loss;
-            sum_norm += (double)abs(shallow_value);
-
-            // Threat all moves with equal scores as first. This is up to move ordering.
-            if (has_depth1_move_agreement(pos, (Move)ps.move))
-                move_accord_count.fetch_add(1, std::memory_order_relaxed);
-
-            local_sum_one_over_move_count += 1.0 / static_cast<double>(MoveList<LEGAL>(pos).size());
-        }
-
-        sum_one_over_move_count += local_sum_one_over_move_count;
-        test_loss_sum += local_loss_sum;
-    }
-
-    bool LearnerThink::has_depth1_move_agreement(Position& pos, Move pvmove)
-    {
-        // Determine if the depth 1 search pv matches the move from the dataset.
-        // Do a manual depth 1 search so we're not affected by previous searches.
-        std::vector<std::pair<Move, Value>> child_scores;
-
-        // Call evaluate once for the rootpos so that the evals
-        // for children moves use incremental feature transformer updates.
-        (void)Eval::evaluate(pos);
-
-        // Just to get guaranteed alignment.
-        std::vector<StateInfo, AlignedAllocator<StateInfo>> states(1);
-        auto legal_moves = MoveList<LEGAL>(pos);
-        for (auto m : legal_moves)
-        {
-            pos.do_move(m, states[0]);
-            // We don't care if the king is in check or stuff like that.
-            // not a big issue and nnue should digest all.
-            auto value = -Eval::evaluate(pos);
-            child_scores.emplace_back(m, value);
-            pos.undo_move(m);
-        }
-
-        if (child_scores.empty())
-            return false;
-
-        std::sort(
-            child_scores.begin(),
-            child_scores.end(),
-            [](auto& lhs, auto& rhs) { return lhs.second > rhs.second; }
-        );
-
-        // Require the best move to have strictly higher score than the next one.
-        return
-            child_scores[0].first == pvmove
-            && (child_scores.size() == 1
-                || child_scores[1].second != child_scores[0].second);
-    }
-
-    bool LearnerThink::check_progress()
-    {
-        auto out = sync_region_cout.new_region();
-
-        const double latest_loss = latest_loss_sum / latest_loss_count;
-        bool converged = false;
-        latest_loss_sum = 0.0;
-        latest_loss_count = 0;
-
-        auto drop_lr = [&]() {
-            last_lr_drop = total_done;
-
-            out
-                << "  - reducing learning rate from " << params.learning_rate
-                << " to " << (params.learning_rate * params.newbob_decay)
-                << " (" << trials << " more trials)" << endl;
-
-            params.learning_rate *= params.newbob_decay;
-        };
-
-        auto accept = [&]() {
-            out << "  - loss = " << latest_loss << " < best (" << best_loss << "), accepted" << endl;
-
-            best_loss = latest_loss;
-            trials = params.newbob_num_trials;
-        };
-
-        auto reject = [&]() {
-            out << "  - loss = " << latest_loss << " >= best (" << best_loss << "), rejected" << endl;
-
-            --trials;
-            if (trials > 0)
-            {
-                drop_lr();
-                return false;
-            }
-            else
-            {
-                return true;
-            }
-        };
-
-        out << "INFO (learning_rate):" << endl;
-
-        if (params.auto_lr_drop)
-        {
-            accept();
-
-            if (total_done >= last_lr_drop + params.auto_lr_drop)
-            {
-                drop_lr();
-            }
-        }
-        else if (latest_loss < best_loss)
-        {
-            accept();
-        }
-        else
-        {
-            converged = reject();
-        }
-
-        if (converged)
-        {
-            out << "  - converged" << endl;
-        }
-
-        return converged;
-    }
-
-    // Write evaluation function file.
-    bool LearnerThink::save(bool is_final)
-    {
-        // Each time you save, change the extension part of the file name like "0","1","2",..
-        // (Because I want to compare the winning rate for each evaluation function parameter later)
-
-        bool converged = false;
-
-        if (params.save_only_once)
-        {
-            // When EVAL_SAVE_ONLY_ONCE is defined,
-            // Do not dig a subfolder because I want to save it only once.
-            Eval::NNUE::save_eval("");
-        }
-        else if (is_final)
-        {
-            Eval::NNUE::save_eval("final");
-            converged = true;
-        }
-        else
-        {
-            // TODO: consider naming the output directory by epoch.
-            const std::string dir_name = std::to_string(dir_number++);
-            Eval::NNUE::save_eval(dir_name);
-
-            if (params.newbob_decay != 1.0 && latest_loss_count > 0)
-            {
-                converged = check_progress();
-                params.best_nn_directory = Path::combine((std::string)Options["EvalSaveDir"], dir_name);
-            }
-        }
-
-        return converged;
-    }
-
-    // Learning from the generated game record
-    void learn(istringstream& is)
-    {
-        LearnerThink::Params params;
-
-        // Number of epochs
-        uint64_t epochs = std::numeric_limits<uint64_t>::max();
-        uint64_t warmup_epochs = 0;
-
-        // Game file storage folder (get game file with relative path from here)
-        string base_dir;
-        string target_dir;
-
-        uint64_t nn_batch_size = 1000;
-        string nn_options;
-
-        auto out = sync_region_cout.new_region();
-
-        // Assume the filenames are staggered.
-        while (true)
-        {
-            string option;
-            is >> option;
-
-            if (option == "")
-                break;
-
-            // specify the number of phases of mini-batch
-            if (option == "bat")
-            {
-                is >> params.mini_batch_size;
-                params.mini_batch_size *= 10000; // Unit is ten thousand
-            }
-
-            // Specify the folder in which the game record is stored and make it the rooting target.
-            else if (option == "targetdir") is >> target_dir;
-            else if (option == "targetfile")
-            {
-                std::string filename;
-                is >> filename;
-                params.filenames.push_back(filename);
-            }
-            else if (option == "validation_count") is >> params.validation_count;
-
-            // Specify the number of loops
-            else if (option == "epochs") is >> epochs;
-            else if (option == "warmup_epochs") is >> warmup_epochs;
-
-            // Game file storage folder (get game file with relative path from here)
-            else if (option == "basedir") is >> base_dir;
-
-            // Mini batch size
-            else if (option == "batchsize"
-                  || option == "epoch_size")
-                is >> params.mini_batch_size;
-
-            // learning rate
-            else if (option == "lr") is >> params.learning_rate;
-            else if (option == "warmup_lr") is >> params.warmup_learning_rate;
-            else if (option == "max_grad") is >> params.max_grad;
-
-            // Accept also the old option name.
-            else if (option == "use_draw_in_training"
-                  || option == "use_draw_games_in_training")
-                is >> params.use_draw_games_in_training;
-
-            // Accept also the old option name.
-            else if (option == "use_draw_in_validation"
-                  || option == "use_draw_games_in_validation")
-                is >> params.use_draw_games_in_validation;
-
-            // Accept also the old option name.
-            else if (option == "use_hash_in_training"
-                  || option == "skip_duplicated_positions_in_training")
-                is >> params.skip_duplicated_positions_in_training;
-
-            else if (option == "winning_probability_coefficient")
-                is >> winning_probability_coefficient;
-
-            // Using WDL with win rate model instead of sigmoid
-            else if (option == "use_wdl") is >> use_wdl;
-
-
-            // LAMBDA
-            else if (option == "lambda") is >> elmo_lambda_low;
-            else if (option == "lambda2") is >> elmo_lambda_high;
-            else if (option == "lambda_limit") is >> elmo_lambda_limit;
-
-            else if (option == "reduction_gameply") is >> params.reduction_gameply;
-
-            else if (option == "eval_limit") is >> params.eval_limit;
-            else if (option == "save_only_once") params.save_only_once = true;
-            else if (option == "no_shuffle") params.shuffle = false;
-
-            else if (option == "nn_batch_size"
-                  || option == "batch_size")
-                is >> nn_batch_size;
-            else if (option == "newbob_decay"
-                  || option == "lr_step")
-                is >> params.newbob_decay;
-            else if (option == "newbob_num_trials"
-                  || option == "max_consecutive_rejections")
-                is >> params.newbob_num_trials;
-            else if (option == "nn_options") is >> nn_options;
-            else if (option == "auto_lr_drop") is >> params.auto_lr_drop;
-
-            else if (option == "eval_save_interval") is >> params.eval_save_interval;
-            else if (option == "loss_output_interval") is >> params.loss_output_interval;
-            else if (option == "validation_set_file_name") is >> params.validation_set_file_name;
-
-            else if (option == "src_score_min_value") is >> src_score_min_value;
-            else if (option == "src_score_max_value") is >> src_score_max_value;
-            else if (option == "dest_score_min_value") is >> dest_score_min_value;
-            else if (option == "dest_score_max_value") is >> dest_score_max_value;
-
-            else if (option == "sfen_read_size") is >> params.sfen_read_size;
-            else if (option == "thread_buffer_size") is >> params.thread_buffer_size;
-
-            else if (option == "seed") is >> params.seed;
-            else if (option == "set_recommended_uci_options")
-            {
-                UCI::setoption("Use NNUE", "pure");
-                UCI::setoption("MultiPV", "1");
-                UCI::setoption("Contempt", "0");
-                UCI::setoption("Skill Level", "20");
-                UCI::setoption("UCI_Chess960", "false");
-                UCI::setoption("UCI_AnalyseMode", "false");
-                UCI::setoption("UCI_LimitStrength", "false");
-                UCI::setoption("PruneAtShallowDepth", "false");
-                UCI::setoption("EnableTranspositionTable", "false");
-            }
-            else if (option == "verbose") params.verbose = true;
-            else if (option == "assume_quiet") params.assume_quiet = true;
-            else if (option == "smart_fen_skipping") params.smart_fen_skipping = true;
-            else if (option == "smart_fen_skipping_for_validation") params.smart_fen_skipping_for_validation = true;
-            else
-            {
-                out << "INFO: Unknown option: " << option << ". Ignoring.\n";
-            }
-        }
-
-        out << "INFO: Executing learn command\n";
-
-        // Issue a warning if OpenMP is disabled.
-#if !defined(_OPENMP)
-        out << "WARNING: OpenMP disabled." << endl;
-#endif
-
-        params.enforce_constraints();
-
-        // Right now we only have the individual files.
-        // We need to apply base_dir here
-        if (!target_dir.empty())
-        {
-            append_files_from_dir(params.filenames, base_dir, target_dir);
-        }
-        rebase_files(params.filenames, base_dir);
-
-        out << "INFO: Input files:\n";
-        for (auto s : params.filenames)
-            out << "  - " << s << '\n';
-
-        out << "INFO: Parameters:\n";
-        if (!params.validation_set_file_name.empty())
-        {
-            out << "  - validation set           : " << params.validation_set_file_name << endl;
-        }
-
-        out << "  - validation count         : " << params.validation_count << endl;
-        out << "  - epochs                   : " << epochs << endl;
-        out << "  - positions                : " << epochs * params.mini_batch_size << endl;
-        out << "  - warmup epochs            : " << warmup_epochs << endl;
-        out << "  - warmup positions         : " << warmup_epochs * params.mini_batch_size << endl;
-        out << "  - eval_limit               : " << params.eval_limit << endl;
-        out << "  - save_only_once           : " << (params.save_only_once ? "true" : "false") << endl;
-        out << "  - shuffle on read          : " << (params.shuffle ? "true" : "false") << endl;
-
-        out << "  - Loss Function            : " << LOSS_FUNCTION << endl;
-        out << "  - minibatch size           : " << params.mini_batch_size << endl;
-
-        out << "  - nn_batch_size            : " << nn_batch_size << endl;
-        out << "  - nn_options               : " << nn_options << endl;
-
-        out << "  - learning rate            : " << params.learning_rate << endl;
-        out << "  - warmup learning rate     : " << params.warmup_learning_rate << endl;
-        out << "  - max_grad                 : " << params.max_grad << endl;
-        out << "  - use draws in training    : " << params.use_draw_games_in_training << endl;
-        out << "  - use draws in validation  : " << params.use_draw_games_in_validation << endl;
-        out << "  - skip repeated positions  : " << params.skip_duplicated_positions_in_training << endl;
-
-        out << "  - winning prob coeff       : " << winning_probability_coefficient << endl;
-        out << "  - use_wdl                  : " << use_wdl << endl;
-
-        out << "  - src_score_min_value      : " << src_score_min_value << endl;
-        out << "  - src_score_max_value      : " << src_score_max_value << endl;
-        out << "  - dest_score_min_value     : " << dest_score_min_value << endl;
-        out << "  - dest_score_max_value     : " << dest_score_max_value << endl;
-
-        out << "  - reduction_gameply        : " << params.reduction_gameply << endl;
-
-        out << "  - elmo_lambda_low          : " << elmo_lambda_low << endl;
-        out << "  - elmo_lambda_high         : " << elmo_lambda_high << endl;
-        out << "  - elmo_lambda_limit        : " << elmo_lambda_limit << endl;
-        out << "  - eval_save_interval       : " << params.eval_save_interval << " sfens" << endl;
-        out << "  - loss_output_interval     : " << params.loss_output_interval << " sfens" << endl;
-
-        out << "  - sfen_read_size           : " << params.sfen_read_size << endl;
-        out << "  - thread_buffer_size       : " << params.thread_buffer_size << endl;
-
-        out << "  - smart_fen_skipping       : " << params.smart_fen_skipping << endl;
-        out << "  - smart_fen_skipping_val   : " << params.smart_fen_skipping_for_validation << endl;
-
-        out << "  - seed                     : " << params.seed << endl;
-        out << "  - verbose                  : " << (params.verbose ? "true" : "false") << endl;
-
-        if (params.auto_lr_drop) {
-            out << "  - learning rate scheduling : every " << params.auto_lr_drop << " sfens" << endl;
-        }
-        else if (params.newbob_decay != 1.0) {
-            out << "  - learning rate scheduling : newbob with decay" << endl;
-            out << "  - newbob_decay             : " << params.newbob_decay << endl;
-            out << "  - newbob_num_trials        : " << params.newbob_num_trials << endl;
-        }
-        else {
-            out << "  - learning rate scheduling : fixed learning rate" << endl;
-        }
-
-        out << endl;
-
-        out << "INFO: Started initialization." << endl;
-
-        Eval::NNUE::initialize_training(params.seed, out);
-        Eval::NNUE::set_batch_size(nn_batch_size);
-        Eval::NNUE::set_options(nn_options);
-
-        LearnerThink learn_think(params);
-
-        out << "Finished initialization." << endl;
-
-        out.unlock();
-
-        // Start learning.
-        learn_think.learn(epochs, warmup_epochs);
-    }
-
-} // namespace Learner
diff --git a/src/learn/learn.h b/src/learn/learn.h
deleted file mode 100644
index 842ffad0..00000000
--- a/src/learn/learn.h
+++ /dev/null
@@ -1,148 +0,0 @@
-﻿#ifndef _LEARN_H_
-#define _LEARN_H_
-
-// ----------------------
-// Floating point for learning
-// ----------------------
-
-// If this is set to double, the calculation accuracy will be higher, but the weight array entangled memory will be doubled.
-// Currently, if this is float, the weight array is 4.5 times the size of the evaluation function file. (About 4.5GB with KPPT)
-// Even if it is a double type, there is almost no difference in the way of convergence, so fix it to float.
-
-// when using float
-using LearnFloatType = float;
-
-// when using double
-//typedef double LearnFloatType;
-
-// when using float16
-//#include "half_float.h"
-//typedef HalfFloat::float16 LearnFloatType;
-
-// ======================
-// configure
-// ======================
-
-// ----------------------
-// Learning with the method of elmo (WCSC27)
-// ----------------------
-
-#define LOSS_FUNCTION "ELMO_METHOD(WCSC27)"
-
-// ----------------------
-// Definition of struct used in Learner
-// ----------------------
-
-#include "autograd.h"
-#include "packed_sfen.h"
-
-#include "position.h"
-
-#include <sstream>
-#include <vector>
-#include <mutex>
-#include <string>
-
-namespace Learner
-{
-    // ----------------------
-    // Settings for learning
-    // ----------------------
-
-    // mini-batch size.
-    // Calculate the gradient by combining this number of phases.
-    // If you make it smaller, the number of update_weights() will increase and the convergence will be faster. The gradient is incorrect.
-    // If you increase it, the number of update_weights() decreases, so the convergence will be slow. The slope will come out accurately.
-    // I don't think you need to change this value in most cases.
-
-    constexpr std::size_t LEARN_MINI_BATCH_SIZE = 1000 * 1000 * 1;
-
-    // Saving interval of evaluation function at learning. Save each time you learn this number of phases.
-    // Needless to say, the longer the saving interval, the shorter the learning time.
-    // Folder name is incremented for each save like 0/, 1/, 2/...
-    // By default, once every 1 billion phases.
-    constexpr std::size_t LEARN_EVAL_SAVE_INTERVAL = 100'000'000ULL;
-
-    // Reduce the output of rmse during learning to 1 for this number of times.
-    // rmse calculation is done in one thread, so it takes some time, so reducing the output is effective.
-    constexpr std::size_t LEARN_RMSE_OUTPUT_INTERVAL = 1;
-
-    // Learning from the generated game record
-    void learn(std::istringstream& is);
-
-    using CalcLossFunc = ValueWithGrad<double>(Value, Value, int, int);
-
-    struct Loss
-    {
-        double value() const
-        {
-            return m_loss.value;
-        }
-
-        double grad() const
-        {
-            return m_loss.grad;
-        }
-
-        uint64_t count() const
-        {
-            return m_count;
-        }
-
-        Loss() = default;
-
-        Loss(const Loss& other) :
-            m_loss(other.m_loss),
-            m_count(other.m_count)
-        {
-        }
-
-        Loss& operator += (const ValueWithGrad<double>& rhs)
-        {
-            std::unique_lock lock(m_mutex);
-
-            m_loss += rhs.abs();
-            m_count += 1;
-
-            return *this;
-        }
-
-        Loss& operator += (const Loss& rhs)
-        {
-            std::unique_lock lock(m_mutex);
-
-            m_loss += rhs.m_loss.abs();
-            m_count += rhs.m_count;
-
-            return *this;
-        }
-
-        void reset()
-        {
-            std::unique_lock lock(m_mutex);
-
-            m_loss = ValueWithGrad<double>{ 0.0, 0.0 };
-            m_count = 0;
-        }
-
-        template <typename StreamT>
-        void print_with_grad(const std::string& prefix, StreamT& s) const
-        {
-            s << "  - " << prefix << "_loss       = " << m_loss.value / (double)m_count << std::endl;
-            s << "  - " << prefix << "_grad_norm  = " << m_loss.grad / (double)m_count << std::endl;
-        }
-
-        template <typename StreamT>
-        void print_only_loss(const std::string& prefix, StreamT& s) const
-        {
-            s << "  - " << prefix << "_loss       = " << m_loss.value / (double)m_count << std::endl;
-        }
-
-    private:
-        ValueWithGrad<double> m_loss{ 0.0, 0.0 };
-        uint64_t m_count{0};
-        std::mutex m_mutex;
-    };
-}
-
-#endif // ifndef _LEARN_H_
diff --git a/src/nnue/evaluate_nnue_learner.cpp b/src/nnue/evaluate_nnue_learner.cpp
deleted file mode 100644
index 8d95221c..00000000
--- a/src/nnue/evaluate_nnue_learner.cpp
+++ /dev/null
@@ -1,341 +0,0 @@
-﻿#include <random>
-#include <fstream>
-
-#include "evaluate_nnue.h"
-#include "evaluate_nnue_learner.h"
-
-#include "trainer/features/all_factorizers.h"
-
-#include "trainer/trainer_feature_transformer.h"
-#include "trainer/trainer_input_slice.h"
-#include "trainer/trainer_affine_transform.h"
-#include "trainer/trainer_clipped_relu.h"
-#include "trainer/trainer_sum.h"
-
-#include "position.h"
-#include "uci.h"
-#include "misc.h"
-#include "thread_win32_osx.h"
-#include "thread.h"
-
-// Code for learning NNUE evaluation function
-namespace Eval::NNUE {
-
-    namespace {
-
-        // learning data
-        std::vector<Example> examples;
-
-        // Mutex for exclusive control of examples
-        std::mutex examples_mutex;
-
-        // number of samples in mini-batch
-        uint64_t batch_size;
-
-        // random number generator
-        std::mt19937 rng;
-
-        // learner
-        std::shared_ptr<Trainer<Network>> trainer;
-
-        // Tell the learner options such as hyperparameters
-        void send_messages(std::vector<Message> messages) {
-            for (auto& message : messages) {
-                trainer->send_message(&message);
-                assert(message.num_receivers > 0);
-            }
-        }
-
-    }  // namespace
-
-    // Initialize learning
-    void initialize_training(
-        const std::string& seed,
-        SynchronizedRegionLogger::Region& out) {
-
-#if defined (OPENBLAS_VERSION)
-        openblas_set_num_threads(1);
-#elif defined (INTEL_MKL_VERSION)
-        mkl_set_num_threads(1);
-#endif
-
-        out << "INFO (initialize_training): Initializing NN training for "
-            << get_architecture_string() << std::endl;
-
-        out << std::endl;
-
-        out << "Layers:\n"
-            << get_layers_info() << std::endl;
-
-        out << std::endl;
-
-        out << "Factorizers:\n"
-            << Features::Factorizer<RawFeatures>::get_factorizers_string() << std::endl;
-
-        out << std::endl;
-
-        assert(feature_transformer);
-        assert(network);
-
-        trainer = Trainer<Network>::create(network.get(), feature_transformer.get());
-        rng.seed(PRNG(seed).rand<uint64_t>());
-
-        if (Options["SkipLoadingEval"]) {
-            out << "INFO (initialize_training): Performing random net initialization.\n";
-            trainer->initialize(rng);
-        }
-    }
-
-    // set the number of samples in the mini-batch
-    void set_batch_size(uint64_t size) {
-        assert(size > 0);
-        batch_size = size;
-    }
-
-    // Set options such as hyperparameters
-    void set_options(const std::string& options) {
-        std::vector<Message> messages;
-        for (const auto& option : Algo::split(options, ',')) {
-          const auto fields = Algo::split(option, '=');
-          assert(fields.size() == 1 || fields.size() == 2);
-
-          if (fields.size() == 1) {
-              messages.emplace_back(fields[0]);
-          } else {
-              messages.emplace_back(fields[0], fields[1]);
-          }
-        }
-
-        send_messages(std::move(messages));
-    }
-
-    // Reread the evaluation function parameters for learning from the file
-    void restore_parameters(const std::string& dir_name) {
-        const std::string file_name = Path::combine(dir_name, NNUE::savedfileName);
-        std::ifstream stream(file_name, std::ios::binary);
-#ifndef NDEBUG
-        bool result =
-#endif
-        ReadParameters(stream);
-#ifndef NDEBUG
-        assert(result);
-#endif
-
-        send_messages({{"reset"}});
-    }
-
-    void finalize_net() {
-        send_messages({{"clear_unobserved_feature_weights"}});
-    }
-
-    // Add 1 sample of learning data
-    void add_example(
-        Position& pos,
-        Color rootColor,
-        Value discrete_nn_eval,
-        const Learner::PackedSfenValue& psv,
-        double weight) {
-
-        Example example;
-        if (rootColor == pos.side_to_move()) {
-            example.sign = 1;
-        } else {
-            example.sign = -1;
-        }
-
-        example.discrete_nn_eval = discrete_nn_eval;
-        example.psv = psv;
-        example.weight = weight;
-
-        Features::IndexList active_indices[2];
-        for (const auto trigger : kRefreshTriggers) {
-            RawFeatures::append_active_indices(pos, trigger, active_indices);
-        }
-
-        if (pos.side_to_move() != WHITE) {
-            active_indices[0].swap(active_indices[1]);
-        }
-
-        static thread_local std::vector<TrainingFeature> s_training_features;
-        auto& training_features = s_training_features;
-
-        for (const auto color : Colors) {
-            training_features.clear();
-
-            for (const auto base_index : active_indices[color]) {
-                static_assert(Features::Factorizer<RawFeatures>::get_dimensions() <
-                              (1 << TrainingFeature::kIndexBits), "");
-                Features::Factorizer<RawFeatures>::append_training_features(
-                    base_index, &training_features);
-            }
-
-            std::sort(training_features.begin(), training_features.end());
-
-            auto& unique_features = example.training_features[color];
-            unique_features.reserve(training_features.size());
-            for (const auto& feature : training_features) {
-                if (!unique_features.empty() &&
-                    feature.get_index() == unique_features.back().get_index()) {
-
-                    unique_features.back() += feature;
-                } else {
-                    unique_features.push_back(feature);
-                }
-            }
-        }
-
-        std::lock_guard<std::mutex> lock(examples_mutex);
-        examples.push_back(std::move(example));
-    }
-
-    // update the evaluation function parameters
-    Learner::Loss update_parameters(
-        ThreadPool& thread_pool,
-        uint64_t epoch,
-        bool verbose,
-        double learning_rate,
-        double max_grad,
-        Learner::CalcLossFunc calc_loss)
-    {
-        using namespace Learner::Autograd::UnivariateStatic;
-
-        assert(batch_size > 0);
-
-        learning_rate /= batch_size;
-
-        std::lock_guard<std::mutex> lock(examples_mutex);
-
-        double abs_eval_diff_sum = 0.0;
-        double abs_discrete_eval_sum = 0.0;
-        double gradient_norm = 0.0;
-
-        bool collect_stats = verbose;
-
-        Learner::Loss loss_sum{};
-
-        std::vector<double> abs_eval_diff_sum_local(thread_pool.size(), 0.0);
-        std::vector<double> abs_discrete_eval_sum_local(thread_pool.size(), 0.0);
-        std::vector<double> gradient_norm_local(thread_pool.size(), 0.0);
-        std::vector<Learner::Loss> loss_sum_local(thread_pool.size());
-
-        auto prev_batch_begin = examples.end();
-        while ((long)(prev_batch_begin - examples.begin()) >= (long)batch_size) {
-            auto batch_begin = prev_batch_begin - batch_size;
-            auto batch_end = prev_batch_begin;
-            auto size = batch_end - batch_begin;
-            const auto network_output = trainer->step_start(thread_pool, batch_begin, batch_end);
-            std::vector<LearnFloatType> gradients(size);
-
-            thread_pool.for_each_index_chunk_with_workers(
-                std::size_t(0), size,
-                [&](Thread& th, std::size_t offset, std::size_t count) {
-                    const auto thread_id = th.thread_idx();
-
-                    trainer->propagate(th, offset, count);
-
-                    for (std::size_t b = offset; b < offset + count; ++b) {
-                        const auto& e = *(batch_begin + b);
-                        const auto shallow = static_cast<Value>(round<std::int32_t>(
-                            e.sign * network_output[b] * kPonanzaConstant));
-                        const auto discrete = e.sign * e.discrete_nn_eval;
-                        const auto& psv = e.psv;
-                        auto loss = calc_loss(shallow, (Value)psv.score, psv.game_result, psv.gamePly);
-                        loss.grad = std::clamp(
-                            loss.grad * e.sign * kPonanzaConstant * e.weight, -max_grad, max_grad);
-                        gradients[b] = static_cast<LearnFloatType>(loss.grad);
-                        loss_sum_local[thread_id] += loss;
-
-                        // The discrete eval will only be valid before first backpropagation,
-                        // that is only for the first batch.
-                        // Similarily we want only gradients from one batch.
-                        if (collect_stats)
-                        {
-                            abs_eval_diff_sum_local[thread_id] += std::abs(discrete - shallow);
-                            abs_discrete_eval_sum_local[thread_id] += std::abs(discrete);
-                            gradient_norm_local[thread_id] += std::abs(loss.grad);
-                        }
-                    }
-
-                    trainer->backpropagate(th, gradients.data(), offset, count);
-                }
-            );
-
-            // We can asyncronously erase the examples that we used in the previous
-            // step. This can be done safely because we're no longer using these
-            // examples and erase won't invalidate iterators.
-            examples.erase(prev_batch_begin, examples.end());
-            prev_batch_begin = batch_begin;
-
-            thread_pool.wait_for_workers_finished();
-
-            trainer->step_end(thread_pool, learning_rate);
-
-            collect_stats = false;
-        }
-        examples.erase(prev_batch_begin, examples.end());
-
-        if (verbose)
-        {
-            abs_eval_diff_sum = std::accumulate(abs_eval_diff_sum_local.begin(), abs_eval_diff_sum_local.end(), 0.0);
-            abs_discrete_eval_sum = std::accumulate(abs_discrete_eval_sum_local.begin(), abs_discrete_eval_sum_local.end(), 0.0);
-            gradient_norm = std::accumulate(gradient_norm_local.begin(), gradient_norm_local.end(), 0.0);
-
-            const double avg_abs_eval_diff = abs_eval_diff_sum / batch_size;
-            const double avg_abs_discrete_eval = abs_discrete_eval_sum / batch_size;
-
-            auto out = sync_region_cout.new_region();
-
-            out << "INFO (update_parameters):"
-                << " epoch = " << epoch
-                << " , avg_abs(trainer_eval-nnue_eval) = " << avg_abs_eval_diff
-                << " , avg_abs(nnue_eval) = " << avg_abs_discrete_eval
-                << " , avg_relative_error = " << avg_abs_eval_diff / avg_abs_discrete_eval
-                << " , batch_size = " << batch_size
-                << " , grad_norm = " << gradient_norm
-                << std::endl;
-        } else {
-            // Display some progress but don't synchronize as
-            // we can't really decide when to release the output lock here
-            std::cout << '.';
-        }
-
-        send_messages({{"quantize_parameters"}});
-
-        for(auto& loss : loss_sum_local)
-        {
-            loss_sum += loss;
-        }
-
-        return loss_sum;
-    }
-
-    // Check if there are any problems with learning
-    void check_health() {
-        send_messages({{"check_health"}});
-    }
-
-    // save merit function parameters to a file
-    void save_eval(std::string dir_name) {
-        auto eval_dir = Path::combine(Options["EvalSaveDir"], dir_name);
-
-        auto out = sync_region_cout.new_region();
-
-        out << "INFO (save_eval): Saving current evaluation file in " << eval_dir << std::endl;
-
-        // mkdir() will fail if this folder already exists, but
-        // Apart from that. If not, I just want you to make it.
-        // Also, assume that the folders up to EvalSaveDir have been dug.
-        sys::create_directories(eval_dir);
-
-        const std::string file_name = Path::combine(eval_dir, NNUE::savedfileName);
-        std::ofstream stream(file_name, std::ios::binary);
-#ifndef NDEBUG
-        bool result =
-#endif
-        WriteParameters(stream);
-#ifndef NDEBUG
-        assert(result);
-#endif
-        out << "INFO (save_eval): Finished saving evaluation file in " << eval_dir << std::endl;
-    }
-}  // namespace Eval::NNUE
diff --git a/src/nnue/evaluate_nnue_learner.h b/src/nnue/evaluate_nnue_learner.h
deleted file mode 100644
index 3d9f5b31..00000000
--- a/src/nnue/evaluate_nnue_learner.h
+++ /dev/null
@@ -1,52 +0,0 @@
-﻿#ifndef _EVALUATE_NNUE_LEARNER_H_
-#define _EVALUATE_NNUE_LEARNER_H_
-
-#include "learn/learn.h"
-
-#include "misc.h"
-
-struct ThreadPool;
-
-// Interface used for learning NNUE evaluation function
-namespace Eval::NNUE {
-
-    // Initialize learning
-    void initialize_training(
-        const std::string& seed,
-        SynchronizedRegionLogger::Region& out);
-
-    // set the number of samples in the mini-batch
-    void set_batch_size(uint64_t size);
-
-    // Set options such as hyperparameters
-    void set_options(const std::string& options);
-
-    // Reread the evaluation function parameters for learning from the file
-    void restore_parameters(const std::string& dir_name);
-
-    // Add 1 sample of learning data
-    void add_example(
-        Position& pos,
-        Color rootColor,
-        Value discrete_nn_eval,
-    	const Learner::PackedSfenValue& psv,
-        double weight);
-
-    // update the evaluation function parameters
-    Learner::Loss update_parameters(
-        ThreadPool& thread_pool,
-        uint64_t epoch,
-        bool verbose,
-        double learning_rate,
-        double max_grad,
-        Learner::CalcLossFunc calc_loss);
-
-    // Check if there are any problems with learning
-    void check_health();
-
-    void finalize_net();
-
-    void save_eval(std::string suffix);
-}  // namespace Eval::NNUE
-
-#endif
diff --git a/src/nnue/nnue_test_command.cpp b/src/nnue/nnue_test_command.cpp
deleted file mode 100644
index d892222b..00000000
--- a/src/nnue/nnue_test_command.cpp
+++ /dev/null
@@ -1,215 +0,0 @@
-﻿#include "evaluate_nnue.h"
-#include "nnue_test_command.h"
-
-#include "thread.h"
-#include "uci.h"
-
-#include <set>
-#include <fstream>
-
-#define ASSERT(X) { \
-    if (!(X)) { \
-        std::cout \
-            << "\nError : ASSERT(" << #X << "), " \
-            << __FILE__ << "(" << __LINE__ << "): " \
-            << __func__ << std::endl; \
-            std::this_thread::sleep_for(std::chrono::microseconds(3000)); \
-            *(int*)1 =0; \
-    } \
-}
-
-// USI extended command for NNUE evaluation function
-namespace Eval::NNUE {
-
-    namespace {
-
-        // Testing RawFeatures mainly for difference calculation
-        void test_features(Position& pos) {
-            const std::uint64_t num_games = 1000;
-            StateInfo si;
-            pos.set(StartFEN, false, &si, Threads.main());
-            const int MAX_PLY = 256; // test up to 256 hands
-
-            StateInfo state[MAX_PLY]; // StateInfo only for the maximum number of steps
-            int ply; // Trouble from the initial phase
-
-            PRNG prng(20171128);
-
-            std::uint64_t num_moves = 0;
-            std::vector<std::uint64_t> num_updates(kRefreshTriggers.size() + 1);
-            std::vector<std::uint64_t> num_resets(kRefreshTriggers.size());
-            constexpr IndexType kUnknown = -1;
-            std::vector<IndexType> trigger_map(RawFeatures::kDimensions, kUnknown);
-
-            auto make_index_sets = [&](const Position& position) {
-                std::vector<std::vector<std::set<IndexType>>> index_sets(
-                    kRefreshTriggers.size(), std::vector<std::set<IndexType>>(2));
-
-                for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-                    Features::IndexList active_indices[2];
-                    RawFeatures::append_active_indices(position, kRefreshTriggers[i],
-                                                     active_indices);
-
-                    for (const auto perspective : Colors) {
-                        for (const auto index : active_indices[perspective]) {
-                            ASSERT(index < RawFeatures::kDimensions);
-                            ASSERT(index_sets[i][perspective].count(index) == 0);
-                            ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
-                            index_sets[i][perspective].insert(index);
-                            trigger_map[index] = i;
-                        }
-                    }
-                }
-
-                return index_sets;
-            };
-
-            auto update_index_sets = [&](const Position& position, auto* index_sets) {
-                for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-                    Features::IndexList removed_indices[2], added_indices[2];
-                    bool reset[2] = { false, false };
-                    RawFeatures::append_changed_indices(position, kRefreshTriggers[i],
-                                                      removed_indices, added_indices, reset);
-                    for (const auto perspective : Colors) {
-                        if (reset[perspective]) {
-                            (*index_sets)[i][perspective].clear();
-                            ++num_resets[i];
-                        } else {
-                            for (const auto index : removed_indices[perspective]) {
-                                ASSERT(index < RawFeatures::kDimensions);
-                                ASSERT((*index_sets)[i][perspective].count(index) == 1);
-                                ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
-                                (*index_sets)[i][perspective].erase(index);
-                                ++num_updates.back();
-                                ++num_updates[i];
-                                trigger_map[index] = i;
-                            }
-                        }
-
-                        for (const auto index : added_indices[perspective]) {
-                            ASSERT(index < RawFeatures::kDimensions);
-                            ASSERT((*index_sets)[i][perspective].count(index) == 0);
-                            ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
-                            (*index_sets)[i][perspective].insert(index);
-                            ++num_updates.back();
-                            ++num_updates[i];
-                            trigger_map[index] = i;
-                        }
-                    }
-                }
-            };
-
-            std::cout << "feature set: " << RawFeatures::get_name()
-                      << "[" << RawFeatures::kDimensions << "]" << std::endl;
-            std::cout << "start testing with random games";
-
-            for (std::uint64_t i = 0; i < num_games; ++i) {
-                auto index_sets = make_index_sets(pos);
-                for (ply = 0; ply < MAX_PLY; ++ply) {
-                    MoveList<LEGAL> mg(pos); // Generate all legal hands
-
-                    // There was no legal move == Clog
-                    if (mg.size() == 0)
-                        break;
-
-                    // Randomly choose from the generated moves and advance the phase with the moves.
-                    Move m = mg.begin()[prng.rand(mg.size())];
-                    pos.do_move(m, state[ply]);
-
-                    ++num_moves;
-                    update_index_sets(pos, &index_sets);
-                    ASSERT(index_sets == make_index_sets(pos));
-                }
-
-                pos.set(StartFEN, false, &si, Threads.main());
-
-                // Output'.' every 100 times (so you can see that it's progressing)
-                if ((i % 100) == 0)
-                    std::cout << "." << std::flush;
-            }
-
-            std::cout << "passed." << std::endl;
-            std::cout << num_games << " games, " << num_moves << " moves, "
-                      << num_updates.back() << " updates, "
-                      << (1.0 * num_updates.back() / num_moves)
-                      << " updates per move" << std::endl;
-            std::size_t num_observed_indices = 0;
-
-            for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-                const auto count = std::count(trigger_map.begin(), trigger_map.end(), i);
-                num_observed_indices += count;
-                std::cout << "TriggerEvent(" << static_cast<int>(kRefreshTriggers[i])
-                          << "): " << count << " features ("
-                          << (100.0 * count / RawFeatures::kDimensions) << "%), "
-                          << num_updates[i] << " updates ("
-                          << (1.0 * num_updates[i] / num_moves) << " per move), "
-                          << num_resets[i] << " resets ("
-                          << (100.0 * num_resets[i] / num_moves) << "%)"
-                          << std::endl;
-            }
-            std::cout << "observed " << num_observed_indices << " ("
-                      << (100.0 * num_observed_indices / RawFeatures::kDimensions)
-                      << "% of " << RawFeatures::kDimensions
-                      << ") features" << std::endl;
-        }
-
-        // Output a string that represents the structure of the evaluation function
-        void print_info(std::istream& stream) {
-            std::cout << "network architecture: " << get_architecture_string() << std::endl;
-
-            while (true) {
-                std::string file_name;
-                stream >> file_name;
-                if (file_name.empty())
-                    break;
-
-                std::uint32_t hash_value;
-                std::string architecture;
-                const bool success = [&]() {
-                    std::ifstream file_stream(file_name, std::ios::binary);
-
-                    if (!file_stream)
-                        return false;
-                    if (!read_header(file_stream, &hash_value, &architecture))
-                        return false;
-
-                    return true;
-                }();
-
-                std::cout << file_name << ": ";
-                if (success) {
-                    if (hash_value == kHashValue) {
-                        std::cout << "matches with this binary";
-                        if (architecture != get_architecture_string()) {
-                            std::cout << ", but architecture string differs: " << architecture;
-                        }
-
-                        std::cout << std::endl;
-                    } else {
-                        std::cout << architecture << std::endl;
-                    }
-                } else {
-                    std::cout << "failed to read header" << std::endl;
-                }
-            }
-        }
-
-    }  // namespace
-
-    // USI extended command for NNUE evaluation function
-    void test_command(Position& pos, std::istream& stream) {
-        std::string sub_command;
-        stream >> sub_command;
-
-        if (sub_command == "test_features") {
-            test_features(pos);
-        } else if (sub_command == "info") {
-            print_info(stream);
-        } else {
-            std::cout << "usage:" << std::endl;
-            std::cout << " test nnue test_features" << std::endl;
-            std::cout << " test nnue info [path/to/" << fileName << "...]" << std::endl;
-        }
-    }
-
-}  // namespace Eval::NNUE
diff --git a/src/nnue/nnue_test_command.h b/src/nnue/nnue_test_command.h
deleted file mode 100644
index fcfe16f6..00000000
--- a/src/nnue/nnue_test_command.h
+++ /dev/null
@@ -1,12 +0,0 @@
-﻿#ifndef _NNUE_TEST_COMMAND_H_
-#define _NNUE_TEST_COMMAND_H_
-
-// USI extended command interface for NNUE evaluation function
-namespace Eval::NNUE {
-
-    // USI extended command for NNUE evaluation function
-    void test_command(Position& pos, std::istream& stream);
-
-}  // namespace Eval::NNUE
-
-#endif
diff --git a/src/nnue/trainer/features/all_factorizers.h b/src/nnue/trainer/features/all_factorizers.h
deleted file mode 100644
index 75d62ec8..00000000
--- a/src/nnue/trainer/features/all_factorizers.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef _NNUE_TRAINER_FEATURES_ALL_FACTORIZERS_H_
-#define _NNUE_TRAINER_FEATURES_ALL_FACTORIZERS_H_
-
-#include "factorizer.h"
-#include "factorizer_feature_set.h"
-
-#include "factorizer_half_kp.h"
-#include "factorizer_half_ka.h"
-
-#endif
diff --git a/src/nnue/trainer/features/factorizer.h b/src/nnue/trainer/features/factorizer.h
deleted file mode 100644
index b64b0c74..00000000
--- a/src/nnue/trainer/features/factorizer.h
+++ /dev/null
@@ -1,117 +0,0 @@
-﻿#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_H_
-#define _NNUE_TRAINER_FEATURES_FACTORIZER_H_
-
-#include "nnue/nnue_common.h"
-
-#include "nnue/trainer/trainer.h"
-
-// NNUE evaluation function feature conversion class template
-namespace Eval::NNUE::Features {
-
-    // Class template that converts input features into learning features
-    // By default, the learning feature is the same as the original input feature, and specialized as necessary
-    template <typename FeatureType>
-    class Factorizer {
-    public:
-        static constexpr std::string get_name() {
-            return "Factorizer<" + FeatureType::get_name() + "> -> " + std::string("No factorizer");
-        }
-
-        static constexpr std::string get_factorizers_string() {
-            return "  - " + get_name();
-        }
-
-        // Get the dimensionality of the learning feature
-        static constexpr IndexType get_dimensions() {
-            return FeatureType::kDimensions;
-        }
-
-        // Get index of learning feature and scale of learning rate
-        static void append_training_features(
-            IndexType base_index, std::vector<TrainingFeature>* training_features) {
-
-            assert(base_index <FeatureType::kDimensions);
-            training_features->emplace_back(base_index);
-        }
-    };
-
-    // Learning feature information
-    struct FeatureProperties {
-        bool active;
-        IndexType dimensions;
-    };
-
-    // Add the original input features to the learning features
-    template <typename FeatureType>
-    IndexType append_base_feature(
-        FeatureProperties properties, IndexType base_index,
-        std::vector<TrainingFeature>* training_features) {
-
-        assert(properties.dimensions == FeatureType::kDimensions);
-        assert(base_index < FeatureType::kDimensions);
-        training_features->emplace_back(base_index);
-        return properties.dimensions;
-    }
-
-    // If the learning rate scale is not 0, inherit other types of learning features
-    template <typename FeatureType>
-    IndexType inherit_features_if_required(
-        IndexType index_offset, FeatureProperties properties, IndexType base_index,
-        std::vector<TrainingFeature>* training_features) {
-
-        if (!properties.active) {
-            return 0;
-        }
-
-        assert(properties.dimensions == Factorizer<FeatureType>::get_dimensions());
-        assert(base_index < FeatureType::kDimensions);
-
-        const auto start = training_features->size();
-        Factorizer<FeatureType>::append_training_features(
-            base_index, training_features);
-
-        for (auto i = start; i < training_features->size(); ++i) {
-            auto& feature = (*training_features)[i];
-            assert(feature.get_index() < Factorizer<FeatureType>::get_dimensions());
-            feature.shift_index(index_offset);
-        }
-
-        return properties.dimensions;
-    }
-
-    // Return the index difference as needed, without adding learning features
-    // Call instead of InheritFeaturesIfRequired() if there are no corresponding features
-    IndexType skip_features(FeatureProperties properties) {
-        if (!properties.active)
-            return 0;
-
-        return properties.dimensions;
-    }
-
-    // Get the dimensionality of the learning feature
-    template <std::size_t N>
-    constexpr IndexType get_active_dimensions(
-        const FeatureProperties (&properties)[N]) {
-
-        static_assert(N > 0, "");
-
-        IndexType dimensions = properties[0].dimensions;
-
-        for (std::size_t i = 1; i < N; ++i) {
-            if (properties[i].active) {
-                dimensions += properties[i].dimensions;
-            }
-        }
-
-        return dimensions;
-    }
-
-    // get the number of elements in the array
-    template <typename T, std::size_t N>
-    constexpr std::size_t get_array_length(const T (&/*array*/)[N]) {
-        return N;
-    }
-
-}  // namespace Eval::NNUE::Features
-
-#endif
diff --git a/src/nnue/trainer/features/factorizer_feature_set.h b/src/nnue/trainer/features/factorizer_feature_set.h
deleted file mode 100644
index 60f42166..00000000
--- a/src/nnue/trainer/features/factorizer_feature_set.h
+++ /dev/null
@@ -1,121 +0,0 @@
-﻿#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
-#define _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
-
-#include "factorizer.h"
-
-#include "nnue/features/feature_set.h"
-
-// Specialization for feature set of feature conversion class template of NNUE evaluation function
-namespace Eval::NNUE::Features {
-
-    // Class template that converts input features into learning features
-    // Specialization for FeatureSet
-    template <typename FirstFeatureType, typename... RemainingFeatureTypes>
-    class Factorizer<FeatureSet<FirstFeatureType, RemainingFeatureTypes...>> {
-    private:
-        using Head = Factorizer<FeatureSet<FirstFeatureType>>;
-        using Tail = Factorizer<FeatureSet<RemainingFeatureTypes...>>;
-
-    public:
-        // number of dimensions of original input features
-        static constexpr IndexType kBaseDimensions =
-            FeatureSet<FirstFeatureType, RemainingFeatureTypes...>::kDimensions;
-
-        static constexpr std::string get_factorizers_string() {
-            std::string str = "  - ";
-            str += Head::get_name();
-            str += '\n';
-            str += Tail::get_factorizers_string();
-            return str;
-        }
-
-        // Get the dimensionality of the learning feature
-        static constexpr IndexType get_dimensions() {
-            return Head::get_dimensions() + Tail::get_dimensions();
-        }
-
-        // Get index of learning feature and scale of learning rate
-        static void append_training_features(
-            IndexType base_index, std::vector<TrainingFeature>* training_features,
-            IndexType base_dimensions = kBaseDimensions) {
-
-            assert(base_index < kBaseDimensions);
-
-            constexpr auto boundary = FeatureSet<RemainingFeatureTypes...>::kDimensions;
-
-            if (base_index < boundary) {
-                Tail::append_training_features(
-                    base_index, training_features, base_dimensions);
-            }
-            else {
-                const auto start = training_features->size();
-
-                Head::append_training_features(
-                    base_index - boundary, training_features, base_dimensions);
-
-                for (auto i = start; i < training_features->size(); ++i) {
-                    auto& feature = (*training_features)[i];
-                    const auto index = feature.get_index();
-
-                    assert(index < Head::get_dimensions() ||
-                               (index >= base_dimensions &&
-                                index < base_dimensions +
-                                        Head::get_dimensions() - Head::kBaseDimensions));
-
-                    if (index < Head::kBaseDimensions) {
-                        feature.shift_index(Tail::kBaseDimensions);
-                    }
-                    else {
-                        feature.shift_index(Tail::get_dimensions() - Tail::kBaseDimensions);
-                    }
-                }
-            }
-        }
-    };
-
-    // Class template that converts input features into learning features
-    // Specialization when FeatureSet has one template argument
-    template <typename FeatureType>
-    class Factorizer<FeatureSet<FeatureType>> {
-    public:
-        // number of dimensions of original input features
-        static constexpr IndexType kBaseDimensions = FeatureType::kDimensions;
-
-        static constexpr std::string get_name() {
-            return Factorizer<FeatureType>::get_name();
-        }
-
-        static constexpr std::string get_factorizers_string() {
-            return "  - " + get_name();
-        }
-
-        // Get the dimensionality of the learning feature
-        static constexpr IndexType get_dimensions() {
-            return Factorizer<FeatureType>::get_dimensions();
-        }
-
-        // Get index of learning feature and scale of learning rate
-        static void append_training_features(
-            IndexType base_index, std::vector<TrainingFeature>* training_features,
-            IndexType base_dimensions = kBaseDimensions) {
-
-            assert(base_index < kBaseDimensions);
-
-            const auto start = training_features->size();
-
-            Factorizer<FeatureType>::append_training_features(
-                base_index, training_features);
-
-            for (auto i = start; i < training_features->size(); ++i) {
-                auto& feature = (*training_features)[i];
-                assert(feature.get_index() < Factorizer<FeatureType>::get_dimensions());
-                if (feature.get_index() >= kBaseDimensions) {
-                    feature.shift_index(base_dimensions - kBaseDimensions);
-                }
-            }
-        }
-    };
-
-}  // namespace Eval::NNUE::Features
-
-#endif
diff --git a/src/nnue/trainer/features/factorizer_half_ka.h b/src/nnue/trainer/features/factorizer_half_ka.h
deleted file mode 100644
index 36d36a2d..00000000
--- a/src/nnue/trainer/features/factorizer_half_ka.h
+++ /dev/null
@@ -1,93 +0,0 @@
-﻿#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KA_H_
-#define _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KA_H_
-
-#include "factorizer.h"
-
-#include "nnue/features/half_ka.h"
-#include "nnue/features/a.h"
-#include "nnue/features/half_relative_ka.h"
-
-// Specialization of NNUE evaluation function feature conversion class template for HalfKA
-namespace Eval::NNUE::Features {
-
-    // Class template that converts input features into learning features
-    // Specialization for HalfKA
-    template <Side AssociatedKing>
-    class Factorizer<HalfKA<AssociatedKing>> {
-    private:
-        using FeatureType = HalfKA<AssociatedKing>;
-
-        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-        static constexpr IndexType kMaxActiveDimensions =
-            FeatureType::kMaxActiveDimensions;
-
-        // Type of learning feature
-        enum TrainingFeatureType {
-            kFeaturesHalfKA,
-            kFeaturesA,
-            kFeaturesHalfRelativeKA,
-            kNumTrainingFeatureTypes,
-        };
-
-        // Learning feature information
-        static constexpr FeatureProperties kProperties[] = {
-            // kFeaturesHalfA
-            {true, FeatureType::kDimensions},
-            // kFeaturesA
-            {true, Factorizer<A>::get_dimensions()},
-            // kFeaturesHalfRelativeKA
-            {true, Factorizer<HalfRelativeKA<AssociatedKing>>::get_dimensions()},
-        };
-
-        static_assert(get_array_length(kProperties) == kNumTrainingFeatureTypes, "");
-
-    public:
-        static constexpr std::string get_name() {
-            return std::string("Factorizer<") + FeatureType::kName + "> -> " + "A, HalfRelativeKA";
-        }
-
-        static constexpr std::string get_factorizers_string() {
-            return "  - " + get_name();
-        }
-
-        // Get the dimensionality of the learning feature
-        static constexpr IndexType get_dimensions() {
-            return get_active_dimensions(kProperties);
-        }
-
-        // Get index of learning feature and scale of learning rate
-        static void append_training_features(
-            IndexType base_index, std::vector<TrainingFeature>* training_features) {
-
-            // kFeaturesHalfA
-            IndexType index_offset = append_base_feature<FeatureType>(
-                kProperties[kFeaturesHalfKA], base_index, training_features);
-
-            const auto sq_k = static_cast<Square>(base_index / PS_END2);
-            const auto a = static_cast<IndexType>(base_index % PS_END2);
-
-            // kFeaturesA
-            index_offset += inherit_features_if_required<A>(
-                index_offset, kProperties[kFeaturesA], a, training_features);
-
-            // kFeaturesHalfRelativeKA
-            if (a >= PS_W_PAWN) {
-                index_offset += inherit_features_if_required<HalfRelativeKA<AssociatedKing>>(
-                    index_offset, kProperties[kFeaturesHalfRelativeKA],
-                    HalfRelativeKA<AssociatedKing>::make_index(sq_k, a),
-                    training_features);
-            }
-            else {
-                index_offset += skip_features(kProperties[kFeaturesHalfRelativeKA]);
-            }
-
-            assert(index_offset == get_dimensions());
-        }
-    };
-
-    template <Side AssociatedKing>
-    constexpr FeatureProperties Factorizer<HalfKA<AssociatedKing>>::kProperties[];
-
-}  // namespace Eval::NNUE::Features
-
-#endif // #ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KA_H_
diff --git a/src/nnue/trainer/features/factorizer_half_kp.h b/src/nnue/trainer/features/factorizer_half_kp.h
deleted file mode 100644
index c554f0fc..00000000
--- a/src/nnue/trainer/features/factorizer_half_kp.h
+++ /dev/null
@@ -1,104 +0,0 @@
-﻿#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
-#define _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
-
-#include "factorizer.h"
-
-#include "nnue/features/half_kp.h"
-#include "nnue/features/p.h"
-#include "nnue/features/half_relative_kp.h"
-
-// Specialization of NNUE evaluation function feature conversion class template for HalfKP
-namespace Eval::NNUE::Features {
-
-    // Class template that converts input features into learning features
-    // Specialization for HalfKP
-    template <Side AssociatedKing>
-    class Factorizer<HalfKP<AssociatedKing>> {
-    private:
-        using FeatureType = HalfKP<AssociatedKing>;
-
-        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-        static constexpr IndexType kMaxActiveDimensions =
-            FeatureType::kMaxActiveDimensions;
-
-        // Type of learning feature
-        enum TrainingFeatureType {
-            kFeaturesHalfKP,
-            kFeaturesHalfK,
-            kFeaturesP,
-            kFeaturesHalfRelativeKP,
-            kNumTrainingFeatureTypes,
-        };
-
-        // Learning feature information
-        static constexpr FeatureProperties kProperties[] = {
-            // kFeaturesHalfKP
-            {true, FeatureType::kDimensions},
-            // kFeaturesHalfK
-            {true, SQUARE_NB},
-            // kFeaturesP
-            {true, Factorizer<P>::get_dimensions()},
-            // kFeaturesHalfRelativeKP
-            {true, Factorizer<HalfRelativeKP<AssociatedKing>>::get_dimensions()},
-        };
-
-        static_assert(get_array_length(kProperties) == kNumTrainingFeatureTypes, "");
-
-    public:
-        static constexpr std::string get_name() {
-            return std::string("Factorizer<") + FeatureType::kName + "> -> " + "HalfK, P, HalfRelativeKP";
-        }
-
-        static constexpr std::string get_factorizers_string() {
-            return "  - " + get_name();
-        }
-
-        // Get the dimensionality of the learning feature
-        static constexpr IndexType get_dimensions() {
-            return get_active_dimensions(kProperties);
-        }
-
-        // Get index of learning feature and scale of learning rate
-        static void append_training_features(
-            IndexType base_index, std::vector<TrainingFeature>* training_features) {
-
-            // kFeaturesHalfKP
-            IndexType index_offset = append_base_feature<FeatureType>(
-                kProperties[kFeaturesHalfKP], base_index, training_features);
-
-            const auto sq_k = static_cast<Square>(base_index / PS_END);
-            const auto p = static_cast<IndexType>(base_index % PS_END);
-
-            // kFeaturesHalfK
-            {
-                const auto& properties = kProperties[kFeaturesHalfK];
-                if (properties.active) {
-                    training_features->emplace_back(index_offset + sq_k);
-                    index_offset += properties.dimensions;
-                }
-            }
-
-            // kFeaturesP
-            index_offset += inherit_features_if_required<P>(
-                index_offset, kProperties[kFeaturesP], p, training_features);
-            // kFeaturesHalfRelativeKP
-            if (p >= PS_W_PAWN) {
-                index_offset += inherit_features_if_required<HalfRelativeKP<AssociatedKing>>(
-                    index_offset, kProperties[kFeaturesHalfRelativeKP],
-                    HalfRelativeKP<AssociatedKing>::make_index(sq_k, p),
-                    training_features);
-            }
-            else {
-                index_offset += skip_features(kProperties[kFeaturesHalfRelativeKP]);
-            }
-
-            assert(index_offset == get_dimensions());
-        }
-    };
-
-    template <Side AssociatedKing>
-    constexpr FeatureProperties Factorizer<HalfKP<AssociatedKing>>::kProperties[];
-
-}  // namespace Eval::NNUE::Features
-
-#endif
diff --git a/src/nnue/trainer/trainer.h b/src/nnue/trainer/trainer.h
deleted file mode 100644
index 973bc898..00000000
--- a/src/nnue/trainer/trainer.h
+++ /dev/null
@@ -1,122 +0,0 @@
-﻿#ifndef _NNUE_TRAINER_H_
-#define _NNUE_TRAINER_H_
-
-#include "nnue/nnue_common.h"
-#include "nnue/features/index_list.h"
-
-#include <sstream>
-
-#if defined(USE_BLAS)
-static_assert(std::is_same<LearnFloatType, float>::value, "");
-#include <cblas.h>
-#endif
-
-// Common header of class template for learning NNUE evaluation function
-namespace Eval::NNUE {
-
-    // Ponanza constant used in the relation between evaluation value and winning percentage
-    constexpr double kPonanzaConstant = 600.0;
-
-    // Class that represents one index of learning feature
-    class TrainingFeature {
-        using StorageType = std::uint32_t;
-        static_assert(std::is_unsigned<StorageType>::value, "");
-
-    public:
-        static constexpr std::uint32_t kIndexBits = 24;
-
-        static_assert(kIndexBits < std::numeric_limits<StorageType>::digits, "");
-
-        static constexpr std::uint32_t kCountBits =
-            std::numeric_limits<StorageType>::digits - kIndexBits;
-
-        explicit TrainingFeature(IndexType index) :
-            index_and_count_((index << kCountBits) | 1) {
-
-            assert(index < (1 << kIndexBits));
-        }
-
-        TrainingFeature& operator+=(const TrainingFeature& other) {
-            assert(other.get_index() == get_index());
-            assert(other.get_count() + get_count() < (1 << kCountBits));
-            index_and_count_ += other.get_count();
-            return *this;
-        }
-
-        IndexType get_index() const {
-            return static_cast<IndexType>(index_and_count_ >> kCountBits);
-        }
-
-        void shift_index(IndexType offset) {
-            assert(get_index() + offset < (1 << kIndexBits));
-            index_and_count_ += offset << kCountBits;
-        }
-
-        IndexType get_count() const {
-            return static_cast<IndexType>(index_and_count_ & ((1 << kCountBits) - 1));
-        }
-
-        bool operator<(const TrainingFeature& other) const {
-            return index_and_count_ < other.index_and_count_;
-        }
-
-    private:
-        StorageType index_and_count_;
-    };
-
-    // Structure that represents one sample of training data
-    struct Example {
-        std::vector<TrainingFeature> training_features[2];
-        Learner::PackedSfenValue psv;
-        Value discrete_nn_eval;
-        int sign;
-        double weight;
-    };
-
-    // Message used for setting hyperparameters
-    struct Message {
-        Message(const std::string& message_name, const std::string& message_value = "") :
-            name(message_name), value(message_value), num_peekers(0), num_receivers(0)
-        {
-        }
-
-        const std::string name;
-        const std::string value;
-        std::uint32_t num_peekers;
-        std::uint32_t num_receivers;
-    };
-
-    // determine whether to accept the message
-    bool receive_message(const std::string& name, Message* message) {
-        const auto subscript = "[" + std::to_string(message->num_peekers) + "]";
-
-        if (message->name.substr(0, name.size() + 1) == name + "[") {
-            ++message->num_peekers;
-        }
-
-        if (message->name == name || message->name == name + subscript) {
-            ++message->num_receivers;
-            return true;
-        }
-
-        return false;
-    }
-
-    // round a floating point number to an integer
-    template <typename IntType>
-    IntType round(double value) {
-        return static_cast<IntType>(std::floor(value + 0.5));
-    }
-
-    // make_shared with alignment
-    template <typename T, typename... ArgumentTypes>
-    std::shared_ptr<T> make_aligned_shared_ptr(ArgumentTypes&&... arguments) {
-        const auto ptr = new(std_aligned_alloc(alignof(T), sizeof(T)))
-            T(std::forward<ArgumentTypes>(arguments)...);
-
-        return std::shared_ptr<T>(ptr, AlignedDeleter<T>());
-    }
-
-}  // namespace Eval::NNUE
-
-#endif
diff --git a/src/nnue/trainer/trainer_affine_transform.h b/src/nnue/trainer/trainer_affine_transform.h
deleted file mode 100644
index 53e8f904..00000000
--- a/src/nnue/trainer/trainer_affine_transform.h
+++ /dev/null
@@ -1,476 +0,0 @@
-﻿#ifndef _NNUE_TRAINER_AFFINE_TRANSFORM_H_
-#define _NNUE_TRAINER_AFFINE_TRANSFORM_H_
-
-#include "trainer.h"
-
-#include "extra/stockfish_blas.h"
-
-#include "learn/learn.h"
-
-#include "nnue/layers/affine_transform.h"
-
-#include "thread.h"
-
-#include <random>
-
-// Specialization of NNUE evaluation function learning class template for AffineTransform
-namespace Eval::NNUE {
-
-    // Learning: Affine transformation layer
-    template <typename PreviousLayer, IndexType OutputDimensions>
-    class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
-    private:
-        // Type of layer to learn
-        using LayerType = Layers::AffineTransform<PreviousLayer, OutputDimensions>;
-
-    public:
-        // factory function
-        static std::shared_ptr<Trainer> create(
-            LayerType* target_layer, FeatureTransformer* ft) {
-
-            return std::shared_ptr<Trainer>(
-                new Trainer(target_layer, ft));
-        }
-
-        // Set options such as hyperparameters
-        void send_message(Message* message) {
-            previous_layer_trainer_->send_message(message);
-
-            if (receive_message("momentum", message)) {
-                momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
-            }
-
-            if (receive_message("learning_rate_scale", message)) {
-                learning_rate_scale_ =
-                    static_cast<LearnFloatType>(std::stod(message->value));
-            }
-
-            if (receive_message("reset", message)) {
-                dequantize_parameters();
-            }
-
-            if (receive_message("quantize_parameters", message)) {
-                quantize_parameters();
-            }
-
-            if (receive_message("check_health", message)) {
-                check_health();
-            }
-        }
-
-        // Initialize the parameters with random numbers
-        template <typename RNG>
-        void initialize(RNG& rng) {
-            previous_layer_trainer_->initialize(rng);
-
-            if (kIsOutputLayer) {
-                // Initialize output layer with 0
-                std::fill(std::begin(biases_), std::end(biases_),
-                          static_cast<LearnFloatType>(0.0));
-                std::fill(std::begin(weights_), std::end(weights_),
-                          static_cast<LearnFloatType>(0.0));
-            }
-            else {
-                // Assuming that the input distribution is unit-mean 0.5, equal variance,
-                // Initialize the output distribution so that each unit has a mean of 0.5 and the same variance as the input
-                const double kSigma = 1.0 / std::sqrt(kInputDimensions);
-                auto distribution = std::normal_distribution<double>(0.0, kSigma);
-
-                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                    double sum = 0.0;
-                      for (IndexType j = 0; j < kInputDimensions; ++j) {
-                          const auto weight = static_cast<LearnFloatType>(distribution(rng));
-                          weights_[kInputDimensions * i + j] = weight;
-                          sum += weight;
-                      }
-
-                    biases_[i] = static_cast<LearnFloatType>(0.5 - 0.5 * sum);
-                }
-            }
-
-            quantize_parameters();
-        }
-
-        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
-        {
-            const auto size = batch_end - batch_begin;
-
-            if ((long)output_.size() < (long)kOutputDimensions * size) {
-                output_.resize(kOutputDimensions * size);
-                gradients_.resize(kInputDimensions * size);
-            }
-
-            if (thread_states_.size() < thread_pool.size())
-            {
-                thread_states_.resize(thread_pool.size());
-            }
-
-            combined_batch_size_ = size;
-            combined_batch_input_ = previous_layer_trainer_->step_start(thread_pool, batch_begin, batch_end);
-
-            auto& main_thread_state = thread_states_[0];
-
-#if defined(USE_BLAS)
-
-            // update
-            cblas_sscal(
-                kOutputDimensions, momentum_, main_thread_state.biases_diff_, 1
-            );
-
-#else
-
-            Blas::sscal(
-                kOutputDimensions, momentum_, main_thread_state.biases_diff_, 1
-            );
-
-#endif
-
-            for (IndexType i = 1; i < thread_states_.size(); ++i)
-                thread_states_[i].reset_biases();
-
-            return output_.data();
-        }
-
-        // forward propagation
-        void propagate(Thread& th, const uint64_t offset, const uint64_t count) {
-
-            previous_layer_trainer_->propagate(th, offset, count);
-
-#if defined(USE_BLAS)
-
-            for (IndexType b = offset; b < offset + count; ++b) {
-                const IndexType batch_offset = kOutputDimensions * b;
-                cblas_scopy(
-                    kOutputDimensions, biases_, 1, &output_[batch_offset], 1
-                );
-            }
-
-            cblas_sgemm(
-                CblasColMajor, CblasTrans, CblasNoTrans,
-                kOutputDimensions, count, kInputDimensions,
-                1.0,
-                weights_, kInputDimensions,
-                combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
-                1.0,
-                &output_[offset * kOutputDimensions], kOutputDimensions
-            );
-#else
-
-            for (IndexType b = offset; b < offset + count; ++b) {
-                const IndexType batch_offset = kOutputDimensions * b;
-                Blas::scopy(
-                    kOutputDimensions, biases_, 1, &output_[batch_offset], 1
-                );
-            }
-
-            Blas::sgemm(
-                Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans,
-                kOutputDimensions, count, kInputDimensions,
-                1.0,
-                weights_, kInputDimensions,
-                combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
-                1.0,
-                &output_[offset * kOutputDimensions], kOutputDimensions
-            );
-
-#endif
-        }
-
-        // backpropagation
-        void backpropagate(Thread& th,
-                           const LearnFloatType* gradients,
-                           uint64_t offset,
-                           uint64_t count) {
-
-            auto& thread_state = thread_states_[th.thread_idx()];
-            const auto momentum = th.thread_idx() == 0 ? momentum_ : 0.0f;
-#if defined(USE_BLAS)
-
-            cblas_sgemm(
-                CblasColMajor, CblasNoTrans, CblasNoTrans,
-                kInputDimensions, count, kOutputDimensions,
-                1.0,
-                weights_, kInputDimensions,
-                gradients + offset * kOutputDimensions, kOutputDimensions,
-                0.0,
-                &gradients_[offset * kInputDimensions], kInputDimensions
-            );
-
-            for (IndexType b = offset; b < offset + count; ++b) {
-                const IndexType batch_offset = kOutputDimensions * b;
-                cblas_saxpy(
-                    kOutputDimensions, 1.0,
-                    &gradients[batch_offset], 1, thread_state.biases_diff_, 1
-                );
-            }
-
-            cblas_sgemm(
-                CblasRowMajor, CblasTrans, CblasNoTrans,
-                kOutputDimensions, kInputDimensions, count,
-                1.0,
-                gradients + offset * kOutputDimensions, kOutputDimensions,
-                combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
-                momentum,
-                thread_state.weights_diff_, kInputDimensions
-            );
-
-#else
-
-            // backpropagate
-            Blas::sgemm(
-                Blas::MatrixLayout::ColMajor, Blas::MatrixTranspose::NoTrans, Blas::MatrixTranspose::NoTrans,
-                kInputDimensions, count, kOutputDimensions,
-                1.0,
-                weights_, kInputDimensions,
-                gradients + offset * kOutputDimensions, kOutputDimensions,
-                0.0,
-                &gradients_[offset * kInputDimensions], kInputDimensions
-            );
-
-            for (IndexType b = offset; b < offset + count; ++b) {
-                const IndexType batch_offset = kOutputDimensions * b;
-                Blas::saxpy(kOutputDimensions, 1.0,
-                          &gradients[batch_offset], 1, thread_state.biases_diff_, 1);
-            }
-
-            Blas::sgemm(
-                Blas::MatrixLayout::RowMajor, Blas::MatrixTranspose::Trans, Blas::MatrixTranspose::NoTrans,
-                kOutputDimensions, kInputDimensions, count,
-                1.0,
-                gradients + offset * kOutputDimensions, kOutputDimensions,
-                combined_batch_input_ + offset * kInputDimensions, kInputDimensions,
-                momentum,
-                thread_state.weights_diff_, kInputDimensions
-            );
-
-#endif
-
-            previous_layer_trainer_->backpropagate(th, gradients_.data(), offset, count);
-        }
-
-        void reduce_thread_state()
-        {
-            for (IndexType i = 1; i < thread_states_.size(); ++i)
-            {
-                thread_states_[0] += thread_states_[i];
-            }
-        }
-
-        void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate)
-        {
-            const LearnFloatType local_learning_rate =
-                learning_rate * learning_rate_scale_;
-
-            reduce_thread_state();
-
-            auto& main_thread_state = thread_states_[0];
-
-            for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                const double d = local_learning_rate * main_thread_state.biases_diff_[i];
-                biases_[i] -= d;
-                abs_biases_diff_sum_ += std::abs(d);
-            }
-            num_biases_diffs_ += kOutputDimensions;
-
-            for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-                const double d = local_learning_rate * main_thread_state.weights_diff_[i];
-                weights_[i] -= d;
-                abs_weights_diff_sum_ += std::abs(d);
-            }
-            num_weights_diffs_ += kOutputDimensions * kInputDimensions;
-
-            previous_layer_trainer_->step_end(thread_pool, learning_rate);
-        }
-
-    private:
-        // constructor
-        Trainer(LayerType* target_layer, FeatureTransformer* ft) :
-            combined_batch_size_(0),
-            combined_batch_input_(nullptr),
-            previous_layer_trainer_(Trainer<PreviousLayer>::create(
-                &target_layer->previous_layer_, ft)),
-            target_layer_(target_layer),
-            biases_(),
-            weights_(),
-            momentum_(0.2),
-            learning_rate_scale_(1.0) {
-
-            dequantize_parameters();
-        }
-
-        void reset_stats() {
-            abs_biases_diff_sum_ = 0.0;
-            abs_weights_diff_sum_ = 0.0;
-            num_biases_diffs_ = 0;
-            num_weights_diffs_ = 0;
-        }
-
-        void check_health() {
-
-            double abs_bias_sum = 0.0;
-            double abs_weight_sum = 0.0;
-
-            for(auto b : biases_)
-                abs_bias_sum += std::abs(b);
-
-            for(auto w : weights_)
-                abs_weight_sum += std::abs(w);
-
-            auto out = sync_region_cout.new_region();
-
-            out << "INFO (check_health):"
-                << " layer " << LayerType::kLayerIndex
-                << " - " << LayerType::get_name()
-                << std::endl;
-
-            out << "  - avg_abs_bias        = " << abs_bias_sum / std::size(biases_) << std::endl;
-            out << "  - avg_abs_bias_diff   = " << abs_biases_diff_sum_ / num_biases_diffs_ << std::endl;
-            out << "  - avg_abs_weight      = " << abs_weight_sum / std::size(weights_) << std::endl;
-            out << "  - avg_abs_weight_diff = " << abs_weights_diff_sum_ / num_weights_diffs_ << std::endl;
-
-            out.unlock();
-
-            reset_stats();
-        }
-
-        // Weight saturation and parameterization
-        void quantize_parameters() {
-            for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-                weights_[i] = std::max(-kMaxWeightMagnitude,
-                                       std::min(+kMaxWeightMagnitude, weights_[i]));
-            }
-
-            for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                target_layer_->biases_[i] =
-                    round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
-            }
-
-            for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                const auto offset = kInputDimensions * i;
-                const auto padded_offset = LayerType::kPaddedInputDimensions * i;
-                for (IndexType j = 0; j < kInputDimensions; ++j) {
-                    target_layer_->weights_[padded_offset + j] =
-                        round<typename LayerType::WeightType>(
-                            weights_[offset + j] * kWeightScale);
-                }
-            }
-        }
-
-        // read parameterized integer
-        void dequantize_parameters() {
-            for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                biases_[i] = static_cast<LearnFloatType>(
-                    target_layer_->biases_[i] / kBiasScale);
-            }
-
-            for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                const auto offset = kInputDimensions * i;
-                const auto padded_offset = LayerType::kPaddedInputDimensions * i;
-                for (IndexType j = 0; j < kInputDimensions; ++j) {
-                    weights_[offset + j] = static_cast<LearnFloatType>(
-                        target_layer_->weights_[padded_offset + j] / kWeightScale);
-                }
-            }
-
-            for (auto& state : thread_states_)
-            {
-                state.reset_weights();
-                state.reset_biases();
-            }
-
-
-            reset_stats();
-        }
-
-        // number of input/output dimensions
-        static constexpr IndexType kInputDimensions = LayerType::kInputDimensions;
-        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
-
-        // If the output dimensionality is 1, the output layer
-        static constexpr bool kIsOutputLayer = kOutputDimensions == 1;
-
-        // Coefficient used for parameterization
-        static constexpr LearnFloatType kActivationScale =
-            std::numeric_limits<std::int8_t>::max();
-
-        static constexpr LearnFloatType kBiasScale = kIsOutputLayer ?
-            (kPonanzaConstant * FV_SCALE) :
-            ((1 << kWeightScaleBits) * kActivationScale);
-
-        static constexpr LearnFloatType kWeightScale = kBiasScale / kActivationScale;
-
-        // Upper limit of absolute value of weight used to prevent overflow when parameterizing integers
-        static constexpr LearnFloatType kMaxWeightMagnitude =
-            std::numeric_limits<typename LayerType::WeightType>::max() / kWeightScale;
-
-        // number of samples in mini-batch
-        IndexType combined_batch_size_;
-
-        double abs_biases_diff_sum_;
-        double abs_weights_diff_sum_;
-        uint64_t num_biases_diffs_;
-        uint64_t num_weights_diffs_;
-
-        // Input mini batch
-        const LearnFloatType* combined_batch_input_;
-
-        // Trainer of the previous layer
-        const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
-
-        // layer to learn
-        LayerType* const target_layer_;
-
-        // parameter
-        struct alignas(kCacheLineSize) ThreadState
-        {
-            // Buffer used for updating parameters
-            alignas(kCacheLineSize) LearnFloatType biases_diff_[kOutputDimensions];
-            alignas(kCacheLineSize) LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions];
-
-            ThreadState() { reset_weights(); reset_biases(); }
-
-            ThreadState& operator+=(const ThreadState& other)
-            {
-                for (IndexType i = 0; i < kOutputDimensions; ++i)
-                {
-                    biases_diff_[i] += other.biases_diff_[i];
-                }
-
-                for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i)
-                {
-                    weights_diff_[i] += other.weights_diff_[i];
-                }
-
-                return *this;
-            }
-
-            void reset_weights()
-            {
-                std::fill(std::begin(weights_diff_), std::end(weights_diff_), 0.0f);
-            }
-
-            void reset_biases()
-            {
-                std::fill(std::begin(biases_diff_), std::end(biases_diff_), 0.0f);
-            }
-        };
-
-        alignas(kCacheLineSize) LearnFloatType biases_[kOutputDimensions];
-        alignas(kCacheLineSize) LearnFloatType weights_[kOutputDimensions * kInputDimensions];
-
-        std::vector<ThreadState, CacheLineAlignedAllocator<ThreadState>> thread_states_;
-
-        // Forward propagation buffer
-        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
-
-        // buffer for back propagation
-        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
-
-        // hyper parameter
-        LearnFloatType momentum_;
-        LearnFloatType learning_rate_scale_;
-    };
-
-}  // namespace Eval::NNUE
-
-#endif
diff --git a/src/nnue/trainer/trainer_clipped_relu.h b/src/nnue/trainer/trainer_clipped_relu.h
deleted file mode 100644
index 48dec8be..00000000
--- a/src/nnue/trainer/trainer_clipped_relu.h
+++ /dev/null
@@ -1,354 +0,0 @@
-﻿#ifndef _NNUE_TRAINER_CLIPPED_RELU_H_
-#define _NNUE_TRAINER_CLIPPED_RELU_H_
-
-#include "trainer.h"
-
-#include "learn/learn.h"
-
-#include "nnue/layers/clipped_relu.h"
-
-#include "thread.h"
-
-// Specialization of NNUE evaluation function learning class template for ClippedReLU
-namespace Eval::NNUE {
-
-    // Learning: Affine transformation layer
-    template <typename PreviousLayer>
-    class Trainer<Layers::ClippedReLU<PreviousLayer>> {
-    private:
-        // Type of layer to learn
-        using LayerType = Layers::ClippedReLU<PreviousLayer>;
-
-    public:
-        // factory function
-        static std::shared_ptr<Trainer> create(
-            LayerType* target_layer, FeatureTransformer* ft) {
-
-            return std::shared_ptr<Trainer>(
-                new Trainer(target_layer, ft));
-        }
-
-        // Set options such as hyperparameters
-        void send_message(Message* message) {
-            previous_layer_trainer_->send_message(message);
-            if (receive_message("check_health", message)) {
-                check_health();
-            }
-        }
-
-        // Initialize the parameters with random numbers
-        template <typename RNG>
-        void initialize(RNG& rng) {
-            previous_layer_trainer_->initialize(rng);
-        }
-
-        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
-        {
-            const auto size = batch_end - batch_begin;
-
-            if ((long)output_.size() < (long)kOutputDimensions * size) {
-              output_.resize(kOutputDimensions * size);
-              gradients_.resize(kInputDimensions * size);
-            }
-
-            if (thread_states_.size() < thread_pool.size())
-            {
-                thread_states_.resize(thread_pool.size());
-            }
-
-            input_ = previous_layer_trainer_->step_start(thread_pool, batch_begin, batch_end);
-
-            batch_size_ = size;
-
-            return output_.data();
-        }
-
-        // forward propagation
-        void propagate(Thread& th, const uint64_t offset, const uint64_t count) {
-
-            auto& thread_state = thread_states_[th.thread_idx()];
-
-            previous_layer_trainer_->propagate(th, offset, count);
-
-#if defined (USE_SSE2)
-
-            {
-                static_assert(kOutputDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");
-
-                const __m128 kZero4 = _mm_set1_ps(+kZero);
-                const __m128 kOne4 = _mm_set1_ps(+kOne);
-
-                for (IndexType b = offset; b < offset + count; ++b)
-                {
-                    const IndexType batch_offset = kOutputDimensions * b;
-
-                    for (IndexType i = 0; i < kOutputDimensions; i += 16)
-                    {
-                        __m128 out0 = _mm_loadu_ps(&input_[i + 0 + batch_offset]);
-                        __m128 out1 = _mm_loadu_ps(&input_[i + 4 + batch_offset]);
-                        __m128 out2 = _mm_loadu_ps(&input_[i + 8 + batch_offset]);
-                        __m128 out3 = _mm_loadu_ps(&input_[i + 12 + batch_offset]);
-
-                        out0 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out0));
-                        out1 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out1));
-                        out2 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out2));
-                        out3 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out3));
-
-                        _mm_storeu_ps(&output_[i + 0 + batch_offset], out0);
-                        _mm_storeu_ps(&output_[i + 4 + batch_offset], out1);
-                        _mm_storeu_ps(&output_[i + 8 + batch_offset], out2);
-                        _mm_storeu_ps(&output_[i + 12 + batch_offset], out3);
-
-                        __m128 minact0 = _mm_loadu_ps(&thread_state.min_activations_[i + 0]);
-                        __m128 minact1 = _mm_loadu_ps(&thread_state.min_activations_[i + 4]);
-                        __m128 minact2 = _mm_loadu_ps(&thread_state.min_activations_[i + 8]);
-                        __m128 minact3 = _mm_loadu_ps(&thread_state.min_activations_[i + 12]);
-
-                        __m128 maxact0 = _mm_loadu_ps(&thread_state.max_activations_[i + 0]);
-                        __m128 maxact1 = _mm_loadu_ps(&thread_state.max_activations_[i + 4]);
-                        __m128 maxact2 = _mm_loadu_ps(&thread_state.max_activations_[i + 8]);
-                        __m128 maxact3 = _mm_loadu_ps(&thread_state.max_activations_[i + 12]);
-
-                        minact0 = _mm_min_ps(out0, minact0);
-                        minact1 = _mm_min_ps(out1, minact1);
-                        minact2 = _mm_min_ps(out2, minact2);
-                        minact3 = _mm_min_ps(out3, minact3);
-
-                        maxact0 = _mm_max_ps(out0, maxact0);
-                        maxact1 = _mm_max_ps(out1, maxact1);
-                        maxact2 = _mm_max_ps(out2, maxact2);
-                        maxact3 = _mm_max_ps(out3, maxact3);
-
-                        _mm_storeu_ps(&thread_state.min_activations_[i + 0], minact0);
-                        _mm_storeu_ps(&thread_state.min_activations_[i + 4], minact1);
-                        _mm_storeu_ps(&thread_state.min_activations_[i + 8], minact2);
-                        _mm_storeu_ps(&thread_state.min_activations_[i + 12], minact3);
-
-                        _mm_storeu_ps(&thread_state.max_activations_[i + 0], maxact0);
-                        _mm_storeu_ps(&thread_state.max_activations_[i + 4], maxact1);
-                        _mm_storeu_ps(&thread_state.max_activations_[i + 8], maxact2);
-                        _mm_storeu_ps(&thread_state.max_activations_[i + 12], maxact3);
-                    }
-                }
-            }
-
-#else
-
-            for (IndexType b = offset; b < offset + count; ++b) {
-                const IndexType batch_offset = kOutputDimensions * b;
-                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                    const IndexType index = batch_offset + i;
-                    output_[index] = std::max(+kZero, std::min(+kOne, input_[index]));
-                    thread_state.min_activations_[i] = std::min(thread_state.min_activations_[i], output_[index]);
-                    thread_state.max_activations_[i] = std::max(thread_state.max_activations_[i], output_[index]);
-                }
-            }
-
-#endif
-        }
-
-        // backpropagation
-        void backpropagate(Thread& th,
-                           const LearnFloatType* gradients,
-                           const uint64_t offset,
-                           const uint64_t count) {
-
-            auto& thread_state = thread_states_[th.thread_idx()];
-
-#if defined (USE_SSE2)
-
-            {
-                static_assert(kOutputDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");
-
-                const __m128 kZero4 = _mm_set1_ps(+kZero);
-                const __m128 kOne4 = _mm_set1_ps(+kOne);
-
-                for (IndexType b = offset; b < offset + count; ++b)
-                {
-                    const IndexType batch_offset = kOutputDimensions * b;
-
-                    for (IndexType i = 0; i < kOutputDimensions; i += 16)
-                    {
-                        __m128 out0 = _mm_loadu_ps(&output_[batch_offset + i + 0]);
-                        __m128 out1 = _mm_loadu_ps(&output_[batch_offset + i + 4]);
-                        __m128 out2 = _mm_loadu_ps(&output_[batch_offset + i + 8]);
-                        __m128 out3 = _mm_loadu_ps(&output_[batch_offset + i + 12]);
-
-                        __m128 clipped0 = _mm_or_ps(_mm_cmple_ps(out0, kZero4), _mm_cmpge_ps(out0, kOne4));
-                        __m128 clipped1 = _mm_or_ps(_mm_cmple_ps(out1, kZero4), _mm_cmpge_ps(out1, kOne4));
-                        __m128 clipped2 = _mm_or_ps(_mm_cmple_ps(out2, kZero4), _mm_cmpge_ps(out2, kOne4));
-                        __m128 clipped3 = _mm_or_ps(_mm_cmple_ps(out3, kZero4), _mm_cmpge_ps(out3, kOne4));
-
-                        __m128 grad0 = _mm_loadu_ps(&gradients[batch_offset + i + 0]);
-                        __m128 grad1 = _mm_loadu_ps(&gradients[batch_offset + i + 4]);
-                        __m128 grad2 = _mm_loadu_ps(&gradients[batch_offset + i + 8]);
-                        __m128 grad3 = _mm_loadu_ps(&gradients[batch_offset + i + 12]);
-
-                        grad0 = _mm_andnot_ps(clipped0, grad0);
-                        grad1 = _mm_andnot_ps(clipped1, grad1);
-                        grad2 = _mm_andnot_ps(clipped2, grad2);
-                        grad3 = _mm_andnot_ps(clipped3, grad3);
-
-                        _mm_storeu_ps(&gradients_[batch_offset + i + 0], grad0);
-                        _mm_storeu_ps(&gradients_[batch_offset + i + 4], grad1);
-                        _mm_storeu_ps(&gradients_[batch_offset + i + 8], grad2);
-                        _mm_storeu_ps(&gradients_[batch_offset + i + 12], grad3);
-
-                        const int clipped_mask =
-                            (_mm_movemask_ps(clipped0) << 0)
-                            | (_mm_movemask_ps(clipped1) << 4)
-                            | (_mm_movemask_ps(clipped2) << 8)
-                            | (_mm_movemask_ps(clipped3) << 12);
-
-                        thread_state.num_clipped_ += popcount(clipped_mask);
-                    }
-                }
-            }
-
-#else
-
-            for (IndexType b = offset; b < offset + count; ++b) {
-                const IndexType batch_offset = kOutputDimensions * b;
-                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                    const IndexType index = batch_offset + i;
-                    const bool clipped = (output_[index] <= kZero) | (output_[index] >= kOne);
-                    gradients_[index] = gradients[index] * !clipped;
-                    thread_state.num_clipped_ += clipped;
-                }
-            }
-
-#endif
-
-            thread_state.num_total_ += count * kOutputDimensions;
-
-            previous_layer_trainer_->backpropagate(th, gradients_.data(), offset, count);
-        }
-
-        void reduce_thread_state()
-        {
-            for (IndexType i = 1; i < thread_states_.size(); ++i)
-            {
-                thread_states_[0] += thread_states_[i];
-            }
-        }
-
-        void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate)
-        {
-            previous_layer_trainer_->step_end(thread_pool, learning_rate);
-        }
-
-    private:
-        // constructor
-        Trainer(LayerType* target_layer, FeatureTransformer* ft) :
-            batch_size_(0),
-            previous_layer_trainer_(Trainer<PreviousLayer>::create(
-                &target_layer->previous_layer_, ft)),
-            target_layer_(target_layer) {
-
-            reset_stats();
-        }
-
-        void reset_stats() {
-            for(auto& state : thread_states_)
-                state.reset();
-        }
-
-        // Check if there are any problems with learning
-        void check_health() {
-
-            reduce_thread_state();
-
-            auto& main_thread_state = thread_states_[0];
-
-            const auto largest_min_activation = *std::max_element(
-                std::begin(main_thread_state.min_activations_), std::end(main_thread_state.min_activations_));
-            const auto smallest_max_activation = *std::min_element(
-                std::begin(main_thread_state.max_activations_), std::end(main_thread_state.max_activations_));
-
-            auto out = sync_region_cout.new_region();
-
-            out << "INFO (check_health):"
-                << " layer " << LayerType::kLayerIndex
-                << " - " << LayerType::get_name()
-                << std::endl;
-
-            out << "  - largest min activation = " << largest_min_activation
-                << " , smallest max activation = " << smallest_max_activation
-                << std::endl;
-
-            out << "  - clipped " << static_cast<double>(main_thread_state.num_clipped_) / main_thread_state.num_total_ * 100.0 << "% of outputs"
-                << std::endl;
-
-            out.unlock();
-
-            reset_stats();
-        }
-
-        // number of input/output dimensions
-        static constexpr IndexType kInputDimensions = LayerType::kOutputDimensions;
-        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
-
-        // LearnFloatType constant
-        static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
-        static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
-
-        // number of samples in mini-batch
-        IndexType batch_size_;
-
-        const LearnFloatType* input_;
-
-        // Trainer of the previous layer
-        const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
-
-        // layer to learn
-        LayerType* const target_layer_;
-
-        // Forward propagation buffer
-        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
-
-        // buffer for back propagation
-        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
-
-        struct alignas(kCacheLineSize) ThreadState
-        {
-            // Health check statistics
-            LearnFloatType min_activations_[kOutputDimensions];
-            LearnFloatType max_activations_[kOutputDimensions];
-            uint64_t num_clipped_;
-            uint64_t num_total_;
-
-            ThreadState() { reset(); }
-
-            ThreadState& operator+=(const ThreadState& other)
-            {
-                for (IndexType i = 0; i < kOutputDimensions; ++i)
-                {
-                    min_activations_[i] = std::min(min_activations_[i], other.min_activations_[i]);
-                }
-
-                for (IndexType i = 0; i < kOutputDimensions; ++i)
-                {
-                    max_activations_[i] = std::max(max_activations_[i], other.max_activations_[i]);
-                }
-
-                num_clipped_ += other.num_clipped_;
-                num_total_ += other.num_total_;
-
-                return *this;
-            }
-
-            void reset()
-            {
-                std::fill(std::begin(min_activations_), std::end(min_activations_), std::numeric_limits<float>::max());
-                std::fill(std::begin(max_activations_), std::end(max_activations_), std::numeric_limits<float>::lowest());
-                num_clipped_ = 0;
-                num_total_ = 0;
-            }
-        };
-
-        std::vector<ThreadState, CacheLineAlignedAllocator<ThreadState>> thread_states_;
-    };
-
-}  // namespace Eval::NNUE
-
-#endif
diff --git a/src/nnue/trainer/trainer_feature_transformer.h b/src/nnue/trainer/trainer_feature_transformer.h
deleted file mode 100644
index b0e0ebba..00000000
--- a/src/nnue/trainer/trainer_feature_transformer.h
+++ /dev/null
@@ -1,783 +0,0 @@
-﻿#ifndef _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
-#define _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
-
-#include "trainer.h"
-
-#include "extra/stockfish_blas.h"
-
-#include "features/all_factorizers.h"
-
-#include "learn/learn.h"
-
-#include "nnue/nnue_feature_transformer.h"
-
-#include "thread.h"
-
-#include <array>
-#include <bitset>
-#include <numeric>
-#include <random>
-#include <set>
-
-// Specialization for feature transformer of learning class template of NNUE evaluation function
-namespace Eval::NNUE {
-
-    // Learning: Input feature converter
-    template <>
-    class Trainer<FeatureTransformer> {
-    private:
-        // Type of layer to learn
-        using LayerType = FeatureTransformer;
-
-    public:
-        template <typename T>
-        friend struct AlignedDeleter;
-
-        template <typename T, typename... ArgumentTypes>
-        friend std::shared_ptr<T> make_aligned_shared_ptr(ArgumentTypes&&... arguments);
-
-        // factory function
-        static std::shared_ptr<Trainer> create(LayerType* target_layer) {
-            return make_aligned_shared_ptr<Trainer>(target_layer);
-        }
-
-        // Set options such as hyperparameters
-        void send_message(Message* message) {
-            if (receive_message("momentum", message)) {
-                momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
-            }
-
-            if (receive_message("learning_rate_scale", message)) {
-                learning_rate_scale_ =
-                    static_cast<LearnFloatType>(std::stod(message->value));
-            }
-
-            if (receive_message("reset", message)) {
-                dequantize_parameters();
-            }
-
-            if (receive_message("quantize_parameters", message)) {
-                quantize_parameters();
-            }
-
-            if (receive_message("clear_unobserved_feature_weights", message)) {
-                clear_unobserved_feature_weights();
-            }
-
-            if (receive_message("check_health", message)) {
-                check_health();
-            }
-        }
-
-        // Initialize the parameters with random numbers
-        template <typename RNG>
-        void initialize(RNG& rng) {
-            std::fill(std::begin(weights_), std::end(weights_), +kZero);
-
-            const double kSigma = 0.1 / std::sqrt(RawFeatures::kMaxActiveDimensions);
-            auto distribution = std::normal_distribution<double>(0.0, kSigma);
-
-            for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) {
-                const auto weight = static_cast<LearnFloatType>(distribution(rng));
-                weights_[i] = weight;
-            }
-
-            for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                biases_[i] = static_cast<LearnFloatType>(0.5);
-            }
-
-            quantize_parameters();
-        }
-
-        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
-        {
-            const auto size = batch_end - batch_begin;
-
-            if ((long)output_.size() < (long)kOutputDimensions * size) {
-                output_.resize(kOutputDimensions * size);
-                gradients_.resize(kOutputDimensions * size);
-            }
-
-            if (thread_stat_states_.size() < thread_pool.size())
-            {
-                thread_stat_states_.resize(thread_pool.size());
-            }
-
-            if (thread_bias_states_.size() < thread_pool.size())
-            {
-                thread_bias_states_.resize(thread_pool.size());
-            }
-
-            batch_ = &*batch_begin;
-            batch_size_ = size;
-
-            auto& main_thread_bias_state = thread_bias_states_[0];
-
-#if defined(USE_BLAS)
-
-            cblas_sscal(
-                kHalfDimensions, momentum_, main_thread_bias_state.biases_diff_, 1
-            );
-
-#else
-
-            Blas::sscal(
-                kHalfDimensions, momentum_, main_thread_bias_state.biases_diff_, 1
-            );
-
-#endif
-
-            for (IndexType i = 1; i < thread_bias_states_.size(); ++i)
-                thread_bias_states_[i].reset();
-
-            return output_.data();
-        }
-
-        // forward propagation
-        void propagate(Thread& th, uint64_t offset, uint64_t count) {
-
-            auto& thread_stat_state = thread_stat_states_[th.thread_idx()];
-
-            for (IndexType b = offset; b < offset + count; ++b)
-            {
-                const IndexType batch_offset = kOutputDimensions * b;
-
-                for (IndexType c = 0; c < 2; ++c) {
-                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
-
-#if defined(USE_BLAS)
-
-                    cblas_scopy(
-                        kHalfDimensions, biases_, 1, &output_[output_offset], 1
-                    );
-
-                    for (const auto& feature : batch_[b].training_features[c]) {
-                        const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                        cblas_saxpy(
-                            kHalfDimensions, (float)feature.get_count(),
-                            &weights_[weights_offset], 1, &output_[output_offset], 1
-                        );
-                    }
-
-#else
-
-                    Blas::scopy(
-                        kHalfDimensions, biases_, 1, &output_[output_offset], 1
-                    );
-                    for (const auto& feature : batch_[b].training_features[c]) {
-                        const IndexType weights_offset = kHalfDimensions * feature.get_index();
-                        Blas::saxpy(
-                            kHalfDimensions, (float)feature.get_count(),
-                            &weights_[weights_offset], &output_[output_offset]
-                        );
-                    }
-
-#endif
-                }
-            }
-
-#if defined (USE_SSE2)
-
-            {
-                static_assert(kHalfDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");
-
-                auto m128_hmin_ps = [](__m128 x3210) {
-                    __m128 x0032 = _mm_shuffle_ps(x3210, x3210, _MM_SHUFFLE(0, 0, 3, 2));
-                    __m128 min_x_x_13_20 = _mm_min_ps(x3210, x0032);
-                    // a = [ # , # , min(x[1], x[3]) , min(x[2], x[0]) ]
-                    __m128 min_x_x_20_13 = _mm_shuffle_ps(min_x_x_13_20, min_x_x_13_20, _MM_SHUFFLE(0, 0, 0, 1));
-                    return _mm_cvtss_f32(_mm_min_ps(min_x_x_13_20, min_x_x_20_13));
-                };
-
-                auto m128_hmax_ps = [](__m128 x3210) {
-                    __m128 x0032 = _mm_shuffle_ps(x3210, x3210, _MM_SHUFFLE(0, 0, 3, 2));
-                    __m128 max_x_x_13_20 = _mm_max_ps(x3210, x0032);
-                    // a = [ # , # , max(x[1], x[3]) , max(x[2], x[0]) ]
-                    __m128 max_x_x_20_13 = _mm_shuffle_ps(max_x_x_13_20, max_x_x_13_20, _MM_SHUFFLE(0, 0, 0, 1));
-                    return _mm_cvtss_f32(_mm_max_ps(max_x_x_13_20, max_x_x_20_13));
-                };
-
-                const __m128 kZero4 = _mm_set1_ps(+kZero);
-                const __m128 kOne4 = _mm_set1_ps(+kOne);
-
-                __m128 min_pre_activation0 = _mm_set1_ps(thread_stat_state.min_pre_activation_);
-                __m128 min_pre_activation1 = _mm_set1_ps(thread_stat_state.min_pre_activation_);
-                __m128 max_pre_activation0 = _mm_set1_ps(thread_stat_state.max_pre_activation_);
-                __m128 max_pre_activation1 = _mm_set1_ps(thread_stat_state.max_pre_activation_);
-
-                for (IndexType b = offset; b < offset + count; ++b)
-                {
-                    const IndexType batch_offset = kOutputDimensions * b;
-                    for (IndexType i = 0; i < kOutputDimensions; i += 16)
-                    {
-                        __m128 out0 = _mm_loadu_ps(&output_[batch_offset + i +  0]);
-                        __m128 out1 = _mm_loadu_ps(&output_[batch_offset + i +  4]);
-                        __m128 out2 = _mm_loadu_ps(&output_[batch_offset + i +  8]);
-                        __m128 out3 = _mm_loadu_ps(&output_[batch_offset + i + 12]);
-
-                        __m128 min01 = _mm_min_ps(out0, out1);
-                        __m128 min23 = _mm_min_ps(out2, out3);
-
-                        __m128 max01 = _mm_max_ps(out0, out1);
-                        __m128 max23 = _mm_max_ps(out2, out3);
-
-                        min_pre_activation0 = _mm_min_ps(min_pre_activation0, min01);
-                        min_pre_activation1 = _mm_min_ps(min_pre_activation1, min23);
-                        max_pre_activation0 = _mm_max_ps(max_pre_activation0, max01);
-                        max_pre_activation1 = _mm_max_ps(max_pre_activation1, max23);
-
-                        out0 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out0));
-                        out1 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out1));
-                        out2 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out2));
-                        out3 = _mm_max_ps(kZero4, _mm_min_ps(kOne4, out3));
-
-                        _mm_storeu_ps(&output_[batch_offset + i +  0], out0);
-                        _mm_storeu_ps(&output_[batch_offset + i +  4], out1);
-                        _mm_storeu_ps(&output_[batch_offset + i +  8], out2);
-                        _mm_storeu_ps(&output_[batch_offset + i + 12], out3);
-                    }
-                }
-
-                thread_stat_state.min_pre_activation_ = m128_hmin_ps(_mm_min_ps(min_pre_activation0, min_pre_activation1));
-                thread_stat_state.max_pre_activation_ = m128_hmax_ps(_mm_max_ps(max_pre_activation0, max_pre_activation1));
-
-                for (IndexType b = offset; b < offset + count; ++b)
-                {
-                    const IndexType batch_offset = kOutputDimensions * b;
-
-                    for (IndexType half = 0; half < 2; ++half)
-                    {
-                        const IndexType half_offset = batch_offset + half * kHalfDimensions;
-                        for (IndexType i = 0; i < kHalfDimensions; i += 16)
-                        {
-                            const __m128 out0 = _mm_loadu_ps(&output_[i +  0 + half_offset]);
-                            const __m128 out1 = _mm_loadu_ps(&output_[i +  4 + half_offset]);
-                            const __m128 out2 = _mm_loadu_ps(&output_[i +  8 + half_offset]);
-                            const __m128 out3 = _mm_loadu_ps(&output_[i + 12 + half_offset]);
-
-                            __m128 minact0 = _mm_loadu_ps(&thread_stat_state.min_activations_[i +  0]);
-                            __m128 minact1 = _mm_loadu_ps(&thread_stat_state.min_activations_[i +  4]);
-                            __m128 minact2 = _mm_loadu_ps(&thread_stat_state.min_activations_[i +  8]);
-                            __m128 minact3 = _mm_loadu_ps(&thread_stat_state.min_activations_[i + 12]);
-
-                            __m128 maxact0 = _mm_loadu_ps(&thread_stat_state.max_activations_[i +  0]);
-                            __m128 maxact1 = _mm_loadu_ps(&thread_stat_state.max_activations_[i +  4]);
-                            __m128 maxact2 = _mm_loadu_ps(&thread_stat_state.max_activations_[i +  8]);
-                            __m128 maxact3 = _mm_loadu_ps(&thread_stat_state.max_activations_[i + 12]);
-
-                            minact0 = _mm_min_ps(out0, minact0);
-                            minact1 = _mm_min_ps(out1, minact1);
-                            minact2 = _mm_min_ps(out2, minact2);
-                            minact3 = _mm_min_ps(out3, minact3);
-
-                            maxact0 = _mm_max_ps(out0, maxact0);
-                            maxact1 = _mm_max_ps(out1, maxact1);
-                            maxact2 = _mm_max_ps(out2, maxact2);
-                            maxact3 = _mm_max_ps(out3, maxact3);
-
-                            _mm_storeu_ps(&thread_stat_state.min_activations_[i +  0], minact0);
-                            _mm_storeu_ps(&thread_stat_state.min_activations_[i +  4], minact1);
-                            _mm_storeu_ps(&thread_stat_state.min_activations_[i +  8], minact2);
-                            _mm_storeu_ps(&thread_stat_state.min_activations_[i + 12], minact3);
-
-                            _mm_storeu_ps(&thread_stat_state.max_activations_[i +  0], maxact0);
-                            _mm_storeu_ps(&thread_stat_state.max_activations_[i +  4], maxact1);
-                            _mm_storeu_ps(&thread_stat_state.max_activations_[i +  8], maxact2);
-                            _mm_storeu_ps(&thread_stat_state.max_activations_[i + 12], maxact3);
-                        }
-                    }
-                }
-            }
-
-#else
-
-            // clipped ReLU
-            for (IndexType b = offset; b < offset + count; ++b) {
-                const IndexType batch_offset = kOutputDimensions * b;
-                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                    const IndexType index = batch_offset + i;
-                    thread_stat_state.min_pre_activation_ = std::min(thread_stat_state.min_pre_activation_, output_[index]);
-                    thread_stat_state.max_pre_activation_ = std::max(thread_stat_state.max_pre_activation_, output_[index]);
-                    output_[index] = std::max(+kZero, std::min(+kOne, output_[index]));
-                    const IndexType t = i % kHalfDimensions;
-                    thread_stat_state.min_activations_[t] = std::min(thread_stat_state.min_activations_[t], output_[index]);
-                    thread_stat_state.max_activations_[t] = std::max(thread_stat_state.max_activations_[t], output_[index]);
-                }
-            }
-
-#endif
-        }
-
-        // backpropagation
-        void backpropagate(Thread& th,
-                           const LearnFloatType* gradients,
-                           uint64_t offset,
-                           uint64_t count) {
-
-            auto& thread_stat_state = thread_stat_states_[th.thread_idx()];
-            auto& thread_bias_state = thread_bias_states_[th.thread_idx()];
-
-#if defined (USE_SSE2)
-
-            {
-                static_assert(kHalfDimensions % 16 == 0, "This implementation assumes that it can process 16 floats at a time");
-
-                const __m128 kZero4 = _mm_set1_ps(+kZero);
-                const __m128 kOne4 = _mm_set1_ps(+kOne);
-
-                for (IndexType b = offset; b < offset + count; ++b)
-                {
-                    const IndexType batch_offset = kOutputDimensions * b;
-                    for (IndexType i = 0; i < kOutputDimensions; i += 16)
-                    {
-                        __m128 out0 = _mm_loadu_ps(&output_[batch_offset + i + 0]);
-                        __m128 out1 = _mm_loadu_ps(&output_[batch_offset + i + 4]);
-                        __m128 out2 = _mm_loadu_ps(&output_[batch_offset + i + 8]);
-                        __m128 out3 = _mm_loadu_ps(&output_[batch_offset + i + 12]);
-
-                        __m128 clipped0 = _mm_or_ps(_mm_cmple_ps(out0, kZero4), _mm_cmpge_ps(out0, kOne4));
-                        __m128 clipped1 = _mm_or_ps(_mm_cmple_ps(out1, kZero4), _mm_cmpge_ps(out1, kOne4));
-                        __m128 clipped2 = _mm_or_ps(_mm_cmple_ps(out2, kZero4), _mm_cmpge_ps(out2, kOne4));
-                        __m128 clipped3 = _mm_or_ps(_mm_cmple_ps(out3, kZero4), _mm_cmpge_ps(out3, kOne4));
-
-                        __m128 grad0 = _mm_loadu_ps(&gradients[batch_offset + i + 0]);
-                        __m128 grad1 = _mm_loadu_ps(&gradients[batch_offset + i + 4]);
-                        __m128 grad2 = _mm_loadu_ps(&gradients[batch_offset + i + 8]);
-                        __m128 grad3 = _mm_loadu_ps(&gradients[batch_offset + i + 12]);
-
-                        grad0 = _mm_andnot_ps(clipped0, grad0);
-                        grad1 = _mm_andnot_ps(clipped1, grad1);
-                        grad2 = _mm_andnot_ps(clipped2, grad2);
-                        grad3 = _mm_andnot_ps(clipped3, grad3);
-
-                        _mm_storeu_ps(&gradients_[batch_offset + i + 0], grad0);
-                        _mm_storeu_ps(&gradients_[batch_offset + i + 4], grad1);
-                        _mm_storeu_ps(&gradients_[batch_offset + i + 8], grad2);
-                        _mm_storeu_ps(&gradients_[batch_offset + i + 12], grad3);
-
-                        const int clipped_mask =
-                            (_mm_movemask_ps(clipped0) << 0)
-                            | (_mm_movemask_ps(clipped1) << 4)
-                            | (_mm_movemask_ps(clipped2) << 8)
-                            | (_mm_movemask_ps(clipped3) << 12);
-
-                        thread_stat_state.num_clipped_ += popcount(clipped_mask);
-                    }
-                }
-            }
-
-#else
-
-            for (IndexType b = offset; b < offset + count; ++b) {
-                const IndexType batch_offset = kOutputDimensions * b;
-                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                    const IndexType index = batch_offset + i;
-                    const bool clipped = (output_[index] <= kZero) | (output_[index] >= kOne);
-                    gradients_[index] = gradients[index] * !clipped;
-                    thread_stat_state.num_clipped_ += clipped;
-                }
-            }
-
-#endif
-
-            thread_stat_state.num_total_ += count * kOutputDimensions;
-
-#if defined(USE_BLAS)
-
-            for (IndexType b = offset; b < offset + count; ++b) {
-                const IndexType batch_offset = kOutputDimensions * b;
-                for (IndexType c = 0; c < 2; ++c) {
-                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                    cblas_saxpy(
-                        kHalfDimensions, 1.0,
-                        &gradients_[output_offset], 1, thread_bias_state.biases_diff_, 1
-                    );
-                }
-            }
-
-#else
-
-            for (IndexType b = offset; b < offset + count; ++b) {
-                const IndexType batch_offset = kOutputDimensions * b;
-                for (IndexType c = 0; c < 2; ++c) {
-                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                    Blas::saxpy(
-                        kHalfDimensions, 1.0,
-                        &gradients_[output_offset], 1, thread_bias_state.biases_diff_, 1
-                    );
-                }
-            }
-
-#endif
-        }
-
-        void reduce_thread_stat_state()
-        {
-            for (IndexType i = 1; i < thread_stat_states_.size(); ++i)
-            {
-                thread_stat_states_[0] += thread_stat_states_[i];
-            }
-        }
-
-        void reduce_thread_bias_state()
-        {
-            for (IndexType i = 1; i < thread_bias_states_.size(); ++i)
-            {
-                thread_bias_states_[0] += thread_bias_states_[i];
-            }
-        }
-
-        void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) {
-
-            const LearnFloatType local_learning_rate =
-                learning_rate * learning_rate_scale_;
-
-            // Since the weight matrix updates only the columns corresponding to the features that appeared in the input,
-            // Correct the learning rate and adjust the scale without using momentum
-            const LearnFloatType effective_learning_rate =
-                static_cast<LearnFloatType>(local_learning_rate / (1.0 - momentum_));
-
-            reduce_thread_bias_state();
-
-            auto& main_thread_state = thread_bias_states_[0];
-
-#if defined(USE_BLAS)
-
-            cblas_saxpy(
-                kHalfDimensions, -local_learning_rate,
-                main_thread_state.biases_diff_, 1, biases_, 1
-            );
-
-#else
-
-            Blas::saxpy(
-                kHalfDimensions, -local_learning_rate,
-                main_thread_state.biases_diff_, 1, biases_, 1
-            );
-
-#endif
-
-            thread_pool.execute_with_workers(
-                [&, num_threads = thread_pool.size()](Thread& th) {
-                    const auto thread_index = th.thread_idx();
-
-                    for (IndexType b = 0; b < batch_size_; ++b) {
-                        const IndexType batch_offset = kOutputDimensions * b;
-
-                        for (IndexType c = 0; c < 2; ++c) {
-                            const IndexType output_offset = batch_offset + kHalfDimensions * c;
-                            for (const auto& feature : batch_[b].training_features[c]) {
-                                const IndexType feature_index = feature.get_index();
-                                const IndexType weights_offset =
-                                    kHalfDimensions * feature_index;
-#if defined (USE_SSE2)
-                                _mm_prefetch(reinterpret_cast<const char*>(&weights_[weights_offset]), _MM_HINT_T2);
-#endif
-
-                                // We assign each bucket a continuous range of bits at least
-                                // of cache line size to prevent false sharing.
-                                // For HalfKP this is enough to saturate about 80 threads.
-                                const IndexType thread_bucket =
-                                    (feature_index / BitsetType::best_concurrent_access_stride)
-                                    % num_threads;
-
-                                if (thread_bucket != thread_index)
-                                    continue;
-
-                                // This operation can be performed safely because
-                                // each thread accesses a different memory location
-                                // (even a different cache line)
-                                observed_features.set(feature_index);
-
-                                const auto scale = static_cast<LearnFloatType>(
-                                    effective_learning_rate / feature.get_count());
-
-#if defined (USE_BLAS)
-
-                                cblas_saxpy(
-                                    kHalfDimensions, -scale,
-                                    &gradients_[output_offset], 1,
-                                    &weights_[weights_offset], 1
-                                );
-
-#else
-
-                                Blas::saxpy(
-                                    kHalfDimensions, -scale,
-                                    &gradients_[output_offset],
-                                    &weights_[weights_offset]
-                                );
-
-#endif
-                            }
-                        }
-                    }
-                }
-            );
-
-            thread_pool.wait_for_workers_finished();
-        }
-
-    private:
-        // constructor
-        Trainer(LayerType* target_layer) :
-            batch_(nullptr),
-            batch_size_(0),
-            target_layer_(target_layer),
-            biases_(),
-            weights_(),
-            momentum_(0.2),
-            learning_rate_scale_(1.0) {
-
-            dequantize_parameters();
-        }
-
-        // Weight saturation and parameterization
-        void quantize_parameters() {
-            for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                target_layer_->biases_[i] =
-                    round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
-            }
-
-            std::vector<TrainingFeature> training_features;
-
-            Threads.for_each_index_with_workers(
-                0, RawFeatures::kDimensions,
-                [this, training_features](Thread&, int j) mutable {
-                    training_features.clear();
-                    Features::Factorizer<RawFeatures>::append_training_features(
-                        j, &training_features);
-
-                    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                        double sum = 0.0;
-                        for (const auto& feature : training_features) {
-                            sum += weights_[kHalfDimensions * feature.get_index() + i];
-                        }
-
-                        target_layer_->weights_[kHalfDimensions * j + i] =
-                            round<typename LayerType::WeightType>(sum * kWeightScale);
-                    }
-                }
-            );
-            Threads.wait_for_workers_finished();
-        }
-
-        void reset_stats() {
-            for (auto& state : thread_stat_states_)
-                state.reset();
-        }
-
-        // read parameterized integer
-        void dequantize_parameters() {
-            for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                biases_[i] = static_cast<LearnFloatType>(
-                    target_layer_->biases_[i] / kBiasScale);
-            }
-
-            std::fill(std::begin(weights_), std::end(weights_), +kZero);
-
-            for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) {
-                weights_[i] = static_cast<LearnFloatType>(
-                    target_layer_->weights_[i] / kWeightScale);
-            }
-
-            reset_stats();
-
-            for (auto& state : thread_bias_states_)
-                state.reset();
-        }
-
-        // Set the weight corresponding to the feature that does not appear in the learning data to 0
-        void clear_unobserved_feature_weights() {
-            for (IndexType i = 0; i < kInputDimensions; ++i) {
-                if (!observed_features.test(i)) {
-                    std::fill(std::begin(weights_) + kHalfDimensions * i,
-                              std::begin(weights_) + kHalfDimensions * (i + 1), +kZero);
-                }
-            }
-
-            quantize_parameters();
-        }
-
-        // Check if there are any problems with learning
-        void check_health() {
-
-            constexpr LearnFloatType kPreActivationLimit =
-                std::numeric_limits<typename LayerType::WeightType>::max() /
-                kWeightScale;
-
-            reduce_thread_stat_state();
-
-            auto& main_thread_state = thread_stat_states_[0];
-
-            const auto largest_min_activation = *std::max_element(
-                std::begin(main_thread_state.min_activations_), std::end(main_thread_state.min_activations_));
-            const auto smallest_max_activation = *std::min_element(
-                std::begin(main_thread_state.max_activations_), std::end(main_thread_state.max_activations_));
-
-            double abs_bias_sum = 0.0;
-            double abs_weight_sum = 0.0;
-
-            for(auto b : biases_)
-                abs_bias_sum += std::abs(b);
-
-            std::vector<TrainingFeature> training_features;
-            for (IndexType j = 0; j < RawFeatures::kDimensions; ++j)
-            {
-                training_features.clear();
-                Features::Factorizer<RawFeatures>::append_training_features(
-                    j, &training_features);
-
-                for (const auto& feature : training_features) {
-                    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-                        abs_weight_sum += std::abs(weights_[kHalfDimensions * feature.get_index() + i]);
-                    }
-                }
-            }
-
-            auto out = sync_region_cout.new_region();
-
-            out << "INFO (check_health):"
-                << " layer " << LayerType::kLayerIndex
-                << " - " << LayerType::get_name()
-                << std::endl;
-
-            out << "  - observed " << observed_features.count()
-                << " (out of " << kInputDimensions << ") features"
-                << std::endl;
-
-            out << "  - (min, max) of pre-activations = "
-                << main_thread_state.min_pre_activation_ << ", "
-                << main_thread_state.max_pre_activation_ << " (limit = "
-                << kPreActivationLimit << ")"
-                << std::endl;
-
-            out << "  - largest min activation = " << largest_min_activation
-                << " , smallest max activation = " << smallest_max_activation
-                << std::endl;
-
-            out << "  - avg_abs_bias   = " << abs_bias_sum / std::size(biases_) << std::endl;
-            out << "  - avg_abs_weight = " << abs_weight_sum / std::size(weights_) << std::endl;
-
-            out << "  - clipped " << static_cast<double>(main_thread_state.num_clipped_) / main_thread_state.num_total_ * 100.0 << "% of outputs"
-                << std::endl;
-
-            out.unlock();
-
-            reset_stats();
-        }
-
-        // number of input/output dimensions
-        static constexpr IndexType kInputDimensions =
-            Features::Factorizer<RawFeatures>::get_dimensions();
-        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
-        static constexpr IndexType kHalfDimensions = LayerType::kHalfDimensions;
-
-        // Coefficient used for parameterization
-        static constexpr LearnFloatType kActivationScale =
-            std::numeric_limits<std::int8_t>::max();
-        static constexpr LearnFloatType kBiasScale = kActivationScale;
-        static constexpr LearnFloatType kWeightScale = kActivationScale;
-
-        // LearnFloatType constant
-        static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
-        static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
-
-        // mini batch
-        const Example* batch_;
-        IndexType batch_size_;
-
-        // layer to learn
-        LayerType* const target_layer_;
-
-        // parameter
-        alignas(kCacheLineSize) LearnFloatType biases_[kHalfDimensions];
-        alignas(kCacheLineSize)
-            LearnFloatType weights_[kHalfDimensions * kInputDimensions];
-
-        // Buffer used for updating parameters
-        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
-
-        // Forward propagation buffer
-        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
-
-        // Features that appeared in the training data
-        using BitsetType = LargeBitset<kInputDimensions>;
-        BitsetType observed_features;
-
-        // hyper parameter
-        LearnFloatType momentum_;
-        LearnFloatType learning_rate_scale_;
-
-        struct alignas(kCacheLineSize) ThreadStatState
-        {
-            alignas(kCacheLineSize) LearnFloatType min_activations_[kHalfDimensions];
-            alignas(kCacheLineSize) LearnFloatType max_activations_[kHalfDimensions];
-            LearnFloatType min_pre_activation_;
-            LearnFloatType max_pre_activation_;
-            uint64_t num_clipped_;
-            uint64_t num_total_;
-
-            ThreadStatState() { reset(); }
-
-            ThreadStatState& operator+=(const ThreadStatState& other)
-            {
-                for (IndexType i = 0; i < kHalfDimensions; ++i)
-                {
-                    min_activations_[i] = std::min(min_activations_[i], other.min_activations_[i]);
-                }
-
-                for (IndexType i = 0; i < kHalfDimensions; ++i)
-                {
-                    max_activations_[i] = std::max(max_activations_[i], other.max_activations_[i]);
-                }
-
-                min_pre_activation_ = std::min(min_pre_activation_, other.min_pre_activation_);
-                max_pre_activation_ = std::max(max_pre_activation_, other.max_pre_activation_);
-
-                num_clipped_ += other.num_clipped_;
-                num_total_ += other.num_total_;
-
-                return *this;
-            }
-
-            void reset()
-            {
-                std::fill(std::begin(min_activations_), std::end(min_activations_), std::numeric_limits<float>::max());
-                std::fill(std::begin(max_activations_), std::end(max_activations_), std::numeric_limits<float>::lowest());
-                min_pre_activation_ = std::numeric_limits<float>::max();
-                max_pre_activation_ = std::numeric_limits<float>::lowest();
-                num_clipped_ = 0;
-                num_total_ = 0;
-            }
-        };
-
-        struct alignas(kCacheLineSize) ThreadBiasState
-        {
-            alignas(kCacheLineSize) LearnFloatType biases_diff_[kHalfDimensions];
-
-            ThreadBiasState() { reset(); }
-
-            ThreadBiasState& operator+=(const ThreadBiasState& other)
-            {
-                for (IndexType i = 0; i < kHalfDimensions; ++i)
-                {
-                    biases_diff_[i] += other.biases_diff_[i];
-                }
-
-                return *this;
-            }
-
-            void reset()
-            {
-                std::fill(std::begin(biases_diff_), std::end(biases_diff_), 0.0f);
-            }
-        };
-
-        std::vector<ThreadStatState, CacheLineAlignedAllocator<ThreadStatState>> thread_stat_states_;
-        std::vector<ThreadBiasState, CacheLineAlignedAllocator<ThreadBiasState>> thread_bias_states_;
-    };
-
-}  // namespace Eval::NNUE
-
-#endif
diff --git a/src/nnue/trainer/trainer_input_slice.h b/src/nnue/trainer/trainer_input_slice.h
deleted file mode 100644
index ff1265dc..00000000
--- a/src/nnue/trainer/trainer_input_slice.h
+++ /dev/null
@@ -1,383 +0,0 @@
-﻿#ifndef _NNUE_TRAINER_INPUT_SLICE_H_
-#define _NNUE_TRAINER_INPUT_SLICE_H_
-
-#include "trainer.h"
-
-#include "extra/stockfish_blas.h"
-
-#include "learn/learn.h"
-
-#include "nnue/layers/input_slice.h"
-
-#include "thread.h"
-
-// Specialization of NNUE evaluation function learning class template for InputSlice
-namespace Eval::NNUE {
-
-    // Learning: Input layer
-    // This is tricky. It exists because when there's more than one trainer
-    // on top of a single feature transformer we want to only call propagate/backpropagate
-    // on the feature transformer once. This is straightforward in the old
-    // multithreading case, because propagate/backpropagate is called just once from the
-    // main thread. But with the current implementation of coarser multithreading
-    // we end up calling each method from each thread. Therefore we have to keep
-    // the num_calls and current_operation per thread basis, each thread must work
-    // on its designated batch slice, and the only synchronization points are
-    // step_start and step_end - for which we use state of the first thread.
-    // Each thread requires their own bookkeeping because it's possible that
-    // one thread is still in propagate of some batch slice while the other thread
-    // is doing backpropagate of some other slice. We also ensure the thread state
-    // isn't suspectible to false sharing by using a full cache line for the state.
-    class SharedInputTrainer {
-    public:
-        // factory function
-        static std::shared_ptr<SharedInputTrainer> create(
-            FeatureTransformer* ft) {
-
-            static std::shared_ptr<SharedInputTrainer> instance;
-
-            if (!instance) {
-                instance.reset(new SharedInputTrainer(ft));
-            }
-
-            ++instance->num_referrers_;
-
-            return instance;
-        }
-
-        // Set options such as hyperparameters
-        void send_message(Message* message) {
-            auto& thread_state = thread_states_[0];
-
-            if (thread_state.num_calls == 0) {
-                thread_state.current_operation = Operation::kSendMessage;
-                feature_transformer_trainer_->send_message(message);
-            }
-
-            assert(thread_state.current_operation == Operation::kSendMessage);
-
-            if (++thread_state.num_calls == num_referrers_) {
-                thread_state.num_calls = 0;
-                thread_state.current_operation = Operation::kNone;
-            }
-        }
-
-        // Initialize the parameters with random numbers
-        template <typename RNG>
-        void initialize(RNG& rng) {
-            auto& thread_state = thread_states_[0];
-
-            if (thread_state.num_calls == 0) {
-                thread_state.current_operation = Operation::kInitialize;
-                feature_transformer_trainer_->initialize(rng);
-            }
-
-            assert(thread_state.current_operation == Operation::kInitialize);
-
-            if (++thread_state.num_calls == num_referrers_) {
-                thread_state.num_calls = 0;
-                thread_state.current_operation = Operation::kNone;
-            }
-        }
-
-        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
-        {
-            const auto size = batch_end - batch_begin;
-
-            if ((long)gradients_.size() < (long)kInputDimensions * size) {
-                gradients_.resize(kInputDimensions * size);
-            }
-
-            if (thread_states_.size() < thread_pool.size())
-            {
-                thread_states_.resize(thread_pool.size());
-            }
-
-            batch_size_ = size;
-
-            auto& thread_state = thread_states_[0];
-
-            if (thread_state.num_calls == 0) {
-                thread_state.current_operation = Operation::kStepStart;
-                output_ = feature_transformer_trainer_->step_start(thread_pool, batch_begin, batch_end);
-            }
-
-            assert(thread_state.current_operation == Operation::kStepStart);
-
-            if (++thread_state.num_calls == num_referrers_) {
-                thread_state.num_calls = 0;
-                thread_state.current_operation = Operation::kNone;
-            }
-
-            return output_;
-        }
-
-        // forward propagation
-        void propagate(Thread& th, uint64_t offset, uint64_t count) {
-            const auto thread_id = th.thread_idx();
-
-            auto& thread_state = thread_states_[thread_id];
-
-            if (thread_state.num_calls == 0) {
-                thread_state.current_operation = Operation::kPropagate;
-                feature_transformer_trainer_->propagate(th, offset, count);
-            }
-
-            assert(thread_state.current_operation == Operation::kPropagate);
-
-            if (++thread_state.num_calls == num_referrers_) {
-                thread_state.num_calls = 0;
-                thread_state.current_operation = Operation::kNone;
-            }
-        }
-
-        // backpropagation
-        void backpropagate(Thread& th,
-                           const LearnFloatType* gradients,
-                           uint64_t offset,
-                           uint64_t count) {
-
-            const auto thread_id = th.thread_idx();
-
-            auto& thread_state = thread_states_[thread_id];
-
-            if (num_referrers_ == 1) {
-                feature_transformer_trainer_->backpropagate(th, gradients, offset, count);
-                return;
-            }
-
-            if (thread_state.num_calls == 0) {
-                thread_state.current_operation = Operation::kBackPropagate;
-                for (IndexType b = offset; b < offset + count; ++b) {
-                    const IndexType batch_offset = kInputDimensions * b;
-                    for (IndexType i = 0; i < kInputDimensions; ++i) {
-                        gradients_[batch_offset + i] = static_cast<LearnFloatType>(0.0);
-                    }
-                }
-            }
-
-            assert(thread_state.current_operation == Operation::kBackPropagate);
-
-            for (IndexType b = offset; b < offset + count; ++b) {
-                const IndexType batch_offset = kInputDimensions * b;
-                for (IndexType i = 0; i < kInputDimensions; ++i) {
-                    gradients_[batch_offset + i] += gradients[batch_offset + i];
-                }
-            }
-
-            if (++thread_state.num_calls == num_referrers_) {
-                feature_transformer_trainer_->backpropagate(
-                    th, gradients_.data(), offset, count);
-                thread_state.num_calls = 0;
-                thread_state.current_operation = Operation::kNone;
-            }
-        }
-
-        void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) {
-            auto& thread_state = thread_states_[0];
-
-            if (thread_state.num_calls == 0) {
-                thread_state.current_operation = Operation::kStepEnd;
-                feature_transformer_trainer_->step_end(thread_pool, learning_rate);
-            }
-
-            assert(thread_state.current_operation == Operation::kStepEnd);
-
-            if (++thread_state.num_calls == num_referrers_) {
-                thread_state.num_calls = 0;
-                thread_state.current_operation = Operation::kNone;
-            }
-        }
-
-    private:
-        // constructor
-        SharedInputTrainer(FeatureTransformer* ft) :
-            batch_size_(0),
-            num_referrers_(0),
-            thread_states_(1),
-            feature_transformer_trainer_(Trainer<FeatureTransformer>::create(
-                ft)),
-            output_(nullptr) {
-        }
-
-        // number of input/output dimensions
-        static constexpr IndexType kInputDimensions =
-            FeatureTransformer::kOutputDimensions;
-
-        // type of processing
-        enum class Operation {
-            kNone,
-            kSendMessage,
-            kInitialize,
-            kStepStart,
-            kPropagate,
-            kBackPropagate,
-            kStepEnd,
-        };
-
-        // number of samples in mini-batch
-        IndexType batch_size_;
-
-        // number of layers sharing this layer as input
-        std::uint32_t num_referrers_;
-
-        struct alignas(kCacheLineSize) ThreadState
-        {
-            std::uint32_t num_calls{0};
-
-            // current processing type
-            Operation current_operation = Operation::kNone;
-        };
-
-        // Number of times the current process has been called
-        std::vector<ThreadState, CacheLineAlignedAllocator<ThreadState>> thread_states_;
-
-        // Trainer of input feature converter
-        const std::shared_ptr<Trainer<FeatureTransformer>>
-            feature_transformer_trainer_;
-
-        // pointer to output shared for forward propagation
-        const LearnFloatType* output_;
-
-        // buffer for back propagation
-        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
-    };
-
-    // Learning: Input layer
-    template <IndexType OutputDimensions, IndexType Offset>
-    class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
-    private:
-        // Type of layer to learn
-        using LayerType = Layers::InputSlice<OutputDimensions, Offset>;
-
-    public:
-        // factory function
-        static std::shared_ptr<Trainer> create(
-            LayerType* /*target_layer*/, FeatureTransformer* ft) {
-
-            return std::shared_ptr<Trainer>(new Trainer(ft));
-        }
-
-        // Set options such as hyperparameters
-        void send_message(Message* message) {
-            shared_input_trainer_->send_message(message);
-        }
-
-        // Initialize the parameters with random numbers
-        template <typename RNG>
-        void initialize(RNG& rng) {
-            shared_input_trainer_->initialize(rng);
-        }
-
-        const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
-        {
-            const auto size = batch_end - batch_begin;
-
-            if ((long)output_.size() < (long)kOutputDimensions * size) {
-              output_.resize(kOutputDimensions * size);
-              gradients_.resize(kInputDimensions * size);
-            }
-
-            batch_size_ = size;
-
-            input_ = shared_input_trainer_->step_start(thread_pool, batch_begin, batch_end);
-
-            return output_.data();
-        }
-
-        // forward propagation
-        void propagate(Thread& th, uint64_t offset, uint64_t count) {
-
-            shared_input_trainer_->propagate(th, offset, count);
-
-            for (IndexType b = offset; b < offset + count; ++b) {
-                const IndexType input_offset = kInputDimensions * b;
-                const IndexType output_offset = kOutputDimensions * b;
-
-#if defined(USE_BLAS)
-
-                cblas_scopy(
-                    kOutputDimensions, &input_[input_offset + Offset], 1,
-                    &output_[output_offset], 1
-                );
-#else
-
-                Blas::scopy(
-                    kOutputDimensions, &input_[input_offset + Offset], 1,
-                    &output_[output_offset], 1
-                );
-
-#endif
-            }
-        }
-
-        // backpropagation
-        void backpropagate(Thread& th,
-                           const LearnFloatType* gradients,
-                           uint64_t offset,
-                           uint64_t count) {
-
-            for (IndexType b = offset; b < offset + count; ++b)
-            {
-                const IndexType input_offset = kInputDimensions * b;
-                const IndexType output_offset = kOutputDimensions * b;
-
-                IndexType i = 0;
-                if constexpr (Offset > 0)
-                {
-                    for (; i < Offset; ++i) {
-                        gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
-                    }
-                }
-
-                for (; i < Offset + kOutputDimensions; ++i) {
-                    gradients_[input_offset + i] = gradients[output_offset + i - Offset];
-                }
-
-                if constexpr (Offset + kOutputDimensions < kInputDimensions)
-                {
-                    for (; i < kInputDimensions; ++i)
-                    {
-                        gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
-                    }
-                }
-            }
-
-            shared_input_trainer_->backpropagate(th, gradients_.data(), offset, count);
-        }
-
-        void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) {
-            shared_input_trainer_->step_end(thread_pool, learning_rate);
-        }
-
-    private:
-        // constructor
-        Trainer(FeatureTransformer* ft) :
-            batch_size_(0),
-            shared_input_trainer_(SharedInputTrainer::create(ft)) {
-        }
-
-        // number of input/output dimensions
-        static constexpr IndexType kInputDimensions =
-            FeatureTransformer::kOutputDimensions;
-        static constexpr IndexType kOutputDimensions = OutputDimensions;
-        static_assert(Offset + kOutputDimensions <= kInputDimensions, "");
-
-        // number of samples in mini-batch
-        IndexType batch_size_;
-
-        const LearnFloatType* input_;
-
-        // Trainer of shared input layer
-        const std::shared_ptr<SharedInputTrainer> shared_input_trainer_;
-
-        // Forward propagation buffer
-        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
-
-        // buffer for back propagation
-        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
-    };
-
-}  // namespace Eval::NNUE
-
-#endif
diff --git a/src/nnue/trainer/trainer_sum.h b/src/nnue/trainer/trainer_sum.h
deleted file mode 100644
index 88ff302c..00000000
--- a/src/nnue/trainer/trainer_sum.h
+++ /dev/null
@@ -1,201 +0,0 @@
-﻿#ifndef _NNUE_TRAINER_SUM_H_
-#define _NNUE_TRAINER_SUM_H_
-
-#include "trainer.h"
-
-#include "extra/stockfish_blas.h"
-
-#include "learn/learn.h"
-
-#include "nnue/layers/sum.h"
-
-#include "thread.h"
-
-// Specialization of NNUE evaluation function learning class template for Sum
-namespace Eval::NNUE {
-
-    // Learning: A layer that sums the outputs of multiple layers
-    template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
-    class Trainer<Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>> :
-          Trainer<Layers::Sum<RemainingPreviousLayers...>> {
-    private:
-        // Type of layer to learn
-        using LayerType = Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>;
-        using Tail = Trainer<Layers::Sum<RemainingPreviousLayers...>>;
-
-    public:
-        // factory function
-        static std::shared_ptr<Trainer> create(
-            LayerType* target_layer, FeatureTransformer* ft) {
-
-            return std::shared_ptr<Trainer>(
-                new Trainer(target_layer, ft));
-        }
-
-        // Set options such as hyperparameters
-        void send_message(Message* message) {
-            // The results of other member functions do not depend on the processing order, so
-            // Tail is processed first for the purpose of simplifying the implementation, but
-            // SendMessage processes Head first to make it easier to understand subscript correspondence
-            previous_layer_trainer_->send_message(message);
-            Tail::send_message(message);
-        }
-
-        // Initialize the parameters with random numbers
-        template <typename RNG>
-        void initialize(RNG& rng) {
-            Tail::initialize(rng);
-            previous_layer_trainer_->initialize(rng);
-        }
-
-        // forward propagation
-        /*const*/ LearnFloatType* propagate(ThreadPool& thread_pool, const std::vector<Example>& batch) {
-            batch_size_ = static_cast<IndexType>(batch.size());
-            auto output = Tail::propagate(thread_pool, batch);
-            const auto head_output = previous_layer_trainer_->propagate(thread_pool, batch);
-
-#if defined(USE_BLAS)
-
-            cblas_saxpy(
-                kOutputDimensions * batch_size_, 1.0,
-                head_output, 1, output, 1
-            );
-
-#else
-
-            Blas::saxpy(
-                thread_pool,
-                kOutputDimensions * batch_size_, 1.0,
-                head_output, 1, output, 1
-            );
-
-#endif
-            return output;
-        }
-
-        // backpropagation
-        void backpropagate(ThreadPool& thread_pool,
-                           const LearnFloatType* gradients,
-                           LearnFloatType learning_rate) {
-
-            Tail::backpropagate(thread_pool, gradients, learning_rate);
-            previous_layer_trainer_->backpropagate(thread_pool, gradients, learning_rate);
-        }
-
-    private:
-        // constructor
-        Trainer(LayerType* target_layer, FeatureTransformer* ft):
-            Tail(target_layer, ft),
-            batch_size_(0),
-            previous_layer_trainer_(Trainer<FirstPreviousLayer>::create(
-                &target_layer->previous_layer_, ft)),
-            target_layer_(target_layer) {
-        }
-
-        // number of input/output dimensions
-        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
-
-        // make subclass friend
-        template <typename SumLayer>
-        friend class Trainer;
-
-        // number of samples in mini-batch
-        IndexType batch_size_;
-
-        // Trainer of the previous layer
-        const std::shared_ptr<Trainer<FirstPreviousLayer>> previous_layer_trainer_;
-
-        // layer to learn
-        LayerType* const target_layer_;
-    };
-
-
-    // Learning: Layer that takes the sum of the outputs of multiple layers (when there is one template argument)
-    template <typename PreviousLayer>
-    class Trainer<Layers::Sum<PreviousLayer>> {
-    private:
-        // Type of layer to learn
-        using LayerType = Layers::Sum<PreviousLayer>;
-
-    public:
-        // factory function
-        static std::shared_ptr<Trainer> create(
-            LayerType* target_layer, FeatureTransformer* ft) {
-
-            return std::shared_ptr<Trainer>(
-                new Trainer(target_layer, ft));
-        }
-
-        // Set options such as hyperparameters
-        void send_message(Message* message) {
-            previous_layer_trainer_->send_message(message);
-        }
-
-        // Initialize the parameters with random numbers
-        template <typename RNG>
-        void initialize(RNG& rng) {
-            previous_layer_trainer_->initialize(rng);
-        }
-
-        // forward propagation
-        /*const*/ LearnFloatType* propagate(const std::vector<Example>& batch) {
-            if (output_.size() < kOutputDimensions * batch.size()) {
-                output_.resize(kOutputDimensions * batch.size());
-            }
-
-            batch_size_ = static_cast<IndexType>(batch.size());
-            const auto output = previous_layer_trainer_->propagate(batch);
-
-#if defined(USE_BLAS)
-            cblas_scopy(kOutputDimensions * batch_size_, output, 1, &output_[0], 1);
-#else
-            for (IndexType b = 0; b < batch_size_; ++b) {
-                const IndexType batch_offset = kOutputDimensions * b;
-                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-                    output_[batch_offset + i] = output[batch_offset + i];
-                }
-            }
-
-#endif
-            return output_.data();
-        }
-
-        // backpropagation
-        void backpropagate(const LearnFloatType* gradients,
-                           LearnFloatType learning_rate) {
-
-            previous_layer_trainer_->backpropagate(gradients, learning_rate);
-        }
-
-    private:
-        // constructor
-        Trainer(LayerType* target_layer, FeatureTransformer* ft) :
-            batch_size_(0),
-            previous_layer_trainer_(Trainer<PreviousLayer>::create(
-                &target_layer->previous_layer_, ft)),
-            target_layer_(target_layer) {
-        }
-
-        // number of input/output dimensions
-        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
-
-        // make subclass friend
-        template <typename SumLayer>
-        friend class Trainer;
-
-        // number of samples in mini-batch
-        IndexType batch_size_;
-
-        // Trainer of the previous layer
-        const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
-
-        // layer to learn
-        LayerType* const target_layer_;
-
-        // Forward propagation buffer
-        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
-    };
-
-}  // namespace Eval::NNUE
-
-#endif
diff --git a/src/uci.cpp b/src/uci.cpp
index 7da2881f..9a9a9e3c 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -22,11 +22,9 @@
 #include <sstream>
 #include <string>
 
-#include "extra/stockfish_blas.h"
 #include "nnue/evaluate_nnue.h"
 #include "evaluate.h"
 #include "movegen.h"
-#include "nnue/nnue_test_command.h"
 #include "position.h"
 #include "search.h"
 #include "syzygy/tbprobe.h"
@@ -37,7 +35,6 @@
 
 #include "learn/gensfen.h"
 #include "learn/gensfen_nonpv.h"
-#include "learn/learn.h"
 #include "learn/convert.h"
 #include "learn/transform.h"
 #include "learn/stats.h"
@@ -49,17 +46,6 @@ extern vector<string> setup_bench(const Position&, istream&);
 // FEN string of the initial position, normal chess
 const char* StartFEN = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1";
 
-void test_cmd(Position& pos, istringstream& is)
-{
-    // Initialize as it may be searched.
-    Eval::NNUE::init();
-
-    std::string param;
-    is >> param;
-
-    if (param == "nnue") Eval::NNUE::test_command(pos, is);
-}
-
 namespace {
 
   // position() is called when engine receives the "position" UCI command.
@@ -344,7 +330,6 @@ void UCI::loop(int argc, char* argv[]) {
 
       else if (token == "gensfen") Learner::gensfen(is);
       else if (token == "gensfen_nonpv") Learner::gensfen_nonpv(is);
-      else if (token == "learn") Learner::learn(is);
       else if (token == "convert") Learner::convert(is);
       else if (token == "convert_bin") Learner::convert_bin(is);
       else if (token == "convert_plain") Learner::convert_plain(is);
@@ -361,17 +346,7 @@ void UCI::loop(int argc, char* argv[]) {
           std::cout << th.thread_idx() << '\n';
         });
       }
-      else if (token == "blastest")
-      {
-        Blas::test(Threads);
-      }
-      else if (token == "blasbench")
-      {
-        Blas::bench(Threads);
-      }
 
-      // test command
-      else if (token == "test") test_cmd(pos, is);
       else
           sync_cout << "Unknown command: " << cmd << sync_endl;
 

From 8169de72e2b8d3310c7d96050e378d4278c2a6c8 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 18 Apr 2021 19:04:37 +0200
Subject: [PATCH 560/583] asd

---
 docs/learn.md | 132 --------------------------------------------------
 1 file changed, 132 deletions(-)
 delete mode 100644 docs/learn.md

diff --git a/docs/learn.md b/docs/learn.md
deleted file mode 100644
index ab2b1861..00000000
--- a/docs/learn.md
+++ /dev/null
@@ -1,132 +0,0 @@
-# Learn
-
-`learn` command allows training a network from training data.
-
-As all commands in stockfish `learn` can be invoked either from command line (as `stockfish.exe learn ...`, but this is not recommended because it's not possible to specify UCI options before `learn` executes) or in the interactive prompt.
-
-`learn` takes named parameters in the form of `learn param_1_name param_1_value param_2_name param_2_value ...`. Unrecognized parameters form a list of paths to training data files.
-
-It is recommended to set the `EnableTranspositionTable` UCI option to `false` to reduce the interference between qsearches which are used to provide shallow evaluation. Using TT may cause the shallow evaluation to diverge from the real evaluation of the net, hiding imperfections.
-
-It is recommended to set the `PruneAtShallowDepth` UCI option to `false` as it will provide more accurate shallow evaluation.
-
-It is **required** to set the `Use NNUE` UCI option to `pure` as otherwise the function being optimized will not always match the function being probed, in which case not much can be learned.
-
-Currently the following options are available:
-
-`set_recommended_uci_options` - this is a modifier not a parameter, no value follows it. If specified then some UCI options are set to recommended values.
-
-`targetdir` - path to the direction from which training data will be read. All files in this directory are read sequentially. If not specified then only the list of files from positional arguments will be used. If specified then files from the given directory will be used after the explicitly specified files.
-
-`epochs` - the number of weight update cycles (epochs) to train the network for. One such cycle is `epoch_size` positions. If not specified then the training will loop forever.
-
-`warmup_epochs` - the number of epochs to "pretrain" the net for with `warmup_lr` learning rate. Default: 0.
-
-`epoch_size` - The number of positions per epoch. Should be kept lowish as the current implementation loads all into memory before processing. Default is already high enough. The epoch size is not tied to validation nor net serialization, there are more specific options for that. Default: 1000000
-
-`basedir` - the base directory for the paths. Default: "" (current directory)
-
-`lr` - initial learning rate. Default: 1.
-
-`warmup_lr` - the learning rate to use during warmup epochs. Default: 0.1.
-
-`use_draw_games_in_training` - either 0 or 1. If 1 then draws will be used in training too. Default: 1.
-
-`use_draw_games_in_validation` - either 0 or 1. If 1 then draws will be used in validation too. Default: 1.
-
-`skip_duplicated_positions_in_training` - either 0 or 1. If 1 then a small hashtable will be used to try to eliminate duplicated position from training. Default: 0.
-
-`winning_probability_coefficient` - some magic value for winning probability. If you need to read this then don't touch it. Default: 1.0 / PawnValueEg / 4.0 * std::log(10.0)
-
-`use_wdl` - either 0 or 1. If 1 then the evaluations will be converted to win/draw/loss percentages prior to learning on them. (Slightly changes the gradient because eval has a different derivative than wdl). Default: 0.
-
-`lambda` - value in range [0..1]. 1 means that only evaluation is used for learning, 0 means that only game result is used. Values inbetween result in interpolation between the two contributions. See `lambda_limit` for when this is applied. Default: 1.0.
-
-`lambda2` - value in range [0..1]. 1 means that only evaluation is used for learning, 0 means that only game result is used. Values inbetween result in interpolation between the two contributions. See `lambda_limit` for when this is applied. Default: 1.0.
-
-`lambda_limit` - the maximum absolute score value for which `lambda` is used as opposed to `lambda2`. For positions with absolute evaluation higher than `lambda_limit` `lambda2` will be used. Default: 32000 (so always `lambda`).
-
-`max_grad` - the maximum allowed loss gradient for backpropagation. Effectively a form of gradient clipping. Useful for the first iterations with a randomly generated net as with higher lr backpropagation often overshoots and kills the net. The default value is fairly conservative, values as low as 0.25 could be used with lr of 1.0 without problems. Default: 1.0.
-
-`reduction_gameply` - the minimum ply after which positions won't be skipped. Positions at plies below this value are skipped with a probability that lessens linearly with the ply (reaching 0 at `reduction_gameply`). Default: 1.
-
-`eval_limit` - positions with absolute evaluation higher than this will be skipped. Default: 32000 (nothing is skipped).
-
-`save_only_once` - this is a modifier not a parameter, no value follows it. If specified then there will be only one network file generated.
-
-`no_shuffle` - this is a modifier not a parameter, no value follows it. If specified then data within a batch won't be shuffled.
-
-`batch_size` - the number of positions per one learning step. Default: 1000
-
-`lr_step` - learning rate will be multiplied by this factor every time a net is rejected (so in other words it controls LR drops). Default: 0.5 (no LR drops)
-
-`assume_quiet` - this is a flag option. When specified learn will not perform qsearch to reach a quiet position.
-
-`smart_fen_skipping` - this is a flag option. When specified some position that are not good candidates for teaching are skipped. This includes positions where the best move is a capture or promotion, and position where a king is in check. Default: 0.
-
-`smart_fen_skipping_for_validation` - same as `smart_fen_skipping` but applies to validation data set. Default: 0.
-
-`max_consecutive_rejections` - determines after how many subsequent rejected nets the training process will be terminated. Default: 4.
-
-`auto_lr_drop` - every time this many positions are processed the learning rate is multiplied by `newbob_decay`. In other words this value specifies for how many positions a single learning rate stage lasts. If 0 then doesn't have any effect. Default: 0.
-
-`nn_options` - if you're reading this you don't use it. It passes messages directly to the network evaluation. I don't know what it can do either.
-
-`eval_save_interval` - every `eval_save_interval` positions the network will be saved and either accepted or rejected (in which case an LR drop follows). Default: 100000000 (100M). (generally people use values in 10M-100M range)
-
-`loss_output_interval` - every `loss_output_interval` fitness statistics are displayed. Default: 1000000 (1M)
-
-`validation_set_file_name` - path to the file with training data to be used for validation (loss computation and move accuracy)
-
-`validation_count` - the number of positions to use for validation. Default: 2000.
-
-`sfen_read_size` - the number of sfens to always keep in the buffer. Default: 10000000 (10M)
-
-`thread_buffer_size` - the number of sfens to copy at once to each thread requesting more sfens for learning. Default: 10000
-
-`seed` - seed for the PRNG. Can be either a number or a string. If it's a string then its hash will be used. If not specified then the current time will be used.
-
-`verbose` - this is a modifier, not a parameter. When used there will be more detailed output during training.
-
-### Deprecated options
-
-`bat` (deprecated) - the size of a batch in multiples of 10000. This determines how many entries are read and shuffled at once during training. Default: 100 (meaning batch size of 1000000).
-
-`newbob_num_trials` (deprecated) - same as `max_consecutive_rejections`
-
-`newbob_decay` (deprecated) - same as `lr_step`
-
-`nn_batch_size` (deprecated) - same as `batch_size`
-
-`use_hash_in_training` (deprecated) - alias for `skip_duplicated_positions_in_training`
-
-`batchsize` (deprecated) - same as `epoch_size`
-
-`use_draw_in_training` (deprecated) - alias for `use_draw_games_in_training`
-
-`use_draw_in_validation` (deprecated) - alias for `use_draw_games_in_validation`
-
-## Legacy subcommands and parameters
-
-### Convert
-
-`convert_plain`
-`convert_bin`
-`interpolate_eval`
-`check_invalid_fen`
-`check_illegal_move`
-`convert_bin_from_pgn-extract`
-`pgn_eval_side_to_move`
-`convert_no_eval_fens_as_score_zero`
-`src_score_min_value`
-`src_score_max_value`
-`dest_score_min_value`
-`dest_score_max_value`
-
-### Shuffle
-
-`shuffle`
-`buffer_size`
-`shuffleq`
-`shufflem`
-`output_file_name`

From 696e849a306f321a4ea98e827ada5cdfd8665b38 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 18 Apr 2021 19:18:41 +0200
Subject: [PATCH 561/583] learn -> tools

---
 src/Makefile                           | 16 ++++++++--------
 src/position.cpp                       | 12 ++++++------
 src/position.h                         | 10 +++++-----
 src/search.cpp                         |  4 ++--
 src/search.h                           |  2 +-
 src/{learn => tools}/convert.cpp       |  2 +-
 src/{learn => tools}/convert.h         |  2 +-
 src/{learn => tools}/gensfen.cpp       |  4 ++--
 src/{learn => tools}/gensfen.h         |  2 +-
 src/{learn => tools}/gensfen_nonpv.cpp |  4 ++--
 src/{learn => tools}/gensfen_nonpv.h   |  2 +-
 src/{learn => tools}/opening_book.cpp  |  2 +-
 src/{learn => tools}/opening_book.h    |  2 +-
 src/{learn => tools}/packed_sfen.h     |  4 ++--
 src/{learn => tools}/sfen_packer.cpp   |  2 +-
 src/{learn => tools}/sfen_packer.h     |  4 ++--
 src/{learn => tools}/sfen_reader.h     |  2 +-
 src/{learn => tools}/sfen_stream.h     |  2 +-
 src/{learn => tools}/sfen_writer.h     |  2 +-
 src/{learn => tools}/stats.cpp         |  4 ++--
 src/{learn => tools}/stats.h           |  2 +-
 src/{learn => tools}/transform.cpp     | 14 +++++++-------
 src/{learn => tools}/transform.h       |  2 +-
 src/uci.cpp                            | 26 +++++++++++++-------------
 24 files changed, 64 insertions(+), 64 deletions(-)
 rename src/{learn => tools}/convert.cpp (99%)
 rename src/{learn => tools}/convert.h (93%)
 rename src/{learn => tools}/gensfen.cpp (99%)
 rename src/{learn => tools}/gensfen.h (90%)
 rename src/{learn => tools}/gensfen_nonpv.cpp (99%)
 rename src/{learn => tools}/gensfen_nonpv.h (89%)
 rename src/{learn => tools}/opening_book.cpp (97%)
 rename src/{learn => tools}/opening_book.h (98%)
 rename src/{learn => tools}/packed_sfen.h (94%)
 rename src/{learn => tools}/sfen_packer.cpp (99%)
 rename src/{learn => tools}/sfen_packer.h (84%)
 rename src/{learn => tools}/sfen_reader.h (99%)
 rename src/{learn => tools}/sfen_stream.h (99%)
 rename src/{learn => tools}/sfen_writer.h (99%)
 rename src/{learn => tools}/stats.cpp (99%)
 rename src/{learn => tools}/stats.h (81%)
 rename src/{learn => tools}/transform.cpp (97%)
 rename src/{learn => tools}/transform.h (85%)

diff --git a/src/Makefile b/src/Makefile
index 19927ce5..8f30fff6 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -57,17 +57,17 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
 	nnue/features/a.cpp \
 	nnue/features/castling_right.cpp \
 	nnue/features/enpassant.cpp \
-	learn/sfen_packer.cpp \
-	learn/gensfen.cpp \
-	learn/gensfen_nonpv.cpp \
-	learn/opening_book.cpp \
-	learn/convert.cpp \
-	learn/transform.cpp \
-	learn/stats.cpp
+	tools/sfen_packer.cpp \
+	tools/gensfen.cpp \
+	tools/gensfen_nonpv.cpp \
+	tools/opening_book.cpp \
+	tools/convert.cpp \
+	tools/transform.cpp \
+	tools/stats.cpp
 
 OBJS = $(notdir $(SRCS:.cpp=.o))
 
-VPATH = syzygy:nnue:nnue/features:eval:extra:learn
+VPATH = syzygy:nnue:nnue/features:eval:extra:tools
 
 ### ==========================================================================
 ### Section 2. High-level Configuration
diff --git a/src/position.cpp b/src/position.cpp
index 1b5ff222..2fb5f4f7 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -34,8 +34,8 @@
 #include "uci.h"
 #include "syzygy/tbprobe.h"
 
-#include "learn/packed_sfen.h"
-#include "learn/sfen_packer.h"
+#include "tools/packed_sfen.h"
+#include "tools/sfen_packer.h"
 
 using std::string;
 
@@ -1373,13 +1373,13 @@ bool Position::pos_is_ok() const {
 // Add a function that directly unpacks for speed. It's pretty tough.
 // Write it by combining packer::unpack() and Position::set().
 // If there is a problem with the passed phase and there is an error, non-zero is returned.
-int Position::set_from_packed_sfen(const Learner::PackedSfen& sfen , StateInfo* si, Thread* th)
+int Position::set_from_packed_sfen(const Tools::PackedSfen& sfen , StateInfo* si, Thread* th)
 {
-  return Learner::set_from_packed_sfen(*this, sfen, si, th);
+  return Tools::set_from_packed_sfen(*this, sfen, si, th);
 }
 
 // Get the packed sfen. Returns to the buffer specified in the argument.
-void Position::sfen_pack(Learner::PackedSfen& sfen)
+void Position::sfen_pack(Tools::PackedSfen& sfen)
 {
-  sfen = Learner::sfen_pack(*this);
+  sfen = Tools::sfen_pack(*this);
 }
diff --git a/src/position.h b/src/position.h
index fe5be374..9e979914 100644
--- a/src/position.h
+++ b/src/position.h
@@ -30,8 +30,8 @@
 
 #include "nnue/nnue_accumulator.h"
 
-#include "learn/packed_sfen.h"
-#include "learn/sfen_packer.h"
+#include "tools/packed_sfen.h"
+#include "tools/sfen_packer.h"
 
 
 /// StateInfo struct stores information needed to restore a Position object to
@@ -179,17 +179,17 @@ public:
 
   // --sfenization helper
 
-  friend int Learner::set_from_packed_sfen(Position& pos, const Learner::PackedSfen& sfen, StateInfo* si, Thread* th);
+  friend int Tools::set_from_packed_sfen(Position& pos, const Tools::PackedSfen& sfen, StateInfo* si, Thread* th);
 
   // Get the packed sfen. Returns to the buffer specified in the argument.
   // Do not include gamePly in pack.
-  void sfen_pack(Learner::PackedSfen& sfen);
+  void sfen_pack(Tools::PackedSfen& sfen);
 
   // It is slow to go through sfen, so I made a function to set packed sfen directly.
   // Equivalent to pos.set(sfen_unpack(data),si,th);.
   // If there is a problem with the passed phase and there is an error, non-zero is returned.
   // PackedSfen does not include gamePly so it cannot be restored. If you want to set it, specify it with an argument.
-  int set_from_packed_sfen(const Learner::PackedSfen& sfen, StateInfo* si, Thread* th);
+  int set_from_packed_sfen(const Tools::PackedSfen& sfen, StateInfo* si, Thread* th);
 
   void clear() { std::memset(this, 0, sizeof(Position)); }
 
diff --git a/src/search.cpp b/src/search.cpp
index 8fe35000..a5e5ee24 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -311,7 +311,7 @@ void Thread::search() {
 
   if (!this->rootMoves.empty())
     Tablebases::rank_root_moves(this->rootPos, this->rootMoves);
-      
+
   if (mainThread)
   {
       if (mainThread->bestPreviousScore == VALUE_INFINITE)
@@ -1995,7 +1995,7 @@ namespace Search
   // It might have been good.
 
   // Initialization for learning.
-  // Called from Learner::search(),Learner::qsearch().
+  // Called from Tools::search(),Tools::qsearch().
   static bool init_for_search(Position& pos, Stack* ss)
   {
 
diff --git a/src/search.h b/src/search.h
index 13123323..b9036f56 100644
--- a/src/search.h
+++ b/src/search.h
@@ -110,7 +110,7 @@ extern LimitsType Limits;
 void init();
 void clear();
 
-// A pair of reader and evaluation value. Returned by Learner::search(),Learner::qsearch().
+// A pair of reader and evaluation value. Returned by Tools::search(),Tools::qsearch().
 using ValueAndPV = std::pair<Value, std::vector<Move>>;
 
 ValueAndPV qsearch(Position& pos);
diff --git a/src/learn/convert.cpp b/src/tools/convert.cpp
similarity index 99%
rename from src/learn/convert.cpp
rename to src/tools/convert.cpp
index 47f56f02..8d8af4bb 100644
--- a/src/learn/convert.cpp
+++ b/src/tools/convert.cpp
@@ -28,7 +28,7 @@
 
 using namespace std;
 
-namespace Learner
+namespace Tools
 {
     bool fen_is_ok(Position& pos, std::string input_fen) {
         std::string pos_fen = pos.fen();
diff --git a/src/learn/convert.h b/src/tools/convert.h
similarity index 93%
rename from src/learn/convert.h
rename to src/tools/convert.h
index 227f0799..9d628540 100644
--- a/src/learn/convert.h
+++ b/src/tools/convert.h
@@ -5,7 +5,7 @@
 #include <string>
 #include <sstream>
 
-namespace Learner {
+namespace Tools {
     void convert(std::istringstream& is);
 
     void convert_bin_from_pgn_extract(std::istringstream& is);
diff --git a/src/learn/gensfen.cpp b/src/tools/gensfen.cpp
similarity index 99%
rename from src/learn/gensfen.cpp
rename to src/tools/gensfen.cpp
index e5ddd6aa..7021648a 100644
--- a/src/learn/gensfen.cpp
+++ b/src/tools/gensfen.cpp
@@ -34,7 +34,7 @@
 
 using namespace std;
 
-namespace Learner
+namespace Tools
 {
     // Class to generate sfen with multiple threads
     struct Gensfen
@@ -220,7 +220,7 @@ namespace Learner
         // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
         limits.nodes = 0;
 
-        // depth is also processed by the one passed as an argument of Learner::search().
+        // depth is also processed by the one passed as an argument of Tools::search().
         limits.depth = 0;
     }
 
diff --git a/src/learn/gensfen.h b/src/tools/gensfen.h
similarity index 90%
rename from src/learn/gensfen.h
rename to src/tools/gensfen.h
index c0a7c978..13eb0880 100644
--- a/src/learn/gensfen.h
+++ b/src/tools/gensfen.h
@@ -5,7 +5,7 @@
 
 #include <sstream>
 
-namespace Learner {
+namespace Tools {
 
     // Automatic generation of teacher position
     void gensfen(std::istringstream& is);
diff --git a/src/learn/gensfen_nonpv.cpp b/src/tools/gensfen_nonpv.cpp
similarity index 99%
rename from src/learn/gensfen_nonpv.cpp
rename to src/tools/gensfen_nonpv.cpp
index 098511fe..7edf9a33 100644
--- a/src/learn/gensfen_nonpv.cpp
+++ b/src/tools/gensfen_nonpv.cpp
@@ -34,7 +34,7 @@
 
 using namespace std;
 
-namespace Learner
+namespace Tools
 {
     // Class to generate sfen with multiple threads
     struct GensfenNonPv
@@ -163,7 +163,7 @@ namespace Learner
         // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
         limits.nodes = 0;
 
-        // depth is also processed by the one passed as an argument of Learner::search().
+        // depth is also processed by the one passed as an argument of Tools::search().
         limits.depth = 0;
     }
 
diff --git a/src/learn/gensfen_nonpv.h b/src/tools/gensfen_nonpv.h
similarity index 89%
rename from src/learn/gensfen_nonpv.h
rename to src/tools/gensfen_nonpv.h
index 38ccaa60..31229d5e 100644
--- a/src/learn/gensfen_nonpv.h
+++ b/src/tools/gensfen_nonpv.h
@@ -3,7 +3,7 @@
 
 #include <sstream>
 
-namespace Learner {
+namespace Tools {
 
     // Automatic generation of teacher position
     void gensfen_nonpv(std::istringstream& is);
diff --git a/src/learn/opening_book.cpp b/src/tools/opening_book.cpp
similarity index 97%
rename from src/learn/opening_book.cpp
rename to src/tools/opening_book.cpp
index fb569bda..3d3842ef 100644
--- a/src/learn/opening_book.cpp
+++ b/src/tools/opening_book.cpp
@@ -2,7 +2,7 @@
 
 #include <fstream>
 
-namespace Learner {
+namespace Tools {
 
     EpdOpeningBook::EpdOpeningBook(const std::string& file, PRNG& prng) :
         OpeningBook(file)
diff --git a/src/learn/opening_book.h b/src/tools/opening_book.h
similarity index 98%
rename from src/learn/opening_book.h
rename to src/tools/opening_book.h
index d07fc58b..562be0f9 100644
--- a/src/learn/opening_book.h
+++ b/src/tools/opening_book.h
@@ -13,7 +13,7 @@
 #include <memory>
 #include <mutex>
 
-namespace Learner {
+namespace Tools {
 
     struct OpeningBook {
 
diff --git a/src/learn/packed_sfen.h b/src/tools/packed_sfen.h
similarity index 94%
rename from src/learn/packed_sfen.h
rename to src/tools/packed_sfen.h
index 3aa4fcac..8080200f 100644
--- a/src/learn/packed_sfen.h
+++ b/src/tools/packed_sfen.h
@@ -4,7 +4,7 @@
 #include <vector>
 #include <cstdint>
 
-namespace Learner {
+namespace Tools {
 
     // packed sfen
     struct PackedSfen { std::uint8_t data[32]; };
@@ -17,7 +17,7 @@ namespace Learner {
         // phase
         PackedSfen sfen;
 
-        // Evaluation value returned from Learner::search()
+        // Evaluation value returned from Tools::search()
         std::int16_t score;
 
         // PV first move
diff --git a/src/learn/sfen_packer.cpp b/src/tools/sfen_packer.cpp
similarity index 99%
rename from src/learn/sfen_packer.cpp
rename to src/tools/sfen_packer.cpp
index 777b5943..a51fd193 100644
--- a/src/learn/sfen_packer.cpp
+++ b/src/tools/sfen_packer.cpp
@@ -11,7 +11,7 @@
 
 using namespace std;
 
-namespace Learner {
+namespace Tools {
 
     // Class that handles bitstream
     // useful when doing aspect encoding
diff --git a/src/learn/sfen_packer.h b/src/tools/sfen_packer.h
similarity index 84%
rename from src/learn/sfen_packer.h
rename to src/tools/sfen_packer.h
index 5f232fed..c99d7985 100644
--- a/src/learn/sfen_packer.h
+++ b/src/tools/sfen_packer.h
@@ -3,7 +3,7 @@
 
 #include "types.h"
 
-#include "learn/packed_sfen.h"
+#include "packed_sfen.h"
 
 #include <cstdint>
 
@@ -11,7 +11,7 @@ class Position;
 struct StateInfo;
 class Thread;
 
-namespace Learner {
+namespace Tools {
 
     int set_from_packed_sfen(Position& pos, const PackedSfen& sfen, StateInfo* si, Thread* th);
     PackedSfen sfen_pack(Position& pos);
diff --git a/src/learn/sfen_reader.h b/src/tools/sfen_reader.h
similarity index 99%
rename from src/learn/sfen_reader.h
rename to src/tools/sfen_reader.h
index e36efcc6..064efe53 100644
--- a/src/learn/sfen_reader.h
+++ b/src/tools/sfen_reader.h
@@ -17,7 +17,7 @@
 #include <thread>
 #include <functional>
 
-namespace Learner{
+namespace Tools{
 
     enum struct SfenReaderMode
     {
diff --git a/src/learn/sfen_stream.h b/src/tools/sfen_stream.h
similarity index 99%
rename from src/learn/sfen_stream.h
rename to src/tools/sfen_stream.h
index da411346..bb731457 100644
--- a/src/learn/sfen_stream.h
+++ b/src/tools/sfen_stream.h
@@ -10,7 +10,7 @@
 #include <string>
 #include <memory>
 
-namespace Learner {
+namespace Tools {
 
     enum struct SfenOutputType
     {
diff --git a/src/learn/sfen_writer.h b/src/tools/sfen_writer.h
similarity index 99%
rename from src/learn/sfen_writer.h
rename to src/tools/sfen_writer.h
index b1c3ed5f..37c36491 100644
--- a/src/learn/sfen_writer.h
+++ b/src/tools/sfen_writer.h
@@ -19,7 +19,7 @@
 
 using namespace std;
 
-namespace Learner {
+namespace Tools {
 
     // Helper class for exporting Sfen
     struct SfenWriter
diff --git a/src/learn/stats.cpp b/src/tools/stats.cpp
similarity index 99%
rename from src/learn/stats.cpp
rename to src/tools/stats.cpp
index c0e2c0a1..c40411b9 100644
--- a/src/learn/stats.cpp
+++ b/src/tools/stats.cpp
@@ -25,7 +25,7 @@
 #include <mutex>
 #include <optional>
 
-namespace Learner::Stats
+namespace Tools::Stats
 {
     struct StatisticGathererBase
     {
@@ -566,7 +566,7 @@ namespace Learner::Stats
         Position& pos = th->rootPos;
         StateInfo si;
 
-        auto in = Learner::open_sfen_input_file(filename);
+        auto in = Tools::open_sfen_input_file(filename);
 
         auto on_move = [&](const Position& position, const Move& move) {
             statistic_gatherers.on_move(position, move);
diff --git a/src/learn/stats.h b/src/tools/stats.h
similarity index 81%
rename from src/learn/stats.h
rename to src/tools/stats.h
index c9a71e5a..c4a13d19 100644
--- a/src/learn/stats.h
+++ b/src/tools/stats.h
@@ -3,7 +3,7 @@
 
 #include <sstream>
 
-namespace Learner::Stats {
+namespace Tools::Stats {
 
     void gather_statistics(std::istringstream& is);
 
diff --git a/src/learn/transform.cpp b/src/tools/transform.cpp
similarity index 97%
rename from src/learn/transform.cpp
rename to src/tools/transform.cpp
index 8991b9f1..b3d1f94b 100644
--- a/src/learn/transform.cpp
+++ b/src/tools/transform.cpp
@@ -21,7 +21,7 @@
 #include <mutex>
 #include <optional>
 
-namespace Learner
+namespace Tools
 {
     using CommandFunc = void(*)(std::istringstream&);
 
@@ -120,8 +120,8 @@ namespace Learner
         Position& pos = th->rootPos;
         StateInfo si;
 
-        auto in = Learner::open_sfen_input_file(params.input_filename);
-        auto out = Learner::create_new_sfen_output(params.output_filename);
+        auto in = Tools::open_sfen_input_file(params.input_filename);
+        auto out = Tools::create_new_sfen_output(params.output_filename);
 
         if (in == nullptr)
         {
@@ -261,7 +261,7 @@ namespace Learner
 
         buffer.reserve(batch_size);
 
-        auto out = Learner::create_new_sfen_output(params.output_filename);
+        auto out = Tools::create_new_sfen_output(params.output_filename);
 
         std::mutex mutex;
         uint64_t num_processed = 0;
@@ -279,7 +279,7 @@ namespace Learner
         // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
         limits.nodes = 0;
 
-        // depth is also processed by the one passed as an argument of Learner::search().
+        // depth is also processed by the one passed as an argument of Tools::search().
         limits.depth = 0;
 
         Threads.execute_with_workers([&](auto& th){
@@ -343,7 +343,7 @@ namespace Learner
     void do_rescore_data(RescoreParams& params)
     {
         // TODO: Use SfenReader once it works correctly in sequential mode. See issue #271
-        auto in = Learner::open_sfen_input_file(params.input_filename);
+        auto in = Tools::open_sfen_input_file(params.input_filename);
         auto readsome = [&in, mutex = std::mutex{}](int n) mutable -> PSVector {
 
             PSVector psv;
@@ -388,7 +388,7 @@ namespace Learner
         // If you use this, it will be compared with the accumulated nodes of each thread. Therefore, do not use it.
         limits.nodes = 0;
 
-        // depth is also processed by the one passed as an argument of Learner::search().
+        // depth is also processed by the one passed as an argument of Tools::search().
         limits.depth = 0;
 
         std::atomic<std::uint64_t> num_processed = 0;
diff --git a/src/learn/transform.h b/src/tools/transform.h
similarity index 85%
rename from src/learn/transform.h
rename to src/tools/transform.h
index 8a6921a0..f202b55c 100644
--- a/src/learn/transform.h
+++ b/src/tools/transform.h
@@ -3,7 +3,7 @@
 
 #include <sstream>
 
-namespace Learner {
+namespace Tools {
 
     void transform(std::istringstream& is);
 
diff --git a/src/uci.cpp b/src/uci.cpp
index 9a9a9e3c..ecf4d86d 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -33,11 +33,11 @@
 #include "tt.h"
 #include "uci.h"
 
-#include "learn/gensfen.h"
-#include "learn/gensfen_nonpv.h"
-#include "learn/convert.h"
-#include "learn/transform.h"
-#include "learn/stats.h"
+#include "tools/gensfen.h"
+#include "tools/gensfen_nonpv.h"
+#include "tools/convert.h"
+#include "tools/transform.h"
+#include "tools/stats.h"
 
 using namespace std;
 
@@ -328,14 +328,14 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "eval")     trace_eval(pos);
       else if (token == "compiler") sync_cout << compiler_info() << sync_endl;
 
-      else if (token == "gensfen") Learner::gensfen(is);
-      else if (token == "gensfen_nonpv") Learner::gensfen_nonpv(is);
-      else if (token == "convert") Learner::convert(is);
-      else if (token == "convert_bin") Learner::convert_bin(is);
-      else if (token == "convert_plain") Learner::convert_plain(is);
-      else if (token == "convert_bin_from_pgn_extract") Learner::convert_bin_from_pgn_extract(is);
-      else if (token == "transform") Learner::transform(is);
-      else if (token == "gather_statistics") Learner::Stats::gather_statistics(is);
+      else if (token == "gensfen") Tools::gensfen(is);
+      else if (token == "gensfen_nonpv") Tools::gensfen_nonpv(is);
+      else if (token == "convert") Tools::convert(is);
+      else if (token == "convert_bin") Tools::convert_bin(is);
+      else if (token == "convert_plain") Tools::convert_plain(is);
+      else if (token == "convert_bin_from_pgn_extract") Tools::convert_bin_from_pgn_extract(is);
+      else if (token == "transform") Tools::transform(is);
+      else if (token == "gather_statistics") Tools::Stats::gather_statistics(is);
 
       // Command to call qsearch(),search() directly for testing
       else if (token == "qsearch") qsearch_cmd(pos);

From f1d4c1c89681ce2c9a63ecb60c3509cbd011f0d4 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 18 Apr 2021 19:24:23 +0200
Subject: [PATCH 562/583] remove useless stuff

---
 src/Makefile                                  |   8 -
 src/nnue/architectures/halfka_256x2-32-32.h   |  54 -----
 .../architectures/halfkp-cr-ep_256x2-32-32.h  |  57 -----
 .../architectures/halfkp-cr_256x2-32-32.h     |  37 ----
 src/nnue/architectures/halfkp_384x2-32-32.h   |  35 ----
 src/nnue/features/a.cpp                       |  54 -----
 src/nnue/features/a.h                         |  54 -----
 src/nnue/features/castling_right.cpp          |  65 ------
 src/nnue/features/castling_right.h            |  44 ----
 src/nnue/features/enpassant.cpp               |  49 -----
 src/nnue/features/enpassant.h                 |  40 ----
 src/nnue/features/half_ka.cpp                 |  93 ---------
 src/nnue/features/half_ka.h                   |  75 -------
 src/nnue/features/half_relative_ka.cpp        |  90 --------
 src/nnue/features/half_relative_ka.h          |  68 ------
 src/nnue/features/half_relative_kp.cpp        |  91 --------
 src/nnue/features/half_relative_kp.h          |  66 ------
 src/nnue/features/k.cpp                       |  45 ----
 src/nnue/features/k.h                         |  49 -----
 src/nnue/features/p.cpp                       |  55 -----
 src/nnue/features/p.h                         |  49 -----
 src/nnue/layers/sum.h                         | 196 ------------------
 22 files changed, 1374 deletions(-)
 delete mode 100644 src/nnue/architectures/halfka_256x2-32-32.h
 delete mode 100644 src/nnue/architectures/halfkp-cr-ep_256x2-32-32.h
 delete mode 100644 src/nnue/architectures/halfkp-cr_256x2-32-32.h
 delete mode 100644 src/nnue/architectures/halfkp_384x2-32-32.h
 delete mode 100644 src/nnue/features/a.cpp
 delete mode 100644 src/nnue/features/a.h
 delete mode 100644 src/nnue/features/castling_right.cpp
 delete mode 100644 src/nnue/features/castling_right.h
 delete mode 100644 src/nnue/features/enpassant.cpp
 delete mode 100644 src/nnue/features/enpassant.h
 delete mode 100644 src/nnue/features/half_ka.cpp
 delete mode 100644 src/nnue/features/half_ka.h
 delete mode 100644 src/nnue/features/half_relative_ka.cpp
 delete mode 100644 src/nnue/features/half_relative_ka.h
 delete mode 100644 src/nnue/features/half_relative_kp.cpp
 delete mode 100644 src/nnue/features/half_relative_kp.h
 delete mode 100644 src/nnue/features/k.cpp
 delete mode 100644 src/nnue/features/k.h
 delete mode 100644 src/nnue/features/p.cpp
 delete mode 100644 src/nnue/features/p.h
 delete mode 100644 src/nnue/layers/sum.h

diff --git a/src/Makefile b/src/Makefile
index 8f30fff6..33a2434d 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -49,14 +49,6 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
 	search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
 	nnue/evaluate_nnue.cpp \
 	nnue/features/half_kp.cpp \
-	nnue/features/half_ka.cpp \
-	nnue/features/half_relative_kp.cpp \
-	nnue/features/half_relative_ka.cpp \
-	nnue/features/k.cpp \
-	nnue/features/p.cpp \
-	nnue/features/a.cpp \
-	nnue/features/castling_right.cpp \
-	nnue/features/enpassant.cpp \
 	tools/sfen_packer.cpp \
 	tools/gensfen.cpp \
 	tools/gensfen_nonpv.cpp \
diff --git a/src/nnue/architectures/halfka_256x2-32-32.h b/src/nnue/architectures/halfka_256x2-32-32.h
deleted file mode 100644
index c108ef5d..00000000
--- a/src/nnue/architectures/halfka_256x2-32-32.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
-    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
-
-    Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-// Definition of input features and network structure used in NNUE evaluation function
-
-#ifndef NNUE_HALFKA_256X2_32_32_H_INCLUDED
-#define NNUE_HALFKA_256X2_32_32_H_INCLUDED
-
-#include "nnue/features/feature_set.h"
-#include "nnue/features/half_ka.h"
-
-#include "nnue/layers/input_slice.h"
-#include "nnue/layers/affine_transform.h"
-#include "nnue/layers/clipped_relu.h"
-
-namespace Eval::NNUE {
-
-    // Input features used in evaluation function
-    using RawFeatures = Features::FeatureSet<
-        Features::HalfKA<Features::Side::kFriend>>;
-
-    // Number of input feature dimensions after conversion
-    constexpr IndexType kTransformedFeatureDimensions = 256;
-
-    namespace Layers {
-
-        // Define network structure
-        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
-
-    }  // namespace Layers
-
-    using Network = Layers::OutputLayer;
-
-}  // namespace Eval::NNUE
-
-#endif // #ifndef NNUE_HALFA_256X2_32_32_H_INCLUDED
diff --git a/src/nnue/architectures/halfkp-cr-ep_256x2-32-32.h b/src/nnue/architectures/halfkp-cr-ep_256x2-32-32.h
deleted file mode 100644
index 6327b78a..00000000
--- a/src/nnue/architectures/halfkp-cr-ep_256x2-32-32.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
-    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
-
-    Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-// Definition of input features and network structure used in NNUE evaluation function
-
-#ifndef NNUE_HALFKP_CR_EP_256X2_32_32_H_INCLUDED
-#define NNUE_HALFKP_CR_EP_256X2_32_32_H_INCLUDED
-
-#include "nnue/features/feature_set.h"
-#include "nnue/features/half_kp.h"
-#include "nnue/features/castling_right.h"
-#include "nnue/features/enpassant.h"
-
-#include "nnue/layers/input_slice.h"
-#include "nnue/layers/affine_transform.h"
-#include "nnue/layers/clipped_relu.h"
-
-namespace Eval::NNUE {
-
-    // Input features used in evaluation function
-    using RawFeatures = Features::FeatureSet<
-        Features::HalfKP<Features::Side::kFriend>, Features::CastlingRight,
-        Features::EnPassant>;
-
-    // Number of input feature dimensions after conversion
-    constexpr IndexType kTransformedFeatureDimensions = 256;
-
-    namespace Layers {
-
-        // Define network structure
-        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
-
-    }  // namespace Layers
-
-    using Network = Layers::OutputLayer;
-
-}  // namespace Eval::NNUE
-
-#endif // #ifndef NNUE_HALFKP_CR_EP_256X2_32_32_H_INCLUDED
diff --git a/src/nnue/architectures/halfkp-cr_256x2-32-32.h b/src/nnue/architectures/halfkp-cr_256x2-32-32.h
deleted file mode 100644
index dd587d1d..00000000
--- a/src/nnue/architectures/halfkp-cr_256x2-32-32.h
+++ /dev/null
@@ -1,37 +0,0 @@
-// Definition of input features and network structure used in NNUE evaluation function
-
-#ifndef NNUE_HALFKP_CR_256X2_32_32_H_INCLUDED
-#define NNUE_HALFKP_CR_256X2_32_32_H_INCLUDED
-
-#include "nnue/features/feature_set.h"
-#include "nnue/features/half_kp.h"
-#include "nnue/features/castling_right.h"
-
-#include "nnue/layers/input_slice.h"
-#include "nnue/layers/affine_transform.h"
-#include "nnue/layers/clipped_relu.h"
-
-namespace Eval::NNUE {
-
-    // Input features used in evaluation function
-    using RawFeatures = Features::FeatureSet<
-        Features::HalfKP<Features::Side::kFriend>, Features::CastlingRight>;
-
-    // Number of input feature dimensions after conversion
-    constexpr IndexType kTransformedFeatureDimensions = 256;
-
-    namespace Layers {
-
-        // Define network structure
-        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
-
-    }  // namespace Layers
-
-    using Network = Layers::OutputLayer;
-
-}  // namespace Eval::NNUE
-
-#endif // #ifndef NNUE_HALFKP_CR_256X2_32_32_H_INCLUDED
diff --git a/src/nnue/architectures/halfkp_384x2-32-32.h b/src/nnue/architectures/halfkp_384x2-32-32.h
deleted file mode 100644
index 96913295..00000000
--- a/src/nnue/architectures/halfkp_384x2-32-32.h
+++ /dev/null
@@ -1,35 +0,0 @@
-﻿// Definition of input features and network structure used in NNUE evaluation function
-
-#ifndef HALFKP_384X2_32_32_H
-#define HALFKP_384X2_32_32_H
-
-#include "nnue/features/feature_set.h"
-#include "nnue/features/half_kp.h"
-
-#include "nnue/layers/input_slice.h"
-#include "nnue/layers/affine_transform.h"
-#include "nnue/layers/clipped_relu.h"
-
-namespace Eval::NNUE {
-
-    // Input features used in evaluation function
-    using RawFeatures = Features::FeatureSet<
-        Features::HalfKP<Features::Side::kFriend>>;
-
-    // Number of input feature dimensions after conversion
-    constexpr IndexType kTransformedFeatureDimensions = 384;
-
-    namespace Layers {
-
-        // define network structure
-        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
-
-    }  // namespace Layers
-
-    using Network = Layers::OutputLayer;
-
-}  // namespace Eval::NNUE
-#endif // HALFKP_384X2_32_32_H
diff --git a/src/nnue/features/a.cpp b/src/nnue/features/a.cpp
deleted file mode 100644
index 1bfb583f..00000000
--- a/src/nnue/features/a.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-﻿#include "a.h"
-#include "index_list.h"
-
-// Definition of input feature A of NNUE evaluation function
-namespace Eval::NNUE::Features {
-
-    // Orient a square according to perspective (rotate the board 180° for black)
-    // Important note for "halfka": this arch was designed with "flip" in mind 
-    // although it still is untested which approach is better.
-    // this has to stay until we find a better arch that works with "flip".
-    // allows us to use current master net for gensfen (primarily needed for higher quality data)
-    inline Square orient(Color perspective, Square s) {
-        return Square(int(s) ^ (bool(perspective) * 63));
-    }
-
-    // Find the index of the feature quantity from the king position and PieceSquare
-    inline IndexType A::make_index(
-        Color perspective, Square s, Piece pc) {
-        return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
-    }
-
-    // Get a list of indices with a value of 1 among the features
-    void A::append_active_indices(
-        const Position& pos,
-        Color perspective,
-        IndexList* active) {
-
-        Bitboard bb = pos.pieces();
-        while (bb) {
-            Square s = pop_lsb(&bb);
-            active->push_back(make_index(perspective, s, pos.piece_on(s)));
-        }
-    }
-
-    // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-    void A::append_changed_indices(
-        const Position& pos,
-        Color perspective,
-        IndexList* removed,
-        IndexList* added) {
-
-        const auto& dp = pos.state()->dirtyPiece;
-        for (int i = 0; i < dp.dirty_num; ++i) {
-            Piece pc = dp.piece[i];
-
-            if (dp.from[i] != SQ_NONE)
-              removed->push_back(make_index(perspective, dp.from[i], pc));
-
-            if (dp.to[i] != SQ_NONE)
-              added->push_back(make_index(perspective, dp.to[i], pc));
-        }
-    }
-
-}  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/a.h b/src/nnue/features/a.h
deleted file mode 100644
index 50a0d8be..00000000
--- a/src/nnue/features/a.h
+++ /dev/null
@@ -1,54 +0,0 @@
-﻿#ifndef _NNUE_FEATURES_A_H_
-#define _NNUE_FEATURES_A_H_
-
-#include "features_common.h"
-
-#include "evaluate.h"
-
-// Definition of input feature A of NNUE evaluation function
-// A is a union of P features and K features, so technically the
-// same effect can be achieved by including both P and K features
-// but it would result in slower index appending because
-// P would conditionally exclude K features and vice versa,
-// where A doesn't have any conditionals.
-namespace Eval::NNUE::Features {
-
-    // Feature P: PieceSquare of pieces other than balls
-    class A {
-    public:
-        // feature quantity name
-        static constexpr const char* kName = "A";
-
-        // Hash value embedded in the evaluation function file
-        static constexpr std::uint32_t kHashValue = 0x7A4C414Cu;
-
-        // number of feature dimensions
-        static constexpr IndexType kDimensions = PS_END2;
-
-        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-        static constexpr IndexType kMaxActiveDimensions = 32;
-
-        // Timing of full calculation instead of difference calculation
-        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
-
-        // Get a list of indices with a value of 1 among the features
-        static void append_active_indices(
-            const Position& pos,
-            Color perspective,
-            IndexList* active);
-
-        // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-        static void append_changed_indices(
-            const Position& pos,
-            Color perspective,
-            IndexList* removed,
-            IndexList* added);
-
-    private:
-        // Index of a feature for a given piece on some square
-        static IndexType make_index(Color perspective, Square s, Piece pc);
-    };
-
-}  // namespace Eval::NNUE::Features
-
-#endif // #ifndef _NNUE_FEATURES_UNION_P_K_H_
diff --git a/src/nnue/features/castling_right.cpp b/src/nnue/features/castling_right.cpp
deleted file mode 100644
index cbac0851..00000000
--- a/src/nnue/features/castling_right.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-#include "castling_right.h"
-#include "index_list.h"
-
-//Definition of input feature quantity CastlingRight of NNUE evaluation function
-namespace Eval::NNUE::Features {
-
-    // Get a list of indices with a value of 1 among the features
-    void CastlingRight::append_active_indices(
-        const Position& pos,
-        Color perspective,
-        IndexList* active) {
-
-        // do nothing if array size is small to avoid compiler warning
-        if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
-
-        int castling_rights = pos.state()->castlingRights;
-        int relative_castling_rights;
-        if (perspective == WHITE) {
-            relative_castling_rights = castling_rights;
-        }
-        else {
-            // Invert the perspective.
-            relative_castling_rights = ((castling_rights & 3) << 2)
-                & ((castling_rights >> 2) & 3);
-        }
-
-        for (Eval::NNUE::IndexType i = 0; i < kDimensions; ++i) {
-            if (relative_castling_rights & (1 << i)) {
-                active->push_back(i);
-            }
-        }
-    }
-
-    // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-    void CastlingRight::append_changed_indices(
-        const Position& pos,
-        Color perspective,
-        IndexList* removed,
-        IndexList* /* added */) {
-
-        int previous_castling_rights = pos.state()->previous->castlingRights;
-        int current_castling_rights = pos.state()->castlingRights;
-        int relative_previous_castling_rights;
-        int relative_current_castling_rights;
-        if (perspective == WHITE) {
-            relative_previous_castling_rights = previous_castling_rights;
-            relative_current_castling_rights = current_castling_rights;
-        }
-        else {
-            // Invert the perspective.
-            relative_previous_castling_rights = ((previous_castling_rights & 3) << 2)
-                & ((previous_castling_rights >> 2) & 3);
-            relative_current_castling_rights = ((current_castling_rights & 3) << 2)
-                & ((current_castling_rights >> 2) & 3);
-        }
-
-        for (Eval::NNUE::IndexType i = 0; i < kDimensions; ++i) {
-            if ((relative_previous_castling_rights & (1 << i)) &&
-                (relative_current_castling_rights & (1 << i)) == 0) {
-                removed->push_back(i);
-            }
-        }
-    }
-
-}  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/castling_right.h b/src/nnue/features/castling_right.h
deleted file mode 100644
index cada24b6..00000000
--- a/src/nnue/features/castling_right.h
+++ /dev/null
@@ -1,44 +0,0 @@
-#ifndef _NNUE_FEATURES_CASTLING_RIGHT_H_
-#define _NNUE_FEATURES_CASTLING_RIGHT_H_
-
-#include "features_common.h"
-
-#include "evaluate.h"
-
-//Definition of input feature quantity CastlingRight of NNUE evaluation function
-namespace Eval::NNUE::Features {
-
-    class CastlingRight {
-    public:
-        // feature quantity name
-        static constexpr const char* kName = "CastlingRight";
-
-        // Hash value embedded in the evaluation function file
-        static constexpr std::uint32_t kHashValue = 0x913968AAu;
-
-        // number of feature dimensions
-        static constexpr IndexType kDimensions = 4;
-
-        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-        static constexpr IndexType kMaxActiveDimensions = 4;
-
-        // Timing of full calculation instead of difference calculation
-        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
-
-        // Get a list of indices with a value of 1 among the features
-        static void append_active_indices(
-            const Position& pos,
-            Color perspective,
-            IndexList* active);
-
-        // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-        static void append_changed_indices(
-            const Position& pos,
-            Color perspective,
-            IndexList* removed,
-            IndexList* added);
-    };
-
-}  // namespace Eval::NNUE::Features
-
-#endif
diff --git a/src/nnue/features/enpassant.cpp b/src/nnue/features/enpassant.cpp
deleted file mode 100644
index 06ba2d49..00000000
--- a/src/nnue/features/enpassant.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-#include "enpassant.h"
-#include "index_list.h"
-
-//Definition of input feature quantity EnPassant of NNUE evaluation function
-namespace Eval::NNUE::Features {
-
-    // Get a list of indices with a value of 1 among the features
-    void EnPassant::append_active_indices(
-        const Position& pos,
-        Color /* perspective */,
-        IndexList* active) {
-
-        // do nothing if array size is small to avoid compiler warning
-        if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions)
-            return;
-
-        auto epSquare = pos.state()->epSquare;
-        if (epSquare == SQ_NONE)
-            return;
-
-        auto file = file_of(epSquare);
-        active->push_back(file);
-    }
-
-    // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-    void EnPassant::append_changed_indices(
-        const Position& pos,
-        Color /* perspective */,
-        IndexList* removed,
-        IndexList* added) {
-
-        auto previous_epSquare = pos.state()->previous->epSquare;
-        auto epSquare = pos.state()->epSquare;
-
-        if (previous_epSquare != SQ_NONE) {
-            if (epSquare != SQ_NONE && file_of(epSquare) == file_of(previous_epSquare))
-                return;
-
-            auto file = file_of(previous_epSquare);
-            removed->push_back(file);
-        }
-
-        if (epSquare != SQ_NONE) {
-            auto file = file_of(epSquare);
-            added->push_back(file);
-        }
-    }
-
-}  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/enpassant.h b/src/nnue/features/enpassant.h
deleted file mode 100644
index 6ccb6046..00000000
--- a/src/nnue/features/enpassant.h
+++ /dev/null
@@ -1,40 +0,0 @@
-#ifndef _NNUE_FEATURES_ENPASSANT_H_
-#define _NNUE_FEATURES_ENPASSANT_H_
-
-#include "features_common.h"
-
-#include "evaluate.h"
-
-//Definition of input feature quantity EnPassant of NNUE evaluation function
-namespace Eval::NNUE::Features {
-
-    class EnPassant {
-    public:
-        // feature quantity name
-        static constexpr const char* kName = "EnPassant";
-        // Hash value embedded in the evaluation function file
-        static constexpr std::uint32_t kHashValue = 0x02924F91u;
-        // number of feature dimensions
-        static constexpr IndexType kDimensions = 8;
-        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-        static constexpr IndexType kMaxActiveDimensions = 1;
-        // Timing of full calculation instead of difference calculation
-        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
-
-        // Get a list of indices with a value of 1 among the features
-        static void append_active_indices(
-            const Position& pos,
-            Color perspective,
-            IndexList* active);
-
-        // Get a list of indices whose values have changed from the previous one in the feature quantity
-        static void append_changed_indices(
-            const Position& pos,
-            Color perspective,
-            IndexList* removed,
-            IndexList* added);
-    };
-
-}  // namespace Eval::NNUE::Features
-
-#endif
diff --git a/src/nnue/features/half_ka.cpp b/src/nnue/features/half_ka.cpp
deleted file mode 100644
index 08124b96..00000000
--- a/src/nnue/features/half_ka.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
-    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
-
-    Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-//Definition of input features HalfKA of NNUE evaluation function
-
-#include "half_ka.h"
-#include "index_list.h"
-
-namespace Eval::NNUE::Features {
-
-    // Orient a square according to perspective (rotate the board 180° for black)
-    // Important note for "halfka": this arch was designed with "flip" in mind 
-    // although it still is untested which approach is better.
-    // this has to stay until we find a better arch that works with "flip".
-    // allows us to use current master net for gensfen (primarily needed for higher quality data)
-    inline Square orient(Color perspective, Square s) {
-        return Square(int(s) ^ (bool(perspective) * 63));
-    }
-
-    // Find the index of the feature quantity from the king position and PieceSquare
-    template <Side AssociatedKing>
-    inline IndexType HalfKA<AssociatedKing>::make_index(
-        Color perspective,
-        Square s,
-        Piece pc,
-        Square ksq) {
-
-        return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective] + PS_END2 * ksq);
-    }
-
-    // Get a list of indices for active features
-    template <Side AssociatedKing>
-    void HalfKA<AssociatedKing>::append_active_indices(
-        const Position& pos,
-        Color perspective,
-        IndexList* active) {
-
-        Square ksq = orient(
-            perspective,
-            pos.square<KING>(
-                AssociatedKing == Side::kFriend ? perspective : ~perspective));
-
-        Bitboard bb = pos.pieces();
-        while (bb) {
-            Square s = pop_lsb(&bb);
-            active->push_back(make_index(perspective, s, pos.piece_on(s), ksq));
-        }
-    }
-
-    // Get a list of indices for recently changed features
-    template <Side AssociatedKing>
-    void HalfKA<AssociatedKing>::append_changed_indices(
-        const Position& pos,
-        Color perspective,
-        IndexList* removed,
-        IndexList* added) {
-
-        Square ksq = orient(
-            perspective,
-            pos.square<KING>(
-                AssociatedKing == Side::kFriend ? perspective : ~perspective));
-
-        const auto& dp = pos.state()->dirtyPiece;
-        for (int i = 0; i < dp.dirty_num; ++i) {
-            Piece pc = dp.piece[i];
-
-            if (dp.from[i] != SQ_NONE)
-                removed->push_back(make_index(perspective, dp.from[i], pc, ksq));
-
-            if (dp.to[i] != SQ_NONE)
-                added->push_back(make_index(perspective, dp.to[i], pc, ksq));
-        }
-    }
-
-    template class HalfKA<Side::kFriend>;
-    template class HalfKA<Side::kEnemy>;
-
-}  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/half_ka.h b/src/nnue/features/half_ka.h
deleted file mode 100644
index 2839357e..00000000
--- a/src/nnue/features/half_ka.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
-    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
-
-    Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef NNUE_FEATURES_HALF_KA_H_INCLUDED
-#define NNUE_FEATURES_HALF_KA_H_INCLUDED
-
-#include "features_common.h"
-
-#include "evaluate.h"
-
-//Definition of input features HalfKPK of NNUE evaluation function
-namespace Eval::NNUE::Features {
-
-    // Feature HalfKPK: Combination of the position of own king
-    // and the position of pieces other than kings
-    template <Side AssociatedKing>
-    class HalfKA {
-
-    public:
-        // Feature name
-        static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
-            "HalfKA(Friend)" : "HalfKA(Enemy)";
-
-        // Hash value embedded in the evaluation file
-        static constexpr std::uint32_t kHashValue =
-            0x5F134CB9u ^ (AssociatedKing == Side::kFriend);
-
-        // Number of feature dimensions
-        static constexpr IndexType kDimensions =
-            static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(PS_END2);
-
-        // Maximum number of simultaneously active features
-        static constexpr IndexType kMaxActiveDimensions = 32;
-
-        // Trigger for full calculation instead of difference calculation
-        static constexpr TriggerEvent kRefreshTrigger =
-            (AssociatedKing == Side::kFriend) ?
-            TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
-
-        // Get a list of indices for active features
-        static void append_active_indices(
-            const Position& pos,
-            Color perspective,
-            IndexList* active);
-
-        // Get a list of indices for recently changed features
-        static void append_changed_indices(
-            const Position& pos,
-            Color perspective,
-            IndexList* removed,
-            IndexList* added);
-
-    private:
-        // Index of a feature for a given king position and another piece on some square
-        static IndexType make_index(Color perspective, Square s, Piece pc, Square sq_k);
-    };
-
-}  // namespace Eval::NNUE::Features
-
-#endif // #ifndef NNUE_FEATURES_HALF_KA_H_INCLUDED
diff --git a/src/nnue/features/half_relative_ka.cpp b/src/nnue/features/half_relative_ka.cpp
deleted file mode 100644
index d2ad31e6..00000000
--- a/src/nnue/features/half_relative_ka.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-﻿#include "half_relative_ka.h"
-#include "index_list.h"
-
-//Definition of input features HalfRelativeKA of NNUE evaluation function
-namespace Eval::NNUE::Features {
-
-    // Orient a square according to perspective (rotate the board 180° for black)
-    // Important note for "halfka": this arch was designed with "flip" in mind 
-    // although it still is untested which approach is better.
-    // this has to stay until we find a better arch that works with "flip".
-    // allows us to use current master net for gensfen (primarily needed for higher quality data)
-    inline Square orient(Color perspective, Square s) {
-        return Square(int(s) ^ (bool(perspective) * 63));
-    }
-
-    // Find the index of the feature quantity from the ball position and PieceSquare
-    template <Side AssociatedKing>
-    inline IndexType HalfRelativeKA<AssociatedKing>::make_index(
-        Color perspective,
-        Square s,
-        Piece pc,
-        Square sq_k) {
-
-        const IndexType p = IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
-        return make_index(sq_k, p);
-    }
-
-    // Find the index of the feature quantity from the ball position and PieceSquare
-    template <Side AssociatedKing>
-    inline IndexType HalfRelativeKA<AssociatedKing>::make_index(
-        Square sq_k,
-        IndexType p) {
-
-        constexpr IndexType W = kBoardWidth;
-        constexpr IndexType H = kBoardHeight;
-        const IndexType piece_index = (p - PS_W_PAWN) / SQUARE_NB;
-        const Square sq_p = static_cast<Square>((p - PS_W_PAWN) % SQUARE_NB);
-        const IndexType relative_file = file_of(sq_p) - file_of(sq_k) + (W / 2);
-        const IndexType relative_rank = rank_of(sq_p) - rank_of(sq_k) + (H / 2);
-        return H * W * piece_index + H * relative_file + relative_rank;
-    }
-
-    // Get a list of indices with a value of 1 among the features
-    template <Side AssociatedKing>
-    void HalfRelativeKA<AssociatedKing>::append_active_indices(
-        const Position& pos,
-        Color perspective,
-        IndexList* active) {
-
-        Square ksq = orient(
-            perspective,
-            pos.square<KING>(
-                AssociatedKing == Side::kFriend ? perspective : ~perspective));
-
-        Bitboard bb = pos.pieces();
-        while (bb) {
-            Square s = pop_lsb(&bb);
-            active->push_back(make_index(perspective, s, pos.piece_on(s), ksq));
-        }
-    }
-
-    // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-    template <Side AssociatedKing>
-    void HalfRelativeKA<AssociatedKing>::append_changed_indices(
-        const Position& pos,
-        Color perspective,
-        IndexList* removed,
-        IndexList* added) {
-
-        Square ksq = orient(
-            perspective,
-            pos.square<KING>(
-                AssociatedKing == Side::kFriend ? perspective : ~perspective));
-
-        const auto& dp = pos.state()->dirtyPiece;
-        for (int i = 0; i < dp.dirty_num; ++i) {
-            Piece pc = dp.piece[i];
-
-            if (dp.from[i] != SQ_NONE)
-                removed->push_back(make_index(perspective, dp.from[i], pc, ksq));
-
-            if (dp.to[i] != SQ_NONE)
-                added->push_back(make_index(perspective, dp.to[i], pc, ksq));
-        }
-    }
-
-    template class HalfRelativeKA<Side::kFriend>;
-    template class HalfRelativeKA<Side::kEnemy>;
-
-}  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/half_relative_ka.h b/src/nnue/features/half_relative_ka.h
deleted file mode 100644
index f42661e9..00000000
--- a/src/nnue/features/half_relative_ka.h
+++ /dev/null
@@ -1,68 +0,0 @@
-﻿#ifndef _NNUE_FEATURES_HALF_RELATIVE_KA_H_
-#define _NNUE_FEATURES_HALF_RELATIVE_KA_H_
-
-#include "features_common.h"
-
-#include "evaluate.h"
-
-// Definition of input features HalfRelativeKA of NNUE evaluation function
-// K - King
-// A - Any piece
-// KA - product of K and A
-namespace Eval::NNUE::Features {
-
-    // Feature HalfRelativeKA: Relative position of each piece other than the ball based on own ball or enemy ball
-    template <Side AssociatedKing>
-    class HalfRelativeKA {
-    public:
-        // feature quantity name
-        static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
-            "HalfRelativeKA(Friend)" : "HalfRelativeKA(Enemy)";
-
-        // Hash value embedded in the evaluation function file
-        static constexpr std::uint32_t kHashValue =
-            0xA123051Fu ^ (AssociatedKing == Side::kFriend);
-
-        static constexpr IndexType kNumPieceKinds = 6 * 2;
-
-        // width of the virtual board with the ball in the center
-        static constexpr IndexType kBoardWidth = FILE_NB * 2 - 1;
-
-        // height of a virtual board with balls in the center
-        static constexpr IndexType kBoardHeight = RANK_NB * 2 - 1;
-
-        // number of feature dimensions
-        static constexpr IndexType kDimensions =
-            kNumPieceKinds * kBoardHeight * kBoardWidth;
-
-        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-        static constexpr IndexType kMaxActiveDimensions = 32;
-
-        // Timing of full calculation instead of difference calculation
-        static constexpr TriggerEvent kRefreshTrigger =
-            (AssociatedKing == Side::kFriend) ?
-            TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
-
-        // Get a list of indices with a value of 1 among the features
-        static void append_active_indices(
-            const Position& pos,
-            Color perspective,
-            IndexList* active);
-
-        // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-        static void append_changed_indices(
-            const Position& pos,
-            Color perspective,
-            IndexList* removed,
-            IndexList* added);
-
-        // Find the index of the feature quantity from the ball position and PieceSquare
-        static IndexType make_index(Square s, IndexType p);
-
-        // Find the index of the feature quantity from the ball position and PieceSquare
-        static IndexType make_index(Color perspective, Square s, Piece pc, Square sq_k);
-    };
-
-}  // namespace Eval::NNUE::Features
-
-#endif // #ifndef _NNUE_FEATURES_HALF_RELATIVE_KA_H_
diff --git a/src/nnue/features/half_relative_kp.cpp b/src/nnue/features/half_relative_kp.cpp
deleted file mode 100644
index 2ebccd59..00000000
--- a/src/nnue/features/half_relative_kp.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-﻿#include "half_relative_kp.h"
-#include "index_list.h"
-
-//Definition of input features HalfRelativeKP of NNUE evaluation function
-namespace Eval::NNUE::Features {
-
-    // Orient a square according to perspective (rotate the board 180° for black)
-    // this has to stay until we find a better arch that works with "flip".
-    // allows us to use current master net for gensfen (primarily needed for higher quality data)
-    inline Square orient(Color perspective, Square s) {
-        return Square(int(s) ^ (bool(perspective) * 63));
-    }
-
-    // Find the index of the feature quantity from the ball position and PieceSquare
-    template <Side AssociatedKing>
-    inline IndexType HalfRelativeKP<AssociatedKing>::make_index(
-        Color perspective,
-        Square s,
-        Piece pc,
-        Square sq_k) {
-
-        const IndexType p = IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
-        return make_index(sq_k, p);
-    }
-
-    // Find the index of the feature quantity from the ball position and PieceSquare
-    template <Side AssociatedKing>
-    inline IndexType HalfRelativeKP<AssociatedKing>::make_index(
-        Square sq_k,
-        IndexType p) {
-
-        constexpr IndexType W = kBoardWidth;
-        constexpr IndexType H = kBoardHeight;
-        const IndexType piece_index = (p - PS_W_PAWN) / SQUARE_NB;
-        const Square sq_p = static_cast<Square>((p - PS_W_PAWN) % SQUARE_NB);
-        const IndexType relative_file = file_of(sq_p) - file_of(sq_k) + (W / 2);
-        const IndexType relative_rank = rank_of(sq_p) - rank_of(sq_k) + (H / 2);
-        return H * W * piece_index + H * relative_file + relative_rank;
-    }
-
-    // Get a list of indices with a value of 1 among the features
-    template <Side AssociatedKing>
-    void HalfRelativeKP<AssociatedKing>::append_active_indices(
-        const Position& pos,
-        Color perspective,
-        IndexList* active) {
-
-        Square ksq = orient(
-            perspective,
-            pos.square<KING>(
-                AssociatedKing == Side::kFriend ? perspective : ~perspective));
-
-        Bitboard bb = pos.pieces() & ~pos.pieces(KING);
-        while (bb) {
-            Square s = pop_lsb(&bb);
-            active->push_back(make_index(perspective, s, pos.piece_on(s), ksq));
-        }
-    }
-
-    // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-    template <Side AssociatedKing>
-    void HalfRelativeKP<AssociatedKing>::append_changed_indices(
-        const Position& pos,
-        Color perspective,
-        IndexList* removed,
-        IndexList* added) {
-
-        Square ksq = orient(
-            perspective,
-            pos.square<KING>(
-                AssociatedKing == Side::kFriend ? perspective : ~perspective));
-
-        const auto& dp = pos.state()->dirtyPiece;
-        for (int i = 0; i < dp.dirty_num; ++i) {
-            Piece pc = dp.piece[i];
-
-            if (type_of(pc) == KING)
-                continue;
-
-            if (dp.from[i] != SQ_NONE)
-                removed->push_back(make_index(perspective, dp.from[i], pc, ksq));
-
-            if (dp.to[i] != SQ_NONE)
-                added->push_back(make_index(perspective, dp.to[i], pc, ksq));
-        }
-    }
-
-    template class HalfRelativeKP<Side::kFriend>;
-    template class HalfRelativeKP<Side::kEnemy>;
-
-}  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/half_relative_kp.h b/src/nnue/features/half_relative_kp.h
deleted file mode 100644
index 590a01a3..00000000
--- a/src/nnue/features/half_relative_kp.h
+++ /dev/null
@@ -1,66 +0,0 @@
-﻿#ifndef _NNUE_FEATURES_HALF_RELATIVE_KP_H_
-#define _NNUE_FEATURES_HALF_RELATIVE_KP_H_
-
-#include "features_common.h"
-
-#include "evaluate.h"
-
-//Definition of input features HalfRelativeKP of NNUE evaluation function
-namespace Eval::NNUE::Features {
-
-    // Feature HalfRelativeKP: Relative position of each piece other than the ball based on own ball or enemy ball
-    template <Side AssociatedKing>
-    class HalfRelativeKP {
-    public:
-        // feature quantity name
-        static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
-            "HalfRelativeKP(Friend)" : "HalfRelativeKP(Enemy)";
-
-        // Hash value embedded in the evaluation function file
-        static constexpr std::uint32_t kHashValue =
-            0xF9180919u ^ (AssociatedKing == Side::kFriend);
-
-        // Piece type excluding balls
-        static constexpr IndexType kNumPieceKinds = 5 * 2;
-
-        // width of the virtual board with the ball in the center
-        static constexpr IndexType kBoardWidth = FILE_NB * 2 - 1;
-
-        // height of a virtual board with balls in the center
-        static constexpr IndexType kBoardHeight = RANK_NB * 2 - 1;
-
-        // number of feature dimensions
-        static constexpr IndexType kDimensions =
-            kNumPieceKinds * kBoardHeight * kBoardWidth;
-
-        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-        static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
-
-        // Timing of full calculation instead of difference calculation
-        static constexpr TriggerEvent kRefreshTrigger =
-            (AssociatedKing == Side::kFriend) ?
-            TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
-
-        // Get a list of indices with a value of 1 among the features
-        static void append_active_indices(
-            const Position& pos,
-            Color perspective,
-            IndexList* active);
-
-        // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-        static void append_changed_indices(
-            const Position& pos,
-            Color perspective,
-            IndexList* removed,
-            IndexList* added);
-
-        // Find the index of the feature quantity from the ball position and PieceSquare
-        static IndexType make_index(Square s, IndexType p);
-
-        // Find the index of the feature quantity from the ball position and PieceSquare
-        static IndexType make_index(Color perspective, Square s, Piece pc, Square sq_k);
-    };
-
-}  // namespace Eval::NNUE::Features
-
-#endif
diff --git a/src/nnue/features/k.cpp b/src/nnue/features/k.cpp
deleted file mode 100644
index 7b62a75a..00000000
--- a/src/nnue/features/k.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-﻿#include "k.h"
-#include "index_list.h"
-
-//Definition of input feature quantity K of NNUE evaluation function
-namespace Eval::NNUE::Features {
-
-    // Orient a square according to perspective (rotate the board 180° for black)
-    // this has to stay until we find a better arch that works with "flip".
-    // allows us to use current master net for gensfen (primarily needed for higher quality data)
-    inline Square orient(Color perspective, Square s) {
-        return Square(int(s) ^ (bool(perspective) * 63));
-    }
-
-    // Index of a feature for a given king position.
-    IndexType K::make_index(Color perspective, Square s, Color king_color) {
-        return IndexType(orient(perspective, s) + bool(perspective ^ king_color) * 64);
-    }
-
-    // Get a list of indices with a value of 1 among the features
-    void K::append_active_indices(
-        const Position& pos,
-        Color perspective,
-        IndexList* active) {
-
-        for (auto color : Colors) {
-          active->push_back(make_index(perspective, pos.square<KING>(color), color));
-        }
-    }
-
-    // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-    void K::append_changed_indices(
-        const Position& pos,
-        Color perspective,
-        IndexList* removed,
-        IndexList* added) {
-
-        const auto& dp = pos.state()->dirtyPiece;
-        if (type_of(dp.piece[0]) == KING)
-        {
-            removed->push_back(make_index(perspective, dp.from[0], color_of(dp.piece[0])));
-            added->push_back(make_index(perspective, dp.to[0], color_of(dp.piece[0])));
-        }
-    }
-
-}  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/k.h b/src/nnue/features/k.h
deleted file mode 100644
index 928d77de..00000000
--- a/src/nnue/features/k.h
+++ /dev/null
@@ -1,49 +0,0 @@
-﻿#ifndef _NNUE_FEATURES_K_H_
-#define _NNUE_FEATURES_K_H_
-
-#include "features_common.h"
-
-#include "evaluate.h"
-
-//Definition of input feature quantity K of NNUE evaluation function
-namespace Eval::NNUE::Features {
-
-    // Feature K: Ball position
-    class K {
-    public:
-        // feature quantity name
-        static constexpr const char* kName = "K";
-
-        // Hash value embedded in the evaluation function file
-        static constexpr std::uint32_t kHashValue = 0xD3CEE169u;
-
-        // number of feature dimensions
-        static constexpr IndexType kDimensions = SQUARE_NB * 2;
-
-        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-        static constexpr IndexType kMaxActiveDimensions = 2;
-
-        // Timing of full calculation instead of difference calculation
-        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
-
-        // Get a list of indices with a value of 1 among the features
-        static void append_active_indices(
-            const Position& pos,
-            Color perspective,
-            IndexList* active);
-
-        // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-        static void append_changed_indices(
-            const Position& pos,
-            Color perspective,
-            IndexList* removed,
-            IndexList* added);
-
-    private:
-        // Index of a feature for a given king position.
-        static IndexType make_index(Color perspective, Square s, Color king_color);
-    };
-
-}  // namespace Eval::NNUE::Features
-
-#endif
diff --git a/src/nnue/features/p.cpp b/src/nnue/features/p.cpp
deleted file mode 100644
index a17e304f..00000000
--- a/src/nnue/features/p.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-﻿#include "p.h"
-#include "index_list.h"
-
-//Definition of input feature P of NNUE evaluation function
-namespace Eval::NNUE::Features {
-
-    // Orient a square according to perspective (rotate the board 180° for black)
-    // this has to stay until we find a better arch that works with "flip".
-    // allows us to use current master net for gensfen (primarily needed for higher quality data)
-    inline Square orient(Color perspective, Square s) {
-        return Square(int(s) ^ (bool(perspective) * 63));
-    }
-
-    // Find the index of the feature quantity from the king position and PieceSquare
-    inline IndexType P::make_index(
-        Color perspective, Square s, Piece pc) {
-        return IndexType(orient(perspective, s) + kpp_board_index[pc][perspective]);
-    }
-
-    // Get a list of indices with a value of 1 among the features
-    void P::append_active_indices(
-        const Position& pos,
-        Color perspective,
-        IndexList* active) {
-
-        Bitboard bb = pos.pieces() & ~pos.pieces(KING);
-        while (bb) {
-            Square s = pop_lsb(&bb);
-            active->push_back(make_index(perspective, s, pos.piece_on(s)));
-        }
-    }
-
-    // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-    void P::append_changed_indices(
-        const Position& pos,
-        Color perspective,
-        IndexList* removed,
-        IndexList* added) {
-
-        const auto& dp = pos.state()->dirtyPiece;
-        for (int i = 0; i < dp.dirty_num; ++i) {
-            Piece pc = dp.piece[i];
-
-            if (type_of(pc) == KING)
-              continue;
-
-            if (dp.from[i] != SQ_NONE)
-              removed->push_back(make_index(perspective, dp.from[i], pc));
-
-            if (dp.to[i] != SQ_NONE)
-              added->push_back(make_index(perspective, dp.to[i], pc));
-        }
-    }
-
-}  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/p.h b/src/nnue/features/p.h
deleted file mode 100644
index d461086b..00000000
--- a/src/nnue/features/p.h
+++ /dev/null
@@ -1,49 +0,0 @@
-﻿#ifndef _NNUE_FEATURES_P_H_
-#define _NNUE_FEATURES_P_H_
-
-#include "features_common.h"
-
-#include "evaluate.h"
-
-//Definition of input feature P of NNUE evaluation function
-namespace Eval::NNUE::Features {
-
-    // Feature P: PieceSquare of pieces other than balls
-    class P {
-    public:
-        // feature quantity name
-        static constexpr const char* kName = "P";
-
-        // Hash value embedded in the evaluation function file
-        static constexpr std::uint32_t kHashValue = 0x764CFB4Bu;
-
-        // number of feature dimensions
-        static constexpr IndexType kDimensions = PS_END;
-
-        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-        static constexpr IndexType kMaxActiveDimensions = 30; // Kings don't count
-
-        // Timing of full calculation instead of difference calculation
-        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
-
-        // Get a list of indices with a value of 1 among the features
-        static void append_active_indices(
-            const Position& pos,
-            Color perspective,
-            IndexList* active);
-
-        // Get a list of indices whose values ​​have changed from the previous one in the feature quantity
-        static void append_changed_indices(
-            const Position& pos,
-            Color perspective,
-            IndexList* removed,
-            IndexList* added);
-
-    private:
-        // Index of a feature for a given piece on some square
-        static IndexType make_index(Color perspective, Square s, Piece pc);
-    };
-
-}  // namespace Eval::NNUE::Features
-
-#endif
diff --git a/src/nnue/layers/sum.h b/src/nnue/layers/sum.h
deleted file mode 100644
index 01ae251c..00000000
--- a/src/nnue/layers/sum.h
+++ /dev/null
@@ -1,196 +0,0 @@
-﻿#ifndef _NNUE_LAYERS_SUM_H_
-#define _NNUE_LAYERS_SUM_H_
-
-#include "nnue/nnue_common.h"
-
-// Definition of layer Sum of NNUE evaluation function
-namespace Eval::NNUE::Layers {
-
-    // Layer that sums the output of multiple layers
-    template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
-    class Sum : public Sum<RemainingPreviousLayers...> {
-    private:
-        using Head = FirstPreviousLayer;
-        using Tail = Sum<RemainingPreviousLayers...>;
-
-     public:
-        // Input/output type
-        using InputType = typename Head::OutputType;
-
-        using OutputType = InputType;
-
-        static_assert(std::is_same<InputType, typename Tail::InputType>::value, "");
-
-        // number of input/output dimensions
-        static constexpr IndexType kInputDimensions = Head::kOutputDimensions;
-
-        static constexpr IndexType kOutputDimensions = kInputDimensions;
-
-        static_assert(kInputDimensions == Tail::kInputDimensions ,"");
-
-        // Size of forward propagation buffer used in this layer
-        static constexpr std::size_t kSelfBufferSize =
-            CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
-
-        // Size of the forward propagation buffer used from the input layer to this layer
-        static constexpr std::size_t kBufferSize =
-            std::max(Head::kBufferSize + kSelfBufferSize, Tail::kBufferSize);
-
-        static constexpr int kLayerIndex = Tail::kLayerIndex + 1;
-
-        // Hash value embedded in the evaluation function file
-        static constexpr std::uint32_t GetHashValue() {
-            std::uint32_t hash_value = 0xBCE400B4u;
-            hash_value ^= Head::GetHashValue() >> 1;
-            hash_value ^= Head::GetHashValue() << 31;
-            hash_value ^= Tail::GetHashValue() >> 2;
-            hash_value ^= Tail::GetHashValue() << 30;
-            return hash_value;
-        }
-
-        static std::string get_name() {
-             return "Sum[" +
-                std::to_string(kOutputDimensions) + "]";
-        }
-
-        // A string that represents the structure from the input layer to this layer
-        static std::string get_structure_string() {
-            return get_name() + "(" + get_summands_string() + ")";
-        }
-
-        static std::string get_layers_info() {
-            std::string info = Tail::get_layers_info();
-            info += "\n  - ";
-            info += std::to_string(kLayerIndex);
-            info += " - ";
-            info += get_name();
-            return info;
-        }
-
-        // read parameters
-        bool ReadParameters(std::istream& stream) {
-            if (!Tail::ReadParameters(stream))
-                return false;
-
-            return previous_layer_.ReadParameters(stream);
-        }
-
-        // write parameters
-        bool WriteParameters(std::ostream& stream) const {
-            if (!Tail::WriteParameters(stream))
-                return false;
-
-            return previous_layer_.WriteParameters(stream);
-        }
-
-        // forward propagation
-        const OutputType* propagate(
-            const TransformedFeatureType* transformed_features, char* buffer) const {
-
-            Tail::propagate(transformed_features, buffer);
-
-            const auto head_output = previous_layer_.Propagate(
-                transformed_features, buffer + kSelfBufferSize);
-
-            const auto output = reinterpret_cast<OutputType*>(buffer);
-
-            for (IndexType i = 0; i <kOutputDimensions; ++i) {
-                output[i] += head_output[i];
-            }
-
-            return output;
-        }
-
-    protected:
-        // A string that represents the list of layers to be summed
-        static std::string get_summands_string() {
-            return Head::get_structure_string() + "," + Tail::get_summands_string();
-        }
-
-        // Make the learning class a friend
-        friend class Trainer<Sum>;
-
-        // the layer immediately before this layer
-        FirstPreviousLayer previous_layer_;
-    };
-
-    // Layer that sums the output of multiple layers (when there is one template argument)
-    template <typename PreviousLayer>
-    class Sum<PreviousLayer> {
-    public:
-        // Input/output type
-        using InputType = typename PreviousLayer::OutputType;
-
-        using OutputType = InputType;
-
-        // number of input/output dimensions
-        static constexpr IndexType kInputDimensions =
-            PreviousLayer::kOutputDimensions;
-
-        static constexpr IndexType kOutputDimensions = kInputDimensions;
-
-        // Size of the forward propagation buffer used from the input layer to this layer
-        static constexpr std::size_t kBufferSize = PreviousLayer::kBufferSize;
-
-        static constexpr int kLayerIndex = PreviousLayer::kLayerIndex + 1;
-
-        // Hash value embedded in the evaluation function file
-        static constexpr std::uint32_t GetHashValue() {
-            std::uint32_t hash_value = 0xBCE400B4u;
-            hash_value ^= PreviousLayer::GetHashValue() >> 1;
-            hash_value ^= PreviousLayer::GetHashValue() << 31;
-            return hash_value;
-        }
-
-        static std::string get_name() {
-             return "Sum[" +
-                std::to_string(kOutputDimensions) + "]";
-        }
-
-        // A string that represents the structure from the input layer to this layer
-        static std::string get_structure_string() {
-            return get_name() + "(" + get_summands_string() + ")";
-        }
-
-        static std::string get_layers_info() {
-            std::string info = PreviousLayer::get_layers_info();
-            info += '\n';
-            info += std::to_string(kLayerIndex);
-            info += ": ";
-            info += get_name();
-            return info;
-        }
-
-        // read parameters
-        bool ReadParameters(std::istream& stream) {
-            return previous_layer_.ReadParameters(stream);
-        }
-
-        // write parameters
-        bool WriteParameters(std::ostream& stream) const {
-            return previous_layer_.WriteParameters(stream);
-        }
-
-        // forward propagation
-        const OutputType* Propagate(
-            const TransformedFeatureType* transformed_features, char* buffer) const {
-
-            return previous_layer_.Propagate(transformed_features, buffer);
-        }
-
-    protected:
-        // A string that represents the list of layers to be summed
-        static std::string get_summands_string() {
-            return PreviousLayer::get_structure_string();
-        }
-
-        // Make the learning class a friend
-        friend class Trainer<Sum>;
-
-        // the layer immediately before this layer
-        PreviousLayer previous_layer_;
-    };
-
-}  // namespace Eval::NNUE::Layers
-
-#endif

From 19f712cdbbc973fa568510177d043a48cf15438e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sun, 18 Apr 2021 20:33:17 +0200
Subject: [PATCH 563/583] Post-merge fixes.

---
 src/evaluate.cpp                    | 121 ++++++++-
 src/evaluate.h                      |  18 ++
 src/misc.cpp                        |   4 +-
 src/nnue/evaluate_nnue.cpp          | 152 +-----------
 src/nnue/evaluate_nnue.h            |  14 +-
 src/nnue/features/feature_set.h     | 300 +++-------------------
 src/nnue/features/features_common.h |   9 +-
 src/nnue/features/half_kp.cpp       |   1 +
 src/nnue/layers/affine_transform.h  |  46 +---
 src/nnue/nnue_accumulator.h         |  13 +-
 src/nnue/nnue_feature_transformer.h | 373 ++++++++++------------------
 src/position.cpp                    |   6 +-
 src/position.h                      |   4 +-
 src/search.cpp                      |   6 +-
 src/tools/convert.cpp               |   2 +-
 src/tools/convert.h                 |   2 +-
 src/tools/gensfen.cpp               |   4 +-
 src/tools/gensfen.h                 |   2 +-
 src/tools/gensfen_nonpv.cpp         |   4 +-
 src/tools/gensfen_nonpv.h           |   2 +-
 src/tools/opening_book.cpp          |   2 +-
 src/tools/opening_book.h            |   2 +-
 src/tools/packed_sfen.h             |   2 +-
 src/tools/sfen_packer.cpp           |   6 +-
 src/tools/sfen_packer.h             |  10 +-
 src/tools/sfen_reader.h             |   2 +-
 src/tools/sfen_stream.h             |   2 +-
 src/tools/sfen_writer.h             |  16 +-
 src/tools/stats.cpp                 |   4 +-
 src/tools/stats.h                   |   2 +-
 src/tools/transform.cpp             |   2 +-
 src/tools/transform.h               |   2 +-
 src/uci.cpp                         |   5 +-
 33 files changed, 372 insertions(+), 768 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index e4558730..3d9a48b7 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -38,8 +38,127 @@
 #include "uci.h"
 #include "incbin/incbin.h"
 
+// Macro to embed the default efficiently updatable neural network (NNUE) file
+// data in the engine binary (using incbin.h, by Dale Weiler).
+// This macro invocation will declare the following three variables
+//     const unsigned char        gEmbeddedNNUEData[];  // a pointer to the embedded data
+//     const unsigned char *const gEmbeddedNNUEEnd;     // a marker to the end
+//     const unsigned int         gEmbeddedNNUESize;    // the size of the embedded file
+// Note that this does not work in Microsoft Visual Studio.
+#if !defined(_MSC_VER) && !defined(NNUE_EMBEDDING_OFF)
+  INCBIN(EmbeddedNNUE, EvalFileDefaultName);
+#else
+  const unsigned char        gEmbeddedNNUEData[1] = {0x0};
+  const unsigned char *const gEmbeddedNNUEEnd = &gEmbeddedNNUEData[1];
+  const unsigned int         gEmbeddedNNUESize = 1;
+#endif
+
 using namespace std;
 
+namespace Stockfish {
+
+namespace Eval {
+
+  namespace NNUE {
+    string eval_file_loaded = "None";
+    UseNNUEMode useNNUE;
+
+    /// NNUE::init() tries to load a NNUE network at startup time, or when the engine
+    /// receives a UCI command "setoption name EvalFile value nn-[a-z0-9]{12}.nnue"
+    /// The name of the NNUE network is always retrieved from the EvalFile option.
+    /// We search the given network in three locations: internally (the default
+    /// network may be embedded in the binary), in the active working directory and
+    /// in the engine directory. Distro packagers may define the DEFAULT_NNUE_DIRECTORY
+    /// variable to have the engine search in a special directory in their distro.
+
+    static UseNNUEMode nnue_mode_from_option(const UCI::Option& mode)
+    {
+      if (mode == "false")
+        return UseNNUEMode::False;
+      else if (mode == "true")
+         return UseNNUEMode::True;
+      else if (mode == "pure")
+        return UseNNUEMode::Pure;
+
+      return UseNNUEMode::False;
+    }
+
+    void init() {
+
+      useNNUE = nnue_mode_from_option(Options["Use NNUE"]);
+      if (useNNUE == UseNNUEMode::False)
+          return;
+
+      string eval_file = string(Options["EvalFile"]);
+
+      #if defined(DEFAULT_NNUE_DIRECTORY)
+      #define stringify2(x) #x
+      #define stringify(x) stringify2(x)
+      vector<string> dirs = { "<internal>" , "" , CommandLine::binaryDirectory , stringify(DEFAULT_NNUE_DIRECTORY) };
+      #else
+      vector<string> dirs = { "<internal>" , "" , CommandLine::binaryDirectory };
+      #endif
+
+      for (string directory : dirs)
+          if (eval_file_loaded != eval_file)
+          {
+              if (directory != "<internal>")
+              {
+                  ifstream stream(directory + eval_file, ios::binary);
+                  if (load_eval(eval_file, stream))
+                      eval_file_loaded = eval_file;
+              }
+
+              if (directory == "<internal>" && eval_file == EvalFileDefaultName)
+              {
+                  // C++ way to prepare a buffer for a memory stream
+                  class MemoryBuffer : public basic_streambuf<char> {
+                      public: MemoryBuffer(char* p, size_t n) { setg(p, p, p + n); setp(p, p + n); }
+                  };
+
+                  MemoryBuffer buffer(const_cast<char*>(reinterpret_cast<const char*>(gEmbeddedNNUEData)),
+                                      size_t(gEmbeddedNNUESize));
+
+                  istream stream(&buffer);
+                  if (load_eval(eval_file, stream))
+                      eval_file_loaded = eval_file;
+              }
+          }
+    }
+
+    /// NNUE::verify() verifies that the last net used was loaded successfully
+    void verify() {
+
+      string eval_file = string(Options["EvalFile"]);
+
+      if (useNNUE != UseNNUEMode::False && eval_file_loaded != eval_file)
+      {
+          UCI::OptionsMap defaults;
+          UCI::init(defaults);
+
+          string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
+          string msg2 = "The option is set to true, but the network file " + eval_file + " was not loaded successfully.";
+          string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
+          string msg4 = "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/" + string(defaults["EvalFile"]);
+          string msg5 = "The engine will be terminated now.";
+
+          sync_cout << "info string ERROR: " << msg1 << sync_endl;
+          sync_cout << "info string ERROR: " << msg2 << sync_endl;
+          sync_cout << "info string ERROR: " << msg3 << sync_endl;
+          sync_cout << "info string ERROR: " << msg4 << sync_endl;
+          sync_cout << "info string ERROR: " << msg5 << sync_endl;
+
+          exit(EXIT_FAILURE);
+      }
+
+      if (useNNUE != UseNNUEMode::False)
+          sync_cout << "info string NNUE evaluation using " << eval_file << " enabled" << sync_endl;
+      else
+          sync_cout << "info string classical evaluation enabled" << sync_endl;
+    }
+  }
+}
+
 namespace Trace {
 
   enum Tracing { NO_TRACE, TRACE };
@@ -1021,7 +1140,7 @@ Value Eval::evaluate(const Position& pos) {
       bool lowPieceEndgame =   pos.non_pawn_material() == BishopValueMg
                             || (pos.non_pawn_material() < 2 * RookValueMg && pos.count<PAWN>() < 2);
 
-      v = classical || lowPieceEndgame ? Evaluation<NO_TRACE>(pos).value() 
+      v = classical || lowPieceEndgame ? Evaluation<NO_TRACE>(pos).value()
                                        : adjusted_NNUE();
 
       // If the classical eval is small and imbalance large, use NNUE nevertheless.
diff --git a/src/evaluate.h b/src/evaluate.h
index f3766f45..af3453b4 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -28,6 +28,7 @@ namespace Stockfish {
 class Position;
 
 namespace Eval {
+
   std::string trace(const Position& pos);
   Value evaluate(const Position& pos);
 
@@ -36,6 +37,23 @@ namespace Eval {
   // name of the macro, as it is used in the Makefile.
   #define EvalFileDefaultName   "nn-62ef826d1a6d.nnue"
 
+  namespace NNUE {
+    enum struct UseNNUEMode
+    {
+      False,
+      True,
+      Pure
+    };
+
+    extern UseNNUEMode useNNUE;
+    extern std::string eval_file_loaded;
+
+    Value evaluate(const Position& pos);
+    bool load_eval(std::string name, std::istream& stream);
+    void init();
+    void verify();
+  }
+
 } // namespace Eval
 
 } // namespace Stockfish
diff --git a/src/misc.cpp b/src/misc.cpp
index e981136b..e47e2649 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -61,10 +61,10 @@ typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY);
 
 using namespace std;
 
-SynchronizedRegionLogger sync_region_cout(std::cout);
-
 namespace Stockfish {
 
+SynchronizedRegionLogger sync_region_cout(std::cout);
+
 namespace {
 
 /// Version number. If Version is left empty, then compile date in the format
diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index bbc6ebc2..7b2a1ae8 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -1,33 +1,21 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
   Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
-
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.
-
   Stockfish is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
-
   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 // Code for calculating NNUE evaluation function
 
-#include "evaluate_nnue.h"
-
-#include "position.h"
-#include "misc.h"
-#include "uci.h"
-#include "types.h"
-
 #include <iostream>
-#include <string>
-#include <fstream>
 #include <set>
 
 #include "../evaluate.h"
@@ -49,36 +37,18 @@ namespace Stockfish::Eval::NNUE {
   // Evaluation function file name
   std::string fileName;
 
-  // Saved evaluation function file name
-  std::string savedfileName = "nn.bin";
-
-  // Get a string that represents the structure of the evaluation function
-  std::string get_architecture_string() {
-    return "Features=" + FeatureTransformer::get_structure_string() +
-        ",Network=" + Network::get_structure_string();
-  }
-
-  std::string get_layers_info() {
-    return
-        FeatureTransformer::get_layers_info()
-        + '\n' + Network::get_layers_info();
-  }
-
-  UseNNUEMode useNNUE;
-  std::string eval_file_loaded = "None";
-
   namespace Detail {
 
   // Initialize the evaluation function parameters
   template <typename T>
-  void initialize(AlignedPtr<T>& pointer) {
+  void Initialize(AlignedPtr<T>& pointer) {
 
     pointer.reset(reinterpret_cast<T*>(std_aligned_alloc(alignof(T), sizeof(T))));
     std::memset(pointer.get(), 0, sizeof(T));
   }
 
   template <typename T>
-  void initialize(LargePagePtr<T>& pointer) {
+  void Initialize(LargePagePtr<T>& pointer) {
 
     static_assert(alignof(T) <= 4096, "aligned_large_pages_alloc() may fail for such a big alignment requirement of T");
     pointer.reset(reinterpret_cast<T*>(aligned_large_pages_alloc(sizeof(T))));
@@ -95,35 +65,17 @@ namespace Stockfish::Eval::NNUE {
     return reference.ReadParameters(stream);
   }
 
-  // write evaluation function parameters
-  template <typename T>
-  bool WriteParameters(std::ostream& stream, const AlignedPtr<T>& pointer) {
-    constexpr std::uint32_t header = T::GetHashValue();
-
-    stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
-
-    return pointer->WriteParameters(stream);
-  }
-
-  template <typename T>
-  bool WriteParameters(std::ostream& stream, const LargePagePtr<T>& pointer) {
-    constexpr std::uint32_t header = T::GetHashValue();
-
-    stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
-
-    return pointer->WriteParameters(stream);
-  }
   }  // namespace Detail
 
   // Initialize the evaluation function parameters
-  void initialize() {
+  void Initialize() {
 
-    Detail::initialize(feature_transformer);
-    Detail::initialize(network);
+    Detail::Initialize(feature_transformer);
+    Detail::Initialize(network);
   }
 
   // Read network header
-  bool read_header(std::istream& stream, std::uint32_t* hash_value, std::string* architecture)
+  bool ReadHeader(std::istream& stream, std::uint32_t* hash_value, std::string* architecture)
   {
     std::uint32_t version, size;
 
@@ -136,48 +88,18 @@ namespace Stockfish::Eval::NNUE {
     return !stream.fail();
   }
 
-  // write the header
-  bool write_header(std::ostream& stream,
-    std::uint32_t hash_value, const std::string& architecture) {
-
-    stream.write(reinterpret_cast<const char*>(&kVersion), sizeof(kVersion));
-    stream.write(reinterpret_cast<const char*>(&hash_value), sizeof(hash_value));
-
-    const std::uint32_t size = static_cast<std::uint32_t>(architecture.size());
-
-    stream.write(reinterpret_cast<const char*>(&size), sizeof(size));
-    stream.write(architecture.data(), size);
-
-    return !stream.fail();
-  }
-
   // Read network parameters
   bool ReadParameters(std::istream& stream) {
 
     std::uint32_t hash_value;
     std::string architecture;
-    if (!read_header(stream, &hash_value, &architecture)) return false;
+    if (!ReadHeader(stream, &hash_value, &architecture)) return false;
     if (hash_value != kHashValue) return false;
     if (!Detail::ReadParameters(stream, *feature_transformer)) return false;
     if (!Detail::ReadParameters(stream, *network)) return false;
     return stream && stream.peek() == std::ios::traits_type::eof();
   }
 
-  // write evaluation function parameters
-  bool WriteParameters(std::ostream& stream) {
-
-    if (!write_header(stream, kHashValue, get_architecture_string()))
-        return false;
-
-    if (!Detail::WriteParameters(stream, feature_transformer))
-        return false;
-
-    if (!Detail::WriteParameters(stream, network))
-        return false;
-
-    return !stream.fail();
-}
-
   // Evaluation function. Perform differential calculation.
   Value evaluate(const Position& pos) {
 
@@ -211,65 +133,9 @@ namespace Stockfish::Eval::NNUE {
   // Load eval, from a file stream or a memory stream
   bool load_eval(std::string name, std::istream& stream) {
 
-    initialize();
+    Initialize();
     fileName = name;
     return ReadParameters(stream);
-}
-
-static UseNNUEMode nnue_mode_from_option(const UCI::Option& mode)
-{
-  if (mode == "false")
-    return UseNNUEMode::False;
-  else if (mode == "true")
-     return UseNNUEMode::True;
-  else if (mode == "pure")
-    return UseNNUEMode::Pure;
-
-  return UseNNUEMode::False;
-}
-
-void init() {
-
-  useNNUE = nnue_mode_from_option(Options["Use NNUE"]);
-
-  if (Options["SkipLoadingEval"])
-  {
-    eval_file_loaded.clear();
-    return;
   }
 
-  if (useNNUE == UseNNUEMode::False)
-  {
-    // Keep the eval file loaded. Useful for mixed bench.
-    return;
-  }
-
-  std::string eval_file = std::string(Options["EvalFile"]);
-
-#if defined(DEFAULT_NNUE_DIRECTORY)
-#define stringify2(x) #x
-#define stringify(x) stringify2(x)
-  std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory , stringify(DEFAULT_NNUE_DIRECTORY) };
-#else
-  std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory };
-#endif
-
-  for (std::string directory : dirs)
-  {
-    if (eval_file_loaded != eval_file)
-    {
-      std::ifstream stream(directory + eval_file, std::ios::binary);
-      if (load_eval(eval_file, stream))
-      {
-        sync_cout << "info string Loaded eval file " << directory + eval_file << sync_endl;
-        eval_file_loaded = eval_file;
-      }
-      else
-      {
-        sync_cout << "info string ERROR: failed to load eval file " << directory + eval_file << sync_endl;
-        eval_file_loaded.clear();
-      }
-    }
-  }
-
-} // namespace Stockfish::Eval::NNUE
+} // namespace Stockfish::Eval::NNUE
\ No newline at end of file
diff --git a/src/nnue/evaluate_nnue.h b/src/nnue/evaluate_nnue.h
index b6070fae..010a89f7 100644
--- a/src/nnue/evaluate_nnue.h
+++ b/src/nnue/evaluate_nnue.h
@@ -1,17 +1,14 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
   Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
-
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.
-
   Stockfish is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
-
   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
@@ -23,19 +20,10 @@
 
 #include "nnue_feature_transformer.h"
 
-#include "misc.h"
-
 #include <memory>
 
 namespace Stockfish::Eval::NNUE {
 
-  enum struct UseNNUEMode
-  {
-    False,
-    True,
-    Pure
-  };
-
   // Hash value of evaluation function structure
   constexpr std::uint32_t kHashValue =
       FeatureTransformer::GetHashValue() ^ Network::GetHashValue();
@@ -65,4 +53,4 @@ namespace Stockfish::Eval::NNUE {
 
 }  // namespace Stockfish::Eval::NNUE
 
-#endif // #ifndef NNUE_EVALUATE_NNUE_H_INCLUDED
+#endif // #ifndef NNUE_EVALUATE_NNUE_H_INCLUDED
\ No newline at end of file
diff --git a/src/nnue/features/feature_set.h b/src/nnue/features/feature_set.h
index c293661b..fb25bce5 100644
--- a/src/nnue/features/feature_set.h
+++ b/src/nnue/features/feature_set.h
@@ -1,17 +1,14 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
   Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
-
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.
-
   Stockfish is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
-
   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
@@ -26,277 +23,44 @@
 
 namespace Stockfish::Eval::NNUE::Features {
 
-    // Class template that represents a list of values
-    template <typename T, T... Values>
-    struct CompileTimeList;
+  // Class template that represents a list of values
+  template <typename T, T... Values>
+  struct CompileTimeList;
 
-    template <typename T, T First, T... Remaining>
-    struct CompileTimeList<T, First, Remaining...> {
-        static constexpr bool contains(T value) {
-            return value == First || CompileTimeList<T, Remaining...>::contains(value);
-        }
+  template <typename T, T First, T... Remaining>
+  struct CompileTimeList<T, First, Remaining...> {
+    static constexpr bool Contains(T value) {
+      return value == First || CompileTimeList<T, Remaining...>::Contains(value);
+    }
+    static constexpr std::array<T, sizeof...(Remaining) + 1>
+        kValues = {{First, Remaining...}};
+  };
 
-        static constexpr std::array<T, sizeof...(Remaining) + 1>
-            kValues = {{First, Remaining...}};
-    };
+  // Base class of feature set
+  template <typename Derived>
+  class FeatureSetBase {
 
-    template <typename T, T First, T... Remaining>
-    constexpr std::array<T, sizeof...(Remaining) + 1>
-        CompileTimeList<T, First, Remaining...>::kValues;
+  };
 
-    template <typename T>
-    struct CompileTimeList<T> {
-        static constexpr bool contains(T /*value*/) {
-            return false;
-        }
-        static constexpr std::array<T, 0> kValues = { {} };
-    };
+  // Class template that represents the feature set
+  template <typename FeatureType>
+  class FeatureSet<FeatureType> : public FeatureSetBase<FeatureSet<FeatureType>> {
 
-    // Class template that adds to the beginning of the list
-    template <typename T, typename ListType, T Value>
-    struct AppendToList;
+   public:
+    // Hash value embedded in the evaluation file
+    static constexpr std::uint32_t kHashValue = FeatureType::kHashValue;
+    // Number of feature dimensions
+    static constexpr IndexType kDimensions = FeatureType::kDimensions;
+    // Maximum number of simultaneously active features
+    static constexpr IndexType kMaxActiveDimensions =
+        FeatureType::kMaxActiveDimensions;
+    // Trigger for full calculation instead of difference calculation
+    using SortedTriggerSet =
+        CompileTimeList<TriggerEvent, FeatureType::kRefreshTrigger>;
+    static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
 
-    template <typename T, T... Values, T AnotherValue>
-    struct AppendToList<T, CompileTimeList<T, Values...>, AnotherValue> {
-        using Result = CompileTimeList<T, AnotherValue, Values...>;
-    };
-
-    // Class template for adding to a sorted, unique list
-    template <typename T, typename ListType, T Value>
-    struct InsertToSet;
-
-    template <typename T, T First, T... Remaining, T AnotherValue>
-    struct InsertToSet<T, CompileTimeList<T, First, Remaining...>, AnotherValue> {
-        using Result =
-            std::conditional_t<
-                CompileTimeList<T, First, Remaining...>::contains(AnotherValue),
-                CompileTimeList<T, First, Remaining...>,
-                std::conditional_t<
-                    (AnotherValue < First),
-                    CompileTimeList<T, AnotherValue, First, Remaining...>,
-                    typename AppendToList<T, typename InsertToSet<
-                        T, CompileTimeList<T, Remaining...>, AnotherValue>::Result,
-                        First
-                    >::Result
-                >
-            >;
-    };
-
-    template <typename T, T Value>
-    struct InsertToSet<T, CompileTimeList<T>, Value> {
-        using Result = CompileTimeList<T, Value>;
-    };
-
-    // Base class of feature set
-    template <typename Derived>
-    class FeatureSetBase {
-
-       public:
-        // Get a list of indices for active features
-        template <typename IndexListType>
-        static void append_active_indices(
-            const Position& pos, TriggerEvent trigger, IndexListType active[2]) {
-
-            for (Color perspective : { WHITE, BLACK }) {
-                Derived::collect_active_indices(
-                    pos, trigger, perspective, &active[perspective]);
-            }
-        }
-
-        // Get a list of indices for recently changed features
-        template <typename PositionType, typename IndexListType>
-        static void append_changed_indices(
-            const PositionType& pos,
-            TriggerEvent trigger,
-            IndexListType removed[2],
-            IndexListType added[2],
-            bool reset[2]) {
-
-            const auto& dp = pos.state()->dirtyPiece;
-
-            for (Color perspective : { WHITE, BLACK }) {
-                switch (trigger) {
-                    case TriggerEvent::kNone:
-                        break;
-                    case TriggerEvent::kFriendKingMoved:
-                        if (dp.dirty_num == 0) continue;
-                        reset[perspective] = dp.piece[0] == make_piece(perspective, KING);
-                        break;
-                    case TriggerEvent::kEnemyKingMoved:
-                        if (dp.dirty_num == 0) continue;
-                        reset[perspective] = dp.piece[0] == make_piece(~perspective, KING);
-                        break;
-                    case TriggerEvent::kAnyKingMoved:
-                        if (dp.dirty_num == 0) continue;
-                        reset[perspective] = type_of(dp.piece[0]) == KING;
-                        break;
-                    case TriggerEvent::kAnyPieceMoved:
-                        reset[perspective] = true;
-                        break;
-                    default:
-                        assert(false);
-                        break;
-                }
-
-                if (reset[perspective]) {
-                    Derived::collect_active_indices(
-                        pos, trigger, perspective, &added[perspective]);
-                } else {
-                    Derived::collect_changed_indices(
-                        pos, trigger, perspective,
-                        &removed[perspective], &added[perspective]);
-                }
-            }
-        }
-    };
-
-    // Class template that represents the feature set
-    // do internal processing in reverse order of template arguments in order to linearize the amount of calculation at runtime
-    template <typename FirstFeatureType, typename... RemainingFeatureTypes>
-    class FeatureSet<FirstFeatureType, RemainingFeatureTypes...> :
-      public FeatureSetBase<
-          FeatureSet<FirstFeatureType, RemainingFeatureTypes...>
-      > {
-
-    private:
-        using Head = FirstFeatureType;
-        using Tail = FeatureSet<RemainingFeatureTypes...>;
-
-    public:
-        // Hash value embedded in the evaluation function file
-        static constexpr std::uint32_t kHashValue =
-            Head::kHashValue ^ (Tail::kHashValue << 1) ^ (Tail::kHashValue >> 31);
-
-        // number of feature dimensions
-        static constexpr IndexType kDimensions =
-            Head::kDimensions + Tail::kDimensions;
-
-        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
-        static constexpr IndexType kMaxActiveDimensions =
-            Head::kMaxActiveDimensions + Tail::kMaxActiveDimensions;
-
-        // List of timings to perform all calculations instead of difference calculation
-        using SortedTriggerSet = typename InsertToSet<TriggerEvent,
-            typename Tail::SortedTriggerSet, Head::kRefreshTrigger>::Result;
-
-        static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
-
-        // Get the feature quantity name
-        static std::string get_name() {
-            return std::string(Head::kName) + "+" + Tail::get_name();
-        }
-
-    private:
-        // Get a list of indices with a value of 1 among the features
-        template <typename IndexListType>
-        static void collect_active_indices(
-            const Position& pos,
-            const TriggerEvent trigger,
-            const Color perspective,
-            IndexListType* const active) {
-
-            Tail::collect_active_indices(pos, trigger, perspective, active);
-            if (Head::kRefreshTrigger == trigger) {
-                const auto start = active->size();
-                Head::append_active_indices(pos, perspective, active);
-
-                for (auto i = start; i < active->size(); ++i) {
-                    (*active)[i] += Tail::kDimensions;
-                }
-            }
-        }
-
-        // Get a list of indices whose values have changed from the previous one in the feature quantity
-        template <typename IndexListType>
-        static void collect_changed_indices(
-            const Position& pos,
-            const TriggerEvent trigger,
-            const Color perspective,
-            IndexListType* const removed,
-            IndexListType* const added) {
-
-            Tail::collect_changed_indices(pos, trigger, perspective, removed, added);
-            if (Head::kRefreshTrigger == trigger) {
-                const auto start_removed = removed->size();
-                const auto start_added = added->size();
-                Head::append_changed_indices(pos, perspective, removed, added);
-
-                for (auto i = start_removed; i < removed->size(); ++i) {
-                    (*removed)[i] += Tail::kDimensions;
-                }
-
-                for (auto i = start_added; i < added->size(); ++i) {
-                    (*added)[i] += Tail::kDimensions;
-                }
-            }
-        }
-
-        // Make the base class and the class template that recursively uses itself a friend
-        friend class FeatureSetBase<FeatureSet>;
-
-        template <typename... FeatureTypes>
-        friend class FeatureSet;
-    };
-
-    // Class template that represents the feature set
-    template <typename FeatureType>
-    class FeatureSet<FeatureType> : public FeatureSetBase<FeatureSet<FeatureType>> {
-
-    public:
-        // Hash value embedded in the evaluation file
-        static constexpr std::uint32_t kHashValue = FeatureType::kHashValue;
-
-        // Number of feature dimensions
-        static constexpr IndexType kDimensions = FeatureType::kDimensions;
-
-        // Maximum number of simultaneously active features
-        static constexpr IndexType kMaxActiveDimensions =
-            FeatureType::kMaxActiveDimensions;
-
-        // Trigger for full calculation instead of difference calculation
-        using SortedTriggerSet =
-            CompileTimeList<TriggerEvent, FeatureType::kRefreshTrigger>;
-
-        static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
-
-        // Get the feature quantity name
-        static std::string get_name() {
-            return FeatureType::kName;
-        }
-
-    private:
-        // Get a list of indices for active features
-        static void collect_active_indices(
-            const Position& pos,
-            const TriggerEvent trigger,
-            const Color perspective,
-            IndexList* const active) {
-
-            if (FeatureType::kRefreshTrigger == trigger) {
-              FeatureType::append_active_indices(pos, perspective, active);
-            }
-        }
-
-        // Get a list of indices for recently changed features
-        static void collect_changed_indices(
-            const Position& pos,
-            const TriggerEvent trigger,
-            const Color perspective,
-            IndexList* const removed,
-            IndexList* const added) {
-
-            if (FeatureType::kRefreshTrigger == trigger) {
-              FeatureType::append_changed_indices(pos, perspective, removed, added);
-            }
-        }
-
-        // Make the base class and the class template that recursively uses itself a friend
-        friend class FeatureSetBase<FeatureSet>;
-
-        template <typename... FeatureTypes>
-        friend class FeatureSet;
-    };
+  };
 
 }  // namespace Stockfish::Eval::NNUE::Features
 
-#endif // #ifndef NNUE_FEATURE_SET_H_INCLUDED
+#endif // #ifndef NNUE_FEATURE_SET_H_INCLUDED
\ No newline at end of file
diff --git a/src/nnue/features/features_common.h b/src/nnue/features/features_common.h
index 0e2c0a84..118ec953 100644
--- a/src/nnue/features/features_common.h
+++ b/src/nnue/features/features_common.h
@@ -33,16 +33,11 @@ namespace Stockfish::Eval::NNUE::Features {
 
   // Trigger to perform full calculations instead of difference only
   enum class TriggerEvent {
-    kNone, // Calculate the difference whenever possible
-    kFriendKingMoved, // calculate full evaluation when own king moves
-    kEnemyKingMoved, // calculate full evaluation when opponent king moves
-    kAnyKingMoved, // calculate full evaluation when any king moves
-    kAnyPieceMoved, // always calculate full evaluation
+    kFriendKingMoved // calculate full evaluation when own king moves
   };
 
   enum class Side {
-    kFriend, // side to move
-    kEnemy, // opponent
+    kFriend // side to move
   };
 
 }  // namespace Stockfish::Eval::NNUE::Features
diff --git a/src/nnue/features/half_kp.cpp b/src/nnue/features/half_kp.cpp
index 612b42bd..05eb1a9a 100644
--- a/src/nnue/features/half_kp.cpp
+++ b/src/nnue/features/half_kp.cpp
@@ -80,6 +80,7 @@ namespace Stockfish::Eval::NNUE::Features {
       if (dp.to[i] != SQ_NONE)
         added->push_back(make_index(perspective, dp.to[i], pc, ksq));
     }
+  }
 
   template class HalfKP<Side::kFriend>;
 
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index b6fb5928..3e54ab7f 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -1,17 +1,14 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
   Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
-
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.
-
   Stockfish is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
-
   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
@@ -55,8 +52,6 @@ namespace Stockfish::Eval::NNUE::Layers {
     static constexpr std::size_t kBufferSize =
         PreviousLayer::kBufferSize + kSelfBufferSize;
 
-    static constexpr int kLayerIndex = PreviousLayer::kLayerIndex + 1;
-
     // Hash value embedded in the evaluation file
     static constexpr std::uint32_t GetHashValue() {
       std::uint32_t hash_value = 0xCC03DAE4u;
@@ -66,27 +61,6 @@ namespace Stockfish::Eval::NNUE::Layers {
       return hash_value;
     }
 
-    static std::string get_name() {
-        return "AffineTransform[" +
-            std::to_string(kOutputDimensions) + "<-" +
-            std::to_string(kInputDimensions) + "]";
-    }
-
-    // A string that represents the structure from the input layer to this layer
-    static std::string get_structure_string() {
-        return get_name() + "(" +
-            PreviousLayer::get_structure_string() + ")";
-    }
-
-    static std::string get_layers_info() {
-        std::string info = PreviousLayer::get_layers_info();
-        info += "\n  - ";
-        info += std::to_string(kLayerIndex);
-        info += " - ";
-        info += get_name();
-        return info;
-    }
-
    // Read network parameters
     bool ReadParameters(std::istream& stream) {
       if (!previous_layer_.ReadParameters(stream)) return false;
@@ -148,21 +122,6 @@ namespace Stockfish::Eval::NNUE::Layers {
       return !stream.fail();
     }
 
-    // write parameters
-    bool WriteParameters(std::ostream& stream) const {
-        if (!previous_layer_.WriteParameters(stream))
-            return false;
-
-        stream.write(reinterpret_cast<const char*>(biases_),
-            kOutputDimensions * sizeof(BiasType));
-
-        stream.write(reinterpret_cast<const char*>(weights_),
-            kOutputDimensions * kPaddedInputDimensions *
-            sizeof(WeightType));
-
-        return !stream.fail();
-    }
-
     // Forward propagation
     const OutputType* Propagate(
         const TransformedFeatureType* transformed_features, char* buffer) const {
@@ -474,9 +433,6 @@ namespace Stockfish::Eval::NNUE::Layers {
     using BiasType = OutputType;
     using WeightType = std::int8_t;
 
-    // Make the learning class a friend
-    friend class Trainer<AffineTransform>;
-
     PreviousLayer previous_layer_;
 
     alignas(kCacheLineSize) BiasType biases_[kOutputDimensions];
@@ -502,4 +458,4 @@ namespace Stockfish::Eval::NNUE::Layers {
 
 }  // namespace Stockfish::Eval::NNUE::Layers
 
-#endif // #ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
+#endif // #ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
\ No newline at end of file
diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h
index e84e26f1..5da4ecb5 100644
--- a/src/nnue/nnue_accumulator.h
+++ b/src/nnue/nnue_accumulator.h
@@ -1,17 +1,14 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
   Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
-
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.
-
   Stockfish is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
-
   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
@@ -25,12 +22,16 @@
 
 namespace Stockfish::Eval::NNUE {
 
+  // The accumulator of a StateInfo without parent is set to the INIT state
+  enum AccumulatorState { EMPTY, COMPUTED, INIT };
+
   // Class that holds the result of affine transformation of input features
   struct alignas(kCacheLineSize) Accumulator {
-      std::int16_t accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];
-      bool computed_accumulation;
+    std::int16_t
+        accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];
+    AccumulatorState state[2];
   };
 
 }  // namespace Stockfish::Eval::NNUE
 
-#endif // NNUE_ACCUMULATOR_H_INCLUDED
+#endif // NNUE_ACCUMULATOR_H_INCLUDED
\ No newline at end of file
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 3082c1df..b5c8f40e 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -1,17 +1,14 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
   Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
-
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.
-
   Stockfish is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
-
   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
@@ -25,8 +22,7 @@
 #include "nnue_architecture.h"
 #include "features/index_list.h"
 
-#include <cstring>
-#include <string>
+#include <cstring> // std::memset()
 
 namespace Stockfish::Eval::NNUE {
 
@@ -41,7 +37,6 @@ namespace Stockfish::Eval::NNUE {
   #define vec_store(a,b) _mm512_store_si512(a,b)
   #define vec_add_16(a,b) _mm512_add_epi16(a,b)
   #define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
-  #define vec_zero _mm512_setzero_si512()
   static constexpr IndexType kNumRegs = 8; // only 8 are needed
 
   #elif USE_AVX2
@@ -50,7 +45,6 @@ namespace Stockfish::Eval::NNUE {
   #define vec_store(a,b) _mm256_store_si256(a,b)
   #define vec_add_16(a,b) _mm256_add_epi16(a,b)
   #define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
-  #define vec_zero _mm256_setzero_si256()
   static constexpr IndexType kNumRegs = 16;
 
   #elif USE_SSE2
@@ -59,7 +53,6 @@ namespace Stockfish::Eval::NNUE {
   #define vec_store(a,b) *(a)=(b)
   #define vec_add_16(a,b) _mm_add_epi16(a,b)
   #define vec_sub_16(a,b) _mm_sub_epi16(a,b)
-  #define vec_zero _mm_setzero_si128()
   static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8;
 
   #elif USE_MMX
@@ -68,7 +61,6 @@ namespace Stockfish::Eval::NNUE {
   #define vec_store(a,b) *(a)=(b)
   #define vec_add_16(a,b) _mm_add_pi16(a,b)
   #define vec_sub_16(a,b) _mm_sub_pi16(a,b)
-  #define vec_zero _mm_setzero_si64()
   static constexpr IndexType kNumRegs = 8;
 
   #elif USE_NEON
@@ -77,7 +69,6 @@ namespace Stockfish::Eval::NNUE {
   #define vec_store(a,b) *(a)=(b)
   #define vec_add_16(a,b) vaddq_s16(a,b)
   #define vec_sub_16(a,b) vsubq_s16(a,b)
-  #define vec_zero {0}
   static constexpr IndexType kNumRegs = 16;
 
   #else
@@ -109,33 +100,12 @@ namespace Stockfish::Eval::NNUE {
     static constexpr std::size_t kBufferSize =
         kOutputDimensions * sizeof(OutputType);
 
-    static constexpr int kLayerIndex = 0;
-
     // Hash value embedded in the evaluation file
     static constexpr std::uint32_t GetHashValue() {
 
       return RawFeatures::kHashValue ^ kOutputDimensions;
     }
 
-    static std::string get_name() {
-      return RawFeatures::get_name() + "[" +
-          std::to_string(kInputDimensions) + "->" +
-          std::to_string(kHalfDimensions) + "x2]";
-    }
-
-    // a string representing the structure
-    static std::string get_structure_string() {
-      return get_name();
-    }
-
-    static std::string get_layers_info() {
-      std::string info = "  - ";
-      info += std::to_string(kLayerIndex);
-      info += " - ";
-      info += get_name();
-      return info;
-    }
-
     // Read network parameters
     bool ReadParameters(std::istream& stream) {
 
@@ -146,38 +116,11 @@ namespace Stockfish::Eval::NNUE {
       return !stream.fail();
     }
 
-    // write parameters
-    bool WriteParameters(std::ostream& stream) const {
-      stream.write(reinterpret_cast<const char*>(biases_),
-          kHalfDimensions * sizeof(BiasType));
-
-      stream.write(reinterpret_cast<const char*>(weights_),
-          kHalfDimensions * kInputDimensions * sizeof(WeightType));
-
-      return !stream.fail();
-    }
-
-    // Proceed with the difference calculation if possible
-    bool update_accumulator_if_possible(const Position& pos) const {
-
-      const auto now = pos.state();
-      if (now->accumulator.computed_accumulation)
-        return true;
-
-      const auto prev = now->previous;
-      if (prev && prev->accumulator.computed_accumulation) {
-        update_accumulator(pos);
-        return true;
-      }
-
-      return false;
-    }
-
     // Convert input features
     void Transform(const Position& pos, OutputType* output) const {
 
-      if (!update_accumulator_if_possible(pos))
-        refresh_accumulator(pos);
+      UpdateAccumulator(pos, WHITE);
+      UpdateAccumulator(pos, BLACK);
 
       const auto& accumulation = pos.state()->accumulator.accumulation;
 
@@ -221,13 +164,6 @@ namespace Stockfish::Eval::NNUE {
               &reinterpret_cast<const __m512i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
           __m512i sum1 = _mm512_load_si512(
               &reinterpret_cast<const __m512i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
-          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-              sum0 = _mm512_add_epi16(sum0, reinterpret_cast<const __m512i*>(
-                  accumulation[perspectives[p]][i])[j * 2 + 0]);
-              sum1 = _mm512_add_epi16(sum1, reinterpret_cast<const __m512i*>(
-                  accumulation[perspectives[p]][i])[j * 2 + 1]);
-          }
-
           _mm512_store_si512(&out[j], _mm512_permutexvar_epi64(kControl,
               _mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), kZero)));
         }
@@ -239,13 +175,6 @@ namespace Stockfish::Eval::NNUE {
               &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
           __m256i sum1 = _mm256_load_si256(
               &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
-          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-              sum0 = _mm256_add_epi16(sum0, reinterpret_cast<const __m256i*>(
-                  accumulation[perspectives[p]][i])[j * 2 + 0]);
-              sum1 = _mm256_add_epi16(sum1, reinterpret_cast<const __m256i*>(
-                  accumulation[perspectives[p]][i])[j * 2 + 1]);
-          }
-
           _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
               _mm256_packs_epi16(sum0, sum1), kZero), kControl));
         }
@@ -257,13 +186,6 @@ namespace Stockfish::Eval::NNUE {
               accumulation[perspectives[p]][0])[j * 2 + 0]);
           __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
               accumulation[perspectives[p]][0])[j * 2 + 1]);
-          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-            sum0 = _mm_add_epi16(sum0, reinterpret_cast<const __m128i*>(
-                accumulation[perspectives[p]][i])[j * 2 + 0]);
-            sum1 = _mm_add_epi16(sum1, reinterpret_cast<const __m128i*>(
-                accumulation[perspectives[p]][i])[j * 2 + 1]);
-          }
-
       const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
 
           _mm_store_si128(&out[j],
@@ -284,13 +206,6 @@ namespace Stockfish::Eval::NNUE {
               accumulation[perspectives[p]][0])[j * 2 + 0]);
           __m64 sum1 = *(&reinterpret_cast<const __m64*>(
               accumulation[perspectives[p]][0])[j * 2 + 1]);
-          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-              sum0 = _mm_add_pi16(sum0, reinterpret_cast<const __m64*>(
-                  accumulation[perspectives[p]][i])[j * 2 + 0]);
-              sum1 = _mm_add_pi16(sum1, reinterpret_cast<const __m64*>(
-                  accumulation[perspectives[p]][i])[j * 2 + 1]);
-          }
-
           const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
           out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
         }
@@ -300,22 +215,12 @@ namespace Stockfish::Eval::NNUE {
         for (IndexType j = 0; j < kNumChunks; ++j) {
           int16x8_t sum = reinterpret_cast<const int16x8_t*>(
               accumulation[perspectives[p]][0])[j];
-
-          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-              sum = vaddq_s16(sum, reinterpret_cast<const int16x8_t*>(
-                  accumulation[perspectives[p]][i])[j]);
-          }
-
           out[j] = vmax_s8(vqmovn_s16(sum), kZero);
         }
 
   #else
         for (IndexType j = 0; j < kHalfDimensions; ++j) {
           BiasType sum = accumulation[static_cast<int>(perspectives[p])][0][j];
-          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
-              sum += accumulation[static_cast<int>(perspectives[p])][i][j];
-          }
-
           output[offset + j] = static_cast<OutputType>(
               std::max<int>(0, std::min<int>(127, sum)));
         }
@@ -328,183 +233,177 @@ namespace Stockfish::Eval::NNUE {
     }
 
    private:
-    // Calculate cumulative value without using difference calculation
-    void refresh_accumulator(const Position& pos) const {
+    void UpdateAccumulator(const Position& pos, const Color c) const {
 
   #ifdef VECTOR
       // Gcc-10.2 unnecessarily spills AVX2 registers if this array
       // is defined in the VECTOR code below, once in each branch
       vec_t acc[kNumRegs];
   #endif
-      auto& accumulator = pos.state()->accumulator;
-      for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-        Features::IndexList active_indices[2];
-        RawFeatures::append_active_indices(pos, kRefreshTriggers[i],
-                                           active_indices);
-          for (Color perspective : { WHITE, BLACK }) {
-#ifdef VECTOR
-            for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
-              auto accTile = reinterpret_cast<vec_t*>(
-                  &accumulator.accumulation[perspective][i][j * kTileHeight]);
 
-              if (i == 0) {
-                auto biasesTile = reinterpret_cast<const vec_t*>(
-                    &biases_[j * kTileHeight]);
-                for (IndexType k = 0; k < kNumRegs; ++k)
-                  acc[k] = biasesTile[k];
-              } else {
-                for (IndexType k = 0; k < kNumRegs; ++k)
-                  acc[k] = vec_zero;
-              }
+      // Look for a usable accumulator of an earlier position. We keep track
+      // of the estimated gain in terms of features to be added/subtracted.
+      StateInfo *st = pos.state(), *next = nullptr;
+      int gain = pos.count<ALL_PIECES>() - 2;
+      while (st->accumulator.state[c] == EMPTY)
+      {
+        auto& dp = st->dirtyPiece;
+        // The first condition tests whether an incremental update is
+        // possible at all: if this side's king has moved, it is not possible.
+        static_assert(std::is_same_v<RawFeatures::SortedTriggerSet,
+              Features::CompileTimeList<Features::TriggerEvent, Features::TriggerEvent::kFriendKingMoved>>,
+              "Current code assumes that only kFriendlyKingMoved refresh trigger is being used.");
+        if (   dp.piece[0] == make_piece(c, KING)
+            || (gain -= dp.dirty_num + 1) < 0)
+          break;
+        next = st;
+        st = st->previous;
+      }
 
-              for (const auto index : active_indices[perspective]) {
-                const IndexType offset = kHalfDimensions * index + j * kTileHeight;
-                auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+      if (st->accumulator.state[c] == COMPUTED)
+      {
+        if (next == nullptr)
+          return;
 
-                for (IndexType k = 0; k < kNumRegs; ++k)
-                  acc[k] = vec_add_16(acc[k], column[k]);
-              }
+        // Update incrementally in two steps. First, we update the "next"
+        // accumulator. Then, we update the current accumulator (pos.state()).
 
-              for (IndexType k = 0; k < kNumRegs; k++)
-                vec_store(&accTile[k], acc[k]);
-            }
-#else
-            if (i == 0) {
-              std::memcpy(accumulator.accumulation[perspective][i], biases_,
-                          kHalfDimensions * sizeof(BiasType));
-            } else {
-              std::memset(accumulator.accumulation[perspective][i], 0,
-                          kHalfDimensions * sizeof(BiasType));
-            }
+        // Gather all features to be updated. This code assumes HalfKP features
+        // only and doesn't support refresh triggers.
+        static_assert(std::is_same_v<Features::FeatureSet<Features::HalfKP<Features::Side::kFriend>>,
+                                     RawFeatures>);
+        Features::IndexList removed[2], added[2];
+        Features::HalfKP<Features::Side::kFriend>::AppendChangedIndices(pos,
+            next->dirtyPiece, c, &removed[0], &added[0]);
+        for (StateInfo *st2 = pos.state(); st2 != next; st2 = st2->previous)
+          Features::HalfKP<Features::Side::kFriend>::AppendChangedIndices(pos,
+              st2->dirtyPiece, c, &removed[1], &added[1]);
 
-            for (const auto index : active_indices[perspective]) {
-              const IndexType offset = kHalfDimensions * index;
-
-              for (IndexType j = 0; j < kHalfDimensions; ++j)
-                accumulator.accumulation[perspective][i][j] += weights_[offset + j];
-            }
-#endif
-          }
-
-        }
-
-#if defined(USE_MMX)
-        _mm_empty();
-#endif
-
-        accumulator.computed_accumulation = true;
-    }
-
-    // Calculate cumulative value using difference calculation
-    void update_accumulator(const Position& pos) const {
+        // Mark the accumulators as computed.
+        next->accumulator.state[c] = COMPUTED;
+        pos.state()->accumulator.state[c] = COMPUTED;
 
+        // Now update the accumulators listed in info[], where the last element is a sentinel.
+        StateInfo *info[3] =
+          { next, next == pos.state() ? nullptr : pos.state(), nullptr };
   #ifdef VECTOR
-      // Gcc-10.2 unnecessarily spills AVX2 registers if this array
-      // is defined in the VECTOR code below, once in each branch
-      vec_t acc[kNumRegs];
-  #endif
-    const auto& prev_accumulator = pos.state()->previous->accumulator;
-    auto& accumulator = pos.state()->accumulator;
-    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
-      Features::IndexList removed_indices[2], added_indices[2];
-      bool reset[2] = { false, false };
-      RawFeatures::append_changed_indices(pos, kRefreshTriggers[i],
-                                          removed_indices, added_indices, reset);
-
-#ifdef VECTOR
-      for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
-        for (Color perspective : { WHITE, BLACK }) {
+        for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j)
+        {
+          // Load accumulator
           auto accTile = reinterpret_cast<vec_t*>(
-              &accumulator.accumulation[perspective][i][j * kTileHeight]);
-
-          if (reset[perspective]) {
-            if (i == 0) {
-              auto biasesTile = reinterpret_cast<const vec_t*>(
-                  &biases_[j * kTileHeight]);
-              for (IndexType k = 0; k < kNumRegs; ++k)
-                acc[k] = biasesTile[k];
-            } else {
-              for (IndexType k = 0; k < kNumRegs; ++k)
-                acc[k] = vec_zero;
-            }
-          } else {
-            auto prevAccTile = reinterpret_cast<const vec_t*>(
-                &prev_accumulator.accumulation[perspective][i][j * kTileHeight]);
-
-            for (IndexType k = 0; k < kNumRegs; ++k)
-              acc[k] = vec_load(&prevAccTile[k]);
+            &st->accumulator.accumulation[c][0][j * kTileHeight]);
+          for (IndexType k = 0; k < kNumRegs; ++k)
+            acc[k] = vec_load(&accTile[k]);
 
+          for (IndexType i = 0; info[i]; ++i)
+          {
             // Difference calculation for the deactivated features
-            for (const auto index : removed_indices[perspective]) {
+            for (const auto index : removed[i])
+            {
               const IndexType offset = kHalfDimensions * index + j * kTileHeight;
               auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
-
               for (IndexType k = 0; k < kNumRegs; ++k)
                 acc[k] = vec_sub_16(acc[k], column[k]);
             }
-          }
 
-          { // Difference calculation for the activated features
-            for (const auto index : added_indices[perspective]) {
+            // Difference calculation for the activated features
+            for (const auto index : added[i])
+            {
               const IndexType offset = kHalfDimensions * index + j * kTileHeight;
               auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
-
               for (IndexType k = 0; k < kNumRegs; ++k)
                 acc[k] = vec_add_16(acc[k], column[k]);
             }
+
+            // Store accumulator
+            accTile = reinterpret_cast<vec_t*>(
+              &info[i]->accumulator.accumulation[c][0][j * kTileHeight]);
+            for (IndexType k = 0; k < kNumRegs; ++k)
+              vec_store(&accTile[k], acc[k]);
+          }
+        }
+
+  #else
+        for (IndexType i = 0; info[i]; ++i)
+        {
+          std::memcpy(info[i]->accumulator.accumulation[c][0],
+              st->accumulator.accumulation[c][0],
+              kHalfDimensions * sizeof(BiasType));
+          st = info[i];
+
+          // Difference calculation for the deactivated features
+          for (const auto index : removed[i])
+          {
+            const IndexType offset = kHalfDimensions * index;
+
+            for (IndexType j = 0; j < kHalfDimensions; ++j)
+              st->accumulator.accumulation[c][0][j] -= weights_[offset + j];
           }
 
+          // Difference calculation for the activated features
+          for (const auto index : added[i])
+          {
+            const IndexType offset = kHalfDimensions * index;
+
+            for (IndexType j = 0; j < kHalfDimensions; ++j)
+              st->accumulator.accumulation[c][0][j] += weights_[offset + j];
+          }
+        }
+  #endif
+      }
+      else
+      {
+        // Refresh the accumulator
+        auto& accumulator = pos.state()->accumulator;
+        accumulator.state[c] = COMPUTED;
+        Features::IndexList active;
+        Features::HalfKP<Features::Side::kFriend>::AppendActiveIndices(pos, c, &active);
+
+  #ifdef VECTOR
+        for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j)
+        {
+          auto biasesTile = reinterpret_cast<const vec_t*>(
+              &biases_[j * kTileHeight]);
           for (IndexType k = 0; k < kNumRegs; ++k)
+            acc[k] = biasesTile[k];
+
+          for (const auto index : active)
+          {
+            const IndexType offset = kHalfDimensions * index + j * kTileHeight;
+            auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
+
+            for (unsigned k = 0; k < kNumRegs; ++k)
+              acc[k] = vec_add_16(acc[k], column[k]);
+          }
+
+          auto accTile = reinterpret_cast<vec_t*>(
+              &accumulator.accumulation[c][0][j * kTileHeight]);
+          for (unsigned k = 0; k < kNumRegs; k++)
             vec_store(&accTile[k], acc[k]);
         }
+
+  #else
+        std::memcpy(accumulator.accumulation[c][0], biases_,
+            kHalfDimensions * sizeof(BiasType));
+
+        for (const auto index : active)
+        {
+          const IndexType offset = kHalfDimensions * index;
+
+          for (IndexType j = 0; j < kHalfDimensions; ++j)
+            accumulator.accumulation[c][0][j] += weights_[offset + j];
+        }
+  #endif
       }
-#if defined(USE_MMX)
+
+  #if defined(USE_MMX)
       _mm_empty();
-#endif
-
-#else
-      for (Color perspective : { WHITE, BLACK }) {
-
-        if (reset[perspective]) {
-          if (i == 0) {
-            std::memcpy(accumulator.accumulation[perspective][i], biases_,
-                        kHalfDimensions * sizeof(BiasType));
-          } else {
-            std::memset(accumulator.accumulation[perspective][i], 0,
-                        kHalfDimensions * sizeof(BiasType));
-          }
-        } else {
-          std::memcpy(accumulator.accumulation[perspective][i],
-                      prev_accumulator.accumulation[perspective][i],
-                      kHalfDimensions * sizeof(BiasType));
-          // Difference calculation for the deactivated features
-          for (const auto index : removed_indices[perspective]) {
-            const IndexType offset = kHalfDimensions * index;
-
-            for (IndexType j = 0; j < kHalfDimensions; ++j)
-              accumulator.accumulation[perspective][i][j] -= weights_[offset + j];
-          }
-        }
-        { // Difference calculation for the activated features
-          for (const auto index : added_indices[perspective]) {
-            const IndexType offset = kHalfDimensions * index;
-
-            for (IndexType j = 0; j < kHalfDimensions; ++j)
-              accumulator.accumulation[perspective][i][j] += weights_[offset + j];
-          }
-        }
-      }
-#endif
-      }
-      accumulator.computed_accumulation = true;
+  #endif
     }
 
     using BiasType = std::int16_t;
     using WeightType = std::int16_t;
 
-    // Make the learning class a friend
-    friend class Trainer<FeatureTransformer>;
-
     alignas(kCacheLineSize) BiasType biases_[kHalfDimensions];
     alignas(kCacheLineSize)
         WeightType weights_[kHalfDimensions * kInputDimensions];
@@ -512,4 +411,4 @@ namespace Stockfish::Eval::NNUE {
 
 }  // namespace Stockfish::Eval::NNUE
 
-#endif // #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
+#endif // #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
\ No newline at end of file
diff --git a/src/position.cpp b/src/position.cpp
index 57bc7c7b..c515f253 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -707,7 +707,8 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
   ++st->pliesFromNull;
 
   // Used by NNUE
-  st->accumulator.computed_accumulation = false;
+  st->accumulator.state[WHITE] = Eval::NNUE::EMPTY;
+  st->accumulator.state[BLACK] = Eval::NNUE::EMPTY;
   auto& dp = st->dirtyPiece;
   dp.dirty_num = 1;
 
@@ -1006,7 +1007,8 @@ void Position::do_null_move(StateInfo& newSt) {
   st = &newSt;
 
   // Used by NNUE
-  st->accumulator.computed_accumulation = false;
+  st->accumulator.state[WHITE] = Eval::NNUE::EMPTY;
+  st->accumulator.state[BLACK] = Eval::NNUE::EMPTY;
   auto& dp = st->dirtyPiece;
   dp.dirty_num = 0;
 
diff --git a/src/position.h b/src/position.h
index 27a0a596..e816c541 100644
--- a/src/position.h
+++ b/src/position.h
@@ -197,7 +197,7 @@ public:
   //static std::string sfen_from_rawdata(Piece board[81], Hand hands[2], Color turn, int gamePly);
 
   // Returns the position of the ball on the c side.
-  Square king_square(Color c) const { return pieceList[make_piece(c, KING)][0]; }
+  Square king_square(Color c) const { return lsb(pieces(c, KING)); }
 
 private:
   // Initialization helpers (used while setting up a position)
@@ -445,6 +445,8 @@ inline StateInfo* Position::state() const {
   return st;
 }
 
+static const char* const StartFEN = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1";
+
 } // namespace Stockfish
 
 #endif // #ifndef POSITION_H_INCLUDED
diff --git a/src/search.cpp b/src/search.cpp
index 6f2cfc7b..a179c1d1 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -222,7 +222,7 @@ void MainThread::search() {
   Time.init(Limits, us, rootPos.game_ply());
   TT.new_search();
 
-  Eval::NNUE::verify_eval_file_loaded();
+  Eval::NNUE::verify();
 
   if (rootMoves.empty())
   {
@@ -2143,7 +2143,7 @@ namespace Search
       return { mated_in(/*ss->ply*/ 0 + 1), {} };
     }
 
-    auto bestValue = ::qsearch<PV>(pos, ss, -VALUE_INFINITE, VALUE_INFINITE, 0);
+    auto bestValue = Stockfish::qsearch<PV>(pos, ss, -VALUE_INFINITE, VALUE_INFINITE, 0);
 
     // Returns the PV obtained.
     std::vector<Move> pvs;
@@ -2249,7 +2249,7 @@ namespace Search
         while (true)
         {
           Depth adjustedDepth = std::max(1, rootDepth);
-          bestValue = ::search<PV>(pos, ss, alpha, beta, adjustedDepth, false);
+          bestValue = Stockfish::search<PV>(pos, ss, alpha, beta, adjustedDepth, false);
 
           stable_sort(rootMoves.begin() + pvIdx, rootMoves.end());
           //my_stable_sort(pos.this_thread()->thread_id(),&rootMoves[0] + pvIdx, rootMoves.size() - pvIdx);
diff --git a/src/tools/convert.cpp b/src/tools/convert.cpp
index 8d8af4bb..0be7070c 100644
--- a/src/tools/convert.cpp
+++ b/src/tools/convert.cpp
@@ -28,7 +28,7 @@
 
 using namespace std;
 
-namespace Tools
+namespace Stockfish::Tools
 {
     bool fen_is_ok(Position& pos, std::string input_fen) {
         std::string pos_fen = pos.fen();
diff --git a/src/tools/convert.h b/src/tools/convert.h
index 9d628540..0bd9ee23 100644
--- a/src/tools/convert.h
+++ b/src/tools/convert.h
@@ -5,7 +5,7 @@
 #include <string>
 #include <sstream>
 
-namespace Tools {
+namespace Stockfish::Tools {
     void convert(std::istringstream& is);
 
     void convert_bin_from_pgn_extract(std::istringstream& is);
diff --git a/src/tools/gensfen.cpp b/src/tools/gensfen.cpp
index 7021648a..1e721a3e 100644
--- a/src/tools/gensfen.cpp
+++ b/src/tools/gensfen.cpp
@@ -34,7 +34,7 @@
 
 using namespace std;
 
-namespace Tools
+namespace Stockfish::Tools
 {
     // Class to generate sfen with multiple threads
     struct Gensfen
@@ -962,7 +962,7 @@ namespace Tools
             << "  - draw by insuff. mat.   = " << params.detect_draw_by_insufficient_mating_material << endl;
 
         // Show if the training data generator uses NNUE.
-        Eval::NNUE::verify_eval_file_loaded();
+        Eval::NNUE::verify();
 
         Threads.main()->ponder = false;
 
diff --git a/src/tools/gensfen.h b/src/tools/gensfen.h
index 13eb0880..a8505474 100644
--- a/src/tools/gensfen.h
+++ b/src/tools/gensfen.h
@@ -5,7 +5,7 @@
 
 #include <sstream>
 
-namespace Tools {
+namespace Stockfish::Tools {
 
     // Automatic generation of teacher position
     void gensfen(std::istringstream& is);
diff --git a/src/tools/gensfen_nonpv.cpp b/src/tools/gensfen_nonpv.cpp
index 7edf9a33..9775f80e 100644
--- a/src/tools/gensfen_nonpv.cpp
+++ b/src/tools/gensfen_nonpv.cpp
@@ -34,7 +34,7 @@
 
 using namespace std;
 
-namespace Tools
+namespace Stockfish::Tools
 {
     // Class to generate sfen with multiple threads
     struct GensfenNonPv
@@ -476,7 +476,7 @@ namespace Tools
             << "  - count                  = " << count << endl;
 
         // Show if the training data generator uses NNUE.
-        Eval::NNUE::verify_eval_file_loaded();
+        Eval::NNUE::verify();
 
         Threads.main()->ponder = false;
 
diff --git a/src/tools/gensfen_nonpv.h b/src/tools/gensfen_nonpv.h
index 31229d5e..842dd70b 100644
--- a/src/tools/gensfen_nonpv.h
+++ b/src/tools/gensfen_nonpv.h
@@ -3,7 +3,7 @@
 
 #include <sstream>
 
-namespace Tools {
+namespace Stockfish::Tools {
 
     // Automatic generation of teacher position
     void gensfen_nonpv(std::istringstream& is);
diff --git a/src/tools/opening_book.cpp b/src/tools/opening_book.cpp
index 3d3842ef..63ff7ed1 100644
--- a/src/tools/opening_book.cpp
+++ b/src/tools/opening_book.cpp
@@ -2,7 +2,7 @@
 
 #include <fstream>
 
-namespace Tools {
+namespace Stockfish::Tools {
 
     EpdOpeningBook::EpdOpeningBook(const std::string& file, PRNG& prng) :
         OpeningBook(file)
diff --git a/src/tools/opening_book.h b/src/tools/opening_book.h
index 562be0f9..8323ed10 100644
--- a/src/tools/opening_book.h
+++ b/src/tools/opening_book.h
@@ -13,7 +13,7 @@
 #include <memory>
 #include <mutex>
 
-namespace Tools {
+namespace Stockfish::Tools {
 
     struct OpeningBook {
 
diff --git a/src/tools/packed_sfen.h b/src/tools/packed_sfen.h
index 8080200f..969405ea 100644
--- a/src/tools/packed_sfen.h
+++ b/src/tools/packed_sfen.h
@@ -4,7 +4,7 @@
 #include <vector>
 #include <cstdint>
 
-namespace Tools {
+namespace Stockfish::Tools {
 
     // packed sfen
     struct PackedSfen { std::uint8_t data[32]; };
diff --git a/src/tools/sfen_packer.cpp b/src/tools/sfen_packer.cpp
index a51fd193..a8e1fec2 100644
--- a/src/tools/sfen_packer.cpp
+++ b/src/tools/sfen_packer.cpp
@@ -11,7 +11,7 @@
 
 using namespace std;
 
-namespace Tools {
+namespace Stockfish::Tools {
 
     // Class that handles bitstream
     // useful when doing aspect encoding
@@ -260,15 +260,11 @@ namespace Tools {
 
         pos.clear();
         std::memset(si, 0, sizeof(StateInfo));
-        std::fill_n(&pos.pieceList[0][0], sizeof(pos.pieceList) / sizeof(Square), SQ_NONE);
         pos.st = si;
 
         // Active color
         pos.sideToMove = (Color)stream.read_one_bit();
 
-        pos.pieceList[W_KING][0] = SQUARE_NB;
-        pos.pieceList[B_KING][0] = SQUARE_NB;
-
         // First the position of the ball
         for (auto c : Colors)
             pos.board[stream.read_n_bit(6)] = make_piece(c, KING);
diff --git a/src/tools/sfen_packer.h b/src/tools/sfen_packer.h
index c99d7985..42093043 100644
--- a/src/tools/sfen_packer.h
+++ b/src/tools/sfen_packer.h
@@ -7,11 +7,13 @@
 
 #include <cstdint>
 
-class Position;
-struct StateInfo;
-class Thread;
+namespace Stockfish {
+    class Position;
+    struct StateInfo;
+    class Thread;
+}
 
-namespace Tools {
+namespace Stockfish::Tools {
 
     int set_from_packed_sfen(Position& pos, const PackedSfen& sfen, StateInfo* si, Thread* th);
     PackedSfen sfen_pack(Position& pos);
diff --git a/src/tools/sfen_reader.h b/src/tools/sfen_reader.h
index 064efe53..28a6104d 100644
--- a/src/tools/sfen_reader.h
+++ b/src/tools/sfen_reader.h
@@ -17,7 +17,7 @@
 #include <thread>
 #include <functional>
 
-namespace Tools{
+namespace Stockfish::Tools{
 
     enum struct SfenReaderMode
     {
diff --git a/src/tools/sfen_stream.h b/src/tools/sfen_stream.h
index bb731457..6122e091 100644
--- a/src/tools/sfen_stream.h
+++ b/src/tools/sfen_stream.h
@@ -10,7 +10,7 @@
 #include <string>
 #include <memory>
 
-namespace Tools {
+namespace Stockfish::Tools {
 
     enum struct SfenOutputType
     {
diff --git a/src/tools/sfen_writer.h b/src/tools/sfen_writer.h
index 37c36491..10dc98d6 100644
--- a/src/tools/sfen_writer.h
+++ b/src/tools/sfen_writer.h
@@ -17,9 +17,7 @@
 #include <thread>
 #include <atomic>
 
-using namespace std;
-
-namespace Tools {
+namespace Stockfish::Tools {
 
     // Helper class for exporting Sfen
     struct SfenWriter
@@ -28,13 +26,13 @@ namespace Tools {
         static constexpr size_t SFEN_WRITE_SIZE = 5000;
 
         // File name to write and number of threads to create
-        SfenWriter(string filename_, int thread_num, uint64_t save_count, SfenOutputType sfen_output_type)
+        SfenWriter(std::string filename_, int thread_num, uint64_t save_count, SfenOutputType sfen_output_type)
         {
             sfen_buffers_pool.reserve((size_t)thread_num * 10);
             sfen_buffers.resize(thread_num);
 
             auto out = sync_region_cout.new_region();
-            out << "INFO (sfen_writer): Creating new data file at " << filename_ << endl;
+            out << "INFO (sfen_writer): Creating new data file at " << filename_ << std::endl;
 
             sfen_format = sfen_output_type;
             output_file_stream = create_new_sfen_output(filename_, sfen_format);
@@ -121,7 +119,7 @@ namespace Tools {
         {
             while (!finished || sfen_buffers_pool.size())
             {
-                vector<std::unique_ptr<PSVector>> buffers;
+                std::vector<std::unique_ptr<PSVector>> buffers;
                 {
                     std::unique_lock<std::mutex> lk(mutex);
 
@@ -157,11 +155,11 @@ namespace Tools {
                             // Rename the file and open it again.
                             // Add ios::app in consideration of overwriting.
                             // (Depending on the operation, it may not be necessary.)
-                            string new_filename = filename + "_" + std::to_string(n);
+                            std::string new_filename = filename + "_" + std::to_string(n);
                             output_file_stream = create_new_sfen_output(new_filename, sfen_format);
 
                             auto out = sync_region_cout.new_region();
-                            out << "INFO (sfen_writer): Creating new data file at " << new_filename << endl;
+                            out << "INFO (sfen_writer): Creating new data file at " << new_filename << std::endl;
                         }
                     }
                 }
@@ -182,7 +180,7 @@ namespace Tools {
         std::thread file_worker_thread;
 
         // Flag that all threads have finished
-        atomic<bool> finished;
+        std::atomic<bool> finished;
 
         SfenOutputType sfen_format;
 
diff --git a/src/tools/stats.cpp b/src/tools/stats.cpp
index c40411b9..c154fc10 100644
--- a/src/tools/stats.cpp
+++ b/src/tools/stats.cpp
@@ -25,7 +25,7 @@
 #include <mutex>
 #include <optional>
 
-namespace Tools::Stats
+namespace Stockfish::Tools::Stats
 {
     struct StatisticGathererBase
     {
@@ -398,7 +398,7 @@ namespace Tools::Stats
                 m_castling += 1;
             else if (type_of(move) == PROMOTION)
                 m_promotion += 1;
-            else if (type_of(move) == ENPASSANT)
+            else if (type_of(move) == EN_PASSANT)
                 m_enpassant += 1;
             else if (type_of(move) == NORMAL)
                 m_normal += 1;
diff --git a/src/tools/stats.h b/src/tools/stats.h
index c4a13d19..3276bab6 100644
--- a/src/tools/stats.h
+++ b/src/tools/stats.h
@@ -3,7 +3,7 @@
 
 #include <sstream>
 
-namespace Tools::Stats {
+namespace Stockfish::Tools::Stats {
 
     void gather_statistics(std::istringstream& is);
 
diff --git a/src/tools/transform.cpp b/src/tools/transform.cpp
index b3d1f94b..0b7f5a27 100644
--- a/src/tools/transform.cpp
+++ b/src/tools/transform.cpp
@@ -21,7 +21,7 @@
 #include <mutex>
 #include <optional>
 
-namespace Tools
+namespace Stockfish::Tools
 {
     using CommandFunc = void(*)(std::istringstream&);
 
diff --git a/src/tools/transform.h b/src/tools/transform.h
index f202b55c..dcaf9a1d 100644
--- a/src/tools/transform.h
+++ b/src/tools/transform.h
@@ -3,7 +3,7 @@
 
 #include <sstream>
 
-namespace Tools {
+namespace Stockfish::Tools {
 
     void transform(std::istringstream& is);
 
diff --git a/src/uci.cpp b/src/uci.cpp
index 725056bd..fdce17aa 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -45,9 +45,6 @@ namespace Stockfish {
 
 extern vector<string> setup_bench(const Position&, istream&);
 
-// FEN string of the initial position, normal chess
-const char* StartFEN = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1";
-
 namespace {
 
   // position() is called when engine receives the "position" UCI command.
@@ -93,7 +90,7 @@ namespace {
     Position p;
     p.set(pos.fen(), Options["UCI_Chess960"], &states->back(), Threads.main());
 
-    Eval::NNUE::verify_eval_file_loaded();
+    Eval::NNUE::verify();
 
     sync_cout << "\n" << Eval::trace(p) << sync_endl;
   }

From ba32bd5d70b3549ec43ae484fa6ce95f0ddf10b2 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 19 Apr 2021 18:57:21 +0200
Subject: [PATCH 564/583] Bring the changes closer to official-stockfish/master

---
 .travis.yml                                   |  31 +-
 README.md                                     | 312 +++++++++++++++---
 src/Makefile                                  |  37 +--
 src/misc.cpp                                  |  81 -----
 src/misc.h                                    |  83 +----
 src/nnue/architectures/halfkp_256x2-32-32.h   |  56 ++--
 src/nnue/evaluate_nnue.cpp                    |   5 +-
 src/nnue/evaluate_nnue.h                      |   5 +-
 src/nnue/features/feature_set.h               |   5 +-
 src/nnue/features/half_kp.cpp                 |  20 +-
 src/nnue/features/half_kp.h                   |  23 +-
 src/nnue/layers/affine_transform.h            |   5 +-
 src/nnue/layers/clipped_relu.h                |  30 --
 src/nnue/layers/input_slice.h                 |  27 --
 src/nnue/nnue_accumulator.h                   |   5 +-
 src/nnue/nnue_common.h                        |  11 -
 src/nnue/nnue_feature_transformer.h           |   5 +-
 src/position.cpp                              |   4 +-
 src/tools/convert.cpp                         |   6 +-
 ...ensfen.cpp => training_data_generator.cpp} |  34 +-
 .../{gensfen.h => training_data_generator.h}  |   2 +-
 ....cpp => training_data_generator_nonpv.cpp} |  26 +-
 ...onpv.h => training_data_generator_nonpv.h} |   2 +-
 src/uci.cpp                                   |  23 +-
 src/uci.h                                     |   2 -
 stockfish.md                                  | 242 --------------
 tests/instrumented.sh                         |  54 ++-
 tests/instrumented_learn.sh                   | 155 ---------
 28 files changed, 451 insertions(+), 840 deletions(-)
 rename src/tools/{gensfen.cpp => training_data_generator.cpp} (97%)
 rename src/tools/{gensfen.h => training_data_generator.h} (75%)
 rename src/tools/{gensfen_nonpv.cpp => training_data_generator_nonpv.cpp} (95%)
 rename src/tools/{gensfen_nonpv.h => training_data_generator_nonpv.h} (71%)
 delete mode 100644 stockfish.md
 delete mode 100755 tests/instrumented_learn.sh

diff --git a/.travis.yml b/.travis.yml
index 3a04de58..377796f7 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -7,34 +7,11 @@ matrix:
       compiler: gcc
       addons:
         apt:
-          packages: ['g++-multilib', 'valgrind', 'expect', 'curl', 'libopenblas-dev']
+          packages: ['g++-multilib', 'valgrind', 'expect', 'curl']
       env:
         - COMPILER=g++
         - COMP=gcc
 
-#    - os: linux
-#      compiler: clang
-#      addons:
-#        apt:
-#          packages: ['clang-10', 'llvm-10-dev', 'g++-multilib', 'valgrind', 'expect', 'curl', 'openblas']
-#      env:
-#        - COMPILER=clang++-10
-#        - COMP=clang
-#
-#    - os: osx
-#      osx_image: xcode12
-#      compiler: gcc
-#      env:
-#        - COMPILER=g++
-#        - COMP=gcc
-#
-#    - os: osx
-#      osx_image: xcode12
-#      compiler: clang
-#      env:
-#        - COMPILER=clang++
-#        - COMP=clang
-
 branches:
   only:
    - master
@@ -96,9 +73,3 @@ script:
   #
   - make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-undefined
   - make clean && make -j2 ARCH=x86-64-modern sanitize=thread    optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-thread
-
-  # NNUE testing
-  - export CXXFLAGS="-O1 -fno-inline"
-  - make clean && make -j2 ARCH=x86-64-modern debug=no optimize=no build > /dev/null && ../tests/instrumented_learn.sh --valgrind
-  - make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented_learn.sh --sanitizer-undefined
-  - make clean && make -j2 ARCH=x86-64-modern sanitize=thread    optimize=no debug=yes build > /dev/null && ../tests/instrumented_learn.sh --sanitizer-thread
diff --git a/README.md b/README.md
index 10d43595..6c974a78 100644
--- a/README.md
+++ b/README.md
@@ -24,54 +24,117 @@ This distribution of Stockfish consists of the following files:
   * Readme.md, the file you are currently reading.
 
   * Copying.txt, a text file containing the GNU General Public License version 3.
-  
+
   * AUTHORS, a text file with the list of authors for the project
 
-Stockfish NNUE is a port of a shogi neural network named NNUE (efficiently updateable neural network backwards) to Stockfish 11. To learn more about the Stockfish chess engine, look [here](stockfish.md) for an overview and [here](https://github.com/official-stockfish/Stockfish) for the official repository.
+  * src, a subdirectory containing the full source code, including a Makefile
+    that can be used to compile Stockfish on Unix-like systems.
 
-=======
-## Building
+  * a file with the .nnue extension, storing the neural network for the NNUE
+    evaluation. Binary distributions will have this file embedded.
 
-To compile:
-```
-make -jN ARCH=... build
-```
+## UCI options
 
-To compile with Profile Guided Optimizations. Requires that the computer that is used for compilation supports the selected `ARCH`.
-```
-make -jN ARCH=... profile-build
-```
+Currently, Stockfish has the following UCI options:
 
-`N` is the number of threads to use for compilation.
+  * #### Threads
+    The number of CPU threads used for searching a position. For best performance, set
+    this equal to the number of CPU cores available.
 
-`ARCH` is one of:
-`x86-64-vnni512`, `x86-64-vnni256`, `x86-64-avx512`, `x86-64-bmi2`, `x86-64-avx2`,
-`x86-64-sse41-popcnt`, `x86-64-modern`, `x86-64-ssse3`, `x86-64-sse3-popcnt`,
-`x86-64`, `x86-32-sse41-popcnt`, `x86-32-sse2`, `x86-32`, `ppc-64`, `ppc-32,
-armv7`, `armv7-neon`, `armv8`, `apple-silicon`, `general-64`, `general-32`.
+  * #### Hash
+    The size of the hash table in MB. It is recommended to set Hash after setting Threads.
 
-`ARCH` needs to be chosen based based on the instruction set of the CPU that will run stockfish. `x86-64-modern` will produce a binary that works on most common processors, but other options may increase performance for specific hardware.
+  * #### Clear Hash
+    Clear the hash table.
 
-Additional options:
+  * #### Ponder
+    Let Stockfish ponder its next move while the opponent is thinking.
 
-### Building Instructions for Mac
+  * #### MultiPV
+    Output the N best lines (principal variations, PVs) when searching.
+    Leave at 1 for best performance.
 
-1. Ensure that you have OpenBlas Installed
-```
-brew install openblas
-```
-2. Go to src then build using the makefile
-```
-cd src
-make build ARCH=x86-64 COMP=gcc blas=yes
-```
-or
-```
-cd src
-make profile-build ARCH=x86-64 COMP=gcc blas=yes
-```
+  * #### Use NNUE
+    Toggle between the NNUE and classical evaluation functions. If set to "true",
+    the network parameters must be available to load from file (see also EvalFile),
+    if they are not embedded in the binary.
 
-## Training Guide
+  * #### EvalFile
+    The name of the file of the NNUE evaluation parameters. Depending on the GUI the
+    filename might have to include the full path to the folder/directory that contains the file.
+    Other locations, such as the directory that contains the binary and the working directory,
+    are also searched.
+
+  * #### UCI_AnalyseMode
+    An option handled by your GUI.
+
+  * #### UCI_Chess960
+    An option handled by your GUI. If true, Stockfish will play Chess960.
+
+  * #### UCI_ShowWDL
+    If enabled, show approximate WDL statistics as part of the engine output.
+    These WDL numbers model expected game outcomes for a given evaluation and
+    game ply for engine self-play at fishtest LTC conditions (60+0.6s per game).
+
+  * #### UCI_LimitStrength
+    Enable weaker play aiming for an Elo rating as set by UCI_Elo. This option overrides Skill Level.
+
+  * #### UCI_Elo
+    If enabled by UCI_LimitStrength, aim for an engine strength of the given Elo.
+    This Elo rating has been calibrated at a time control of 60s+0.6s and anchored to CCRL 40/4.
+
+  * #### Skill Level
+    Lower the Skill Level in order to make Stockfish play weaker (see also UCI_LimitStrength).
+    Internally, MultiPV is enabled, and with a certain probability depending on the Skill Level a
+    weaker move will be played.
+
+  * #### SyzygyPath
+    Path to the folders/directories storing the Syzygy tablebase files. Multiple
+    directories are to be separated by ";" on Windows and by ":" on Unix-based
+    operating systems. Do not use spaces around the ";" or ":".
+
+    Example: `C:\tablebases\wdl345;C:\tablebases\wdl6;D:\tablebases\dtz345;D:\tablebases\dtz6`
+
+    It is recommended to store .rtbw files on an SSD. There is no loss in storing
+    the .rtbz files on a regular HD. It is recommended to verify all md5 checksums
+    of the downloaded tablebase files (`md5sum -c checksum.md5`) as corruption will
+    lead to engine crashes.
+
+  * #### SyzygyProbeDepth
+    Minimum remaining search depth for which a position is probed. Set this option
+    to a higher value to probe less aggressively if you experience too much slowdown
+    (in terms of nps) due to tablebase probing.
+
+  * #### Syzygy50MoveRule
+    Disable to let fifty-move rule draws detected by Syzygy tablebase probes count
+    as wins or losses. This is useful for ICCF correspondence games.
+
+  * #### SyzygyProbeLimit
+    Limit Syzygy tablebase probing to positions with at most this many pieces left
+    (including kings and pawns).
+
+  * #### Contempt
+    A positive value for contempt favors middle game positions and avoids draws,
+    effective for the classical evaluation only.
+
+  * #### Analysis Contempt
+    By default, contempt is set to prefer the side to move. Set this option to "White"
+    or "Black" to analyse with contempt for that side, or "Off" to disable contempt.
+
+  * #### Move Overhead
+    Assume a time delay of x ms due to network and GUI overheads. This is useful to
+    avoid losses on time in those cases.
+
+  * #### Slow Mover
+    Lower values will make Stockfish take less time in games, higher values will
+    make it think longer.
+
+  * #### nodestime
+    Tells the engine to use nodes searched instead of wall time to account for
+    elapsed time. Useful for engine testing.
+
+  * #### Debug Log File
+    Write all communication to and from the engine into a text file.
 
 ### Generating Training Data
 
@@ -81,24 +144,26 @@ To generate training data from the classic eval, use the gensfen command with th
 uci
 setoption name PruneAtShallowDepth value false
 setoption name Use NNUE value false
-setoption name Threads value x
-setoption name Hash value y
+setoption name Threads value X
+setoption name Hash value Y
 setoption name SyzygyPath value path
 isready
-gensfen depth a loop b use_draw_in_training_data_generation 1 eval_limit 32000
+gensfen depth A count B keep_draws 1 eval_limit 32000
 ```
 
-- `depth` is the searched depth per move, or how far the engine looks forward. This value is an integer.
-- `loop` is the amount of positions generated. This value is also an integer.
+- `A` is the searched depth per move, or how far the engine looks forward. This value is an integer.
+- `B` is the amount of positions generated. This value is also an integer.
 
 Specify how many threads and how much memory you would like to use with the `x` and `y` values. The option SyzygyPath is not necessary, but if you would like to use it, you must first have Syzygy endgame tablebases on your computer, which you can find [here](http://oics.olympuschess.com/tracker/index.php). You will need to have a torrent client to download these tablebases, as that is probably the fastest way to obtain them. The `path` is the path to the folder containing those tablebases. It does not have to be surrounded in quotes.
 
-This will create a file named "generated_kifu.binpack" in the same folder as the binary containing the generated training data. Once generation is done, you can rename the file to something like "1billiondepth12.binpack" to remember the depth and quantity of the positions and move it to a folder named "trainingdata" in the same directory as the binaries.
+This will create a file named "training_data.binpack" in the same folder as the binary containing the generated training data. Once generation is done, you can rename the file to something like "1billiondepth12.binpack" to remember the depth and quantity of the positions and move it to a folder named "trainingdata" in the same directory as the binaries.
 
 You will also need validation data that is used for loss calculation and accuracy computation. Validation data is generated in the same way as training data, but generally at most 1 million positions should be used as there's no need for more and it would just slow the learning process down. It may also be better to slightly increase the depth for validation data. After generation you can rename the validation data file to "val.binpack" and drop it in a folder named "validationdata" in the same directory to make it easier.
 
 ## Training data formats.
 
+Currently there are 3 training data formats. Two of them are supported directly.
+
 - `.bin` - the original training data format. Uses 40 bytes per entry. Is supported directly by the `gensfen` and `learn` commands.
 - `.plain` - a human readable training data format. This one is not supported directly by the `gensfen` and `learn` commands. It should not be used for data exchange because it's less compact than other formats. It is mostly useful for inspection of the data.
 - `.binpack` - a compact binary training data format that exploits positions chains to further reduce size. It uses on average between 2 to 3 bytes per entry when generating data with `gensfen`. It is supported directly by `gensfen` and `learn` commands. It is currently the default for the `gensfen` command. A more in depth description can be found [here](docs/binpack.md)
@@ -107,4 +172,165 @@ You will also need validation data that is used for loss calculation and accurac
 
 There is a builting converted that support all 3 formats described above. Any of them can be converted to any other. For more information and usage guide see [here](docs/convert.md).
 
-A more updated list can be found in the #sf-nnue-resources channel in the Discord.
+## A note on classical evaluation versus NNUE evaluation
+
+Both approaches assign a value to a position that is used in alpha-beta (PVS) search
+to find the best move. The classical evaluation computes this value as a function
+of various chess concepts, handcrafted by experts, tested and tuned using fishtest.
+The NNUE evaluation computes this value with a neural network based on basic
+inputs (e.g. piece positions only). The network is optimized and trained
+on the evaluations of millions of positions at moderate search depth.
+
+The NNUE evaluation was first introduced in shogi, and ported to Stockfish afterward.
+It can be evaluated efficiently on CPUs, and exploits the fact that only parts
+of the neural network need to be updated after a typical chess move.
+[The nodchip repository](https://github.com/nodchip/Stockfish) provides additional
+tools to train and develop the NNUE networks. On CPUs supporting modern vector instructions
+(avx2 and similar), the NNUE evaluation results in much stronger playing strength, even
+if the nodes per second computed by the engine is somewhat lower (roughly 80% of nps
+is typical).
+
+Notes:
+
+1) the NNUE evaluation depends on the Stockfish binary and the network parameter
+file (see the EvalFile UCI option). Not every parameter file is compatible with a given
+Stockfish binary, but the default value of the EvalFile UCI option is the name of a network
+that is guaranteed to be compatible with that binary.
+
+2) to use the NNUE evaluation, the additional data file with neural network parameters
+needs to be available. Normally, this file is already embedded in the binary or it
+can be downloaded. The filename for the default (recommended) net can be found as the default
+value of the `EvalFile` UCI option, with the format `nn-[SHA256 first 12 digits].nnue`
+(for instance, `nn-c157e0a5755b.nnue`). This file can be downloaded from
+```
+https://tests.stockfishchess.org/api/nn/[filename]
+```
+replacing `[filename]` as needed.
+
+## What to expect from the Syzygy tablebases?
+
+If the engine is searching a position that is not in the tablebases (e.g.
+a position with 8 pieces), it will access the tablebases during the search.
+If the engine reports a very large score (typically 153.xx), this means
+it has found a winning line into a tablebase position.
+
+If the engine is given a position to search that is in the tablebases, it
+will use the tablebases at the beginning of the search to preselect all
+good moves, i.e. all moves that preserve the win or preserve the draw while
+taking into account the 50-move rule.
+It will then perform a search only on those moves. **The engine will not move
+immediately**, unless there is only a single good move. **The engine likely
+will not report a mate score, even if the position is known to be won.**
+
+It is therefore clear that this behaviour is not identical to what one might
+be used to with Nalimov tablebases. There are technical reasons for this
+difference, the main technical reason being that Nalimov tablebases use the
+DTM metric (distance-to-mate), while the Syzygy tablebases use a variation of the
+DTZ metric (distance-to-zero, zero meaning any move that resets the 50-move
+counter). This special metric is one of the reasons that the Syzygy tablebases are
+more compact than Nalimov tablebases, while still storing all information
+needed for optimal play and in addition being able to take into account
+the 50-move rule.
+
+## Large Pages
+
+Stockfish supports large pages on Linux and Windows. Large pages make
+the hash access more efficient, improving the engine speed, especially
+on large hash sizes. Typical increases are 5..10% in terms of nodes per
+second, but speed increases up to 30% have been measured. The support is
+automatic. Stockfish attempts to use large pages when available and
+will fall back to regular memory allocation when this is not the case.
+
+### Support on Linux
+
+Large page support on Linux is obtained by the Linux kernel
+transparent huge pages functionality. Typically, transparent huge pages
+are already enabled, and no configuration is needed.
+
+### Support on Windows
+
+The use of large pages requires "Lock Pages in Memory" privilege. See
+[Enable the Lock Pages in Memory Option (Windows)](https://docs.microsoft.com/en-us/sql/database-engine/configure-windows/enable-the-lock-pages-in-memory-option-windows)
+on how to enable this privilege, then run [RAMMap](https://docs.microsoft.com/en-us/sysinternals/downloads/rammap)
+to double-check that large pages are used. We suggest that you reboot
+your computer after you have enabled large pages, because long Windows
+sessions suffer from memory fragmentation, which may prevent Stockfish
+from getting large pages: a fresh session is better in this regard.
+
+## Compiling Stockfish yourself from the sources
+
+Stockfish has support for 32 or 64-bit CPUs, certain hardware
+instructions, big-endian machines such as Power PC, and other platforms.
+
+On Unix-like systems, it should be easy to compile Stockfish
+directly from the source code with the included Makefile in the folder
+`src`. In general it is recommended to run `make help` to see a list of make
+targets with corresponding descriptions.
+
+```
+    cd src
+    make help
+    make net
+    make build ARCH=x86-64-modern
+```
+
+When not using the Makefile to compile (for instance, with Microsoft MSVC) you
+need to manually set/unset some switches in the compiler command line; see
+file *types.h* for a quick reference.
+
+When reporting an issue or a bug, please tell us which Stockfish version
+and which compiler you used to create your executable. This information
+can be found by typing the following command in a console:
+
+```
+    ./stockfish compiler
+```
+
+## Understanding the code base and participating in the project
+
+Stockfish's improvement over the last decade has been a great community
+effort. There are a few ways to help contribute to its growth.
+
+### Donating hardware
+
+Improving Stockfish requires a massive amount of testing. You can donate
+your hardware resources by installing the [Fishtest Worker](https://github.com/glinscott/fishtest/wiki/Running-the-worker:-overview)
+and view the current tests on [Fishtest](https://tests.stockfishchess.org/tests).
+
+### Improving the code
+
+If you want to help improve the code, there are several valuable resources:
+
+* [In this wiki,](https://www.chessprogramming.org) many techniques used in
+Stockfish are explained with a lot of background information.
+
+* [The section on Stockfish](https://www.chessprogramming.org/Stockfish)
+describes many features and techniques used by Stockfish. However, it is
+generic rather than being focused on Stockfish's precise implementation.
+Nevertheless, a helpful resource.
+
+* The latest source can always be found on [GitHub](https://github.com/official-stockfish/Stockfish).
+Discussions about Stockfish take place these days mainly in the [FishCooking](https://groups.google.com/forum/#!forum/fishcooking)
+group and on the [Stockfish Discord channel](https://discord.gg/nv8gDtt).
+The engine testing is done on [Fishtest](https://tests.stockfishchess.org/tests).
+If you want to help improve Stockfish, please read this [guideline](https://github.com/glinscott/fishtest/wiki/Creating-my-first-test)
+first, where the basics of Stockfish development are explained.
+
+
+## Terms of use
+
+Stockfish is free, and distributed under the **GNU General Public License version 3**
+(GPL v3). Essentially, this means you are free to do almost exactly
+what you want with the program, including distributing it among your
+friends, making it available for download from your website, selling
+it (either by itself or as part of some bigger software package), or
+using it as the starting point for a software project of your own.
+
+The only real limitation is that whenever you distribute Stockfish in
+some way, you MUST always include the full source code, or a pointer
+to where the source code can be found, to generate the exact binary
+you are distributing. If you make any changes to the source code,
+these changes must also be made available under the GPL.
+
+For full details, read the copy of the GPL v3 found in the file named
+*Copying.txt*.
diff --git a/src/Makefile b/src/Makefile
index 86d00d95..cc0f7391 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -52,8 +52,8 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
 	nnue/evaluate_nnue.cpp \
 	nnue/features/half_kp.cpp \
 	tools/sfen_packer.cpp \
-	tools/gensfen.cpp \
-	tools/gensfen_nonpv.cpp \
+	tools/training_data_generator.cpp \
+	tools/training_data_generator_nonpv.cpp \
 	tools/opening_book.cpp \
 	tools/convert.cpp \
 	tools/transform.cpp \
@@ -113,7 +113,6 @@ else
    SUPPORTED_ARCH=false
 endif
 
-blas = no
 optimize = yes
 debug = no
 sanitize = no
@@ -309,9 +308,9 @@ endif
 ### ==========================================================================
 
 ### 3.1 Selecting compiler (default = gcc)
-CXXFLAGS += -g -Wall -Wcast-qual -fno-exceptions -std=c++17 -fopenmp -I. $(EXTRACXXFLAGS)
-LDFLAGS += -fopenmp $(EXTRALDFLAGS)
-DEPENDFLAGS += -std=c++17 -I.
+CXXFLAGS += -Wall -Wcast-qual -fno-exceptions -std=c++17 -I. $(EXTRACXXFLAGS)
+DEPENDFLAGS += -std=c++17
+LDFLAGS += $(EXTRALDFLAGS)
 
 ifeq ($(COMP),)
 	COMP=gcc
@@ -465,28 +464,6 @@ ifneq ($(comp),mingw)
 endif
 endif
 
-### 3.2.1. BLAS libraries
-ifeq ($(blas), yes)
-	LDFLAGS += -lopenblas
-
-	ifeq ($(KERNEL),Linux)
-		LDFLAGS +=
-	else ifeq ($(KERNEL), Darwin)
-		CXXFLAGS += -I/usr/local/opt/openblas/include
-		LDFLAGS += -L/usr/local/opt/openblas/lib -lcblas
-	else
-		CXXFLAGS += -I/mingw64/include/OpenBLAS
-
-		ifeq ($(debug),yes)
-			LDFLAGS += -Wl,-static
-		else
-			LDFLAGS += -Wl,-s -static
-		endif
-	endif
-
-	CXXFLAGS += -DUSE_BLAS
-endif
-
 ### 3.2.2 Debugging
 ifeq ($(debug),no)
 	CXXFLAGS += -DNDEBUG
@@ -809,12 +786,12 @@ net:
 
 # clean binaries and objects
 objclean:
-	@rm -f $(EXE) *.o ./syzygy/*.o ./nnue/*.o ./nnue/features/*.o ./learn/*.o ./extra/*.o ./eval/*.o
+	@rm -f $(EXE) *.o ./syzygy/*.o ./nnue/*.o ./nnue/features/*.o ./tools/*.o ./extra/*.o ./eval/*.o
 
 # clean auxiliary profiling files
 profileclean:
 	@rm -rf profdir
-	@rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda *.s ./learn/*.gcda ./extra/*.gcda ./eval/*.gcda
+	@rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda *.s ./tools/*.gcda ./extra/*.gcda ./eval/*.gcda
 	@rm -f stockfish.profdata *.profraw
 	@rm -f $(PGO_TRAINING_DATA_FILE)
 
diff --git a/src/misc.cpp b/src/misc.cpp
index e47e2649..b58695ec 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -664,85 +664,4 @@ void sleep(int ms)
     std::this_thread::sleep_for(std::chrono::milliseconds(ms));
 }
 
-void* aligned_malloc(size_t size, size_t align)
-{
-    void* p = _mm_malloc(size, align);
-    if (p == nullptr)
-    {
-        std::cout << "info string can't allocate memory. sise = " << size << std::endl;
-        exit(1);
-    }
-    return p;
-}
-
-std::uint64_t get_file_size(std::fstream& fs)
-{
-    auto pos = fs.tellg();
-
-    fs.seekg(0, fstream::end);
-    const uint64_t eofPos = (uint64_t)fs.tellg();
-    fs.clear(); // Otherwise, the next seek may fail.
-    fs.seekg(0, fstream::beg);
-    const uint64_t begPos = (uint64_t)fs.tellg();
-    fs.seekg(pos);
-
-    return eofPos - begPos;
-}
-
-int read_file_to_memory(std::string filename, std::function<void* (uint64_t)> callback_func)
-{
-    fstream fs(filename, ios::in | ios::binary);
-    if (fs.fail())
-        return 1;
-
-    const uint64_t file_size = get_file_size(fs);
-    //std::cout << "filename = " << filename << " , file_size = " << file_size << endl;
-
-    // I know the file size, so call callback_func to get a buffer for this,
-    // Get the pointer.
-    void* ptr = callback_func(file_size);
-
-    // If the buffer could not be secured, or if the file size is different from the expected file size,
-    // It is supposed to return nullptr. At this time, reading is interrupted and an error is returned.
-    if (ptr == nullptr)
-        return 2;
-
-    // read in pieces
-
-    const uint64_t block_size = 1024 * 1024 * 1024; // number of elements to read in one read (1GB)
-    for (uint64_t pos = 0; pos < file_size; pos += block_size)
-    {
-        // size to read this time
-        uint64_t read_size = (pos + block_size < file_size) ? block_size : (file_size - pos);
-        fs.read((char*)ptr + pos, read_size);
-
-        // Read error occurred in the middle of the file.
-        if (fs.fail())
-            return 2;
-
-        //cout << ".";
-    }
-    fs.close();
-
-    return 0;
-}
-
-int write_memory_to_file(std::string filename, void* ptr, uint64_t size)
-{
-    fstream fs(filename, ios::out | ios::binary);
-    if (fs.fail())
-        return 1;
-
-    const uint64_t block_size = 1024 * 1024 * 1024; // number of elements to write in one write (1GB)
-    for (uint64_t pos = 0; pos < size; pos += block_size)
-    {
-        // Memory size to write this time
-        uint64_t write_size = (pos + block_size < size) ? block_size : (size - pos);
-        fs.write((char*)ptr + pos, write_size);
-        //cout << ".";
-    }
-    fs.close();
-    return 0;
-}
-
 } // namespace Stockfish
diff --git a/src/misc.h b/src/misc.h
index 6eb8b1ae..b7d3c78a 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -489,91 +489,10 @@ private:
 namespace WinProcGroup {
   void bindThisThread(size_t idx);
 }
-// sleep for the specified number of milliseconds.
-extern void sleep(int ms);
 
 // Returns a string that represents the current time. (Used for log output when learning evaluation function)
 std::string now_string();
-
-// When compiled with gcc/clang such as msys2, Windows Subsystem for Linux,
-// In C++ std::ifstream, ::read() is a wrapper for that because it is not possible to read and write files larger than 2GB in one shot.
-//
-// callback_func of the argument of read_file_to_memory() uses the file size as an argument when the file can be opened
-// It will be called back, so if you allocate a buffer and pass a function that returns the first pointer, it will be read there.
-// These functions return non-zero on error, such as when the file cannot be found.
-//
-// Also, if the buffer cannot be allocated in the callback function or if the file size is different from the expected file size,
-// Return nullptr. At this time, read_file_to_memory() interrupts reading and returns with an error.
-
-std::uint64_t get_file_size(std::fstream& fs);
-int read_file_to_memory(std::string filename, std::function<void* (uint64_t)> callback_func);
-int write_memory_to_file(std::string filename, void* ptr, uint64_t size);
-
-// --------------------
-// async version of PRNG
-// --------------------
-
-// async version of PRNG
-struct AsyncPRNG
-{
-  AsyncPRNG() : prng() { }
-  AsyncPRNG(uint64_t seed) : prng(seed) { assert(seed); }
-  AsyncPRNG(const std::string& seed) : prng(seed) { }
-  // [ASYNC] Extract one random number.
-  template<typename T> T rand() {
-    std::unique_lock<std::mutex> lk(mutex);
-    return prng.rand<T>();
-  }
-
-  // [ASYNC] Returns a random number from 0 to n-1. (Not uniform distribution, but this is enough in reality)
-  uint64_t rand(uint64_t n) {
-    std::unique_lock<std::mutex> lk(mutex);
-    return prng.rand(n);
-  }
-
-  // Return the random seed used internally.
-  uint64_t get_seed() const { return prng.get_seed(); }
-
-protected:
-  std::mutex mutex;
-  PRNG prng;
-};
-
-// Display a random seed. (For debugging)
-inline std::ostream& operator<<(std::ostream& os, AsyncPRNG& prng)
-{
-  os << "AsyncPRNG::seed = " << std::hex << prng.get_seed() << std::dec;
-  return os;
-}
-
-// --------------------
-//       Math
-// --------------------
-
-// Mathematical function used for progress calculation and learning
-namespace Math {
-    inline double sigmoid(double x)
-    {
-        return 1.0 / (1.0 + std::exp(-x));
-    }
-
-    inline double dsigmoid(double x)
-    {
-        // Sigmoid function
-        // f(x) = 1/(1+exp(-x))
-        // the first derivative is
-        // f'(x) = df/dx = f(x)・{ 1-f(x)}
-        // becomes
-
-        return sigmoid(x) * (1.0 - sigmoid(x));
-    }
-
-	// Clip v so that it fits between [lo,hi].
-	// * In Stockfish, this function is written in bitboard.h.
-	template<class T> constexpr const T& clamp(const T& v, const T& lo, const T& hi) {
-		return v < lo ? lo : v > hi ? hi : v;
-	}
-}
+void sleep(int ms);
 
 namespace Algo {
     // Fisher-Yates
diff --git a/src/nnue/architectures/halfkp_256x2-32-32.h b/src/nnue/architectures/halfkp_256x2-32-32.h
index 21308368..a6768204 100644
--- a/src/nnue/architectures/halfkp_256x2-32-32.h
+++ b/src/nnue/architectures/halfkp_256x2-32-32.h
@@ -2,18 +2,18 @@
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
   Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
 
-    Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
 
-    Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
 
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 // Definition of input features and network structure used in NNUE evaluation function
@@ -21,33 +21,33 @@
 #ifndef NNUE_HALFKP_256X2_32_32_H_INCLUDED
 #define NNUE_HALFKP_256X2_32_32_H_INCLUDED
 
-#include "nnue/features/feature_set.h"
-#include "nnue/features/half_kp.h"
+#include "../features/feature_set.h"
+#include "../features/half_kp.h"
 
-#include "nnue/layers/input_slice.h"
-#include "nnue/layers/affine_transform.h"
-#include "nnue/layers/clipped_relu.h"
+#include "../layers/input_slice.h"
+#include "../layers/affine_transform.h"
+#include "../layers/clipped_relu.h"
 
 namespace Stockfish::Eval::NNUE {
 
-    // Input features used in evaluation function
-    using RawFeatures = Features::FeatureSet<
-        Features::HalfKP<Features::Side::kFriend>>;
+// Input features used in evaluation function
+using RawFeatures = Features::FeatureSet<
+    Features::HalfKP<Features::Side::kFriend>>;
 
-    // Number of input feature dimensions after conversion
-    constexpr IndexType kTransformedFeatureDimensions = 256;
+// Number of input feature dimensions after conversion
+constexpr IndexType kTransformedFeatureDimensions = 256;
 
-    namespace Layers {
+namespace Layers {
 
-        // Define network structure
-        using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
-        using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
-        using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-        using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+// Define network structure
+using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+using OutputLayer = AffineTransform<HiddenLayer2, 1>;
 
-    }  // namespace Layers
+}  // namespace Layers
 
-    using Network = Layers::OutputLayer;
+using Network = Layers::OutputLayer;
 
 }  // namespace Stockfish::Eval::NNUE
 
diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index 7b2a1ae8..5416f13e 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -1,14 +1,17 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
   Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
+
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.
+
   Stockfish is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
+
   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
@@ -138,4 +141,4 @@ namespace Stockfish::Eval::NNUE {
     return ReadParameters(stream);
   }
 
-} // namespace Stockfish::Eval::NNUE
\ No newline at end of file
+} // namespace Stockfish::Eval::NNUE
diff --git a/src/nnue/evaluate_nnue.h b/src/nnue/evaluate_nnue.h
index 010a89f7..24aa6cc0 100644
--- a/src/nnue/evaluate_nnue.h
+++ b/src/nnue/evaluate_nnue.h
@@ -1,14 +1,17 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
   Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
+
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.
+
   Stockfish is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
+
   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
@@ -53,4 +56,4 @@ namespace Stockfish::Eval::NNUE {
 
 }  // namespace Stockfish::Eval::NNUE
 
-#endif // #ifndef NNUE_EVALUATE_NNUE_H_INCLUDED
\ No newline at end of file
+#endif // #ifndef NNUE_EVALUATE_NNUE_H_INCLUDED
diff --git a/src/nnue/features/feature_set.h b/src/nnue/features/feature_set.h
index fb25bce5..a3fea9c0 100644
--- a/src/nnue/features/feature_set.h
+++ b/src/nnue/features/feature_set.h
@@ -1,14 +1,17 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
   Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
+
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.
+
   Stockfish is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
+
   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
@@ -63,4 +66,4 @@ namespace Stockfish::Eval::NNUE::Features {
 
 }  // namespace Stockfish::Eval::NNUE::Features
 
-#endif // #ifndef NNUE_FEATURE_SET_H_INCLUDED
\ No newline at end of file
+#endif // #ifndef NNUE_FEATURE_SET_H_INCLUDED
diff --git a/src/nnue/features/half_kp.cpp b/src/nnue/features/half_kp.cpp
index 05eb1a9a..8e6907ae 100644
--- a/src/nnue/features/half_kp.cpp
+++ b/src/nnue/features/half_kp.cpp
@@ -2,18 +2,18 @@
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
   Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
 
-    Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
 
-    Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
 
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 //Definition of input features HalfKP of NNUE evaluation function
diff --git a/src/nnue/features/half_kp.h b/src/nnue/features/half_kp.h
index 3c94778d..2461acb7 100644
--- a/src/nnue/features/half_kp.h
+++ b/src/nnue/features/half_kp.h
@@ -2,23 +2,26 @@
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
   Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
 
-    Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
 
-    Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
 
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
+//Definition of input features HalfKP of NNUE evaluation function
+
 #ifndef NNUE_FEATURES_HALF_KP_H_INCLUDED
 #define NNUE_FEATURES_HALF_KP_H_INCLUDED
 
+#include "../../evaluate.h"
 #include "features_common.h"
 
 namespace Stockfish::Eval::NNUE::Features {
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index 3e54ab7f..1faa180d 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -1,14 +1,17 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
   Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
+
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.
+
   Stockfish is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
+
   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
@@ -458,4 +461,4 @@ namespace Stockfish::Eval::NNUE::Layers {
 
 }  // namespace Stockfish::Eval::NNUE::Layers
 
-#endif // #ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
\ No newline at end of file
+#endif // #ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h
index 7efb0a0b..a10e3e48 100644
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -47,8 +47,6 @@ namespace Stockfish::Eval::NNUE::Layers {
     static constexpr std::size_t kBufferSize =
         PreviousLayer::kBufferSize + kSelfBufferSize;
 
-    static constexpr int kLayerIndex = PreviousLayer::kLayerIndex + 1;
-
     // Hash value embedded in the evaluation file
     static constexpr std::uint32_t GetHashValue() {
       std::uint32_t hash_value = 0x538D24C7u;
@@ -56,36 +54,11 @@ namespace Stockfish::Eval::NNUE::Layers {
       return hash_value;
     }
 
-    static std::string get_name() {
-        return "ClippedReLU[" +
-            std::to_string(kOutputDimensions) + "]";
-    }
-
-    // A string that represents the structure from the input layer to this layer
-    static std::string get_structure_string() {
-        return get_name() + "(" +
-            PreviousLayer::get_structure_string() + ")";
-    }
-
-    static std::string get_layers_info() {
-        std::string info = PreviousLayer::get_layers_info();
-        info += "\n  - ";
-        info += std::to_string(kLayerIndex);
-        info += " - ";
-        info += get_name();
-        return info;
-    }
-
     // Read network parameters
     bool ReadParameters(std::istream& stream) {
       return previous_layer_.ReadParameters(stream);
     }
 
-    // write parameters
-    bool WriteParameters(std::ostream& stream) const {
-        return previous_layer_.WriteParameters(stream);
-    }
-
     // Forward propagation
     const OutputType* Propagate(
         const TransformedFeatureType* transformed_features, char* buffer) const {
@@ -185,9 +158,6 @@ namespace Stockfish::Eval::NNUE::Layers {
     }
 
    private:
-    // Make the learning class a friend
-    friend class Trainer<ClippedReLU>;
-
     PreviousLayer previous_layer_;
   };
 
diff --git a/src/nnue/layers/input_slice.h b/src/nnue/layers/input_slice.h
index f7dab19e..9a7ce92e 100644
--- a/src/nnue/layers/input_slice.h
+++ b/src/nnue/layers/input_slice.h
@@ -41,8 +41,6 @@ class InputSlice {
   // Size of forward propagation buffer used from the input layer to this layer
   static constexpr std::size_t kBufferSize = 0;
 
-  static constexpr int kLayerIndex = 1;
-
   // Hash value embedded in the evaluation file
   static constexpr std::uint32_t GetHashValue() {
     std::uint32_t hash_value = 0xEC42E90Du;
@@ -50,35 +48,10 @@ class InputSlice {
     return hash_value;
   }
 
-    static std::string get_name() {
-        return "InputSlice[" + std::to_string(kOutputDimensions) + "(" +
-            std::to_string(Offset) + ":" +
-            std::to_string(Offset + kOutputDimensions) + ")]";
-    }
-
-    // A string that represents the structure from the input layer to this layer
-    static std::string get_structure_string() {
-        return get_name();
-    }
-
-    static std::string get_layers_info() {
-        std::string info = "  - ";
-        info += std::to_string(kLayerIndex);
-        info += " - ";
-        info += get_name();
-        return info;
-    }
-
   // Read network parameters
   bool ReadParameters(std::istream& /*stream*/) {
     return true;
   }
-
-  // write parameters
-  bool WriteParameters(std::ostream& /*stream*/) const {
-      return true;
-  }
-
   // Forward propagation
   const OutputType* Propagate(
       const TransformedFeatureType* transformed_features,
diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h
index 5da4ecb5..55fafa13 100644
--- a/src/nnue/nnue_accumulator.h
+++ b/src/nnue/nnue_accumulator.h
@@ -1,14 +1,17 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
   Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
+
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.
+
   Stockfish is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
+
   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
@@ -34,4 +37,4 @@ namespace Stockfish::Eval::NNUE {
 
 }  // namespace Stockfish::Eval::NNUE
 
-#endif // NNUE_ACCUMULATOR_H_INCLUDED
\ No newline at end of file
+#endif // NNUE_ACCUMULATOR_H_INCLUDED
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index 1479d6b3..bfd0738e 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -25,13 +25,6 @@
 
 #include <cstring>
 #include <iostream>
-#if defined(__GNUC__ ) && (__GNUC__ < 8)
-#include <experimental/filesystem>
-namespace sys = std::experimental::filesystem;
-#else
-#include <filesystem>
-namespace sys = std::filesystem;
-#endif
 
 #if defined(USE_AVX2)
 #include <immintrin.h>
@@ -112,10 +105,6 @@ namespace Stockfish::Eval::NNUE {
   using TransformedFeatureType = std::uint8_t;
   using IndexType = std::uint32_t;
 
-  // Forward declaration of learning class template
-  template <typename Layer>
-  class Trainer;
-
   // Round n up to be a multiple of base
   template <typename IntType>
   constexpr IntType CeilToMultiple(IntType n, IntType base) {
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index b5c8f40e..1e0b0e6d 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -1,14 +1,17 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
   Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)
+
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.
+
   Stockfish is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
+
   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
@@ -411,4 +414,4 @@ namespace Stockfish::Eval::NNUE {
 
 }  // namespace Stockfish::Eval::NNUE
 
-#endif // #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
\ No newline at end of file
+#endif // #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
diff --git a/src/position.cpp b/src/position.cpp
index c515f253..8ef516b6 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -1007,10 +1007,10 @@ void Position::do_null_move(StateInfo& newSt) {
   st = &newSt;
 
   // Used by NNUE
+  st->dirtyPiece.dirty_num = 0;
+  st->dirtyPiece.piece[0] = NO_PIECE; // Avoid checks in UpdateAccumulator()
   st->accumulator.state[WHITE] = Eval::NNUE::EMPTY;
   st->accumulator.state[BLACK] = Eval::NNUE::EMPTY;
-  auto& dp = st->dirtyPiece;
-  dp.dirty_num = 0;
 
   if (st->epSquare != SQ_NONE)
   {
diff --git a/src/tools/convert.cpp b/src/tools/convert.cpp
index 0be7070c..03f3e4a7 100644
--- a/src/tools/convert.cpp
+++ b/src/tools/convert.cpp
@@ -25,8 +25,10 @@
 #include <chrono>
 #include <random>
 #include <regex>
+#include <filesystem>
 
 using namespace std;
+namespace sys = std::filesystem;
 
 namespace Stockfish::Tools
 {
@@ -123,7 +125,7 @@ namespace Stockfish::Tools
                     score = (score - src_score_min_value) / (src_score_max_value - src_score_min_value);
                     // Scale to [dest_score_min_value, dest_score_max_value].
                     score = score * (dest_score_max_value - dest_score_min_value) + dest_score_min_value;
-                    p.score = Math::clamp((int32_t)std::round(score), -(int32_t)VALUE_MATE, (int32_t)VALUE_MATE);
+                    p.score = std::clamp((int32_t)std::round(score), -(int32_t)VALUE_MATE, (int32_t)VALUE_MATE);
                 }
                 else if (token == "ply") {
                     int temp;
@@ -414,7 +416,7 @@ namespace Stockfish::Tools
                                     Value value = parse_score_from_pgn_extract(str_eval, success);
                                     if (success) {
                                         eval_found = true;
-                                        psv.score = Math::clamp(value, -VALUE_MATE, VALUE_MATE);
+                                        psv.score = std::clamp(value, -VALUE_MATE, VALUE_MATE);
                                     }
 
 #if defined(DEBUG_CONVERT_BIN_FROM_PGN_EXTRACT)
diff --git a/src/tools/gensfen.cpp b/src/tools/training_data_generator.cpp
similarity index 97%
rename from src/tools/gensfen.cpp
rename to src/tools/training_data_generator.cpp
index 1e721a3e..10f84fe5 100644
--- a/src/tools/gensfen.cpp
+++ b/src/tools/training_data_generator.cpp
@@ -1,4 +1,4 @@
-﻿#include "gensfen.h"
+﻿#include "training_data_generator.h"
 
 #include "sfen_writer.h"
 #include "packed_sfen.h"
@@ -37,7 +37,7 @@ using namespace std;
 namespace Stockfish::Tools
 {
     // Class to generate sfen with multiple threads
-    struct Gensfen
+    struct TrainingDataGenerator
     {
         struct Params
         {
@@ -123,7 +123,7 @@ namespace Stockfish::Tools
         static constexpr uint64_t REPORT_STATS_EVERY = 200000;
         static_assert(REPORT_STATS_EVERY % REPORT_DOT_EVERY == 0);
 
-        Gensfen(
+        TrainingDataGenerator(
             const Params& prm
         ) :
             params(prm),
@@ -205,7 +205,7 @@ namespace Stockfish::Tools
         void maybe_report(uint64_t done);
     };
 
-    void Gensfen::set_gensfen_search_limits()
+    void TrainingDataGenerator::set_gensfen_search_limits()
     {
         // About Search::Limits
         // Be careful because this member variable is global and affects other threads.
@@ -224,7 +224,7 @@ namespace Stockfish::Tools
         limits.depth = 0;
     }
 
-    void Gensfen::generate(uint64_t limit)
+    void TrainingDataGenerator::generate(uint64_t limit)
     {
         last_stats_report_time = 0;
 
@@ -246,7 +246,7 @@ namespace Stockfish::Tools
         std::cout << std::endl;
     }
 
-    void Gensfen::generate_worker(
+    void TrainingDataGenerator::generate_worker(
         Thread& th,
         std::atomic<uint64_t>& counter,
         uint64_t limit)
@@ -449,7 +449,7 @@ namespace Stockfish::Tools
         }
     }
 
-    bool Gensfen::was_seen_before(const Position& pos)
+    bool TrainingDataGenerator::was_seen_before(const Position& pos)
     {
         // Look into the position hashtable to see if the same
         // position was seen before.
@@ -470,7 +470,7 @@ namespace Stockfish::Tools
         }
     }
 
-    optional<int8_t> Gensfen::get_current_game_result(
+    optional<int8_t> TrainingDataGenerator::get_current_game_result(
         Position& pos,
         const vector<int>& move_hist_scores) const
     {
@@ -591,7 +591,7 @@ namespace Stockfish::Tools
         return nullopt;
     }
 
-    vector<uint8_t> Gensfen::generate_random_move_flags(PRNG& prng)
+    vector<uint8_t> TrainingDataGenerator::generate_random_move_flags(PRNG& prng)
     {
         vector<uint8_t> random_move_flag;
 
@@ -628,7 +628,7 @@ namespace Stockfish::Tools
         return random_move_flag;
     }
 
-    optional<Move> Gensfen::choose_random_move(
+    optional<Move> TrainingDataGenerator::choose_random_move(
         PRNG& prng,
         Position& pos,
         std::vector<uint8_t>& random_move_flag,
@@ -725,7 +725,7 @@ namespace Stockfish::Tools
     // 1 when winning. -1 when losing. Pass 0 for a draw.
     // Return value: true if the specified number of
     // sfens has already been reached and the process ends.
-    bool Gensfen::commit_psv(
+    bool TrainingDataGenerator::commit_psv(
         Thread& th,
         PSVector& sfens,
         int8_t result,
@@ -770,7 +770,7 @@ namespace Stockfish::Tools
         return false;
     }
 
-    void Gensfen::report(uint64_t done, uint64_t new_done)
+    void TrainingDataGenerator::report(uint64_t done, uint64_t new_done)
     {
         const auto now_time = now();
         const TimePoint elapsed = now_time - last_stats_report_time + 1;
@@ -786,7 +786,7 @@ namespace Stockfish::Tools
         out = sync_region_cout.new_region();
     }
 
-    void Gensfen::maybe_report(uint64_t done)
+    void TrainingDataGenerator::maybe_report(uint64_t done)
     {
         if (done % REPORT_DOT_EVERY == 0)
         {
@@ -811,12 +811,12 @@ namespace Stockfish::Tools
     }
 
     // Command to generate a game record
-    void gensfen(istringstream& is)
+    void generate_training_data(istringstream& is)
     {
         // Number of generated game records default = 8 billion phases (Ponanza specification)
         uint64_t loop_max = 8000000000UL;
 
-        Gensfen::Params params;
+        TrainingDataGenerator::Params params;
 
         // Add a random number to the end of the file name.
         bool random_file_name = false;
@@ -966,9 +966,9 @@ namespace Stockfish::Tools
 
         Threads.main()->ponder = false;
 
-        Gensfen gensfen(params);
+        TrainingDataGenerator gensfen(params);
         gensfen.generate(loop_max);
 
-        std::cout << "INFO: Gensfen finished." << endl;
+        std::cout << "INFO: TrainingDataGenerator finished." << endl;
     }
 }
diff --git a/src/tools/gensfen.h b/src/tools/training_data_generator.h
similarity index 75%
rename from src/tools/gensfen.h
rename to src/tools/training_data_generator.h
index a8505474..9a105155 100644
--- a/src/tools/gensfen.h
+++ b/src/tools/training_data_generator.h
@@ -8,7 +8,7 @@
 namespace Stockfish::Tools {
 
     // Automatic generation of teacher position
-    void gensfen(std::istringstream& is);
+    void generate_training_data(std::istringstream& is);
 }
 
 #endif
\ No newline at end of file
diff --git a/src/tools/gensfen_nonpv.cpp b/src/tools/training_data_generator_nonpv.cpp
similarity index 95%
rename from src/tools/gensfen_nonpv.cpp
rename to src/tools/training_data_generator_nonpv.cpp
index 9775f80e..247c18a4 100644
--- a/src/tools/gensfen_nonpv.cpp
+++ b/src/tools/training_data_generator_nonpv.cpp
@@ -1,4 +1,4 @@
-﻿#include "gensfen_nonpv.h"
+﻿#include "training_data_generator_nonpv.h"
 
 #include "sfen_writer.h"
 #include "packed_sfen.h"
@@ -37,7 +37,7 @@ using namespace std;
 namespace Stockfish::Tools
 {
     // Class to generate sfen with multiple threads
-    struct GensfenNonPv
+    struct TrainingDataGeneratorNonPv
     {
         struct Params
         {
@@ -89,7 +89,7 @@ namespace Stockfish::Tools
         static constexpr uint64_t REPORT_STATS_EVERY = 200000;
         static_assert(REPORT_STATS_EVERY % REPORT_DOT_EVERY == 0);
 
-        GensfenNonPv(
+        TrainingDataGeneratorNonPv(
             const Params& prm
         ) :
             params(prm),
@@ -148,7 +148,7 @@ namespace Stockfish::Tools
         void maybe_report(uint64_t done);
     };
 
-    void GensfenNonPv::set_gensfen_search_limits()
+    void TrainingDataGeneratorNonPv::set_gensfen_search_limits()
     {
         // About Search::Limits
         // Be careful because this member variable is global and affects other threads.
@@ -167,7 +167,7 @@ namespace Stockfish::Tools
         limits.depth = 0;
     }
 
-    void GensfenNonPv::generate(uint64_t limit)
+    void TrainingDataGeneratorNonPv::generate(uint64_t limit)
     {
         last_stats_report_time = 0;
 
@@ -189,7 +189,7 @@ namespace Stockfish::Tools
         std::cout << std::endl;
     }
 
-    PSVector GensfenNonPv::do_exploration(
+    PSVector TrainingDataGeneratorNonPv::do_exploration(
         Thread& th,
         int count)
     {
@@ -253,7 +253,7 @@ namespace Stockfish::Tools
         return psv;
     }
 
-    void GensfenNonPv::generate_worker(
+    void TrainingDataGeneratorNonPv::generate_worker(
         Thread& th,
         std::atomic<uint64_t>& counter,
         uint64_t limit)
@@ -323,7 +323,7 @@ namespace Stockfish::Tools
     // 1 when winning. -1 when losing. Pass 0 for a draw.
     // Return value: true if the specified number of
     // sfens has already been reached and the process ends.
-    bool GensfenNonPv::commit_psv(
+    bool TrainingDataGeneratorNonPv::commit_psv(
         Thread& th,
         PSVector& sfens,
         std::atomic<uint64_t>& counter,
@@ -347,7 +347,7 @@ namespace Stockfish::Tools
         return false;
     }
 
-    void GensfenNonPv::report(uint64_t done, uint64_t new_done)
+    void TrainingDataGeneratorNonPv::report(uint64_t done, uint64_t new_done)
     {
         const auto now_time = now();
         const TimePoint elapsed = now_time - last_stats_report_time + 1;
@@ -363,7 +363,7 @@ namespace Stockfish::Tools
         out = sync_region_cout.new_region();
     }
 
-    void GensfenNonPv::maybe_report(uint64_t done)
+    void TrainingDataGeneratorNonPv::maybe_report(uint64_t done)
     {
         if (done % REPORT_DOT_EVERY == 0)
         {
@@ -388,10 +388,10 @@ namespace Stockfish::Tools
     }
 
     // Command to generate a game record
-    void gensfen_nonpv(istringstream& is)
+    void generate_training_data_nonpv(istringstream& is)
     {
         // Number of generated game records default = 8 billion phases (Ponanza specification)
-        GensfenNonPv::Params params;
+        TrainingDataGeneratorNonPv::Params params;
 
         uint64_t count = 1'000'000;
 
@@ -480,7 +480,7 @@ namespace Stockfish::Tools
 
         Threads.main()->ponder = false;
 
-        GensfenNonPv gensfen(params);
+        TrainingDataGeneratorNonPv gensfen(params);
         gensfen.generate(count);
 
         std::cout << "INFO: gensfen_nonpv finished." << endl;
diff --git a/src/tools/gensfen_nonpv.h b/src/tools/training_data_generator_nonpv.h
similarity index 71%
rename from src/tools/gensfen_nonpv.h
rename to src/tools/training_data_generator_nonpv.h
index 842dd70b..8bd093cb 100644
--- a/src/tools/gensfen_nonpv.h
+++ b/src/tools/training_data_generator_nonpv.h
@@ -6,7 +6,7 @@
 namespace Stockfish::Tools {
 
     // Automatic generation of teacher position
-    void gensfen_nonpv(std::istringstream& is);
+    void generate_training_data_nonpv(std::istringstream& is);
 }
 
 #endif
\ No newline at end of file
diff --git a/src/uci.cpp b/src/uci.cpp
index fdce17aa..6f5b28a9 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -33,8 +33,8 @@
 #include "tt.h"
 #include "uci.h"
 
-#include "tools/gensfen.h"
-#include "tools/gensfen_nonpv.h"
+#include "tools/training_data_generator.h"
+#include "tools/training_data_generator_nonpv.h"
 #include "tools/convert.h"
 #include "tools/transform.h"
 #include "tools/stats.h"
@@ -209,14 +209,7 @@ void UCI::setoption(const std::string& name, const std::string& value)
 
 // The win rate model returns the probability (per mille) of winning given an eval
 // and a game-ply. The model fits rather accurately the LTC fishtest statistics.
-int UCI::win_rate_model(Value v, int ply) {
-   // Return win rate in per mille (rounded to nearest)
-   return int(0.5 + win_rate_model_double(v, ply));
-}
-
-// The win rate model returns the probability (per mille) of winning given an eval
-// and a game-ply. The model fits rather accurately the LTC fishtest statistics.
-double UCI::win_rate_model_double(double v, int ply) {
+int win_rate_model(Value v, int ply) {
 
    // The model captures only up to 240 plies, so limit input (and rescale)
    double m = std::min(240, ply) / 64.0;
@@ -230,10 +223,10 @@ double UCI::win_rate_model_double(double v, int ply) {
    double b = (((bs[0] * m + bs[1]) * m + bs[2]) * m) + bs[3];
 
    // Transform eval to centipawns with limited range
-     double x = std::clamp(double(100 * v) / PawnValueEg, -1000.0, 1000.0);
+   double x = std::clamp(double(100 * v) / PawnValueEg, -1000.0, 1000.0);
 
-   // Return win rate in per mille
-   return 1000.0 / (1 + std::exp((a - x) / b));
+   // Return win rate in per mille (rounded to nearest)
+   return int(0.5 + 1000 / (1 + std::exp((a - x) / b)));
 }
 
 // --------------------
@@ -327,8 +320,8 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "eval")     trace_eval(pos);
       else if (token == "compiler") sync_cout << compiler_info() << sync_endl;
 
-      else if (token == "gensfen") Tools::gensfen(is);
-      else if (token == "gensfen_nonpv") Tools::gensfen_nonpv(is);
+      else if (token == "generate_training_data") Tools::generate_training_data(is);
+      else if (token == "generate_training_data") Tools::generate_training_data_nonpv(is);
       else if (token == "convert") Tools::convert(is);
       else if (token == "convert_bin") Tools::convert_bin(is);
       else if (token == "convert_plain") Tools::convert_plain(is);
diff --git a/src/uci.h b/src/uci.h
index b27445aa..9e4c6e2f 100644
--- a/src/uci.h
+++ b/src/uci.h
@@ -74,8 +74,6 @@ std::string square(Square s);
 std::string move(Move m, bool chess960);
 std::string pv(const Position& pos, Depth depth, Value alpha, Value beta);
 std::string wdl(Value v, int ply);
-int win_rate_model(Value v, int ply);
-double win_rate_model_double(double v, int ply);
 Move to_move(const Position& pos, std::string& str);
 void setoption(const std::string& name, const std::string& value);
 
diff --git a/stockfish.md b/stockfish.md
deleted file mode 100644
index 823518d1..00000000
--- a/stockfish.md
+++ /dev/null
@@ -1,242 +0,0 @@
-## Overview
-
-[![Build Status](https://travis-ci.org/official-stockfish/Stockfish.svg?branch=master)](https://travis-ci.org/official-stockfish/Stockfish)
-[![Build Status](https://ci.appveyor.com/api/projects/status/github/official-stockfish/Stockfish?branch=master&svg=true)](https://ci.appveyor.com/project/mcostalba/stockfish/branch/master)
-
-[Stockfish](https://stockfishchess.org) is a free, powerful UCI chess engine
-derived from Glaurung 2.1. It is not a complete chess program and requires a
-UCI-compatible GUI (e.g. XBoard with PolyGlot, Scid, Cute Chess, eboard, Arena,
-Sigma Chess, Shredder, Chess Partner or Fritz) in order to be used comfortably.
-Read the documentation for your GUI of choice for information about how to use
-Stockfish with it.
-
-
-## Files
-
-This distribution of Stockfish consists of the following files:
-
-  * Readme.md, the file you are currently reading.
-
-  * Copying.txt, a text file containing the GNU General Public License version 3.
-
-  * src, a subdirectory containing the full source code, including a Makefile
-    that can be used to compile Stockfish on Unix-like systems.
-
-
-## UCI parameters
-
-Currently, Stockfish has the following UCI options:
-
-  * #### Debug Log File
-    Write all communication to and from the engine into a text file.
-
-  * #### Contempt
-    A positive value for contempt favors middle game positions and avoids draws.
-
-  * #### Analysis Contempt
-    By default, contempt is set to prefer the side to move. Set this option to "White"
-    or "Black" to analyse with contempt for that side, or "Off" to disable contempt.
-
-  * #### Threads
-    The number of CPU threads used for searching a position. For best performance, set
-    this equal to the number of CPU cores available.
-
-  * #### Hash
-    The size of the hash table in MB. It is recommended to set Hash after setting Threads.
-
-  * #### Clear Hash
-    Clear the hash table.
-
-  * #### Ponder
-    Let Stockfish ponder its next move while the opponent is thinking.
-
-  * #### MultiPV
-    Output the N best lines (principal variations, PVs) when searching.
-    Leave at 1 for best performance.
-
-  * #### Skill Level
-    Lower the Skill Level in order to make Stockfish play weaker (see also UCI_LimitStrength).
-    Internally, MultiPV is enabled, and with a certain probability depending on the Skill Level a
-    weaker move will be played.
-
-  * #### UCI_LimitStrength
-    Enable weaker play aiming for an Elo rating as set by UCI_Elo. This option overrides Skill Level.
-
-  * #### UCI_Elo
-    If enabled by UCI_LimitStrength, aim for an engine strength of the given Elo.
-    This Elo rating has been calibrated at a time control of 60s+0.6s and anchored to CCRL 40/4.
-
-  * #### UCI_ShowWDL
-    If enabled, show approximate WDL statistics as part of the engine output.
-    These WDL numbers model expected game outcomes for a given evaluation and
-    game ply for engine self-play at fishtest LTC conditions (60+0.6s per game).
-
-  * #### Move Overhead
-    Assume a time delay of x ms due to network and GUI overheads. This is useful to
-    avoid losses on time in those cases.
-
-  * #### Slow Mover
-    Lower values will make Stockfish take less time in games, higher values will
-    make it think longer.
-
-  * #### nodestime
-    Tells the engine to use nodes searched instead of wall time to account for
-    elapsed time. Useful for engine testing.
-
-  * #### UCI_Chess960
-    An option handled by your GUI. If true, Stockfish will play Chess960.
-
-  * #### UCI_AnalyseMode
-    An option handled by your GUI.
-
-  * #### SyzygyPath
-    Path to the folders/directories storing the Syzygy tablebase files. Multiple
-    directories are to be separated by ";" on Windows and by ":" on Unix-based
-    operating systems. Do not use spaces around the ";" or ":".
-
-    Example: `C:\tablebases\wdl345;C:\tablebases\wdl6;D:\tablebases\dtz345;D:\tablebases\dtz6`
-
-    It is recommended to store .rtbw files on an SSD. There is no loss in storing
-    the .rtbz files on a regular HD. It is recommended to verify all md5 checksums
-    of the downloaded tablebase files (`md5sum -c checksum.md5`) as corruption will
-    lead to engine crashes.
-
-  * #### SyzygyProbeDepth
-    Minimum remaining search depth for which a position is probed. Set this option
-    to a higher value to probe less agressively if you experience too much slowdown
-    (in terms of nps) due to TB probing.
-
-  * #### Syzygy50MoveRule
-    Disable to let fifty-move rule draws detected by Syzygy tablebase probes count
-    as wins or losses. This is useful for ICCF correspondence games.
-
-  * #### SyzygyProbeLimit
-    Limit Syzygy tablebase probing to positions with at most this many pieces left
-    (including kings and pawns).
-
-
-## What to expect from Syzygybases?
-
-If the engine is searching a position that is not in the tablebases (e.g.
-a position with 8 pieces), it will access the tablebases during the search.
-If the engine reports a very large score (typically 153.xx), this means
-that it has found a winning line into a tablebase position.
-
-If the engine is given a position to search that is in the tablebases, it
-will use the tablebases at the beginning of the search to preselect all
-good moves, i.e. all moves that preserve the win or preserve the draw while
-taking into account the 50-move rule.
-It will then perform a search only on those moves. **The engine will not move
-immediately**, unless there is only a single good move. **The engine likely
-will not report a mate score even if the position is known to be won.**
-
-It is therefore clear that this behaviour is not identical to what one might
-be used to with Nalimov tablebases. There are technical reasons for this
-difference, the main technical reason being that Nalimov tablebases use the
-DTM metric (distance-to-mate), while Syzygybases use a variation of the
-DTZ metric (distance-to-zero, zero meaning any move that resets the 50-move
-counter). This special metric is one of the reasons that Syzygybases are
-more compact than Nalimov tablebases, while still storing all information
-needed for optimal play and in addition being able to take into account
-the 50-move rule.
-
-## Large Pages
-
-Stockfish supports large pages on Linux and Windows. Large pages make
-the hash access more efficient, improving the engine speed, especially
-on large hash sizes. Typical increases are 5..10% in terms of nps, but
-speed increases up to 30% have been measured. The support is
-automatic. Stockfish attempts to use large pages when available and
-will fall back to regular memory allocation when this is not the case.
-
-### Support on Linux
-
-Large page support on Linux is obtained by the Linux kernel
-transparent huge pages functionality. Typically, transparent huge pages
-are already enabled and no configuration is needed.
-
-### Support on Windows
-
-The use of large pages requires "Lock Pages in Memory" privilege. See
-[Enable the Lock Pages in Memory Option (Windows)](https://docs.microsoft.com/en-us/sql/database-engine/configure-windows/enable-the-lock-pages-in-memory-option-windows)
-on how to enable this privilege. Logout/login may be needed
-afterwards. Due to memory fragmentation, it may not always be
-possible to allocate large pages even when enabled. A reboot
-might alleviate this problem. To determine whether large pages
-are in use, see the engine log.
-
-## Compiling Stockfish yourself from the sources
-
-Stockfish has support for 32 or 64-bit CPUs, certain hardware
-instructions, big-endian machines such as Power PC, and other platforms.
-
-On Unix-like systems, it should be easy to compile Stockfish
-directly from the source code with the included Makefile in the folder
-`src`. In general it is recommended to run `make help` to see a list of make
-targets with corresponding descriptions.
-
-```
-    cd src
-    make help
-    make build ARCH=x86-64-modern
-```
-
-When not using the Makefile to compile (for instance with Microsoft MSVC) you
-need to manually set/unset some switches in the compiler command line; see
-file *types.h* for a quick reference.
-
-When reporting an issue or a bug, please tell us which version and
-compiler you used to create your executable. These informations can
-be found by typing the following commands in a console:
-
-```
-    ./stockfish
-    compiler
-```
-
-## Understanding the code base and participating in the project
-
-Stockfish's improvement over the last couple of years has been a great
-community effort. There are a few ways to help contribute to its growth.
-
-### Donating hardware
-
-Improving Stockfish requires a massive amount of testing. You can donate
-your hardware resources by installing the [Fishtest Worker](https://github.com/glinscott/fishtest/wiki/Running-the-worker:-overview)
-and view the current tests on [Fishtest](https://tests.stockfishchess.org/tests).
-
-### Improving the code
-
-If you want to help improve the code, there are several valuable resources:
-
-* [In this wiki,](https://www.chessprogramming.org) many techniques used in
-Stockfish are explained with a lot of background information.
-
-* [The section on Stockfish](https://www.chessprogramming.org/Stockfish)
-describes many features and techniques used by Stockfish. However, it is
-generic rather than being focused on Stockfish's precise implementation.
-Nevertheless, a helpful resource.
-
-* The latest source can always be found on [GitHub](https://github.com/official-stockfish/Stockfish).
-Discussions about Stockfish take place in the [FishCooking](https://groups.google.com/forum/#!forum/fishcooking)
-group and engine testing is done on [Fishtest](https://tests.stockfishchess.org/tests).
-If you want to help improve Stockfish, please read this [guideline](https://github.com/glinscott/fishtest/wiki/Creating-my-first-test)
-first, where the basics of Stockfish development are explained.
-
-
-## Terms of use
-
-Stockfish is free, and distributed under the **GNU General Public License version 3**
-(GPL v3). Essentially, this means that you are free to do almost exactly
-what you want with the program, including distributing it among your
-friends, making it available for download from your web site, selling
-it (either by itself or as part of some bigger software package), or
-using it as the starting point for a software project of your own.
-
-The only real limitation is that whenever you distribute Stockfish in
-some way, you must always include the full source code, or a pointer
-to where the source code can be found. If you make any changes to the
-source code, these changes must also be made available under the GPL.
-
-For full details, read the copy of the GPL v3 found in the file named
-*Copying.txt*.
diff --git a/tests/instrumented.sh b/tests/instrumented.sh
index 2f746a86..518d1087 100755
--- a/tests/instrumented.sh
+++ b/tests/instrumented.sh
@@ -24,7 +24,7 @@ case $1 in
     echo "valgrind-thread testing started"
     prefix=''
     exeprefix='valgrind --fair-sched=try --error-exitcode=42'
-    postfix=''
+    postfix='1>/dev/null'
     threads="2"
     bench_depth=5
     go_depth=10
@@ -142,7 +142,57 @@ cat << EOF > syzygy.exp
  exit \$value
 EOF
 
-for exp in game.exp
+# generate_training_data testing 01
+cat << EOF > data_generation01.exp
+ set timeout 240
+ spawn $exeprefix ./stockfish
+
+ send "uci\n"
+ expect "uciok"
+
+ send "setoption name Threads value $threads\n"
+ send "setoption name Use NNUE value false\n"
+ send "isready\n"
+ send "generate_training_data depth 3 count 100 keep_draws 1 eval_limit 32000 output_file_name training_data/training_data.bin output_format bin\n"
+ expect "INFO: Gensfen finished."
+ send "convert_plain targetfile training_data/training_data.bin output_file_name training_data.txt\n"
+ expect "all done"
+ send "generate_training_data depth 3 count 100 keep_draws 1 eval_limit 32000 output_file_name training_data/training_data.binpack output_format binpack\n"
+ expect "INFO: Gensfen finished."
+
+ send "quit\n"
+ expect eof
+
+ # return error code of the spawned program, useful for valgrind
+ lassign [wait] pid spawnid os_error_flag value
+ exit \$value
+EOF
+
+# generate_training_data testing 02
+cat << EOF > data_generation02.exp
+ set timeout 240
+ spawn $exeprefix ./stockfish
+
+ send "uci\n"
+ expect "uciok"
+
+ send "setoption name Threads value $threads\n"
+ send "setoption name Use NNUE value true\n"
+ send "isready\n"
+ send "generate_training_data depth 4 count 50 keep_draws 1 eval_limit 32000 output_file_name validation_data/validation_data.bin output_format bin\n"
+ expect "INFO: Gensfen finished."
+ send "generate_training_data depth 4 count 50 keep_draws 1 eval_limit 32000 output_file_name validation_data/validation_data.binpack output_format binpack\n"
+ expect "INFO: Gensfen finished."
+
+ send "quit\n"
+ expect eof
+
+ # return error code of the spawned program, useful for valgrind
+ lassign [wait] pid spawnid os_error_flag value
+ exit \$value
+EOF
+
+for exp in game.exp data_generation01.exe data_generation02.exp
 do
 
   echo "$prefix expect $exp $postfix"
diff --git a/tests/instrumented_learn.sh b/tests/instrumented_learn.sh
deleted file mode 100755
index af8c8b17..00000000
--- a/tests/instrumented_learn.sh
+++ /dev/null
@@ -1,155 +0,0 @@
-#!/bin/bash
-# check for errors under valgrind or sanitizers.
-
-error()
-{
-  echo "instrumented testing failed on line $1"
-  exit 1
-}
-trap 'error ${LINENO}' ERR
-
-# define suitable post and prefixes for testing options
-case $1 in
-  --valgrind)
-    echo "valgrind testing started"
-    prefix=''
-    exeprefix='valgrind --error-exitcode=42'
-    postfix='1>/dev/null'
-    threads="1"
-  ;;
-  --valgrind-thread)
-    echo "valgrind-thread testing started"
-    prefix=''
-    exeprefix='valgrind --fair-sched=try --error-exitcode=42'
-    postfix='1>/dev/null'
-    threads="2"
-  ;;
-  --sanitizer-undefined)
-    echo "sanitizer-undefined testing started"
-    prefix='!'
-    exeprefix=''
-    postfix='2>&1 | grep -A50 "runtime error:"'
-    threads="1"
-  ;;
-  --sanitizer-thread)
-    echo "sanitizer-thread testing started"
-    prefix='!'
-    exeprefix=''
-    postfix='2>&1 | grep -A50 "WARNING: ThreadSanitizer:"'
-    threads="2"
-
-cat << EOF > tsan.supp
-race:TTEntry::move
-race:TTEntry::depth
-race:TTEntry::bound
-race:TTEntry::save
-race:TTEntry::value
-race:TTEntry::eval
-race:TTEntry::is_pv
-
-race:TranspositionTable::probe
-race:TranspositionTable::hashfull
-
-EOF
-
-    export TSAN_OPTIONS="suppressions=./tsan.supp"
-
-  ;;
-  *)
-    echo "unknown testing started"
-    prefix=''
-    exeprefix=''
-    postfix=''
-    threads="1"
-  ;;
-esac
-
-mkdir -p training_data
-mkdir -p validation_data
-
-# gensfen testing 01
-cat << EOF > gensfen01.exp
- set timeout 240
- spawn $exeprefix ./stockfish
-
- send "uci\n"
- expect "uciok"
-
- send "setoption name Threads value $threads\n"
- send "setoption name Use NNUE value false\n"
- send "isready\n"
- send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data/training_data.bin sfen_format bin\n"
- expect "INFO: Gensfen finished."
- send "convert_plain targetfile training_data/training_data.bin output_file_name training_data.txt\n"
- expect "all done"
- send "gensfen depth 3 loop 100 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name training_data/training_data.binpack sfen_format binpack\n"
- expect "INFO: Gensfen finished."
-
- send "quit\n"
- expect eof
-
- # return error code of the spawned program, useful for valgrind
- lassign [wait] pid spawnid os_error_flag value
- exit \$value
-EOF
-
-# gensfen testing 02
-cat << EOF > gensfen02.exp
- set timeout 240
- spawn $exeprefix ./stockfish
-
- send "uci\n"
- expect "uciok"
-
- send "setoption name Threads value $threads\n"
- send "setoption name Use NNUE value true\n"
- send "isready\n"
- send "gensfen depth 4 loop 50 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name validation_data/validation_data.bin sfen_format bin\n"
- expect "INFO: Gensfen finished."
- send "gensfen depth 4 loop 50 use_draw_in_training_data_generation 1 eval_limit 32000 output_file_name validation_data/validation_data.binpack sfen_format binpack\n"
- expect "INFO: Gensfen finished."
-
- send "quit\n"
- expect eof
-
- # return error code of the spawned program, useful for valgrind
- lassign [wait] pid spawnid os_error_flag value
- exit \$value
-EOF
-
-# simple learning
-cat << EOF > learn01.exp
- set timeout 240
- spawn $exeprefix ./stockfish
-
- send "uci\n"
- send "setoption name SkipLoadingEval value true\n"
- send "setoption name Use NNUE value pure\n"
- send "setoption name Threads value $threads\n"
- send "isready\n"
- send "learn targetdir training_data epochs 1 sfen_read_size 100 thread_buffer_size 10 batchsize 100 use_draw_in_training 1 use_draw_in_validation 1 lr 1 eval_limit 32000 nn_batch_size 30 newbob_decay 0.5 eval_save_interval 30 loss_output_interval 10 validation_set_file_name validation_data/validation_data.bin\n"
-
- expect "INFO (save_eval): Finished saving evaluation file in evalsave/final"
-
- send "quit\n"
- expect eof
-
- # return error code of the spawned program, useful for valgrind
- lassign [wait] pid spawnid os_error_flag value
- exit \$value
-
-EOF
-
-for exp in gensfen01.exp gensfen02.exp learn01.exp
-do
-
-  echo "$prefix expect $exp $postfix"
-  eval "$prefix expect $exp $postfix"
-
-  rm $exp
-
-done
-
-rm -f tsan.supp
-
-echo "instrumented learn testing OK"

From c2511ffc7bd6b95e4030aaf046c853a0f476be4a Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 19 Apr 2021 19:05:37 +0200
Subject: [PATCH 565/583] Renaming and small changes.

---
 README.md                                     | 10 ++---
 .../{gensfen.md => generate_training_data.md} | 36 +++++++--------
 ...npv.md => generate_training_data_nonpv.md} | 10 ++---
 src/tools/training_data_generator.cpp         | 45 ++++++++++---------
 src/tools/training_data_generator_nonpv.cpp   | 10 ++---
 5 files changed, 56 insertions(+), 55 deletions(-)
 rename docs/{gensfen.md => generate_training_data.md} (59%)
 rename docs/{gensfen_nonpv.md => generate_training_data_nonpv.md} (75%)

diff --git a/README.md b/README.md
index 6c974a78..690eaf3b 100644
--- a/README.md
+++ b/README.md
@@ -138,7 +138,7 @@ Currently, Stockfish has the following UCI options:
 
 ### Generating Training Data
 
-To generate training data from the classic eval, use the gensfen command with the setting "Use NNUE" set to "false". The given example is generation in its simplest form. There are more commands.
+To generate training data from the classic eval, use the generate_training_data command with the setting "Use NNUE" set to "false". The given example is generation in its simplest form. There are more commands.
 
 ```
 uci
@@ -148,7 +148,7 @@ setoption name Threads value X
 setoption name Hash value Y
 setoption name SyzygyPath value path
 isready
-gensfen depth A count B keep_draws 1 eval_limit 32000
+generate_training_data depth A count B keep_draws 1 eval_limit 32000
 ```
 
 - `A` is the searched depth per move, or how far the engine looks forward. This value is an integer.
@@ -164,9 +164,9 @@ You will also need validation data that is used for loss calculation and accurac
 
 Currently there are 3 training data formats. Two of them are supported directly.
 
-- `.bin` - the original training data format. Uses 40 bytes per entry. Is supported directly by the `gensfen` and `learn` commands.
-- `.plain` - a human readable training data format. This one is not supported directly by the `gensfen` and `learn` commands. It should not be used for data exchange because it's less compact than other formats. It is mostly useful for inspection of the data.
-- `.binpack` - a compact binary training data format that exploits positions chains to further reduce size. It uses on average between 2 to 3 bytes per entry when generating data with `gensfen`. It is supported directly by `gensfen` and `learn` commands. It is currently the default for the `gensfen` command. A more in depth description can be found [here](docs/binpack.md)
+- `.bin` - the original training data format. Uses 40 bytes per entry. Is supported directly by the `generate_training_data` command.
+- `.plain` - a human readable training data format. This one is not supported directly by the `generate_training_data` command. It should not be used for data exchange because it's less compact than other formats. It is mostly useful for inspection of the data.
+- `.binpack` - a compact binary training data format that exploits positions chains to further reduce size. It uses on average between 2 to 3 bytes per entry when generating data with `generate_training_data`. It is supported directly by `generate_training_data` command. It is currently the default for the `generate_training_data` command. A more in depth description can be found [here](docs/binpack.md)
 
 ### Conversion between formats.
 
diff --git a/docs/gensfen.md b/docs/generate_training_data.md
similarity index 59%
rename from docs/gensfen.md
rename to docs/generate_training_data.md
index 48f7f5e7..a02d6e0c 100644
--- a/docs/gensfen.md
+++ b/docs/generate_training_data.md
@@ -1,34 +1,36 @@
-# Gensfen
+# generate_training_data
 
-`gensfen` command allows generation of training data from self-play in a manner that suits training better than traditional games. It introduces random moves to diversify openings, and fixed depth evaluation.
+`generate_training_data` command allows generation of training data from self-play in a manner that suits training better than traditional games. It introduces random moves to diversify openings, and fixed depth evaluation.
 
-As all commands in stockfish `gensfen` can be invoked either from command line (as `stockfish.exe gensfen ...`, but this is not recommended because it's not possible to specify UCI options before `gensfen` executes) or in the interactive prompt.
+As all commands in stockfish `generate_training_data` can be invoked either from command line (as `stockfish.exe generate_training_data ...`, but this is not recommended because it's not possible to specify UCI options before `generate_training_data` executes) or in the interactive prompt.
 
 It is recommended to set the `PruneAtShallowDepth` UCI option to `false` as it will increase the quality of fixed depth searches.
 
 It is recommended to keep the `EnableTranspositionTable` UCI option at the default `true` value as it will make the generation process faster without noticably harming the uniformity of the data.
 
-`gensfen` takes named parameters in the form of `gensfen param_1_name param_1_value param_2_name param_2_value ...`.
+`generate_training_data` takes named parameters in the form of `generate_training_data param_1_name param_1_value param_2_name param_2_value ...`.
 
 Currently the following options are available:
 
 `set_recommended_uci_options` - this is a modifier not a parameter, no value follows it. If specified then some UCI options are set to recommended values.
 
-`depth` - minimum depth of evaluation of each position. Default: 3.
+`depth` - sets minimum and maximum depth of evaluation of each position. Default: 3.
 
-`depth2` - maximum depth of evaluation of each position. If not specified then the same as `depth`.
+`mindepth` - minimum depth of evaluation of each position. If not specified then the same as `depth`.
+
+`maxdepth` - minimum depth of evaluation of each position. If not specified then the same as `depth`.
 
 `nodes` - the number of nodes to use for evaluation of each position. This number is multiplied by the number of PVs of the current search. This does NOT override the `depth` and `depth2` options. If specified then whichever of depth or nodes limit is reached first applies.
 
-`loop` - the number of training data entries to generate. 1 entry == 1 position. Default: 8000000000 (8B).
+`count` - the number of training data entries to generate. 1 entry == 1 position. Default: 8000000000 (8B).
 
 `output_file_name` - the name of the file to output to. If the extension is not present or doesn't match the selected training data format the right extension will be appened. Default: generated_kifu
 
 `eval_limit` - evaluations with higher absolute value than this will not be written and will terminate a self-play game. Should not exceed 10000 which is VALUE_KNOWN_WIN, but is only hardcapped at mate in 2 (\~30000). Default: 3000
 
-`random_move_minply` - the minimal ply at which a random move may be executed instead of a move chosen by search. Default: 1.
+`random_move_min_ply` - the minimal ply at which a random move may be executed instead of a move chosen by search. Default: 1.
 
-`random_move_maxply` - the maximal ply at which a random move may be executed instead of a move chosen by search. Default: 24.
+`random_move_max_ply` - the maximal ply at which a random move may be executed instead of a move chosen by search. Default: 24.
 
 `random_move_count` - maximum number of random moves in a single self-play game. Default: 5.
 
@@ -40,9 +42,9 @@ Currently the following options are available:
 
 `random_multi_pv_depth` - the depth to use for multiPV search for random move. Default: `depth2`.
 
-`write_minply` - minimum ply for which the training data entry will be emitted. Default: 16.
+`write_min_ply` - minimum ply for which the training data entry will be emitted. Default: 16.
 
-`write_maxply` - maximum ply for which the training data entry will be emitted. Default: 400.
+`write_max_ply` - maximum ply for which the training data entry will be emitted. Default: 400.
 
 `book` - a path to an opening book to use for the starting positions. Currently only .epd format is supported. If not specified then the starting position is always the standard chess starting position.
 
@@ -50,17 +52,13 @@ Currently the following options are available:
 
 `random_file_name` - if specified then the output filename will be chosen randomly. Overrides `output_file_name`.
 
-`write_out_draw_game_in_training_data_generation` - either 0 or 1. If 1 then training data from drawn games will be emitted too. Default: 1.
+`keep_draws` - either 0 or 1. If 1 then training data from drawn games will be emitted too. Default: 1.
 
-`use_draw_in_training_data_generation` - deprecated, alias for `write_out_draw_game_in_training_data_generation`
+`adjudicate_draws_by_score` - either 0 or 1. If 1 then drawn games will be adjudicated when the score remains 0 for at least 8 plies after ply 80. Default: 1.
 
-`detect_draw_by_consecutive_low_score` - either 0 or 1. If 1 then drawn games will be adjudicated when the score remains 0 for at least 8 plies after ply 80. Default: 1.
+`adjudicate_draws_by_insufficient_mating_material` - either 0 or 1. If 1 then position with insufficient material will be adjudicated as draws. Default: 1.
 
-`use_game_draw_adjudication` - deprecated, alias for `detect_draw_by_consecutive_low_score`
-
-`detect_draw_by_insufficient_mating_material` - either 0 or 1. If 1 then position with insufficient material will be adjudicated as draws. Default: 1.
-
-`sfen_format` - format of the training data to use. Either `bin` or `binpack`. Default: `binpack`.
+`data_format` - format of the training data to use. Either `bin` or `binpack`. Default: `binpack`.
 
 `ensure_quiet` - this is a flag option. When specified the positions will be from the qsearch leaf.
 
diff --git a/docs/gensfen_nonpv.md b/docs/generate_training_data_nonpv.md
similarity index 75%
rename from docs/gensfen_nonpv.md
rename to docs/generate_training_data_nonpv.md
index 0814bd60..5e4e0ee5 100644
--- a/docs/gensfen_nonpv.md
+++ b/docs/generate_training_data_nonpv.md
@@ -1,14 +1,14 @@
-# Gensfen NonPV
+# generate_training_data_nonpv
 
-`gensfen_nonpv` command allows generation of training data from self-play in a manner that suits training better than traditional games. It plays fixed nodes self play games for exploration and records [some of] the evaluated positions. Then rescores them with fixed depth search.
+`generate_training_data_nonpv` command allows generation of training data from self-play in a manner that suits training better than traditional games. It plays fixed nodes self play games for exploration and records [some of] the evaluated positions. Then rescores them with fixed depth search.
 
-As all commands in stockfish `gensfen_nonpv` can be invoked either from command line (as `stockfish.exe gensfen_nonpv ...`, but this is not recommended because it's not possible to specify UCI options before `gensfen_nonpv` executes) or in the interactive prompt.
+As all commands in stockfish `generate_training_data_nonpv` can be invoked either from command line (as `stockfish.exe generate_training_data_nonpv ...`, but this is not recommended because it's not possible to specify UCI options before `generate_training_data_nonpv` executes) or in the interactive prompt.
 
 It is recommended to set the `PruneAtShallowDepth` UCI option to `false` as it will increase the quality of fixed depth searches.
 
 It is recommended to keep the `EnableTranspositionTable` UCI option at the default `true` value as it will make the generation process faster without noticably harming the uniformity of the data.
 
-`gensfen_nonpv` takes named parameters in the form of `gensfen_nonpv param_1_name param_1_value param_2_name param_2_value ...`.
+`generate_training_data_nonpv` takes named parameters in the form of `generate_training_data_nonpv param_1_name param_1_value param_2_name param_2_value ...`.
 
 Currently the following options are available:
 
@@ -36,6 +36,6 @@ Currently the following options are available:
 
 `book` - a path to an opening book to use for the starting positions. Currently only .epd format is supported. If not specified then the starting position is always the standard chess starting position.
 
-`sfen_format` - format of the training data to use. Either `bin` or `binpack`. Default: `binpack`.
+`data_format` - format of the training data to use. Either `bin` or `binpack`. Default: `binpack`.
 
 `seed` - seed for the PRNG. Can be either a number or a string. If it's a string then its hash will be used. If not specified then the current time will be used.
diff --git a/src/tools/training_data_generator.cpp b/src/tools/training_data_generator.cpp
index 10f84fe5..746de279 100644
--- a/src/tools/training_data_generator.cpp
+++ b/src/tools/training_data_generator.cpp
@@ -83,7 +83,7 @@ namespace Stockfish::Tools
 
             uint64_t save_every = std::numeric_limits<uint64_t>::max();
 
-            std::string output_file_name = "generated_kifu";
+            std::string output_file_name = "training_data";
 
             SfenOutputType sfen_format = SfenOutputType::Binpack;
 
@@ -831,20 +831,25 @@ namespace Stockfish::Tools
                 break;
 
             if (token == "depth")
+            {
+                is >> params.search_depth_min;
+                params.search_depth_max = params.search_depth_min;
+            }
+            else if (token == "min_depth")
+                is >> params.search_depth_min;
+            else if (token == "max_depth")
                 is >> params.search_depth_min;
-            else if (token == "depth2")
-                is >> params.search_depth_max;
             else if (token == "nodes")
                 is >> params.nodes;
-            else if (token == "loop")
+            else if (token == "count")
                 is >> loop_max;
             else if (token == "output_file_name")
                 is >> params.output_file_name;
             else if (token == "eval_limit")
                 is >> params.eval_limit;
-            else if (token == "random_move_minply")
+            else if (token == "random_move_min_ply")
                 is >> params.random_move_minply;
-            else if (token == "random_move_maxply")
+            else if (token == "random_move_max_ply")
                 is >> params.random_move_maxply;
             else if (token == "random_move_count")
                 is >> params.random_move_count;
@@ -856,9 +861,9 @@ namespace Stockfish::Tools
                 is >> params.random_multi_pv_diff;
             else if (token == "random_multi_pv_depth")
                 is >> params.random_multi_pv_depth;
-            else if (token == "write_minply")
+            else if (token == "write_min_ply")
                 is >> params.write_minply;
-            else if (token == "write_maxply")
+            else if (token == "write_max_ply")
                 is >> params.write_maxply;
             else if (token == "save_every")
                 is >> params.save_every;
@@ -866,15 +871,13 @@ namespace Stockfish::Tools
                 is >> params.book;
             else if (token == "random_file_name")
                 is >> random_file_name;
-            // Accept also the old option name.
-            else if (token == "use_draw_in_training_data_generation" || token == "write_out_draw_game_in_training_data_generation")
+            else if (token == "keep_draws")
                 is >> params.write_out_draw_game_in_training_data_generation;
-            // Accept also the old option name.
-            else if (token == "use_game_draw_adjudication" || token == "detect_draw_by_consecutive_low_score")
+            else if (token == "adjudicate_draws_by_score")
                 is >> params.detect_draw_by_consecutive_low_score;
-            else if (token == "detect_draw_by_insufficient_mating_material")
+            else if (token == "adjudicate_draws_by_insufficient_material")
                 is >> params.detect_draw_by_insufficient_mating_material;
-            else if (token == "sfen_format")
+            else if (token == "data_format")
                 is >> sfen_format;
             else if (token == "seed")
                 is >> params.seed;
@@ -934,25 +937,25 @@ namespace Stockfish::Tools
 
         params.enforce_constraints();
 
-        std::cout << "INFO: Executing gensfen command\n";
+        std::cout << "INFO: Executing generate_training_data command\n";
 
         std::cout << "INFO: Parameters:\n";
         std::cout
             << "  - search_depth_min       = " << params.search_depth_min << endl
             << "  - search_depth_max       = " << params.search_depth_max << endl
             << "  - nodes                  = " << params.nodes << endl
-            << "  - num sfens to generate  = " << loop_max << endl
+            << "  - count                  = " << loop_max << endl
             << "  - eval_limit             = " << params.eval_limit << endl
             << "  - num threads (UCI)      = " << params.num_threads << endl
-            << "  - random_move_minply     = " << params.random_move_minply << endl
-            << "  - random_move_maxply     = " << params.random_move_maxply << endl
+            << "  - random_move_min_ply    = " << params.random_move_minply << endl
+            << "  - random_move_max_ply    = " << params.random_move_maxply << endl
             << "  - random_move_count      = " << params.random_move_count << endl
             << "  - random_move_like_apery = " << params.random_move_like_apery << endl
             << "  - random_multi_pv        = " << params.random_multi_pv << endl
             << "  - random_multi_pv_diff   = " << params.random_multi_pv_diff << endl
             << "  - random_multi_pv_depth  = " << params.random_multi_pv_depth << endl
-            << "  - write_minply           = " << params.write_minply << endl
-            << "  - write_maxply           = " << params.write_maxply << endl
+            << "  - write_min_ply          = " << params.write_minply << endl
+            << "  - write_max_ply          = " << params.write_maxply << endl
             << "  - book                   = " << params.book << endl
             << "  - output_file_name       = " << params.output_file_name << endl
             << "  - save_every             = " << params.save_every << endl
@@ -969,6 +972,6 @@ namespace Stockfish::Tools
         TrainingDataGenerator gensfen(params);
         gensfen.generate(loop_max);
 
-        std::cout << "INFO: TrainingDataGenerator finished." << endl;
+        std::cout << "INFO: generate_training_data finished." << endl;
     }
 }
diff --git a/src/tools/training_data_generator_nonpv.cpp b/src/tools/training_data_generator_nonpv.cpp
index 247c18a4..3db95033 100644
--- a/src/tools/training_data_generator_nonpv.cpp
+++ b/src/tools/training_data_generator_nonpv.cpp
@@ -61,7 +61,7 @@ namespace Stockfish::Tools
 
             int exploration_min_pieces = 8;
 
-            std::string output_file_name = "generated_gensfen_nonpv";
+            std::string output_file_name = "training_data_nonpv";
 
             SfenOutputType sfen_format = SfenOutputType::Binpack;
 
@@ -426,7 +426,7 @@ namespace Stockfish::Tools
                 is >> params.exploration_save_rate;
             else if (token == "book")
                 is >> params.book;
-            else if (token == "sfen_format")
+            else if (token == "data_format")
                 is >> sfen_format;
             else if (token == "seed")
                 is >> params.seed;
@@ -458,7 +458,7 @@ namespace Stockfish::Tools
 
         params.enforce_constraints();
 
-        std::cout << "INFO: Executing gensfen_nonpv command\n";
+        std::cout << "INFO: Executing generate_training_data_nonpv command\n";
 
         std::cout << "INFO: Parameters:\n";
         std::cout
@@ -471,7 +471,7 @@ namespace Stockfish::Tools
             << "  - exploration_min_pieces = " << params.exploration_min_pieces << endl
             << "  - exploration_save_rate  = " << params.exploration_save_rate << endl
             << "  - book                   = " << params.book << endl
-            << "  - sfen_format            = " << sfen_format << endl
+            << "  - data_format            = " << sfen_format << endl
             << "  - seed                   = " << params.seed << endl
             << "  - count                  = " << count << endl;
 
@@ -483,6 +483,6 @@ namespace Stockfish::Tools
         TrainingDataGeneratorNonPv gensfen(params);
         gensfen.generate(count);
 
-        std::cout << "INFO: gensfen_nonpv finished." << endl;
+        std::cout << "INFO: generate_training_data_nonpv finished." << endl;
     }
 }

From 2421a88a540cf28662c98bd36178525c6f5339db Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 13 May 2021 11:00:48 +0200
Subject: [PATCH 566/583] Post merge fixes

---
 src/evaluate.cpp                    | 38 ++++++++++++++---------------
 src/nnue/nnue_common.h              |  2 +-
 src/nnue/nnue_feature_transformer.h |  1 +
 3 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index b7e0bb82..6aef5b88 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -127,28 +127,28 @@ namespace Eval {
           }
     }
 
-  void NNUE::export_net(const std::optional<std::string>& filename) {
-    std::string actualFilename;
-    if (filename.has_value()) {
-      actualFilename = filename.value();
-    } else {
-      if (eval_file_loaded != EvalFileDefaultName) {
-        sync_cout << "Failed to export a net. A non-embedded net can only be saved if the filename is specified." << sync_endl;
-        return;
+    void export_net(const std::optional<std::string>& filename) {
+      std::string actualFilename;
+      if (filename.has_value()) {
+        actualFilename = filename.value();
+      } else {
+        if (eval_file_loaded != EvalFileDefaultName) {
+          sync_cout << "Failed to export a net. A non-embedded net can only be saved if the filename is specified." << sync_endl;
+          return;
+        }
+        actualFilename = EvalFileDefaultName;
+      }
+
+      ofstream stream(actualFilename, std::ios_base::binary);
+      if (save_eval(stream)) {
+          sync_cout << "Network saved successfully to " << actualFilename << "." << sync_endl;
+      } else {
+          sync_cout << "Failed to export a net." << sync_endl;
       }
-      actualFilename = EvalFileDefaultName;
     }
 
-    ofstream stream(actualFilename, std::ios_base::binary);
-    if (save_eval(stream)) {
-        sync_cout << "Network saved successfully to " << actualFilename << "." << sync_endl;
-    } else {
-        sync_cout << "Failed to export a net." << sync_endl;
-    }
-  }
-
-  /// NNUE::verify() verifies that the last net used was loaded successfully
-  void NNUE::verify() {
+    /// NNUE::verify() verifies that the last net used was loaded successfully
+    void verify() {
 
       string eval_file = string(Options["EvalFile"]);
 
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index 9fc7b0e9..eb902a4f 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -21,7 +21,7 @@
 #ifndef NNUE_COMMON_H_INCLUDED
 #define NNUE_COMMON_H_INCLUDED
 
-#include "types.h"
+#include "../types.h"
 
 #include <cstring>
 #include <iostream>
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index a4a8e98f..373367d6 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -25,6 +25,7 @@
 #include "nnue_architecture.h"
 
 #include "../misc.h"
+#include "../position.h"
 
 #include <cstring> // std::memset()
 

From 201d324187f3567d8c686e38e83eb816686b6a36 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 14 May 2021 17:45:39 +0200
Subject: [PATCH 567/583] Add . as an additional include directory both for
 .depend and for the build.

---
 src/Makefile | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 45ae7e5f..929c8dfc 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -320,8 +320,9 @@ endif
 ### ==========================================================================
 
 ### 3.1 Selecting compiler (default = gcc)
-CXXFLAGS += -Wall -Wcast-qual -fno-exceptions -std=c++17 -I. $(EXTRACXXFLAGS)
-DEPENDFLAGS += -std=c++17
+ADDITIONAL_INCLUDE_DIRECTORIES = -I.
+CXXFLAGS += -Wall -Wcast-qual -fno-exceptions -std=c++17 $(ADDITIONAL_INCLUDE_DIRECTORIES) $(EXTRACXXFLAGS)
+DEPENDFLAGS += -std=c++17 $(ADDITIONAL_INCLUDE_DIRECTORIES)
 LDFLAGS += $(EXTRALDFLAGS)
 
 ifeq ($(COMP),)

From ddcfaa06fa6757c8639597f67b40a38e38917ca7 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 17 May 2021 11:35:36 +0200
Subject: [PATCH 568/583] Don't ignore unknown options, don't execute the
 command instead.

---
 src/tools/training_data_generator.cpp       |  5 ++++-
 src/tools/training_data_generator_nonpv.cpp |  5 ++++-
 src/tools/transform.cpp                     | 10 ++++++++++
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/tools/training_data_generator.cpp b/src/tools/training_data_generator.cpp
index 746de279..a8804fef 100644
--- a/src/tools/training_data_generator.cpp
+++ b/src/tools/training_data_generator.cpp
@@ -896,7 +896,10 @@ namespace Stockfish::Tools
                 params.ensure_quiet = true;
             }
             else
-                cout << "ERROR: Ignoring unknown option " << token << endl;
+            {
+                cout << "ERROR: Unknown option " << token << ". Exiting...\n";
+                return;
+            }
         }
 
         if (!sfen_format.empty())
diff --git a/src/tools/training_data_generator_nonpv.cpp b/src/tools/training_data_generator_nonpv.cpp
index 3db95033..e8df9c50 100644
--- a/src/tools/training_data_generator_nonpv.cpp
+++ b/src/tools/training_data_generator_nonpv.cpp
@@ -443,7 +443,10 @@ namespace Stockfish::Tools
                 UCI::setoption("EnableTranspositionTable", "true");
             }
             else
-                cout << "ERROR: Ignoring unknown option " << token << endl;
+            {
+                cout << "ERROR: Unknown option " << token << ". Exiting...\n";
+                return;
+            }
         }
 
         if (!sfen_format.empty())
diff --git a/src/tools/transform.cpp b/src/tools/transform.cpp
index 0b7f5a27..ab7a3db8 100644
--- a/src/tools/transform.cpp
+++ b/src/tools/transform.cpp
@@ -210,6 +210,11 @@ namespace Stockfish::Tools
                 is >> params.input_filename;
             else if (token == "output_file")
                 is >> params.output_filename;
+            else
+            {
+                std::cout << "ERROR: Unknown option " << token << ". Exiting...\n";
+                return;
+            }
         }
 
         std::cout << "Performing transform nudged_static with parameters:\n";
@@ -474,6 +479,11 @@ namespace Stockfish::Tools
                 is >> params.keep_moves;
             else if (token == "research_count")
                 is >> params.research_count;
+            else
+            {
+                std::cout << "ERROR: Unknown option " << token << ". Exiting...\n";
+                return;
+            }
         }
 
         params.enforce_constraints();

From 8634a5d02127ffbdb50c530ab3203219f45afb5a Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Tue, 18 May 2021 15:31:56 +0200
Subject: [PATCH 569/583] Improve gather_statistics output structure.

---
 src/tools/stats.cpp | 355 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 297 insertions(+), 58 deletions(-)

diff --git a/src/tools/stats.cpp b/src/tools/stats.cpp
index c154fc10..1172f18b 100644
--- a/src/tools/stats.cpp
+++ b/src/tools/stats.cpp
@@ -24,16 +24,245 @@
 #include <limits>
 #include <mutex>
 #include <optional>
+#include <type_traits>
 
 namespace Stockfish::Tools::Stats
 {
+    struct Indentation
+    {
+        char character = ' ';
+        int width_per_indent = 4;
+        int num_indents = 0;
+
+        [[nodiscard]] Indentation next() const
+        {
+            return Indentation{ character, width_per_indent, num_indents + 1 };
+        }
+
+        [[nodiscard]] std::string to_string() const
+        {
+            return std::string(num_indents * width_per_indent, character);
+        }
+    };
+
+    template <typename IntT>
+    [[nodiscard]] int get_num_base_10_digits(IntT v)
+    {
+        int digits = 1;
+        while (v != 0)
+        {
+            digits += 1;
+            v /= 10;
+        }
+        return digits;
+    }
+
+    [[nodiscard]] std::string indent_text(const std::string& text, Indentation indent)
+    {
+        std::string delimiter = "\n";
+        std::string indent_str = indent.to_string();
+
+        std::string indented;
+
+        std::string::size_type pos = 0;
+        std::string::size_type prev = 0;
+        while ((pos = text.find(delimiter, prev)) != std::string::npos)
+        {
+            std::string line = text.substr(prev, pos - prev);
+            indented += indent_str + line + delimiter;
+            prev = pos + delimiter.size();
+        }
+
+        {
+            std::string line = text.substr(prev);
+            indented += indent_str + line;
+        }
+
+        return indented;
+    }
+
+    struct IndentedTextBlock
+    {
+        Indentation indentation;
+        std::string text;
+
+        IndentedTextBlock(Indentation indent, std::string str) :
+            indentation(indent),
+            text(std::move(str))
+        {
+        }
+
+        [[nodiscard]] static std::string join(const std::vector<IndentedTextBlock>& blocks, const std::string& delimiter)
+        {
+            std::string result;
+
+            bool is_first = true;
+            for (auto&& [indentation, text] : blocks)
+            {
+                if (!is_first)
+                {
+                    result += delimiter;
+                }
+
+                result += indent_text(text, indentation);
+
+                is_first = false;
+            }
+
+            return result;
+        }
+    };
+
+    struct StatisticOutputEntryNode
+    {
+        [[nodiscard]] const std::vector<std::unique_ptr<StatisticOutputEntryNode>>& get_children() const
+        {
+            return m_children;
+        }
+
+        template <typename NodeT, typename... Ts>
+        StatisticOutputEntryNode& emplace_child(Ts&&... args)
+        {
+            return *(m_children.emplace_back(std::make_unique<NodeT>(std::forward<Ts>(args)...)));
+        }
+
+        template <typename NodeT>
+        StatisticOutputEntryNode& add_child(std::unique_ptr<NodeT>&& node)
+        {
+            return *(m_children.emplace_back(std::move(node)));
+        }
+
+        [[nodiscard]] virtual std::vector<IndentedTextBlock> to_indented_text_blocks(Indentation indent) const = 0;
+
+    protected:
+        std::vector<std::unique_ptr<StatisticOutputEntryNode>> m_children;
+
+        void add_indented_children_blocks(std::vector<IndentedTextBlock>& blocks, Indentation indent) const
+        {
+            for (auto&& child : m_children)
+            {
+                auto part = child->to_indented_text_blocks(indent.next());
+                blocks.insert(blocks.end(), part.begin(), part.end());
+            }
+        }
+    };
+
+    struct StatisticOutputEntryHeader : StatisticOutputEntryNode
+    {
+        StatisticOutputEntryHeader(const std::string& text) :
+            m_text(text)
+        {
+        }
+
+        [[nodiscard]] virtual std::vector<IndentedTextBlock> to_indented_text_blocks(Indentation indent) const override
+        {
+            std::vector<IndentedTextBlock> blocks;
+
+            blocks.emplace_back(indent, m_text);
+
+            this->add_indented_children_blocks(blocks, indent);
+
+            return blocks;
+        }
+
+    private:
+        std::string m_text;
+    };
+
+    template <typename T>
+    struct StatisticOutputEntryValue : StatisticOutputEntryNode
+    {
+        StatisticOutputEntryValue(const std::string& name, const T& value, bool value_in_new_line = false) :
+            m_value(name, value),
+            m_value_in_new_line(value_in_new_line)
+        {
+        }
+
+        [[nodiscard]] virtual std::vector<IndentedTextBlock> to_indented_text_blocks(Indentation indent) const override
+        {
+            std::vector<IndentedTextBlock> blocks;
+
+            std::string value_str;
+            if constexpr (std::is_same_v<T, std::string>)
+            {
+                value_str = m_value.second;
+            }
+            else
+            {
+                value_str = std::to_string(m_value.second);
+            }
+
+            if (m_value_in_new_line)
+            {
+                blocks.emplace_back(indent, m_value.first + ": ");
+                blocks.emplace_back(indent.next(), value_str);
+            }
+            else
+            {
+                blocks.emplace_back(indent, m_value.first + ": " + value_str);
+            }
+
+            this->add_indented_children_blocks(blocks, indent);
+
+            return blocks;
+        }
+
+    private:
+        std::pair<std::string, T> m_value;
+        bool m_value_in_new_line;
+    };
+
+    struct StatisticOutput
+    {
+        template <typename NodeT, typename... Ts>
+        StatisticOutputEntryNode& emplace_node(Ts&&... args)
+        {
+            return *(m_nodes.emplace_back(std::make_unique<NodeT>(std::forward<Ts>(args)...)));
+        }
+
+        template <typename NodeT>
+        StatisticOutputEntryNode& add_child(std::unique_ptr<NodeT>&& node)
+        {
+            return *(m_nodes.emplace_back(std::move(node)));
+        }
+
+        [[nodiscard]] const std::vector<std::unique_ptr<StatisticOutputEntryNode>>& get_nodes() const
+        {
+            return m_nodes;
+        }
+
+        void add(StatisticOutput&& other)
+        {
+            for (auto&& node : other.m_nodes)
+            {
+                m_nodes.emplace_back(std::move(node));
+            }
+        }
+
+        [[nodiscard]] std::string to_string() const
+        {
+            std::vector<IndentedTextBlock> blocks;
+
+            for (auto&& node : m_nodes)
+            {
+                auto part = node->to_indented_text_blocks(Indentation{});
+                blocks.insert(blocks.end(), part.begin(), part.end());
+            }
+
+            return IndentedTextBlock::join(blocks, "\n");
+        }
+
+    private:
+        std::vector<std::unique_ptr<StatisticOutputEntryNode>> m_nodes;
+    };
+
     struct StatisticGathererBase
     {
         virtual void on_position(const Position&) {}
         virtual void on_move(const Position&, const Move&) {}
         virtual void reset() = 0;
         [[nodiscard]] virtual const std::string& get_name() const = 0;
-        [[nodiscard]] virtual std::vector<std::pair<std::string, std::string>> get_formatted_stats() const = 0;
+        [[nodiscard]] virtual StatisticOutput get_output() const = 0;
     };
 
     struct StatisticGathererFactoryBase
@@ -104,21 +333,20 @@ namespace Stockfish::Tools::Stats
             }
         }
 
-        [[nodiscard]] virtual const std::string& get_name() const override
+        [[nodiscard]] const std::string& get_name() const override
         {
             static std::string name = "SET";
             return name;
         }
 
-        [[nodiscard]] virtual std::vector<std::pair<std::string, std::string>> get_formatted_stats() const override
+        [[nodiscard]] StatisticOutput get_output() const override
         {
-            std::vector<std::pair<std::string, std::string>> parts;
+            StatisticOutput out;
             for (auto&& s : m_gatherers)
             {
-                auto part = s->get_formatted_stats();
-                parts.insert(parts.end(), part.begin(), part.end());
+                out.add(s->get_output());
             }
-            return parts;
+            return out;
         }
 
     private:
@@ -190,16 +418,27 @@ namespace Stockfish::Tools::Stats
             return m_squares[sq];
         }
 
-        [[nodiscard]] std::string get_formatted_stats() const
+        [[nodiscard]] std::unique_ptr<StatisticOutputEntryNode> get_output_node(const std::string& name) const
         {
+            int max_digits = 1;
+            for (int i = 0; i < SQUARE_NB; ++i)
+            {
+                const int d = get_num_base_10_digits(m_squares[i]);
+                if (d > max_digits)
+                {
+                    max_digits = d;
+                }
+            }
+
             std::stringstream ss;
             for (int i = 0; i < SQUARE_NB; ++i)
             {
-                ss << std::setw(8) << m_squares[i ^ (int)SQ_A8] << ' ';
+                ss << std::setw(max_digits) << m_squares[i ^ (int)SQ_A8] << ' ';
                 if ((i + 1) % 8 == 0)
                     ss << '\n';
             }
-            return ss.str();
+
+            return std::make_unique<StatisticOutputEntryValue<std::string>>(name, ss.str(), true);
         }
 
     private:
@@ -234,11 +473,11 @@ namespace Stockfish::Tools::Stats
             return name;
         }
 
-        [[nodiscard]] std::vector<std::pair<std::string, std::string>> get_formatted_stats() const override
+        [[nodiscard]] StatisticOutput get_output() const override
         {
-            return {
-                { "Number of positions", std::to_string(m_num_positions) }
-            };
+            StatisticOutput out;
+            out.emplace_node<StatisticOutputEntryValue<std::uint64_t>>("Number of positions", m_num_positions);
+            return out;
         }
 
     private:
@@ -273,12 +512,13 @@ namespace Stockfish::Tools::Stats
             return name;
         }
 
-        [[nodiscard]] std::vector<std::pair<std::string, std::string>> get_formatted_stats() const override
+        [[nodiscard]] StatisticOutput get_output() const override
         {
-            return {
-                { "White king squares", '\n' + m_white.get_formatted_stats() },
-                { "Black king squares", '\n' + m_black.get_formatted_stats() }
-            };
+            StatisticOutput out;
+            auto& header = out.emplace_node<StatisticOutputEntryHeader>("King square distribution:");
+            header.add_child(m_white.get_output_node("White king squares"));
+            header.add_child(m_black.get_output_node("Black king squares"));
+            return out;
         }
 
     private:
@@ -316,12 +556,13 @@ namespace Stockfish::Tools::Stats
             return name;
         }
 
-        [[nodiscard]] std::vector<std::pair<std::string, std::string>> get_formatted_stats() const override
+        [[nodiscard]] StatisticOutput get_output() const override
         {
-            return {
-                { "White move from squares", '\n' + m_white.get_formatted_stats() },
-                { "Black move from squares", '\n' + m_black.get_formatted_stats() }
-            };
+            StatisticOutput out;
+            auto& header = out.emplace_node<StatisticOutputEntryHeader>("Move from square distribution:");
+            header.add_child(m_white.get_output_node("White move from squares"));
+            header.add_child(m_black.get_output_node("Black move from squares"));
+            return out;
         }
 
     private:
@@ -359,12 +600,13 @@ namespace Stockfish::Tools::Stats
             return name;
         }
 
-        [[nodiscard]] std::vector<std::pair<std::string, std::string>> get_formatted_stats() const override
+        [[nodiscard]] StatisticOutput get_output() const override
         {
-            return {
-                { "White move to squares", '\n' + m_white.get_formatted_stats() },
-                { "Black move to squares", '\n' + m_black.get_formatted_stats() }
-            };
+            StatisticOutput out;
+            auto& header = out.emplace_node<StatisticOutputEntryHeader>("Move to square distribution:");
+            header.add_child(m_white.get_output_node("White move to squares"));
+            header.add_child(m_black.get_output_node("Black move to squares"));
+            return out;
         }
 
     private:
@@ -419,16 +661,17 @@ namespace Stockfish::Tools::Stats
             return name;
         }
 
-        [[nodiscard]] std::vector<std::pair<std::string, std::string>> get_formatted_stats() const override
+        [[nodiscard]] StatisticOutput get_output() const override
         {
-            return {
-                { "Total moves", std::to_string(m_total) },
-                { "Normal moves", std::to_string(m_normal) },
-                { "Capture moves", std::to_string(m_capture) },
-                { "Promotion moves", std::to_string(m_promotion) },
-                { "Castling moves", std::to_string(m_castling) },
-                { "En-passant moves", std::to_string(m_enpassant) }
-            };
+            StatisticOutput out;
+            auto& header = out.emplace_node<StatisticOutputEntryHeader>("Number of moves by type:");
+            header.emplace_child<StatisticOutputEntryValue<std::uint64_t>>("Total", m_total);
+            header.emplace_child<StatisticOutputEntryValue<std::uint64_t>>("Normal", m_normal);
+            header.emplace_child<StatisticOutputEntryValue<std::uint64_t>>("Capture", m_capture);
+            header.emplace_child<StatisticOutputEntryValue<std::uint64_t>>("Promotion", m_promotion);
+            header.emplace_child<StatisticOutputEntryValue<std::uint64_t>>("Castling", m_castling);
+            header.emplace_child<StatisticOutputEntryValue<std::uint64_t>>("En-passant", m_enpassant);
+            return out;
         }
 
     private:
@@ -465,9 +708,10 @@ namespace Stockfish::Tools::Stats
             return name;
         }
 
-        [[nodiscard]] std::vector<std::pair<std::string, std::string>> get_formatted_stats() const override
+        [[nodiscard]] StatisticOutput get_output() const override
         {
-            std::vector<std::pair<std::string, std::string>> result;
+            StatisticOutput out;
+            auto& header = out.emplace_node<StatisticOutputEntryHeader>("Number of positions by piece count:");
             bool do_write = false;
             for (int i = SQUARE_NB - 1; i >= 0; --i)
             {
@@ -477,13 +721,10 @@ namespace Stockfish::Tools::Stats
                 // Start writing when the first non-zero number pops up.
                 if (do_write)
                 {
-                    result.emplace_back(
-                        std::string("Number of positions with ") + std::to_string(i) + " pieces",
-                        std::to_string(m_piece_count_hist[i])
-                    );
+                    header.emplace_child<StatisticOutputEntryValue<std::uint64_t>>(std::to_string(i), m_piece_count_hist[i]);
                 }
             }
-            return result;
+            return out;
         }
 
     private:
@@ -515,16 +756,17 @@ namespace Stockfish::Tools::Stats
             return name;
         }
 
-        [[nodiscard]] std::vector<std::pair<std::string, std::string>> get_formatted_stats() const override
+        [[nodiscard]] StatisticOutput get_output() const override
         {
-            return {
-                { "Pawn moves", std::to_string(m_moved_piece_type_hist[PAWN]) },
-                { "Knight moves", std::to_string(m_moved_piece_type_hist[KNIGHT]) },
-                { "Bishop moves", std::to_string(m_moved_piece_type_hist[BISHOP]) },
-                { "Rook moves", std::to_string(m_moved_piece_type_hist[ROOK]) },
-                { "Queen moves", std::to_string(m_moved_piece_type_hist[QUEEN]) },
-                { "King moves", std::to_string(m_moved_piece_type_hist[KING]) }
-            };
+            StatisticOutput out;
+            auto& header = out.emplace_node<StatisticOutputEntryHeader>("Number of moves by piece type:");
+            header.emplace_child<StatisticOutputEntryValue<std::uint64_t>>("Pawn", m_moved_piece_type_hist[PAWN]);
+            header.emplace_child<StatisticOutputEntryValue<std::uint64_t>>("Knight", m_moved_piece_type_hist[KNIGHT]);
+            header.emplace_child<StatisticOutputEntryValue<std::uint64_t>>("Bishop", m_moved_piece_type_hist[BISHOP]);
+            header.emplace_child<StatisticOutputEntryValue<std::uint64_t>>("Rook", m_moved_piece_type_hist[ROOK]);
+            header.emplace_child<StatisticOutputEntryValue<std::uint64_t>>("Queen", m_moved_piece_type_hist[QUEEN]);
+            header.emplace_child<StatisticOutputEntryValue<std::uint64_t>>("King", m_moved_piece_type_hist[KING]);
+            return out;
         }
 
     private:
@@ -606,10 +848,7 @@ namespace Stockfish::Tools::Stats
         std::cout << "Finished gathering statistics.\n\n";
         std::cout << "Results:\n\n";
 
-        for (auto&& [name, value] : statistic_gatherers.get_formatted_stats())
-        {
-            std::cout << name << ": " << value << '\n';
-        }
+        std::cout << statistic_gatherers.get_output().to_string();
     }
 
     void gather_statistics(std::istringstream& is)

From a4b598060c0618e8093e71a9ff37f2ed86ca3521 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 19 May 2021 12:55:14 +0200
Subject: [PATCH 570/583] Add stats: ply_discontinuities, material_imbalance,
 results

---
 src/tools/stats.cpp | 226 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 197 insertions(+), 29 deletions(-)

diff --git a/src/tools/stats.cpp b/src/tools/stats.cpp
index 1172f18b..d8274f65 100644
--- a/src/tools/stats.cpp
+++ b/src/tools/stats.cpp
@@ -57,6 +57,18 @@ namespace Stockfish::Tools::Stats
         return digits;
     }
 
+    [[nodiscard]] std::string left_pad_to_length(const std::string& str, char ch, int length)
+    {
+        if (str.size() < length)
+        {
+            return std::string(length - static_cast<int>(str.size()), ch) + str;
+        }
+        else
+        {
+            return str;
+        }
+    }
+
     [[nodiscard]] std::string indent_text(const std::string& text, Indentation indent)
     {
         std::string delimiter = "\n";
@@ -258,8 +270,7 @@ namespace Stockfish::Tools::Stats
 
     struct StatisticGathererBase
     {
-        virtual void on_position(const Position&) {}
-        virtual void on_move(const Position&, const Move&) {}
+        virtual void on_entry(const Position&, const Move&, const PackedSfenValue&) {}
         virtual void reset() = 0;
         [[nodiscard]] virtual const std::string& get_name() const = 0;
         [[nodiscard]] virtual StatisticOutput get_output() const = 0;
@@ -309,19 +320,11 @@ namespace Stockfish::Tools::Stats
             }
         }
 
-        void on_position(const Position& position) override
+        void on_entry(const Position& pos, const Move& move, const PackedSfenValue& psv) override
         {
             for (auto& g : m_gatherers)
             {
-                g->on_position(position);
-            }
-        }
-
-        void on_move(const Position& pos, const Move& move) override
-        {
-            for (auto& g : m_gatherers)
-            {
-                g->on_move(pos, move);
+                g->on_entry(pos, move, psv);
             }
         }
 
@@ -458,7 +461,7 @@ namespace Stockfish::Tools::Stats
         {
         }
 
-        void on_position(const Position&) override
+        void on_entry(const Position&, const Move&, const PackedSfenValue&) override
         {
             m_num_positions += 1;
         }
@@ -495,7 +498,7 @@ namespace Stockfish::Tools::Stats
 
         }
 
-        void on_position(const Position& pos) override
+        void on_entry(const Position& pos, const Move&, const PackedSfenValue&) override
         {
             m_white[pos.square<KING>(WHITE)] += 1;
             m_black[pos.square<KING>(BLACK)] += 1;
@@ -537,7 +540,7 @@ namespace Stockfish::Tools::Stats
 
         }
 
-        void on_move(const Position& pos, const Move& move) override
+        void on_entry(const Position& pos, const Move& move, const PackedSfenValue&) override
         {
             if (pos.side_to_move() == WHITE)
                 m_white[from_sq(move)] += 1;
@@ -581,7 +584,7 @@ namespace Stockfish::Tools::Stats
 
         }
 
-        void on_move(const Position& pos, const Move& move) override
+        void on_entry(const Position& pos, const Move& move, const PackedSfenValue&) override
         {
             if (pos.side_to_move() == WHITE)
                 m_white[to_sq(move)] += 1;
@@ -629,7 +632,7 @@ namespace Stockfish::Tools::Stats
 
         }
 
-        void on_move(const Position& pos, const Move& move) override
+        void on_entry(const Position& pos, const Move& move, const PackedSfenValue&) override
         {
             m_total += 1;
 
@@ -692,7 +695,7 @@ namespace Stockfish::Tools::Stats
             reset();
         }
 
-        void on_position(const Position& pos) override
+        void on_entry(const Position& pos, const Move&, const PackedSfenValue&) override
         {
             m_piece_count_hist[popcount(pos.pieces())] += 1;
         }
@@ -740,7 +743,7 @@ namespace Stockfish::Tools::Stats
             reset();
         }
 
-        void on_move(const Position& pos, const Move& move) override
+        void on_entry(const Position& pos, const Move& move, const PackedSfenValue&) override
         {
             m_moved_piece_type_hist[type_of(pos.piece_on(from_sq(move)))] += 1;
         }
@@ -773,6 +776,170 @@ namespace Stockfish::Tools::Stats
         std::uint64_t m_moved_piece_type_hist[PIECE_TYPE_NB];
     };
 
+    struct PlyDiscontinuitiesCounter : StatisticGathererBase
+    {
+        static inline std::string name = "PlyDiscontinuitiesCounter";
+
+        PlyDiscontinuitiesCounter()
+        {
+            reset();
+        }
+
+        void on_entry(const Position& pos, const Move&, const PackedSfenValue&) override
+        {
+            const int current_ply = pos.game_ply();
+            if (m_prev_ply != -1)
+            {
+                const bool is_discontinuity = (current_ply != (m_prev_ply + 1));
+                if (is_discontinuity)
+                {
+                    m_num_discontinuities += 1;
+                }
+            }
+            m_prev_ply = current_ply;
+        }
+
+        void reset() override
+        {
+            m_num_discontinuities = 0;
+            m_prev_ply = -1;
+        }
+
+        [[nodiscard]] const std::string& get_name() const override
+        {
+            return name;
+        }
+
+        [[nodiscard]] StatisticOutput get_output() const override
+        {
+            StatisticOutput out;
+            out.emplace_node<StatisticOutputEntryValue<std::uint64_t>>("Number of ply discontinuities (usually games)", m_num_discontinuities);
+            return out;
+        }
+
+    private:
+        std::uint64_t m_num_discontinuities;
+        int m_prev_ply;
+    };
+
+    struct MaterialImbalanceDistribution : StatisticGathererBase
+    {
+        static inline std::string name = "MaterialImbalanceDistribution";
+        static constexpr int max_imbalance = 64;
+
+        MaterialImbalanceDistribution()
+        {
+            reset();
+        }
+
+        void on_entry(const Position& pos, const Move&, const PackedSfenValue&) override
+        {
+            const int imbalance = get_simple_material(pos, WHITE) - get_simple_material(pos, BLACK);
+            const int imbalance_idx = std::clamp(imbalance, -max_imbalance, max_imbalance) + max_imbalance;
+            m_num_imbalances[imbalance_idx] += 1;
+        }
+
+        void reset() override
+        {
+            for (auto& imb : m_num_imbalances)
+                imb = 0;
+        }
+
+        [[nodiscard]] const std::string& get_name() const override
+        {
+            return name;
+        }
+
+        [[nodiscard]] StatisticOutput get_output() const override
+        {
+            StatisticOutput out;
+            auto& header = out.emplace_node<StatisticOutputEntryHeader>("Number of \"simple eval\" imbalances for white's perspective:");
+            const int key_length = get_num_base_10_digits(max_imbalance) + 1;
+            for (int i = -max_imbalance; i <= max_imbalance; ++i)
+            {
+                header.emplace_child<StatisticOutputEntryValue<std::uint64_t>>(
+                    left_pad_to_length(std::to_string(i), ' ', key_length),
+                    m_num_imbalances[i + max_imbalance]
+                );
+            }
+            return out;
+        }
+
+    private:
+        std::uint64_t m_num_imbalances[max_imbalance + 1 + max_imbalance];
+
+        [[nodiscard]] int get_simple_material(const Position& pos, Color c)
+        {
+            return
+                  9 * pos.count<QUEEN>(c)
+                + 5 * pos.count<ROOK>(c)
+                + 3 * pos.count<BISHOP>(c)
+                + 3 * pos.count<KNIGHT>(c)
+                +     pos.count<PAWN>(c);
+        }
+    };
+
+    struct ResultDistribution : StatisticGathererBase
+    {
+        static inline std::string name = "ResultDistribution";
+
+        ResultDistribution()
+        {
+            reset();
+        }
+
+        void on_entry(const Position& pos, const Move&, const PackedSfenValue& psv) override
+        {
+            const Color stm = pos.side_to_move();
+            if (psv.game_result == 0)
+            {
+                m_draws += 1;
+            }
+            else if (psv.game_result == 1)
+            {
+                m_stm_wins += 1;
+                m_wins[stm] += 1;
+            }
+            else
+            {
+                m_stm_loses += 1;
+                m_wins[~stm] += 1;
+            }
+        }
+
+        void reset() override
+        {
+            m_wins[WHITE] = 0;
+            m_wins[BLACK] = 0;
+            m_draws = 0;
+            m_stm_wins = 0;
+            m_stm_loses = 0;
+        }
+
+        [[nodiscard]] const std::string& get_name() const override
+        {
+            return name;
+        }
+
+        [[nodiscard]] StatisticOutput get_output() const override
+        {
+            StatisticOutput out;
+            auto& header = out.emplace_node<StatisticOutputEntryHeader>("Distribution of results:");
+            header.emplace_child<StatisticOutputEntryValue<std::uint64_t>>("White wins", m_wins[WHITE]);
+            header.emplace_child<StatisticOutputEntryValue<std::uint64_t>>("Black wins", m_wins[BLACK]);
+            header.emplace_child<StatisticOutputEntryValue<std::uint64_t>>("Draws", m_draws);
+            header.emplace_child<StatisticOutputEntryValue<std::uint64_t>>("Side to move wins", m_stm_wins);
+            header.emplace_child<StatisticOutputEntryValue<std::uint64_t>>("Side to move loses", m_stm_loses);
+            return out;
+        }
+
+    private:
+        std::uint64_t m_wins[COLOR_NB];
+        std::uint64_t m_draws;
+        std::uint64_t m_stm_wins;
+        std::uint64_t m_stm_loses;
+    };
+
     /*
         This function provides factories for all possible statistic gatherers.
         Each new statistic gatherer needs to be added there.
@@ -791,6 +958,12 @@ namespace Stockfish::Tools::Stats
             reg.add<MoveTypeCounter>("move", "move_type");
             reg.add<MovedPieceTypeCounter>("move", "moved_piece_type");
 
+            reg.add<PlyDiscontinuitiesCounter>("ply_discontinuities");
+
+            reg.add<MaterialImbalanceDistribution>("material_imbalance");
+
+            reg.add<ResultDistribution>("results");
+
             reg.add<PieceCountCounter>("piece_count");
 
             return reg;
@@ -810,12 +983,8 @@ namespace Stockfish::Tools::Stats
 
         auto in = Tools::open_sfen_input_file(filename);
 
-        auto on_move = [&](const Position& position, const Move& move) {
-            statistic_gatherers.on_move(position, move);
-        };
-
-        auto on_position = [&](const Position& position) {
-            statistic_gatherers.on_position(position);
+        auto on_entry = [&](const Position& position, const Move& move, const PackedSfenValue& psv) {
+            statistic_gatherers.on_entry(position, move, psv);
         };
 
         if (in == nullptr)
@@ -831,12 +1000,11 @@ namespace Stockfish::Tools::Stats
             if (!v.has_value())
                 break;
 
-            auto& ps = v.value();
+            auto& psv = v.value();
 
-            pos.set_from_packed_sfen(ps.sfen, &si, th);
+            pos.set_from_packed_sfen(psv.sfen, &si, th);
 
-            on_position(pos);
-            on_move(pos, (Move)ps.move);
+            on_entry(pos, (Move)psv.move, psv);
 
             num_processed += 1;
             if (num_processed % 1'000'000 == 0)

From d664ae123fc14169a186ceaecd5a939683533a3a Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 19 May 2021 12:56:44 +0200
Subject: [PATCH 571/583] Update docs

---
 docs/stats.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/docs/stats.md b/docs/stats.md
index 54b4e4e0..67d3c7ab 100644
--- a/docs/stats.md
+++ b/docs/stats.md
@@ -29,3 +29,9 @@ Any name that doesn't designate an argument name or is not an argument will be i
 `move`, `moved_piece_type` - the number of times a piece of each type was moved
 
 `piece_count` - the histogram of the number of pieces on the board
+
+`ply_discontinuities` - the number of times the ply jumped by a value different than 1 between two consecutive positions. Usually the number of games.
+
+`material_imbalance` - the histogram of imbalances, with values computed using "simple eval", that is pawn=1, bishop=knight=3, rook=5, queen=9
+
+`results` - the distribution of game results

From f89f8bd8ee672d012b62d94c1f7bfdb163d9618e Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 19 May 2021 13:48:02 +0200
Subject: [PATCH 572/583] Add endgame configuration stats

---
 src/tools/stats.cpp | 148 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 146 insertions(+), 2 deletions(-)

diff --git a/src/tools/stats.cpp b/src/tools/stats.cpp
index d8274f65..1e3ac062 100644
--- a/src/tools/stats.cpp
+++ b/src/tools/stats.cpp
@@ -59,9 +59,23 @@ namespace Stockfish::Tools::Stats
 
     [[nodiscard]] std::string left_pad_to_length(const std::string& str, char ch, int length)
     {
-        if (str.size() < length)
+        const int str_size = static_cast<int>(str.size());
+        if (str_size < length)
         {
-            return std::string(length - static_cast<int>(str.size()), ch) + str;
+            return std::string(length - str_size, ch) + str;
+        }
+        else
+        {
+            return str;
+        }
+    }
+
+    [[nodiscard]] std::string right_pad_to_length(const std::string& str, char ch, int length)
+    {
+        const int str_size = static_cast<int>(str.size());
+        if (str_size < length)
+        {
+            return str + std::string(length - str_size, ch);
         }
         else
         {
@@ -940,6 +954,134 @@ namespace Stockfish::Tools::Stats
         std::uint64_t m_stm_loses;
     };
 
+    template <int MaxManCount>
+    struct EndgameConfigurations : StatisticGathererBase
+    {
+        static_assert(MaxManCount < 10);
+        static_assert(MaxManCount > 2);
+
+        static inline std::string name = std::string("EndgameConfigurations") + std::to_string(MaxManCount);
+
+        using MaterialKey = std::uint64_t;
+
+        EndgameConfigurations()
+        {
+            reset();
+        }
+
+        void on_entry(const Position& pos, const Move&, const PackedSfenValue&) override
+        {
+            const int piece_count = pos.count<ALL_PIECES>();
+            if (piece_count > MaxManCount)
+            {
+                return;
+            }
+
+            const auto index = get_material_key_for_position(pos);
+            m_counts[index] += 1;
+        }
+
+        void reset() override
+        {
+            m_counts.clear();
+        }
+
+        [[nodiscard]] const std::string& get_name() const override
+        {
+            return name;
+        }
+
+        [[nodiscard]] StatisticOutput get_output() const override
+        {
+            StatisticOutput out;
+            auto& header = out.emplace_node<StatisticOutputEntryHeader>("Distribution of endgame configurations:");
+            std::vector<std::pair<MaterialKey, std::uint64_t>> flattened(m_counts.begin(), m_counts.end());
+            std::sort(flattened.begin(), flattened.end(), [](const auto& lhs, const auto& rhs) { return lhs.second > rhs.second; });
+            for (auto&& [index, count] : flattened)
+            {
+                header.emplace_child<StatisticOutputEntryValue<std::uint64_t>>(
+                    get_padded_name_by_material_key(index),
+                    count
+                );
+            }
+            return out;
+        }
+
+    private:
+        // can support up to 17 pieces.
+        // it's basically the material string encoded as a number in base 8
+        // encoding is from the least significant digit to most significant
+        // v=1, P=2, N=3, B=4, R=5, Q=6, K=7. 0 indicates end
+        std::map<MaterialKey, std::uint64_t> m_counts;
+
+        [[nodiscard]] MaterialKey get_material_key_for_position(const Position& pos) const
+        {
+            MaterialKey index = 0;
+            std::uint64_t shift = 0;
+
+            index += 7 << shift; shift += 3;
+
+            for (int i = 0; i < pos.count<PAWN>(WHITE); ++i) { index += 2 << shift; shift += 3; }
+            for (int i = 0; i < pos.count<BISHOP>(WHITE); ++i) { index += 3 << shift; shift += 3; }
+            for (int i = 0; i < pos.count<KNIGHT>(WHITE); ++i) { index += 4 << shift; shift += 3; }
+            for (int i = 0; i < pos.count<ROOK>(WHITE); ++i) { index += 5 << shift; shift += 3; }
+            for (int i = 0; i < pos.count<QUEEN>(WHITE); ++i) { index += 6 << shift; shift += 3; }
+
+            index += 1 << shift; shift += 3;
+            index += 7 << shift; shift += 3;
+
+            for (int i = 0; i < pos.count<PAWN>(BLACK); ++i) { index += 2 << shift; shift += 3; }
+            for (int i = 0; i < pos.count<BISHOP>(BLACK); ++i) { index += 3 << shift; shift += 3; }
+            for (int i = 0; i < pos.count<KNIGHT>(BLACK); ++i) { index += 4 << shift; shift += 3; }
+            for (int i = 0; i < pos.count<ROOK>(BLACK); ++i) { index += 5 << shift; shift += 3; }
+            for (int i = 0; i < pos.count<QUEEN>(BLACK); ++i) { index += 6 << shift; shift += 3; }
+
+            return index;
+        }
+
+        [[nodiscard]] std::string get_padded_name_by_material_key(MaterialKey index) const
+        {
+            std::string sides[COLOR_NB];
+            Color side = WHITE;
+
+            while (index != 0)
+            {
+                switch (index % 8)
+                {
+                    case 1:
+                        side = BLACK;
+                        break;
+                    case 2:
+                        sides[side] += 'P';
+                        break;
+                    case 3:
+                        sides[side] += 'N';
+                        break;
+                    case 4:
+                        sides[side] += 'B';
+                        break;
+                    case 5:
+                        sides[side] += 'R';
+                        break;
+                    case 6:
+                        sides[side] += 'Q';
+                        break;
+                    case 7:
+                        sides[side] += 'K';
+                        break;
+                    default:
+                        break;
+                }
+                index >>= 3;
+            }
+
+            return
+                  right_pad_to_length(sides[WHITE], ' ', MaxManCount-1)
+                + 'v'
+                + right_pad_to_length(sides[BLACK], ' ', MaxManCount-1);
+        }
+    };
+
     /*
         This function provides factories for all possible statistic gatherers.
         Each new statistic gatherer needs to be added there.
@@ -966,6 +1108,8 @@ namespace Stockfish::Tools::Stats
 
             reg.add<PieceCountCounter>("piece_count");
 
+            reg.add<EndgameConfigurations<6>>("endgames_6man");
+
             return reg;
         }();
 

From 0a464a7c216341cd3bb29d105335c27d5484b06f Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 19 May 2021 13:51:40 +0200
Subject: [PATCH 573/583] Improve material imbalance output

---
 src/tools/stats.cpp | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/tools/stats.cpp b/src/tools/stats.cpp
index 1e3ac062..7e70f5a3 100644
--- a/src/tools/stats.cpp
+++ b/src/tools/stats.cpp
@@ -869,7 +869,18 @@ namespace Stockfish::Tools::Stats
             StatisticOutput out;
             auto& header = out.emplace_node<StatisticOutputEntryHeader>("Number of \"simple eval\" imbalances for white's perspective:");
             const int key_length = get_num_base_10_digits(max_imbalance) + 1;
+            int min_non_zero = max_imbalance;
+            int max_non_zero = -max_imbalance;
             for (int i = -max_imbalance; i <= max_imbalance; ++i)
+            {
+                if (m_num_imbalances[i + max_imbalance] != 0)
+                {
+                    min_non_zero = std::min(min_non_zero, i);
+                    max_non_zero = std::max(max_non_zero, i);
+                }
+            }
+
+            for (int i = min_non_zero; i <= max_non_zero; ++i)
             {
                 header.emplace_child<StatisticOutputEntryValue<std::uint64_t>>(
                     left_pad_to_length(std::to_string(i), ' ', key_length),

From dc00b6c188ad0c9f06c130874a14b958ee4d9136 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Wed, 19 May 2021 13:52:18 +0200
Subject: [PATCH 574/583] Update docs

---
 docs/stats.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/stats.md b/docs/stats.md
index 67d3c7ab..286788f8 100644
--- a/docs/stats.md
+++ b/docs/stats.md
@@ -35,3 +35,5 @@ Any name that doesn't designate an argument name or is not an argument will be i
 `material_imbalance` - the histogram of imbalances, with values computed using "simple eval", that is pawn=1, bishop=knight=3, rook=5, queen=9
 
 `results` - the distribution of game results
+
+`endgames_6man` - distribution of endgame configurations for <=6 pieces (including kings)

From 0f241355daf4e87917ff246e8af9bea241941ab9 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 20 May 2021 13:22:11 +0200
Subject: [PATCH 575/583] Add output_file option to gather_statistics.

It is optional. When specified it will also forward the final results output to the provided file.
---
 docs/stats.md       |  2 ++
 src/tools/stats.cpp | 20 +++++++++++++++++---
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/docs/stats.md b/docs/stats.md
index 286788f8..8fe50ee5 100644
--- a/docs/stats.md
+++ b/docs/stats.md
@@ -10,6 +10,8 @@ Any name that doesn't designate an argument name or is not an argument will be i
 
 `input_file` - the path to the .bin or .binpack input file to read
 
+`output_file` - optional path to the output file to write the results too. Results are always written on the console, so if this is specified the results will be written in both places.
+
 `max_count` - the maximum number of positions to process. Default: no limit.
 
 ## Groups
diff --git a/src/tools/stats.cpp b/src/tools/stats.cpp
index 7e70f5a3..538b2a9e 100644
--- a/src/tools/stats.cpp
+++ b/src/tools/stats.cpp
@@ -1130,7 +1130,8 @@ namespace Stockfish::Tools::Stats
     void do_gather_statistics(
         const std::string& filename,
         StatisticGathererSet& statistic_gatherers,
-        std::uint64_t max_count)
+        std::uint64_t max_count,
+        const std::optional<std::string>& output_filename)
     {
         Thread* th = Threads.main();
         Position& pos = th->rootPos;
@@ -1171,7 +1172,13 @@ namespace Stockfish::Tools::Stats
         std::cout << "Finished gathering statistics.\n\n";
         std::cout << "Results:\n\n";
 
-        std::cout << statistic_gatherers.get_output().to_string();
+        const auto output_str = statistic_gatherers.get_output().to_string();
+        std::cout << output_str;
+        if (output_filename.has_value())
+        {
+            std::ofstream out_file(*output_filename);
+            out_file << output_str;
+        }
     }
 
     void gather_statistics(std::istringstream& is)
@@ -1183,6 +1190,7 @@ namespace Stockfish::Tools::Stats
         StatisticGathererSet statistic_gatherers;
 
         std::string input_file;
+        std::optional<std::string> output_file;
         std::uint64_t max_count = std::numeric_limits<std::uint64_t>::max();
 
         while(true)
@@ -1195,13 +1203,19 @@ namespace Stockfish::Tools::Stats
 
             if (token == "input_file")
                 is >> input_file;
+            else if (token == "output_file")
+            {
+                std::string s;
+                is >> s;
+                output_file = s;
+            }
             else if (token == "max_count")
                 is >> max_count;
             else
                 registry.add_statistic_gatherers_by_group(statistic_gatherers, token);
         }
 
-        do_gather_statistics(input_file, statistic_gatherers, max_count);
+        do_gather_statistics(input_file, statistic_gatherers, max_count, output_file);
     }
 
 }

From c124d55fa63a9544bd88245ece0e4652e878dbdb Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Thu, 20 May 2021 12:59:55 +0200
Subject: [PATCH 576/583] Add more output to endgame stats.

---
 src/tools/stats.cpp | 78 ++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 67 insertions(+), 11 deletions(-)

diff --git a/src/tools/stats.cpp b/src/tools/stats.cpp
index 7e70f5a3..420d450b 100644
--- a/src/tools/stats.cpp
+++ b/src/tools/stats.cpp
@@ -980,7 +980,7 @@ namespace Stockfish::Tools::Stats
             reset();
         }
 
-        void on_entry(const Position& pos, const Move&, const PackedSfenValue&) override
+        void on_entry(const Position& pos, const Move&, const PackedSfenValue& psv) override
         {
             const int piece_count = pos.count<ALL_PIECES>();
             if (piece_count > MaxManCount)
@@ -989,12 +989,29 @@ namespace Stockfish::Tools::Stats
             }
 
             const auto index = get_material_key_for_position(pos);
-            m_counts[index] += 1;
+            auto& entry = m_entries[index];
+            entry.count += 1;
+            if (psv.game_result == 0)
+            {
+                entry.draws += 1;
+            }
+            else
+            {
+                const Color winner_side = psv.game_result == 1 ? pos.side_to_move() : ~pos.side_to_move();
+                if (winner_side == WHITE)
+                {
+                    entry.white_wins += 1;
+                }
+                else
+                {
+                    entry.black_wins += 1;
+                }
+            }
         }
 
         void reset() override
         {
-            m_counts.clear();
+            m_entries.clear();
         }
 
         [[nodiscard]] const std::string& get_name() const override
@@ -1005,25 +1022,49 @@ namespace Stockfish::Tools::Stats
         [[nodiscard]] StatisticOutput get_output() const override
         {
             StatisticOutput out;
-            auto& header = out.emplace_node<StatisticOutputEntryHeader>("Distribution of endgame configurations:");
-            std::vector<std::pair<MaterialKey, std::uint64_t>> flattened(m_counts.begin(), m_counts.end());
-            std::sort(flattened.begin(), flattened.end(), [](const auto& lhs, const auto& rhs) { return lhs.second > rhs.second; });
-            for (auto&& [index, count] : flattened)
+            auto& header = out.emplace_node<StatisticOutputEntryHeader>("Distribution of endgame configurations (count W D L Perf%):");
+            std::vector<std::pair<MaterialKey, Entry>> flattened(m_entries.begin(), m_entries.end());
+            std::sort(flattened.begin(), flattened.end(), [](const auto& lhs, const auto& rhs) { return lhs.second.count > rhs.second.count; });
+            for (auto&& [index, entry] : flattened)
             {
-                header.emplace_child<StatisticOutputEntryValue<std::uint64_t>>(
+                header.emplace_child<StatisticOutputEntryValue<std::string>>(
                     get_padded_name_by_material_key(index),
-                    count
+                    entry.to_string()
                 );
             }
             return out;
         }
 
     private:
+        struct Entry
+        {
+            std::uint64_t count = 0;
+            std::uint64_t white_wins = 0;
+            std::uint64_t black_wins = 0;
+            std::uint64_t draws = 0;
+
+            [[nodiscard]] std::string to_string() const
+            {
+                constexpr int wide_column_width = 9;
+                constexpr int narrow_column_width = 4;
+
+                const float perf =
+                      (white_wins + draws / 2.0f)
+                    / (white_wins + black_wins + draws);
+
+                return
+                      left_pad_to_length(std::to_string(count), ' ', wide_column_width) + ' '
+                    + left_pad_to_length(std::to_string(white_wins), ' ', wide_column_width) + ' '
+                    + left_pad_to_length(std::to_string(draws), ' ', wide_column_width) + ' '
+                    + left_pad_to_length(std::to_string(black_wins), ' ', wide_column_width) + ' '
+                    + left_pad_to_length(std::to_string(static_cast<int>(perf * 100.0f + 0.5f)), ' ', narrow_column_width) + '%';
+            }
+        };
         // can support up to 17 pieces.
         // it's basically the material string encoded as a number in base 8
         // encoding is from the least significant digit to most significant
         // v=1, P=2, N=3, B=4, R=5, Q=6, K=7. 0 indicates end
-        std::map<MaterialKey, std::uint64_t> m_counts;
+        std::map<MaterialKey, Entry> m_entries;
 
         [[nodiscard]] MaterialKey get_material_key_for_position(const Position& pos) const
         {
@@ -1053,6 +1094,7 @@ namespace Stockfish::Tools::Stats
         [[nodiscard]] std::string get_padded_name_by_material_key(MaterialKey index) const
         {
             std::string sides[COLOR_NB];
+            int material[COLOR_NB] = { 0, 0 };
             Color side = WHITE;
 
             while (index != 0)
@@ -1064,18 +1106,23 @@ namespace Stockfish::Tools::Stats
                         break;
                     case 2:
                         sides[side] += 'P';
+                        material[side] += 1;
                         break;
                     case 3:
                         sides[side] += 'N';
+                        material[side] += 3;
                         break;
                     case 4:
                         sides[side] += 'B';
+                        material[side] += 3;
                         break;
                     case 5:
                         sides[side] += 'R';
+                        material[side] += 5;
                         break;
                     case 6:
                         sides[side] += 'Q';
+                        material[side] += 9;
                         break;
                     case 7:
                         sides[side] += 'K';
@@ -1086,10 +1133,19 @@ namespace Stockfish::Tools::Stats
                 index >>= 3;
             }
 
+            const int imbalance = material[WHITE] - material[BLACK];
+            const std::string imbalance_str =
+                  std::string(imbalance > 0 ? "+" : "") // force + sign for positive values
+                + std::string(imbalance == 0 ? " " : "") // pad 0
+                + std::to_string(imbalance);
+
             return
                   right_pad_to_length(sides[WHITE], ' ', MaxManCount-1)
                 + 'v'
-                + right_pad_to_length(sides[BLACK], ' ', MaxManCount-1);
+                + right_pad_to_length(sides[BLACK], ' ', MaxManCount-1)
+                + " ("
+                + right_pad_to_length(imbalance_str, ' ', 3)
+                + ')';
         }
     };
 

From abb7fa00abe931561efd6397ccda900a2420a599 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Fri, 21 May 2021 11:09:50 +0200
Subject: [PATCH 577/583] Remove ensure_quiet parameter from
 generate_training_data.

---
 docs/generate_training_data.md        |  2 -
 src/tools/training_data_generator.cpp | 97 +++------------------------
 2 files changed, 8 insertions(+), 91 deletions(-)

diff --git a/docs/generate_training_data.md b/docs/generate_training_data.md
index a02d6e0c..734f7e81 100644
--- a/docs/generate_training_data.md
+++ b/docs/generate_training_data.md
@@ -60,6 +60,4 @@ Currently the following options are available:
 
 `data_format` - format of the training data to use. Either `bin` or `binpack`. Default: `binpack`.
 
-`ensure_quiet` - this is a flag option. When specified the positions will be from the qsearch leaf.
-
 `seed` - seed for the PRNG. Can be either a number or a string. If it's a string then its hash will be used. If not specified then the current time will be used.
diff --git a/src/tools/training_data_generator.cpp b/src/tools/training_data_generator.cpp
index a8804fef..24498917 100644
--- a/src/tools/training_data_generator.cpp
+++ b/src/tools/training_data_generator.cpp
@@ -93,8 +93,6 @@ namespace Stockfish::Tools
             bool detect_draw_by_consecutive_low_score = true;
             bool detect_draw_by_insufficient_mating_material = true;
 
-            bool ensure_quiet = false;
-
             uint64_t num_threads;
 
             std::string book;
@@ -349,86 +347,17 @@ namespace Stockfish::Tools
                 // Discard stuff before write_minply is reached
                 // because it can harm training due to overfitting.
                 // Initial positions would be too common.
-                if (ply >= params.write_minply)
+                if (ply >= params.write_minply && !was_seen_before(pos))
                 {
-                    packed_sfens.emplace_back(PackedSfenValue());
+                    auto& psv = packed_sfens.emplace_back();
 
-                    auto& psv = packed_sfens.back();
+                    // Here we only write the position data.
+                    // Result is added after the whole game is done.
+                    pos.sfen_pack(psv.sfen);
 
-                    if (params.ensure_quiet)
-                    {
-                        auto [qsearch_value, qsearch_pv] = Search::qsearch(pos);
-                        if (qsearch_pv.empty())
-                        {
-                            // Here we only write the position data.
-                            // Result is added after the whole game is done.
-                            pos.sfen_pack(psv.sfen);
-
-                            // Already a quiet position
-                            psv.score = search_value;
-                            psv.move = search_pv[0];
-                            psv.gamePly = ply;
-                        }
-                        else
-                        {
-                            // Navigate to a quiet
-                            int old_ply = ply;
-                            for (auto m : qsearch_pv)
-                            {
-                                pos.do_move(m, states[ply++]);
-                            }
-
-                            if (was_seen_before(pos))
-                            {
-                                // Just skip the move.
-                                packed_sfens.pop_back();
-                            }
-                            else
-                            {
-                                // Reevaluate
-                                auto [quiet_search_value, quiet_search_pv] = Search::search(pos, depth, 1, params.nodes);
-                                if (quiet_search_pv.empty())
-                                {
-                                    // Just skip the move.
-                                    packed_sfens.pop_back();
-                                }
-                                else
-                                {
-                                    // Here we only write the position data.
-                                    // Result is added after the whole game is done.
-                                    pos.sfen_pack(psv.sfen);
-
-                                    psv.score = quiet_search_value;
-                                    psv.move = quiet_search_pv[0];
-                                    psv.gamePly = ply;
-                                }
-                            }
-
-                            // Get back to the game
-                            for (auto it = qsearch_pv.rbegin(); it != qsearch_pv.rend(); ++it)
-                            {
-                                pos.undo_move(*it);
-                            }
-                            ply = old_ply;
-                        }
-                    }
-                    else
-                    {
-                        if (was_seen_before(pos))
-                        {
-                            packed_sfens.pop_back();
-                        }
-                        else
-                        {
-                            // Here we only write the position data.
-                            // Result is added after the whole game is done.
-                            pos.sfen_pack(psv.sfen);
-
-                            psv.score = search_value;
-                            psv.move = search_pv[0];
-                            psv.gamePly = ply;
-                        }
-                    }
+                    psv.score = search_value;
+                    psv.move = search_pv[0];
+                    psv.gamePly = ply;
                 }
 
                 // Update the next move according to best search result or random move.
@@ -891,10 +820,6 @@ namespace Stockfish::Tools
                 UCI::setoption("PruneAtShallowDepth", "false");
                 UCI::setoption("EnableTranspositionTable", "true");
             }
-            else if (token == "ensure_quiet")
-            {
-                params.ensure_quiet = true;
-            }
             else
             {
                 cout << "ERROR: Unknown option " << token << ". Exiting...\n";
@@ -912,12 +837,6 @@ namespace Stockfish::Tools
                 cout << "WARNING: Unknown sfen format `" << sfen_format << "`. Using bin\n";
         }
 
-        if (params.ensure_quiet)
-        {
-            // Otherwise we can't ensure quiet positions...
-            UCI::setoption("EnableTranspositionTable", "false");
-        }
-
         if (random_file_name)
         {
             // Give a random number to output_file_name at this point.

From a4605860c69513b7f0d84dfbda7eda8ecc8b121b Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 24 May 2021 11:45:21 +0200
Subject: [PATCH 578/583] Post-merge fixes.

---
 src/evaluate.cpp                            | 20 ++++++++++----------
 src/search.cpp                              |  2 +-
 src/thread.h                                | 16 +++++++++++++++-
 src/tools/training_data_generator.cpp       |  4 ++--
 src/tools/training_data_generator_nonpv.cpp |  2 +-
 src/tools/transform.cpp                     |  2 +-
 src/uci.cpp                                 |  2 +-
 7 files changed, 31 insertions(+), 17 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index b58ff624..ccb7436b 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -63,18 +63,18 @@ namespace Eval {
   namespace NNUE {
     string eval_file_loaded = "None";
     UseNNUEMode useNNUE;
-  }
 
-  static UseNNUEMode NNUE::nnue_mode_from_option(const UCI::Option& mode)
-  {
-    if (mode == "false")
+    static UseNNUEMode nnue_mode_from_option(const UCI::Option& mode)
+    {
+      if (mode == "false")
+        return UseNNUEMode::False;
+      else if (mode == "true")
+         return UseNNUEMode::True;
+      else if (mode == "pure")
+        return UseNNUEMode::Pure;
+
       return UseNNUEMode::False;
-    else if (mode == "true")
-       return UseNNUEMode::True;
-    else if (mode == "pure")
-      return UseNNUEMode::Pure;
-
-    return UseNNUEMode::False;
+    }
   }
 
   /// NNUE::init() tries to load a NNUE network at startup time, or when the engine
diff --git a/src/search.cpp b/src/search.cpp
index 73c7f856..f0289b45 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -374,7 +374,7 @@ void Thread::search() {
           // Start with a small aspiration window and, in the case of a fail
           // high/low, re-search with a bigger window until we don't fail
           // high/low anymore.
-          int failedHighCnt = 0;
+          failedHighCnt = 0;
           while (true)
           {
               Depth adjustedDepth = std::max(1, rootDepth - failedHighCnt - searchAgainCounter);
diff --git a/src/thread.h b/src/thread.h
index fec68e05..0989f4ba 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -76,6 +76,15 @@ public:
   void wait_for_search_finished();
   size_t id() const { return idx; }
 
+  void wait_for_worker_finished();
+
+  template <typename FuncT>
+  void set_eval_callback(FuncT&& f) { on_eval_callback = std::forward<FuncT>(f); }
+
+  void clear_eval_callback() { on_eval_callback = nullptr; }
+
+  void on_eval() { if (on_eval_callback) on_eval_callback(rootPos); }
+
   Pawns::Table pawnsTable;
   Material::Table materialTable;
   size_t pvIdx, pvLast;
@@ -94,6 +103,11 @@ public:
   CapturePieceToHistory captureHistory;
   ContinuationHistory continuationHistory[2][2];
   Score contempt;
+  int failedHighCnt;
+  bool rootInTB;
+  int Cardinality;
+  bool UseRule50;
+  Depth ProbeDepth;
 };
 
 
@@ -166,7 +180,7 @@ struct ThreadPool : public std::vector<Thread*> {
 
     execute_with_workers(
       [chunk_size, end, func](Thread& th) mutable {
-        const IndexT thread_id = th.thread_idx();
+        const IndexT thread_id = th.id();
         const IndexT offset = chunk_size * thread_id;
         if (offset >= end)
           return;
diff --git a/src/tools/training_data_generator.cpp b/src/tools/training_data_generator.cpp
index 24498917..6495d566 100644
--- a/src/tools/training_data_generator.cpp
+++ b/src/tools/training_data_generator.cpp
@@ -257,7 +257,7 @@ namespace Stockfish::Tools
 
         StateInfo si;
 
-        auto& prng = prngs[th.thread_idx()];
+        auto& prng = prngs[th.id()];
 
         // end flag
         bool quit = false;
@@ -693,7 +693,7 @@ namespace Stockfish::Tools
             maybe_report(iter + 1);
 
             // Write out one sfen.
-            sfen_writer.write(th.thread_idx(), sfen);
+            sfen_writer.write(th.id(), sfen);
         }
 
         return false;
diff --git a/src/tools/training_data_generator_nonpv.cpp b/src/tools/training_data_generator_nonpv.cpp
index e8df9c50..278259c6 100644
--- a/src/tools/training_data_generator_nonpv.cpp
+++ b/src/tools/training_data_generator_nonpv.cpp
@@ -341,7 +341,7 @@ namespace Stockfish::Tools
             maybe_report(iter + 1);
 
             // Write out one sfen.
-            sfen_writer.write(th.thread_idx(), sfen);
+            sfen_writer.write(th.id(), sfen);
         }
 
         return false;
diff --git a/src/tools/transform.cpp b/src/tools/transform.cpp
index ab7a3db8..f657b410 100644
--- a/src/tools/transform.cpp
+++ b/src/tools/transform.cpp
@@ -426,7 +426,7 @@ namespace Stockfish::Tools
                         ps.move = search_pv[0];
                     ps.padding = 0;
 
-                    out.write(th.thread_idx(), ps);
+                    out.write(th.id(), ps);
 
                     auto p = num_processed.fetch_add(1) + 1;
                     if (p % 10000 == 0)
diff --git a/src/uci.cpp b/src/uci.cpp
index b1b39bc4..2fa7a186 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -342,7 +342,7 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "tasktest")
       {
         Threads.execute_with_workers([](auto& th) {
-          std::cout << th.thread_idx() << '\n';
+          std::cout << th.id() << '\n';
         });
       }
       else if (!token.empty() && token[0] != '#')

From ca365f17baec20ec26865efec169e28e28c966d8 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 24 May 2021 19:17:19 +0200
Subject: [PATCH 579/583] Fix discrepancy for ep square between set and move in
 the binpack lib.

basically, the binpack lib doesn't reset the epsquare after f7f5 in this 5kb1/5p2/2B3p1/1N1KP2p/3p1P2/2bP2P1/5r2/8 b - - 0 1 position, but it does reset it when passed the fen 5kb1/8/2B3p1/1N1KPp1p/3p1P2/2bP2P1/5r2/8 w - f6 0 50. Potentially creating a discrepancy based on whether the position was set directly or arrived at by a move
---
 src/extra/nnue_data_binpack_format.h | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index dce53b83..b2597d76 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -6078,18 +6078,14 @@ namespace chess
         // for double pushes move index differs by 16 or -16;
         if((movedPiece == PieceType::Pawn) & ((ordinal(move.to) ^ ordinal(move.from)) == 16))
         {
-            const Square potentialEpSquare = fromOrdinal<Square>((ordinal(move.to) + ordinal(move.from)) >> 1);
-            // Even though the move has not yet been made we can safely call
-            // this function and get the right result because the position of the
-            // pawn to be captured is not really relevant.
-            if (isEpPossible(potentialEpSquare, !m_sideToMove))
-            {
-                m_epSquare = potentialEpSquare;
-            }
+            m_epSquare = fromOrdinal<Square>((ordinal(move.to) + ordinal(move.from)) >> 1);
         }
 
         const Piece captured = BaseType::doMove(move);
         m_sideToMove = !m_sideToMove;
+
+        nullifyEpSquareIfNotPossible();
+
         return { move, captured, oldEpSquare, oldCastlingRights };
     }
 

From eac1d430b40015734a7ad92cf8520af4b92db76b Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Mon, 24 May 2021 19:43:07 +0200
Subject: [PATCH 580/583] Add dedicated command for training data validation.

---
 docs/validate_training_data.md       |  12 +++
 src/Makefile                         |   1 +
 src/extra/nnue_data_binpack_format.h | 150 +++++++++++++++++++++++++++
 src/tools/validate_training_data.cpp | 122 ++++++++++++++++++++++
 src/tools/validate_training_data.h   |  12 +++
 src/uci.cpp                          |   2 +
 6 files changed, 299 insertions(+)
 create mode 100644 docs/validate_training_data.md
 create mode 100644 src/tools/validate_training_data.cpp
 create mode 100644 src/tools/validate_training_data.h

diff --git a/docs/validate_training_data.md b/docs/validate_training_data.md
new file mode 100644
index 00000000..e2bfc30c
--- /dev/null
+++ b/docs/validate_training_data.md
@@ -0,0 +1,12 @@
+# validate_training_data
+
+`validate_training_data` allows validation of training data of types `.plain`, `.bin`, and `.binpack`.
+
+As all commands in stockfish `validate_training_data` can be invoked either from command line (as `stockfish.exe validate_training_data ...`) or in the interactive prompt.
+
+The syntax of this command is as follows:
+```
+validate_training_data in_path
+```
+
+`in_path` is the path to the file to validate. The type of the data is deduced based on its extension (one of `.plain`, `.bin`, `.binpack`).
\ No newline at end of file
diff --git a/src/Makefile b/src/Makefile
index 4661e494..d3cea8de 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -51,6 +51,7 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
 	search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
 	nnue/evaluate_nnue.cpp \
 	nnue/features/half_ka_v2.cpp \
+	tools/validate_training_data.cpp \
 	tools/sfen_packer.cpp \
 	tools/training_data_generator.cpp \
 	tools/training_data_generator_nonpv.cpp \
diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index dce53b83..a6366d81 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -7831,4 +7831,154 @@ namespace binpack
 
         std::cout << "Finished. Converted " << numProcessedPositions << " positions.\n";
     }
+
+    inline void validatePlain(std::string inputPath)
+    {
+        constexpr std::size_t reportSize = 1000000;
+
+        std::cout << "Validating " << inputPath << '\n';
+
+        TrainingDataEntry e;
+
+        std::string key;
+        std::string value;
+        std::string move;
+
+        std::ifstream inputFile(inputPath);
+        const auto base = inputFile.tellg();
+        std::size_t numProcessedPositions = 0;
+        std::size_t numProcessedPositionsBatch = 0;
+
+        for(;;)
+        {
+            inputFile >> key;
+            if (!inputFile)
+            {
+                break;
+            }
+
+            if (key == "e"sv)
+            {
+                e.move = chess::uci::uciToMove(e.pos, move);
+                if (!e.isValid())
+                {
+                    std::cerr << "Illegal move " << chess::uci::moveToUci(e.pos, e.move) << " for position " << e.pos.fen() << '\n';
+                    return;
+                }
+
+                ++numProcessedPositions;
+                ++numProcessedPositionsBatch;
+
+                if (numProcessedPositionsBatch >= reportSize)
+                {
+                    numProcessedPositionsBatch -= reportSize;
+                    const auto cur = inputFile.tellg();
+                    std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+                }
+
+                continue;
+            }
+
+            inputFile >> std::ws;
+            std::getline(inputFile, value, '\n');
+
+            if (key == "fen"sv) e.pos = chess::Position::fromFen(value.c_str());
+            if (key == "move"sv) move = value;
+            if (key == "score"sv) e.score = std::stoi(value);
+            if (key == "ply"sv) e.ply = std::stoi(value);
+            if (key == "result"sv) e.result = std::stoi(value);
+        }
+
+        if (numProcessedPositionsBatch)
+        {
+            const auto cur = inputFile.tellg();
+            std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+        }
+
+        std::cout << "Finished. Validated " << numProcessedPositions << " positions.\n";
+    }
+
+    inline void validateBin(std::string inputPath)
+    {
+        constexpr std::size_t reportSize = 1000000;
+
+        std::cout << "Validating " << inputPath << '\n';
+
+        std::ifstream inputFile(inputPath, std::ios_base::binary);
+        const auto base = inputFile.tellg();
+        std::size_t numProcessedPositions = 0;
+        std::size_t numProcessedPositionsBatch = 0;
+
+        nodchip::PackedSfenValue psv;
+        for(;;)
+        {
+            inputFile.read(reinterpret_cast<char*>(&psv), sizeof(psv));
+            if (inputFile.gcount() != 40)
+            {
+                break;
+            }
+
+            auto e = packedSfenValueToTrainingDataEntry(psv);
+            if (!e.isValid())
+            {
+                std::cerr << "Illegal move " << chess::uci::moveToUci(e.pos, e.move) << " for position " << e.pos.fen() << '\n';
+                return;
+            }
+
+            ++numProcessedPositions;
+            ++numProcessedPositionsBatch;
+
+            if (numProcessedPositionsBatch >= reportSize)
+            {
+                numProcessedPositionsBatch -= reportSize;
+                const auto cur = inputFile.tellg();
+                std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+            }
+        }
+
+        if (numProcessedPositionsBatch)
+        {
+            const auto cur = inputFile.tellg();
+            std::cout << "Processed " << (cur - base) << " bytes and " << numProcessedPositions << " positions.\n";
+        }
+
+        std::cout << "Finished. Validated " << numProcessedPositions << " positions.\n";
+    }
+
+    inline void validateBinpack(std::string inputPath)
+    {
+        constexpr std::size_t reportSize = 1000000;
+
+        std::cout << "Validating " << inputPath << '\n';
+
+        CompressedTrainingDataEntryReader reader(inputPath);
+        std::size_t numProcessedPositions = 0;
+        std::size_t numProcessedPositionsBatch = 0;
+
+        while(reader.hasNext())
+        {
+            auto e = reader.next();
+            if (!e.isValid())
+            {
+                std::cerr << "Illegal move " << chess::uci::moveToUci(e.pos, e.move) << " for position " << e.pos.fen() << '\n';
+                return;
+            }
+
+            ++numProcessedPositions;
+            ++numProcessedPositionsBatch;
+
+            if (numProcessedPositionsBatch >= reportSize)
+            {
+                numProcessedPositionsBatch -= reportSize;
+                std::cout << "Processed " << numProcessedPositions << " positions.\n";
+            }
+        }
+
+        if (numProcessedPositionsBatch)
+        {
+            std::cout << "Processed " << numProcessedPositions << " positions.\n";
+        }
+
+        std::cout << "Finished. Validated " << numProcessedPositions << " positions.\n";
+    }
 }
diff --git a/src/tools/validate_training_data.cpp b/src/tools/validate_training_data.cpp
new file mode 100644
index 00000000..18cae456
--- /dev/null
+++ b/src/tools/validate_training_data.cpp
@@ -0,0 +1,122 @@
+#include "validate_training_data.h"
+
+#include "uci.h"
+#include "misc.h"
+#include "thread.h"
+#include "position.h"
+#include "tt.h"
+
+#include "extra/nnue_data_binpack_format.h"
+
+#include "nnue/evaluate_nnue.h"
+
+#include "syzygy/tbprobe.h"
+
+#include <sstream>
+#include <fstream>
+#include <unordered_set>
+#include <iomanip>
+#include <list>
+#include <cmath>    // std::exp(),std::pow(),std::log()
+#include <cstring>  // memcpy()
+#include <memory>
+#include <limits>
+#include <optional>
+#include <chrono>
+#include <random>
+#include <regex>
+#include <filesystem>
+
+using namespace std;
+namespace sys = std::filesystem;
+
+namespace Stockfish::Tools
+{
+    static inline const std::string plain_extension = ".plain";
+    static inline const std::string bin_extension = ".bin";
+    static inline const std::string binpack_extension = ".binpack";
+
+    static bool file_exists(const std::string& name)
+    {
+        std::ifstream f(name);
+        return f.good();
+    }
+
+    static bool ends_with(const std::string& lhs, const std::string& end)
+    {
+        if (end.size() > lhs.size()) return false;
+
+        return std::equal(end.rbegin(), end.rend(), lhs.rbegin());
+    }
+
+    static bool is_validation_of_type(
+        const std::string& input_path,
+        const std::string& expected_input_extension)
+    {
+        return ends_with(input_path, expected_input_extension);
+    }
+
+    using ValidateFunctionType = void(std::string inputPath);
+
+    static ValidateFunctionType* get_validate_function(const std::string& input_path)
+    {
+        if (is_validation_of_type(input_path, plain_extension))
+            return binpack::validatePlain;
+
+        if (is_validation_of_type(input_path, bin_extension))
+            return binpack::validateBin;
+
+        if (is_validation_of_type(input_path, binpack_extension))
+            return binpack::validateBinpack;
+
+        return nullptr;
+    }
+
+    static void validate_training_data(const std::string& input_path)
+    {
+        if(!file_exists(input_path))
+        {
+            std::cerr << "Input file does not exist.\n";
+            return;
+        }
+
+        auto func = get_validate_function(input_path);
+        if (func != nullptr)
+        {
+            func(input_path);
+        }
+        else
+        {
+            std::cerr << "Validation of files of this type is not supported.\n";
+        }
+    }
+
+    static void validate_training_data(const std::vector<std::string>& args)
+    {
+        if (args.size() != 1)
+        {
+            std::cerr << "Invalid arguments.\n";
+            std::cerr << "Usage: validate in_path\n";
+            return;
+        }
+
+        validate_training_data(args[0]);
+    }
+
+    void validate_training_data(istringstream& is)
+    {
+        std::vector<std::string> args;
+
+        while (true)
+        {
+            std::string token = "";
+            is >> token;
+            if (token == "")
+                break;
+
+            args.push_back(token);
+        }
+
+        validate_training_data(args);
+    }
+}
diff --git a/src/tools/validate_training_data.h b/src/tools/validate_training_data.h
new file mode 100644
index 00000000..0c62ab50
--- /dev/null
+++ b/src/tools/validate_training_data.h
@@ -0,0 +1,12 @@
+#ifndef _VALIDATE_TRAINING_DATA_H_
+#define _VALIDATE_TRAINING_DATA_H_
+
+#include <vector>
+#include <string>
+#include <sstream>
+
+namespace Stockfish::Tools {
+    void validate_training_data(std::istringstream& is);
+}
+
+#endif
diff --git a/src/uci.cpp b/src/uci.cpp
index 2fa7a186..5e0bb11b 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -33,6 +33,7 @@
 #include "tt.h"
 #include "uci.h"
 
+#include "tools/validate_training_data.h"
 #include "tools/training_data_generator.h"
 #include "tools/training_data_generator_nonpv.h"
 #include "tools/convert.h"
@@ -330,6 +331,7 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "generate_training_data") Tools::generate_training_data(is);
       else if (token == "generate_training_data") Tools::generate_training_data_nonpv(is);
       else if (token == "convert") Tools::convert(is);
+      else if (token == "validate_training_data") Tools::validate_training_data(is);
       else if (token == "convert_bin") Tools::convert_bin(is);
       else if (token == "convert_plain") Tools::convert_plain(is);
       else if (token == "convert_bin_from_pgn_extract") Tools::convert_bin_from_pgn_extract(is);

From 55ce07b773b1b57faf65d9591b199b350868a933 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 3 Apr 2021 16:15:51 +0200
Subject: [PATCH 581/583] Add additional checks for en-passant possiblity when
 fixing the erroneus ep flag from a fen.

---
 src/extra/nnue_data_binpack_format.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/src/extra/nnue_data_binpack_format.h b/src/extra/nnue_data_binpack_format.h
index 453169dd..0b1ac0aa 100644
--- a/src/extra/nnue_data_binpack_format.h
+++ b/src/extra/nnue_data_binpack_format.h
@@ -6116,6 +6116,26 @@ namespace chess
 
     [[nodiscard]] inline bool Position::isEpPossibleColdPath(Square epSquare, Bitboard pawnsAttackingEpSquare, Color sideToMove) const
     {
+        if (pieceAt(epSquare) != Piece::none())
+        {
+            return false;
+        }
+
+        const auto forward =
+            sideToMove == chess::Color::White
+            ? FlatSquareOffset(0, 1)
+            : FlatSquareOffset(0, -1);
+
+        if (pieceAt(epSquare + forward) != Piece::none())
+        {
+            return false;
+        }
+
+        if (pieceAt(epSquare + -forward) != Piece(PieceType::Pawn, !sideToMove))
+        {
+            return false;
+        }
+
         // only set m_epSquare when it matters, ie. when
         // the opposite side can actually capture
         for (Square sq : pawnsAttackingEpSquare)

From bf187c46c82d08ca8cef48a3e6e52ca64ee607e5 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Wed, 26 May 2021 15:12:08 +0200
Subject: [PATCH 582/583] Add a tool to interleave binpacks

this tool with take N binpacks as input to produce 1 binpack as output.
The input binpacks are read in random order, with a probability related to their size,
but each input file is read sequentially. The output is thus an appropriately shuffled binpack.
The tool is much faster than cat'ing the files together followed by a shuffle.
It assumes that the input binpacks themselves have no particular internal ordering.
---
 script/interleave_binpacks.py | 86 +++++++++++++++++++++++++++++++++++
 1 file changed, 86 insertions(+)
 create mode 100644 script/interleave_binpacks.py

diff --git a/script/interleave_binpacks.py b/script/interleave_binpacks.py
new file mode 100644
index 00000000..02888b33
--- /dev/null
+++ b/script/interleave_binpacks.py
@@ -0,0 +1,86 @@
+import struct
+import sys
+import os
+import random
+from pathlib import Path
+
+
+def copy_next_chunk(in_file, out_file):
+    chunk_header = in_file.read(8)
+    assert chunk_header[0:4] == b"BINP"
+    size = struct.unpack("<I", chunk_header[4:])[0]
+
+    out_file.write(chunk_header)
+    data = in_file.read(size)
+    out_file.write(data)
+
+    return size + 8
+
+
+def main():
+    if len(sys.argv) < 4:
+        print("Usage: python interleave_binpacks.py infile1 ... infileN outfile")
+        print("       The output binpack, will contain all data from the input files.")
+        print("       Data is read sequentially from the input, randomly alternating between files.")
+        return
+
+    # open last arg as output file name
+    out_filename = sys.argv[-1]
+    print("outfile: ", out_filename)
+
+    if Path(out_filename).exists():
+        print(
+            "Output path {} already exists. Please specify a path to a file that does not exist.".format(
+                out_filename
+            )
+        )
+        return
+
+    out_file = open(out_filename, "wb")
+
+    # open other args as input file names, and get their sizes
+    in_filenames = []
+    for i in range(1, len(sys.argv) - 1):
+        in_filenames.append(sys.argv[i])
+    print("infiles: ", in_filenames)
+
+    in_files = []
+    in_files_remaining = []
+    for in_filename in in_filenames:
+        in_file = open(in_filename, "rb")
+        in_files.append(in_file)
+        file_size = os.path.getsize(in_filename)
+        in_files_remaining.append(file_size)
+
+    # randomly pick a file, with a probability related to their sizes.
+    # copy from the front and keep track of remaining sizes
+    total_remaining = sum(in_files_remaining)
+    print("Merging {} bytes ".format(total_remaining))
+
+    total_size = 0
+    report_every = 100
+    prev_mib = -report_every
+
+    while total_remaining > 0:
+        where = random.randrange(total_remaining)
+        i = 0
+        while where >= in_files_remaining[i]:
+            where -= in_files_remaining[i]
+            i += 1
+        size = copy_next_chunk(in_files[i], out_file)
+        in_files_remaining[i] -= size
+        total_remaining -= size
+        total_size += size
+        mib = total_size // 1024 // 1024
+        if mib // 100 != prev_mib // 100:
+            print("Copied {} MiB".format(mib))
+            prev_mib = mib
+
+    out_file.close()
+    for in_file in in_files:
+        in_file.close()
+
+    print("Merged  {} bytes".format(total_size))
+
+
+main()

From cee4ed39bdf1e79d1754881e9adf5af557024479 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Sat, 12 Jun 2021 18:10:55 +0200
Subject: [PATCH 583/583] fix accumulator state initialization in
 set_from_packed_sfen

---
 src/tools/sfen_packer.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/tools/sfen_packer.cpp b/src/tools/sfen_packer.cpp
index a8e1fec2..8182503c 100644
--- a/src/tools/sfen_packer.cpp
+++ b/src/tools/sfen_packer.cpp
@@ -260,6 +260,8 @@ namespace Stockfish::Tools {
 
         pos.clear();
         std::memset(si, 0, sizeof(StateInfo));
+        si->accumulator.state[WHITE] = Eval::NNUE::INIT;
+        si->accumulator.state[BLACK] = Eval::NNUE::INIT;
         pos.st = si;
 
         // Active color