Moved the nnue folder.

2026-05-20 15:37:47 +00:00 · 2020-08-07 22:34:53 +09:00
parent c0e1235fef
commit 1c23465383
44 changed files with 0 additions and 0 deletions
@@ -0,0 +1,42 @@
+// Definition of input features and network structure used in NNUE evaluation function
+
+#ifndef HALFKP_CR_EP_256X2_32_32_H
+#define HALFKP_CR_EP_256X2_32_32_H
+
+#include "../features/feature_set.h"
+#include "../features/half_kp.h"
+#include "../features/castling_right.h"
+#include "../features/enpassant.h"
+
+#include "../layers/input_slice.h"
+#include "../layers/affine_transform.h"
+#include "../layers/clipped_relu.h"
+
+namespace Eval {
+
+  namespace NNUE {
+
+    // Input features used in evaluation function:
+    // HalfKP (friend side) combined with the CastlingRight and EnPassant features.
+    using RawFeatures = Features::FeatureSet<
+      Features::HalfKP<Features::Side::kFriend>, Features::CastlingRight,
+      Features::EnPassant>;
+
+    // Number of input feature dimensions after conversion
+    constexpr IndexType kTransformedFeatureDimensions = 256;
+
+    namespace Layers {
+
+      // Define network structure. Each layer type wraps the previous one, so
+      // OutputLayer names the whole chain:
+      //   InputSlice (kTransformedFeatureDimensions * 2 values)
+      //   -> AffineTransform<.., 32> wrapped in ClippedReLU
+      //   -> AffineTransform<.., 32> wrapped in ClippedReLU
+      //   -> AffineTransform<.., 1>
+      // (the second AffineTransform argument is presumably the output width)
+      using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+      using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+      using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+      using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+
+    }  // namespace Layers
+
+    // The complete network type is the type of its last layer.
+    using Network = Layers::OutputLayer;
+
+  }  // namespace NNUE
+
+}  // namespace Eval
+#endif // HALFKP_CR_EP_256X2_32_32_H
@@ -0,0 +1,39 @@
+// Definition of input features and network structure used in NNUE evaluation function
+
+#ifndef HALFKP_256X2_32_32_H
+#define HALFKP_256X2_32_32_H
+
+#include "../features/feature_set.h"
+#include "../features/half_kp.h"
+
+#include "../layers/input_slice.h"
+#include "../layers/affine_transform.h"
+#include "../layers/clipped_relu.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+// Input features used in evaluation function:
+// HalfKP (friend side) only.
+using RawFeatures = Features::FeatureSet<
+    Features::HalfKP<Features::Side::kFriend>>;
+
+// Number of input feature dimensions after conversion
+constexpr IndexType kTransformedFeatureDimensions = 256;
+
+namespace Layers {
+
+// Define network structure. Each layer type wraps the previous one, so
+// OutputLayer names the whole chain:
+//   InputSlice (kTransformedFeatureDimensions * 2 values)
+//   -> AffineTransform<.., 32> wrapped in ClippedReLU
+//   -> AffineTransform<.., 32> wrapped in ClippedReLU
+//   -> AffineTransform<.., 1>
+// (the second AffineTransform argument is presumably the output width)
+using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+
+}  // namespace Layers
+
+// The complete network type is the type of its last layer.
+using Network = Layers::OutputLayer;
+
+}  // namespace NNUE
+
+}  // namespace Eval
+#endif // HALFKP_256X2_32_32_H
@@ -0,0 +1,39 @@
+// Definition of input features and network structure used in NNUE evaluation function
+
+#ifndef HALFKP_384X2_32_32_H
+#define HALFKP_384X2_32_32_H
+
+#include "../features/feature_set.h"
+#include "../features/half_kp.h"
+
+#include "../layers/input_slice.h"
+#include "../layers/affine_transform.h"
+#include "../layers/clipped_relu.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+// Input features used in evaluation function:
+// HalfKP (friend side) only.
+using RawFeatures = Features::FeatureSet<
+    Features::HalfKP<Features::Side::kFriend>>;
+
+// Number of input feature dimensions after conversion
+// (384 here, versus 256 in the other architecture headers)
+constexpr IndexType kTransformedFeatureDimensions = 384;
+
+namespace Layers {
+
+// Define network structure. Each layer type wraps the previous one, so
+// OutputLayer names the whole chain:
+//   InputSlice (kTransformedFeatureDimensions * 2 values)
+//   -> AffineTransform<.., 32> wrapped in ClippedReLU
+//   -> AffineTransform<.., 32> wrapped in ClippedReLU
+//   -> AffineTransform<.., 1>
+// (the second AffineTransform argument is presumably the output width)
+using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+
+}  // namespace Layers
+
+// The complete network type is the type of its last layer.
+using Network = Layers::OutputLayer;
+
+}  // namespace NNUE
+
+}  // namespace Eval
+#endif // HALFKP_384X2_32_32_H
@@ -0,0 +1,42 @@
+// Definition of input features and network structure used in NNUE evaluation function
+
+#ifndef K_P_CR_EP_256X2_32_32_H
+#define K_P_CR_EP_256X2_32_32_H
+
+#include "../features/feature_set.h"
+#include "../features/k.h"
+#include "../features/p.h"
+#include "../features/castling_right.h"
+#include "../features/enpassant.h"
+
+#include "../layers/input_slice.h"
+#include "../layers/affine_transform.h"
+#include "../layers/clipped_relu.h"
+
+namespace Eval {
+
+  namespace NNUE {
+
+    // Input features used in evaluation function:
+    // the K and P features combined with CastlingRight and EnPassant.
+    using RawFeatures = Features::FeatureSet<Features::K, Features::P,
+      Features::CastlingRight, Features::EnPassant>;
+
+    // Number of input feature dimensions after conversion
+    constexpr IndexType kTransformedFeatureDimensions = 256;
+
+    namespace Layers {
+
+      // Define network structure. Each layer type wraps the previous one, so
+      // OutputLayer names the whole chain:
+      //   InputSlice (kTransformedFeatureDimensions * 2 values)
+      //   -> AffineTransform<.., 32> wrapped in ClippedReLU
+      //   -> AffineTransform<.., 32> wrapped in ClippedReLU
+      //   -> AffineTransform<.., 1>
+      // (the second AffineTransform argument is presumably the output width)
+      using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+      using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+      using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+      using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+
+    }  // namespace Layers
+
+    // The complete network type is the type of its last layer.
+    using Network = Layers::OutputLayer;
+
+  }  // namespace NNUE
+
+}  // namespace Eval
+#endif // K_P_CR_EP_256X2_32_32_H
@@ -0,0 +1,41 @@
+// Definition of input features and network structure used in NNUE evaluation function
+
+#ifndef K_P_CR_256X2_32_32_H
+#define K_P_CR_256X2_32_32_H
+
+#include "../features/feature_set.h"
+#include "../features/k.h"
+#include "../features/p.h"
+#include "../features/castling_right.h"
+
+#include "../layers/input_slice.h"
+#include "../layers/affine_transform.h"
+#include "../layers/clipped_relu.h"
+
+namespace Eval {
+
+  namespace NNUE {
+
+    // Input features used in evaluation function:
+    // the K and P features combined with CastlingRight.
+    using RawFeatures = Features::FeatureSet<Features::K, Features::P,
+      Features::CastlingRight>;
+
+    // Number of input feature dimensions after conversion
+    constexpr IndexType kTransformedFeatureDimensions = 256;
+
+    namespace Layers {
+
+      // Define network structure. Each layer type wraps the previous one, so
+      // OutputLayer names the whole chain:
+      //   InputSlice (kTransformedFeatureDimensions * 2 values)
+      //   -> AffineTransform<.., 32> wrapped in ClippedReLU
+      //   -> AffineTransform<.., 32> wrapped in ClippedReLU
+      //   -> AffineTransform<.., 1>
+      // (the second AffineTransform argument is presumably the output width)
+      using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+      using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+      using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+      using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+
+    }  // namespace Layers
+
+    // The complete network type is the type of its last layer.
+    using Network = Layers::OutputLayer;
+
+  }  // namespace NNUE
+
+}  // namespace Eval
+#endif // K_P_CR_256X2_32_32_H
@@ -0,0 +1,38 @@
+// Definition of input features and network structure used in NNUE evaluation function
+#ifndef K_P_256X2_32_32_H
+#define K_P_256X2_32_32_H
+
+#include "../features/feature_set.h"
+#include "../features/k.h"
+#include "../features/p.h"
+
+#include "../layers/input_slice.h"
+#include "../layers/affine_transform.h"
+#include "../layers/clipped_relu.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+// Input features used in evaluation function:
+// the K and P features only.
+using RawFeatures = Features::FeatureSet<Features::K, Features::P>;
+
+// Number of input feature dimensions after conversion
+constexpr IndexType kTransformedFeatureDimensions = 256;
+
+namespace Layers {
+
+// Define network structure. Each layer type wraps the previous one, so
+// OutputLayer names the whole chain:
+//   InputSlice (kTransformedFeatureDimensions * 2 values)
+//   -> AffineTransform<.., 32> wrapped in ClippedReLU
+//   -> AffineTransform<.., 32> wrapped in ClippedReLU
+//   -> AffineTransform<.., 1>
+// (the second AffineTransform argument is presumably the output width)
+using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
+using OutputLayer = AffineTransform<HiddenLayer2, 1>;
+
+}  // namespace Layers
+
+// The complete network type is the type of its last layer.
+using Network = Layers::OutputLayer;
+
+}  // namespace NNUE
+
+}  // namespace Eval
+#endif // K_P_256X2_32_32_H
@@ -0,0 +1,326 @@
+// Code for calculating NNUE evaluation function
+
+#if defined(EVAL_NNUE)
+
+#include <fstream>
+#include <iostream>
+
+#include "../../evaluate.h"
+#include "../../position.h"
+#include "../../misc.h"
+#include "../../uci.h"
+
+#include "evaluate_nnue.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+// Input feature converter
+// (storage is allocated and zero-filled by Initialize(), which load_eval() calls)
+AlignedPtr<FeatureTransformer> feature_transformer;
+
+// Evaluation function
+// (storage is allocated and zero-filled by Initialize(), which load_eval() calls)
+AlignedPtr<Network> network;
+
+// Evaluation function file name
+// (default value; load_eval() overwrites it with Options["EvalFile"])
+std::string fileName = "nn.bin";
+
+// Saved evaluation function file name
+// (the file name the learner's save_eval()/RestoreParameters() combine with a directory)
+std::string savedfileName = "nn.bin";
+
+// Build the description of the evaluation function structure, in the form
+// "Features=<feature transformer structure>,Network=<network structure>".
+std::string GetArchitectureString() {
+  std::string description = "Features=";
+  description += FeatureTransformer::GetStructureString();
+  description += ",Network=";
+  description += Network::GetStructureString();
+  return description;
+}
+
+namespace {
+
+namespace Detail {
+
+// Allocate aligned storage for one T, hand ownership to `pointer`, and
+// zero-fill it. No constructor of T is run; the bytes are only cleared.
+template <typename T>
+void Initialize(AlignedPtr<T>& pointer) {
+  void* const storage = aligned_malloc(sizeof(T), alignof(T));
+  pointer.reset(reinterpret_cast<T*>(storage));
+  std::memset(pointer.get(), 0, sizeof(T));
+}
+
+// Read one component's parameters from `stream`.
+// The component is preceded by a 32-bit hash that must equal T::GetHashValue();
+// returns false on a read error or hash mismatch, otherwise the result of
+// the component's own ReadParameters().
+template <typename T>
+bool ReadParameters(std::istream& stream, const AlignedPtr<T>& pointer) {
+  std::uint32_t stored_hash;
+  stream.read(reinterpret_cast<char*>(&stored_hash), sizeof(stored_hash));
+  if (!stream) return false;
+  if (stored_hash != T::GetHashValue()) return false;
+  return pointer->ReadParameters(stream);
+}
+
+// Write one component's parameters to `stream`: its 32-bit hash followed by
+// whatever the component's own WriteParameters() emits. Returns that call's result.
+template <typename T>
+bool WriteParameters(std::ostream& stream, const AlignedPtr<T>& pointer) {
+  constexpr std::uint32_t kComponentHash = T::GetHashValue();
+  stream.write(reinterpret_cast<const char*>(&kComponentHash),
+               sizeof(kComponentHash));
+  return pointer->WriteParameters(stream);
+}
+
+}  // namespace Detail
+
+// (Re)allocate and zero both the feature transformer and the network.
+void Initialize() {
+  Detail::Initialize(feature_transformer);
+  Detail::Initialize(network);
+}
+
+}  // namespace
+
+// Read the file header: a 32-bit version, a 32-bit structure hash, a 32-bit
+// length, then that many bytes of architecture description.
+// Stores the hash in *hash_value and the description in *architecture.
+// Returns false if the fixed-size fields cannot be read, if the version is
+// not kVersion, or if reading the description fails.
+// NOTE(review): the length comes straight from the file and is not bounded
+// before the resize — confirm whether a sanity limit is wanted.
+bool ReadHeader(std::istream& stream,
+  std::uint32_t* hash_value, std::string* architecture) {
+  std::uint32_t file_version;
+  std::uint32_t description_length;
+  stream.read(reinterpret_cast<char*>(&file_version), sizeof(file_version));
+  stream.read(reinterpret_cast<char*>(hash_value), sizeof(*hash_value));
+  stream.read(reinterpret_cast<char*>(&description_length),
+              sizeof(description_length));
+  if (!stream) return false;
+  if (file_version != kVersion) return false;
+  architecture->resize(description_length);
+  stream.read(&(*architecture)[0], description_length);
+  return !stream.fail();
+}
+
+// Write the file header: kVersion, the structure hash, the 32-bit length of
+// the architecture description, then the description bytes themselves.
+// Returns false if the stream is in a failed state afterwards.
+bool WriteHeader(std::ostream& stream,
+  std::uint32_t hash_value, const std::string& architecture) {
+  const auto description_length =
+      static_cast<std::uint32_t>(architecture.size());
+  stream.write(reinterpret_cast<const char*>(&kVersion), sizeof(kVersion));
+  stream.write(reinterpret_cast<const char*>(&hash_value), sizeof(hash_value));
+  stream.write(reinterpret_cast<const char*>(&description_length),
+               sizeof(description_length));
+  stream.write(architecture.data(), description_length);
+  return !stream.fail();
+}
+
+// Read the whole evaluation function from `stream`: header, feature
+// transformer, then network. Succeeds only if the header hash equals
+// kHashValue, every part reads correctly, and the stream ends exactly after
+// the last parameter (no trailing bytes).
+bool ReadParameters(std::istream& stream) {
+  std::uint32_t stored_hash;
+  std::string stored_architecture;
+  const bool loaded =
+      ReadHeader(stream, &stored_hash, &stored_architecture) &&
+      stored_hash == kHashValue &&
+      Detail::ReadParameters(stream, feature_transformer) &&
+      Detail::ReadParameters(stream, network);
+  if (!loaded) return false;
+  return stream && stream.peek() == std::ios::traits_type::eof();
+}
+
+// Write the whole evaluation function to `stream`: header, feature
+// transformer, then network. Stops at the first part that reports failure;
+// returns true only if every part succeeded and the stream has not failed.
+bool WriteParameters(std::ostream& stream) {
+  const bool written =
+      WriteHeader(stream, kHashValue, GetArchitectureString()) &&
+      Detail::WriteParameters(stream, feature_transformer) &&
+      Detail::WriteParameters(stream, network);
+  return written && !stream.fail();
+}
+
+// Update the accumulator incrementally when a difference calculation is possible.
+// Thin forwarder to the feature transformer, which makes that decision.
+static void UpdateAccumulatorIfPossible(const Position& pos) {
+  feature_transformer->UpdateAccumulatorIfPossible(pos);
+}
+
+// Calculate the evaluation value
+// Returns the score cached in the position's accumulator when one is already
+// computed and `refresh` is false. Otherwise transforms the features, runs
+// the network, scales and clamps the output, caches it in the accumulator
+// and returns it.
+static Value ComputeScore(const Position& pos, bool refresh = false) {
+  auto& accumulator = pos.state()->accumulator;
+  if (!refresh && accumulator.computed_score) {
+    return accumulator.score;
+  }
+
+  alignas(kCacheLineSize) TransformedFeatureType
+      transformed_features[FeatureTransformer::kBufferSize];
+  feature_transformer->Transform(pos, transformed_features, refresh);
+  alignas(kCacheLineSize) char buffer[Network::kBufferSize];
+  const auto output = network->Propagate(transformed_features, buffer);
+
+  // If a value larger than VALUE_MAX_EVAL is returned, aspiration search fails high
+  // and the search never terminates, so the result must be guaranteed not to exceed VALUE_MAX_EVAL.
+
+  // In a timed game this is apparently harmless: the search is aborted when the time runs out
+  // and the best move of the previous iteration is played as bestmove.
+  // Positions that would return VALUE_MAX_EVAL are almost always near-mate positions,
+  // which tend to appear in endgames where the outcome is already decided by a wide margin,
+  // so the result of the game is hardly affected.
+
+  // However, when searching to a fixed depth (e.g. when generating training data) the search never returns,
+  // wasting that thread's computation time. Fixed-depth games would also time out.
+
+  auto score = static_cast<Value>(output[0] / FV_SCALE);
+
+  // 1) Clipping carelessly here might have some effect on learning...
+  // 2) accumulator.score is not used in the difference calculation, so overwriting it is safe.
+  score = Math::clamp(score , -VALUE_MAX_EVAL , VALUE_MAX_EVAL);
+
+  accumulator.score = score;
+  accumulator.computed_score = true;
+  return accumulator.score;
+}
+
+} // namespace NNUE
+
+#if defined(USE_EVAL_HASH)
+// Class used to store evaluation values in HashTable
+// One entry is a (key, score) pair of two 64-bit fields, aligned to 16 bytes.
+struct alignas(16) ScoreKeyValue {
+#if defined(USE_SSE2)
+  // With SSE2, copying and assignment move the whole entry with a single
+  // 128-bit store instead of two separate 64-bit writes.
+  ScoreKeyValue() = default;
+  ScoreKeyValue(const ScoreKeyValue& other) {
+    static_assert(sizeof(ScoreKeyValue) == sizeof(__m128i),
+                  "sizeof(ScoreKeyValue) should be equal to sizeof(__m128i)");
+    _mm_store_si128(&as_m128i, other.as_m128i);
+  }
+  ScoreKeyValue& operator=(const ScoreKeyValue& other) {
+    _mm_store_si128(&as_m128i, other.as_m128i);
+    return *this;
+  }
+#endif
+
+  // The evaluate hash must be usable as if entries were updated atomically;
+  // encode()/decode() are the transformation used for that.
+  void encode() {
+#if defined(USE_SSE2)
+    // ScoreKeyValue is copied atomically, so if the key matches, the data matches.
+    // Nothing to do here.
+#else
+    // Mix the score into the key; presumably so that an entry whose two
+    // halves come from different writes fails the key comparison after decode().
+    key ^= score;
+#endif
+  }
+  // decode() is the inverse of encode(); since encode() is an xor, the inverse is the same operation.
+  void decode() { encode(); }
+
+  union {
+    struct {
+      std::uint64_t key;
+      std::uint64_t score;
+    };
+#if defined(USE_SSE2)
+    // Same 16 bytes viewed as one SSE register, for the single-store copy above.
+    __m128i as_m128i;
+#endif
+  };
+};
+
+// Minimal fixed-capacity hash table.
+// Size must be a power of 2 so that a key can be reduced to a slot by masking.
+template <typename T, size_t Size>
+struct HashTable {
+  // Check that Size is a power of 2
+  static_assert((Size & (Size - 1)) == 0, "");
+
+  // A new table starts with every slot zeroed.
+  HashTable() { clear(); }
+
+  // Zero-fill every slot.
+  void clear() { memset(entries_, 0, sizeof(T)*Size); }
+
+  // Pointer to the slot selected by the low bits of the key.
+  T* operator [] (const Key k) {
+    const size_t slot = static_cast<size_t>(k) & (Size - 1);
+    return &entries_[slot];
+  }
+
+ private:
+  T entries_[Size];
+};
+
+// HashTable that caches already-computed evaluation values (following ehash)
+
+#if !defined(USE_LARGE_EVAL_HASH)
+// 134MB: 0x800000 entries of 16 bytes each
+// (per the original note: the setting for everything other than "witch's AVX2")
+struct EvaluateHashTable : HashTable<ScoreKeyValue, 0x800000> {};
+#else
+// With prefetch available, a larger table might seem better...
+// -> It makes little difference and wastes memory, so the smaller table above may be fine as the default.
+// 1GB: 0x4000000 entries of 16 bytes each
+// (per the original note: the setting for "witch's AVX2")
+struct EvaluateHashTable : HashTable<ScoreKeyValue, 0x4000000> {};
+#endif
+
+// The single global evaluation cache used by evaluate() and prefetch_evalhash().
+EvaluateHashTable g_evalTable;
+
+// Prefetch the evaluation-hash slot that `key` maps to.
+// The slot address is rounded down to a 32-byte boundary (low five bits
+// cleared) before being handed to prefetch().
+// The address arithmetic uses std::uintptr_t, the integer type meant to
+// hold a pointer, rather than forcing the pointer through a 64-bit integer.
+void prefetch_evalhash(const Key key) {
+  constexpr auto mask = ~static_cast<std::uintptr_t>(0x1f);
+  const auto address = reinterpret_cast<std::uintptr_t>(g_evalTable[key]);
+  prefetch(reinterpret_cast<void*>(address & mask));
+}
+#endif
+
+// Load the evaluation function file named by Options["EvalFile"].
+// Options are saved and restored by the bench command etc., which changes EvalDir,
+// so this function may be called more than once when the evaluation function has to be reloaded.
+void load_eval() {
+
+  // Must be done first: allocates and zeroes the parameter storage.
+  NNUE::Initialize();
+
+  if (Options["SkipLoadingEval"]) {
+    std::cout << "info string SkipLoadingEval set to true, Net not loaded!" << std::endl;
+    return;
+  }
+
+  const std::string file_name = Options["EvalFile"];
+  NNUE::fileName = file_name;
+
+  std::ifstream stream(file_name, std::ios::binary);
+  if (NNUE::ReadParameters(stream)) {
+    std::cout << "info string NNUE " << NNUE::fileName << " found & loaded" << std::endl;
+  } else {
+    // It's a problem if it doesn't finish when there is a read error.
+    std::cout << "Error! " << NNUE::fileName << " not found or wrong format" << std::endl;
+  }
+}
+
+// Initialization
+// Intentionally empty: this evaluation function needs no work here
+// (parameter storage is set up by load_eval()).
+void init() {
+}
+
+// Evaluation function. Performs a full calculation instead of a difference calculation
+// (ComputeScore is called with refresh = true).
+// Called only once, from Position::set(); after that the difference calculation is used.
+// Note that the value returned is from the side to move's point of view
+// (the design differs from other evaluation functions in this respect).
+// Because it is called so rarely, no attempt is made to optimize this function.
+Value compute_eval(const Position& pos) {
+  return NNUE::ComputeScore(pos, true);
+}
+
+// Evaluation function
+// Lookup order: score already cached in the accumulator, then (when
+// USE_EVAL_HASH is defined) the evaluate hash table, and finally a fresh
+// computation via NNUE::ComputeScore(), whose result is stored back into
+// the hash table.
+Value evaluate(const Position& pos) {
+  const auto& accumulator = pos.state()->accumulator;
+  if (accumulator.computed_score) {
+    return accumulator.score;
+  }
+
+#if defined(USE_GLOBAL_OPTIONS)
+  // If Global Options is set not to use eval hash
+  // Skip the query to the eval hash.
+  if (!GlobalOptions.use_eval_hash) {
+    ASSERT_LV5(pos.state()->materialValue == Eval::material(pos));
+    return NNUE::ComputeScore(pos);
+  }
+#endif
+
+#if defined(USE_EVAL_HASH)
+  // May be in the evaluate hash table.
+  // Copy the slot first, then decode the copy, so the table itself is untouched.
+  const Key key = pos.key();
+  ScoreKeyValue entry = *g_evalTable[key];
+  entry.decode();
+  if (entry.key == key) {
+    // Hit: the cached entry belongs to this position.
+    return Value(entry.score);
+  }
+#endif
+
+  Value score = NNUE::ComputeScore(pos);
+#if defined(USE_EVAL_HASH)
+  // Having gone to the trouble of computing it, save it in the evaluate hash table.
+  entry.key = key;
+  entry.score = score;
+  entry.encode();
+  *g_evalTable[key] = entry;
+#endif
+
+  return score;
+}
+
+// Advance the incremental (difference) calculation if it is possible.
+// Returns nothing; only asks the feature transformer to update the accumulator.
+void evaluate_with_no_return(const Position& pos) {
+  NNUE::UpdateAccumulatorIfPossible(pos);
+}
+
+// Display the breakdown of the evaluation value of the current position.
+// Not implemented for this evaluation function: only prints a placeholder line.
+void print_eval_stat(Position& /*pos*/) {
+  std::cout << "--- EVAL STAT: not implemented" << std::endl;
+}
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
@@ -0,0 +1,67 @@
+// header used in NNUE evaluation function
+
+#ifndef _EVALUATE_NNUE_H_
+#define _EVALUATE_NNUE_H_
+
+#if defined(EVAL_NNUE)
+
+#include "nnue_feature_transformer.h"
+#include "nnue_architecture.h"
+
+#include <memory>
+
+namespace Eval {
+
+namespace NNUE {
+
+// hash value of evaluation function structure
+// (xor of the feature transformer's and the network's hash values;
+// ReadParameters() rejects files whose header hash differs)
+constexpr std::uint32_t kHashValue =
+    FeatureTransformer::GetHashValue() ^ Network::GetHashValue();
+
+// Deleter for automating release of memory area
+// Runs the object's destructor explicitly, then releases the storage with aligned_free().
+template <typename T>
+struct AlignedDeleter {
+  void operator()(T* ptr) const {
+    ptr->~T();
+    aligned_free(ptr);
+  }
+};
+// Owning pointer whose storage is released through AlignedDeleter.
+template <typename T>
+using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>;
+
+// Input feature converter
+extern AlignedPtr<FeatureTransformer> feature_transformer;
+
+// Evaluation function
+extern AlignedPtr<Network> network;
+
+// Evaluation function file name
+extern std::string fileName;
+
+// Saved evaluation function file name
+extern std::string savedfileName;
+
+// Get a string that represents the structure of the evaluation function
+std::string GetArchitectureString();
+
+// read the header
+// Fills *hash_value and *architecture; returns false on a read error or version mismatch.
+bool ReadHeader(std::istream& stream,
+    std::uint32_t* hash_value, std::string* architecture);
+
+// write the header
+// Returns false if the stream has failed after writing.
+bool WriteHeader(std::ostream& stream,
+    std::uint32_t hash_value, const std::string& architecture);
+
+// read evaluation function parameters
+// Returns true only if the header, both components, and the end of the stream all check out.
+bool ReadParameters(std::istream& stream);
+
+// write evaluation function parameters
+// Returns false as soon as any part fails to write.
+bool WriteParameters(std::ostream& stream);
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
@@ -0,0 +1,231 @@
+// Code for learning NNUE evaluation function
+
+#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#include <random>
+#include <fstream>
+
+#include "../../learn/learn.h"
+#include "../../learn/learning_tools.h"
+
+#include "../../position.h"
+#include "../../uci.h"
+#include "../../misc.h"
+#include "../../thread_win32_osx.h"
+
+#include "../evaluate_common.h"
+
+#include "evaluate_nnue.h"
+#include "evaluate_nnue_learner.h"
+#include "trainer/features/factorizer_feature_set.h"
+#include "trainer/features/factorizer_half_kp.h"
+#include "trainer/trainer_feature_transformer.h"
+#include "trainer/trainer_input_slice.h"
+#include "trainer/trainer_affine_transform.h"
+#include "trainer/trainer_clipped_relu.h"
+#include "trainer/trainer_sum.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace {
+
+// learning data
+// (filled by AddExample(), consumed in mini-batches by UpdateParameters())
+std::vector<Example> examples;
+
+// Mutex for exclusive control of examples
+std::mutex examples_mutex;
+
+// number of samples in mini-batch
+// (set through SetBatchSize(); zero until then)
+uint64_t batch_size;
+
+// random number generator
+// (default-constructed, i.e. never explicitly seeded in this file)
+std::mt19937 rng;
+
+// learner
+// (created by InitializeTraining())
+std::shared_ptr<Trainer<Network>> trainer;
+
+// Learning rate scale
+// (reset to 1.0 by InitializeTraining(), changed by SetGlobalLearningRateScale())
+double global_learning_rate_scale;
+
+// Get the learning rate scale
+double GetGlobalLearningRateScale() {
+  return global_learning_rate_scale;
+}
+
+// Tell the learner options such as hyperparameters
+// Each message is passed to the trainer in turn; in debug builds it is an
+// error if a message was received by nobody (num_receivers stays 0).
+void SendMessages(std::vector<Message> messages) {
+  for (auto& message : messages) {
+    trainer->SendMessage(&message);
+    assert(message.num_receivers > 0);
+  }
+}
+
+}  // namespace
+
+// Initialize learning
+// Creates the trainer for the current network and feature transformer (both
+// must already be allocated), resets the global learning rate scale to 1.0,
+// and forwards the eta schedule to EvalLearningTools::Weight::init_eta().
+void InitializeTraining(double eta1, uint64_t eta1_epoch,
+                        double eta2, uint64_t eta2_epoch, double eta3) {
+  std::cout << "Initializing NN training for "
+            << GetArchitectureString() << std::endl;
+
+  assert(feature_transformer);
+  assert(network);
+  trainer = Trainer<Network>::Create(network.get(), feature_transformer.get());
+
+  // No net was loaded in this case, so let the trainer initialize the
+  // parameters itself using the random number generator.
+  if (Options["SkipLoadingEval"]) {
+    trainer->Initialize(rng);
+  }
+
+  global_learning_rate_scale = 1.0;
+  // Note the argument order: the three eta values first, then the two epochs.
+  EvalLearningTools::Weight::init_eta(eta1, eta2, eta3, eta1_epoch, eta2_epoch);
+}
+
+// set the number of samples in the mini-batch
+// `size` must be positive (checked by assert in debug builds only).
+void SetBatchSize(uint64_t size) {
+  assert(size > 0);
+  batch_size = size;
+}
+
+// set the learning rate scale
+// The scale is the factor get_eta() multiplies EvalLearningTools::Weight::eta by.
+void SetGlobalLearningRateScale(double scale) {
+  global_learning_rate_scale = scale;
+}
+
+// Parse a comma-separated option string and send it to the trainer.
+// Each item is either "name" or "name=value"; one message is built per item
+// and the whole list is then delivered with SendMessages().
+void SetOptions(const std::string& options) {
+  std::vector<Message> messages;
+  for (const auto& option : Split(options, ',')) {
+    const auto parts = Split(option, '=');
+    const auto part_count = parts.size();
+    assert(part_count == 1 || part_count == 2);
+    if (part_count != 1) {
+      // "name=value"
+      messages.emplace_back(parts[0], parts[1]);
+      continue;
+    }
+    // bare "name"
+    messages.emplace_back(parts[0]);
+  }
+  SendMessages(std::move(messages));
+}
+
+// Reread the evaluation function parameters for learning from the file
+// Loads <dir_name>/<NNUE::savedfileName> and then sends "reset" to the trainer.
+// A failed read is reported on standard output; previously it was detected
+// only by assert(), so release builds ignored it silently. Debug builds
+// still stop at the assert.
+void RestoreParameters(const std::string& dir_name) {
+  const std::string file_name = Path::Combine(dir_name, NNUE::savedfileName);
+  std::ifstream stream(file_name, std::ios::binary);
+  const bool result = ReadParameters(stream);
+  if (!result) {
+    std::cout << "Error! failed to restore parameters from "
+              << file_name << std::endl;
+  }
+  assert(result);
+
+  SendMessages({{"reset"}});
+}
+
+// Add one training sample built from `pos`.
+// The sign is +1 when the root colour is the side to move, otherwise -1.
+// Active feature indices are gathered for every refresh trigger, the two
+// lists are swapped when the side to move is not WHITE, and for each colour
+// the factorized training features are sorted and merged so that each
+// index appears once. The finished sample is appended under examples_mutex.
+void AddExample(Position& pos, Color rootColor,
+                const Learner::PackedSfenValue& psv, double weight) {
+  Example sample;
+  sample.sign = (rootColor == pos.side_to_move()) ? 1 : -1;
+  sample.psv = psv;
+  sample.weight = weight;
+
+  Features::IndexList active_indices[2];
+  for (const auto trigger : kRefreshTriggers) {
+    RawFeatures::AppendActiveIndices(pos, trigger, active_indices);
+  }
+  if (pos.side_to_move() != WHITE) {
+    active_indices[0].swap(active_indices[1]);
+  }
+
+  // Every factorized index must fit in the bits TrainingFeature reserves for it.
+  static_assert(Features::Factorizer<RawFeatures>::GetDimensions() <
+                (1 << TrainingFeature::kIndexBits), "");
+
+  for (const auto color : Colors) {
+    std::vector<TrainingFeature> expanded;
+    for (const auto base_index : active_indices[color]) {
+      Features::Factorizer<RawFeatures>::AppendTrainingFeatures(
+          base_index, &expanded);
+    }
+    std::sort(expanded.begin(), expanded.end());
+
+    // After sorting, equal indices are adjacent: fold each run into one entry.
+    auto& merged = sample.training_features[color];
+    for (const auto& feature : expanded) {
+      const bool repeats_last = !merged.empty() &&
+          feature.GetIndex() == merged.back().GetIndex();
+      if (repeats_last) {
+        merged.back() += feature;
+      } else {
+        merged.push_back(feature);
+      }
+    }
+  }
+
+  std::lock_guard<std::mutex> lock(examples_mutex);
+  examples.push_back(std::move(sample));
+}
+
+// update the evaluation function parameters
+// Shuffles the collected examples and trains on them one mini-batch at a
+// time; examples left over when fewer than batch_size remain stay queued.
+// Finishes by sending "quantize_parameters" to the trainer.
+void UpdateParameters(uint64_t epoch) {
+  assert(batch_size > 0);
+
+  // Learning rate for this epoch, divided by the batch size.
+  EvalLearningTools::Weight::calc_eta(epoch);
+  const auto learning_rate = static_cast<LearnFloatType>(
+      get_eta() / batch_size);
+
+  // The lock is held for the whole update, so AddExample() blocks meanwhile.
+  std::lock_guard<std::mutex> lock(examples_mutex);
+  std::shuffle(examples.begin(), examples.end(), rng);
+  while (examples.size() >= batch_size) {
+    // Take the last batch_size examples and remove them from the queue.
+    std::vector<Example> batch(examples.end() - batch_size, examples.end());
+    examples.resize(examples.size() - batch_size);
+
+    const auto network_output = trainer->Propagate(batch);
+
+    std::vector<LearnFloatType> gradients(batch.size());
+    for (std::size_t b = 0; b < batch.size(); ++b) {
+      // Network output scaled by kPonanzaConstant and rounded to a Value;
+      // the sample's sign is applied here and again to the gradient below.
+      const auto shallow = static_cast<Value>(Round<std::int32_t>(
+          batch[b].sign * network_output[b] * kPonanzaConstant));
+      const auto& psv = batch[b].psv;
+      const double gradient = batch[b].sign * Learner::calc_grad(shallow, psv);
+      gradients[b] = static_cast<LearnFloatType>(gradient * batch[b].weight);
+    }
+
+    trainer->Backpropagate(gradients.data(), learning_rate);
+  }
+  SendMessages({{"quantize_parameters"}});
+}
+
+// Check if there are any problems with learning
+// Only sends the "check_health" message to the trainer; the check itself
+// is carried out by whichever trainer components receive it.
+void CheckHealth() {
+  SendMessages({{"check_health"}});
+}
+
+}  // namespace NNUE
+
+// Save the evaluation function parameters to a file.
+// The file NNUE::savedfileName is written under Options["EvalSaveDir"]/dir_name.
+// A failed write is reported on standard output and the "finished" line is
+// not printed; previously failure was detected only by assert(), so release
+// builds reported success regardless. Debug builds still stop at the assert.
+void save_eval(std::string dir_name) {
+  auto eval_dir = Path::Combine(Options["EvalSaveDir"], dir_name);
+  std::cout << "save_eval() start. folder = " << eval_dir << std::endl;
+
+  // mkdir() fails if the folder already exists, which is fine:
+  // we only want it created when it is missing.
+  // The folders up to EvalSaveDir are assumed to exist already.
+  Dependency::mkdir(eval_dir);
+
+  if (Options["SkipLoadingEval"] && NNUE::trainer) {
+    NNUE::SendMessages({{"clear_unobserved_feature_weights"}});
+  }
+
+  const std::string file_name = Path::Combine(eval_dir, NNUE::savedfileName);
+  std::ofstream stream(file_name, std::ios::binary);
+  const bool result = NNUE::WriteParameters(stream);
+  if (!result) {
+    std::cout << "Error! save_eval() failed to write " << file_name << std::endl;
+    assert(result);
+    return;
+  }
+
+  std::cout << "save_eval() finished. folder = " << eval_dir << std::endl;
+}
+
+// get the current eta
+// i.e. EvalLearningTools::Weight::eta multiplied by the global learning rate scale.
+double get_eta() {
+  return NNUE::GetGlobalLearningRateScale() * EvalLearningTools::Weight::eta;
+}
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
@@ -0,0 +1,46 @@
+// Interface used for learning NNUE evaluation function
+
+#ifndef _EVALUATE_NNUE_LEARNER_H_
+#define _EVALUATE_NNUE_LEARNER_H_
+
+#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#include "../../learn/learn.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+// Initialize learning
+// Creates the trainer and sets up the eta schedule from the given values.
+void InitializeTraining(double eta1, uint64_t eta1_epoch,
+                        double eta2, uint64_t eta2_epoch, double eta3);
+
+// set the number of samples in the mini-batch
+// (size must be greater than 0)
+void SetBatchSize(uint64_t size);
+
+// set the learning rate scale
+void SetGlobalLearningRateScale(double scale);
+
+// Set options such as hyperparameters
+// `options` is a comma-separated list of "name" or "name=value" items.
+void SetOptions(const std::string& options);
+
+// Reread the evaluation function parameters for learning from the file
+// `dir_name` is the directory containing the saved evaluation file.
+void RestoreParameters(const std::string& dir_name);
+
+// Add 1 sample of learning data
+void AddExample(Position& pos, Color rootColor,
+                const Learner::PackedSfenValue& psv, double weight);
+
+// update the evaluation function parameters
+// Trains on the queued samples in mini-batches using the eta for `epoch`.
+void UpdateParameters(uint64_t epoch);
+
+// Check if there are any problems with learning
+void CheckHealth();
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#endif
@@ -0,0 +1,73 @@
+// Definition of input feature quantity CastlingRight of NNUE evaluation function
+
+#if defined(EVAL_NNUE)
+
+#include "castling_right.h"
+#include "index_list.h"
+
+namespace Eval {
+
+  namespace NNUE {
+
+    namespace Features {
+
+      // Get a list of indices with a value of 1 among the features.
+      // Feature index i is active when bit i of the perspective-relative
+      // castling rights is set.
+      void CastlingRight::AppendActiveIndices(
+        const Position& pos, Color perspective, IndexList* active) {
+        // do nothing if array size is small to avoid compiler warning
+        if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
+
+        const int castling_rights = pos.state()->castlingRights;
+        int relative_castling_rights = castling_rights;
+        if (perspective != WHITE) {
+          // Invert the perspective: swap the low two bits with the high two
+          // bits (presumably White's and Black's rights respectively).
+          // The halves must be combined with OR; AND-ing them always gives 0
+          // because they occupy disjoint bit positions.
+          relative_castling_rights = ((castling_rights & 3) << 2)
+            | ((castling_rights >> 2) & 3);
+        }
+
+        for (IndexType i = 0; i < kDimensions; ++i) {
+          // Test bit i. The mask is (1 << i); (i << 1) would yield 0, 2, 4, 6,
+          // never testing bit 0 and testing two bits at once for i == 3.
+          if (relative_castling_rights & (1 << i)) {
+            active->push_back(i);
+          }
+        }
+      }
+
+      // Get a list of indices whose values have changed from the previous one in the feature quantity.
+      // Only removals are reported: an index is appended to `removed` when its
+      // bit was set in the previous state and is clear in the current one.
+      // `added` is never written here (presumably because castling rights can
+      // only be lost, never regained).
+      void CastlingRight::AppendChangedIndices(
+        const Position& pos, Color perspective,
+        IndexList* removed, IndexList* added) {
+
+        const int previous_castling_rights = pos.state()->previous->castlingRights;
+        const int current_castling_rights = pos.state()->castlingRights;
+        int relative_previous_castling_rights = previous_castling_rights;
+        int relative_current_castling_rights = current_castling_rights;
+        if (perspective != WHITE) {
+          // Invert the perspective: swap the low two bits with the high two
+          // bits. The halves must be combined with OR; AND-ing them always
+          // gives 0 because they occupy disjoint bit positions.
+          relative_previous_castling_rights = ((previous_castling_rights & 3) << 2)
+            | ((previous_castling_rights >> 2) & 3);
+          relative_current_castling_rights = ((current_castling_rights & 3) << 2)
+            | ((current_castling_rights >> 2) & 3);
+        }
+
+        for (IndexType i = 0; i < kDimensions; ++i) {
+          // Test bit i with the mask (1 << i); (i << 1) is not a single-bit mask.
+          const int bit = 1 << i;
+          if ((relative_previous_castling_rights & bit) &&
+            (relative_current_castling_rights & bit) == 0) {
+            removed->push_back(i);
+          }
+        }
+      }
+
+    }  // namespace Features
+
+  }  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
@@ -0,0 +1,48 @@
+// Definition of input feature quantity CastlingRight of NNUE evaluation function
+
+#ifndef _NNUE_FEATURES_CASTLING_RIGHT_H_
+#define _NNUE_FEATURES_CASTLING_RIGHT_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../../../evaluate.h"
+#include "features_common.h"
+
+namespace Eval {
+
+  namespace NNUE {
+
+    namespace Features {
+
+      // Feature CastlingRight: the castling rights still available,
+      // seen from the given perspective
+      class CastlingRight {
+      public:
+        // feature quantity name
+        static constexpr const char* kName = "CastlingRight";
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue = 0x913968AAu;
+        // number of feature dimensions
+        static constexpr IndexType kDimensions = 4;
+        // The maximum value of the number of indexes whose value is 1 at the same time among the feature values
+        static constexpr IndexType kMaxActiveDimensions = 4;
+        // Timing of full calculation instead of difference calculation
+        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+
+        // Get a list of indices with a value of 1 among the features
+        static void AppendActiveIndices(const Position& pos, Color perspective,
+          IndexList* active);
+
+        // Get a list of indices whose values have changed from the previous one in the feature quantity
+        static void AppendChangedIndices(const Position& pos, Color perspective,
+          IndexList* removed, IndexList* added);
+      };
+
+    }  // namespace Features
+
+  }  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
@@ -0,0 +1,47 @@
+// Definition of input feature EnPassant of NNUE evaluation function
+
+#if defined(EVAL_NNUE)
+
+#include "enpassant.h"
+#include "index_list.h"
+
+namespace Eval {
+
+  namespace NNUE {
+
+    namespace Features {
+
+      // Appends the index of the active en-passant feature, if there is one.
+      // The index is the file of the en-passant square; when `perspective` is
+      // BLACK the square is first transformed with Inv().
+      void EnPassant::AppendActiveIndices(
+        const Position& pos, Color perspective, IndexList* active) {
+        // Nothing to do when the index list is too small for this feature
+        // (this guard exists to avoid a compiler warning).
+        if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
+
+        const auto raw_square = pos.state()->epSquare;
+        // No en-passant square in this position: no active index.
+        if (raw_square == SQ_NONE) return;
+
+        const auto oriented_square =
+          (perspective == BLACK) ? Inv(raw_square) : raw_square;
+        active->push_back(file_of(oriented_square));
+      }
+
+      // Get a list of indices whose values have changed from the previous one.
+      // Not implemented: the body only asserts. EnPassant declares
+      // kRefreshTrigger = TriggerEvent::kAnyPieceMoved, and for that trigger
+      // FeatureSetBase::AppendChangedIndices always requests a reset and collects
+      // the active indices instead, so this function is not reached through
+      // that path.
+      void EnPassant::AppendChangedIndices(
+        const Position& pos, Color perspective,
+        IndexList* removed, IndexList* added) {
+        // Not implemented.
+        assert(false);
+      }
+
+    }  // namespace Features
+
+  }  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
@@ -0,0 +1,48 @@
+// Definition of input feature EnPassant of NNUE evaluation function
+
+#ifndef _NNUE_FEATURES_ENPASSANT_H_
+#define _NNUE_FEATURES_ENPASSANT_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../../../evaluate.h"
+#include "features_common.h"
+
+namespace Eval {
+
+  namespace NNUE {
+
+    namespace Features {
+
+      // Feature EnPassant: file of the en-passant square
+      // (at most one index is active at a time)
+      class EnPassant {
+      public:
+        // Feature name
+        static constexpr const char* kName = "EnPassant";
+        // Hash value embedded in the evaluation function file
+        static constexpr std::uint32_t kHashValue = 0x02924F91u;
+        // Number of feature dimensions
+        static constexpr IndexType kDimensions = 8;
+        // Maximum number of feature indices whose value is 1 at the same time
+        static constexpr IndexType kMaxActiveDimensions = 1;
+        // Event that triggers a full calculation instead of a difference calculation
+        // (kAnyPieceMoved: always do all calculations)
+        static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kAnyPieceMoved;
+
+        // Get a list of indices with a value of 1 among the features
+        static void AppendActiveIndices(const Position& pos, Color perspective,
+          IndexList* active);
+
+        // Get a list of indices whose values have changed from the previous one
+        // (indices that turned off go to `removed`, indices that turned on go to `added`)
+        static void AppendChangedIndices(const Position& pos, Color perspective,
+          IndexList* removed, IndexList* added);
+      };
+
+    }  // namespace Features
+
+  }  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
@@ -0,0 +1,249 @@
+// A class template that represents the input feature set of the NNUE evaluation function
+
+#ifndef _NNUE_FEATURE_SET_H_
+#define _NNUE_FEATURE_SET_H_
+
+#if defined(EVAL_NNUE)
+
+#include "features_common.h"
+#include <array>
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// A class template that represents a compile-time list of values of type T.
+// Contains(value) tests membership; kValues exposes the elements as an array.
+template <typename T, T... Values>
+struct CompileTimeList;
+
+// Non-empty list: First followed by Remaining...
+template <typename T, T First, T... Remaining>
+struct CompileTimeList<T, First, Remaining...> {
+  // Returns true if `value` equals any element of the list
+  static constexpr bool Contains(T value) {
+    return value == First || CompileTimeList<T, Remaining...>::Contains(value);
+  }
+  // The elements in declaration order
+  static constexpr std::array<T, sizeof...(Remaining) + 1>
+      kValues = {{First, Remaining...}};
+};
+// Out-of-class definition of kValues; needed before C++17 when kValues is
+// odr-used (since C++17 constexpr static members are implicitly inline).
+template <typename T, T First, T... Remaining>
+constexpr std::array<T, sizeof...(Remaining) + 1>
+    CompileTimeList<T, First, Remaining...>::kValues;
+
+// Empty list
+template <typename T>
+struct CompileTimeList<T> {
+  // The empty list contains nothing
+  static constexpr bool Contains(T /*value*/) {
+    return false;
+  }
+  static constexpr std::array<T, 0> kValues = {{}};
+};
+// Out-of-class definition for the empty list as well, for consistency with
+// the non-empty specialization above.
+template <typename T>
+constexpr std::array<T, 0> CompileTimeList<T>::kValues;
+
+// Class template that yields a new CompileTimeList with one more value.
+// Despite the name, the new value is placed at the FRONT of the list.
+template <typename T, typename ListType, T Value>
+struct AppendToList;
+template <typename T, T... Existing, T NewFront>
+struct AppendToList<T, CompileTimeList<T, Existing...>, NewFront> {
+  using Result = CompileTimeList<T, NewFront, Existing...>;
+};
+
+// Class template for adding a value to a sorted, unique list.
+// Result is the resulting CompileTimeList type:
+//  - if the list already contains the value, the list is returned unchanged;
+//  - otherwise, if the value is less than the first element, it is put in front;
+//  - otherwise, the value is inserted recursively into the remaining elements
+//    and the first element is put back in front with AppendToList.
+template <typename T, typename ListType, T Value>
+struct InsertToSet;
+// Non-empty list case
+template <typename T, T First, T... Remaining, T AnotherValue>
+struct InsertToSet<T, CompileTimeList<T, First, Remaining...>, AnotherValue> {
+  using Result = std::conditional_t<
+      CompileTimeList<T, First, Remaining...>::Contains(AnotherValue),
+      CompileTimeList<T, First, Remaining...>,
+      std::conditional_t<(AnotherValue <First),
+          CompileTimeList<T, AnotherValue, First, Remaining...>,
+          typename AppendToList<T, typename InsertToSet<
+              T, CompileTimeList<T, Remaining...>, AnotherValue>::Result,
+              First>::Result>>;
+};
+// Empty list case (end of the recursion): the result is a one-element list
+template <typename T, T Value>
+struct InsertToSet<T, CompileTimeList<T>, Value> {
+  using Result = CompileTimeList<T, Value>;
+};
+
+// Base class of feature set.
+// Derived is the concrete FeatureSet; it must provide the static functions
+// CollectActiveIndices and CollectChangedIndices that are called from here.
+template <typename Derived>
+class FeatureSetBase {
+ public:
+  // Get a list of indices with a value of 1 among the features.
+  // active[c] receives the indices seen from the perspective of color c.
+  template <typename IndexListType>
+  static void AppendActiveIndices(
+      const Position& pos, TriggerEvent trigger, IndexListType active[2]) {
+    for (const auto perspective :Colors) {
+      Derived::CollectActiveIndices(
+          pos, trigger, perspective, &active[perspective]);
+    }
+  }
+
+  // Get a list of indices whose values have changed from the previous one in the feature quantity.
+  // For each perspective, reset[perspective] tells the caller whether a full
+  // calculation is required: if true, added[perspective] receives all active
+  // indices; if false, removed/added receive only the changed indices.
+  template <typename PositionType, typename IndexListType>
+  static void AppendChangedIndices(
+      const PositionType& pos, TriggerEvent trigger,
+      IndexListType removed[2], IndexListType added[2], bool reset[2]) {
+    const auto& dp = pos.state()->dirtyPiece;
+    if (dp.dirty_num == 0) {
+      // Nothing moved: report "no reset" explicitly so that the caller never
+      // reads an unwritten reset[] entry; the index lists are left untouched.
+      for (const auto perspective :Colors) {
+        reset[perspective] = false;
+      }
+      return;
+    }
+
+    for (const auto perspective :Colors) {
+      reset[perspective] = false;
+      switch (trigger) {
+        case TriggerEvent::kNone:
+          break;
+        case TriggerEvent::kFriendKingMoved:
+          // only the first dirty piece is inspected for a king move
+          reset[perspective] =
+              dp.pieceNo[0] == PIECE_NUMBER_KING + perspective;
+          break;
+        case TriggerEvent::kEnemyKingMoved:
+          reset[perspective] =
+              dp.pieceNo[0] == PIECE_NUMBER_KING + ~perspective;
+          break;
+        case TriggerEvent::kAnyKingMoved:
+          reset[perspective] = dp.pieceNo[0] >= PIECE_NUMBER_KING;
+          break;
+        case TriggerEvent::kAnyPieceMoved:
+          reset[perspective] = true;
+          break;
+        default:
+          assert(false);
+          break;
+      }
+      if (reset[perspective]) {
+        Derived::CollectActiveIndices(
+            pos, trigger, perspective, &added[perspective]);
+      } else {
+        Derived::CollectChangedIndices(
+            pos, trigger, perspective,
+            &removed[perspective], &added[perspective]);
+      }
+    }
+  }
+};
+
+// Class template that represents the feature set (general case: a first
+// feature type followed by the remaining ones; the one-argument case has its
+// own specialization).
+// Internally the remaining features (Tail) are processed before the first one
+// (Head), i.e. in reverse order of the template arguments, in order to
+// linearize the amount of calculation at runtime.
+template <typename FirstFeatureType, typename... RemainingFeatureTypes>
+class FeatureSet<FirstFeatureType, RemainingFeatureTypes...> :
+    public FeatureSetBase<
+        FeatureSet<FirstFeatureType, RemainingFeatureTypes...>> {
+ private:
+  using Head = FirstFeatureType;
+  using Tail = FeatureSet<RemainingFeatureTypes...>;
+
+ public:
+  // Hash value embedded in the evaluation function file:
+  // Head's hash combined with Tail's hash rotated left by one bit
+  static constexpr std::uint32_t kHashValue =
+      Head::kHashValue ^ (Tail::kHashValue << 1) ^ (Tail::kHashValue >> 31);
+  // Total number of feature dimensions
+  static constexpr IndexType kDimensions =
+      Head::kDimensions + Tail::kDimensions;
+  // Upper bound on the number of indices that are 1 at the same time
+  static constexpr IndexType kMaxActiveDimensions =
+      Head::kMaxActiveDimensions + Tail::kMaxActiveDimensions;
+  // Sorted, duplicate-free list of the refresh triggers of all features
+  using SortedTriggerSet = typename InsertToSet<TriggerEvent,
+      typename Tail::SortedTriggerSet, Head::kRefreshTrigger>::Result;
+  static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
+
+  // Name of the feature set: the feature names joined with "+"
+  static std::string GetName() {
+    return std::string(Head::kName) + "+" + Tail::GetName();
+  }
+
+ private:
+  // Collect the active indices of Tail, then those of Head (only when Head's
+  // refresh trigger matches). Head's indices are shifted by Tail::kDimensions.
+  template <typename IndexListType>
+  static void CollectActiveIndices(
+      const Position& pos, const TriggerEvent trigger, const Color perspective,
+      IndexListType* const active) {
+    Tail::CollectActiveIndices(pos, trigger, perspective, active);
+    if (Head::kRefreshTrigger != trigger) return;
+
+    auto index = active->size();
+    Head::AppendActiveIndices(pos, perspective, active);
+    for (const auto end = active->size(); index < end; ++index) {
+      (*active)[index] += Tail::kDimensions;
+    }
+  }
+
+  // Collect the changed indices of Tail, then those of Head (only when Head's
+  // refresh trigger matches). Head's indices are shifted by Tail::kDimensions.
+  template <typename IndexListType>
+  static void CollectChangedIndices(
+      const Position& pos, const TriggerEvent trigger, const Color perspective,
+      IndexListType* const removed, IndexListType* const added) {
+    Tail::CollectChangedIndices(pos, trigger, perspective, removed, added);
+    if (Head::kRefreshTrigger != trigger) return;
+
+    auto removed_index = removed->size();
+    auto added_index = added->size();
+    Head::AppendChangedIndices(pos, perspective, removed, added);
+    for (const auto end = removed->size(); removed_index < end;
+         ++removed_index) {
+      (*removed)[removed_index] += Tail::kDimensions;
+    }
+    for (const auto end = added->size(); added_index < end; ++added_index) {
+      (*added)[added_index] += Tail::kDimensions;
+    }
+  }
+
+  // The base class and every other FeatureSet instantiation (used
+  // recursively as Tail) need access to the private Collect* functions
+  friend class FeatureSetBase<FeatureSet>;
+  template <typename... FeatureTypes>
+  friend class FeatureSet;
+};
+
+// Class template that represents the feature set.
+// Specialization for exactly one feature type: everything is forwarded to
+// that feature type.
+template <typename FeatureType>
+class FeatureSet<FeatureType> : public FeatureSetBase<FeatureSet<FeatureType>> {
+ public:
+  // Hash value embedded in the evaluation function file
+  static constexpr std::uint32_t kHashValue = FeatureType::kHashValue;
+  // Number of feature dimensions
+  static constexpr IndexType kDimensions = FeatureType::kDimensions;
+  // Upper bound on the number of indices that are 1 at the same time
+  static constexpr IndexType kMaxActiveDimensions =
+      FeatureType::kMaxActiveDimensions;
+  // One-element list holding the refresh trigger of the feature
+  using SortedTriggerSet =
+      CompileTimeList<TriggerEvent, FeatureType::kRefreshTrigger>;
+  static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
+
+  // Name of the feature set (the name of its only feature)
+  static std::string GetName() {
+    return FeatureType::kName;
+  }
+
+ private:
+  // Collect the active indices, but only for the feature's own refresh trigger
+  static void CollectActiveIndices(
+      const Position& pos, const TriggerEvent trigger, const Color perspective,
+      IndexList* const active) {
+    if (FeatureType::kRefreshTrigger != trigger) return;
+    FeatureType::AppendActiveIndices(pos, perspective, active);
+  }
+
+  // Collect the changed indices, but only for the feature's own refresh trigger
+  static void CollectChangedIndices(
+      const Position& pos, const TriggerEvent trigger, const Color perspective,
+      IndexList* const removed, IndexList* const added) {
+    if (FeatureType::kRefreshTrigger != trigger) return;
+    FeatureType::AppendChangedIndices(pos, perspective, removed, added);
+  }
+
+  // The base class and every other FeatureSet instantiation need access to
+  // the private Collect* functions
+  friend class FeatureSetBase<FeatureSet>;
+  template <typename... FeatureTypes>
+  friend class FeatureSet;
+};
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
@@ -0,0 +1,47 @@
+//Common header of input features of NNUE evaluation function
+
+#ifndef _NNUE_FEATURES_COMMON_H_
+#define _NNUE_FEATURES_COMMON_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../../../evaluate.h"
+#include "../nnue_common.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// Index list type
+class IndexList;
+
+// Class template that represents the feature set
+template <typename... FeatureTypes>
+class FeatureSet;
+
+// Kind of event that triggers a full calculation instead of a difference calculation
+enum class TriggerEvent {
+  kNone, // calculate the difference whenever possible
+  kFriendKingMoved, // do all calculations when the own king moves
+  kEnemyKingMoved, // do all calculations when the enemy king moves
+  kAnyKingMoved, // do all calculations when either king moves
+  kAnyPieceMoved, // always do all calculations
+};
+
+// Which side a feature is associated with, relative to the perspective
+// being evaluated
+enum class Side {
+  kFriend, // own side
+  kEnemy, // opponent
+};
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
@@ -0,0 +1,84 @@
+//Definition of input features HalfKP of NNUE evaluation function
+
+#if defined(EVAL_NNUE)
+
+#include "half_kp.h"
+#include "index_list.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// Compute the feature index from the king square and a BonaPiece:
+// each king square owns a contiguous range of fe_end indices.
+template <Side AssociatedKing>
+inline IndexType HalfKP<AssociatedKing>::MakeIndex(Square sq_k, BonaPiece p) {
+  const IndexType king_base =
+      static_cast<IndexType>(fe_end) * static_cast<IndexType>(sq_k);
+  return king_base + p;
+}
+
+// Fetch the piece list seen from `perspective` and the square of the king
+// this feature is associated with (own king for kFriend, enemy king otherwise)
+template <Side AssociatedKing>
+inline void HalfKP<AssociatedKing>::GetPieces(
+    const Position& pos, Color perspective,
+    BonaPiece** pieces, Square* sq_target_k) {
+  const auto list = pos.eval_list();
+  if (perspective == BLACK) {
+    *pieces = list->piece_list_fb();
+  } else {
+    *pieces = list->piece_list_fw();
+  }
+
+  // Piece number of the associated king
+  const PieceNumber king_no = (AssociatedKing == Side::kFriend) ?
+      static_cast<PieceNumber>(PIECE_NUMBER_KING + perspective) :
+      static_cast<PieceNumber>(PIECE_NUMBER_KING + ~perspective);
+
+  // Recover the king square from its BonaPiece value
+  const auto king_piece = (*pieces)[king_no];
+  *sq_target_k = static_cast<Square>((king_piece - f_king) % SQUARE_NB);
+}
+
+// Append one index per non-king piece that is present in the piece list,
+// combined with the square of the associated king
+template <Side AssociatedKing>
+void HalfKP<AssociatedKing>::AppendActiveIndices(
+    const Position& pos, Color perspective, IndexList* active) {
+  // Nothing to do when the index list is too small for this feature
+  // (this guard exists to avoid a compiler warning)
+  if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
+
+  BonaPiece* piece_list;
+  Square king_square;
+  GetPieces(pos, perspective, &piece_list, &king_square);
+  for (PieceNumber no = PIECE_NUMBER_ZERO; no < PIECE_NUMBER_KING; ++no) {
+    // BONA_PIECE_ZERO entries carry no feature
+    if (piece_list[no] == Eval::BONA_PIECE_ZERO) continue;
+    active->push_back(MakeIndex(king_square, piece_list[no]));
+  }
+}
+
+// For every dirty non-king piece, append the index of its old state to
+// `removed` and the index of its new state to `added`
+template <Side AssociatedKing>
+void HalfKP<AssociatedKing>::AppendChangedIndices(
+    const Position& pos, Color perspective,
+    IndexList* removed, IndexList* added) {
+  BonaPiece* piece_list;
+  Square king_square;
+  GetPieces(pos, perspective, &piece_list, &king_square);
+
+  const auto& dirty = pos.state()->dirtyPiece;
+  for (int n = 0; n < dirty.dirty_num; ++n) {
+    // King moves are not handled as differences here
+    if (dirty.pieceNo[n] < PIECE_NUMBER_KING) {
+      const auto& change = dirty.changed_piece[n];
+
+      const auto before =
+          static_cast<BonaPiece>(change.old_piece.from[perspective]);
+      if (before != Eval::BONA_PIECE_ZERO) {
+        removed->push_back(MakeIndex(king_square, before));
+      }
+
+      const auto after =
+          static_cast<BonaPiece>(change.new_piece.from[perspective]);
+      if (after != Eval::BONA_PIECE_ZERO) {
+        added->push_back(MakeIndex(king_square, after));
+      }
+    }
+  }
+}
+
+template class HalfKP<Side::kFriend>;
+template class HalfKP<Side::kEnemy>;
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
@@ -0,0 +1,62 @@
+//Definition of input features HalfKP of NNUE evaluation function
+
+#ifndef _NNUE_FEATURES_HALF_KP_H_
+#define _NNUE_FEATURES_HALF_KP_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../../../evaluate.h"
+#include "features_common.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// Feature HalfKP: Combination of the position of the own king or the enemy king
+// and the position of pieces other than kings
+template <Side AssociatedKing>
+class HalfKP {
+ public:
+  // Feature name
+  static constexpr const char* kName =
+      (AssociatedKing == Side::kFriend) ? "HalfKP(Friend)" : "HalfKP(Enemy)";
+  // Hash value embedded in the evaluation function file
+  // (the lowest bit differs between the kFriend and kEnemy variants)
+  static constexpr std::uint32_t kHashValue =
+      0x5D69D5B9u ^ (AssociatedKing == Side::kFriend);
+  // Number of feature dimensions: one block of fe_end indices per king square
+  static constexpr IndexType kDimensions =
+      static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(fe_end);
+  // Maximum number of feature indices whose value is 1 at the same time
+  static constexpr IndexType kMaxActiveDimensions = PIECE_NUMBER_KING;
+  // Event that triggers a full calculation instead of a difference calculation:
+  // a move of the king this feature is associated with
+  static constexpr TriggerEvent kRefreshTrigger =
+      (AssociatedKing == Side::kFriend) ?
+      TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
+
+  // Get a list of indices with a value of 1 among the features
+  static void AppendActiveIndices(const Position& pos, Color perspective,
+                                  IndexList* active);
+
+  // Get a list of indices whose values have changed from the previous one in the feature quantity
+  static void AppendChangedIndices(const Position& pos, Color perspective,
+                                   IndexList* removed, IndexList* added);
+
+  // Find the index of the feature quantity from the king position and BonaPiece
+  static IndexType MakeIndex(Square sq_k, BonaPiece p);
+
+ private:
+  // Get the piece list for `perspective` and the square of the associated king
+  static void GetPieces(const Position& pos, Color perspective,
+                        BonaPiece** pieces, Square* sq_target_k);
+};
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
@@ -0,0 +1,97 @@
+//Definition of input features HalfRelativeKP of NNUE evaluation function
+
+#if defined(EVAL_NNUE)
+
+#include "half_relative_kp.h"
+#include "index_list.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// Compute the feature index from the king square and a BonaPiece.
+// The piece square is expressed relative to the king on a virtual board of
+// kBoardWidth x kBoardHeight with the king in the center.
+template <Side AssociatedKing>
+inline IndexType HalfRelativeKP<AssociatedKing>::MakeIndex(
+    Square sq_k, BonaPiece p) {
+  constexpr IndexType kWidth = kBoardWidth;
+  constexpr IndexType kHeight = kBoardHeight;
+  // Split the BonaPiece (past the fe_hand_end range) into piece kind and square
+  const IndexType kind = (p - fe_hand_end) / SQUARE_NB;
+  const Square piece_square =
+      static_cast<Square>((p - fe_hand_end) % SQUARE_NB);
+  // Offsets relative to the king, shifted so that the king sits in the center
+  const IndexType file_offset =
+      file_of(piece_square) - file_of(sq_k) + (kWidth / 2);
+  const IndexType rank_offset =
+      rank_of(piece_square) - rank_of(sq_k) + (kHeight / 2);
+  return kHeight * kWidth * kind + kHeight * file_offset + rank_offset;
+}
+
+// Fetch the piece list seen from `perspective` and the square of the king
+// this feature is associated with (own king for kFriend, enemy king otherwise)
+template <Side AssociatedKing>
+inline void HalfRelativeKP<AssociatedKing>::GetPieces(
+    const Position& pos, Color perspective,
+    BonaPiece** pieces, Square* sq_target_k) {
+  const auto list = pos.eval_list();
+  if (perspective == BLACK) {
+    *pieces = list->piece_list_fb();
+  } else {
+    *pieces = list->piece_list_fw();
+  }
+
+  // Piece number of the associated king
+  const PieceNumber king_no = (AssociatedKing == Side::kFriend) ?
+      static_cast<PieceNumber>(PIECE_NUMBER_KING + perspective) :
+      static_cast<PieceNumber>(PIECE_NUMBER_KING + ~perspective);
+
+  // Recover the king square from its BonaPiece value
+  const auto king_piece = (*pieces)[king_no];
+  *sq_target_k = static_cast<Square>((king_piece - f_king) % SQUARE_NB);
+}
+
+// Append one index per non-king piece whose BonaPiece is at or beyond
+// fe_hand_end and is not BONA_PIECE_ZERO, relative to the associated king
+template <Side AssociatedKing>
+void HalfRelativeKP<AssociatedKing>::AppendActiveIndices(
+    const Position& pos, Color perspective, IndexList* active) {
+  // Nothing to do when the index list is too small for this feature
+  // (this guard exists to avoid a compiler warning)
+  if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
+
+  BonaPiece* piece_list;
+  Square king_square;
+  GetPieces(pos, perspective, &piece_list, &king_square);
+  for (PieceNumber no = PIECE_NUMBER_ZERO; no < PIECE_NUMBER_KING; ++no) {
+    if (piece_list[no] >= fe_hand_end &&
+        piece_list[no] != Eval::BONA_PIECE_ZERO) {
+      active->push_back(MakeIndex(king_square, piece_list[no]));
+    }
+  }
+}
+
+// For every dirty non-king piece, append the index of its old state to
+// `removed` and the index of its new state to `added`; states below
+// fe_hand_end or equal to BONA_PIECE_ZERO produce no index
+template <Side AssociatedKing>
+void HalfRelativeKP<AssociatedKing>::AppendChangedIndices(
+    const Position& pos, Color perspective,
+    IndexList* removed, IndexList* added) {
+  BonaPiece* piece_list;
+  Square king_square;
+  GetPieces(pos, perspective, &piece_list, &king_square);
+
+  const auto& dirty = pos.state()->dirtyPiece;
+  for (int n = 0; n < dirty.dirty_num; ++n) {
+    // King moves are not handled as differences here
+    if (dirty.pieceNo[n] < PIECE_NUMBER_KING) {
+      const auto& change = dirty.changed_piece[n];
+
+      const auto before =
+          static_cast<BonaPiece>(change.old_piece.from[perspective]);
+      if (before >= fe_hand_end && before != Eval::BONA_PIECE_ZERO) {
+        removed->push_back(MakeIndex(king_square, before));
+      }
+
+      const auto after =
+          static_cast<BonaPiece>(change.new_piece.from[perspective]);
+      if (after >= fe_hand_end && after != Eval::BONA_PIECE_ZERO) {
+        added->push_back(MakeIndex(king_square, after));
+      }
+    }
+  }
+}
+
+template class HalfRelativeKP<Side::kFriend>;
+template class HalfRelativeKP<Side::kEnemy>;
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
@@ -0,0 +1,68 @@
+//Definition of input features HalfRelativeKP of NNUE evaluation function
+
+#ifndef _NNUE_FEATURES_HALF_RELATIVE_KP_H_
+#define _NNUE_FEATURES_HALF_RELATIVE_KP_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../../../evaluate.h"
+#include "features_common.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// Feature HalfRelativeKP: Relative position of each piece other than the kings,
+// based on the own king or the enemy king
+template <Side AssociatedKing>
+class HalfRelativeKP {
+ public:
+  // Feature name
+  static constexpr const char* kName = (AssociatedKing == Side::kFriend) ?
+      "HalfRelativeKP(Friend)" : "HalfRelativeKP(Enemy)";
+  // Hash value embedded in the evaluation function file
+  // (the lowest bit differs between the kFriend and kEnemy variants)
+  static constexpr std::uint32_t kHashValue =
+      0xF9180919u ^ (AssociatedKing == Side::kFriend);
+  // Number of piece kinds excluding kings
+  static constexpr IndexType kNumPieceKinds = (fe_end - fe_hand_end) / SQUARE_NB;
+  // Width of the virtual board with the king in the center
+  static constexpr IndexType kBoardWidth = FILE_NB * 2 - 1;
+  // Height of the virtual board with the king in the center
+  static constexpr IndexType kBoardHeight = RANK_NB * 2 - 1;
+  // Number of feature dimensions
+  static constexpr IndexType kDimensions =
+      kNumPieceKinds * kBoardHeight * kBoardWidth;
+  // Maximum number of feature indices whose value is 1 at the same time
+  static constexpr IndexType kMaxActiveDimensions = PIECE_NUMBER_KING;
+  // Event that triggers a full calculation instead of a difference calculation:
+  // a move of the king this feature is associated with
+  static constexpr TriggerEvent kRefreshTrigger =
+      (AssociatedKing == Side::kFriend) ?
+      TriggerEvent::kFriendKingMoved : TriggerEvent::kEnemyKingMoved;
+
+  // Get a list of indices with a value of 1 among the features
+  static void AppendActiveIndices(const Position& pos, Color perspective,
+                                  IndexList* active);
+
+  // Get a list of indices whose values have changed from the previous one in the feature quantity
+  static void AppendChangedIndices(const Position& pos, Color perspective,
+                                   IndexList* removed, IndexList* added);
+
+  // Find the index of the feature quantity from the king position and BonaPiece
+  static IndexType MakeIndex(Square sq_k, BonaPiece p);
+
+ private:
+  // Get the piece list for `perspective` and the square of the associated king
+  static void GetPieces(const Position& pos, Color perspective,
+                        BonaPiece** pieces, Square* sq_target_k);
+};
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
@@ -0,0 +1,55 @@
+// Definition of index list of input features
+
+#ifndef _NNUE_FEATURES_INDEX_LIST_H_
+#define _NNUE_FEATURES_INDEX_LIST_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../../../position.h"
+#include "../nnue_architecture.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// Class template used for feature index list: a list with a fixed capacity
+// of MaxSize elements stored inline. No member performs bounds checking.
+template <typename T, std::size_t MaxSize>
+class ValueList {
+ public:
+  // Number of elements currently in the list
+  std::size_t size() const { return size_; }
+  // Set the number of elements; the stored values are not touched
+  void resize(std::size_t size) { size_ = size; }
+  // Add a copy of `value` after the current last element
+  void push_back(const T& value) {
+    values_[size_] = value;
+    ++size_;
+  }
+
+  // Unchecked element access
+  T& operator[](std::size_t index) { return values_[index]; }
+  const T& operator[](std::size_t index) const { return values_[index]; }
+
+  // Iteration over the elements in use
+  T* begin() { return values_; }
+  const T* begin() const { return values_; }
+  T* end() { return values_ + size_; }
+  const T* end() const { return values_ + size_; }
+
+  // Exchange contents with `other`: swap as many slots as the longer of the
+  // two lists uses, then swap the sizes
+  void swap(ValueList& other) {
+    const std::size_t count = (size_ < other.size_) ? other.size_ : size_;
+    for (std::size_t i = 0; i != count; ++i) {
+      std::swap(values_[i], other.values_[i]);
+    }
+    std::swap(size_, other.size_);
+  }
+
+ private:
+  T values_[MaxSize];
+  std::size_t size_ = 0;
+};
+
+// Type of feature index list: a ValueList of IndexType whose capacity is the
+// maximum number of simultaneously active dimensions of RawFeatures
+class IndexList
+    : public ValueList<IndexType, RawFeatures::kMaxActiveDimensions> {
+};
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
@@ -0,0 +1,49 @@
+//Definition of input feature quantity K of NNUE evaluation function
+
+#if defined(EVAL_NNUE)
+
+#include "k.h"
+#include "index_list.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// Append one index per king: the king's BonaPiece value relative to fe_end
+void K::AppendActiveIndices(
+    const Position& pos, Color perspective, IndexList* active) {
+  // Nothing to do when the index list is too small for this feature
+  // (this guard exists to avoid a compiler warning)
+  if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
+
+  // Piece list as seen from `perspective`
+  const BonaPiece* piece_list = nullptr;
+  if (perspective == BLACK) {
+    piece_list = pos.eval_list()->piece_list_fb();
+  } else {
+    piece_list = pos.eval_list()->piece_list_fw();
+  }
+  // Both kings must be present
+  assert(piece_list[PIECE_NUMBER_BKING] != BONA_PIECE_ZERO);
+  assert(piece_list[PIECE_NUMBER_WKING] != BONA_PIECE_ZERO);
+  for (PieceNumber no = PIECE_NUMBER_KING; no < PIECE_NUMBER_NB; ++no) {
+    const auto king_piece = piece_list[no];
+    active->push_back(king_piece - fe_end);
+  }
+}
+
+// When the first dirty piece is a king, append its old index to `removed`
+// and its new index to `added`; otherwise nothing changes for this feature
+void K::AppendChangedIndices(
+    const Position& pos, Color perspective,
+    IndexList* removed, IndexList* added) {
+  const auto& dirty = pos.state()->dirtyPiece;
+  // Only the first dirty piece is inspected
+  if (dirty.pieceNo[0] < PIECE_NUMBER_KING) return;
+
+  const auto& king_change = dirty.changed_piece[0];
+  removed->push_back(king_change.old_piece.from[perspective] - fe_end);
+  added->push_back(king_change.new_piece.from[perspective] - fe_end);
+}
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
@@ -0,0 +1,48 @@
+//Definition of input feature quantity K of NNUE evaluation function
+
+#ifndef _NNUE_FEATURES_K_H_
+#define _NNUE_FEATURES_K_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../../../evaluate.h"
+#include "features_common.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// Feature K: King position
+class K {
+ public:
+  // Feature name
+  static constexpr const char* kName = "K";
+  // Hash value embedded in the evaluation function file
+  static constexpr std::uint32_t kHashValue = 0xD3CEE169u;
+  // Number of feature dimensions
+  static constexpr IndexType kDimensions = SQUARE_NB * 2;
+  // Maximum number of feature indices whose value is 1 at the same time
+  static constexpr IndexType kMaxActiveDimensions = 2;
+  // Event that triggers a full calculation instead of a difference calculation
+  // (kNone: calculate the difference whenever possible)
+  static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+
+  // Get a list of indices with a value of 1 among the features
+  static void AppendActiveIndices(const Position& pos, Color perspective,
+                                  IndexList* active);
+
+  // Get a list of indices whose values have changed from the previous one in the feature quantity
+  static void AppendChangedIndices(const Position& pos, Color perspective,
+                                   IndexList* removed, IndexList* added);
+};
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
@@ -0,0 +1,52 @@
+//Definition of input feature P of NNUE evaluation function
+
+#if defined(EVAL_NNUE)
+
+#include "p.h"
+#include "index_list.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// Append the BonaPiece value of every non-king piece that is present
+void P::AppendActiveIndices(
+    const Position& pos, Color perspective, IndexList* active) {
+  // Nothing to do when the index list is too small for this feature
+  // (this guard exists to avoid a compiler warning)
+  if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
+
+  // Piece list as seen from `perspective`
+  const BonaPiece* piece_list = nullptr;
+  if (perspective == BLACK) {
+    piece_list = pos.eval_list()->piece_list_fb();
+  } else {
+    piece_list = pos.eval_list()->piece_list_fw();
+  }
+  for (PieceNumber no = PIECE_NUMBER_ZERO; no < PIECE_NUMBER_KING; ++no) {
+    // BONA_PIECE_ZERO entries carry no feature
+    if (piece_list[no] == Eval::BONA_PIECE_ZERO) continue;
+    active->push_back(piece_list[no]);
+  }
+}
+
+// For every dirty non-king piece, append its old BonaPiece to `removed` and
+// its new BonaPiece to `added` (BONA_PIECE_ZERO states are skipped)
+void P::AppendChangedIndices(
+    const Position& pos, Color perspective,
+    IndexList* removed, IndexList* added) {
+  const auto& dirty = pos.state()->dirtyPiece;
+  for (int n = 0; n < dirty.dirty_num; ++n) {
+    // King moves do not affect this feature
+    if (dirty.pieceNo[n] < PIECE_NUMBER_KING) {
+      const auto& change = dirty.changed_piece[n];
+      if (change.old_piece.from[perspective] != Eval::BONA_PIECE_ZERO) {
+        removed->push_back(change.old_piece.from[perspective]);
+      }
+      if (change.new_piece.from[perspective] != Eval::BONA_PIECE_ZERO) {
+        added->push_back(change.new_piece.from[perspective]);
+      }
+    }
+  }
+}
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
@@ -0,0 +1,48 @@
+//Definition of input feature P of NNUE evaluation function
+
+#ifndef _NNUE_FEATURES_P_H_
+#define _NNUE_FEATURES_P_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../../../evaluate.h"
+#include "features_common.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// Feature P: BonaPiece of pieces other than kings
+class P {
+ public:
+  // Feature name
+  static constexpr const char* kName = "P";
+  // Hash value embedded in the evaluation function file
+  static constexpr std::uint32_t kHashValue = 0x764CFB4Bu;
+  // Number of feature dimensions
+  static constexpr IndexType kDimensions = fe_end;
+  // Maximum number of feature indices whose value is 1 at the same time
+  static constexpr IndexType kMaxActiveDimensions = PIECE_NUMBER_KING;
+  // Event that triggers a full calculation instead of a difference calculation
+  // (kNone: calculate the difference whenever possible)
+  static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kNone;
+
+  // Get a list of indices with a value of 1 among the features
+  static void AppendActiveIndices(const Position& pos, Color perspective,
+                                  IndexList* active);
+
+  // Get a list of indices whose values have changed from the previous one in the feature quantity
+  static void AppendChangedIndices(const Position& pos, Color perspective,
+                                   IndexList* removed, IndexList* added);
+};
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
@@ -0,0 +1,217 @@
+// Definition of layer AffineTransform of NNUE evaluation function
+
+#ifndef _NNUE_LAYERS_AFFINE_TRANSFORM_H_
+#define _NNUE_LAYERS_AFFINE_TRANSFORM_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../nnue_common.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Layers {
+
+// Affine transformation layer: output[i] = biases_[i] + sum over j of weights_[i][j] * input[j]
+template <typename PreviousLayer, IndexType OutputDimensions>
+class AffineTransform {
+ public:
+  // Input/output types: consumes the previous layer's 8-bit unsigned output, produces 32-bit sums
+  using InputType = typename PreviousLayer::OutputType;
+  using OutputType = std::int32_t;
+  static_assert(std::is_same<InputType, std::uint8_t>::value, "");
+
+  // Number of input/output dimensions; each weight row is padded to a multiple of kMaxSimdWidth
+  static constexpr IndexType kInputDimensions =
+      PreviousLayer::kOutputDimensions;
+  static constexpr IndexType kOutputDimensions = OutputDimensions;
+  static constexpr IndexType kPaddedInputDimensions =
+      CeilToMultiple<IndexType>(kInputDimensions, kMaxSimdWidth);
+
+  // Size in bytes of the forward propagation buffer used by this layer alone (a multiple of kCacheLineSize)
+  static constexpr std::size_t kSelfBufferSize =
+      CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+
+  // Size in bytes of the forward propagation buffer used from the input layer up to this layer
+  static constexpr std::size_t kBufferSize =
+      PreviousLayer::kBufferSize + kSelfBufferSize;
+
+  // Hash value embedded in the evaluation function file; mixes kOutputDimensions with the previous layer's hash
+  static constexpr std::uint32_t GetHashValue() {
+    std::uint32_t hash_value = 0xCC03DAE4u;
+    hash_value += kOutputDimensions;
+    hash_value ^= PreviousLayer::GetHashValue() >> 1;
+    hash_value ^= PreviousLayer::GetHashValue() << 31;
+    return hash_value;
+  }
+
+  // Returns a string that describes the structure from the input layer to this layer
+  static std::string GetStructureString() {
+    return "AffineTransform[" +
+        std::to_string(kOutputDimensions) + "<-" +
+        std::to_string(kInputDimensions) + "](" +
+        PreviousLayer::GetStructureString() + ")";
+  }
+
+  // Reads the previous layer's parameters, then this layer's biases and padded weights; false on failure
+  bool ReadParameters(std::istream& stream) {
+    if (!previous_layer_.ReadParameters(stream)) return false;
+    stream.read(reinterpret_cast<char*>(biases_),
+                kOutputDimensions * sizeof(BiasType));
+    stream.read(reinterpret_cast<char*>(weights_),
+                kOutputDimensions * kPaddedInputDimensions *
+                sizeof(WeightType));
+    return !stream.fail();
+  }
+
+  // Writes the parameters in the same order ReadParameters reads them; false on failure
+  bool WriteParameters(std::ostream& stream) const {
+    if (!previous_layer_.WriteParameters(stream)) return false;
+    stream.write(reinterpret_cast<const char*>(biases_),
+                 kOutputDimensions * sizeof(BiasType));
+    stream.write(reinterpret_cast<const char*>(weights_),
+                 kOutputDimensions * kPaddedInputDimensions *
+                 sizeof(WeightType));
+    return !stream.fail();
+  }
+
+  // Forward propagation: the output is written at the start of `buffer`; earlier layers use the part after kSelfBufferSize
+  const OutputType* Propagate(
+      const TransformedFeatureType* transformed_features, char* buffer) const {
+    const auto input = previous_layer_.Propagate(
+        transformed_features, buffer + kSelfBufferSize);
+    const auto output = reinterpret_cast<OutputType*>(buffer);
+#if defined(USE_AVX512)
+    constexpr IndexType kNumChunks = kPaddedInputDimensions / (kSimdWidth * 2);
+    const __m512i kOnes = _mm512_set1_epi16(1);
+    const auto input_vector = reinterpret_cast<const __m512i*>(input);
+#elif defined(USE_AVX2)
+    constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+    const __m256i kOnes = _mm256_set1_epi16(1);
+    const auto input_vector = reinterpret_cast<const __m256i*>(input);
+#elif defined(USE_SSSE3)
+    constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+    const __m128i kOnes = _mm_set1_epi16(1);
+    const auto input_vector = reinterpret_cast<const __m128i*>(input);
+#elif defined(IS_ARM)
+    constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+    const auto input_vector = reinterpret_cast<const int8x8_t*>(input);
+#endif
+    for (IndexType i = 0; i < kOutputDimensions; ++i) {
+      const IndexType offset = i * kPaddedInputDimensions;  // start of weight row i
+#if defined(USE_AVX512)
+      __m512i sum = _mm512_setzero_si512();
+      const auto row = reinterpret_cast<const __m512i*>(&weights_[offset]);
+      for (IndexType j = 0; j < kNumChunks; ++j) {
+#if defined(__MINGW32__) || defined(__MINGW64__)
+          __m512i product = _mm512_maddubs_epi16(_mm512_loadu_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
+#else
+          __m512i product = _mm512_maddubs_epi16(_mm512_load_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
+#endif
+          product = _mm512_madd_epi16(product, kOnes);  // add adjacent 16-bit products into 32-bit lanes
+          sum = _mm512_add_epi32(sum, product);
+      }
+      output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];
+      
+      // Note: kMaxSimdWidth stays at 32, because changing it to 64 breaks loading existing networks.
+      // As a result kPaddedInputDimensions may not be a multiple of 64 bytes (512 bits),
+      // in which case the last 32 bytes are handled here as one extra 256-bit chunk.
+      if (kPaddedInputDimensions != kNumChunks * kSimdWidth * 2)
+      {
+          const auto iv_256  = reinterpret_cast<const __m256i*>(input);
+          const auto row_256 = reinterpret_cast<const __m256i*>(&weights_[offset]);
+          int j = kNumChunks * 2;  // index of the leftover chunk, counted in 256-bit units
+#if defined(__MINGW32__) || defined(__MINGW64__)  // See HACK comment below in AVX2.
+          __m256i sum256 = _mm256_maddubs_epi16(_mm256_loadu_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
+#else
+          __m256i sum256 = _mm256_maddubs_epi16(_mm256_load_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
+#endif
+          sum256 = _mm256_madd_epi16(sum256, _mm256_set1_epi16(1));
+
+          sum256 = _mm256_hadd_epi32(sum256, sum256);
+          sum256 = _mm256_hadd_epi32(sum256, sum256);
+          const __m128i lo = _mm256_extracti128_si256(sum256, 0);
+          const __m128i hi = _mm256_extracti128_si256(sum256, 1);
+          output[i] += _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi);
+      }
+#elif defined(USE_AVX2)
+      __m256i sum = _mm256_setzero_si256();
+      const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
+      for (IndexType j = 0; j < kNumChunks; ++j) {
+        __m256i product = _mm256_maddubs_epi16(
+#if defined(__MINGW32__) || defined(__MINGW64__)
+          // HACK: use the unaligned load _mm256_loadu_si256() instead of _mm256_load_si256(),
+          //       because the binary compiled with g++ in MSYS2 crashes here: the memory is
+          //       not aligned even though alignas is specified.
+          _mm256_loadu_si256
+#else
+          _mm256_load_si256
+#endif
+          (&input_vector[j]), _mm256_load_si256(&row[j]));
+        product = _mm256_madd_epi16(product, kOnes);  // add adjacent 16-bit products into 32-bit lanes
+        sum = _mm256_add_epi32(sum, product);
+      }
+      sum = _mm256_hadd_epi32(sum, sum);  // two horizontal adds fold each 128-bit half to a single total
+      sum = _mm256_hadd_epi32(sum, sum);
+      const __m128i lo = _mm256_extracti128_si256(sum, 0);
+      const __m128i hi = _mm256_extracti128_si256(sum, 1);
+      output[i] = _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi) + biases_[i];
+#elif defined(USE_SSSE3)
+      __m128i sum = _mm_cvtsi32_si128(biases_[i]);  // the bias seeds lane 0 of the accumulator
+      const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
+      for (IndexType j = 0; j < kNumChunks; ++j) {
+        __m128i product = _mm_maddubs_epi16(
+            _mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
+        product = _mm_madd_epi16(product, kOnes);  // add adjacent 16-bit products into 32-bit lanes
+        sum = _mm_add_epi32(sum, product);
+      }
+      sum = _mm_hadd_epi32(sum, sum);
+      sum = _mm_hadd_epi32(sum, sum);
+      output[i] = _mm_cvtsi128_si32(sum);
+#elif defined(IS_ARM)
+      int32x4_t sum = {biases_[i]};  // the bias seeds lane 0 of the accumulator
+      const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]);
+      for (IndexType j = 0; j < kNumChunks; ++j) {
+        int16x8_t product = vmull_s8(input_vector[j * 2], row[j * 2]);
+        product = vmlal_s8(product, input_vector[j * 2 + 1], row[j * 2 + 1]);
+        sum = vpadalq_s16(sum, product);
+      }
+      output[i] = sum[0] + sum[1] + sum[2] + sum[3];
+#else
+      OutputType sum = biases_[i];
+      for (IndexType j = 0; j < kInputDimensions; ++j) {
+        sum += weights_[offset + j] * input[j];  // scalar fallback: plain dot product over the unpadded inputs
+      }
+      output[i] = sum;
+#endif
+    }
+    return output;
+  }
+
+ private:
+  // Parameter types
+  using BiasType = OutputType;
+  using WeightType = std::int8_t;
+
+  // Give the learning (Trainer) class access to the private members
+  friend class Trainer<AffineTransform>;
+
+  // The layer immediately before this layer
+  PreviousLayer previous_layer_;
+
+  // Parameters: one bias per output; weights_ is row-major with kPaddedInputDimensions entries per output row
+  alignas(kCacheLineSize) BiasType biases_[kOutputDimensions];
+  alignas(kCacheLineSize)
+      WeightType weights_[kOutputDimensions * kPaddedInputDimensions];
+};
+
+}  // namespace Layers
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
@@ -0,0 +1,177 @@
+// Definition of layer ClippedReLU of NNUE evaluation function
+
+#ifndef _NNUE_LAYERS_CLIPPED_RELU_H_
+#define _NNUE_LAYERS_CLIPPED_RELU_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../nnue_common.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Layers {
+
+// Clipped ReLU layer: output[i] = clamp(input[i] >> kWeightScaleBits, 0, 127), stored as 8-bit values
+template <typename PreviousLayer>
+class ClippedReLU {
+ public:
+  // Input/output types: consumes 32-bit sums, produces 8-bit unsigned activations
+  using InputType = typename PreviousLayer::OutputType;
+  using OutputType = std::uint8_t;
+  static_assert(std::is_same<InputType, std::int32_t>::value, "");
+
+  // Number of input/output dimensions (the output has as many elements as the input)
+  static constexpr IndexType kInputDimensions =
+      PreviousLayer::kOutputDimensions;
+  static constexpr IndexType kOutputDimensions = kInputDimensions;
+
+  // Size in bytes of the forward propagation buffer used by this layer alone (a multiple of kCacheLineSize)
+  static constexpr std::size_t kSelfBufferSize =
+      CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+
+  // Size in bytes of the forward propagation buffer used from the input layer up to this layer
+  static constexpr std::size_t kBufferSize =
+      PreviousLayer::kBufferSize + kSelfBufferSize;
+
+  // Hash value embedded in the evaluation function file: a layer constant plus the previous layer's hash
+  static constexpr std::uint32_t GetHashValue() {
+    std::uint32_t hash_value = 0x538D24C7u;
+    hash_value += PreviousLayer::GetHashValue();
+    return hash_value;
+  }
+
+  // Returns a string that describes the structure from the input layer to this layer
+  static std::string GetStructureString() {
+    return "ClippedReLU[" +
+        std::to_string(kOutputDimensions) + "](" +
+        PreviousLayer::GetStructureString() + ")";
+  }
+
+  // Reads parameters: this layer stores none itself, so it only forwards to the previous layer
+  bool ReadParameters(std::istream& stream) {
+    return previous_layer_.ReadParameters(stream);
+  }
+
+  // Writes parameters: this layer stores none itself, so it only forwards to the previous layer
+  bool WriteParameters(std::ostream& stream) const {
+    return previous_layer_.WriteParameters(stream);
+  }
+
+  // Forward propagation: SIMD code converts whole chunks; the scalar loop at the end covers elements from kStart on
+  const OutputType* Propagate(
+      const TransformedFeatureType* transformed_features, char* buffer) const {
+    const auto input = previous_layer_.Propagate(
+        transformed_features, buffer + kSelfBufferSize);
+    const auto output = reinterpret_cast<OutputType*>(buffer);
+#if defined(USE_AVX2)
+    constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
+    const __m256i kZero = _mm256_setzero_si256();
+    const __m256i kOffsets = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);  // undoes the per-128-bit-lane interleaving of the packs
+    const auto in = reinterpret_cast<const __m256i*>(input);
+    const auto out = reinterpret_cast<__m256i*>(output);
+    for (IndexType i = 0; i < kNumChunks; ++i) {
+      const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
+#if defined(__MINGW32__) || defined(__MINGW64__)
+        // HACK: use the unaligned load _mm256_loadu_si256() instead of _mm256_load_si256(),
+        //       because the binary compiled with g++ in MSYS2 crashes here: the memory is
+        //       not aligned even though alignas is specified.
+        _mm256_loadu_si256
+#else
+        _mm256_load_si256
+#endif
+        (&in[i * 4 + 0]),
+#if defined(__MINGW32__) || defined(__MINGW64__)
+        _mm256_loadu_si256
+#else
+        _mm256_load_si256
+#endif
+        (&in[i * 4 + 1])), kWeightScaleBits);
+      const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
+#if defined(__MINGW32__) || defined(__MINGW64__)
+        _mm256_loadu_si256
+#else
+        _mm256_load_si256
+#endif
+        (&in[i * 4 + 2]),
+#if defined(__MINGW32__) || defined(__MINGW64__)
+        _mm256_loadu_si256
+#else
+        _mm256_load_si256
+#endif
+        (&in[i * 4 + 3])), kWeightScaleBits);
+#if defined(__MINGW32__) || defined(__MINGW64__)
+      _mm256_storeu_si256
+#else
+      _mm256_store_si256
+#endif
+        (&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
+          _mm256_packs_epi16(words0, words1), kZero), kOffsets));
+    }
+    constexpr IndexType kStart = kNumChunks * kSimdWidth;
+#elif defined(USE_SSSE3)
+    constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
+    const __m128i kZero = _mm_setzero_si128();
+#ifndef USE_SSE41
+    const __m128i k0x80s = _mm_set1_epi8(-128);
+#endif
+    const auto in = reinterpret_cast<const __m128i*>(input);
+    const auto out = reinterpret_cast<__m128i*>(output);
+    for (IndexType i = 0; i < kNumChunks; ++i) {
+      const __m128i words0 = _mm_srai_epi16(_mm_packs_epi32(
+          _mm_load_si128(&in[i * 4 + 0]),
+          _mm_load_si128(&in[i * 4 + 1])), kWeightScaleBits);
+      const __m128i words1 = _mm_srai_epi16(_mm_packs_epi32(
+          _mm_load_si128(&in[i * 4 + 2]),
+          _mm_load_si128(&in[i * 4 + 3])), kWeightScaleBits);
+      const __m128i packedbytes = _mm_packs_epi16(words0, words1);
+      _mm_store_si128(&out[i], 
+#ifdef USE_SSE41
+        _mm_max_epi8(packedbytes, kZero)
+#else
+        _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)  // saturating add then subtract of -128 gives max(x, 0)
+#endif
+      );
+    }
+    constexpr IndexType kStart = kNumChunks * kSimdWidth;
+#elif defined(IS_ARM)
+    constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2);
+    const int8x8_t kZero = {0};
+    const auto in = reinterpret_cast<const int32x4_t*>(input);
+    const auto out = reinterpret_cast<int8x8_t*>(output);
+    for (IndexType i = 0; i < kNumChunks; ++i) {
+      int16x8_t shifted;
+      const auto pack = reinterpret_cast<int16x4_t*>(&shifted);  // fill the two halves of `shifted` separately
+      pack[0] = vqshrn_n_s32(in[i * 2 + 0], kWeightScaleBits);
+      pack[1] = vqshrn_n_s32(in[i * 2 + 1], kWeightScaleBits);
+      out[i] = vmax_s8(vqmovn_s16(shifted), kZero);
+    }
+    constexpr IndexType kStart = kNumChunks * (kSimdWidth / 2);
+#else
+    constexpr IndexType kStart = 0;  // no SIMD path: the scalar loop handles every element
+#endif
+    for (IndexType i = kStart; i < kInputDimensions; ++i) {
+      output[i] = static_cast<OutputType>(
+          std::max(0, std::min(127, input[i] >> kWeightScaleBits)));
+    }
+    return output;
+  }
+
+ private:
+  // Give the learning (Trainer) class access to the private members
+  friend class Trainer<ClippedReLU>;
+
+  // The layer immediately before this layer
+  PreviousLayer previous_layer_;
+};
+
+}  // namespace Layers
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
@@ -0,0 +1,74 @@
+// NNUE evaluation function layer InputSlice definition
+
+#ifndef _NNUE_LAYERS_INPUT_SLICE_H_
+#define _NNUE_LAYERS_INPUT_SLICE_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../nnue_common.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Layers {
+
+// Input layer: a fixed window into the transformed feature vector, beginning
+// Offset elements in and OutputDimensions elements long
+template <IndexType OutputDimensions, IndexType Offset = 0>
+class InputSlice {
+ public:
+  // The window has to begin on a kMaxSimdWidth boundary to maintain alignment
+  static_assert(Offset % kMaxSimdWidth == 0, "");
+
+  // Element type handed to the next layer
+  using OutputType = TransformedFeatureType;
+
+  // Number of elements in the window
+  static constexpr IndexType kOutputDimensions = OutputDimensions;
+
+  // This layer writes nothing, so it needs no forward propagation buffer
+  static constexpr std::size_t kBufferSize = 0;
+
+  // Hash value embedded in the evaluation function file
+  static constexpr std::uint32_t GetHashValue() {
+    return 0xEC42E90Du ^ (kOutputDimensions ^ (Offset << 10));
+  }
+
+  // Describes this layer as "InputSlice[<size>(<first>:<last>)]"
+  static std::string GetStructureString() {
+    const std::string size_text = std::to_string(kOutputDimensions);
+    const std::string first_text = std::to_string(Offset);
+    const std::string last_text = std::to_string(Offset + kOutputDimensions);
+    return "InputSlice[" + size_text + "(" + first_text + ":" + last_text + ")]";
+  }
+
+  // There are no parameters to read, so this always succeeds
+  bool ReadParameters(std::istream& /*stream*/) {
+    return true;
+  }
+
+  // There are no parameters to write, so this always succeeds
+  bool WriteParameters(std::ostream& /*stream*/) const {
+    return true;
+  }
+
+  // Forward propagation: returns a pointer to the first element of the
+  // window; nothing is copied and the buffer argument is not used
+  const OutputType* Propagate(
+      const TransformedFeatureType* transformed_features,
+      char* /*buffer*/) const {
+    const OutputType* const window_begin = &transformed_features[Offset];
+    return window_begin;
+  }
+};
+
+}  // namespace Layers
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
@@ -0,0 +1,163 @@
+// Definition of layer Sum of NNUE evaluation function
+
+#ifndef _NNUE_LAYERS_SUM_H_
+#define _NNUE_LAYERS_SUM_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../nnue_common.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Layers {
+
+// Layer that sums, element by element, the outputs of several layers
+template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
+class Sum : public Sum<RemainingPreviousLayers...> {
+ private:
+  using Head = FirstPreviousLayer;
+  using Tail = Sum<RemainingPreviousLayers...>;
+
+ public:
+  // Input/output type; every summand must produce the same element type
+  using InputType = typename Head::OutputType;
+  using OutputType = InputType;
+  static_assert(std::is_same<InputType, typename Tail::InputType>::value, "");
+
+  // Number of input/output dimensions; every summand must have the same size
+  static constexpr IndexType kInputDimensions = Head::kOutputDimensions;
+  static constexpr IndexType kOutputDimensions = kInputDimensions;
+  static_assert(kInputDimensions == Tail::kInputDimensions, "");
+
+  // Size of forward propagation buffer used in this layer
+  static constexpr std::size_t kSelfBufferSize =
+      CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+
+  // Size of the forward propagation buffer used from the input layer to this layer
+  static constexpr std::size_t kBufferSize =
+      std::max(Head::kBufferSize + kSelfBufferSize, Tail::kBufferSize);
+
+  // Hash value embedded in the evaluation function file
+  static constexpr std::uint32_t GetHashValue() {
+    std::uint32_t hash_value = 0xBCE400B4u;
+    hash_value ^= Head::GetHashValue() >> 1;
+    hash_value ^= Head::GetHashValue() << 31;
+    hash_value ^= Tail::GetHashValue() >> 2;
+    hash_value ^= Tail::GetHashValue() << 30;
+    return hash_value;
+  }
+
+  // A string that represents the structure from the input layer to this layer
+  static std::string GetStructureString() {
+    return "Sum[" +
+        std::to_string(kOutputDimensions) + "](" + GetSummandsString() + ")";
+  }
+
+  // Reads the parameters of the remaining layers first, then of the first layer
+  bool ReadParameters(std::istream& stream) {
+    return Tail::ReadParameters(stream) && previous_layer_.ReadParameters(stream);
+  }
+
+  // Writes the parameters in the same order ReadParameters reads them
+  bool WriteParameters(std::ostream& stream) const {
+    return Tail::WriteParameters(stream) && previous_layer_.WriteParameters(stream);
+  }
+
+  // Forward propagation: the sum is written at the start of `buffer`. The tail's
+  // result is read through the pointer it returns, because a layer such as
+  // InputSlice returns a pointer into its input and never writes to `buffer`.
+  const OutputType* Propagate(
+      const TransformedFeatureType* transformed_features, char* buffer) const {
+    const auto tail_output = Tail::Propagate(transformed_features, buffer);
+    const auto head_output = previous_layer_.Propagate(
+        transformed_features, buffer + kSelfBufferSize);
+    const auto output = reinterpret_cast<OutputType*>(buffer);
+    for (IndexType i = 0; i < kOutputDimensions; ++i) {
+      output[i] = static_cast<OutputType>(tail_output[i] + head_output[i]);
+    }
+    return output;
+  }
+
+ protected:
+  // A string that represents the list of layers to be summed
+  static std::string GetSummandsString() {
+    return Head::GetStructureString() + "," + Tail::GetSummandsString();
+  }
+
+  // Give the learning (Trainer) class access to the private members
+  friend class Trainer<Sum>;
+
+  // The first of the layers being summed
+  FirstPreviousLayer previous_layer_;
+};
+
+// Terminal case of Sum (exactly one template argument): with a single summand
+// every operation simply forwards to that layer
+template <typename PreviousLayer>
+class Sum<PreviousLayer> {
+ public:
+  // Input/output type
+  using InputType = typename PreviousLayer::OutputType;
+  using OutputType = InputType;
+
+  // Number of input/output dimensions
+  static constexpr IndexType kInputDimensions =
+      PreviousLayer::kOutputDimensions;
+  static constexpr IndexType kOutputDimensions = kInputDimensions;
+
+  // Size of the forward propagation buffer used from the input layer to this layer
+  static constexpr std::size_t kBufferSize = PreviousLayer::kBufferSize;
+
+  // Hash value embedded in the evaluation function file
+  static constexpr std::uint32_t GetHashValue() {
+    return 0xBCE400B4u ^ (PreviousLayer::GetHashValue() >> 1) ^
+           (PreviousLayer::GetHashValue() << 31);
+  }
+
+  // A string that represents the structure from the input layer to this layer
+  static std::string GetStructureString() {
+    std::string description = "Sum[" + std::to_string(kOutputDimensions);
+    description += "](" + GetSummandsString() + ")";
+    return description;
+  }
+
+  // Reads the parameters of the wrapped layer
+  bool ReadParameters(std::istream& stream) {
+    return previous_layer_.ReadParameters(stream);
+  }
+
+  // Writes the parameters of the wrapped layer
+  bool WriteParameters(std::ostream& stream) const {
+    return previous_layer_.WriteParameters(stream);
+  }
+
+  // Forward propagation: the result is whatever the wrapped layer produces
+  const OutputType* Propagate(
+      const TransformedFeatureType* transformed_features, char* buffer) const {
+    return previous_layer_.Propagate(transformed_features, buffer);
+  }
+
+ protected:
+  // A string that represents the list of layers to be summed (just one here)
+  static std::string GetSummandsString() {
+    return PreviousLayer::GetStructureString();
+  }
+
+  // Give the learning (Trainer) class access to the private members
+  friend class Trainer<Sum>;
+
+  // The only layer being summed
+  PreviousLayer previous_layer_;
+};
+
+}  // namespace Layers
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
@@ -0,0 +1,30 @@
+// Class for difference calculation of NNUE evaluation function
+
+#ifndef _NNUE_ACCUMULATOR_H_
+#define _NNUE_ACCUMULATOR_H_
+
+#if defined(EVAL_NNUE)
+
+#include "nnue_architecture.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+// Holds the result of the affine transformation of the input features, kept for difference calculation,
+// together with the evaluation value that is the final output
+struct alignas(32) Accumulator {
+  std::int16_t
+      accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];  // NOTE(review): first index presumably the perspective - confirm
+  Value score = VALUE_ZERO;
+  bool computed_accumulation = false;  // presumably set once `accumulation` is up to date - verify against users
+  bool computed_score = false;  // presumably set once `score` is up to date - verify against users
+};
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
@@ -0,0 +1,33 @@
+// Input features and network structure used in NNUE evaluation function
+
+#ifndef _NNUE_ARCHITECTURE_H_
+#define _NNUE_ARCHITECTURE_H_
+
+#if defined(EVAL_NNUE)
+
+// include a header that defines the input features and network structure
+//#include "architectures/k-p_256x2-32-32.h"
+//#include "architectures/k-p-cr_256x2-32-32.h"
+//#include "architectures/k-p-cr-ep_256x2-32-32.h"
+#include "architectures/halfkp_256x2-32-32.h"
+//#include "architectures/halfkp-cr-ep_256x2-32-32.h"
+//#include "architectures/halfkp_384x2-32-32.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+static_assert(kTransformedFeatureDimensions % kMaxSimdWidth == 0, "");  // transformer width must be a multiple of kMaxSimdWidth
+static_assert(Network::kOutputDimensions == 1, "");  // the network must end in a single output value
+static_assert(std::is_same<Network::OutputType, std::int32_t>::value, "");  // ... and that value must be a 32-bit integer
+
+// List of events that trigger a full calculation instead of a difference calculation, taken from the feature set
+constexpr auto kRefreshTriggers = RawFeatures::kRefreshTriggers;
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
@@ -0,0 +1,64 @@
+// Constants used in NNUE evaluation function
+
+#ifndef _NNUE_COMMON_H_
+#define _NNUE_COMMON_H_
+
+#if defined(EVAL_NNUE)
+
+#if defined(USE_AVX2)
+#include <immintrin.h>
+#elif defined(USE_SSE41)
+#include <smmintrin.h>
+#elif defined(USE_SSSE3)
+#include <tmmintrin.h>
+#elif defined(USE_SSE2)
+#include <emmintrin.h>
+#endif
+
+namespace Eval {
+
+namespace NNUE {
+
+// Version of the evaluation function file format
+constexpr std::uint32_t kVersion = 0x7AF32F16u;
+
+// Constants used in evaluation value calculation
+constexpr int FV_SCALE = 16;
+constexpr int kWeightScaleBits = 6;  // ClippedReLU shifts its 32-bit input right by this many bits
+
+// Size of a cache line (in bytes); the layers round their buffer sizes and align their parameters to it
+constexpr std::size_t kCacheLineSize = 64;
+
+// SIMD width (in bytes); kSimdWidth is left undefined when none of the macros below is set
+#if defined(USE_AVX2)
+constexpr std::size_t kSimdWidth = 32;
+#elif defined(USE_SSE2)
+constexpr std::size_t kSimdWidth = 16;
+#elif defined(IS_ARM)
+constexpr std::size_t kSimdWidth = 16;
+#endif
+constexpr std::size_t kMaxSimdWidth = 32;  // padding granularity, the same for every instruction set
+
+// Element type of the input features after conversion
+using TransformedFeatureType = std::uint8_t;
+
+// Type used for dimensions and indices
+using IndexType = std::uint32_t;
+
+// Forward declaration of the learning class template (the layers declare it a friend)
+template <typename Layer>
+class Trainer;
+
+// Rounds n up to the smallest multiple of base that is not less than n
+template <typename IntType>
+constexpr IntType CeilToMultiple(IntType n, IntType base) {
+  const IntType chunk_count = (n + base - 1) / base;
+  return chunk_count * base;
+}
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
@@ -0,0 +1,357 @@
+// A class that converts the input features of the NNUE evaluation function
+
+#ifndef _NNUE_FEATURE_TRANSFORMER_H_
+#define _NNUE_FEATURE_TRANSFORMER_H_
+
+#if defined(EVAL_NNUE)
+
+#include "nnue_common.h"
+#include "nnue_architecture.h"
+#include "features/index_list.h"
+
+#include <cstring> // std::memset()
+
+namespace Eval {
+
+namespace NNUE {
+
+// Input feature converter
+class FeatureTransformer {
+ private:
+  // Number of output dimensions for one side (one perspective)
+  static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;
+
+ public:
+  // Element type written by Transform()
+  using OutputType = TransformedFeatureType;
+
+  // Number of input dimensions (size of the raw feature set) and output dimensions (both sides concatenated)
+  static constexpr IndexType kInputDimensions = RawFeatures::kDimensions;
+  static constexpr IndexType kOutputDimensions = kHalfDimensions * 2;
+
+  // Size in bytes of a buffer holding kOutputDimensions values of OutputType (the forward propagation buffer)
+  static constexpr std::size_t kBufferSize =
+      kOutputDimensions * sizeof(OutputType);
+
+  // Hash value embedded in the evaluation function file: the output width XOR-ed with the feature set's hash
+  static constexpr std::uint32_t GetHashValue() {
+    return kOutputDimensions ^ RawFeatures::kHashValue;
+  }
+
+  // Describe the structure as "<feature set name>[<input dimensions>-><half dimensions>x2]"
+  static std::string GetStructureString() {
+    std::string description = RawFeatures::GetName();
+    description += "[" + std::to_string(kInputDimensions) + "->";
+    description += std::to_string(kHalfDimensions) + "x2]";
+    return description;
+  }
+
+  // Read the biases, then the weights, as raw bytes from stream; returns false if the stream has failed afterwards
+  bool ReadParameters(std::istream& stream) {
+    constexpr std::size_t kBiasBytes = kHalfDimensions * sizeof(BiasType);
+    constexpr std::size_t kWeightBytes =
+        kHalfDimensions * kInputDimensions * sizeof(WeightType);
+    stream.read(reinterpret_cast<char*>(biases_), kBiasBytes);
+    stream.read(reinterpret_cast<char*>(weights_), kWeightBytes);
+    return !stream.fail();
+  }
+
+  // Write the biases, then the weights, as raw bytes to stream; returns false if the stream has failed afterwards
+  bool WriteParameters(std::ostream& stream) const {
+    constexpr std::size_t kBiasBytes = kHalfDimensions * sizeof(BiasType);
+    constexpr std::size_t kWeightBytes =
+        kHalfDimensions * kInputDimensions * sizeof(WeightType);
+    stream.write(reinterpret_cast<const char*>(biases_), kBiasBytes);
+    stream.write(reinterpret_cast<const char*>(weights_), kWeightBytes);
+    return !stream.fail();
+  }
+
+  // Bring the accumulator of pos up to date without a full refresh when possible; returns whether it is now up to date
+  bool UpdateAccumulatorIfPossible(const Position& pos) const {
+    const auto current = pos.state();
+    if (current->accumulator.computed_accumulation) {
+      return true;  // already computed for this state
+    }
+    const auto parent = current->previous;
+    const bool can_update =
+        parent && parent->accumulator.computed_accumulation;
+    if (can_update) {
+      UpdateAccumulator(pos);
+    }
+    return can_update;
+  }
+
+  // Convert input features: bring the accumulator of pos up to date, then write the per-perspective sums, clamped to [0, 127], to output
+  void Transform(const Position& pos, OutputType* output, bool refresh) const {
+    if (refresh || !UpdateAccumulatorIfPossible(pos)) {  // recompute from scratch on request or when no incremental update is possible
+      RefreshAccumulator(pos);
+    }
+    const auto& accumulation = pos.state()->accumulator.accumulation;
+#if defined(USE_AVX2)
+    constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;  // one chunk = kSimdWidth 8-bit outputs, packed from two vectors of 16-bit sums
+    constexpr int kControl = 0b11011000;  // 64-bit lane order 0,2,1,3: restores element order after the per-128-bit-lane pack
+    const __m256i kZero = _mm256_setzero_si256();
+#elif defined(USE_SSSE3)
+    constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+    const __m128i kZero = _mm_setzero_si128();
+#ifndef USE_SSE41
+    const __m128i k0x80s = _mm_set1_epi8(-128);  // used below to clamp negative bytes to zero without _mm_max_epi8
+#endif
+#elif defined(IS_ARM)
+    constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+    const int8x8_t kZero = {0};
+#endif
+    const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};  // side to move first, then the other side
+    for (IndexType p = 0; p < 2; ++p) {
+      const IndexType offset = kHalfDimensions * p;  // start of this perspective's half of output
+#if defined(USE_AVX2)
+      auto out = reinterpret_cast<__m256i*>(&output[offset]);
+      for (IndexType j = 0; j < kNumChunks; ++j) {
+        __m256i sum0 =
+#if defined(__MINGW32__) || defined(__MINGW64__)
+          // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
+          //       compiled with g++ in MSYS2 crashes here because the output memory is not aligned
+          //       even though alignas is specified.
+          _mm256_loadu_si256
+#else
+          _mm256_load_si256
+#endif
+          (&reinterpret_cast<const __m256i*>(
+            accumulation[perspectives[p]][0])[j * 2 + 0]);
+        __m256i sum1 =
+#if defined(__MINGW32__) || defined(__MINGW64__)
+          _mm256_loadu_si256
+#else
+          _mm256_load_si256
+#endif
+          (&reinterpret_cast<const __m256i*>(
+            accumulation[perspectives[p]][0])[j * 2 + 1]);
+        for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {  // add the accumulations of the remaining refresh triggers
+          sum0 = _mm256_add_epi16(sum0, reinterpret_cast<const __m256i*>(
+              accumulation[perspectives[p]][i])[j * 2 + 0]);
+          sum1 = _mm256_add_epi16(sum1, reinterpret_cast<const __m256i*>(
+              accumulation[perspectives[p]][i])[j * 2 + 1]);
+        }
+#if defined(__MINGW32__) || defined(__MINGW64__)
+        _mm256_storeu_si256
+#else
+        _mm256_store_si256
+#endif
+        (&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
+            _mm256_packs_epi16(sum0, sum1), kZero), kControl));  // saturating pack to 8 bits, clamp below at zero, fix the lane order
+      }
+#elif defined(USE_SSSE3)
+      auto out = reinterpret_cast<__m128i*>(&output[offset]);
+      for (IndexType j = 0; j < kNumChunks; ++j) {
+        __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
+            accumulation[perspectives[p]][0])[j * 2 + 0]);
+        __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
+            accumulation[perspectives[p]][0])[j * 2 + 1]);
+        for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {  // add the accumulations of the remaining refresh triggers
+          sum0 = _mm_add_epi16(sum0, reinterpret_cast<const __m128i*>(
+              accumulation[perspectives[p]][i])[j * 2 + 0]);
+          sum1 = _mm_add_epi16(sum1, reinterpret_cast<const __m128i*>(
+              accumulation[perspectives[p]][i])[j * 2 + 1]);
+        }
+  	const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
+ 
+        _mm_store_si128(&out[j],
+#ifdef USE_SSE41
+          _mm_max_epi8(packedbytes, kZero)
+#else
+          _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)  // saturating add then subtract of -128: same result as max(x, 0)
+#endif
+        );
+      }
+#elif defined(IS_ARM)
+      const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
+      for (IndexType j = 0; j < kNumChunks; ++j) {
+        int16x8_t sum = reinterpret_cast<const int16x8_t*>(
+            accumulation[perspectives[p]][0])[j];
+        for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {  // add the accumulations of the remaining refresh triggers
+          sum = vaddq_s16(sum, reinterpret_cast<const int16x8_t*>(
+              accumulation[perspectives[p]][i])[j]);
+        }
+        out[j] = vmax_s8(vqmovn_s16(sum), kZero);  // saturating narrow to 8 bits, then clamp below at zero
+      }
+#else
+      for (IndexType j = 0; j < kHalfDimensions; ++j) {  // portable fallback: one element at a time
+        BiasType sum = accumulation[static_cast<int>(perspectives[p])][0][j];
+        for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
+          sum += accumulation[static_cast<int>(perspectives[p])][i][j];
+        }
+        output[offset + j] = static_cast<OutputType>(
+            std::max<int>(0, std::min<int>(127, sum)));  // clamp to [0, 127]
+      }
+#endif
+    }
+  }
+
+ private:
+  // Recompute the accumulator of pos from scratch (no difference calculation): for each refresh trigger and perspective, sum the weight columns of all active features
+  void RefreshAccumulator(const Position& pos) const {
+    auto& accumulator = pos.state()->accumulator;  // written through the state pointer even though this method and pos are const
+    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+      Features::IndexList active_indices[2];
+      RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
+                                       active_indices);
+      for (const auto perspective : Colors) {
+        if (i == 0) {  // only the first trigger's slot starts from the biases; the other slots start from zero
+          std::memcpy(accumulator.accumulation[perspective][i], biases_,
+                      kHalfDimensions * sizeof(BiasType));
+        } else {
+          std::memset(accumulator.accumulation[perspective][i], 0,
+                      kHalfDimensions * sizeof(BiasType));
+        }
+        for (const auto index : active_indices[perspective]) {
+          const IndexType offset = kHalfDimensions * index;  // start of this feature's weight column in weights_
+#if defined(USE_AVX2)
+          auto accumulation = reinterpret_cast<__m256i*>(
+              &accumulator.accumulation[perspective][i][0]);
+          auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
+          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);  // kSimdWidth / 2 16-bit values per vector
+          for (IndexType j = 0; j < kNumChunks; ++j) {
+#if defined(__MINGW32__) || defined(__MINGW64__)
+            _mm256_storeu_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadu_si256(&accumulation[j]), column[j]));
+#else
+            accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
+#endif
+          }
+#elif defined(USE_SSE2)
+          auto accumulation = reinterpret_cast<__m128i*>(
+              &accumulator.accumulation[perspective][i][0]);
+          auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
+          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+          for (IndexType j = 0; j < kNumChunks; ++j) {
+            accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
+          }
+#elif defined(IS_ARM)
+          auto accumulation = reinterpret_cast<int16x8_t*>(
+              &accumulator.accumulation[perspective][i][0]);
+          auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
+          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+          for (IndexType j = 0; j < kNumChunks; ++j) {
+            accumulation[j] = vaddq_s16(accumulation[j], column[j]);
+          }
+#else
+          for (IndexType j = 0; j < kHalfDimensions; ++j) {  // portable fallback: one element at a time
+            accumulator.accumulation[perspective][i][j] += weights_[offset + j];
+          }
+#endif
+        }
+      }
+    }
+
+    accumulator.computed_accumulation = true;  // the accumulation is now valid for this state ...
+    accumulator.computed_score = false;  // ... and the score flag is cleared
+  }
+
+  // Calculate cumulative value using difference calculation
+  void UpdateAccumulator(const Position& pos) const {  // incremental update: derive this state's accumulator from the previous state's (which must exist and be computed)
+    const auto& prev_accumulator = pos.state()->previous->accumulator;  // bound by reference: the previous accumulator is only read, so copying it on every update was wasted work
+    auto& accumulator = pos.state()->accumulator;
+    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+      Features::IndexList removed_indices[2], added_indices[2];
+      bool reset[2];
+      RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
+                                        removed_indices, added_indices, reset);
+      for (const auto perspective : Colors) {
+#if defined(USE_AVX2)
+        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+        auto accumulation = reinterpret_cast<__m256i*>(
+            &accumulator.accumulation[perspective][i][0]);
+#elif defined(USE_SSE2)
+        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+        auto accumulation = reinterpret_cast<__m128i*>(
+            &accumulator.accumulation[perspective][i][0]);
+#elif defined(IS_ARM)
+        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+        auto accumulation = reinterpret_cast<int16x8_t*>(
+            &accumulator.accumulation[perspective][i][0]);
+#endif
+        if (reset[perspective]) {  // start over for this perspective: biases for the first trigger, zero for the others
+          if (i == 0) {
+            std::memcpy(accumulator.accumulation[perspective][i], biases_,
+                        kHalfDimensions * sizeof(BiasType));
+          } else {
+            std::memset(accumulator.accumulation[perspective][i], 0,
+                        kHalfDimensions * sizeof(BiasType));
+          }
+        } else {// Start from the previous values and subtract the columns of features that changed from 1 to 0
+          std::memcpy(accumulator.accumulation[perspective][i],
+                      prev_accumulator.accumulation[perspective][i],
+                      kHalfDimensions * sizeof(BiasType));
+          for (const auto index : removed_indices[perspective]) {
+            const IndexType offset = kHalfDimensions * index;
+#if defined(USE_AVX2)
+            auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {  // NOTE(review): aligned access, unlike the MinGW unaligned workaround in RefreshAccumulator - confirm
+              accumulation[j] = _mm256_sub_epi16(accumulation[j], column[j]);
+            }
+#elif defined(USE_SSE2)
+            auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]);
+            }
+#elif defined(IS_ARM)
+            auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = vsubq_s16(accumulation[j], column[j]);
+            }
+#else
+            for (IndexType j = 0; j < kHalfDimensions; ++j) {
+              accumulator.accumulation[perspective][i][j] -=
+                  weights_[offset + j];
+            }
+#endif
+          }
+        }
+        {// Add the columns of features that changed from 0 to 1 (done after a reset as well)
+          for (const auto index : added_indices[perspective]) {
+            const IndexType offset = kHalfDimensions * index;
+#if defined(USE_AVX2)
+            auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
+            }
+#elif defined(USE_SSE2)
+            auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
+            }
+#elif defined(IS_ARM)
+            auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = vaddq_s16(accumulation[j], column[j]);
+            }
+#else
+            for (IndexType j = 0; j < kHalfDimensions; ++j) {
+              accumulator.accumulation[perspective][i][j] +=
+                  weights_[offset + j];
+            }
+#endif
+          }
+        }
+      }
+    }
+
+    accumulator.computed_accumulation = true;
+    accumulator.computed_score = false;
+  }
+
+  // Element types of the parameters (both are 16-bit signed integers)
+  using BiasType = std::int16_t;
+  using WeightType = std::int16_t;
+
+  // Make the learning class a friend so that it can access the parameters below
+  friend class Trainer<FeatureTransformer>;
+
+  // Parameters: one bias per half dimension, and kHalfDimensions consecutive weights per input feature index
+  alignas(kCacheLineSize) BiasType biases_[kHalfDimensions];
+  alignas(kCacheLineSize)
+      WeightType weights_[kHalfDimensions * kInputDimensions];
+};
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
@@ -0,0 +1,201 @@
+// USI extended command for NNUE evaluation function
+
+#if defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
+
+#include "../../thread.h"
+#include "../../uci.h"
+#include "evaluate_nnue.h"
+#include "nnue_test_command.h"
+
+#include <set>
+#include <fstream>
+
+#define ASSERT(X) { if (!(X)) { std::cout << "\nError : ASSERT(" << #X << "), " << __FILE__ << "(" << __LINE__ << "): " << __func__ << std::endl; \
+ std::this_thread::sleep_for(std::chrono::microseconds(3000)); *(int*)1 =0;} }  // on failure: print the expression and location, pause briefly, then crash on purpose by writing to an invalid address
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace {
+
+// Test RawFeatures, mainly the difference calculation: play random games and check after every move that the incrementally updated index sets equal freshly built ones
+void TestFeatures(Position& pos) {
+  const std::uint64_t num_games = 1000;
+  StateInfo si;
+  pos.set(StartFEN, false, &si, Threads.main());
+  const int MAX_PLY = 256; // play at most 256 plies per game
+
+  StateInfo state[MAX_PLY]; // one StateInfo per ply, up to the maximum number of plies
+  int ply; // number of plies played from the initial position
+
+  PRNG prng(20171128);
+
+  std::uint64_t num_moves = 0;
+  std::vector<std::uint64_t> num_updates(kRefreshTriggers.size() + 1);  // per-trigger update counts; the extra last element is the total
+  std::vector<std::uint64_t> num_resets(kRefreshTriggers.size());
+  constexpr IndexType kUnknown = -1;  // wraps to the largest IndexType value; marks a feature index that has not been observed yet
+  std::vector<IndexType> trigger_map(RawFeatures::kDimensions, kUnknown);  // feature index -> trigger it was observed under
+  auto make_index_sets = [&](const Position& pos) {  // build the active index sets of pos from scratch, per trigger and perspective
+    std::vector<std::vector<std::set<IndexType>>> index_sets(
+        kRefreshTriggers.size(), std::vector<std::set<IndexType>>(2));
+    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+      Features::IndexList active_indices[2];
+      RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
+                                       active_indices);
+      for (const auto perspective : Colors) {
+        for (const auto index : active_indices[perspective]) {
+          ASSERT(index < RawFeatures::kDimensions);
+          ASSERT(index_sets[i][perspective].count(index) == 0);  // no index may be reported twice
+          ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);  // an index must always belong to the same trigger
+          index_sets[i][perspective].insert(index);
+          trigger_map[index] = i;
+        }
+      }
+    }
+    return index_sets;
+  };
+  auto update_index_sets = [&](const Position& pos, auto* index_sets) {  // apply the changes reported for the last move to index_sets
+    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+      Features::IndexList removed_indices[2], added_indices[2];
+      bool reset[2];
+      RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
+                                        removed_indices, added_indices, reset);
+      for (const auto perspective : Colors) {
+        if (reset[perspective]) {  // on a reset the set is emptied and then rebuilt from added_indices below
+          (*index_sets)[i][perspective].clear();
+          ++num_resets[i];
+        } else {
+          for (const auto index : removed_indices[perspective]) {
+            ASSERT(index < RawFeatures::kDimensions);
+            ASSERT((*index_sets)[i][perspective].count(index) == 1);  // only an active index can be removed
+            ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
+            (*index_sets)[i][perspective].erase(index);
+            ++num_updates.back();
+            ++num_updates[i];
+            trigger_map[index] = i;
+          }
+        }
+        for (const auto index : added_indices[perspective]) {
+          ASSERT(index < RawFeatures::kDimensions);
+          ASSERT((*index_sets)[i][perspective].count(index) == 0);  // only an inactive index can be added
+          ASSERT(trigger_map[index] == kUnknown || trigger_map[index] == i);
+          (*index_sets)[i][perspective].insert(index);
+          ++num_updates.back();
+          ++num_updates[i];
+          trigger_map[index] = i;
+        }
+      }
+    }
+  };
+
+  std::cout << "feature set: " << RawFeatures::GetName()
+            << "[" << RawFeatures::kDimensions << "]" << std::endl;
+  std::cout << "start testing with random games";
+
+  for (std::uint64_t i = 0; i < num_games; ++i) {
+    auto index_sets = make_index_sets(pos);
+    for (ply = 0; ply < MAX_PLY; ++ply) {
+      MoveList<LEGAL> mg(pos); // Generate all legal moves
+
+      // No legal move is available, so this game is over
+      if (mg.size() == 0)
+        break;
+
+      // Pick one of the generated moves at random and play it.
+      Move m = mg.begin()[prng.rand(mg.size())];
+      pos.do_move(m, state[ply]);
+
+      ++num_moves;
+      update_index_sets(pos, &index_sets);
+      ASSERT(index_sets == make_index_sets(pos));  // incremental result must match a full rebuild
+    }
+
+    pos.set(StartFEN, false, &si, Threads.main());
+
+    // Print '.' once every 100 games (so you can see that it's progressing)
+    if ((i % 100) == 0)
+      std::cout << "." << std::flush;
+  }
+  std::cout << "passed." << std::endl;
+  std::cout << num_games << " games, " << num_moves << " moves, "
+            << num_updates.back() << " updates, "
+            << (1.0 * num_updates.back() / num_moves)
+            << " updates per move" << std::endl;
+  std::size_t num_observed_indices = 0;
+  for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
+    const auto count = std::count(trigger_map.begin(), trigger_map.end(), i);  // number of feature indices observed under trigger i
+    num_observed_indices += count;
+    std::cout << "TriggerEvent(" << static_cast<int>(kRefreshTriggers[i])
+              << "): " << count << " features ("
+              << (100.0 * count / RawFeatures::kDimensions) << "%), "
+              << num_updates[i] << " updates ("
+              << (1.0 * num_updates[i] / num_moves) << " per move), "
+              << num_resets[i] << " resets ("
+              << (100.0 * num_resets[i] / num_moves) << "%)"
+              << std::endl;
+  }
+  std::cout << "observed " << num_observed_indices << " ("
+            << (100.0 * num_observed_indices / RawFeatures::kDimensions)
+            << "% of " << RawFeatures::kDimensions
+            << ") features" << std::endl;
+}
+
+// Print this binary's network architecture, then report for each file name read from stream whether that file's header matches it
+void PrintInfo(std::istream& stream) {
+  std::cout << "network architecture: " << GetArchitectureString() << std::endl;
+
+  for (;;) {
+    std::string path;
+    stream >> path;
+    if (path.empty()) break;  // no more file names
+
+    std::uint32_t file_hash;
+    std::string file_architecture;
+    bool header_ok = false;
+    {
+      // Scoped so that the file is closed again before anything is printed
+      std::ifstream input(path, std::ios::binary);
+      if (input && ReadHeader(input, &file_hash, &file_architecture)) {
+        header_ok = true;
+      }
+    }
+
+    std::cout << path << ": ";
+    if (!header_ok) {
+      std::cout << "failed to read header" << std::endl;
+      continue;
+    }
+    if (file_hash != kHashValue) {
+      // Different network: just show what the file says it contains
+      std::cout << file_architecture << std::endl;
+      continue;
+    }
+    std::cout << "matches with this binary";
+    if (file_architecture != GetArchitectureString()) {
+      std::cout << ", but architecture string differs: " << file_architecture;
+    }
+    std::cout << std::endl;
+  }
+}
+
+}  // namespace
+
+// Entry point of the NNUE test command: reads a sub-command from stream and dispatches it, printing the usage for anything else
+void TestCommand(Position& pos, std::istream& stream) {
+  std::string requested;
+  stream >> requested;
+
+  if (requested == "test_features") {
+    TestFeatures(pos);
+    return;
+  }
+  if (requested == "info") {
+    PrintInfo(stream);
+    return;
+  }
+
+  // Unknown or missing sub-command
+  std::cout << "usage:" << std::endl
+            << " test nnue test_features" << std::endl
+            << " test nnue info [path/to/" << fileName << "...]" << std::endl;
+}
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
@@ -0,0 +1,21 @@
+// USI extended command interface for NNUE evaluation function
+
+#ifndef _NNUE_TEST_COMMAND_H_
+#define _NNUE_TEST_COMMAND_H_
+
+#if defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
+
+namespace Eval {
+
+namespace NNUE {
+
+// USI extended command for NNUE evaluation function
+void TestCommand(Position& pos, std::istream& stream);
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(ENABLE_TEST_CMD) && defined(EVAL_NNUE)
+
+#endif
@@ -0,0 +1,110 @@
+// NNUE evaluation function feature conversion class template
+
+#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_H_
+#define _NNUE_TRAINER_FEATURES_FACTORIZER_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../../nnue_common.h"
+#include "../trainer.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// Class template that maps an input feature to the learning features used for training
+// The primary template is the identity mapping; feature types that factorize further provide a specialization
+template <typename FeatureType>
+class Factorizer {
+ public:
+  // Number of learning-feature dimensions (here: the same as the input feature)
+  static constexpr IndexType GetDimensions() {
+    return FeatureType::kDimensions;
+  }
+
+  // Append the learning feature for base_index: a single entry with that same index
+  static void AppendTrainingFeatures(
+      IndexType base_index, std::vector<TrainingFeature>* training_features) {
+    assert(base_index <FeatureType::kDimensions);
+    training_features->push_back(TrainingFeature(base_index));
+  }
+};
+
+// Learning feature information: one entry describes one kind of learning feature
+struct FeatureProperties {
+  bool active;  // whether this kind of learning feature is in use
+  IndexType dimensions;  // number of indices this kind of learning feature occupies
+};
+
+// Append the original input feature itself as a learning feature and return the number of indices it occupies
+template <typename FeatureType>
+IndexType AppendBaseFeature(
+    FeatureProperties properties, IndexType base_index,
+    std::vector<TrainingFeature>* training_features) {
+  assert(properties.dimensions == FeatureType::kDimensions);
+  assert(base_index < FeatureType::kDimensions);
+  const IndexType occupied = properties.dimensions;
+  training_features->push_back(TrainingFeature(base_index));
+  return occupied;
+}
+
+// When properties is active, append the learning features of FeatureType for base_index, moved up by index_offset,
+// and return the number of indices they occupy; when it is inactive, append nothing and return 0
+template <typename FeatureType>
+IndexType InheritFeaturesIfRequired(
+    IndexType index_offset, FeatureProperties properties, IndexType base_index,
+    std::vector<TrainingFeature>* training_features) {
+  if (properties.active) {
+    assert(properties.dimensions == Factorizer<FeatureType>::GetDimensions());
+    assert(base_index < FeatureType::kDimensions);
+    const auto first_new = training_features->size();
+    Factorizer<FeatureType>::AppendTrainingFeatures(
+        base_index, training_features);
+    // Only the entries appended just now are shifted
+    for (auto it = training_features->begin() + first_new;
+         it != training_features->end(); ++it) {
+      auto& feature = *it;
+      assert(feature.GetIndex() < Factorizer<FeatureType>::GetDimensions());
+      feature.ShiftIndex(index_offset);
+    }
+    return properties.dimensions;
+  }
+  return 0;
+}
+
+// Return the number of indices to skip for a learning feature without appending anything: properties.dimensions when active, otherwise 0
+// Call instead of InheritFeaturesIfRequired() if there are no corresponding features; inline because this non-template function is defined in a header
+inline IndexType SkipFeatures(FeatureProperties properties) {
+  if (!properties.active) {
+    return 0;
+  }
+  return properties.dimensions;
+}
+
+// Total number of learning-feature dimensions: the first entry always counts, later entries only when active
+template <std::size_t N>
+constexpr IndexType GetActiveDimensions(
+    const FeatureProperties (&properties)[N]) {
+  static_assert(N > 0, "");
+  IndexType total = properties[0].dimensions;
+  for (std::size_t i = 1; i != N; ++i) {
+    total += properties[i].active ? properties[i].dimensions : 0;
+  }
+  return total;
+}
+
+// Number of elements of a built-in array, usable in constant expressions
+template <typename T, std::size_t N>
+constexpr std::size_t GetArrayLength(const T (&array)[N]) {
+  return sizeof(array) / sizeof(array[0]);
+}
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
@@ -0,0 +1,104 @@
+// Specialization for feature set of feature conversion class template of NNUE evaluation function
+
+#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
+#define _NNUE_TRAINER_FEATURES_FACTORIZER_FEATURE_SET_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../../features/feature_set.h"
+#include "factorizer.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// Class template that converts input features into learning features
+// Specialization for a FeatureSet of several feature types: handled recursively as the first type (Head) plus the set of the remaining types (Tail)
+template <typename FirstFeatureType, typename... RemainingFeatureTypes>
+class Factorizer<FeatureSet<FirstFeatureType, RemainingFeatureTypes...>> {
+ private:
+  using Head = Factorizer<FeatureSet<FirstFeatureType>>;
+  using Tail = Factorizer<FeatureSet<RemainingFeatureTypes...>>;
+
+ public:
+  // number of dimensions of original input features
+  static constexpr IndexType kBaseDimensions =
+      FeatureSet<FirstFeatureType, RemainingFeatureTypes...>::kDimensions;
+
+  // Get the dimensionality of the learning feature (sum over Head and Tail)
+  static constexpr IndexType GetDimensions() {
+    return Head::GetDimensions() + Tail::GetDimensions();
+  }
+
+  // Append the learning features for base_index; base_dimensions is passed down unchanged through the recursion
+  static void AppendTrainingFeatures(
+      IndexType base_index, std::vector<TrainingFeature>* training_features,
+      IndexType base_dimensions = kBaseDimensions) {
+    assert(base_index < kBaseDimensions);
+    constexpr auto boundary = FeatureSet<RemainingFeatureTypes...>::kDimensions;  // base indices below this belong to the remaining feature types
+    if (base_index < boundary) {
+      Tail::AppendTrainingFeatures(
+          base_index, training_features, base_dimensions);
+    } else {
+      const auto start = training_features->size();  // only the entries appended by Head below are adjusted
+      Head::AppendTrainingFeatures(
+          base_index - boundary, training_features, base_dimensions);
+      for (auto i = start; i < training_features->size(); ++i) {
+        auto& feature = (*training_features)[i];
+        const auto index = feature.GetIndex();
+        assert(index < Head::GetDimensions() ||
+                   (index >= base_dimensions &&
+                    index < base_dimensions +
+                            Head::GetDimensions() - Head::kBaseDimensions));
+        if (index < Head::kBaseDimensions) {  // index inside Head's original feature range: move it past Tail's original features
+          feature.ShiftIndex(Tail::kBaseDimensions);
+        } else {  // any other index from Head: move it past Tail's additional (non-base) learning features
+          feature.ShiftIndex(Tail::GetDimensions() - Tail::kBaseDimensions);
+        }
+      }
+    }
+  }
+};
+
+// Class template that converts input features into learning features
+// Specialization for a FeatureSet holding exactly one feature type: forwards to that type's Factorizer
+template <typename FeatureType>
+class Factorizer<FeatureSet<FeatureType>> {
+public:
+  // Number of dimensions of the original input feature
+  static constexpr IndexType kBaseDimensions = FeatureType::kDimensions;
+
+  // Number of learning-feature dimensions
+  static constexpr IndexType GetDimensions() {
+    return Factorizer<FeatureType>::GetDimensions();
+  }
+
+  // Append the learning features for base_index; entries at or beyond kBaseDimensions are moved up
+  // by (base_dimensions - kBaseDimensions), entries below it are left as they are
+  static void AppendTrainingFeatures(
+      IndexType base_index, std::vector<TrainingFeature>* training_features,
+      IndexType base_dimensions = kBaseDimensions) {
+    assert(base_index < kBaseDimensions);
+    const auto first_new = training_features->size();
+    Factorizer<FeatureType>::AppendTrainingFeatures(
+        base_index, training_features);
+    for (auto it = training_features->begin() + first_new;
+         it != training_features->end(); ++it) {
+      auto& feature = *it;
+      assert(feature.GetIndex() < Factorizer<FeatureType>::GetDimensions());
+      if (feature.GetIndex() < kBaseDimensions) {
+        continue;  // original input feature: index stays unchanged
+      }
+      feature.ShiftIndex(base_dimensions - kBaseDimensions);
+    }
+  }
+};
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
@@ -0,0 +1,103 @@
+// Specialization of NNUE evaluation function feature conversion class template for HalfKP
+
+#ifndef _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
+#define _NNUE_TRAINER_FEATURES_FACTORIZER_HALF_KP_H_
+
+#if defined(EVAL_NNUE)
+
+#include "../../features/half_kp.h"
+#include "../../features/p.h"
+#include "../../features/half_relative_kp.h"
+#include "factorizer.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+namespace Features {
+
+// Class template that converts input features into learning features
+// Specialization for HalfKP: besides the HalfKP feature itself it emits HalfK, P and HalfRelativeKP learning features
+template <Side AssociatedKing>
+class Factorizer<HalfKP<AssociatedKing>> {
+ private:
+  using FeatureType = HalfKP<AssociatedKing>;
+
+  // Maximum number of feature indices whose value is 1 at the same time
+  static constexpr IndexType kMaxActiveDimensions =
+      FeatureType::kMaxActiveDimensions;
+
+  // Type of learning feature; the enumerators index kProperties below
+  enum TrainingFeatureType {
+    kFeaturesHalfKP,
+    kFeaturesHalfK,
+    kFeaturesP,
+    kFeaturesHalfRelativeKP,
+    kNumTrainingFeatureTypes,
+  };
+
+  // Learning feature information: {active, dimensions} for each TrainingFeatureType, in enum order
+  static constexpr FeatureProperties kProperties[] = {
+    // kFeaturesHalfKP
+    {true, FeatureType::kDimensions},
+    // kFeaturesHalfK
+    {true, SQUARE_NB},
+    // kFeaturesP
+    {true, Factorizer<P>::GetDimensions()},
+    // kFeaturesHalfRelativeKP
+    {true, Factorizer<HalfRelativeKP<AssociatedKing>>::GetDimensions()},
+  };
+  static_assert(GetArrayLength(kProperties) == kNumTrainingFeatureTypes, "");
+
+ public:
+  // Get the dimensionality of the learning feature
+  static constexpr IndexType GetDimensions() {
+    return GetActiveDimensions(kProperties);
+  }
+
+  // Append all learning features for base_index; index_offset tracks where the next kind of learning feature starts
+  static void AppendTrainingFeatures(
+      IndexType base_index, std::vector<TrainingFeature>* training_features) {
+    // kFeaturesHalfKP
+    IndexType index_offset = AppendBaseFeature<FeatureType>(
+        kProperties[kFeaturesHalfKP], base_index, training_features);
+
+    const auto sq_k = static_cast<Square>(base_index / fe_end);  // base_index is split as sq_k * fe_end + p
+    const auto p = static_cast<BonaPiece>(base_index % fe_end);
+    // kFeaturesHalfK
+    {
+      const auto& properties = kProperties[kFeaturesHalfK];
+      if (properties.active) {
+        training_features->emplace_back(index_offset + sq_k);
+        index_offset += properties.dimensions;
+      }
+    }
+    // kFeaturesP
+    index_offset += InheritFeaturesIfRequired<P>(
+        index_offset, kProperties[kFeaturesP], p, training_features);
+    // kFeaturesHalfRelativeKP
+    if (p >= fe_hand_end) {  // NOTE(review): fe_hand_end is defined outside this file; presumably excludes non-board pieces - confirm
+      index_offset += InheritFeaturesIfRequired<HalfRelativeKP<AssociatedKing>>(
+          index_offset, kProperties[kFeaturesHalfRelativeKP],
+          HalfRelativeKP<AssociatedKing>::MakeIndex(sq_k, p),
+          training_features);
+    } else {
+      index_offset += SkipFeatures(kProperties[kFeaturesHalfRelativeKP]);
+    }
+
+    assert(index_offset == GetDimensions());  // every kind of learning feature must have been accounted for
+  }
+};
+
+template <Side AssociatedKing>
+constexpr FeatureProperties Factorizer<HalfKP<AssociatedKing>>::kProperties[];
+
+}  // namespace Features
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_NNUE)
+
+#endif
@@ -0,0 +1,125 @@
+// Common header of class template for learning NNUE evaluation function
+
+#ifndef _NNUE_TRAINER_H_
+#define _NNUE_TRAINER_H_
+
+#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#include "../nnue_common.h"
+#include "../features/index_list.h"
+
+#include <sstream>
+#if defined(USE_BLAS)
+static_assert(std::is_same<LearnFloatType, float>::value, "");
+#include <cblas.h>
+#endif
+
+namespace Eval {
+
+namespace NNUE {
+
+// Ponanza constant: the scaling constant used in the relation between an
+// evaluation value and a winning percentage.
+constexpr double kPonanzaConstant = 600.0;
+
+// One training feature: a feature index together with an occurrence count,
+// packed into a single unsigned word. The index lives in the upper
+// kIndexBits bits and the count in the lower kCountBits bits.
+class TrainingFeature {
+  using StorageType = std::uint32_t;
+  static_assert(std::is_unsigned<StorageType>::value, "");
+
+ public:
+  static constexpr std::uint32_t kIndexBits = 24;
+  static_assert(kIndexBits < std::numeric_limits<StorageType>::digits, "");
+  static constexpr std::uint32_t kCountBits =
+      std::numeric_limits<StorageType>::digits - kIndexBits;
+
+  // Builds a feature with the given index and a count of one.
+  explicit TrainingFeature(IndexType index) :
+      packed_((index << kCountBits) | 1) {
+    assert(index < (1 << kIndexBits));
+  }
+
+  // Adds the count of another feature that has the same index.
+  TrainingFeature& operator+=(const TrainingFeature& other) {
+    assert(other.GetIndex() == GetIndex());
+    assert(other.GetCount() + GetCount() < (1 << kCountBits));
+    packed_ += other.GetCount();
+    return *this;
+  }
+
+  // Returns the feature index (upper bits of the packed word).
+  IndexType GetIndex() const {
+    const StorageType index_part = packed_ >> kCountBits;
+    return static_cast<IndexType>(index_part);
+  }
+
+  // Moves the index up by offset, leaving the count untouched.
+  void ShiftIndex(IndexType offset) {
+    assert(GetIndex() + offset < (1 << kIndexBits));
+    const auto shifted_offset = offset << kCountBits;
+    packed_ += shifted_offset;
+  }
+
+  // Returns the occurrence count (lower bits of the packed word).
+  IndexType GetCount() const {
+    const StorageType count_mask = (1 << kCountBits) - 1;
+    return static_cast<IndexType>(packed_ & count_mask);
+  }
+
+  // Orders features by the packed word, i.e. by index first, then by count.
+  bool operator<(const TrainingFeature& other) const {
+    return packed_ < other.packed_;
+  }
+
+ private:
+  // Index in the upper kIndexBits bits, count in the lower kCountBits bits.
+  StorageType packed_;
+};
+
+// Structure that represents one sample of training data
+struct Example {
+  // Two feature lists per sample; the feature transformer trainer maps list c
+  // to the c-th half of its output.
+  // NOTE(review): presumably one list per perspective -- confirm against the
+  // code that fills in the examples.
+  std::vector<TrainingFeature> training_features[2];
+  // Packed training record this example was built from.
+  Learner::PackedSfenValue psv;
+  int sign;
+  double weight;
+};
+
+// Message used for setting hyperparameters.
+// `name` identifies the option and `value` carries its argument. A name of
+// the form "option[N]" is accepted only by the N-th ReceiveMessage() call
+// that listens for "option" (counted from zero).
+struct Message {
+  Message(const std::string& name, const std::string& value = ""):
+      name(name), value(value), num_peekers(0), num_receivers(0) {}
+  const std::string name;
+  const std::string value;
+  // Number of ReceiveMessage() calls so far whose option name matched the
+  // "option[" prefix of this message.
+  std::uint32_t num_peekers;
+  // Number of ReceiveMessage() calls that accepted this message.
+  std::uint32_t num_receivers;
+};
+
+// Determines whether the caller listening for option `name` should accept
+// the message, updating the message's counters.
+//
+// The message is accepted when its name equals `name`, or equals
+// `name` + "[N]" where N is the value of num_peekers before this call.
+// Returns true when the message is accepted.
+//
+// Declared inline: this function is defined in a header, so without inline
+// it would be defined once per including translation unit and fail to link.
+inline bool ReceiveMessage(const std::string& name, Message* message) {
+  const std::string indexed_prefix = name + "[";
+  // Build the subscripted name before num_peekers is advanced below.
+  const std::string indexed_name =
+      indexed_prefix + std::to_string(message->num_peekers) + "]";
+  if (message->name.compare(0, indexed_prefix.size(), indexed_prefix) == 0) {
+    ++message->num_peekers;
+  }
+  if (message->name == name || message->name == indexed_name) {
+    ++message->num_receivers;
+    return true;
+  }
+  return false;
+}
+
+// Splits `input` into the fields separated by `delimiter`.
+// Empty fields between consecutive delimiters are kept; an empty input and
+// a trailing delimiter produce no extra (empty) field.
+//
+// Declared inline: this function is defined in a header, so without inline
+// it would be defined once per including translation unit and fail to link.
+inline std::vector<std::string> Split(const std::string& input,
+                                      char delimiter) {
+  std::istringstream stream(input);
+  std::string field;
+  std::vector<std::string> fields;
+  while (std::getline(stream, field, delimiter)) {
+    fields.push_back(field);
+  }
+  return fields;
+}
+
+// Rounds a floating point number to the nearest integer and converts it to
+// IntType. Exact halves are rounded toward positive infinity.
+template <typename IntType>
+IntType Round(double value) {
+  const double shifted = value + 0.5;
+  return static_cast<IntType>(std::floor(shifted));
+}
+
+// make_shared with alignment.
+// Allocates storage for a T with aligned_malloc(), constructs the object in
+// it from the forwarded arguments, and returns a shared_ptr that releases it
+// through AlignedDeleter<T>.
+// Throws std::bad_alloc when the allocation fails, instead of constructing
+// an object at a null address.
+// NOTE(review): if T's constructor throws, the raw storage is not released
+// here; the matching free routine is not visible from this header.
+template <typename T, typename... ArgumentTypes>
+std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments) {
+  void* const storage = aligned_malloc(sizeof(T), alignof(T));
+  if (storage == nullptr) {
+    throw std::bad_alloc();
+  }
+  const auto ptr = new(storage) T(std::forward<ArgumentTypes>(arguments)...);
+  return std::shared_ptr<T>(ptr, AlignedDeleter<T>());
+}
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#endif
@@ -0,0 +1,301 @@
+// Specialization of NNUE evaluation function learning class template for AffineTransform
+
+#ifndef _NNUE_TRAINER_AFFINE_TRANSFORM_H_
+#define _NNUE_TRAINER_AFFINE_TRANSFORM_H_
+
+#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#include "../../../learn/learn.h"
+#include "../layers/affine_transform.h"
+#include "trainer.h"
+
+#include <random>
+
+namespace Eval {
+
+namespace NNUE {
+
+// Learning: Affine transformation layer
+// Keeps floating-point copies of the layer's biases and weights, updates them
+// from mini-batches, and writes quantized values back to the target layer.
+template <typename PreviousLayer, IndexType OutputDimensions>
+class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
+ private:
+  // Type of layer to learn
+  using LayerType = Layers::AffineTransform<PreviousLayer, OutputDimensions>;
+
+ public:
+  // factory function
+  // Creates the trainer for target_layer; the constructor in turn creates the
+  // trainer of the previous layer.
+  static std::shared_ptr<Trainer> Create(
+      LayerType* target_layer, FeatureTransformer* feature_transformer) {
+    return std::shared_ptr<Trainer>(
+        new Trainer(target_layer, feature_transformer));
+  }
+
+  // Set options such as hyperparameters
+  // The message is first passed on to the previous layer's trainer, then
+  // checked against the options this trainer understands.
+  void SendMessage(Message* message) {
+    previous_layer_trainer_->SendMessage(message);
+    if (ReceiveMessage("momentum", message)) {
+      momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
+    }
+    if (ReceiveMessage("learning_rate_scale", message)) {
+      learning_rate_scale_ =
+          static_cast<LearnFloatType>(std::stod(message->value));
+    }
+    if (ReceiveMessage("reset", message)) {
+      DequantizeParameters();
+    }
+    if (ReceiveMessage("quantize_parameters", message)) {
+      QuantizeParameters();
+    }
+  }
+
+  // Initialize the parameters with random numbers
+  // The previous layer is initialized first; the result is quantized into the
+  // target layer at the end.
+  template <typename RNG>
+  void Initialize(RNG& rng) {
+    previous_layer_trainer_->Initialize(rng);
+    if (kIsOutputLayer) {
+      // Initialize output layer with 0
+      std::fill(std::begin(biases_), std::end(biases_),
+                static_cast<LearnFloatType>(0.0));
+      std::fill(std::begin(weights_), std::end(weights_),
+                static_cast<LearnFloatType>(0.0));
+    } else {
+      // Assuming every input unit has a mean of 0.5 and the same variance,
+      // initialize so that every output unit also has a mean of 0.5 and the
+      // same variance as the input.
+      const double kSigma = 1.0 / std::sqrt(kInputDimensions);
+      auto distribution = std::normal_distribution<double>(0.0, kSigma);
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        double sum = 0.0;
+        for (IndexType j = 0; j < kInputDimensions; ++j) {
+          const auto weight = static_cast<LearnFloatType>(distribution(rng));
+          weights_[kInputDimensions * i + j] = weight;
+          sum += weight;
+        }
+        // With inputs of mean 0.5 the weighted sum has mean 0.5 * sum, so this
+        // bias moves the output mean to 0.5.
+        biases_[i] = static_cast<LearnFloatType>(0.5 - 0.5 * sum);
+      }
+    }
+    QuantizeParameters();
+  }
+
+  // forward propagation
+  // Computes output = biases + weights * input for every sample of the batch
+  // and returns a pointer to the output buffer (kOutputDimensions values per
+  // sample). The input pointer is remembered for Backpropagate().
+  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+    // The buffers only ever grow; they are reused across batches.
+    if (output_.size() < kOutputDimensions * batch.size()) {
+      output_.resize(kOutputDimensions * batch.size());
+      gradients_.resize(kInputDimensions * batch.size());
+    }
+    batch_size_ = static_cast<IndexType>(batch.size());
+    batch_input_ = previous_layer_trainer_->Propagate(batch);
+#if defined(USE_BLAS)
+    // Seed every sample's output with the biases, then accumulate the
+    // matrix product on top of it (beta = 1.0).
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      cblas_scopy(kOutputDimensions, biases_, 1, &output_[batch_offset], 1);
+    }
+    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
+                kOutputDimensions, batch_size_, kInputDimensions, 1.0,
+                weights_, kInputDimensions,
+                batch_input_, kInputDimensions,
+                1.0, &output_[0], kOutputDimensions);
+#else
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType input_batch_offset = kInputDimensions * b;
+      const IndexType output_batch_offset = kOutputDimensions * b;
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        // Accumulate in double before narrowing to LearnFloatType.
+        double sum = biases_[i];
+        for (IndexType j = 0; j < kInputDimensions; ++j) {
+          const IndexType index = kInputDimensions * i + j;
+          sum += weights_[index] * batch_input_[input_batch_offset + j];
+        }
+        output_[output_batch_offset + i] = static_cast<LearnFloatType>(sum);
+      }
+    }
+#endif
+    return output_.data();
+  }
+
+  // backpropagation
+  // `gradients` holds kOutputDimensions values per sample of the last
+  // propagated batch. Computes the gradients with respect to the input,
+  // updates biases and weights with momentum, and passes the input gradients
+  // on to the previous layer's trainer.
+  void Backpropagate(const LearnFloatType* gradients,
+                     LearnFloatType learning_rate) {
+    const LearnFloatType local_learning_rate =
+        learning_rate * learning_rate_scale_;
+#if defined(USE_BLAS)
+    // backpropagate
+    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+                kInputDimensions, batch_size_, kOutputDimensions, 1.0,
+                weights_, kInputDimensions,
+                gradients, kOutputDimensions,
+                0.0, &gradients_[0], kInputDimensions);
+    // update
+    // diff = momentum * diff + (sum of gradients over the batch);
+    // parameter -= local_learning_rate * diff.
+    cblas_sscal(kOutputDimensions, momentum_, biases_diff_, 1);
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      cblas_saxpy(kOutputDimensions, 1.0,
+                  &gradients[batch_offset], 1, biases_diff_, 1);
+    }
+    cblas_saxpy(kOutputDimensions, -local_learning_rate,
+                biases_diff_, 1, biases_, 1);
+    cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
+                kOutputDimensions, kInputDimensions, batch_size_, 1.0,
+                gradients, kOutputDimensions,
+                batch_input_, kInputDimensions,
+                momentum_, weights_diff_, kInputDimensions);
+    cblas_saxpy(kOutputDimensions * kInputDimensions, -local_learning_rate,
+                weights_diff_, 1, weights_, 1);
+#else
+    // backpropagate
+    // The input gradients are computed with the weights as they were before
+    // this call's update.
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType input_batch_offset = kInputDimensions * b;
+      const IndexType output_batch_offset = kOutputDimensions * b;
+      for (IndexType j = 0; j < kInputDimensions; ++j) {
+        double sum = 0.0;
+        for (IndexType i = 0; i < kOutputDimensions; ++i) {
+          const IndexType index = kInputDimensions * i + j;
+          sum += weights_[index] * gradients[output_batch_offset + i];
+        }
+        gradients_[input_batch_offset + j] = static_cast<LearnFloatType>(sum);
+      }
+    }
+    // update
+    // diff = momentum * diff + (sum of gradients over the batch);
+    // parameter -= local_learning_rate * diff.
+    for (IndexType i = 0; i < kOutputDimensions; ++i) {
+      biases_diff_[i] *= momentum_;
+    }
+    for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
+      weights_diff_[i] *= momentum_;
+    }
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType input_batch_offset = kInputDimensions * b;
+      const IndexType output_batch_offset = kOutputDimensions * b;
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        biases_diff_[i] += gradients[output_batch_offset + i];
+      }
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        for (IndexType j = 0; j < kInputDimensions; ++j) {
+          const IndexType index = kInputDimensions * i + j;
+          weights_diff_[index] += gradients[output_batch_offset + i] *
+              batch_input_[input_batch_offset + j];
+        }
+      }
+    }
+    for (IndexType i = 0; i < kOutputDimensions; ++i) {
+      biases_[i] -= local_learning_rate * biases_diff_[i];
+    }
+    for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
+      weights_[i] -= local_learning_rate * weights_diff_[i];
+    }
+#endif
+    // The previous layer receives the unscaled learning rate and applies its
+    // own learning_rate_scale_.
+    previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate);
+  }
+
+ private:
+  // constructor
+  // Creates the previous layer's trainer and loads the float parameters from
+  // the target layer's current quantized values.
+  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer) :
+      batch_size_(0),
+      batch_input_(nullptr),
+      previous_layer_trainer_(Trainer<PreviousLayer>::Create(
+          &target_layer->previous_layer_, feature_transformer)),
+      target_layer_(target_layer),
+      biases_(),
+      weights_(),
+      biases_diff_(),
+      weights_diff_(),
+      momentum_(0.0),
+      learning_rate_scale_(1.0) {
+    DequantizeParameters();
+  }
+
+  // Weight saturation and quantization
+  // Clamps the float weights to +/- kMaxWeightMagnitude, then writes the
+  // scaled and rounded biases and weights into the target layer.
+  void QuantizeParameters() {
+    for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
+      weights_[i] = std::max(-kMaxWeightMagnitude,
+                             std::min(+kMaxWeightMagnitude, weights_[i]));
+    }
+    for (IndexType i = 0; i < kOutputDimensions; ++i) {
+      target_layer_->biases_[i] =
+          Round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
+    }
+    for (IndexType i = 0; i < kOutputDimensions; ++i) {
+      // The target layer stores each row with kPaddedInputDimensions entries,
+      // the trainer with kInputDimensions entries.
+      const auto offset = kInputDimensions * i;
+      const auto padded_offset = LayerType::kPaddedInputDimensions * i;
+      for (IndexType j = 0; j < kInputDimensions; ++j) {
+        target_layer_->weights_[padded_offset + j] =
+            Round<typename LayerType::WeightType>(
+                weights_[offset + j] * kWeightScale);
+      }
+    }
+  }
+
+  // Read the quantized integer parameters
+  // Loads the float biases and weights from the target layer and clears the
+  // momentum buffers.
+  void DequantizeParameters() {
+    for (IndexType i = 0; i < kOutputDimensions; ++i) {
+      biases_[i] = static_cast<LearnFloatType>(
+          target_layer_->biases_[i] / kBiasScale);
+    }
+    for (IndexType i = 0; i < kOutputDimensions; ++i) {
+      const auto offset = kInputDimensions * i;
+      const auto padded_offset = LayerType::kPaddedInputDimensions * i;
+      for (IndexType j = 0; j < kInputDimensions; ++j) {
+        weights_[offset + j] = static_cast<LearnFloatType>(
+            target_layer_->weights_[padded_offset + j] / kWeightScale);
+      }
+    }
+    std::fill(std::begin(biases_diff_), std::end(biases_diff_),
+              static_cast<LearnFloatType>(0.0));
+    std::fill(std::begin(weights_diff_), std::end(weights_diff_),
+              static_cast<LearnFloatType>(0.0));
+  }
+
+  // number of input/output dimensions
+  static constexpr IndexType kInputDimensions = LayerType::kInputDimensions;
+  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+
+  // A layer whose output dimensionality is 1 is treated as the output layer
+  static constexpr bool kIsOutputLayer = kOutputDimensions == 1;
+
+  // Coefficients used for quantization
+  static constexpr LearnFloatType kActivationScale =
+      std::numeric_limits<std::int8_t>::max();
+  static constexpr LearnFloatType kBiasScale = kIsOutputLayer ?
+      (kPonanzaConstant * FV_SCALE) :
+      ((1 << kWeightScaleBits) * kActivationScale);
+  static constexpr LearnFloatType kWeightScale = kBiasScale / kActivationScale;
+
+  // Upper limit of the absolute value of a weight, used to prevent overflow
+  // when quantizing to integers
+  static constexpr LearnFloatType kMaxWeightMagnitude =
+      std::numeric_limits<typename LayerType::WeightType>::max() / kWeightScale;
+
+  // number of samples in mini-batch
+  IndexType batch_size_;
+
+  // Input mini batch (output buffer of the previous layer's trainer)
+  const LearnFloatType* batch_input_;
+
+  // Trainer of the previous layer
+  const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+
+  // layer to learn
+  LayerType* const target_layer_;
+
+  // parameters (weights_ is indexed as kInputDimensions * output + input)
+  LearnFloatType biases_[kOutputDimensions];
+  LearnFloatType weights_[kOutputDimensions * kInputDimensions];
+
+  // Buffers used for updating parameters (momentum-accumulated gradients)
+  LearnFloatType biases_diff_[kOutputDimensions];
+  LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions];
+
+  // Forward propagation buffer
+  std::vector<LearnFloatType> output_;
+
+  // buffer for back propagation
+  std::vector<LearnFloatType> gradients_;
+
+  // hyper parameter
+  LearnFloatType momentum_;
+  LearnFloatType learning_rate_scale_;
+};
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#endif
@@ -0,0 +1,142 @@
+// Specialization of NNUE evaluation function learning class template for ClippedReLU
+
+#ifndef _NNUE_TRAINER_CLIPPED_RELU_H_
+#define _NNUE_TRAINER_CLIPPED_RELU_H_
+
+#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#include "../../../learn/learn.h"
+#include "../layers/clipped_relu.h"
+#include "trainer.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+// Learning: Clipped ReLU layer
+// This layer has no parameters of its own; the trainer clamps activations to
+// [0, 1] on the way forward, masks gradients on the way back, and records
+// per-unit activation ranges for health checks.
+template <typename PreviousLayer>
+class Trainer<Layers::ClippedReLU<PreviousLayer>> {
+ private:
+  // Type of layer to learn
+  using LayerType = Layers::ClippedReLU<PreviousLayer>;
+
+ public:
+  // factory function
+  // Creates the trainer for target_layer; the constructor in turn creates the
+  // trainer of the previous layer.
+  static std::shared_ptr<Trainer> Create(
+      LayerType* target_layer, FeatureTransformer* feature_transformer) {
+    return std::shared_ptr<Trainer>(
+        new Trainer(target_layer, feature_transformer));
+  }
+
+  // Set options such as hyperparameters
+  // The message is first passed on to the previous layer's trainer; this
+  // trainer itself only reacts to "check_health".
+  void SendMessage(Message* message) {
+    previous_layer_trainer_->SendMessage(message);
+    if (ReceiveMessage("check_health", message)) {
+      CheckHealth();
+    }
+  }
+
+  // Initialize the parameters with random numbers
+  // Only forwards to the previous layer's trainer.
+  template <typename RNG>
+  void Initialize(RNG& rng) {
+    previous_layer_trainer_->Initialize(rng);
+  }
+
+  // forward propagation
+  // Clamps every output of the previous layer to [kZero, kOne], updates the
+  // per-unit activation statistics, and returns the output buffer.
+  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+    // The buffers only ever grow; they are reused across batches.
+    if (output_.size() < kOutputDimensions * batch.size()) {
+      output_.resize(kOutputDimensions * batch.size());
+      gradients_.resize(kInputDimensions * batch.size());
+    }
+    const auto input = previous_layer_trainer_->Propagate(batch);
+    batch_size_ = static_cast<IndexType>(batch.size());
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        const IndexType index = batch_offset + i;
+        output_[index] = std::max(+kZero, std::min(+kOne, input[index]));
+        min_activations_[i] = std::min(min_activations_[i], output_[index]);
+        max_activations_[i] = std::max(max_activations_[i], output_[index]);
+      }
+    }
+    return output_.data();
+  }
+
+  // backpropagation
+  // Passes a gradient through only where the stored output lies strictly
+  // between kZero and kOne; elsewhere the gradient is zero.
+  void Backpropagate(const LearnFloatType* gradients,
+                     LearnFloatType learning_rate) {
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        const IndexType index = batch_offset + i;
+        // The two comparisons evaluate to 0 or 1 and act as a mask.
+        gradients_[index] = gradients[index] *
+            (output_[index] > kZero) * (output_[index] < kOne);
+      }
+    }
+    previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate);
+  }
+
+ private:
+  // constructor
+  // Creates the previous layer's trainer and resets the activation statistics.
+  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer) :
+      batch_size_(0),
+      previous_layer_trainer_(Trainer<PreviousLayer>::Create(
+          &target_layer->previous_layer_, feature_transformer)),
+      target_layer_(target_layer) {
+    std::fill(std::begin(min_activations_), std::end(min_activations_),
+              std::numeric_limits<LearnFloatType>::max());
+    std::fill(std::begin(max_activations_), std::end(max_activations_),
+              std::numeric_limits<LearnFloatType>::lowest());
+  }
+
+  // Check if there are any problems with learning
+  // Prints the largest per-unit minimum and the smallest per-unit maximum
+  // activation seen since the last check, then resets the statistics.
+  void CheckHealth() {
+    const auto largest_min_activation = *std::max_element(
+        std::begin(min_activations_), std::end(min_activations_));
+    const auto smallest_max_activation = *std::min_element(
+        std::begin(max_activations_), std::end(max_activations_));
+    std::cout << "INFO: largest min activation = " << largest_min_activation
+              << ", smallest max activation = " << smallest_max_activation
+              << std::endl;
+
+    std::fill(std::begin(min_activations_), std::end(min_activations_),
+              std::numeric_limits<LearnFloatType>::max());
+    std::fill(std::begin(max_activations_), std::end(max_activations_),
+              std::numeric_limits<LearnFloatType>::lowest());
+  }
+
+  // number of input/output dimensions
+  // (both are taken from LayerType::kOutputDimensions)
+  static constexpr IndexType kInputDimensions = LayerType::kOutputDimensions;
+  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+
+  // LearnFloatType constant
+  static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
+  static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
+
+  // number of samples in mini-batch
+  IndexType batch_size_;
+
+  // Trainer of the previous layer
+  const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+
+  // layer to learn
+  LayerType* const target_layer_;
+
+  // Forward propagation buffer
+  std::vector<LearnFloatType> output_;
+
+  // buffer for back propagation
+  std::vector<LearnFloatType> gradients_;
+
+  // Health check statistics (per output unit, since the last CheckHealth())
+  LearnFloatType min_activations_[kOutputDimensions];
+  LearnFloatType max_activations_[kOutputDimensions];
+};
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#endif
@@ -0,0 +1,377 @@
+// Specialization for feature transformer of learning class template of NNUE evaluation function
+
+#ifndef _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
+#define _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
+
+#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#include "../../../learn/learn.h"
+#include "../nnue_feature_transformer.h"
+#include "trainer.h"
+#include "features/factorizer_feature_set.h"
+
+#include <array>
+#include <bitset>
+#include <numeric>
+#include <random>
+#include <set>
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+namespace Eval {
+
+namespace NNUE {
+
+// Learning: Input feature converter
+// Keeps floating-point biases and weights for the (factorized) training
+// features, updates them from mini-batches, and writes quantized values back
+// to the target feature transformer.
+template <>
+class Trainer<FeatureTransformer> {
+ private:
+  // Type of layer to learn
+  using LayerType = FeatureTransformer;
+
+ public:
+  // The constructor is private; these friends let MakeAlignedSharedPtr()
+  // construct the trainer and AlignedDeleter destroy it.
+  template <typename T>
+  friend struct AlignedDeleter;
+  template <typename T, typename... ArgumentTypes>
+  friend std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments);
+
+  // factory function
+  static std::shared_ptr<Trainer> Create(LayerType* target_layer) {
+    return MakeAlignedSharedPtr<Trainer>(target_layer);
+  }
+
+  // Set options such as hyperparameters
+  void SendMessage(Message* message) {
+    if (ReceiveMessage("momentum", message)) {
+      momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
+    }
+    if (ReceiveMessage("learning_rate_scale", message)) {
+      learning_rate_scale_ =
+          static_cast<LearnFloatType>(std::stod(message->value));
+    }
+    if (ReceiveMessage("reset", message)) {
+      DequantizeParameters();
+    }
+    if (ReceiveMessage("quantize_parameters", message)) {
+      QuantizeParameters();
+    }
+    if (ReceiveMessage("clear_unobserved_feature_weights", message)) {
+      ClearUnobservedFeatureWeights();
+    }
+    if (ReceiveMessage("check_health", message)) {
+      CheckHealth();
+    }
+  }
+
+  // Initialize the parameters with random numbers
+  // Only the first kHalfDimensions * RawFeatures::kDimensions weights receive
+  // random values; the remaining weights keep the zero written by the fill.
+  // All biases are set to 0.5.
+  template <typename RNG>
+  void Initialize(RNG& rng) {
+    std::fill(std::begin(weights_), std::end(weights_), +kZero);
+    const double kSigma = 0.1 / std::sqrt(RawFeatures::kMaxActiveDimensions);
+    auto distribution = std::normal_distribution<double>(0.0, kSigma);
+    for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) {
+      const auto weight = static_cast<LearnFloatType>(distribution(rng));
+      weights_[i] = weight;
+    }
+    for (IndexType i = 0; i < kHalfDimensions; ++i) {
+      biases_[i] = static_cast<LearnFloatType>(0.5);
+    }
+    QuantizeParameters();
+  }
+
+  // forward propagation
+  // For each sample and each of its two feature lists, computes
+  // biases + sum(count * weight column) into one half of the sample's output,
+  // then clamps the output to [kZero, kOne]. Returns the output buffer
+  // (kOutputDimensions values per sample) and remembers the batch for
+  // Backpropagate().
+  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+    // The buffers only ever grow; they are reused across batches.
+    if (output_.size() < kOutputDimensions * batch.size()) {
+      output_.resize(kOutputDimensions * batch.size());
+      gradients_.resize(kOutputDimensions * batch.size());
+    }
+    batch_ = &batch;
+    // affine transform
+    // Each iteration writes only to its own sample's slice of output_.
+#pragma omp parallel for
+    for (IndexType b = 0; b < batch.size(); ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      for (IndexType c = 0; c < 2; ++c) {
+        const IndexType output_offset = batch_offset + kHalfDimensions * c;
+#if defined(USE_BLAS)
+        cblas_scopy(kHalfDimensions, biases_, 1, &output_[output_offset], 1);
+        for (const auto& feature : batch[b].training_features[c]) {
+          const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
+          cblas_saxpy(kHalfDimensions, (float)feature.GetCount(),
+                      &weights_[weights_offset], 1, &output_[output_offset], 1);
+        }
+#else
+        for (IndexType i = 0; i < kHalfDimensions; ++i) {
+          output_[output_offset + i] = biases_[i];
+        }
+        for (const auto& feature : batch[b].training_features[c]) {
+          const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
+          for (IndexType i = 0; i < kHalfDimensions; ++i) {
+            output_[output_offset + i] +=
+                feature.GetCount() * weights_[weights_offset + i];
+          }
+        }
+#endif
+      }
+    }
+    // clipped ReLU
+    // Also records the range of the values before clamping and the per-unit
+    // range after clamping for CheckHealth().
+    for (IndexType b = 0; b < batch.size(); ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        const IndexType index = batch_offset + i;
+        min_pre_activation_ = std::min(min_pre_activation_, output_[index]);
+        max_pre_activation_ = std::max(max_pre_activation_, output_[index]);
+        output_[index] = std::max(+kZero, std::min(+kOne, output_[index]));
+        // Both halves of the output share one set of per-unit statistics.
+        const IndexType t = i % kHalfDimensions;
+        min_activations_[t] = std::min(min_activations_[t], output_[index]);
+        max_activations_[t] = std::max(max_activations_[t], output_[index]);
+      }
+    }
+    return output_.data();
+  }
+
+  // backpropagation
+  // Masks the incoming gradients with the clipped-ReLU derivative, updates
+  // the biases with momentum and the weights without momentum, and records
+  // which features were observed in the batch.
+  void Backpropagate(const LearnFloatType* gradients,
+                     LearnFloatType learning_rate) {
+    const LearnFloatType local_learning_rate =
+        learning_rate * learning_rate_scale_;
+    for (IndexType b = 0; b < batch_->size(); ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        const IndexType index = batch_offset + i;
+        // Gradient passes only where the output lies strictly inside (0, 1).
+        gradients_[index] = gradients[index] *
+            ((output_[index] > kZero) * (output_[index] < kOne));
+      }
+    }
+    // Only the weight columns of features that appeared in the input are
+    // updated, so momentum is not used for the weights; instead the learning
+    // rate is divided by (1 - momentum) to adjust the scale.
+    const LearnFloatType effective_learning_rate =
+        static_cast<LearnFloatType>(local_learning_rate / (1.0 - momentum_));
+#if defined(USE_BLAS)
+    cblas_sscal(kHalfDimensions, momentum_, biases_diff_, 1);
+    for (IndexType b = 0; b < batch_->size(); ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      for (IndexType c = 0; c < 2; ++c) {
+        const IndexType output_offset = batch_offset + kHalfDimensions * c;
+        cblas_saxpy(kHalfDimensions, 1.0,
+                    &gradients_[output_offset], 1, biases_diff_, 1);
+      }
+    }
+    cblas_saxpy(kHalfDimensions, -local_learning_rate,
+                biases_diff_, 1, biases_, 1);
+    // Every thread walks the whole batch but only updates the features whose
+    // index maps to it, so no two threads write the same weight column.
+#pragma omp parallel
+    {
+#if defined(_OPENMP)
+      const IndexType num_threads = omp_get_num_threads();
+      const IndexType thread_index = omp_get_thread_num();
+#endif
+      for (IndexType b = 0; b < batch_->size(); ++b) {
+        const IndexType batch_offset = kOutputDimensions * b;
+        for (IndexType c = 0; c < 2; ++c) {
+          const IndexType output_offset = batch_offset + kHalfDimensions * c;
+          for (const auto& feature : (*batch_)[b].training_features[c]) {
+#if defined(_OPENMP)
+            if (feature.GetIndex() % num_threads != thread_index) continue;
+#endif
+            const IndexType weights_offset =
+                kHalfDimensions * feature.GetIndex();
+            const auto scale = static_cast<LearnFloatType>(
+                effective_learning_rate / feature.GetCount());
+            cblas_saxpy(kHalfDimensions, -scale,
+                        &gradients_[output_offset], 1,
+                        &weights_[weights_offset], 1);
+          }
+        }
+      }
+    }
+#else
+    for (IndexType i = 0; i < kHalfDimensions; ++i) {
+      biases_diff_[i] *= momentum_;
+    }
+    for (IndexType b = 0; b < batch_->size(); ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      for (IndexType c = 0; c < 2; ++c) {
+        const IndexType output_offset = batch_offset + kHalfDimensions * c;
+        for (IndexType i = 0; i < kHalfDimensions; ++i) {
+          biases_diff_[i] += gradients_[output_offset + i];
+        }
+      }
+    }
+    for (IndexType i = 0; i < kHalfDimensions; ++i) {
+      biases_[i] -= local_learning_rate * biases_diff_[i];
+    }
+    for (IndexType b = 0; b < batch_->size(); ++b) {
+      const IndexType batch_offset = kOutputDimensions * b;
+      for (IndexType c = 0; c < 2; ++c) {
+        const IndexType output_offset = batch_offset + kHalfDimensions * c;
+        for (const auto& feature : (*batch_)[b].training_features[c]) {
+          const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
+          const auto scale = static_cast<LearnFloatType>(
+              effective_learning_rate / feature.GetCount());
+          for (IndexType i = 0; i < kHalfDimensions; ++i) {
+            weights_[weights_offset + i] -=
+                scale * gradients_[output_offset + i];
+          }
+        }
+      }
+    }
+#endif
+    // Remember every feature that appeared in this batch.
+    for (IndexType b = 0; b < batch_->size(); ++b) {
+      for (IndexType c = 0; c < 2; ++c) {
+        for (const auto& feature : (*batch_)[b].training_features[c]) {
+          observed_features.set(feature.GetIndex());
+        }
+      }
+    }
+  }
+
+ private:
+  // constructor
+  // Resets the health statistics and loads the float parameters from the
+  // target layer's current quantized values.
+  Trainer(LayerType* target_layer) :
+      batch_(nullptr),
+      target_layer_(target_layer),
+      biases_(),
+      weights_(),
+      biases_diff_(),
+      momentum_(0.0),
+      learning_rate_scale_(1.0) {
+    min_pre_activation_ = std::numeric_limits<LearnFloatType>::max();
+    max_pre_activation_ = std::numeric_limits<LearnFloatType>::lowest();
+    std::fill(std::begin(min_activations_), std::end(min_activations_),
+              std::numeric_limits<LearnFloatType>::max());
+    std::fill(std::begin(max_activations_), std::end(max_activations_),
+              std::numeric_limits<LearnFloatType>::lowest());
+    DequantizeParameters();
+  }
+
+  // Quantization of the parameters
+  // Writes the scaled and rounded biases to the target layer. For every raw
+  // feature j, the weights of all training features that j expands to are
+  // summed, scaled, rounded and stored as the target layer's weights for j.
+  void QuantizeParameters() {
+    for (IndexType i = 0; i < kHalfDimensions; ++i) {
+      target_layer_->biases_[i] =
+          Round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
+    }
+    std::vector<TrainingFeature> training_features;
+    // Each thread works with its own private feature list.
+#pragma omp parallel for private(training_features)
+    for (IndexType j = 0; j < RawFeatures::kDimensions; ++j) {
+      training_features.clear();
+      Features::Factorizer<RawFeatures>::AppendTrainingFeatures(
+          j, &training_features);
+      for (IndexType i = 0; i < kHalfDimensions; ++i) {
+        double sum = 0.0;
+        for (const auto& feature : training_features) {
+          sum += weights_[kHalfDimensions * feature.GetIndex() + i];
+        }
+        target_layer_->weights_[kHalfDimensions * j + i] =
+            Round<typename LayerType::WeightType>(sum * kWeightScale);
+      }
+    }
+  }
+
+  // Read the quantized integer parameters
+  // Loads the float biases and the first kHalfDimensions *
+  // RawFeatures::kDimensions weights from the target layer; the remaining
+  // weights are zero. The bias momentum buffer is cleared.
+  void DequantizeParameters() {
+    for (IndexType i = 0; i < kHalfDimensions; ++i) {
+      biases_[i] = static_cast<LearnFloatType>(
+          target_layer_->biases_[i] / kBiasScale);
+    }
+    std::fill(std::begin(weights_), std::end(weights_), +kZero);
+    for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) {
+      weights_[i] = static_cast<LearnFloatType>(
+          target_layer_->weights_[i] / kWeightScale);
+    }
+    std::fill(std::begin(biases_diff_), std::end(biases_diff_), +kZero);
+  }
+
+  // Set the weights of features that never appeared in the training data to
+  // 0, then quantize the parameters again
+  void ClearUnobservedFeatureWeights() {
+    for (IndexType i = 0; i < kInputDimensions; ++i) {
+      if (!observed_features.test(i)) {
+        std::fill(std::begin(weights_) + kHalfDimensions * i,
+                  std::begin(weights_) + kHalfDimensions * (i + 1), +kZero);
+      }
+    }
+    QuantizeParameters();
+  }
+
+  // Check if there are any problems with learning
+  // Prints the number of observed features, the range of pre-activations and
+  // the activation statistics, then resets the per-unit activation
+  // statistics. The pre-activation extremes are not reset here.
+  void CheckHealth() {
+    std::cout << "INFO: observed " << observed_features.count()
+              << " (out of " << kInputDimensions << ") features" << std::endl;
+
+    constexpr LearnFloatType kPreActivationLimit =
+        std::numeric_limits<typename LayerType::WeightType>::max() /
+        kWeightScale;
+    std::cout << "INFO: (min, max) of pre-activations = "
+              << min_pre_activation_ << ", "
+              << max_pre_activation_ << " (limit = "
+              << kPreActivationLimit << ")" << std::endl;
+
+    const auto largest_min_activation = *std::max_element(
+        std::begin(min_activations_), std::end(min_activations_));
+    const auto smallest_max_activation = *std::min_element(
+        std::begin(max_activations_), std::end(max_activations_));
+    std::cout << "INFO: largest min activation = " << largest_min_activation
+              << ", smallest max activation = " << smallest_max_activation
+              << std::endl;
+
+    std::fill(std::begin(min_activations_), std::end(min_activations_),
+              std::numeric_limits<LearnFloatType>::max());
+    std::fill(std::begin(max_activations_), std::end(max_activations_),
+              std::numeric_limits<LearnFloatType>::lowest());
+  }
+
+  // number of input/output dimensions
+  // kInputDimensions counts the factorized training features, not the raw
+  // features.
+  static constexpr IndexType kInputDimensions =
+      Features::Factorizer<RawFeatures>::GetDimensions();
+  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+  static constexpr IndexType kHalfDimensions = LayerType::kHalfDimensions;
+
+  // Coefficients used for quantization
+  static constexpr LearnFloatType kActivationScale =
+      std::numeric_limits<std::int8_t>::max();
+  static constexpr LearnFloatType kBiasScale = kActivationScale;
+  static constexpr LearnFloatType kWeightScale = kActivationScale;
+
+  // LearnFloatType constant
+  static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
+  static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
+
+  // mini batch (set by Propagate(), read by Backpropagate())
+  const std::vector<Example>* batch_;
+
+  // layer to learn
+  LayerType* const target_layer_;
+
+  // parameters (weights_ is indexed as kHalfDimensions * feature + unit)
+  alignas(kCacheLineSize) LearnFloatType biases_[kHalfDimensions];
+  alignas(kCacheLineSize)
+      LearnFloatType weights_[kHalfDimensions * kInputDimensions];
+
+  // Buffer used for updating parameters
+  LearnFloatType biases_diff_[kHalfDimensions];
+  std::vector<LearnFloatType> gradients_;
+
+  // Forward propagation buffer
+  std::vector<LearnFloatType> output_;
+
+  // Features that appeared in the training data
+  std::bitset<kInputDimensions> observed_features;
+
+  // hyper parameter
+  LearnFloatType momentum_;
+  LearnFloatType learning_rate_scale_;
+
+  // Health check statistics
+  LearnFloatType min_pre_activation_;
+  LearnFloatType max_pre_activation_;
+  LearnFloatType min_activations_[kHalfDimensions];
+  LearnFloatType max_activations_[kHalfDimensions];
+};
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#endif
@@ -0,0 +1,251 @@
+// Specialization of NNUE evaluation function learning class template for InputSlice
+
+#ifndef _NNUE_TRAINER_INPUT_SLICE_H_
+#define _NNUE_TRAINER_INPUT_SLICE_H_
+
+#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#include "../../../learn/learn.h"
+#include "../layers/input_slice.h"
+#include "trainer.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+// Learning: input layer shared by all of its referrers.
+// One instance wraps the feature transformer trainer. Every operation is
+// expected to be called once per referrer; SendMessage, Initialize and
+// Propagate are forwarded on the first call of such a round, Backpropagate on
+// the last call, after the gradients of all referrers have been summed.
+class SharedInputTrainer {
+ public:
+  // Factory function: returns the single shared instance, creating it on the
+  // first call. Each call registers one more referrer.
+  static std::shared_ptr<SharedInputTrainer> Create(
+      FeatureTransformer* feature_transformer) {
+    static std::shared_ptr<SharedInputTrainer> instance;
+    if (instance == nullptr) {
+      instance = std::shared_ptr<SharedInputTrainer>(
+          new SharedInputTrainer(feature_transformer));
+    }
+    instance->num_referrers_ += 1;
+    return instance;
+  }
+
+  // Set options such as hyperparameters.
+  // Only the first call of a round reaches the feature transformer trainer.
+  void SendMessage(Message* message) {
+    if (BeginCall(Operation::kSendMessage)) {
+      feature_transformer_trainer_->SendMessage(message);
+    }
+    assert(current_operation_ == Operation::kSendMessage);
+    EndCall();
+  }
+
+  // Initialize the parameters with random numbers.
+  // Only the first call of a round reaches the feature transformer trainer.
+  template <typename RNG>
+  void Initialize(RNG& rng) {
+    if (BeginCall(Operation::kInitialize)) {
+      feature_transformer_trainer_->Initialize(rng);
+    }
+    assert(current_operation_ == Operation::kInitialize);
+    EndCall();
+  }
+
+  // Forward propagation.
+  // The feature transformer is evaluated on the first call of a round; every
+  // call returns the pointer obtained from that evaluation.
+  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+    // Grow (never shrink) the buffer that Backpropagate accumulates into.
+    const auto required_size = kInputDimensions * batch.size();
+    if (gradients_.size() < required_size) {
+      gradients_.resize(required_size);
+    }
+    batch_size_ = static_cast<IndexType>(batch.size());
+    if (BeginCall(Operation::kPropagate)) {
+      output_ = feature_transformer_trainer_->Propagate(batch);
+    }
+    assert(current_operation_ == Operation::kPropagate);
+    EndCall();
+    return output_;
+  }
+
+  // Backpropagation.
+  // Sums the gradients passed by each referrer and hands the total to the
+  // feature transformer trainer once the last referrer has called.
+  void Backpropagate(const LearnFloatType* gradients,
+                     LearnFloatType learning_rate) {
+    // A single referrer needs no accumulation: forward its gradients as is.
+    if (num_referrers_ == 1) {
+      feature_transformer_trainer_->Backpropagate(gradients, learning_rate);
+      return;
+    }
+    // First call of the round: clear the accumulation buffer.
+    if (BeginCall(Operation::kBackPropagate)) {
+      for (IndexType b = 0; b < batch_size_; ++b) {
+        LearnFloatType* const row = &gradients_[kInputDimensions * b];
+        for (IndexType i = 0; i < kInputDimensions; ++i) {
+          row[i] = static_cast<LearnFloatType>(0.0);
+        }
+      }
+    }
+    assert(current_operation_ == Operation::kBackPropagate);
+    // Add this referrer's gradients to the running total.
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType row_offset = kInputDimensions * b;
+      LearnFloatType* const total = &gradients_[row_offset];
+      const LearnFloatType* const incoming = &gradients[row_offset];
+      for (IndexType i = 0; i < kInputDimensions; ++i) {
+        total[i] += incoming[i];
+      }
+    }
+    ++num_calls_;
+    if (num_calls_ != num_referrers_) {
+      return;
+    }
+    // Last call of the round: propagate the total and reset the round state.
+    feature_transformer_trainer_->Backpropagate(
+        gradients_.data(), learning_rate);
+    num_calls_ = 0;
+    current_operation_ = Operation::kNone;
+  }
+
+ private:
+  // Constructor: creates the trainer of the feature transformer. All other
+  // members start from their default member initializers.
+  SharedInputTrainer(FeatureTransformer* feature_transformer)
+      : feature_transformer_trainer_(
+            Trainer<FeatureTransformer>::Create(feature_transformer)) {
+  }
+
+  // number of input/output dimensions
+  static constexpr IndexType kInputDimensions =
+      FeatureTransformer::kOutputDimensions;
+
+  // type of processing
+  enum class Operation {
+    kNone,
+    kSendMessage,
+    kInitialize,
+    kPropagate,
+    kBackPropagate,
+  };
+
+  // Returns true, after recording the operation being processed, when this is
+  // the first call of a round; returns false without side effects otherwise.
+  bool BeginCall(Operation operation) {
+    if (num_calls_ != 0) {
+      return false;
+    }
+    current_operation_ = operation;
+    return true;
+  }
+
+  // Counts the current call and resets the round state once the number of
+  // calls matches the number of referrers.
+  void EndCall() {
+    ++num_calls_;
+    if (num_calls_ == num_referrers_) {
+      num_calls_ = 0;
+      current_operation_ = Operation::kNone;
+    }
+  }
+
+  // number of samples in mini-batch
+  IndexType batch_size_ = 0;
+
+  // number of layers sharing this layer as input
+  std::uint32_t num_referrers_ = 0;
+
+  // Number of times the current process has been called
+  std::uint32_t num_calls_ = 0;
+
+  // current processing type
+  Operation current_operation_ = Operation::kNone;
+
+  // Trainer of input feature converter
+  const std::shared_ptr<Trainer<FeatureTransformer>>
+      feature_transformer_trainer_;
+
+  // pointer to output shared for forward propagation
+  const LearnFloatType* output_ = nullptr;
+
+  // buffer for back propagation (sum of the referrers' gradients)
+  std::vector<LearnFloatType> gradients_;
+};
+
+// Learning: input slice layer.
+// Exposes OutputDimensions consecutive values, starting at Offset, of each
+// sample produced by the shared input trainer.
+template <IndexType OutputDimensions, IndexType Offset>
+class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
+ private:
+  // Type of layer to learn
+  using LayerType = Layers::InputSlice<OutputDimensions, Offset>;
+
+ public:
+  // Factory function. The target layer pointer is not used by this trainer.
+  static std::shared_ptr<Trainer> Create(
+      LayerType* /*target_layer*/, FeatureTransformer* feature_transformer) {
+    return std::shared_ptr<Trainer>(new Trainer(feature_transformer));
+  }
+
+  // Set options such as hyperparameters (forwarded to the shared input)
+  void SendMessage(Message* message) {
+    shared_input_trainer_->SendMessage(message);
+  }
+
+  // Initialize the parameters with random numbers (forwarded to the shared
+  // input)
+  template <typename RNG>
+  void Initialize(RNG& rng) {
+    shared_input_trainer_->Initialize(rng);
+  }
+
+  // Forward propagation: copies this slice of every sample of the shared
+  // input into the output buffer and returns that buffer.
+  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+    const auto num_samples = batch.size();
+    // Both buffers grow together, so checking the output buffer is enough.
+    if (output_.size() < kOutputDimensions * num_samples) {
+      output_.resize(kOutputDimensions * num_samples);
+      gradients_.resize(kInputDimensions * num_samples);
+    }
+    batch_size_ = static_cast<IndexType>(num_samples);
+    const auto input = shared_input_trainer_->Propagate(batch);
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType source_offset = kInputDimensions * b + Offset;
+      const IndexType target_offset = kOutputDimensions * b;
+#if defined(USE_BLAS)
+      cblas_scopy(kOutputDimensions, &input[source_offset], 1,
+                  &output_[target_offset], 1);
+#else
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        output_[target_offset + i] = input[source_offset + i];
+      }
+#endif
+    }
+    return output_.data();
+  }
+
+  // Backpropagation: expands the slice gradients to the full input width,
+  // with zeros outside the slice, and passes them to the shared input.
+  void Backpropagate(const LearnFloatType* gradients,
+                     LearnFloatType learning_rate) {
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType input_offset = kInputDimensions * b;
+      const IndexType output_offset = kOutputDimensions * b;
+      for (IndexType i = 0; i < kInputDimensions; ++i) {
+        const bool inside_slice =
+            (i >= Offset) && (i < Offset + kOutputDimensions);
+        gradients_[input_offset + i] =
+            inside_slice ? gradients[output_offset + i - Offset]
+                         : static_cast<LearnFloatType>(0.0);
+      }
+    }
+    shared_input_trainer_->Backpropagate(gradients_.data(), learning_rate);
+  }
+
+ private:
+  // Constructor: obtains the shared input trainer (registering this trainer
+  // as one of its referrers).
+  Trainer(FeatureTransformer* feature_transformer)
+      : batch_size_(0),
+        shared_input_trainer_(
+            SharedInputTrainer::Create(feature_transformer)) {
+  }
+
+  // number of input/output dimensions
+  static constexpr IndexType kInputDimensions =
+      FeatureTransformer::kOutputDimensions;
+  static constexpr IndexType kOutputDimensions = OutputDimensions;
+  // The slice must lie completely inside the input.
+  static_assert(Offset + kOutputDimensions <= kInputDimensions, "");
+
+  // number of samples in mini-batch
+  IndexType batch_size_;
+
+  // Trainer of shared input layer
+  const std::shared_ptr<SharedInputTrainer> shared_input_trainer_;
+
+  // Forward propagation buffer
+  std::vector<LearnFloatType> output_;
+
+  // buffer for back propagation (gradients at full input width)
+  std::vector<LearnFloatType> gradients_;
+};
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#endif
@@ -0,0 +1,190 @@
+// Specialization of NNUE evaluation function learning class template for Sum
+
+#ifndef _NNUE_TRAINER_SUM_H_
+#define _NNUE_TRAINER_SUM_H_
+
+#if defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#include "../../../learn/learn.h"
+#include "../layers/sum.h"
+#include "trainer.h"
+
+namespace Eval {
+
+namespace NNUE {
+
+// Learning: A layer that sums the outputs of multiple layers.
+// This class handles the first previous layer; the remaining ones are handled
+// by the privately inherited trainer of Sum<RemainingPreviousLayers...>.
+template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
+class Trainer<Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>>
+    : private Trainer<Layers::Sum<RemainingPreviousLayers...>> {
+ private:
+  // Trainer of the remaining previous layers (the base class)
+  using Tail = Trainer<Layers::Sum<RemainingPreviousLayers...>>;
+  // Type of layer to learn
+  using LayerType = Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>;
+
+ public:
+  // factory function
+  static std::shared_ptr<Trainer> Create(
+      LayerType* target_layer, FeatureTransformer* feature_transformer) {
+    return std::shared_ptr<Trainer>(
+        new Trainer(target_layer, feature_transformer));
+  }
+
+  // Set options such as hyperparameters
+  void SendMessage(Message* message) {
+    // Unlike the other member functions, which process Tail first to keep the
+    // implementation simple (their results do not depend on the order), the
+    // head is processed first here so that subscripts are easier to match up.
+    previous_layer_trainer_->SendMessage(message);
+    Tail::SendMessage(message);
+  }
+
+  // Initialize the parameters with random numbers (Tail first, then head)
+  template <typename RNG>
+  void Initialize(RNG& rng) {
+    Tail::Initialize(rng);
+    previous_layer_trainer_->Initialize(rng);
+  }
+
+  // Forward propagation: adds the output of the first previous layer to the
+  // buffer returned by Tail and returns that buffer. The pointer is non-const
+  // so that a derived Sum trainer can keep adding into it.
+  LearnFloatType* Propagate(const std::vector<Example>& batch) {
+    batch_size_ = static_cast<IndexType>(batch.size());
+    auto output = Tail::Propagate(batch);
+    const auto head_output = previous_layer_trainer_->Propagate(batch);
+#if defined(USE_BLAS)
+    cblas_saxpy(kOutputDimensions * batch_size_, 1.0,
+                head_output, 1, output, 1);
+#else
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType row_offset = kOutputDimensions * b;
+      LearnFloatType* const sum_row = &output[row_offset];
+      const LearnFloatType* const head_row = &head_output[row_offset];
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        sum_row[i] += head_row[i];
+      }
+    }
+#endif
+    return output;
+  }
+
+  // Backpropagation: the same gradients are passed to Tail and then to the
+  // first previous layer.
+  void Backpropagate(const LearnFloatType* gradients,
+                     LearnFloatType learning_rate) {
+    Tail::Backpropagate(gradients, learning_rate);
+    previous_layer_trainer_->Backpropagate(gradients, learning_rate);
+  }
+
+ private:
+  // Constructor: builds Tail first, then the trainer of the first previous
+  // layer.
+  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer)
+      : Tail(target_layer, feature_transformer),
+        batch_size_(0),
+        previous_layer_trainer_(Trainer<FirstPreviousLayer>::Create(
+            &target_layer->previous_layer_, feature_transformer)),
+        target_layer_(target_layer) {
+  }
+
+  // number of input/output dimensions
+  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+
+  // make subclass friend (gives it access to the private constructor)
+  template <typename SumLayer>
+  friend class Trainer;
+
+  // number of samples in mini-batch
+  IndexType batch_size_;
+
+  // Trainer of the previous layer
+  const std::shared_ptr<Trainer<FirstPreviousLayer>> previous_layer_trainer_;
+
+  // layer to learn
+  LayerType* const target_layer_;
+};
+
+
+// Learning: Layer that takes the sum of the outputs of multiple layers
+// (when there is one template argument). With a single previous layer the
+// sum is a copy of that layer's output.
+template <typename PreviousLayer>
+class Trainer<Layers::Sum<PreviousLayer>> {
+ private:
+  // Type of layer to learn
+  using LayerType = Layers::Sum<PreviousLayer>;
+
+ public:
+  // factory function
+  static std::shared_ptr<Trainer> Create(
+      LayerType* target_layer, FeatureTransformer* feature_transformer) {
+    return std::shared_ptr<Trainer>(
+        new Trainer(target_layer, feature_transformer));
+  }
+
+  // Set options such as hyperparameters (forwarded to the previous layer)
+  void SendMessage(Message* message) {
+    previous_layer_trainer_->SendMessage(message);
+  }
+
+  // Initialize the parameters with random numbers (forwarded to the previous
+  // layer)
+  template <typename RNG>
+  void Initialize(RNG& rng) {
+    previous_layer_trainer_->Initialize(rng);
+  }
+
+  // Forward propagation: copies the previous layer's output into this
+  // trainer's own buffer and returns it. The pointer is non-const because a
+  // derived Sum trainer adds the outputs of its layers into this buffer.
+  LearnFloatType* Propagate(const std::vector<Example>& batch) {
+    const auto required_size = kOutputDimensions * batch.size();
+    if (output_.size() < required_size) {
+      output_.resize(required_size);
+    }
+    batch_size_ = static_cast<IndexType>(batch.size());
+    const auto output = previous_layer_trainer_->Propagate(batch);
+#if defined(USE_BLAS)
+    cblas_scopy(kOutputDimensions * batch_size_, output, 1, output_.data(), 1);
+#else
+    for (IndexType b = 0; b < batch_size_; ++b) {
+      const IndexType row_offset = kOutputDimensions * b;
+      LearnFloatType* const target_row = &output_[row_offset];
+      const LearnFloatType* const source_row = &output[row_offset];
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+        target_row[i] = source_row[i];
+      }
+    }
+#endif
+    return output_.data();
+  }
+
+  // Backpropagation: the gradients are passed on unchanged.
+  void Backpropagate(const LearnFloatType* gradients,
+                     LearnFloatType learning_rate) {
+    previous_layer_trainer_->Backpropagate(gradients, learning_rate);
+  }
+
+ private:
+  // Constructor: creates the trainer of the previous layer.
+  Trainer(LayerType* target_layer, FeatureTransformer* feature_transformer)
+      : batch_size_(0),
+        previous_layer_trainer_(Trainer<PreviousLayer>::Create(
+            &target_layer->previous_layer_, feature_transformer)),
+        target_layer_(target_layer) {
+  }
+
+  // number of input/output dimensions
+  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+
+  // make subclass friend (gives it access to the private constructor)
+  template <typename SumLayer>
+  friend class Trainer;
+
+  // number of samples in mini-batch
+  IndexType batch_size_;
+
+  // Trainer of the previous layer
+  const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+
+  // layer to learn
+  LayerType* const target_layer_;
+
+  // Forward propagation buffer
+  std::vector<LearnFloatType> output_;
+};
+
+}  // namespace NNUE
+
+}  // namespace Eval
+
+#endif  // defined(EVAL_LEARN) && defined(EVAL_NNUE)
+
+#endif