Cleanup trainer.

2026-05-20 08:37:44 +00:00 · 2020-10-14 21:26:03 +02:00
parent ea8eb415de
commit c286f9cd7d
6 changed files with 1263 additions and 1153 deletions
@@ -1,121 +1,134 @@
-// Common header of class template for learning NNUE evaluation function
+#ifndef _NNUE_TRAINER_H_
 #ifndef _NNUE_TRAINER_H_
 #define _NNUE_TRAINER_H_
-#include "../nnue_common.h"
+#include "nnue/nnue_common.h"
-#include "../features/index_list.h"
+#include "nnue/features/index_list.h"
 #include <sstream>
 #if defined(USE_BLAS)
 static_assert(std::is_same<LearnFloatType, float>::value, "");
 #include <cblas.h>
 #endif
-namespace Eval {
+// Common header of class template for learning NNUE evaluation function
 namespace Eval::NNUE {
-namespace NNUE {
+    // Ponanza constant used in the relation between evaluation value and winning percentage
    constexpr double kPonanzaConstant = 600.0;
-// Ponanza constant used in the relation between evaluation value and winning percentage
+    // Class that represents one index of learning feature
-constexpr double kPonanzaConstant = 600.0;
+    class TrainingFeature {
        using StorageType = std::uint32_t;
        static_assert(std::is_unsigned<StorageType>::value, "");
-// Class that represents one index of learning feature
+    public:
-class TrainingFeature {
+        static constexpr std::uint32_t kIndexBits = 24;
  using StorageType = std::uint32_t;
  static_assert(std::is_unsigned<StorageType>::value, "");
- public:
+        static_assert(kIndexBits < std::numeric_limits<StorageType>::digits, "");
  static constexpr std::uint32_t kIndexBits = 24;
  static_assert(kIndexBits < std::numeric_limits<StorageType>::digits, "");
  static constexpr std::uint32_t kCountBits =
      std::numeric_limits<StorageType>::digits - kIndexBits;
-  explicit TrainingFeature(IndexType index) :
+        static constexpr std::uint32_t kCountBits =
-      index_and_count_((index << kCountBits) | 1) {
+            std::numeric_limits<StorageType>::digits - kIndexBits;
    assert(index < (1 << kIndexBits));
  }
  // Merge another occurrence of the same feature index by adding its count to ours.
  // Both operands must share an index, and the summed count must stay below
  // (1 << kCountBits); both conditions are only checked via assert.
  TrainingFeature& operator+=(const TrainingFeature& other) {
    const auto added_count = other.GetCount();
    assert(other.GetIndex() == GetIndex());
    assert(added_count + GetCount() < (1 << kCountBits));
    // The count is read back with a low-bit mask, so a plain addition updates it
    index_and_count_ += added_count;
    return *this;
  }
  // Return the feature index stored in the bits above the count field.
  IndexType GetIndex() const {
    const auto index_bits = index_and_count_ >> kCountBits;
    return static_cast<IndexType>(index_bits);
  }
  // Add `offset` to the stored feature index, leaving the count bits untouched.
  // The shifted index must stay below (1 << kIndexBits); this is only asserted.
  void ShiftIndex(IndexType offset) {
    assert(GetIndex() + offset < (1 << kIndexBits));
    // Move the offset up past the count field before adding it
    const auto shifted_offset = offset << kCountBits;
    index_and_count_ += shifted_offset;
  }
  // Return the occurrence count held in the low kCountBits bits.
  IndexType GetCount() const {
    const auto count_mask = (1 << kCountBits) - 1;
    return static_cast<IndexType>(index_and_count_ & count_mask);
  }
  // Order features by the raw packed value; the index sits in the upper bits,
  // so it dominates the comparison and the count only breaks ties.
  bool operator<(const TrainingFeature& other) const {
    return other.index_and_count_ > index_and_count_;
  }
- private:
+        explicit TrainingFeature(IndexType index) :
-  StorageType index_and_count_;
+            index_and_count_((index << kCountBits) | 1) {
 };
-// Structure that represents one sample of training data
+            assert(index < (1 << kIndexBits));
-struct Example {
+        }
  std::vector<TrainingFeature> training_features[2];
  Learner::PackedSfenValue psv;
  int sign;
  double weight;
 };
-// Message used for setting hyperparameters
+        TrainingFeature& operator+=(const TrainingFeature& other) {
-struct Message {
+            assert(other.GetIndex() == GetIndex());
-  Message(const std::string& message_name, const std::string& message_value = ""):
+            assert(other.GetCount() + GetCount() < (1 << kCountBits));
-      name(message_name), value(message_value), num_peekers(0), num_receivers(0) {}
+            index_and_count_ += other.GetCount();
-  const std::string name;
+            return *this;
-  const std::string value;
+        }
  std::uint32_t num_peekers;
  std::uint32_t num_receivers;
 };
-// determine whether to accept the message
+        IndexType GetIndex() const {
-bool ReceiveMessage(const std::string& name, Message* message) {
+            return static_cast<IndexType>(index_and_count_ >> kCountBits);
-  const auto subscript = "[" + std::to_string(message->num_peekers) + "]";
+        }
  if (message->name.substr(0, name.size() + 1) == name + "[") {
    ++message->num_peekers;
  }
  if (message->name == name || message->name == name + subscript) {
    ++message->num_receivers;
    return true;
  }
  return false;
 }
-// split the string
+        void ShiftIndex(IndexType offset) {
-std::vector<std::string> Split(const std::string& input, char delimiter) {
+            assert(GetIndex() + offset < (1 << kIndexBits));
-  std::istringstream stream(input);
+            index_and_count_ += offset << kCountBits;
-  std::string field;
+        }
  std::vector<std::string> fields;
  while (std::getline(stream, field, delimiter)) {
    fields.push_back(field);
  }
  return fields;
 }
-// round a floating point number to an integer
+        IndexType GetCount() const {
-template <typename IntType>
+            return static_cast<IndexType>(index_and_count_ & ((1 << kCountBits) - 1));
-IntType Round(double value) {
+        }
  return static_cast<IntType>(std::floor(value + 0.5));
 }
-// make_shared with alignment
+        bool operator<(const TrainingFeature& other) const {
-template <typename T, typename... ArgumentTypes>
+            return index_and_count_ < other.index_and_count_;
-std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments) {
+        }
  const auto ptr = new(std_aligned_alloc(alignof(T), sizeof(T)))
      T(std::forward<ArgumentTypes>(arguments)...);
  return std::shared_ptr<T>(ptr, AlignedDeleter<T>());
 }
-}  // namespace NNUE
+    private:
        StorageType index_and_count_;
    };
-}  // namespace Eval
+    // Structure that represents one sample of training data
    struct Example {
        // Two lists of active training features
        // NOTE(review): presumably one list per side/perspective - confirm against the feature transformer trainer
        std::vector<TrainingFeature> training_features[2];
        // Packed training record this example was built from
        Learner::PackedSfenValue psv;
        // NOTE(review): sign applied to this sample; its exact meaning is not visible here - confirm at the producer
        int sign;
        // NOTE(review): per-sample weight; how it is consumed is not visible here - confirm at the loss computation
        double weight;
    };
    // Named option (with an optional textual value) that is handed to the trainers.
    // The two counters are bookkeeping updated by ReceiveMessage().
    struct Message {
        Message(const std::string& message_name, const std::string& message_value = "")
            : name(message_name),
              value(message_value)
        {
        }

        // Option name, optionally carrying a "[k]" subscript
        const std::string name;
        // Option value as text; empty when none was supplied
        const std::string value;
        // Number of receivers whose name matched the part before "["
        std::uint32_t num_peekers = 0;
        // Number of receivers that accepted the message
        std::uint32_t num_receivers = 0;
    };
    // Determine whether the receiver called `name` accepts `message`.
    //
    // A message is accepted when its name equals `name` exactly, or equals
    // `name` followed by "[k]" where k is the value of message->num_peekers
    // on entry. Whenever the message name starts with `name` + "[", the peek
    // counter is incremented, so successive receivers with the same name match
    // successive subscripts. Accepting a message increments num_receivers.
    //
    // Returns true if the message was accepted, false otherwise.
    // `message` must not be null.
    //
    // Declared inline because this definition lives in a header: without it,
    // including the header from more than one translation unit produces
    // multiple-definition link errors.
    inline bool ReceiveMessage(const std::string& name, Message* message) {
        // Build the subscript from the peek count *before* it is bumped below
        const auto subscript = "[" + std::to_string(message->num_peekers) + "]";
        if (message->name.substr(0, name.size() + 1) == name + "[") {
            ++message->num_peekers;
        }
        if (message->name == name || message->name == name + subscript) {
            ++message->num_receivers;
            return true;
        }
        return false;
    }
    // Split `input` into the fields separated by `delimiter`.
    //
    // Fields are extracted with std::getline, so consecutive delimiters yield
    // empty fields, while an empty input or a single trailing delimiter does
    // not produce a trailing empty field.
    //
    // Declared inline because this definition lives in a header: without it,
    // including the header from more than one translation unit produces
    // multiple-definition link errors.
    inline std::vector<std::string> Split(const std::string& input, char delimiter) {
        std::istringstream stream(input);
        std::string field;
        std::vector<std::string> fields;
        while (std::getline(stream, field, delimiter)) {
            fields.push_back(field);
        }
        return fields;
    }
    // Round `value` to the nearest integer and convert it to IntType.
    // Exact halves go toward positive infinity (2.5 -> 3, -2.5 -> -2).
    template <typename IntType>
    IntType Round(double value) {
        const double shifted = value + 0.5;
        return static_cast<IntType>(std::floor(shifted));
    }
    // Counterpart of std::make_shared for storage obtained from std_aligned_alloc:
    // the object is constructed in a block requested with alignof(T) / sizeof(T)
    // and handed to a shared_ptr that releases it through AlignedDeleter<T>.
    template <typename T, typename... ArgumentTypes>
    std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments) {
        const auto storage = std_aligned_alloc(alignof(T), sizeof(T));
        // Placement-new forwards the caller's arguments straight to T's constructor
        const auto object = new(storage) T(std::forward<ArgumentTypes>(arguments)...);
        return std::shared_ptr<T>(object, AlignedDeleter<T>());
    }
 }  // namespace Eval::NNUE
 #endif
@@ -1,297 +1,329 @@
-// Specialization of NNUE evaluation function learning class template for AffineTransform
+#ifndef _NNUE_TRAINER_AFFINE_TRANSFORM_H_
 #ifndef _NNUE_TRAINER_AFFINE_TRANSFORM_H_
 #define _NNUE_TRAINER_AFFINE_TRANSFORM_H_
 #include "../../learn/learn.h"
 #include "../layers/affine_transform.h"
 #include "trainer.h"
 #include "learn/learn.h"
 #include "nnue/layers/affine_transform.h"
 #include <random>
-namespace Eval {
+// Specialization of NNUE evaluation function learning class template for AffineTransform
 namespace Eval::NNUE {
-namespace NNUE {
+    // Learning: Affine transformation layer
    template <typename PreviousLayer, IndexType OutputDimensions>
    class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
    private:
        // Type of layer to learn
        using LayerType = Layers::AffineTransform<PreviousLayer, OutputDimensions>;
-// Learning: Affine transformation layer
+    public:
-template <typename PreviousLayer, IndexType OutputDimensions>
+        // factory function
-class Trainer<Layers::AffineTransform<PreviousLayer, OutputDimensions>> {
+        static std::shared_ptr<Trainer> Create(
- private:
+            LayerType* target_layer, FeatureTransformer* ft) {
  // Type of layer to learn
  using LayerType = Layers::AffineTransform<PreviousLayer, OutputDimensions>;
- public:
+            return std::shared_ptr<Trainer>(
-  // factory function
+                new Trainer(target_layer, ft));
  // Build a trainer for `target_layer` and return it wrapped in a shared_ptr.
  static std::shared_ptr<Trainer> Create(LayerType* target_layer,
                                         FeatureTransformer* ft) {
    Trainer* const new_trainer = new Trainer(target_layer, ft);
    return std::shared_ptr<Trainer>(new_trainer);
  }
  // Pass the message on to the previous layer's trainer, then apply every
  // option that this layer accepts (momentum, learning_rate_scale, reset,
  // quantize_parameters).
  void SendMessage(Message* message) {
    previous_layer_trainer_->SendMessage(message);

    // Numeric options arrive as text in message->value
    const auto parse_value = [message]() {
      return static_cast<LearnFloatType>(std::stod(message->value));
    };

    if (ReceiveMessage("momentum", message)) {
      momentum_ = parse_value();
    }
    if (ReceiveMessage("learning_rate_scale", message)) {
      learning_rate_scale_ = parse_value();
    }
    // "reset" reloads the float parameters from the quantized target layer
    if (ReceiveMessage("reset", message)) {
      DequantizeParameters();
    }
    if (ReceiveMessage("quantize_parameters", message)) {
      QuantizeParameters();
    }
  }
  // Initialize the parameters with random numbers
  template <typename RNG>
  void Initialize(RNG& rng) {
    previous_layer_trainer_->Initialize(rng);
    if (kIsOutputLayer) {
      // Initialize output layer with 0
      std::fill(std::begin(biases_), std::end(biases_),
                static_cast<LearnFloatType>(0.0));
      std::fill(std::begin(weights_), std::end(weights_),
                static_cast<LearnFloatType>(0.0));
    } else {
      // Assuming that the input distribution is unit-mean 0.5, equal variance,
      // Initialize the output distribution so that each unit has a mean of 0.5 and the same variance as the input
      const double kSigma = 1.0 / std::sqrt(kInputDimensions);
      auto distribution = std::normal_distribution<double>(0.0, kSigma);
      for (IndexType i = 0; i < kOutputDimensions; ++i) {
        double sum = 0.0;
        for (IndexType j = 0; j < kInputDimensions; ++j) {
          const auto weight = static_cast<LearnFloatType>(distribution(rng));
          weights_[kInputDimensions * i + j] = weight;
          sum += weight;
        }
        biases_[i] = static_cast<LearnFloatType>(0.5 - 0.5 * sum);
      }
    }
    QuantizeParameters();
  }
-  // forward propagation
+        // Set options such as hyperparameters
-  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+        void SendMessage(Message* message) {
-    if (output_.size() < kOutputDimensions * batch.size()) {
+            previous_layer_trainer_->SendMessage(message);
-      output_.resize(kOutputDimensions * batch.size());
+
-      gradients_.resize(kInputDimensions * batch.size());
+            if (ReceiveMessage("momentum", message)) {
-    }
+                momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
-    batch_size_ = static_cast<IndexType>(batch.size());
+            }
-    batch_input_ = previous_layer_trainer_->Propagate(batch);
+
            if (ReceiveMessage("learning_rate_scale", message)) {
                learning_rate_scale_ =
                    static_cast<LearnFloatType>(std::stod(message->value));
            }
            if (ReceiveMessage("reset", message)) {
                DequantizeParameters();
            }
            if (ReceiveMessage("quantize_parameters", message)) {
                QuantizeParameters();
            }
        }
        // Initialize the parameters with random numbers
        template <typename RNG>
        void Initialize(RNG& rng) {
            previous_layer_trainer_->Initialize(rng);
            if (kIsOutputLayer) {
                // Initialize output layer with 0
                std::fill(std::begin(biases_), std::end(biases_),
                          static_cast<LearnFloatType>(0.0));
                std::fill(std::begin(weights_), std::end(weights_),
                          static_cast<LearnFloatType>(0.0));
            }
            else {
                // Assuming that the input distribution is unit-mean 0.5, equal variance,
                // Initialize the output distribution so that each unit has a mean of 0.5 and the same variance as the input
                const double kSigma = 1.0 / std::sqrt(kInputDimensions);
                auto distribution = std::normal_distribution<double>(0.0, kSigma);
                for (IndexType i = 0; i < kOutputDimensions; ++i) {
                    double sum = 0.0;
                      for (IndexType j = 0; j < kInputDimensions; ++j) {
                          const auto weight = static_cast<LearnFloatType>(distribution(rng));
                          weights_[kInputDimensions * i + j] = weight;
                          sum += weight;
                      }
                    biases_[i] = static_cast<LearnFloatType>(0.5 - 0.5 * sum);
                }
            }
            QuantizeParameters();
        }
        // forward propagation
        const LearnFloatType* Propagate(const std::vector<Example>& batch) {
            if (output_.size() < kOutputDimensions * batch.size()) {
                output_.resize(kOutputDimensions * batch.size());
                gradients_.resize(kInputDimensions * batch.size());
            }
            batch_size_ = static_cast<IndexType>(batch.size());
            batch_input_ = previous_layer_trainer_->Propagate(batch);
 #if defined(USE_BLAS)
-    for (IndexType b = 0; b < batch_size_; ++b) {
+            for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
+                const IndexType batch_offset = kOutputDimensions * b;
-      cblas_scopy(kOutputDimensions, biases_, 1, &output_[batch_offset], 1);
+                cblas_scopy(kOutputDimensions, biases_, 1, &output_[batch_offset], 1);
-    }
+            }
-    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
+
-                kOutputDimensions, batch_size_, kInputDimensions, 1.0,
+            cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
-                weights_, kInputDimensions,
+                        kOutputDimensions, batch_size_, kInputDimensions, 1.0,
-                batch_input_, kInputDimensions,
+                        weights_, kInputDimensions,
-                1.0, &output_[0], kOutputDimensions);
+                        batch_input_, kInputDimensions,
-#else
+                        1.0, &output_[0], kOutputDimensions);
-    for (IndexType b = 0; b < batch_size_; ++b) {
+#else
-      const IndexType input_batch_offset = kInputDimensions * b;
+            for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType output_batch_offset = kOutputDimensions * b;
+                const IndexType input_batch_offset = kInputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                const IndexType output_batch_offset = kOutputDimensions * b;
-        double sum = biases_[i];
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        for (IndexType j = 0; j < kInputDimensions; ++j) {
+                    double sum = biases_[i];
-          const IndexType index = kInputDimensions * i + j;
+                    for (IndexType j = 0; j < kInputDimensions; ++j) {
-          sum += weights_[index] * batch_input_[input_batch_offset + j];
+                        const IndexType index = kInputDimensions * i + j;
-        }
+                        sum += weights_[index] * batch_input_[input_batch_offset + j];
-        output_[output_batch_offset + i] = static_cast<LearnFloatType>(sum);
+                    }
-      }
+
-    }
+                    output_[output_batch_offset + i] = static_cast<LearnFloatType>(sum);
-#endif
+                }
-    return output_.data();
+            }
-  }
+
 #endif
            return output_.data();
        }
        // backpropagation
        void Backpropagate(const LearnFloatType* gradients,
                           LearnFloatType learning_rate) {
            const LearnFloatType local_learning_rate =
                learning_rate * learning_rate_scale_;
  // backpropagation
  void Backpropagate(const LearnFloatType* gradients,
                     LearnFloatType learning_rate) {
    const LearnFloatType local_learning_rate =
        learning_rate * learning_rate_scale_;
 #if defined(USE_BLAS)
-    // backpropagate
+            // backpropagate
-    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+            cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
-                kInputDimensions, batch_size_, kOutputDimensions, 1.0,
+                        kInputDimensions, batch_size_, kOutputDimensions, 1.0,
-                weights_, kInputDimensions,
+                        weights_, kInputDimensions,
-                gradients, kOutputDimensions,
+                        gradients, kOutputDimensions,
-                0.0, &gradients_[0], kInputDimensions);
+                        0.0, &gradients_[0], kInputDimensions);
-    // update
+
-    cblas_sscal(kOutputDimensions, momentum_, biases_diff_, 1);
+            // update
-    for (IndexType b = 0; b < batch_size_; ++b) {
+            cblas_sscal(kOutputDimensions, momentum_, biases_diff_, 1);
-      const IndexType batch_offset = kOutputDimensions * b;
+            for (IndexType b = 0; b < batch_size_; ++b) {
-      cblas_saxpy(kOutputDimensions, 1.0,
+                const IndexType batch_offset = kOutputDimensions * b;
-                  &gradients[batch_offset], 1, biases_diff_, 1);
+                cblas_saxpy(kOutputDimensions, 1.0,
-    }
+                          &gradients[batch_offset], 1, biases_diff_, 1);
-    cblas_saxpy(kOutputDimensions, -local_learning_rate,
+            }
-                biases_diff_, 1, biases_, 1);
+
-    cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
+            cblas_saxpy(kOutputDimensions, -local_learning_rate,
-                kOutputDimensions, kInputDimensions, batch_size_, 1.0,
+                        biases_diff_, 1, biases_, 1);
-                gradients, kOutputDimensions,
+
-                batch_input_, kInputDimensions,
+            cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
-                momentum_, weights_diff_, kInputDimensions);
+                        kOutputDimensions, kInputDimensions, batch_size_, 1.0,
-    cblas_saxpy(kOutputDimensions * kInputDimensions, -local_learning_rate,
+                        gradients, kOutputDimensions,
-                weights_diff_, 1, weights_, 1);
+                        batch_input_, kInputDimensions,
                        momentum_, weights_diff_, kInputDimensions);
            cblas_saxpy(kOutputDimensions * kInputDimensions, -local_learning_rate,
                        weights_diff_, 1, weights_, 1);
 #else
-    // backpropagate
+            // backpropagate
-    for (IndexType b = 0; b < batch_size_; ++b) {
+            for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType input_batch_offset = kInputDimensions * b;
+                const IndexType input_batch_offset = kInputDimensions * b;
-      const IndexType output_batch_offset = kOutputDimensions * b;
+                const IndexType output_batch_offset = kOutputDimensions * b;
-      for (IndexType j = 0; j < kInputDimensions; ++j) {
+                for (IndexType j = 0; j < kInputDimensions; ++j) {
-        double sum = 0.0;
+                    double sum = 0.0;
-        for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    for (IndexType i = 0; i < kOutputDimensions; ++i) {
-          const IndexType index = kInputDimensions * i + j;
+                        const IndexType index = kInputDimensions * i + j;
-          sum += weights_[index] * gradients[output_batch_offset + i];
+                        sum += weights_[index] * gradients[output_batch_offset + i];
-        }
+                    }
-        gradients_[input_batch_offset + j] = static_cast<LearnFloatType>(sum);
+                    gradients_[input_batch_offset + j] = static_cast<LearnFloatType>(sum);
-      }
+                }
-    }
+            }
    // update
    for (IndexType i = 0; i < kOutputDimensions; ++i) {
      biases_diff_[i] *= momentum_;
    }
    for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
      weights_diff_[i] *= momentum_;
    }
    for (IndexType b = 0; b < batch_size_; ++b) {
      const IndexType input_batch_offset = kInputDimensions * b;
      const IndexType output_batch_offset = kOutputDimensions * b;
      for (IndexType i = 0; i < kOutputDimensions; ++i) {
        biases_diff_[i] += gradients[output_batch_offset + i];
      }
      for (IndexType i = 0; i < kOutputDimensions; ++i) {
        for (IndexType j = 0; j < kInputDimensions; ++j) {
          const IndexType index = kInputDimensions * i + j;
          weights_diff_[index] += gradients[output_batch_offset + i] *
              batch_input_[input_batch_offset + j];
        }
      }
    }
    for (IndexType i = 0; i < kOutputDimensions; ++i) {
      biases_[i] -= local_learning_rate * biases_diff_[i];
    }
    for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
      weights_[i] -= local_learning_rate * weights_diff_[i];
    }
 #endif
    previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate);
  }
- private:
+            // update
-  // constructor
+            for (IndexType i = 0; i < kOutputDimensions; ++i) {
-  Trainer(LayerType* target_layer, FeatureTransformer* ft) :
+                biases_diff_[i] *= momentum_;
-      batch_size_(0),
+            }
      batch_input_(nullptr),
      previous_layer_trainer_(Trainer<PreviousLayer>::Create(
          &target_layer->previous_layer_, ft)),
      target_layer_(target_layer),
      biases_(),
      weights_(),
      biases_diff_(),
      weights_diff_(),
      momentum_(0.2),
      learning_rate_scale_(1.0) {
    DequantizeParameters();
  }
-  // Weight saturation and parameterization
+            for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-  void QuantizeParameters() {
+                weights_diff_[i] *= momentum_;
-    for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
+            }
      weights_[i] = std::max(-kMaxWeightMagnitude,
                             std::min(+kMaxWeightMagnitude, weights_[i]));
    }
    for (IndexType i = 0; i < kOutputDimensions; ++i) {
      target_layer_->biases_[i] =
          Round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
    }
    for (IndexType i = 0; i < kOutputDimensions; ++i) {
      const auto offset = kInputDimensions * i;
      const auto padded_offset = LayerType::kPaddedInputDimensions * i;
      for (IndexType j = 0; j < kInputDimensions; ++j) {
        target_layer_->weights_[padded_offset + j] =
            Round<typename LayerType::WeightType>(
                weights_[offset + j] * kWeightScale);
      }
    }
  }
-  // read parameterized integer
+            for (IndexType b = 0; b < batch_size_; ++b) {
-  void DequantizeParameters() {
+                const IndexType input_batch_offset = kInputDimensions * b;
-    for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                const IndexType output_batch_offset = kOutputDimensions * b;
      biases_[i] = static_cast<LearnFloatType>(
          target_layer_->biases_[i] / kBiasScale);
    }
    for (IndexType i = 0; i < kOutputDimensions; ++i) {
      const auto offset = kInputDimensions * i;
      const auto padded_offset = LayerType::kPaddedInputDimensions * i;
      for (IndexType j = 0; j < kInputDimensions; ++j) {
        weights_[offset + j] = static_cast<LearnFloatType>(
            target_layer_->weights_[padded_offset + j] / kWeightScale);
      }
    }
    std::fill(std::begin(biases_diff_), std::end(biases_diff_),
              static_cast<LearnFloatType>(0.0));
    std::fill(std::begin(weights_diff_), std::end(weights_diff_),
              static_cast<LearnFloatType>(0.0));
  }
-  // number of input/output dimensions
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-  static constexpr IndexType kInputDimensions = LayerType::kInputDimensions;
+                    biases_diff_[i] += gradients[output_batch_offset + i];
-  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+                }
-  // If the output dimensionality is 1, the output layer
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-  static constexpr bool kIsOutputLayer = kOutputDimensions == 1;
+                    for (IndexType j = 0; j < kInputDimensions; ++j) {
                        const IndexType index = kInputDimensions * i + j;
                        weights_diff_[index] += gradients[output_batch_offset + i] *
                            batch_input_[input_batch_offset + j];
                    }
                }
            }
-  // Coefficient used for parameterization
+            for (IndexType i = 0; i < kOutputDimensions; ++i) {
-  static constexpr LearnFloatType kActivationScale =
+                biases_[i] -= local_learning_rate * biases_diff_[i];
-      std::numeric_limits<std::int8_t>::max();
+            }
  static constexpr LearnFloatType kBiasScale = kIsOutputLayer ?
      (kPonanzaConstant * FV_SCALE) :
      ((1 << kWeightScaleBits) * kActivationScale);
  static constexpr LearnFloatType kWeightScale = kBiasScale / kActivationScale;
-  // Upper limit of absolute value of weight used to prevent overflow when parameterizing integers
+            for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-  static constexpr LearnFloatType kMaxWeightMagnitude =
+                weights_[i] -= local_learning_rate * weights_diff_[i];
-      std::numeric_limits<typename LayerType::WeightType>::max() / kWeightScale;
+            }
-
+
-  // number of samples in mini-batch
+#endif
-  IndexType batch_size_;
+            previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate);
-
+        }
-  // Input mini batch
+
-  const LearnFloatType* batch_input_;
+    private:
-
+        // constructor
-  // Trainer of the previous layer
+        Trainer(LayerType* target_layer, FeatureTransformer* ft) :
-  const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+            batch_size_(0),
-
+            batch_input_(nullptr),
-  // layer to learn
+            previous_layer_trainer_(Trainer<PreviousLayer>::Create(
-  LayerType* const target_layer_;
+                &target_layer->previous_layer_, ft)),
-
+            target_layer_(target_layer),
-  // parameter
+            biases_(),
-  LearnFloatType biases_[kOutputDimensions];
+            weights_(),
-  LearnFloatType weights_[kOutputDimensions * kInputDimensions];
+            biases_diff_(),
-
+            weights_diff_(),
-  // Buffer used for updating parameters
+            momentum_(0.2),
-  LearnFloatType biases_diff_[kOutputDimensions];
+            learning_rate_scale_(1.0) {
-  LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions];
+
-
+            DequantizeParameters();
-  // Forward propagation buffer
+        }
-  std::vector<LearnFloatType> output_;
+
-
+        // Weight saturation and parameterization
-  // buffer for back propagation
+        void QuantizeParameters() {
-  std::vector<LearnFloatType> gradients_;
+            for (IndexType i = 0; i < kOutputDimensions * kInputDimensions; ++i) {
-
+                weights_[i] = std::max(-kMaxWeightMagnitude,
-  // hyper parameter
+                                       std::min(+kMaxWeightMagnitude, weights_[i]));
-  LearnFloatType momentum_;
+            }
-  LearnFloatType learning_rate_scale_;
+
-};
+            for (IndexType i = 0; i < kOutputDimensions; ++i) {
-
+                target_layer_->biases_[i] =
-}  // namespace NNUE
+                    Round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
-
+            }
-}  // namespace Eval
+
            for (IndexType i = 0; i < kOutputDimensions; ++i) {
                const auto offset = kInputDimensions * i;
                const auto padded_offset = LayerType::kPaddedInputDimensions * i;
                for (IndexType j = 0; j < kInputDimensions; ++j) {
                    target_layer_->weights_[padded_offset + j] =
                        Round<typename LayerType::WeightType>(
                            weights_[offset + j] * kWeightScale);
                }
            }
        }
        // read parameterized integer
        void DequantizeParameters() {
            for (IndexType i = 0; i < kOutputDimensions; ++i) {
                biases_[i] = static_cast<LearnFloatType>(
                    target_layer_->biases_[i] / kBiasScale);
            }
            for (IndexType i = 0; i < kOutputDimensions; ++i) {
                const auto offset = kInputDimensions * i;
                const auto padded_offset = LayerType::kPaddedInputDimensions * i;
                for (IndexType j = 0; j < kInputDimensions; ++j) {
                    weights_[offset + j] = static_cast<LearnFloatType>(
                        target_layer_->weights_[padded_offset + j] / kWeightScale);
                }
            }
            std::fill(std::begin(biases_diff_), std::end(biases_diff_),
                      static_cast<LearnFloatType>(0.0));
            std::fill(std::begin(weights_diff_), std::end(weights_diff_),
                      static_cast<LearnFloatType>(0.0));
        }
        // Number of input/output dimensions, taken from the layer being trained
        static constexpr IndexType kInputDimensions = LayerType::kInputDimensions;
        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
        // A layer with exactly one output is treated as the output layer
        static constexpr bool kIsOutputLayer = kOutputDimensions == 1;
        // Scale factors used when converting between float and quantized parameters.
        // kActivationScale is the largest std::int8_t value.
        static constexpr LearnFloatType kActivationScale =
            std::numeric_limits<std::int8_t>::max();
        // Bias scale: kPonanzaConstant * FV_SCALE for the output layer,
        // (1 << kWeightScaleBits) * kActivationScale for every other layer
        static constexpr LearnFloatType kBiasScale = kIsOutputLayer ?
            (kPonanzaConstant * FV_SCALE) :
            ((1 << kWeightScaleBits) * kActivationScale);
        // Weight scale is the bias scale with the activation scale divided out
        static constexpr LearnFloatType kWeightScale = kBiasScale / kActivationScale;
        // Upper limit of the absolute weight value: the largest float weight that
        // still fits in LayerType::WeightType once multiplied by kWeightScale
        static constexpr LearnFloatType kMaxWeightMagnitude =
            std::numeric_limits<typename LayerType::WeightType>::max() / kWeightScale;
        // Number of samples in the mini-batch most recently passed to Propagate()
        IndexType batch_size_;
        // Input of the current mini-batch: the pointer returned by the previous
        // layer trainer's Propagate(); not owned by this trainer
        const LearnFloatType* batch_input_;
        // Trainer of the previous layer
        const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
        // Layer being trained; receives the quantized parameters
        LayerType* const target_layer_;
        // Floating-point parameters; weights_ is indexed as
        // kInputDimensions * output_index + input_index
        LearnFloatType biases_[kOutputDimensions];
        LearnFloatType weights_[kOutputDimensions * kInputDimensions];
        // Buffers used for updating parameters (scaled by momentum_ on each update)
        LearnFloatType biases_diff_[kOutputDimensions];
        LearnFloatType weights_diff_[kOutputDimensions * kInputDimensions];
        // Forward propagation buffer, kOutputDimensions values per sample
        std::vector<LearnFloatType> output_;
        // Buffer for the gradients passed back to the previous layer,
        // kInputDimensions values per sample
        std::vector<LearnFloatType> gradients_;
        // Hyperparameters, adjustable through SendMessage()
        LearnFloatType momentum_;
        LearnFloatType learning_rate_scale_;
    };
 }  // namespace Eval::NNUE
 #endif
@@ -1,138 +1,142 @@
-// Specialization of NNUE evaluation function learning class template for ClippedReLU
+#ifndef _NNUE_TRAINER_CLIPPED_RELU_H_
 #ifndef _NNUE_TRAINER_CLIPPED_RELU_H_
 #define _NNUE_TRAINER_CLIPPED_RELU_H_
 #include "../../learn/learn.h"
 #include "../layers/clipped_relu.h"
 #include "trainer.h"
-namespace Eval {
+#include "learn/learn.h"
-namespace NNUE {
+#include "nnue/layers/clipped_relu.h"
-// Learning: Affine transformation layer
+// Specialization of NNUE evaluation function learning class template for ClippedReLU
-template <typename PreviousLayer>
+namespace Eval::NNUE {
 class Trainer<Layers::ClippedReLU<PreviousLayer>> {
 private:
  // Type of layer to learn
  using LayerType = Layers::ClippedReLU<PreviousLayer>;
- public:
+    // Learning: Clipped ReLU layer
-  // factory function
+    template <typename PreviousLayer>
-  static std::shared_ptr<Trainer> Create(
+    class Trainer<Layers::ClippedReLU<PreviousLayer>> {
-      LayerType* target_layer, FeatureTransformer* ft) {
+    private:
-    return std::shared_ptr<Trainer>(
+        // Type of layer to learn
-        new Trainer(target_layer, ft));
+        using LayerType = Layers::ClippedReLU<PreviousLayer>;
  }
-  // Set options such as hyperparameters
+    public:
-  void SendMessage(Message* message) {
+        // factory function
-    previous_layer_trainer_->SendMessage(message);
+        static std::shared_ptr<Trainer> Create(
-    if (ReceiveMessage("check_health", message)) {
+            LayerType* target_layer, FeatureTransformer* ft) {
      CheckHealth();
    }
  }
-  // Initialize the parameters with random numbers
+            return std::shared_ptr<Trainer>(
-  template <typename RNG>
+                new Trainer(target_layer, ft));
-  void Initialize(RNG& rng) {
+        }
    previous_layer_trainer_->Initialize(rng);
  }
-  // forward propagation
+        // Set options such as hyperparameters
-  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+        void SendMessage(Message* message) {
-    if (output_.size() < kOutputDimensions * batch.size()) {
+            previous_layer_trainer_->SendMessage(message);
-      output_.resize(kOutputDimensions * batch.size());
+            if (ReceiveMessage("check_health", message)) {
-      gradients_.resize(kInputDimensions * batch.size());
+                CheckHealth();
-    }
+            }
-    const auto input = previous_layer_trainer_->Propagate(batch);
+        }
    batch_size_ = static_cast<IndexType>(batch.size());
    for (IndexType b = 0; b < batch_size_; ++b) {
      const IndexType batch_offset = kOutputDimensions * b;
      for (IndexType i = 0; i < kOutputDimensions; ++i) {
        const IndexType index = batch_offset + i;
        output_[index] = std::max(+kZero, std::min(+kOne, input[index]));
        min_activations_[i] = std::min(min_activations_[i], output_[index]);
        max_activations_[i] = std::max(max_activations_[i], output_[index]);
      }
    }
    return output_.data();
  }
-  // backpropagation
+        // Initialize the parameters with random numbers
-  void Backpropagate(const LearnFloatType* gradients,
+        template <typename RNG>
-                     LearnFloatType learning_rate) {
+        void Initialize(RNG& rng) {
-    for (IndexType b = 0; b < batch_size_; ++b) {
+            previous_layer_trainer_->Initialize(rng);
-      const IndexType batch_offset = kOutputDimensions * b;
+        }
      for (IndexType i = 0; i < kOutputDimensions; ++i) {
        const IndexType index = batch_offset + i;
        gradients_[index] = gradients[index] *
            (output_[index] > kZero) * (output_[index] < kOne);
      }
    }
    previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate);
  }
- private:
+        // forward propagation
-  // constructor
+        const LearnFloatType* Propagate(const std::vector<Example>& batch) {
-  Trainer(LayerType* target_layer, FeatureTransformer* ft) :
+            if (output_.size() < kOutputDimensions * batch.size()) {
-      batch_size_(0),
+              output_.resize(kOutputDimensions * batch.size());
-      previous_layer_trainer_(Trainer<PreviousLayer>::Create(
+              gradients_.resize(kInputDimensions * batch.size());
-          &target_layer->previous_layer_, ft)),
+            }
      target_layer_(target_layer) {
    std::fill(std::begin(min_activations_), std::end(min_activations_),
              std::numeric_limits<LearnFloatType>::max());
    std::fill(std::begin(max_activations_), std::end(max_activations_),
              std::numeric_limits<LearnFloatType>::lowest());
  }
-  // Check if there are any problems with learning
+            const auto input = previous_layer_trainer_->Propagate(batch);
-  void CheckHealth() {
+            batch_size_ = static_cast<IndexType>(batch.size());
-    const auto largest_min_activation = *std::max_element(
+            for (IndexType b = 0; b < batch_size_; ++b) {
-        std::begin(min_activations_), std::end(min_activations_));
+                const IndexType batch_offset = kOutputDimensions * b;
-    const auto smallest_max_activation = *std::min_element(
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        std::begin(max_activations_), std::end(max_activations_));
+                    const IndexType index = batch_offset + i;
-    std::cout << "INFO: largest min activation = " << largest_min_activation
+                    output_[index] = std::max(+kZero, std::min(+kOne, input[index]));
-              << ", smallest max activation = " << smallest_max_activation
+                    min_activations_[i] = std::min(min_activations_[i], output_[index]);
-              << std::endl;
+                    max_activations_[i] = std::max(max_activations_[i], output_[index]);
                }
            }
-    std::fill(std::begin(min_activations_), std::end(min_activations_),
+            return output_.data();
-              std::numeric_limits<LearnFloatType>::max());
+        }
    std::fill(std::begin(max_activations_), std::end(max_activations_),
              std::numeric_limits<LearnFloatType>::lowest());
  }
-  // number of input/output dimensions
+        // backpropagation
-  static constexpr IndexType kInputDimensions = LayerType::kOutputDimensions;
+        void Backpropagate(const LearnFloatType* gradients,
-  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+                           LearnFloatType learning_rate) {
-  // LearnFloatType constant
+            for (IndexType b = 0; b < batch_size_; ++b) {
-  static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
+                const IndexType batch_offset = kOutputDimensions * b;
-  static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
                    const IndexType index = batch_offset + i;
                    gradients_[index] = gradients[index] *
                        (output_[index] > kZero) * (output_[index] < kOne);
                }
            }
-  // number of samples in mini-batch
+            previous_layer_trainer_->Backpropagate(gradients_.data(), learning_rate);
-  IndexType batch_size_;
+        }
-  // Trainer of the previous layer
+    private:
-  const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+        // constructor
        Trainer(LayerType* target_layer, FeatureTransformer* ft) :
            batch_size_(0),
            previous_layer_trainer_(Trainer<PreviousLayer>::Create(
                &target_layer->previous_layer_, ft)),
            target_layer_(target_layer) {
-  // layer to learn
+            std::fill(std::begin(min_activations_), std::end(min_activations_),
-  LayerType* const target_layer_;
+                      std::numeric_limits<LearnFloatType>::max());
            std::fill(std::begin(max_activations_), std::end(max_activations_),
                      std::numeric_limits<LearnFloatType>::lowest());
        }
-  // Forward propagation buffer
+        // Check if there are any problems with learning
-  std::vector<LearnFloatType> output_;
+        void CheckHealth() {
            const auto largest_min_activation = *std::max_element(
                std::begin(min_activations_), std::end(min_activations_));
            const auto smallest_max_activation = *std::min_element(
                std::begin(max_activations_), std::end(max_activations_));
-  // buffer for back propagation
+            std::cout << "INFO: largest min activation = " << largest_min_activation
-  std::vector<LearnFloatType> gradients_;
+                      << ", smallest max activation = " << smallest_max_activation
                      << std::endl;
-  // Health check statistics
+            std::fill(std::begin(min_activations_), std::end(min_activations_),
-  LearnFloatType min_activations_[kOutputDimensions];
+                      std::numeric_limits<LearnFloatType>::max());
-  LearnFloatType max_activations_[kOutputDimensions];
+            std::fill(std::begin(max_activations_), std::end(max_activations_),
-};
+                      std::numeric_limits<LearnFloatType>::lowest());
        }
-}  // namespace NNUE
+        // number of input/output dimensions
        static constexpr IndexType kInputDimensions = LayerType::kOutputDimensions;
        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
-}  // namespace Eval
+        // LearnFloatType constant
        static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
        static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
        // number of samples in mini-batch
        IndexType batch_size_;
        // Trainer of the previous layer
        const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
        // layer to learn
        LayerType* const target_layer_;
        // Forward propagation buffer
        std::vector<LearnFloatType> output_;
        // buffer for back propagation
        std::vector<LearnFloatType> gradients_;
        // Health check statistics
        LearnFloatType min_activations_[kOutputDimensions];
        LearnFloatType max_activations_[kOutputDimensions];
    };
 }  // namespace Eval::NNUE
 #endif
@@ -1,13 +1,14 @@
-// Specialization for feature transformer of learning class template of NNUE evaluation function
+#ifndef _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
 #ifndef _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
 #define _NNUE_TRAINER_FEATURE_TRANSFORMER_H_
 #include "../../learn/learn.h"
 #include "../nnue_feature_transformer.h"
 #include "trainer.h"
 #include "features/factorizer_feature_set.h"
 #include "learn/learn.h"
 #include "nnue/nnue_feature_transformer.h"
 #include <array>
 #include <bitset>
 #include <numeric>
@@ -18,356 +19,392 @@
 #include <omp.h>
 #endif
-namespace Eval {
+// Specialization for feature transformer of learning class template of NNUE evaluation function
 namespace Eval::NNUE {
-namespace NNUE {
+    // Learning: Input feature converter
    template <>
    class Trainer<FeatureTransformer> {
    private:
        // Type of layer to learn
        using LayerType = FeatureTransformer;
-// Learning: Input feature converter
+    public:
-template <>
+        template <typename T>
-class Trainer<FeatureTransformer> {
+        friend struct AlignedDeleter;
 private:
  // Type of layer to learn
  using LayerType = FeatureTransformer;
- public:
+        template <typename T, typename... ArgumentTypes>
-  template <typename T>
+        friend std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments);
  friend struct AlignedDeleter;
  template <typename T, typename... ArgumentTypes>
  friend std::shared_ptr<T> MakeAlignedSharedPtr(ArgumentTypes&&... arguments);
-  // factory function
+        // factory function
-  static std::shared_ptr<Trainer> Create(LayerType* target_layer) {
+        static std::shared_ptr<Trainer> Create(LayerType* target_layer) {
-    return MakeAlignedSharedPtr<Trainer>(target_layer);
+            return MakeAlignedSharedPtr<Trainer>(target_layer);
-  }
+        }
-  // Set options such as hyperparameters
+        // Set options such as hyperparameters
-  void SendMessage(Message* message) {
+        void SendMessage(Message* message) {
-    if (ReceiveMessage("momentum", message)) {
+            if (ReceiveMessage("momentum", message)) {
-      momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
+                momentum_ = static_cast<LearnFloatType>(std::stod(message->value));
-    }
+            }
    if (ReceiveMessage("learning_rate_scale", message)) {
      learning_rate_scale_ =
          static_cast<LearnFloatType>(std::stod(message->value));
    }
    if (ReceiveMessage("reset", message)) {
      DequantizeParameters();
    }
    if (ReceiveMessage("quantize_parameters", message)) {
      QuantizeParameters();
    }
    if (ReceiveMessage("clear_unobserved_feature_weights", message)) {
      ClearUnobservedFeatureWeights();
    }
    if (ReceiveMessage("check_health", message)) {
      CheckHealth();
    }
  }
-  // Initialize the parameters with random numbers
+            if (ReceiveMessage("learning_rate_scale", message)) {
-  template <typename RNG>
+                learning_rate_scale_ =
-  void Initialize(RNG& rng) {
+                    static_cast<LearnFloatType>(std::stod(message->value));
-    std::fill(std::begin(weights_), std::end(weights_), +kZero);
+            }
    const double kSigma = 0.1 / std::sqrt(RawFeatures::kMaxActiveDimensions);
    auto distribution = std::normal_distribution<double>(0.0, kSigma);
    for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) {
      const auto weight = static_cast<LearnFloatType>(distribution(rng));
      weights_[i] = weight;
    }
    for (IndexType i = 0; i < kHalfDimensions; ++i) {
      biases_[i] = static_cast<LearnFloatType>(0.5);
    }
    QuantizeParameters();
  }
-  // forward propagation
+            if (ReceiveMessage("reset", message)) {
-  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+                DequantizeParameters();
-    if (output_.size() < kOutputDimensions * batch.size()) {
+            }
-      output_.resize(kOutputDimensions * batch.size());
+
-      gradients_.resize(kOutputDimensions * batch.size());
+            if (ReceiveMessage("quantize_parameters", message)) {
-    }
+                QuantizeParameters();
-    batch_ = &batch;
+            }
-    // affine transform
+
            if (ReceiveMessage("clear_unobserved_feature_weights", message)) {
                ClearUnobservedFeatureWeights();
            }
            if (ReceiveMessage("check_health", message)) {
                CheckHealth();
            }
        }
        // Initialize the parameters with random numbers
        template <typename RNG>
        void Initialize(RNG& rng) {
            std::fill(std::begin(weights_), std::end(weights_), +kZero);
            const double kSigma = 0.1 / std::sqrt(RawFeatures::kMaxActiveDimensions);
            auto distribution = std::normal_distribution<double>(0.0, kSigma);
            for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) {
                const auto weight = static_cast<LearnFloatType>(distribution(rng));
                weights_[i] = weight;
            }
            for (IndexType i = 0; i < kHalfDimensions; ++i) {
                biases_[i] = static_cast<LearnFloatType>(0.5);
            }
            QuantizeParameters();
        }
        // forward propagation
        const LearnFloatType* Propagate(const std::vector<Example>& batch) {
            if (output_.size() < kOutputDimensions * batch.size()) {
                output_.resize(kOutputDimensions * batch.size());
                gradients_.resize(kOutputDimensions * batch.size());
            }
            batch_ = &batch;
            // affine transform
 #pragma omp parallel for
-    for (IndexType b = 0; b < batch.size(); ++b) {
+            for (IndexType b = 0; b < batch.size(); ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
+                const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType c = 0; c < 2; ++c) {
+                for (IndexType c = 0; c < 2; ++c) {
-        const IndexType output_offset = batch_offset + kHalfDimensions * c;
+                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
 #if defined(USE_BLAS)
-        cblas_scopy(kHalfDimensions, biases_, 1, &output_[output_offset], 1);
+                    cblas_scopy(kHalfDimensions, biases_, 1, &output_[output_offset], 1);
-        for (const auto& feature : batch[b].training_features[c]) {
+                    for (const auto& feature : batch[b].training_features[c]) {
-          const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
+                        const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
-          cblas_saxpy(kHalfDimensions, (float)feature.GetCount(),
+                        cblas_saxpy(kHalfDimensions, (float)feature.GetCount(),
-                      &weights_[weights_offset], 1, &output_[output_offset], 1);
+                                    &weights_[weights_offset], 1, &output_[output_offset], 1);
-        }
+                    }
 #else
-        for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-          output_[output_offset + i] = biases_[i];
+                        output_[output_offset + i] = biases_[i];
-        }
+                    }
-        for (const auto& feature : batch[b].training_features[c]) {
+                    for (const auto& feature : batch[b].training_features[c]) {
-          const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
+                        const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
-          for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                        for (IndexType i = 0; i < kHalfDimensions; ++i) {
-            output_[output_offset + i] +=
+                            output_[output_offset + i] +=
-                feature.GetCount() * weights_[weights_offset + i];
+                                feature.GetCount() * weights_[weights_offset + i];
-          }
+                        }
-        }
+                    }
 #endif
-      }
+                }
-    }
+            }
    // clipped ReLU
    for (IndexType b = 0; b < batch.size(); ++b) {
      const IndexType batch_offset = kOutputDimensions * b;
      for (IndexType i = 0; i < kOutputDimensions; ++i) {
        const IndexType index = batch_offset + i;
        min_pre_activation_ = std::min(min_pre_activation_, output_[index]);
        max_pre_activation_ = std::max(max_pre_activation_, output_[index]);
        output_[index] = std::max(+kZero, std::min(+kOne, output_[index]));
        const IndexType t = i % kHalfDimensions;
        min_activations_[t] = std::min(min_activations_[t], output_[index]);
        max_activations_[t] = std::max(max_activations_[t], output_[index]);
      }
    }
    return output_.data();
  }
-  // backpropagation
+            // clipped ReLU
-  void Backpropagate(const LearnFloatType* gradients,
+            for (IndexType b = 0; b < batch.size(); ++b) {
-                     LearnFloatType learning_rate) {
+                const IndexType batch_offset = kOutputDimensions * b;
-    const LearnFloatType local_learning_rate =
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        learning_rate * learning_rate_scale_;
+                    const IndexType index = batch_offset + i;
-    for (IndexType b = 0; b < batch_->size(); ++b) {
+                    min_pre_activation_ = std::min(min_pre_activation_, output_[index]);
-      const IndexType batch_offset = kOutputDimensions * b;
+                    max_pre_activation_ = std::max(max_pre_activation_, output_[index]);
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                    output_[index] = std::max(+kZero, std::min(+kOne, output_[index]));
-        const IndexType index = batch_offset + i;
+                    const IndexType t = i % kHalfDimensions;
-        gradients_[index] = gradients[index] *
+                    min_activations_[t] = std::min(min_activations_[t], output_[index]);
-            ((output_[index] > kZero) * (output_[index] < kOne));
+                    max_activations_[t] = std::max(max_activations_[t], output_[index]);
-      }
+                }
-    }
+            }
-    // Since the weight matrix updates only the columns corresponding to the features that appeared in the input,
+
-    // Correct the learning rate and adjust the scale without using momentum
+            return output_.data();
-    const LearnFloatType effective_learning_rate =
+        }
-        static_cast<LearnFloatType>(local_learning_rate / (1.0 - momentum_));
+
        // backpropagation
        void Backpropagate(const LearnFloatType* gradients,
                           LearnFloatType learning_rate) {
            const LearnFloatType local_learning_rate =
                learning_rate * learning_rate_scale_;
            for (IndexType b = 0; b < batch_->size(); ++b) {
                const IndexType batch_offset = kOutputDimensions * b;
                for (IndexType i = 0; i < kOutputDimensions; ++i) {
                    const IndexType index = batch_offset + i;
                    gradients_[index] = gradients[index] *
                        ((output_[index] > kZero) * (output_[index] < kOne));
                }
            }
            // Since the weight matrix updates only the columns corresponding to the features that appeared in the input,
            // Correct the learning rate and adjust the scale without using momentum
            const LearnFloatType effective_learning_rate =
                static_cast<LearnFloatType>(local_learning_rate / (1.0 - momentum_));
 #if defined(USE_BLAS)
-    cblas_sscal(kHalfDimensions, momentum_, biases_diff_, 1);
+            cblas_sscal(kHalfDimensions, momentum_, biases_diff_, 1);
-    for (IndexType b = 0; b < batch_->size(); ++b) {
+            for (IndexType b = 0; b < batch_->size(); ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
+                const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType c = 0; c < 2; ++c) {
+                for (IndexType c = 0; c < 2; ++c) {
-        const IndexType output_offset = batch_offset + kHalfDimensions * c;
+                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
-        cblas_saxpy(kHalfDimensions, 1.0,
+                    cblas_saxpy(kHalfDimensions, 1.0,
-                    &gradients_[output_offset], 1, biases_diff_, 1);
+                                &gradients_[output_offset], 1, biases_diff_, 1);
-      }
+                }
-    }
+            }
-    cblas_saxpy(kHalfDimensions, -local_learning_rate,
+
-                biases_diff_, 1, biases_, 1);
+            cblas_saxpy(kHalfDimensions, -local_learning_rate,
                        biases_diff_, 1, biases_, 1);
 #pragma omp parallel
-    {
+            {
 #if defined(_OPENMP)
-      const IndexType num_threads = omp_get_num_threads();
+                const IndexType num_threads = omp_get_num_threads();
-      const IndexType thread_index = omp_get_thread_num();
+                const IndexType thread_index = omp_get_thread_num();
 #endif
-      for (IndexType b = 0; b < batch_->size(); ++b) {
+                for (IndexType b = 0; b < batch_->size(); ++b) {
-        const IndexType batch_offset = kOutputDimensions * b;
+                    const IndexType batch_offset = kOutputDimensions * b;
-        for (IndexType c = 0; c < 2; ++c) {
+                    for (IndexType c = 0; c < 2; ++c) {
-          const IndexType output_offset = batch_offset + kHalfDimensions * c;
+                        const IndexType output_offset = batch_offset + kHalfDimensions * c;
-          for (const auto& feature : (*batch_)[b].training_features[c]) {
+                        for (const auto& feature : (*batch_)[b].training_features[c]) {
 #if defined(_OPENMP)
-            if (feature.GetIndex() % num_threads != thread_index) continue;
+                            if (feature.GetIndex() % num_threads != thread_index)
                                continue;
 #endif
-            const IndexType weights_offset =
+                            const IndexType weights_offset =
-                kHalfDimensions * feature.GetIndex();
+                                kHalfDimensions * feature.GetIndex();
-            const auto scale = static_cast<LearnFloatType>(
+                            const auto scale = static_cast<LearnFloatType>(
-                effective_learning_rate / feature.GetCount());
+                                effective_learning_rate / feature.GetCount());
-            cblas_saxpy(kHalfDimensions, -scale,
+
-                        &gradients_[output_offset], 1,
+                            cblas_saxpy(kHalfDimensions, -scale,
-                        &weights_[weights_offset], 1);
+                                        &gradients_[output_offset], 1,
-          }
+                                        &weights_[weights_offset], 1);
-        }
+                        }
-      }
+                    }
-    }
+                }
            }
 #else
-    for (IndexType i = 0; i < kHalfDimensions; ++i) {
+            for (IndexType i = 0; i < kHalfDimensions; ++i) {
-      biases_diff_[i] *= momentum_;
+                biases_diff_[i] *= momentum_;
-    }
+            }
-    for (IndexType b = 0; b < batch_->size(); ++b) {
+
-      const IndexType batch_offset = kOutputDimensions * b;
+            for (IndexType b = 0; b < batch_->size(); ++b) {
-      for (IndexType c = 0; c < 2; ++c) {
+                const IndexType batch_offset = kOutputDimensions * b;
-        const IndexType output_offset = batch_offset + kHalfDimensions * c;
+                for (IndexType c = 0; c < 2; ++c) {
-        for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
-          biases_diff_[i] += gradients_[output_offset + i];
+                    for (IndexType i = 0; i < kHalfDimensions; ++i) {
-        }
+                        biases_diff_[i] += gradients_[output_offset + i];
-      }
+                    }
-    }
+                }
-    for (IndexType i = 0; i < kHalfDimensions; ++i) {
+            }
-      biases_[i] -= local_learning_rate * biases_diff_[i];
+
-    }
+            for (IndexType i = 0; i < kHalfDimensions; ++i) {
-    for (IndexType b = 0; b < batch_->size(); ++b) {
+                biases_[i] -= local_learning_rate * biases_diff_[i];
-      const IndexType batch_offset = kOutputDimensions * b;
+            }
-      for (IndexType c = 0; c < 2; ++c) {
+
-        const IndexType output_offset = batch_offset + kHalfDimensions * c;
+            for (IndexType b = 0; b < batch_->size(); ++b) {
-        for (const auto& feature : (*batch_)[b].training_features[c]) {
+                const IndexType batch_offset = kOutputDimensions * b;
-          const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
+                for (IndexType c = 0; c < 2; ++c) {
-          const auto scale = static_cast<LearnFloatType>(
+                    const IndexType output_offset = batch_offset + kHalfDimensions * c;
-              effective_learning_rate / feature.GetCount());
+                    for (const auto& feature : (*batch_)[b].training_features[c]) {
-          for (IndexType i = 0; i < kHalfDimensions; ++i) {
+                        const IndexType weights_offset = kHalfDimensions * feature.GetIndex();
-            weights_[weights_offset + i] -=
+                        const auto scale = static_cast<LearnFloatType>(
-                scale * gradients_[output_offset + i];
+                            effective_learning_rate / feature.GetCount());
-          }
+
-        }
+                        for (IndexType i = 0; i < kHalfDimensions; ++i) {
-      }
+                            weights_[weights_offset + i] -=
-    }
+                                scale * gradients_[output_offset + i];
                        }
                    }
                }
            }
 #endif
-    for (IndexType b = 0; b < batch_->size(); ++b) {
+            for (IndexType b = 0; b < batch_->size(); ++b) {
-      for (IndexType c = 0; c < 2; ++c) {
+                for (IndexType c = 0; c < 2; ++c) {
-        for (const auto& feature : (*batch_)[b].training_features[c]) {
+                    for (const auto& feature : (*batch_)[b].training_features[c]) {
-          observed_features.set(feature.GetIndex());
+                        observed_features.set(feature.GetIndex());
                    }
                }
            }
        }
      }
    }
  }
- private:
+    private:
-  // constructor
+        // constructor
-  Trainer(LayerType* target_layer) :
+        Trainer(LayerType* target_layer) :
-      batch_(nullptr),
+            batch_(nullptr),
-      target_layer_(target_layer),
+            target_layer_(target_layer),
-      biases_(),
+            biases_(),
-      weights_(),
+            weights_(),
-      biases_diff_(),
+            biases_diff_(),
-      momentum_(0.2),
+            momentum_(0.2),
-      learning_rate_scale_(1.0) {
+            learning_rate_scale_(1.0) {
-    min_pre_activation_ = std::numeric_limits<LearnFloatType>::max();
+
-    max_pre_activation_ = std::numeric_limits<LearnFloatType>::lowest();
+            min_pre_activation_ = std::numeric_limits<LearnFloatType>::max();
-    std::fill(std::begin(min_activations_), std::end(min_activations_),
+            max_pre_activation_ = std::numeric_limits<LearnFloatType>::lowest();
-              std::numeric_limits<LearnFloatType>::max());
+
-    std::fill(std::begin(max_activations_), std::end(max_activations_),
+            std::fill(std::begin(min_activations_), std::end(min_activations_),
-              std::numeric_limits<LearnFloatType>::lowest());
+                      std::numeric_limits<LearnFloatType>::max());
-    DequantizeParameters();
+            std::fill(std::begin(max_activations_), std::end(max_activations_),
-  }
+                      std::numeric_limits<LearnFloatType>::lowest());
            DequantizeParameters();
        }
        // Weight saturation and parameterization
        void QuantizeParameters() {
            for (IndexType i = 0; i < kHalfDimensions; ++i) {
                target_layer_->biases_[i] =
                    Round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
            }
            std::vector<TrainingFeature> training_features;
  // Weight saturation and parameterization
  void QuantizeParameters() {
    for (IndexType i = 0; i < kHalfDimensions; ++i) {
      target_layer_->biases_[i] =
          Round<typename LayerType::BiasType>(biases_[i] * kBiasScale);
    }
    std::vector<TrainingFeature> training_features;
 #pragma omp parallel for private(training_features)
-    for (IndexType j = 0; j < RawFeatures::kDimensions; ++j) {
+            for (IndexType j = 0; j < RawFeatures::kDimensions; ++j) {
-      training_features.clear();
+                training_features.clear();
-      Features::Factorizer<RawFeatures>::AppendTrainingFeatures(
+                Features::Factorizer<RawFeatures>::AppendTrainingFeatures(
-          j, &training_features);
+                    j, &training_features);
-      for (IndexType i = 0; i < kHalfDimensions; ++i) {
+
-        double sum = 0.0;
+                for (IndexType i = 0; i < kHalfDimensions; ++i) {
-        for (const auto& feature : training_features) {
+                    double sum = 0.0;
-          sum += weights_[kHalfDimensions * feature.GetIndex() + i];
+                    for (const auto& feature : training_features) {
                        sum += weights_[kHalfDimensions * feature.GetIndex() + i];
                    }
                    target_layer_->weights_[kHalfDimensions * j + i] =
                        Round<typename LayerType::WeightType>(sum * kWeightScale);
                }
            }
        }
        target_layer_->weights_[kHalfDimensions * j + i] =
            Round<typename LayerType::WeightType>(sum * kWeightScale);
      }
    }
  }
-  // read parameterized integer
+        // read parameterized integer
-  void DequantizeParameters() {
+        void DequantizeParameters() {
-    for (IndexType i = 0; i < kHalfDimensions; ++i) {
+            for (IndexType i = 0; i < kHalfDimensions; ++i) {
-      biases_[i] = static_cast<LearnFloatType>(
+                biases_[i] = static_cast<LearnFloatType>(
-          target_layer_->biases_[i] / kBiasScale);
+                    target_layer_->biases_[i] / kBiasScale);
-    }
+            }
    std::fill(std::begin(weights_), std::end(weights_), +kZero);
    for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) {
      weights_[i] = static_cast<LearnFloatType>(
          target_layer_->weights_[i] / kWeightScale);
    }
    std::fill(std::begin(biases_diff_), std::end(biases_diff_), +kZero);
  }
-  // Set the weight corresponding to the feature that does not appear in the learning data to 0
+            std::fill(std::begin(weights_), std::end(weights_), +kZero);
  void ClearUnobservedFeatureWeights() {
    for (IndexType i = 0; i < kInputDimensions; ++i) {
      if (!observed_features.test(i)) {
        std::fill(std::begin(weights_) + kHalfDimensions * i,
                  std::begin(weights_) + kHalfDimensions * (i + 1), +kZero);
      }
    }
    QuantizeParameters();
  }
-  // Check if there are any problems with learning
+            for (IndexType i = 0; i < kHalfDimensions * RawFeatures::kDimensions; ++i) {
-  void CheckHealth() {
+                weights_[i] = static_cast<LearnFloatType>(
-    std::cout << "INFO: observed " << observed_features.count()
+                    target_layer_->weights_[i] / kWeightScale);
-              << " (out of " << kInputDimensions << ") features" << std::endl;
+            }
-    constexpr LearnFloatType kPreActivationLimit =
+            std::fill(std::begin(biases_diff_), std::end(biases_diff_), +kZero);
-        std::numeric_limits<typename LayerType::WeightType>::max() /
+        }
        kWeightScale;
    std::cout << "INFO: (min, max) of pre-activations = "
              << min_pre_activation_ << ", "
              << max_pre_activation_ << " (limit = "
              << kPreActivationLimit << ")" << std::endl;
-    const auto largest_min_activation = *std::max_element(
+        // Set the weight corresponding to the feature that does not appear in the learning data to 0
-        std::begin(min_activations_), std::end(min_activations_));
+        void ClearUnobservedFeatureWeights() {
-    const auto smallest_max_activation = *std::min_element(
+            for (IndexType i = 0; i < kInputDimensions; ++i) {
-        std::begin(max_activations_), std::end(max_activations_));
+                if (!observed_features.test(i)) {
-    std::cout << "INFO: largest min activation = " << largest_min_activation
+                    std::fill(std::begin(weights_) + kHalfDimensions * i,
-              << ", smallest max activation = " << smallest_max_activation
+                              std::begin(weights_) + kHalfDimensions * (i + 1), +kZero);
-              << std::endl;
+                }
            }
-    std::fill(std::begin(min_activations_), std::end(min_activations_),
+            QuantizeParameters();
-              std::numeric_limits<LearnFloatType>::max());
+        }
    std::fill(std::begin(max_activations_), std::end(max_activations_),
              std::numeric_limits<LearnFloatType>::lowest());
  }
-  // number of input/output dimensions
+        // Check if there are any problems with learning
-  static constexpr IndexType kInputDimensions =
+        void CheckHealth() {
-      Features::Factorizer<RawFeatures>::GetDimensions();
+            std::cout << "INFO: observed " << observed_features.count()
-  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+                      << " (out of " << kInputDimensions << ") features" << std::endl;
  static constexpr IndexType kHalfDimensions = LayerType::kHalfDimensions;
-  // Coefficient used for parameterization
+            constexpr LearnFloatType kPreActivationLimit =
-  static constexpr LearnFloatType kActivationScale =
+                std::numeric_limits<typename LayerType::WeightType>::max() /
-      std::numeric_limits<std::int8_t>::max();
+                kWeightScale;
  static constexpr LearnFloatType kBiasScale = kActivationScale;
  static constexpr LearnFloatType kWeightScale = kActivationScale;
-  // LearnFloatType constant
+            std::cout << "INFO: (min, max) of pre-activations = "
-  static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
+                      << min_pre_activation_ << ", "
-  static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
+                      << max_pre_activation_ << " (limit = "
                      << kPreActivationLimit << ")" << std::endl;
-  // mini batch
+            const auto largest_min_activation = *std::max_element(
-  const std::vector<Example>* batch_;
+                std::begin(min_activations_), std::end(min_activations_));
            const auto smallest_max_activation = *std::min_element(
                std::begin(max_activations_), std::end(max_activations_));
-  // layer to learn
+            std::cout << "INFO: largest min activation = " << largest_min_activation
-  LayerType* const target_layer_;
+                      << ", smallest max activation = " << smallest_max_activation
                      << std::endl;
-  // parameter
+            std::fill(std::begin(min_activations_), std::end(min_activations_),
-  alignas(kCacheLineSize) LearnFloatType biases_[kHalfDimensions];
+                      std::numeric_limits<LearnFloatType>::max());
-  alignas(kCacheLineSize)
+            std::fill(std::begin(max_activations_), std::end(max_activations_),
-      LearnFloatType weights_[kHalfDimensions * kInputDimensions];
+                      std::numeric_limits<LearnFloatType>::lowest());
        }
-  // Buffer used for updating parameters
+        // number of input/output dimensions
-  LearnFloatType biases_diff_[kHalfDimensions];
+        static constexpr IndexType kInputDimensions =
-  std::vector<LearnFloatType> gradients_;
+            Features::Factorizer<RawFeatures>::GetDimensions();
        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
        static constexpr IndexType kHalfDimensions = LayerType::kHalfDimensions;
-  // Forward propagation buffer
+        // Coefficient used for parameterization
-  std::vector<LearnFloatType> output_;
+        static constexpr LearnFloatType kActivationScale =
            std::numeric_limits<std::int8_t>::max();
        static constexpr LearnFloatType kBiasScale = kActivationScale;
        static constexpr LearnFloatType kWeightScale = kActivationScale;
-  // Features that appeared in the training data
+        // LearnFloatType constant
-  std::bitset<kInputDimensions> observed_features;
+        static constexpr LearnFloatType kZero = static_cast<LearnFloatType>(0.0);
        static constexpr LearnFloatType kOne = static_cast<LearnFloatType>(1.0);
-  // hyper parameter
+        // mini batch
-  LearnFloatType momentum_;
+        const std::vector<Example>* batch_;
  LearnFloatType learning_rate_scale_;
-  // Health check statistics
+        // layer to learn
-  LearnFloatType min_pre_activation_;
+        LayerType* const target_layer_;
  LearnFloatType max_pre_activation_;
  LearnFloatType min_activations_[kHalfDimensions];
  LearnFloatType max_activations_[kHalfDimensions];
 };
-}  // namespace NNUE
+        // parameter
        alignas(kCacheLineSize) LearnFloatType biases_[kHalfDimensions];
        alignas(kCacheLineSize)
            LearnFloatType weights_[kHalfDimensions * kInputDimensions];
-}  // namespace Eval
+        // Buffer used for updating parameters
        LearnFloatType biases_diff_[kHalfDimensions];
        std::vector<LearnFloatType> gradients_;
        // Forward propagation buffer
        std::vector<LearnFloatType> output_;
        // Features that appeared in the training data
        std::bitset<kInputDimensions> observed_features;
        // hyper parameter
        LearnFloatType momentum_;
        LearnFloatType learning_rate_scale_;
        // Health check statistics
        LearnFloatType min_pre_activation_;
        LearnFloatType max_pre_activation_;
        LearnFloatType min_activations_[kHalfDimensions];
        LearnFloatType max_activations_[kHalfDimensions];
    };
 }  // namespace Eval::NNUE
 #endif
@@ -1,247 +1,267 @@
-// Specialization of NNUE evaluation function learning class template for InputSlice
+#ifndef _NNUE_TRAINER_INPUT_SLICE_H_
 #ifndef _NNUE_TRAINER_INPUT_SLICE_H_
 #define _NNUE_TRAINER_INPUT_SLICE_H_
 #include "../../learn/learn.h"
 #include "../layers/input_slice.h"
 #include "trainer.h"
-namespace Eval {
+#include "learn/learn.h"
-namespace NNUE {
+#include "nnue/layers/input_slice.h"
-// Learning: Input layer
+// Specialization of NNUE evaluation function learning class template for InputSlice
-class SharedInputTrainer {
+namespace Eval::NNUE {
 public:
  // Factory function: hands out the single shared SharedInputTrainer.
  // The instance is built on the first call (from that call's `ft`) and
  // every call, including the first, bumps its referrer count.
  static std::shared_ptr<SharedInputTrainer> Create(
      FeatureTransformer* ft) {
    static std::shared_ptr<SharedInputTrainer> instance;
    if (instance == nullptr) {
      // The constructor is private, so std::make_shared cannot be used here.
      instance = std::shared_ptr<SharedInputTrainer>(
          new SharedInputTrainer(ft));
    }
    instance->num_referrers_ += 1;
    return instance;
  }
-  // Set options such as hyperparameters
+    // Learning: Input layer
-  void SendMessage(Message* message) {
+    class SharedInputTrainer {
-    if (num_calls_ == 0) {
+    public:
-      current_operation_ = Operation::kSendMessage;
+        // factory function
-      feature_transformer_trainer_->SendMessage(message);
+        static std::shared_ptr<SharedInputTrainer> Create(
-    }
+            FeatureTransformer* ft) {
    assert(current_operation_ == Operation::kSendMessage);
    if (++num_calls_ == num_referrers_) {
      num_calls_ = 0;
      current_operation_ = Operation::kNone;
    }
  }
-  // Initialize the parameters with random numbers
+            static std::shared_ptr<SharedInputTrainer> instance;
  // Initializes the underlying feature transformer trainer with `rng`.
  // Only the first call of a round forwards the request; later calls just
  // count themselves. Once num_referrers_ calls have been seen, the call
  // counter and current operation are reset.
  template <typename RNG>
  void Initialize(RNG& rng) {
    const bool first_call_of_round = (num_calls_ == 0);
    if (first_call_of_round) {
      current_operation_ = Operation::kInitialize;
      feature_transformer_trainer_->Initialize(rng);
    }
    assert(current_operation_ == Operation::kInitialize);
    num_calls_ += 1;
    if (num_calls_ != num_referrers_) {
      return;  // still waiting for the remaining referrers
    }
    num_calls_ = 0;
    current_operation_ = Operation::kNone;
  }
-  // forward propagation
+            if (!instance) {
-  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+                instance.reset(new SharedInputTrainer(ft));
-    if (gradients_.size() < kInputDimensions * batch.size()) {
+            }
      gradients_.resize(kInputDimensions * batch.size());
    }
    batch_size_ = static_cast<IndexType>(batch.size());
    if (num_calls_ == 0) {
      current_operation_ = Operation::kPropagate;
      output_ = feature_transformer_trainer_->Propagate(batch);
    }
    assert(current_operation_ == Operation::kPropagate);
    if (++num_calls_ == num_referrers_) {
      num_calls_ = 0;
      current_operation_ = Operation::kNone;
    }
    return output_;
  }
-  // backpropagation
+            ++instance->num_referrers_;
-  void Backpropagate(const LearnFloatType* gradients,
+
-                     LearnFloatType learning_rate) {
+            return instance;
    if (num_referrers_ == 1) {
      feature_transformer_trainer_->Backpropagate(gradients, learning_rate);
      return;
    }
    if (num_calls_ == 0) {
      current_operation_ = Operation::kBackPropagate;
      for (IndexType b = 0; b < batch_size_; ++b) {
        const IndexType batch_offset = kInputDimensions * b;
        for (IndexType i = 0; i < kInputDimensions; ++i) {
          gradients_[batch_offset + i] = static_cast<LearnFloatType>(0.0);
        }
      }
    }
    assert(current_operation_ == Operation::kBackPropagate);
    for (IndexType b = 0; b < batch_size_; ++b) {
      const IndexType batch_offset = kInputDimensions * b;
      for (IndexType i = 0; i < kInputDimensions; ++i) {
        gradients_[batch_offset + i] += gradients[batch_offset + i];
      }
    }
    if (++num_calls_ == num_referrers_) {
      feature_transformer_trainer_->Backpropagate(
          gradients_.data(), learning_rate);
      num_calls_ = 0;
      current_operation_ = Operation::kNone;
    }
  }
- private:
+        // Set options such as hyperparameters
-  // constructor
+        void SendMessage(Message* message) {
-  SharedInputTrainer(FeatureTransformer* ft) :
+            if (num_calls_ == 0) {
-      batch_size_(0),
+                current_operation_ = Operation::kSendMessage;
-      num_referrers_(0),
+                feature_transformer_trainer_->SendMessage(message);
-      num_calls_(0),
+            }
      current_operation_(Operation::kNone),
      feature_transformer_trainer_(Trainer<FeatureTransformer>::Create(
          ft)),
      output_(nullptr) {
  }
-  // number of input/output dimensions
+            assert(current_operation_ == Operation::kSendMessage);
  static constexpr IndexType kInputDimensions =
      FeatureTransformer::kOutputDimensions;
-  // type of processing
+            if (++num_calls_ == num_referrers_) {
-  enum class Operation {
+                num_calls_ = 0;
-    kNone,
+                current_operation_ = Operation::kNone;
-    kSendMessage,
+            }
-    kInitialize,
+        }
    kPropagate,
    kBackPropagate,
  };
-  // number of samples in mini-batch
+        // Initialize the parameters with random numbers
-  IndexType batch_size_;
+        template <typename RNG>
        void Initialize(RNG& rng) {
            if (num_calls_ == 0) {
                current_operation_ = Operation::kInitialize;
                feature_transformer_trainer_->Initialize(rng);
            }
-  // number of layers sharing this layer as input
+            assert(current_operation_ == Operation::kInitialize);
  std::uint32_t num_referrers_;
-  // Number of times the current process has been called
+            if (++num_calls_ == num_referrers_) {
-  std::uint32_t num_calls_;
+                num_calls_ = 0;
                current_operation_ = Operation::kNone;
            }
        }
-  // current processing type
+        // forward propagation
-  Operation current_operation_;
+        const LearnFloatType* Propagate(const std::vector<Example>& batch) {
            if (gradients_.size() < kInputDimensions * batch.size()) {
                gradients_.resize(kInputDimensions * batch.size());
            }
-  // Trainer of input feature converter
+            batch_size_ = static_cast<IndexType>(batch.size());
  const std::shared_ptr<Trainer<FeatureTransformer>>
      feature_transformer_trainer_;
-  // pointer to output shared for forward propagation
+            if (num_calls_ == 0) {
-  const LearnFloatType* output_;
+                current_operation_ = Operation::kPropagate;
                output_ = feature_transformer_trainer_->Propagate(batch);
            }
-  // buffer for back propagation
+            assert(current_operation_ == Operation::kPropagate);
  std::vector<LearnFloatType> gradients_;
 };
-// Learning: Input layer
+            if (++num_calls_ == num_referrers_) {
-template <IndexType OutputDimensions, IndexType Offset>
+                num_calls_ = 0;
-class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
+                current_operation_ = Operation::kNone;
- private:
+            }
  // Type of layer to learn
  using LayerType = Layers::InputSlice<OutputDimensions, Offset>;
- public:
+            return output_;
-  // factory function
+        }
  // Builds a new trainer for this slice; the target layer pointer is unused.
  static std::shared_ptr<Trainer> Create(
      LayerType* /*target_layer*/, FeatureTransformer* ft) {
    // The constructor is private, so the object is created with `new`.
    std::shared_ptr<Trainer> trainer(new Trainer(ft));
    return trainer;
  }
-  // Set options such as hyperparameters
+        // backpropagation
-  void SendMessage(Message* message) {
+        void Backpropagate(const LearnFloatType* gradients,
-    shared_input_trainer_->SendMessage(message);
+                           LearnFloatType learning_rate) {
  }
-  // Initialize the parameters with random numbers
+            if (num_referrers_ == 1) {
-  template <typename RNG>
+                feature_transformer_trainer_->Backpropagate(gradients, learning_rate);
-  void Initialize(RNG& rng) {
+                return;
-    shared_input_trainer_->Initialize(rng);
+            }
  }
-  // forward propagation
+            if (num_calls_ == 0) {
-  const LearnFloatType* Propagate(const std::vector<Example>& batch) {
+                current_operation_ = Operation::kBackPropagate;
-    if (output_.size() < kOutputDimensions * batch.size()) {
+                for (IndexType b = 0; b < batch_size_; ++b) {
-      output_.resize(kOutputDimensions * batch.size());
+                    const IndexType batch_offset = kInputDimensions * b;
-      gradients_.resize(kInputDimensions * batch.size());
+                    for (IndexType i = 0; i < kInputDimensions; ++i) {
-    }
+                        gradients_[batch_offset + i] = static_cast<LearnFloatType>(0.0);
-    batch_size_ = static_cast<IndexType>(batch.size());
+                    }
-    const auto input = shared_input_trainer_->Propagate(batch);
+                }
-    for (IndexType b = 0; b < batch_size_; ++b) {
+            }
-      const IndexType input_offset = kInputDimensions * b;
+
-      const IndexType output_offset = kOutputDimensions * b;
+            assert(current_operation_ == Operation::kBackPropagate);
            for (IndexType b = 0; b < batch_size_; ++b) {
                const IndexType batch_offset = kInputDimensions * b;
                for (IndexType i = 0; i < kInputDimensions; ++i) {
                    gradients_[batch_offset + i] += gradients[batch_offset + i];
                }
            }
            if (++num_calls_ == num_referrers_) {
                feature_transformer_trainer_->Backpropagate(
                    gradients_.data(), learning_rate);
                num_calls_ = 0;
                current_operation_ = Operation::kNone;
            }
        }
    private:
        // constructor
        SharedInputTrainer(FeatureTransformer* ft) :
            batch_size_(0),
            num_referrers_(0),
            num_calls_(0),
            current_operation_(Operation::kNone),
            feature_transformer_trainer_(Trainer<FeatureTransformer>::Create(
                ft)),
            output_(nullptr) {
        }
        // number of input/output dimensions
        static constexpr IndexType kInputDimensions =
            FeatureTransformer::kOutputDimensions;
        // Type of processing.
        // Records which operation the shared trainer is currently in the
        // middle of. Each public operation sets current_operation_ to its own
        // value on the first call of a round, asserts on it afterwards, and
        // resets it to kNone once num_calls_ reaches num_referrers_.
        enum class Operation {
            kNone,           // no operation in progress
            kSendMessage,    // set by SendMessage()
            kInitialize,     // set by Initialize()
            kPropagate,      // set by Propagate()
            kBackPropagate,  // set by Backpropagate()
        };
        // number of samples in mini-batch
        IndexType batch_size_;
        // number of layers sharing this layer as input
        std::uint32_t num_referrers_;
        // Number of times the current process has been called
        std::uint32_t num_calls_;
        // current processing type
        Operation current_operation_;
        // Trainer of input feature converter
        const std::shared_ptr<Trainer<FeatureTransformer>>
            feature_transformer_trainer_;
        // pointer to output shared for forward propagation
        const LearnFloatType* output_;
        // buffer for back propagation
        std::vector<LearnFloatType> gradients_;
    };
    // Learning: Input layer
    template <IndexType OutputDimensions, IndexType Offset>
    class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
    private:
        // Type of layer to learn
        using LayerType = Layers::InputSlice<OutputDimensions, Offset>;
    public:
        // Factory function: builds a new trainer for this slice.
        // The target layer pointer is accepted but not used.
        static std::shared_ptr<Trainer> Create(
            LayerType* /*target_layer*/, FeatureTransformer* ft) {
            // The constructor is private, so the object is created with `new`.
            std::shared_ptr<Trainer> trainer(new Trainer(ft));
            return trainer;
        }
        // Set options such as hyperparameters.
        // Forwards the message unchanged to the shared input trainer.
        void SendMessage(Message* message) {
            SharedInputTrainer& shared_input = *shared_input_trainer_;
            shared_input.SendMessage(message);
        }
        // Initialize the parameters with random numbers
        template <typename RNG>
        void Initialize(RNG& rng) {
            shared_input_trainer_->Initialize(rng);
        }
        // forward propagation
        const LearnFloatType* Propagate(const std::vector<Example>& batch) {
            if (output_.size() < kOutputDimensions * batch.size()) {
              output_.resize(kOutputDimensions * batch.size());
              gradients_.resize(kInputDimensions * batch.size());
            }
            batch_size_ = static_cast<IndexType>(batch.size());
            const auto input = shared_input_trainer_->Propagate(batch);
            for (IndexType b = 0; b < batch_size_; ++b) {
                const IndexType input_offset = kInputDimensions * b;
                const IndexType output_offset = kOutputDimensions * b;
 #if defined(USE_BLAS)
-      cblas_scopy(kOutputDimensions, &input[input_offset + Offset], 1,
+                cblas_scopy(kOutputDimensions, &input[input_offset + Offset], 1,
-                  &output_[output_offset], 1);
+                            &output_[output_offset], 1);
 #else
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        output_[output_offset + i] = input[input_offset + Offset + i];
+                    output_[output_offset + i] = input[input_offset + Offset + i];
-      }
+                }
 #endif
-    }
+            }
    return output_.data();
  }
-  // backpropagation
+            return output_.data();
  void Backpropagate(const LearnFloatType* gradients,
                     LearnFloatType learning_rate) {
    for (IndexType b = 0; b < batch_size_; ++b) {
      const IndexType input_offset = kInputDimensions * b;
      const IndexType output_offset = kOutputDimensions * b;
      for (IndexType i = 0; i < kInputDimensions; ++i) {
        if ((int)i < (int)Offset || i >= Offset + kOutputDimensions) {
          gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
        } else {
          gradients_[input_offset + i] = gradients[output_offset + i - Offset];
        }
      }
    }
    shared_input_trainer_->Backpropagate(gradients_.data(), learning_rate);
  }
- private:
+        // backpropagation
-  // constructor
+        void Backpropagate(const LearnFloatType* gradients,
-  Trainer(FeatureTransformer* ft):
+                           LearnFloatType learning_rate) {
      batch_size_(0),
      shared_input_trainer_(SharedInputTrainer::Create(ft)) {
  }
-  // number of input/output dimensions
+            for (IndexType b = 0; b < batch_size_; ++b) {
-  static constexpr IndexType kInputDimensions =
+                const IndexType input_offset = kInputDimensions * b;
-      FeatureTransformer::kOutputDimensions;
+                const IndexType output_offset = kOutputDimensions * b;
-  static constexpr IndexType kOutputDimensions = OutputDimensions;
+                for (IndexType i = 0; i < kInputDimensions; ++i) {
-  static_assert(Offset + kOutputDimensions <= kInputDimensions, "");
+                    if ((int)i < (int)Offset || i >= Offset + kOutputDimensions) {
                        gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
                    } else {
                        gradients_[input_offset + i] = gradients[output_offset + i - Offset];
                    }
                }
            }
            shared_input_trainer_->Backpropagate(gradients_.data(), learning_rate);
        }
-  // number of samples in mini-batch
+    private:
-  IndexType batch_size_;
+        // constructor
        Trainer(FeatureTransformer* ft):
            batch_size_(0),
            shared_input_trainer_(SharedInputTrainer::Create(ft)) {
        }
-  // Trainer of shared input layer
+        // number of input/output dimensions
-  const std::shared_ptr<SharedInputTrainer> shared_input_trainer_;
+        static constexpr IndexType kInputDimensions =
            FeatureTransformer::kOutputDimensions;
        static constexpr IndexType kOutputDimensions = OutputDimensions;
        static_assert(Offset + kOutputDimensions <= kInputDimensions, "");
-  // Forward propagation buffer
+        // number of samples in mini-batch
-  std::vector<LearnFloatType> output_;
+        IndexType batch_size_;
-  // buffer for back propagation
+        // Trainer of shared input layer
-  std::vector<LearnFloatType> gradients_;
+        const std::shared_ptr<SharedInputTrainer> shared_input_trainer_;
 };
-}  // namespace NNUE
+        // Forward propagation buffer
        std::vector<LearnFloatType> output_;
-}  // namespace Eval
+        // buffer for back propagation
        std::vector<LearnFloatType> gradients_;
    };
 }  // namespace Eval::NNUE
 #endif
@@ -1,186 +1,190 @@
-// Specialization of NNUE evaluation function learning class template for Sum
+#ifndef _NNUE_TRAINER_SUM_H_
 #ifndef _NNUE_TRAINER_SUM_H_
 #define _NNUE_TRAINER_SUM_H_
 #include "../../learn/learn.h"
 #include "../layers/sum.h"
 #include "trainer.h"
-namespace Eval {
+// Specialization of NNUE evaluation function learning class template for Sum
 namespace Eval::NNUE {
-namespace NNUE {
+    // Learning: A layer that sums the outputs of multiple layers
    template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
    class Trainer<Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>> :
          Trainer<Layers::Sum<RemainingPreviousLayers...>> {
    private:
        // Type of layer to learn
        using LayerType = Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>;
        using Tail = Trainer<Layers::Sum<RemainingPreviousLayers...>>;
-// Learning: A layer that sums the outputs of multiple layers
+    public:
-template <typename FirstPreviousLayer, typename... RemainingPreviousLayers>
+        // factory function
-class Trainer<Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>> :
+        static std::shared_ptr<Trainer> Create(
-      Trainer<Layers::Sum<RemainingPreviousLayers...>> {
+            LayerType* target_layer, FeatureTransformer* ft) {
 private:
  // Type of layer to learn
  using LayerType = Layers::Sum<FirstPreviousLayer, RemainingPreviousLayers...>;
  using Tail = Trainer<Layers::Sum<RemainingPreviousLayers...>>;
- public:
+            return std::shared_ptr<Trainer>(
-  // factory function
+                new Trainer(target_layer, ft));
-  static std::shared_ptr<Trainer> Create(
+        }
      LayerType* target_layer, FeatureTransformer* ft) {
    return std::shared_ptr<Trainer>(
        new Trainer(target_layer, ft));
  }
-  // Set options such as hyperparameters
+        // Set options such as hyperparameters
-  void SendMessage(Message* message) {
+        void SendMessage(Message* message) {
-    // The results of other member functions do not depend on the processing order, so
+            // The results of other member functions do not depend on the processing order, so
-    // Tail is processed first for the purpose of simplifying the implementation, but
+            // Tail is processed first for the purpose of simplifying the implementation, but
-    // SendMessage processes Head first to make it easier to understand subscript correspondence
+            // SendMessage processes Head first to make it easier to understand subscript correspondence
-    previous_layer_trainer_->SendMessage(message);
+            previous_layer_trainer_->SendMessage(message);
-    Tail::SendMessage(message);
+            Tail::SendMessage(message);
-  }
+        }
-  // Initialize the parameters with random numbers
+        // Initialize the parameters with random numbers
-  template <typename RNG>
+        template <typename RNG>
-  void Initialize(RNG& rng) {
+        void Initialize(RNG& rng) {
-    Tail::Initialize(rng);
+            Tail::Initialize(rng);
-    previous_layer_trainer_->Initialize(rng);
+            previous_layer_trainer_->Initialize(rng);
-  }
+        }
        // forward propagation
        /*const*/ LearnFloatType* Propagate(const std::vector<Example>& batch) {
            batch_size_ = static_cast<IndexType>(batch.size());
            auto output = Tail::Propagate(batch);
            const auto head_output = previous_layer_trainer_->Propagate(batch);
  // forward propagation
  /*const*/ LearnFloatType* Propagate(const std::vector<Example>& batch) {
    batch_size_ = static_cast<IndexType>(batch.size());
    auto output = Tail::Propagate(batch);
    const auto head_output = previous_layer_trainer_->Propagate(batch);
 #if defined(USE_BLAS)
-    cblas_saxpy(kOutputDimensions * batch_size_, 1.0,
+            cblas_saxpy(kOutputDimensions * batch_size_, 1.0,
-                head_output, 1, output, 1);
+                        head_output, 1, output, 1);
 #else
-    for (IndexType b = 0; b < batch_size_; ++b) {
+            for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
+                const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        output[batch_offset + i] += head_output[batch_offset + i];
+                    output[batch_offset + i] += head_output[batch_offset + i];
-      }
+                }
-    }
+            }
 #endif
-    return output;
+            return output;
-  }
+        }
-  // backpropagation
+        // backpropagation
-  void Backpropagate(const LearnFloatType* gradients,
+        void Backpropagate(const LearnFloatType* gradients,
-                     LearnFloatType learning_rate) {
+                           LearnFloatType learning_rate) {
    Tail::Backpropagate(gradients, learning_rate);
    previous_layer_trainer_->Backpropagate(gradients, learning_rate);
  }
- private:
+            Tail::Backpropagate(gradients, learning_rate);
-  // constructor
+            previous_layer_trainer_->Backpropagate(gradients, learning_rate);
-  Trainer(LayerType* target_layer, FeatureTransformer* ft):
+        }
      Tail(target_layer, ft),
      batch_size_(0),
      previous_layer_trainer_(Trainer<FirstPreviousLayer>::Create(
          &target_layer->previous_layer_, ft)),
      target_layer_(target_layer) {
  }
-  // number of input/output dimensions
+    private:
-  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+        // constructor
        // Builds the trainer for a Sum layer with several previous layers:
        //  - the Tail base sub-object is constructed from the same
        //    target_layer and ft; Tail is the trainer for the Sum of the
        //    remaining previous layers;
        //  - previous_layer_trainer_ is created for
        //    target_layer->previous_layer_, i.e. the first previous layer;
        //  - batch_size_ starts at 0 and target_layer_ keeps the layer pointer.
        Trainer(LayerType* target_layer, FeatureTransformer* ft)
            : Tail(target_layer, ft),
              batch_size_(0),
              previous_layer_trainer_(
                  Trainer<FirstPreviousLayer>::Create(
                      &target_layer->previous_layer_, ft)),
              target_layer_(target_layer) {}
-  // make subclass friend
+        // number of input/output dimensions
-  template <typename SumLayer>
+        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
  friend class Trainer;
-  // number of samples in mini-batch
+        // make subclass friend
-  IndexType batch_size_;
+        template <typename SumLayer>
        friend class Trainer;
-  // Trainer of the previous layer
+        // number of samples in mini-batch
-  const std::shared_ptr<Trainer<FirstPreviousLayer>> previous_layer_trainer_;
+        IndexType batch_size_;
-  // layer to learn
+        // Trainer of the previous layer
-  LayerType* const target_layer_;
+        const std::shared_ptr<Trainer<FirstPreviousLayer>> previous_layer_trainer_;
-};
+
        // layer to learn
        LayerType* const target_layer_;
    };
-// Learning: Layer that takes the sum of the outputs of multiple layers (when there is one template argument)
+    // Learning: Layer that takes the sum of the outputs of multiple layers (when there is one template argument)
-template <typename PreviousLayer>
+    template <typename PreviousLayer>
-class Trainer<Layers::Sum<PreviousLayer>> {
+    class Trainer<Layers::Sum<PreviousLayer>> {
- private:
+    private:
-  // Type of layer to learn
+        // Type of layer to learn
-  using LayerType = Layers::Sum<PreviousLayer>;
+        using LayerType = Layers::Sum<PreviousLayer>;
- public:
+    public:
-  // factory function
+        // factory function
-  static std::shared_ptr<Trainer> Create(
+        static std::shared_ptr<Trainer> Create(
-      LayerType* target_layer, FeatureTransformer* ft) {
+            LayerType* target_layer, FeatureTransformer* ft) {
    return std::shared_ptr<Trainer>(
        new Trainer(target_layer, ft));
  }
-  // Set options such as hyperparameters
+            return std::shared_ptr<Trainer>(
-  void SendMessage(Message* message) {
+                new Trainer(target_layer, ft));
-    previous_layer_trainer_->SendMessage(message);
+        }
  }
-  // Initialize the parameters with random numbers
+        // Set options such as hyperparameters
-  template <typename RNG>
+        void SendMessage(Message* message) {
-  void Initialize(RNG& rng) {
+            previous_layer_trainer_->SendMessage(message);
-    previous_layer_trainer_->Initialize(rng);
+        }
-  }
+
        // Initialize the parameters with random numbers
        template <typename RNG>
        void Initialize(RNG& rng) {
            previous_layer_trainer_->Initialize(rng);
        }
        // forward propagation
        /*const*/ LearnFloatType* Propagate(const std::vector<Example>& batch) {
            if (output_.size() < kOutputDimensions * batch.size()) {
                output_.resize(kOutputDimensions * batch.size());
            }
            batch_size_ = static_cast<IndexType>(batch.size());
            const auto output = previous_layer_trainer_->Propagate(batch);
  // forward propagation
  /*const*/ LearnFloatType* Propagate(const std::vector<Example>& batch) {
    if (output_.size() < kOutputDimensions * batch.size()) {
      output_.resize(kOutputDimensions * batch.size());
    }
    batch_size_ = static_cast<IndexType>(batch.size());
    const auto output = previous_layer_trainer_->Propagate(batch);
 #if defined(USE_BLAS)
-    cblas_scopy(kOutputDimensions * batch_size_, output, 1, &output_[0], 1);
+            cblas_scopy(kOutputDimensions * batch_size_, output, 1, &output_[0], 1);
 #else
-    for (IndexType b = 0; b < batch_size_; ++b) {
+            for (IndexType b = 0; b < batch_size_; ++b) {
-      const IndexType batch_offset = kOutputDimensions * b;
+                const IndexType batch_offset = kOutputDimensions * b;
-      for (IndexType i = 0; i < kOutputDimensions; ++i) {
+                for (IndexType i = 0; i < kOutputDimensions; ++i) {
-        output_[batch_offset + i] = output[batch_offset + i];
+                    output_[batch_offset + i] = output[batch_offset + i];
-      }
+                }
-    }
+            }
-#endif
+
-    return output_.data();
+#endif
-  }
+            return output_.data();
-
+        }
-  // backpropagation
+
-  void Backpropagate(const LearnFloatType* gradients,
+        // backpropagation
-                     LearnFloatType learning_rate) {
+        void Backpropagate(const LearnFloatType* gradients,
-    previous_layer_trainer_->Backpropagate(gradients, learning_rate);
+                           LearnFloatType learning_rate) {
-  }
+
-
+            previous_layer_trainer_->Backpropagate(gradients, learning_rate);
- private:
+        }
-  // constructor
+
-  Trainer(LayerType* target_layer, FeatureTransformer* ft) :
+    private:
-      batch_size_(0),
+        // constructor
-      previous_layer_trainer_(Trainer<PreviousLayer>::Create(
+        Trainer(LayerType* target_layer, FeatureTransformer* ft) :
-          &target_layer->previous_layer_, ft)),
+            batch_size_(0),
-      target_layer_(target_layer) {
+            previous_layer_trainer_(Trainer<PreviousLayer>::Create(
-  }
+                &target_layer->previous_layer_, ft)),
-
+            target_layer_(target_layer) {
-  // number of input/output dimensions
+        }
-  static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
+
-
+        // number of input/output dimensions
-  // make subclass friend
+        static constexpr IndexType kOutputDimensions = LayerType::kOutputDimensions;
-  template <typename SumLayer>
+
-  friend class Trainer;
+        // make subclass friend
-
+        template <typename SumLayer>
-  // number of samples in mini-batch
+        friend class Trainer;
-  IndexType batch_size_;
+
-
+        // number of samples in mini-batch
-  // Trainer of the previous layer
+        IndexType batch_size_;
-  const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
+
-
+        // Trainer of the previous layer
-  // layer to learn
+        const std::shared_ptr<Trainer<PreviousLayer>> previous_layer_trainer_;
-  LayerType* const target_layer_;
+
-
+        // layer to learn
-  // Forward propagation buffer
+        LayerType* const target_layer_;
-  std::vector<LearnFloatType> output_;
+
-};
+        // Forward propagation buffer
-
+        std::vector<LearnFloatType> output_;
-}  // namespace NNUE
+    };
-
+
-}  // namespace Eval
+}  // namespace Eval::NNUE
 #endif