// Mirror of https://github.com/opelly27/Stockfish.git (synced 2026-05-20 14:27:45 +00:00)
// 378 lines, 13 KiB, C++
#ifndef _NNUE_TRAINER_INPUT_SLICE_H_
|
|
#define _NNUE_TRAINER_INPUT_SLICE_H_
|
|
|
|
#include "trainer.h"
|
|
|
|
#include "extra/stockfish_blas.h"
|
|
|
|
#include "learn/learn.h"
|
|
|
|
#include "nnue/layers/input_slice.h"
|
|
|
|
#include "thread.h"
|
|
|
|
// Specialization of NNUE evaluation function learning class template for InputSlice
|
|
namespace Eval::NNUE {
|
|
|
|
// Learning: Input layer
|
|
// This is tricky. It exists because when there's more than one trainer
|
|
// on top of a single feature transformer we want to only call propagate/backpropagate
|
|
// on the feature transformer once. This is straightforward in the old
|
|
// multithreading case, because propagate/backpropagate is called just once from the
|
|
// main thread. But with the current implementation of coarser multithreading
|
|
// we end up calling each method from each thread. Therefore we have to keep
|
|
// the num_calls and current_operation per thread basis, each thread must work
|
|
// on its designated batch slice, and the only synchronization points are
|
|
// step_start and step_end - for which we use state of the first thread.
|
|
// Each thread requires their own bookkeeping because it's possible that
|
|
// one thread is still in propagate of some batch slice while the other thread
|
|
// is doing backpropagate of some other slice. We also ensure the thread state
|
|
// isn't susceptible to false sharing by using a full cache line for the state.
|
|
class SharedInputTrainer {
|
|
public:
|
|
// factory function
|
|
static std::shared_ptr<SharedInputTrainer> create(
|
|
FeatureTransformer* ft) {
|
|
|
|
static std::shared_ptr<SharedInputTrainer> instance;
|
|
|
|
if (!instance) {
|
|
instance.reset(new SharedInputTrainer(ft));
|
|
}
|
|
|
|
++instance->num_referrers_;
|
|
|
|
return instance;
|
|
}
|
|
|
|
// Set options such as hyperparameters
|
|
void send_message(Message* message) {
|
|
auto& thread_state = thread_states_[0];
|
|
|
|
if (thread_state.num_calls == 0) {
|
|
thread_state.current_operation = Operation::kSendMessage;
|
|
feature_transformer_trainer_->send_message(message);
|
|
}
|
|
|
|
assert(thread_state.current_operation == Operation::kSendMessage);
|
|
|
|
if (++thread_state.num_calls == num_referrers_) {
|
|
thread_state.num_calls = 0;
|
|
thread_state.current_operation = Operation::kNone;
|
|
}
|
|
}
|
|
|
|
// Initialize the parameters with random numbers
|
|
template <typename RNG>
|
|
void initialize(RNG& rng) {
|
|
auto& thread_state = thread_states_[0];
|
|
|
|
if (thread_state.num_calls == 0) {
|
|
thread_state.current_operation = Operation::kInitialize;
|
|
feature_transformer_trainer_->initialize(rng);
|
|
}
|
|
|
|
assert(thread_state.current_operation == Operation::kInitialize);
|
|
|
|
if (++thread_state.num_calls == num_referrers_) {
|
|
thread_state.num_calls = 0;
|
|
thread_state.current_operation = Operation::kNone;
|
|
}
|
|
}
|
|
|
|
const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
|
|
{
|
|
const auto size = batch_end - batch_begin;
|
|
|
|
if ((long)gradients_.size() < (long)kInputDimensions * size) {
|
|
gradients_.resize(kInputDimensions * size);
|
|
}
|
|
|
|
if (thread_states_.size() < thread_pool.size())
|
|
{
|
|
thread_states_.resize(thread_pool.size());
|
|
}
|
|
|
|
batch_size_ = size;
|
|
|
|
auto& thread_state = thread_states_[0];
|
|
|
|
if (thread_state.num_calls == 0) {
|
|
thread_state.current_operation = Operation::kStepStart;
|
|
output_ = feature_transformer_trainer_->step_start(thread_pool, batch_begin, batch_end);
|
|
}
|
|
|
|
assert(thread_state.current_operation == Operation::kStepStart);
|
|
|
|
if (++thread_state.num_calls == num_referrers_) {
|
|
thread_state.num_calls = 0;
|
|
thread_state.current_operation = Operation::kNone;
|
|
}
|
|
|
|
return output_;
|
|
}
|
|
|
|
// forward propagation
|
|
void propagate(Thread& th, uint64_t offset, uint64_t count) {
|
|
const auto thread_id = th.thread_idx();
|
|
|
|
auto& thread_state = thread_states_[thread_id];
|
|
|
|
if (thread_state.num_calls == 0) {
|
|
thread_state.current_operation = Operation::kPropagate;
|
|
feature_transformer_trainer_->propagate(th, offset, count);
|
|
}
|
|
|
|
assert(thread_state.current_operation == Operation::kPropagate);
|
|
|
|
if (++thread_state.num_calls == num_referrers_) {
|
|
thread_state.num_calls = 0;
|
|
thread_state.current_operation = Operation::kNone;
|
|
}
|
|
}
|
|
|
|
// backpropagation
|
|
void backpropagate(Thread& th,
|
|
const LearnFloatType* gradients,
|
|
uint64_t offset,
|
|
uint64_t count) {
|
|
|
|
const auto thread_id = th.thread_idx();
|
|
|
|
auto& thread_state = thread_states_[thread_id];
|
|
|
|
if (num_referrers_ == 1) {
|
|
feature_transformer_trainer_->backpropagate(th, gradients, offset, count);
|
|
return;
|
|
}
|
|
|
|
if (thread_state.num_calls == 0) {
|
|
thread_state.current_operation = Operation::kBackPropagate;
|
|
for (IndexType b = offset; b < offset + count; ++b) {
|
|
const IndexType batch_offset = kInputDimensions * b;
|
|
for (IndexType i = 0; i < kInputDimensions; ++i) {
|
|
gradients_[batch_offset + i] = static_cast<LearnFloatType>(0.0);
|
|
}
|
|
}
|
|
}
|
|
|
|
assert(thread_state.current_operation == Operation::kBackPropagate);
|
|
|
|
for (IndexType b = offset; b < offset + count; ++b) {
|
|
const IndexType batch_offset = kInputDimensions * b;
|
|
for (IndexType i = 0; i < kInputDimensions; ++i) {
|
|
gradients_[batch_offset + i] += gradients[batch_offset + i];
|
|
}
|
|
}
|
|
|
|
if (++thread_state.num_calls == num_referrers_) {
|
|
feature_transformer_trainer_->backpropagate(
|
|
th, gradients_.data(), offset, count);
|
|
thread_state.num_calls = 0;
|
|
thread_state.current_operation = Operation::kNone;
|
|
}
|
|
}
|
|
|
|
void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) {
|
|
auto& thread_state = thread_states_[0];
|
|
|
|
if (thread_state.num_calls == 0) {
|
|
thread_state.current_operation = Operation::kStepEnd;
|
|
feature_transformer_trainer_->step_end(thread_pool, learning_rate);
|
|
}
|
|
|
|
assert(thread_state.current_operation == Operation::kStepEnd);
|
|
|
|
if (++thread_state.num_calls == num_referrers_) {
|
|
thread_state.num_calls = 0;
|
|
thread_state.current_operation = Operation::kNone;
|
|
}
|
|
}
|
|
|
|
private:
|
|
// constructor
|
|
SharedInputTrainer(FeatureTransformer* ft) :
|
|
batch_size_(0),
|
|
num_referrers_(0),
|
|
thread_states_(1),
|
|
feature_transformer_trainer_(Trainer<FeatureTransformer>::create(
|
|
ft)),
|
|
output_(nullptr) {
|
|
}
|
|
|
|
// number of input/output dimensions
|
|
static constexpr IndexType kInputDimensions =
|
|
FeatureTransformer::kOutputDimensions;
|
|
|
|
// type of processing
|
|
enum class Operation {
|
|
kNone,
|
|
kSendMessage,
|
|
kInitialize,
|
|
kStepStart,
|
|
kPropagate,
|
|
kBackPropagate,
|
|
kStepEnd,
|
|
};
|
|
|
|
// number of samples in mini-batch
|
|
IndexType batch_size_;
|
|
|
|
// number of layers sharing this layer as input
|
|
std::uint32_t num_referrers_;
|
|
|
|
struct alignas(kCacheLineSize) ThreadState
|
|
{
|
|
std::uint32_t num_calls{0};
|
|
|
|
// current processing type
|
|
Operation current_operation = Operation::kNone;
|
|
};
|
|
|
|
// Number of times the current process has been called
|
|
std::vector<ThreadState, CacheLineAlignedAllocator<ThreadState>> thread_states_;
|
|
|
|
// Trainer of input feature converter
|
|
const std::shared_ptr<Trainer<FeatureTransformer>>
|
|
feature_transformer_trainer_;
|
|
|
|
// pointer to output shared for forward propagation
|
|
const LearnFloatType* output_;
|
|
|
|
// buffer for back propagation
|
|
std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
|
|
};
|
|
|
|
// Learning: Input layer
|
|
template <IndexType OutputDimensions, IndexType Offset>
|
|
class Trainer<Layers::InputSlice<OutputDimensions, Offset>> {
|
|
private:
|
|
// Type of layer to learn
|
|
using LayerType = Layers::InputSlice<OutputDimensions, Offset>;
|
|
|
|
public:
|
|
// factory function
|
|
static std::shared_ptr<Trainer> create(
|
|
LayerType* /*target_layer*/, FeatureTransformer* ft) {
|
|
|
|
return std::shared_ptr<Trainer>(new Trainer(ft));
|
|
}
|
|
|
|
// Set options such as hyperparameters
|
|
void send_message(Message* message) {
|
|
shared_input_trainer_->send_message(message);
|
|
}
|
|
|
|
// Initialize the parameters with random numbers
|
|
template <typename RNG>
|
|
void initialize(RNG& rng) {
|
|
shared_input_trainer_->initialize(rng);
|
|
}
|
|
|
|
const LearnFloatType* step_start(ThreadPool& thread_pool, std::vector<Example>::const_iterator batch_begin, std::vector<Example>::const_iterator batch_end)
|
|
{
|
|
const auto size = batch_end - batch_begin;
|
|
|
|
if ((long)output_.size() < (long)kOutputDimensions * size) {
|
|
output_.resize(kOutputDimensions * size);
|
|
gradients_.resize(kInputDimensions * size);
|
|
}
|
|
|
|
batch_size_ = size;
|
|
|
|
input_ = shared_input_trainer_->step_start(thread_pool, batch_begin, batch_end);
|
|
|
|
return output_.data();
|
|
}
|
|
|
|
// forward propagation
|
|
void propagate(Thread& th, uint64_t offset, uint64_t count) {
|
|
|
|
shared_input_trainer_->propagate(th, offset, count);
|
|
|
|
for (IndexType b = offset; b < offset + count; ++b) {
|
|
const IndexType input_offset = kInputDimensions * b;
|
|
const IndexType output_offset = kOutputDimensions * b;
|
|
|
|
#if defined(USE_BLAS)
|
|
|
|
cblas_scopy(
|
|
kOutputDimensions, &input_[input_offset + Offset], 1,
|
|
&output_[output_offset], 1
|
|
);
|
|
#else
|
|
|
|
Blas::scopy(
|
|
kOutputDimensions, &input_[input_offset + Offset], 1,
|
|
&output_[output_offset], 1
|
|
);
|
|
|
|
#endif
|
|
}
|
|
}
|
|
|
|
// backpropagation
|
|
void backpropagate(Thread& th,
|
|
const LearnFloatType* gradients,
|
|
uint64_t offset,
|
|
uint64_t count) {
|
|
|
|
for (IndexType b = offset; b < offset + count; ++b)
|
|
{
|
|
const IndexType input_offset = kInputDimensions * b;
|
|
const IndexType output_offset = kOutputDimensions * b;
|
|
|
|
IndexType i = 0;
|
|
for (; i < Offset; ++i) {
|
|
gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
|
|
}
|
|
|
|
for (; i < Offset + kOutputDimensions; ++i) {
|
|
gradients_[input_offset + i] = gradients[output_offset + i - Offset];
|
|
}
|
|
|
|
for (; i < kInputDimensions; ++i)
|
|
{
|
|
gradients_[input_offset + i] = static_cast<LearnFloatType>(0.0);
|
|
}
|
|
}
|
|
|
|
shared_input_trainer_->backpropagate(th, gradients_.data(), offset, count);
|
|
}
|
|
|
|
void step_end(ThreadPool& thread_pool, LearnFloatType learning_rate) {
|
|
shared_input_trainer_->step_end(thread_pool, learning_rate);
|
|
}
|
|
|
|
private:
|
|
// constructor
|
|
Trainer(FeatureTransformer* ft) :
|
|
batch_size_(0),
|
|
shared_input_trainer_(SharedInputTrainer::create(ft)) {
|
|
}
|
|
|
|
// number of input/output dimensions
|
|
static constexpr IndexType kInputDimensions =
|
|
FeatureTransformer::kOutputDimensions;
|
|
static constexpr IndexType kOutputDimensions = OutputDimensions;
|
|
static_assert(Offset + kOutputDimensions <= kInputDimensions, "");
|
|
|
|
// number of samples in mini-batch
|
|
IndexType batch_size_;
|
|
|
|
const LearnFloatType* input_;
|
|
|
|
// Trainer of shared input layer
|
|
const std::shared_ptr<SharedInputTrainer> shared_input_trainer_;
|
|
|
|
// Forward propagation buffer
|
|
std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
|
|
|
|
// buffer for back propagation
|
|
std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> gradients_;
|
|
};
|
|
|
|
} // namespace Eval::NNUE
|
|
|
|
#endif
|