mirror of
https://github.com/opelly27/Stockfish.git
synced 2026-05-20 14:27:45 +00:00
Move the observed feature collection to the threaded part now that it can be done safely.
This commit is contained in:
@@ -418,6 +418,11 @@ public:
|
|||||||
constexpr static uint64_t num_buckets = (num_bits + bits_per_bucket - 1) / bits_per_bucket;
|
constexpr static uint64_t num_buckets = (num_bits + bits_per_bucket - 1) / bits_per_bucket;
|
||||||
constexpr static uint64_t best_concurrent_access_stride = 8 * cache_line_size;
|
constexpr static uint64_t best_concurrent_access_stride = 8 * cache_line_size;
|
||||||
|
|
||||||
|
LargeBitset()
|
||||||
|
{
|
||||||
|
std::fill(std::begin(bits), std::end(bits), 0);
|
||||||
|
}
|
||||||
|
|
||||||
void set(uint64_t idx)
|
void set(uint64_t idx)
|
||||||
{
|
{
|
||||||
const uint64_t bucket = idx / bits_per_bucket;
|
const uint64_t bucket = idx / bits_per_bucket;
|
||||||
|
|||||||
@@ -408,10 +408,26 @@ namespace Eval::NNUE {
|
|||||||
for (IndexType c = 0; c < 2; ++c) {
|
for (IndexType c = 0; c < 2; ++c) {
|
||||||
const IndexType output_offset = batch_offset + kHalfDimensions * c;
|
const IndexType output_offset = batch_offset + kHalfDimensions * c;
|
||||||
for (const auto& feature : (*batch_)[b].training_features[c]) {
|
for (const auto& feature : (*batch_)[b].training_features[c]) {
|
||||||
if (feature.get_index() % num_threads != thread_index)
|
const IndexType feature_index = feature.get_index();
|
||||||
|
|
||||||
|
// We assign each bucket a continuous range of bits at least
|
||||||
|
// of cache line size to prevent false sharing.
|
||||||
|
// For HalfKP this is enough to saturate about 80 threads.
|
||||||
|
const IndexType thread_bucket =
|
||||||
|
(feature_index / BitsetType::best_concurrent_access_stride)
|
||||||
|
% num_threads;
|
||||||
|
|
||||||
|
if (thread_bucket != thread_index)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
|
// This operation can be performed safely because
|
||||||
|
// each thread accesses a different memory location
|
||||||
|
// (even a different cache line)
|
||||||
|
observed_features.set(feature_index);
|
||||||
|
|
||||||
const IndexType weights_offset =
|
const IndexType weights_offset =
|
||||||
kHalfDimensions * feature.get_index();
|
kHalfDimensions * feature_index;
|
||||||
|
|
||||||
const auto scale = static_cast<LearnFloatType>(
|
const auto scale = static_cast<LearnFloatType>(
|
||||||
effective_learning_rate / feature.get_count());
|
effective_learning_rate / feature.get_count());
|
||||||
|
|
||||||
@@ -438,14 +454,6 @@ namespace Eval::NNUE {
|
|||||||
}
|
}
|
||||||
);
|
);
|
||||||
|
|
||||||
for (IndexType b = 0; b < batch_->size(); ++b) {
|
|
||||||
for (IndexType c = 0; c < 2; ++c) {
|
|
||||||
for (const auto& feature : (*batch_)[b].training_features[c]) {
|
|
||||||
observed_features.set(feature.get_index());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
thread_pool.wait_for_workers_finished();
|
thread_pool.wait_for_workers_finished();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -628,7 +636,8 @@ namespace Eval::NNUE {
|
|||||||
std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
|
std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
|
||||||
|
|
||||||
// Features that appeared in the training data
|
// Features that appeared in the training data
|
||||||
std::bitset<kInputDimensions> observed_features;
|
using BitsetType = LargeBitset<kInputDimensions>;
|
||||||
|
BitsetType observed_features;
|
||||||
|
|
||||||
// hyper parameter
|
// hyper parameter
|
||||||
LearnFloatType momentum_;
|
LearnFloatType momentum_;
|
||||||
|
|||||||
Reference in New Issue
Block a user