Move the observed feature collection to the threaded part now that it can be done safely.

This commit is contained in:
Tomasz Sobczyk
2020-10-27 19:24:07 +01:00
committed by nodchip
parent c53be1b23f
commit 987b6c98d4
2 changed files with 28 additions and 14 deletions
+5
View File
@@ -418,6 +418,11 @@ public:
constexpr static uint64_t num_buckets = (num_bits + bits_per_bucket - 1) / bits_per_bucket; constexpr static uint64_t num_buckets = (num_bits + bits_per_bucket - 1) / bits_per_bucket;
constexpr static uint64_t best_concurrent_access_stride = 8 * cache_line_size; constexpr static uint64_t best_concurrent_access_stride = 8 * cache_line_size;
// Default constructor: writes 0 to every element of `bits`, from
// std::begin(bits) to std::end(bits).
// NOTE(review): presumably this makes every bit start out cleared so that
// only indices later passed to set() read as set - the declaration of
// `bits` is not visible here, confirm against the member definition.
LargeBitset()
{
std::fill(std::begin(bits), std::end(bits), 0);
}
void set(uint64_t idx) void set(uint64_t idx)
{ {
const uint64_t bucket = idx / bits_per_bucket; const uint64_t bucket = idx / bits_per_bucket;
+20 -11
View File
@@ -408,10 +408,26 @@ namespace Eval::NNUE {
for (IndexType c = 0; c < 2; ++c) { for (IndexType c = 0; c < 2; ++c) {
const IndexType output_offset = batch_offset + kHalfDimensions * c; const IndexType output_offset = batch_offset + kHalfDimensions * c;
for (const auto& feature : (*batch_)[b].training_features[c]) { for (const auto& feature : (*batch_)[b].training_features[c]) {
if (feature.get_index() % num_threads != thread_index) const IndexType feature_index = feature.get_index();
// We assign each thread bucket a contiguous range of bits spanning at
// least one cache line to prevent false sharing.
// For HalfKP this is enough to saturate about 80 threads.
const IndexType thread_bucket =
(feature_index / BitsetType::best_concurrent_access_stride)
% num_threads;
if (thread_bucket != thread_index)
continue; continue;
// This operation can be performed safely because
// each thread accesses a different memory location
// (even a different cache line)
observed_features.set(feature_index);
const IndexType weights_offset = const IndexType weights_offset =
kHalfDimensions * feature.get_index(); kHalfDimensions * feature_index;
const auto scale = static_cast<LearnFloatType>( const auto scale = static_cast<LearnFloatType>(
effective_learning_rate / feature.get_count()); effective_learning_rate / feature.get_count());
@@ -438,14 +454,6 @@ namespace Eval::NNUE {
} }
); );
for (IndexType b = 0; b < batch_->size(); ++b) {
for (IndexType c = 0; c < 2; ++c) {
for (const auto& feature : (*batch_)[b].training_features[c]) {
observed_features.set(feature.get_index());
}
}
}
thread_pool.wait_for_workers_finished(); thread_pool.wait_for_workers_finished();
} }
@@ -628,7 +636,8 @@ namespace Eval::NNUE {
std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_; std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
// Features that appeared in the training data // Features that appeared in the training data
std::bitset<kInputDimensions> observed_features; using BitsetType = LargeBitset<kInputDimensions>;
BitsetType observed_features;
// hyper parameter // hyper parameter
LearnFloatType momentum_; LearnFloatType momentum_;