Move the observed feature collection to the threaded part now that it can be done safely.

2026-05-20 14:27:45 +00:00 · 2020-10-27 19:24:07 +01:00
parent c53be1b23f
commit 987b6c98d4
2 changed files with 28 additions and 14 deletions
@@ -418,6 +418,11 @@ public:
    constexpr static uint64_t num_buckets = (num_bits + bits_per_bucket - 1) / bits_per_bucket;
    constexpr static uint64_t best_concurrent_access_stride = 8 * cache_line_size;
    // Default constructor: writes 0 into every element of `bits`
    // (from std::begin(bits) to std::end(bits)), so the bitset
    // presumably starts with no bit set. NOTE(review): the declaration
    // of `bits` is not visible here; confirm its element type.
    LargeBitset()
    {
        std::fill(std::begin(bits), std::end(bits), 0);
    }
    void set(uint64_t idx)
    {
        const uint64_t bucket = idx / bits_per_bucket;
@@ -408,10 +408,26 @@ namespace Eval::NNUE {
                        for (IndexType c = 0; c < 2; ++c) {
                            const IndexType output_offset = batch_offset + kHalfDimensions * c;
                            for (const auto& feature : (*batch_)[b].training_features[c]) {
-                                if (feature.get_index() % num_threads != thread_index)
+                                const IndexType feature_index = feature.get_index();
                                // We assign each bucket a contiguous range of bits spanning
                                // at least one cache line to prevent false sharing.
                                // For HalfKP this is enough to saturate about 80 threads.
                                const IndexType thread_bucket =
                                    (feature_index / BitsetType::best_concurrent_access_stride)
                                    % num_threads;
                                if (thread_bucket != thread_index)
                                    continue;
                                // This operation can be performed safely because
                                // each thread accesses a different memory location
                                // (even a different cache line)
                                observed_features.set(feature_index);
                                const IndexType weights_offset =
-                                    kHalfDimensions * feature.get_index();
+                                    kHalfDimensions * feature_index;
                                const auto scale = static_cast<LearnFloatType>(
                                    effective_learning_rate / feature.get_count());
@@ -438,14 +454,6 @@ namespace Eval::NNUE {
                }
            );
            for (IndexType b = 0; b < batch_->size(); ++b) {
                for (IndexType c = 0; c < 2; ++c) {
                    for (const auto& feature : (*batch_)[b].training_features[c]) {
                        observed_features.set(feature.get_index());
                    }
                }
            }
            thread_pool.wait_for_workers_finished();
        }
@@ -628,7 +636,8 @@ namespace Eval::NNUE {
        std::vector<LearnFloatType, CacheLineAlignedAllocator<LearnFloatType>> output_;
        // Features that appeared in the training data
-        std::bitset<kInputDimensions> observed_features;
+        using BitsetType = LargeBitset<kInputDimensions>;
        BitsetType observed_features;
        // hyper parameter
        LearnFloatType momentum_;