From 329c267e253e6119a43066fa3481e9c509e8c34a Mon Sep 17 00:00:00 2001 From: mstembera Date: Fri, 17 Jan 2025 21:18:48 -0800 Subject: [PATCH] Optimize find_nnz() by reducing the size of lookup_indices https://tests.stockfishchess.org/tests/view/67896b688082388fa0cbfdee LLR: 2.93 (-2.94,2.94) <0.00,2.00> Total: 452800 W: 118213 L: 117300 D: 217287 Ptnml(0-2): 1638, 50255, 121864, 50842, 1801 It's faster to shrink lookup_indices[] to 8 bit and zero extend to 16 bit using _mm_cvtepu8_epi16() than to read the larger 16 bit version. I suspect that having the constants available at compile time isn't too valuable and can be simplified back to generating at initialization time since this version also almost passed. https://tests.stockfishchess.org/tests/view/67863057460e2910c51de7e0 I will try that as a follow up. closes https://github.com/official-stockfish/Stockfish/pull/5793 No functional change --- .../layers/affine_transform_sparse_input.h | 110 ++++++++++++++++-- 1 file changed, 98 insertions(+), 12 deletions(-) diff --git a/src/nnue/layers/affine_transform_sparse_input.h b/src/nnue/layers/affine_transform_sparse_input.h index cbeb507f..248e19dd 100644 --- a/src/nnue/layers/affine_transform_sparse_input.h +++ b/src/nnue/layers/affine_transform_sparse_input.h @@ -38,17 +38,99 @@ namespace Stockfish::Eval::NNUE::Layers { #if (USE_SSSE3 | (USE_NEON >= 8)) -alignas(CacheLineSize) static inline const - std::array, 256> lookup_indices = []() { - std::array, 256> v{}; - for (unsigned i = 0; i < 256; ++i) - { - std::uint64_t j = i, k = 0; - while (j) - v[i][k++] = pop_lsb(j); - } - return v; - }(); + + #if (USE_SSE41) +alignas(CacheLineSize) static constexpr std::uint8_t + #else +alignas(CacheLineSize) static constexpr std::uint16_t + #endif + lookup_indices[256][8] = { + {0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 0, 0}, + {0, 1, 0, 0, 0, 0, 0, 0}, {2, 0, 0, 0, 0, 0, 0, 0}, {0, 2, 0, 0, 0, 0, 0, 0}, + {1, 2, 0, 0, 0, 0, 0, 0}, {0, 1, 2, 0, 0, 0, 0, 0}, {3, 0, 0, 0, 0, 0, 0, 0}, + {0, 3, 0, 0, 0, 0, 0, 0}, {1, 3, 0, 0, 0, 0, 0, 0}, {0, 1, 3, 0, 0, 0, 0, 0}, + {2, 3, 0, 0, 0, 0, 0, 0}, {0, 2, 3, 0, 0, 0, 0, 0}, {1, 2, 3, 0, 0, 0, 0, 0}, + {0, 1, 2, 3, 0, 0, 0, 0}, {4, 0, 0, 0, 0, 0, 0, 0}, {0, 4, 0, 0, 0, 0, 0, 0}, + {1, 4, 0, 0, 0, 0, 0, 0}, {0, 1, 4, 0, 0, 0, 0, 0}, {2, 4, 0, 0, 0, 0, 0, 0}, + {0, 2, 4, 0, 0, 0, 0, 0}, {1, 2, 4, 0, 0, 0, 0, 0}, {0, 1, 2, 4, 0, 0, 0, 0}, + {3, 4, 0, 0, 0, 0, 0, 0}, {0, 3, 4, 0, 0, 0, 0, 0}, {1, 3, 4, 0, 0, 0, 0, 0}, + {0, 1, 3, 4, 0, 0, 0, 0}, {2, 3, 4, 0, 0, 0, 0, 0}, {0, 2, 3, 4, 0, 0, 0, 0}, + {1, 2, 3, 4, 0, 0, 0, 0}, {0, 1, 2, 3, 4, 0, 0, 0}, {5, 0, 0, 0, 0, 0, 0, 0}, + {0, 5, 0, 0, 0, 0, 0, 0}, {1, 5, 0, 0, 0, 0, 0, 0}, {0, 1, 5, 0, 0, 0, 0, 0}, + {2, 5, 0, 0, 0, 0, 0, 0}, {0, 2, 5, 0, 0, 0, 0, 0}, {1, 2, 5, 0, 0, 0, 0, 0}, + {0, 1, 2, 5, 0, 0, 0, 0}, {3, 5, 0, 0, 0, 0, 0, 0}, {0, 3, 5, 0, 0, 0, 0, 0}, + {1, 3, 5, 0, 0, 0, 0, 0}, {0, 1, 3, 5, 0, 0, 0, 0}, {2, 3, 5, 0, 0, 0, 0, 0}, + {0, 2, 3, 5, 0, 0, 0, 0}, {1, 2, 3, 5, 0, 0, 0, 0}, {0, 1, 2, 3, 5, 0, 0, 0}, + {4, 5, 0, 0, 0, 0, 0, 0}, {0, 4, 5, 0, 0, 0, 0, 0}, {1, 4, 5, 0, 0, 0, 0, 0}, + {0, 1, 4, 5, 0, 0, 0, 0}, {2, 4, 5, 0, 0, 0, 0, 0}, {0, 2, 4, 5, 0, 0, 0, 0}, + {1, 2, 4, 5, 0, 0, 0, 0}, {0, 1, 2, 4, 5, 0, 0, 0}, {3, 4, 5, 0, 0, 0, 0, 0}, + {0, 3, 4, 5, 0, 0, 0, 0}, {1, 3, 4, 5, 0, 0, 0, 0}, {0, 1, 3, 4, 5, 0, 0, 0}, + {2, 3, 4, 5, 0, 0, 0, 0}, {0, 2, 3, 4, 5, 0, 0, 0}, {1, 2, 3, 4, 5, 0, 0, 0}, + {0, 1, 2, 3, 4, 5, 0, 0}, {6, 0, 0, 0, 0, 0, 0, 0}, {0, 6, 0, 0, 0, 0, 0, 0}, + {1, 6, 0, 0, 0, 0, 0, 0}, {0, 1, 6, 0, 0, 0, 0, 0}, {2, 6, 0, 0, 0, 0, 0, 0}, + {0, 2, 6, 0, 0, 0, 0, 0}, {1, 2, 6, 0, 0, 0, 0, 0}, {0, 1, 2, 6, 0, 0, 0, 0}, + {3, 6, 0, 0, 0, 0, 0, 0}, {0, 3, 6, 0, 0, 0, 0, 0}, {1, 3, 6, 0, 0, 0, 0, 0}, + {0, 1, 3, 6, 0, 0, 0, 0}, {2, 3, 6, 0, 0, 0, 0, 0}, {0, 2, 3, 6, 0, 0, 0, 0}, + {1, 2, 3, 6, 0, 0, 0, 0}, {0, 1, 2, 3, 6, 0, 0, 0}, {4, 6, 0, 0, 0, 0, 0, 0}, + {0, 4, 6, 0, 0, 0, 0, 0}, {1, 4, 6, 0, 0, 0, 0, 0}, {0, 1, 4, 6, 0, 0, 0, 0}, + {2, 4, 6, 0, 0, 0, 0, 0}, {0, 2, 4, 6, 0, 0, 0, 0}, {1, 2, 4, 6, 0, 0, 0, 0}, + {0, 1, 2, 4, 6, 0, 0, 0}, {3, 4, 6, 0, 0, 0, 0, 0}, {0, 3, 4, 6, 0, 0, 0, 0}, + {1, 3, 4, 6, 0, 0, 0, 0}, {0, 1, 3, 4, 6, 0, 0, 0}, {2, 3, 4, 6, 0, 0, 0, 0}, + {0, 2, 3, 4, 6, 0, 0, 0}, {1, 2, 3, 4, 6, 0, 0, 0}, {0, 1, 2, 3, 4, 6, 0, 0}, + {5, 6, 0, 0, 0, 0, 0, 0}, {0, 5, 6, 0, 0, 0, 0, 0}, {1, 5, 6, 0, 0, 0, 0, 0}, + {0, 1, 5, 6, 0, 0, 0, 0}, {2, 5, 6, 0, 0, 0, 0, 0}, {0, 2, 5, 6, 0, 0, 0, 0}, + {1, 2, 5, 6, 0, 0, 0, 0}, {0, 1, 2, 5, 6, 0, 0, 0}, {3, 5, 6, 0, 0, 0, 0, 0}, + {0, 3, 5, 6, 0, 0, 0, 0}, {1, 3, 5, 6, 0, 0, 0, 0}, {0, 1, 3, 5, 6, 0, 0, 0}, + {2, 3, 5, 6, 0, 0, 0, 0}, {0, 2, 3, 5, 6, 0, 0, 0}, {1, 2, 3, 5, 6, 0, 0, 0}, + {0, 1, 2, 3, 5, 6, 0, 0}, {4, 5, 6, 0, 0, 0, 0, 0}, {0, 4, 5, 6, 0, 0, 0, 0}, + {1, 4, 5, 6, 0, 0, 0, 0}, {0, 1, 4, 5, 6, 0, 0, 0}, {2, 4, 5, 6, 0, 0, 0, 0}, + {0, 2, 4, 5, 6, 0, 0, 0}, {1, 2, 4, 5, 6, 0, 0, 0}, {0, 1, 2, 4, 5, 6, 0, 0}, + {3, 4, 5, 6, 0, 0, 0, 0}, {0, 3, 4, 5, 6, 0, 0, 0}, {1, 3, 4, 5, 6, 0, 0, 0}, + {0, 1, 3, 4, 5, 6, 0, 0}, {2, 3, 4, 5, 6, 0, 0, 0}, {0, 2, 3, 4, 5, 6, 0, 0}, + {1, 2, 3, 4, 5, 6, 0, 0}, {0, 1, 2, 3, 4, 5, 6, 0}, {7, 0, 0, 0, 0, 0, 0, 0}, + {0, 7, 0, 0, 0, 0, 0, 0}, {1, 7, 0, 0, 0, 0, 0, 0}, {0, 1, 7, 0, 0, 0, 0, 0}, + {2, 7, 0, 0, 0, 0, 0, 0}, {0, 2, 7, 0, 0, 0, 0, 0}, {1, 2, 7, 0, 0, 0, 0, 0}, + {0, 1, 2, 7, 0, 0, 0, 0}, {3, 7, 0, 0, 0, 0, 0, 0}, {0, 3, 7, 0, 0, 0, 0, 0}, + {1, 3, 7, 0, 0, 0, 0, 0}, {0, 1, 3, 7, 0, 0, 0, 0}, {2, 3, 7, 0, 0, 0, 0, 0}, + {0, 2, 3, 7, 0, 0, 0, 0}, {1, 2, 3, 7, 0, 0, 0, 0}, {0, 1, 2, 3, 7, 0, 0, 0}, + {4, 7, 0, 0, 0, 0, 0, 0}, {0, 4, 7, 0, 0, 0, 0, 0}, {1, 4, 7, 0, 0, 0, 0, 0}, + {0, 1, 4, 7, 0, 0, 0, 0}, {2, 4, 7, 0, 0, 0, 0, 0}, {0, 2, 4, 7, 0, 0, 0, 0}, + {1, 2, 4, 7, 0, 0, 0, 0}, {0, 1, 2, 4, 7, 0, 0, 0}, {3, 4, 7, 0, 0, 0, 0, 0}, + {0, 3, 4, 7, 0, 0, 0, 0}, {1, 3, 4, 7, 0, 0, 0, 0}, {0, 1, 3, 4, 7, 0, 0, 0}, + {2, 3, 4, 7, 0, 0, 0, 0}, {0, 2, 3, 4, 7, 0, 0, 0}, {1, 2, 3, 4, 7, 0, 0, 0}, + {0, 1, 2, 3, 4, 7, 0, 0}, {5, 7, 0, 0, 0, 0, 0, 0}, {0, 5, 7, 0, 0, 0, 0, 0}, + {1, 5, 7, 0, 0, 0, 0, 0}, {0, 1, 5, 7, 0, 0, 0, 0}, {2, 5, 7, 0, 0, 0, 0, 0}, + {0, 2, 5, 7, 0, 0, 0, 0}, {1, 2, 5, 7, 0, 0, 0, 0}, {0, 1, 2, 5, 7, 0, 0, 0}, + {3, 5, 7, 0, 0, 0, 0, 0}, {0, 3, 5, 7, 0, 0, 0, 0}, {1, 3, 5, 7, 0, 0, 0, 0}, + {0, 1, 3, 5, 7, 0, 0, 0}, {2, 3, 5, 7, 0, 0, 0, 0}, {0, 2, 3, 5, 7, 0, 0, 0}, + {1, 2, 3, 5, 7, 0, 0, 0}, {0, 1, 2, 3, 5, 7, 0, 0}, {4, 5, 7, 0, 0, 0, 0, 0}, + {0, 4, 5, 7, 0, 0, 0, 0}, {1, 4, 5, 7, 0, 0, 0, 0}, {0, 1, 4, 5, 7, 0, 0, 0}, + {2, 4, 5, 7, 0, 0, 0, 0}, {0, 2, 4, 5, 7, 0, 0, 0}, {1, 2, 4, 5, 7, 0, 0, 0}, + {0, 1, 2, 4, 5, 7, 0, 0}, {3, 4, 5, 7, 0, 0, 0, 0}, {0, 3, 4, 5, 7, 0, 0, 0}, + {1, 3, 4, 5, 7, 0, 0, 0}, {0, 1, 3, 4, 5, 7, 0, 0}, {2, 3, 4, 5, 7, 0, 0, 0}, + {0, 2, 3, 4, 5, 7, 0, 0}, {1, 2, 3, 4, 5, 7, 0, 0}, {0, 1, 2, 3, 4, 5, 7, 0}, + {6, 7, 0, 0, 0, 0, 0, 0}, {0, 6, 7, 0, 0, 0, 0, 0}, {1, 6, 7, 0, 0, 0, 0, 0}, + {0, 1, 6, 7, 0, 0, 0, 0}, {2, 6, 7, 0, 0, 0, 0, 0}, {0, 2, 6, 7, 0, 0, 0, 0}, + {1, 2, 6, 7, 0, 0, 0, 0}, {0, 1, 2, 6, 7, 0, 0, 0}, {3, 6, 7, 0, 0, 0, 0, 0}, + {0, 3, 6, 7, 0, 0, 0, 0}, {1, 3, 6, 7, 0, 0, 0, 0}, {0, 1, 3, 6, 7, 0, 0, 0}, + {2, 3, 6, 7, 0, 0, 0, 0}, {0, 2, 3, 6, 7, 0, 0, 0}, {1, 2, 3, 6, 7, 0, 0, 0}, + {0, 1, 2, 3, 6, 7, 0, 0}, {4, 6, 7, 0, 0, 0, 0, 0}, {0, 4, 6, 7, 0, 0, 0, 0}, + {1, 4, 6, 7, 0, 0, 0, 0}, {0, 1, 4, 6, 7, 0, 0, 0}, {2, 4, 6, 7, 0, 0, 0, 0}, + {0, 2, 4, 6, 7, 0, 0, 0}, {1, 2, 4, 6, 7, 0, 0, 0}, {0, 1, 2, 4, 6, 7, 0, 0}, + {3, 4, 6, 7, 0, 0, 0, 0}, {0, 3, 4, 6, 7, 0, 0, 0}, {1, 3, 4, 6, 7, 0, 0, 0}, + {0, 1, 3, 4, 6, 7, 0, 0}, {2, 3, 4, 6, 7, 0, 0, 0}, {0, 2, 3, 4, 6, 7, 0, 0}, + {1, 2, 3, 4, 6, 7, 0, 0}, {0, 1, 2, 3, 4, 6, 7, 0}, {5, 6, 7, 0, 0, 0, 0, 0}, + {0, 5, 6, 7, 0, 0, 0, 0}, {1, 5, 6, 7, 0, 0, 0, 0}, {0, 1, 5, 6, 7, 0, 0, 0}, + {2, 5, 6, 7, 0, 0, 0, 0}, {0, 2, 5, 6, 7, 0, 0, 0}, {1, 2, 5, 6, 7, 0, 0, 0}, + {0, 1, 2, 5, 6, 7, 0, 0}, {3, 5, 6, 7, 0, 0, 0, 0}, {0, 3, 5, 6, 7, 0, 0, 0}, + {1, 3, 5, 6, 7, 0, 0, 0}, {0, 1, 3, 5, 6, 7, 0, 0}, {2, 3, 5, 6, 7, 0, 0, 0}, + {0, 2, 3, 5, 6, 7, 0, 0}, {1, 2, 3, 5, 6, 7, 0, 0}, {0, 1, 2, 3, 5, 6, 7, 0}, + {4, 5, 6, 7, 0, 0, 0, 0}, {0, 4, 5, 6, 7, 0, 0, 0}, {1, 4, 5, 6, 7, 0, 0, 0}, + {0, 1, 4, 5, 6, 7, 0, 0}, {2, 4, 5, 6, 7, 0, 0, 0}, {0, 2, 4, 5, 6, 7, 0, 0}, + {1, 2, 4, 5, 6, 7, 0, 0}, {0, 1, 2, 4, 5, 6, 7, 0}, {3, 4, 5, 6, 7, 0, 0, 0}, + {0, 3, 4, 5, 6, 7, 0, 0}, {1, 3, 4, 5, 6, 7, 0, 0}, {0, 1, 3, 4, 5, 6, 7, 0}, + {2, 3, 4, 5, 6, 7, 0, 0}, {0, 2, 3, 4, 5, 6, 7, 0}, {1, 2, 3, 4, 5, 6, 7, 0}, + {0, 1, 2, 3, 4, 5, 6, 7}}; // Find indices of nonzero numbers in an int32_t array template @@ -74,7 +156,11 @@ void find_nnz(const std::int32_t* input, std::uint16_t* out, IndexType& count_ou using vec128_t = __m128i; #define vec128_zero _mm_setzero_si128() #define vec128_set_16(a) _mm_set1_epi16(a) - #define vec128_load(a) _mm_load_si128(a) + #if (USE_SSE41) + #define vec128_load(a) _mm_cvtepu8_epi16(_mm_loadl_epi64(a)) + #else + #define vec128_load(a) _mm_load_si128(a) + #endif #define vec128_storeu(a, b) _mm_storeu_si128(a, b) #define vec128_add(a, b) _mm_add_epi16(a, b) #elif defined(USE_NEON)