mirror of
https://github.com/opelly27/Stockfish.git
synced 2026-05-20 02:47:45 +00:00
Clean up pack reordering
closes https://github.com/official-stockfish/Stockfish/pull/5802 no functional change
This commit is contained in:
committed by
Joost VandeVondele
parent
18b3465a86
commit
e7367cef0f
@@ -26,6 +26,7 @@
|
|||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <iosfwd>
|
#include <iosfwd>
|
||||||
|
#include <type_traits>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
|
|
||||||
#include "../position.h"
|
#include "../position.h"
|
||||||
@@ -145,6 +146,46 @@ using psqt_vec_t = int32x4_t;
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Returns the inverse of a permutation
|
||||||
|
template<std::size_t Len>
|
||||||
|
constexpr std::array<std::size_t, Len>
|
||||||
|
invert_permutation(const std::array<std::size_t, Len>& order) {
|
||||||
|
std::array<std::size_t, Len> inverse{};
|
||||||
|
for (std::size_t i = 0; i < order.size(); i++)
|
||||||
|
inverse[order[i]] = i;
|
||||||
|
return inverse;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Divide a byte region of size TotalSize to chunks of size
|
||||||
|
// BlockSize, and permute the blocks by a given order
|
||||||
|
template<std::size_t BlockSize, typename T, std::size_t N, std::size_t OrderSize>
|
||||||
|
void permute(T (&data)[N], const std::array<std::size_t, OrderSize>& order) {
|
||||||
|
constexpr std::size_t TotalSize = N * sizeof(T);
|
||||||
|
|
||||||
|
static_assert(TotalSize % (BlockSize * OrderSize) == 0,
|
||||||
|
"ChunkSize * OrderSize must perfectly divide TotalSize");
|
||||||
|
|
||||||
|
constexpr std::size_t ProcessChunkSize = BlockSize * OrderSize;
|
||||||
|
|
||||||
|
std::array<std::byte, ProcessChunkSize> buffer{};
|
||||||
|
|
||||||
|
std::byte* const bytes = reinterpret_cast<std::byte*>(data);
|
||||||
|
|
||||||
|
for (std::size_t i = 0; i < TotalSize; i += ProcessChunkSize)
|
||||||
|
{
|
||||||
|
std::byte* const values = &bytes[i];
|
||||||
|
|
||||||
|
for (std::size_t j = 0; j < OrderSize; j++)
|
||||||
|
{
|
||||||
|
auto* const buffer_chunk = &buffer[j * BlockSize];
|
||||||
|
auto* const value_chunk = &values[order[j] * BlockSize];
|
||||||
|
|
||||||
|
std::copy(value_chunk, value_chunk + BlockSize, buffer_chunk);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::copy(std::begin(buffer), std::end(buffer), values);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Compute optimal SIMD register count for feature transformer accumulation.
|
// Compute optimal SIMD register count for feature transformer accumulation.
|
||||||
template<IndexType TransformedFeatureWidth, IndexType HalfDimensions>
|
template<IndexType TransformedFeatureWidth, IndexType HalfDimensions>
|
||||||
@@ -223,62 +264,42 @@ class FeatureTransformer {
|
|||||||
// Size of forward propagation buffer
|
// Size of forward propagation buffer
|
||||||
static constexpr std::size_t BufferSize = OutputDimensions * sizeof(OutputType);
|
static constexpr std::size_t BufferSize = OutputDimensions * sizeof(OutputType);
|
||||||
|
|
||||||
|
// Store the order by which 128-bit blocks of a 1024-bit data must
|
||||||
|
// be permuted so that calling packus on adjacent vectors of 16-bit
|
||||||
|
// integers loaded from the data results in the pre-permutation order
|
||||||
|
static constexpr auto PackusEpi16Order = []() -> std::array<std::size_t, 8> {
|
||||||
|
#if defined(USE_AVX512)
|
||||||
|
// _mm512_packus_epi16 after permutation:
|
||||||
|
// | 0 | 2 | 4 | 6 | // Vector 0
|
||||||
|
// | 1 | 3 | 5 | 7 | // Vector 1
|
||||||
|
// | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | // Packed Result
|
||||||
|
return {0, 2, 4, 6, 1, 3, 5, 7};
|
||||||
|
#elif defined(USE_AVX2)
|
||||||
|
// _mm256_packus_epi16 after permutation:
|
||||||
|
// | 0 | 2 | | 4 | 6 | // Vector 0, 2
|
||||||
|
// | 1 | 3 | | 5 | 7 | // Vector 1, 3
|
||||||
|
// | 0 | 1 | 2 | 3 | | 4 | 5 | 6 | 7 | // Packed Result
|
||||||
|
return {0, 2, 1, 3, 4, 6, 5, 7};
|
||||||
|
#else
|
||||||
|
return {0, 1, 2, 3, 4, 5, 6, 7};
|
||||||
|
#endif
|
||||||
|
}();
|
||||||
|
|
||||||
|
static constexpr auto InversePackusEpi16Order = invert_permutation(PackusEpi16Order);
|
||||||
|
|
||||||
// Hash value embedded in the evaluation file
|
// Hash value embedded in the evaluation file
|
||||||
static constexpr std::uint32_t get_hash_value() {
|
static constexpr std::uint32_t get_hash_value() {
|
||||||
return FeatureSet::HashValue ^ (OutputDimensions * 2);
|
return FeatureSet::HashValue ^ (OutputDimensions * 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
static constexpr void order_packs([[maybe_unused]] uint64_t* v) {
|
void permute_weights() {
|
||||||
#if defined(USE_AVX512) // _mm512_packs_epi16 ordering
|
permute<16>(biases, PackusEpi16Order);
|
||||||
uint64_t tmp0 = v[2], tmp1 = v[3];
|
permute<16>(weights, PackusEpi16Order);
|
||||||
v[2] = v[8], v[3] = v[9];
|
|
||||||
v[8] = v[4], v[9] = v[5];
|
|
||||||
v[4] = tmp0, v[5] = tmp1;
|
|
||||||
tmp0 = v[6], tmp1 = v[7];
|
|
||||||
v[6] = v[10], v[7] = v[11];
|
|
||||||
v[10] = v[12], v[11] = v[13];
|
|
||||||
v[12] = tmp0, v[13] = tmp1;
|
|
||||||
#elif defined(USE_AVX2) // _mm256_packs_epi16 ordering
|
|
||||||
std::swap(v[2], v[4]);
|
|
||||||
std::swap(v[3], v[5]);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static constexpr void inverse_order_packs([[maybe_unused]] uint64_t* v) {
|
void unpermute_weights() {
|
||||||
#if defined(USE_AVX512) // Inverse _mm512_packs_epi16 ordering
|
permute<16>(biases, InversePackusEpi16Order);
|
||||||
uint64_t tmp0 = v[2], tmp1 = v[3];
|
permute<16>(weights, InversePackusEpi16Order);
|
||||||
v[2] = v[4], v[3] = v[5];
|
|
||||||
v[4] = v[8], v[5] = v[9];
|
|
||||||
v[8] = tmp0, v[9] = tmp1;
|
|
||||||
tmp0 = v[6], tmp1 = v[7];
|
|
||||||
v[6] = v[12], v[7] = v[13];
|
|
||||||
v[12] = v[10], v[13] = v[11];
|
|
||||||
v[10] = tmp0, v[11] = tmp1;
|
|
||||||
#elif defined(USE_AVX2) // Inverse _mm256_packs_epi16 ordering
|
|
||||||
std::swap(v[2], v[4]);
|
|
||||||
std::swap(v[3], v[5]);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
void permute_weights([[maybe_unused]] void (*order_fn)(uint64_t*)) {
|
|
||||||
#if defined(USE_AVX2)
|
|
||||||
#if defined(USE_AVX512)
|
|
||||||
constexpr IndexType di = 16;
|
|
||||||
#else
|
|
||||||
constexpr IndexType di = 8;
|
|
||||||
#endif
|
|
||||||
uint64_t* b = reinterpret_cast<uint64_t*>(&biases[0]);
|
|
||||||
for (IndexType i = 0; i < HalfDimensions * sizeof(BiasType) / sizeof(uint64_t); i += di)
|
|
||||||
order_fn(&b[i]);
|
|
||||||
|
|
||||||
for (IndexType j = 0; j < InputDimensions; ++j)
|
|
||||||
{
|
|
||||||
uint64_t* w = reinterpret_cast<uint64_t*>(&weights[j * HalfDimensions]);
|
|
||||||
for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / sizeof(uint64_t);
|
|
||||||
i += di)
|
|
||||||
order_fn(&w[i]);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void scale_weights(bool read) {
|
inline void scale_weights(bool read) {
|
||||||
@@ -300,7 +321,7 @@ class FeatureTransformer {
|
|||||||
read_leb_128<WeightType>(stream, weights, HalfDimensions * InputDimensions);
|
read_leb_128<WeightType>(stream, weights, HalfDimensions * InputDimensions);
|
||||||
read_leb_128<PSQTWeightType>(stream, psqtWeights, PSQTBuckets * InputDimensions);
|
read_leb_128<PSQTWeightType>(stream, psqtWeights, PSQTBuckets * InputDimensions);
|
||||||
|
|
||||||
permute_weights(inverse_order_packs);
|
permute_weights();
|
||||||
scale_weights(true);
|
scale_weights(true);
|
||||||
return !stream.fail();
|
return !stream.fail();
|
||||||
}
|
}
|
||||||
@@ -308,14 +329,14 @@ class FeatureTransformer {
|
|||||||
// Write network parameters
|
// Write network parameters
|
||||||
bool write_parameters(std::ostream& stream) {
|
bool write_parameters(std::ostream& stream) {
|
||||||
|
|
||||||
permute_weights(order_packs);
|
unpermute_weights();
|
||||||
scale_weights(false);
|
scale_weights(false);
|
||||||
|
|
||||||
write_leb_128<BiasType>(stream, biases, HalfDimensions);
|
write_leb_128<BiasType>(stream, biases, HalfDimensions);
|
||||||
write_leb_128<WeightType>(stream, weights, HalfDimensions * InputDimensions);
|
write_leb_128<WeightType>(stream, weights, HalfDimensions * InputDimensions);
|
||||||
write_leb_128<PSQTWeightType>(stream, psqtWeights, PSQTBuckets * InputDimensions);
|
write_leb_128<PSQTWeightType>(stream, psqtWeights, PSQTBuckets * InputDimensions);
|
||||||
|
|
||||||
permute_weights(inverse_order_packs);
|
permute_weights();
|
||||||
scale_weights(true);
|
scale_weights(true);
|
||||||
return !stream.fail();
|
return !stream.fail();
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user