mirror of
https://github.com/opelly27/Stockfish.git
synced 2026-05-20 14:27:45 +00:00
Merge remote-tracking branch 'upstream/master' into merge_tmp
This commit is contained in:
+1
-1
@@ -50,7 +50,7 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
|
|||||||
material.cpp misc.cpp movegen.cpp movepick.cpp pawns.cpp position.cpp psqt.cpp \
|
material.cpp misc.cpp movegen.cpp movepick.cpp pawns.cpp position.cpp psqt.cpp \
|
||||||
search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
|
search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
|
||||||
nnue/evaluate_nnue.cpp \
|
nnue/evaluate_nnue.cpp \
|
||||||
nnue/features/half_ka_v2.cpp \
|
nnue/features/half_ka_v2_hm.cpp \
|
||||||
tools/validate_training_data.cpp \
|
tools/validate_training_data.cpp \
|
||||||
tools/sfen_packer.cpp \
|
tools/sfen_packer.cpp \
|
||||||
tools/training_data_generator.cpp \
|
tools/training_data_generator.cpp \
|
||||||
|
|||||||
+1
-1
@@ -36,7 +36,7 @@ namespace Eval {
|
|||||||
// The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
|
// The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
|
||||||
// for the build process (profile-build and fishtest) to work. Do not change the
|
// for the build process (profile-build and fishtest) to work. Do not change the
|
||||||
// name of the macro, as it is used in the Makefile.
|
// name of the macro, as it is used in the Makefile.
|
||||||
#define EvalFileDefaultName "nn-46832cfbead3.nnue"
|
#define EvalFileDefaultName "nn-e8321e467bf6.nnue"
|
||||||
|
|
||||||
namespace NNUE {
|
namespace NNUE {
|
||||||
enum struct UseNNUEMode
|
enum struct UseNNUEMode
|
||||||
|
|||||||
@@ -16,31 +16,32 @@
|
|||||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
//Definition of input features HalfKAv2 of NNUE evaluation function
|
//Definition of input features HalfKAv2_hm of NNUE evaluation function
|
||||||
|
|
||||||
#include "half_ka_v2.h"
|
#include "half_ka_v2_hm.h"
|
||||||
|
|
||||||
#include "../../position.h"
|
#include "../../position.h"
|
||||||
|
|
||||||
namespace Stockfish::Eval::NNUE::Features {
|
namespace Stockfish::Eval::NNUE::Features {
|
||||||
|
|
||||||
// Orient a square according to perspective (rotates by 180 for black)
|
// Orient a square according to perspective (rotates by 180 for black)
|
||||||
inline Square HalfKAv2::orient(Color perspective, Square s) {
|
inline Square HalfKAv2_hm::orient(Color perspective, Square s, Square ksq) {
|
||||||
return Square(int(s) ^ (bool(perspective) * 56));
|
return Square(int(s) ^ (bool(perspective) * SQ_A8) ^ ((file_of(ksq) < FILE_E) * SQ_H1));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Index of a feature for a given king position and another piece on some square
|
// Index of a feature for a given king position and another piece on some square
|
||||||
inline IndexType HalfKAv2::make_index(Color perspective, Square s, Piece pc, Square ksq) {
|
inline IndexType HalfKAv2_hm::make_index(Color perspective, Square s, Piece pc, Square ksq) {
|
||||||
return IndexType(orient(perspective, s) + PieceSquareIndex[perspective][pc] + PS_NB * ksq);
|
Square o_ksq = orient(perspective, ksq, ksq);
|
||||||
|
return IndexType(orient(perspective, s, ksq) + PieceSquareIndex[perspective][pc] + PS_NB * KingBuckets[o_ksq]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get a list of indices for active features
|
// Get a list of indices for active features
|
||||||
void HalfKAv2::append_active_indices(
|
void HalfKAv2_hm::append_active_indices(
|
||||||
const Position& pos,
|
const Position& pos,
|
||||||
Color perspective,
|
Color perspective,
|
||||||
ValueListInserter<IndexType> active
|
ValueListInserter<IndexType> active
|
||||||
) {
|
) {
|
||||||
Square ksq = orient(perspective, pos.square<KING>(perspective));
|
Square ksq = pos.square<KING>(perspective);
|
||||||
Bitboard bb = pos.pieces();
|
Bitboard bb = pos.pieces();
|
||||||
while (bb)
|
while (bb)
|
||||||
{
|
{
|
||||||
@@ -52,7 +53,7 @@ namespace Stockfish::Eval::NNUE::Features {
|
|||||||
|
|
||||||
// append_changed_indices() : get a list of indices for recently changed features
|
// append_changed_indices() : get a list of indices for recently changed features
|
||||||
|
|
||||||
void HalfKAv2::append_changed_indices(
|
void HalfKAv2_hm::append_changed_indices(
|
||||||
Square ksq,
|
Square ksq,
|
||||||
StateInfo* st,
|
StateInfo* st,
|
||||||
Color perspective,
|
Color perspective,
|
||||||
@@ -60,25 +61,24 @@ namespace Stockfish::Eval::NNUE::Features {
|
|||||||
ValueListInserter<IndexType> added
|
ValueListInserter<IndexType> added
|
||||||
) {
|
) {
|
||||||
const auto& dp = st->dirtyPiece;
|
const auto& dp = st->dirtyPiece;
|
||||||
Square oriented_ksq = orient(perspective, ksq);
|
|
||||||
for (int i = 0; i < dp.dirty_num; ++i) {
|
for (int i = 0; i < dp.dirty_num; ++i) {
|
||||||
Piece pc = dp.piece[i];
|
Piece pc = dp.piece[i];
|
||||||
if (dp.from[i] != SQ_NONE)
|
if (dp.from[i] != SQ_NONE)
|
||||||
removed.push_back(make_index(perspective, dp.from[i], pc, oriented_ksq));
|
removed.push_back(make_index(perspective, dp.from[i], pc, ksq));
|
||||||
if (dp.to[i] != SQ_NONE)
|
if (dp.to[i] != SQ_NONE)
|
||||||
added.push_back(make_index(perspective, dp.to[i], pc, oriented_ksq));
|
added.push_back(make_index(perspective, dp.to[i], pc, ksq));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int HalfKAv2::update_cost(StateInfo* st) {
|
int HalfKAv2_hm::update_cost(StateInfo* st) {
|
||||||
return st->dirtyPiece.dirty_num;
|
return st->dirtyPiece.dirty_num;
|
||||||
}
|
}
|
||||||
|
|
||||||
int HalfKAv2::refresh_cost(const Position& pos) {
|
int HalfKAv2_hm::refresh_cost(const Position& pos) {
|
||||||
return pos.count<ALL_PIECES>();
|
return pos.count<ALL_PIECES>();
|
||||||
}
|
}
|
||||||
|
|
||||||
bool HalfKAv2::requires_refresh(StateInfo* st, Color perspective) {
|
bool HalfKAv2_hm::requires_refresh(StateInfo* st, Color perspective) {
|
||||||
return st->dirtyPiece.piece[0] == make_piece(perspective, KING);
|
return st->dirtyPiece.piece[0] == make_piece(perspective, KING);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -18,8 +18,8 @@
|
|||||||
|
|
||||||
//Definition of input features HalfKP of NNUE evaluation function
|
//Definition of input features HalfKP of NNUE evaluation function
|
||||||
|
|
||||||
#ifndef NNUE_FEATURES_HALF_KA_V2_H_INCLUDED
|
#ifndef NNUE_FEATURES_HALF_KA_V2_HM_H_INCLUDED
|
||||||
#define NNUE_FEATURES_HALF_KA_V2_H_INCLUDED
|
#define NNUE_FEATURES_HALF_KA_V2_HM_H_INCLUDED
|
||||||
|
|
||||||
#include "../nnue_common.h"
|
#include "../nnue_common.h"
|
||||||
|
|
||||||
@@ -32,9 +32,9 @@ namespace Stockfish {
|
|||||||
|
|
||||||
namespace Stockfish::Eval::NNUE::Features {
|
namespace Stockfish::Eval::NNUE::Features {
|
||||||
|
|
||||||
// Feature HalfKAv2: Combination of the position of own king
|
// Feature HalfKAv2_hm: Combination of the position of own king
|
||||||
// and the position of pieces
|
// and the position of pieces. Position mirrored such that king always on e..h files.
|
||||||
class HalfKAv2 {
|
class HalfKAv2_hm {
|
||||||
|
|
||||||
// unique number for each piece type on each square
|
// unique number for each piece type on each square
|
||||||
enum {
|
enum {
|
||||||
@@ -63,21 +63,32 @@ namespace Stockfish::Eval::NNUE::Features {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// Orient a square according to perspective (rotates by 180 for black)
|
// Orient a square according to perspective (rotates by 180 for black)
|
||||||
static Square orient(Color perspective, Square s);
|
static Square orient(Color perspective, Square s, Square ksq);
|
||||||
|
|
||||||
// Index of a feature for a given king position and another piece on some square
|
// Index of a feature for a given king position and another piece on some square
|
||||||
static IndexType make_index(Color perspective, Square s, Piece pc, Square ksq);
|
static IndexType make_index(Color perspective, Square s, Piece pc, Square ksq);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
// Feature name
|
// Feature name
|
||||||
static constexpr const char* Name = "HalfKAv2(Friend)";
|
static constexpr const char* Name = "HalfKAv2_hm(Friend)";
|
||||||
|
|
||||||
// Hash value embedded in the evaluation file
|
// Hash value embedded in the evaluation file
|
||||||
static constexpr std::uint32_t HashValue = 0x5f234cb8u;
|
static constexpr std::uint32_t HashValue = 0x7f234cb8u;
|
||||||
|
|
||||||
// Number of feature dimensions
|
// Number of feature dimensions
|
||||||
static constexpr IndexType Dimensions =
|
static constexpr IndexType Dimensions =
|
||||||
static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(PS_NB);
|
static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(PS_NB) / 2;
|
||||||
|
|
||||||
|
static constexpr int KingBuckets[64] = {
|
||||||
|
-1, -1, -1, -1, 31, 30, 29, 28,
|
||||||
|
-1, -1, -1, -1, 27, 26, 25, 24,
|
||||||
|
-1, -1, -1, -1, 23, 22, 21, 20,
|
||||||
|
-1, -1, -1, -1, 19, 18, 17, 16,
|
||||||
|
-1, -1, -1, -1, 15, 14, 13, 12,
|
||||||
|
-1, -1, -1, -1, 11, 10, 9, 8,
|
||||||
|
-1, -1, -1, -1, 7, 6, 5, 4,
|
||||||
|
-1, -1, -1, -1, 3, 2, 1, 0
|
||||||
|
};
|
||||||
|
|
||||||
// Maximum number of simultaneously active features.
|
// Maximum number of simultaneously active features.
|
||||||
static constexpr IndexType MaxActiveDimensions = 32;
|
static constexpr IndexType MaxActiveDimensions = 32;
|
||||||
@@ -108,4 +119,4 @@ namespace Stockfish::Eval::NNUE::Features {
|
|||||||
|
|
||||||
} // namespace Stockfish::Eval::NNUE::Features
|
} // namespace Stockfish::Eval::NNUE::Features
|
||||||
|
|
||||||
#endif // #ifndef NNUE_FEATURES_HALF_KA_V2_H_INCLUDED
|
#endif // #ifndef NNUE_FEATURES_HALF_KA_V2_HM_H_INCLUDED
|
||||||
@@ -46,6 +46,11 @@ namespace Stockfish::Eval::NNUE::Layers {
|
|||||||
#elif defined (USE_SSSE3)
|
#elif defined (USE_SSSE3)
|
||||||
static constexpr const IndexType OutputSimdWidth = SimdWidth / 4;
|
static constexpr const IndexType OutputSimdWidth = SimdWidth / 4;
|
||||||
#endif
|
#endif
|
||||||
|
#if defined (USE_AVX512)
|
||||||
|
static constexpr const IndexType InputSimdWidth = SimdWidth * 2;
|
||||||
|
#elif defined (USE_SSSE3)
|
||||||
|
static constexpr const IndexType InputSimdWidth = SimdWidth;
|
||||||
|
#endif
|
||||||
|
|
||||||
// Size of forward propagation buffer used in this layer
|
// Size of forward propagation buffer used in this layer
|
||||||
static constexpr std::size_t SelfBufferSize =
|
static constexpr std::size_t SelfBufferSize =
|
||||||
@@ -72,6 +77,15 @@ namespace Stockfish::Eval::NNUE::Layers {
|
|||||||
for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
|
for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
|
||||||
#if !defined (USE_SSSE3)
|
#if !defined (USE_SSSE3)
|
||||||
weights[i] = read_little_endian<WeightType>(stream);
|
weights[i] = read_little_endian<WeightType>(stream);
|
||||||
|
#elif defined (USE_VNNI) || defined (USE_AVX512)
|
||||||
|
if constexpr (OutputDimensions <= 8 && OutputDimensions != 1)
|
||||||
|
weights[i] = read_little_endian<WeightType>(stream);
|
||||||
|
else
|
||||||
|
weights[
|
||||||
|
(i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4 +
|
||||||
|
i / PaddedInputDimensions * 4 +
|
||||||
|
i % 4
|
||||||
|
] = read_little_endian<WeightType>(stream);
|
||||||
#else
|
#else
|
||||||
weights[
|
weights[
|
||||||
(i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4 +
|
(i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4 +
|
||||||
@@ -108,7 +122,6 @@ namespace Stockfish::Eval::NNUE::Layers {
|
|||||||
|
|
||||||
return !stream.fail();
|
return !stream.fail();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Forward propagation
|
// Forward propagation
|
||||||
const OutputType* propagate(
|
const OutputType* propagate(
|
||||||
const TransformedFeatureType* transformedFeatures, char* buffer) const {
|
const TransformedFeatureType* transformedFeatures, char* buffer) const {
|
||||||
@@ -123,6 +136,40 @@ namespace Stockfish::Eval::NNUE::Layers {
|
|||||||
return _mm512_reduce_add_epi32(sum) + bias;
|
return _mm512_reduce_add_epi32(sum) + bias;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
[[maybe_unused]] auto m512_hadd128x16_interleave = [](
|
||||||
|
__m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3) -> __m512i {
|
||||||
|
|
||||||
|
__m512i sum01a = _mm512_unpacklo_epi32(sum0, sum1);
|
||||||
|
__m512i sum01b = _mm512_unpackhi_epi32(sum0, sum1);
|
||||||
|
|
||||||
|
__m512i sum23a = _mm512_unpacklo_epi32(sum2, sum3);
|
||||||
|
__m512i sum23b = _mm512_unpackhi_epi32(sum2, sum3);
|
||||||
|
|
||||||
|
__m512i sum01 = _mm512_add_epi32(sum01a, sum01b);
|
||||||
|
__m512i sum23 = _mm512_add_epi32(sum23a, sum23b);
|
||||||
|
|
||||||
|
__m512i sum0123a = _mm512_unpacklo_epi64(sum01, sum23);
|
||||||
|
__m512i sum0123b = _mm512_unpackhi_epi64(sum01, sum23);
|
||||||
|
|
||||||
|
return _mm512_add_epi32(sum0123a, sum0123b);
|
||||||
|
};
|
||||||
|
|
||||||
|
[[maybe_unused]] auto m512_haddx4 = [m512_hadd128x16_interleave](
|
||||||
|
__m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3, __m128i bias) -> __m128i {
|
||||||
|
|
||||||
|
__m512i sum = m512_hadd128x16_interleave(sum0, sum1, sum2, sum3);
|
||||||
|
|
||||||
|
__m256i sum256lo = _mm512_castsi512_si256(sum);
|
||||||
|
__m256i sum256hi = _mm512_extracti64x4_epi64(sum, 1);
|
||||||
|
|
||||||
|
sum256lo = _mm256_add_epi32(sum256lo, sum256hi);
|
||||||
|
|
||||||
|
__m128i sum128lo = _mm256_castsi256_si128(sum256lo);
|
||||||
|
__m128i sum128hi = _mm256_extracti128_si256(sum256lo, 1);
|
||||||
|
|
||||||
|
return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias);
|
||||||
|
};
|
||||||
|
|
||||||
[[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) {
|
[[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) {
|
||||||
#if defined (USE_VNNI)
|
#if defined (USE_VNNI)
|
||||||
acc = _mm512_dpbusd_epi32(acc, a, b);
|
acc = _mm512_dpbusd_epi32(acc, a, b);
|
||||||
@@ -133,6 +180,19 @@ namespace Stockfish::Eval::NNUE::Layers {
|
|||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
|
[[maybe_unused]] auto m512_add_dpbusd_epi32x2 = [=](__m512i& acc, __m512i a0, __m512i b0, __m512i a1, __m512i b1) {
|
||||||
|
#if defined (USE_VNNI)
|
||||||
|
acc = _mm512_dpbusd_epi32(acc, a0, b0);
|
||||||
|
acc = _mm512_dpbusd_epi32(acc, a1, b1);
|
||||||
|
#else
|
||||||
|
__m512i product0 = _mm512_maddubs_epi16(a0, b0);
|
||||||
|
__m512i product1 = _mm512_maddubs_epi16(a1, b1);
|
||||||
|
product0 = _mm512_adds_epi16(product0, product1);
|
||||||
|
product0 = _mm512_madd_epi16(product0, Ones512);
|
||||||
|
acc = _mm512_add_epi32(acc, product0);
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
[[maybe_unused]] auto m512_add_dpbusd_epi32x4 = [=](__m512i& acc, __m512i a0, __m512i b0, __m512i a1, __m512i b1,
|
[[maybe_unused]] auto m512_add_dpbusd_epi32x4 = [=](__m512i& acc, __m512i a0, __m512i b0, __m512i a1, __m512i b1,
|
||||||
__m512i a2, __m512i b2, __m512i a3, __m512i b3) {
|
__m512i a2, __m512i b2, __m512i a3, __m512i b3) {
|
||||||
#if defined (USE_VNNI)
|
#if defined (USE_VNNI)
|
||||||
@@ -165,6 +225,18 @@ namespace Stockfish::Eval::NNUE::Layers {
|
|||||||
return _mm_cvtsi128_si32(sum128) + bias;
|
return _mm_cvtsi128_si32(sum128) + bias;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
[[maybe_unused]] auto m256_haddx4 = [](__m256i sum0, __m256i sum1, __m256i sum2, __m256i sum3, __m128i bias) -> __m128i {
|
||||||
|
sum0 = _mm256_hadd_epi32(sum0, sum1);
|
||||||
|
sum2 = _mm256_hadd_epi32(sum2, sum3);
|
||||||
|
|
||||||
|
sum0 = _mm256_hadd_epi32(sum0, sum2);
|
||||||
|
|
||||||
|
__m128i sum128lo = _mm256_castsi256_si128(sum0);
|
||||||
|
__m128i sum128hi = _mm256_extracti128_si256(sum0, 1);
|
||||||
|
|
||||||
|
return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias);
|
||||||
|
};
|
||||||
|
|
||||||
[[maybe_unused]] auto m256_add_dpbusd_epi32 = [=](__m256i& acc, __m256i a, __m256i b) {
|
[[maybe_unused]] auto m256_add_dpbusd_epi32 = [=](__m256i& acc, __m256i a, __m256i b) {
|
||||||
#if defined (USE_VNNI)
|
#if defined (USE_VNNI)
|
||||||
acc = _mm256_dpbusd_epi32(acc, a, b);
|
acc = _mm256_dpbusd_epi32(acc, a, b);
|
||||||
@@ -175,6 +247,19 @@ namespace Stockfish::Eval::NNUE::Layers {
|
|||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
|
[[maybe_unused]] auto m256_add_dpbusd_epi32x2 = [=](__m256i& acc, __m256i a0, __m256i b0, __m256i a1, __m256i b1) {
|
||||||
|
#if defined (USE_VNNI)
|
||||||
|
acc = _mm256_dpbusd_epi32(acc, a0, b0);
|
||||||
|
acc = _mm256_dpbusd_epi32(acc, a1, b1);
|
||||||
|
#else
|
||||||
|
__m256i product0 = _mm256_maddubs_epi16(a0, b0);
|
||||||
|
__m256i product1 = _mm256_maddubs_epi16(a1, b1);
|
||||||
|
product0 = _mm256_adds_epi16(product0, product1);
|
||||||
|
product0 = _mm256_madd_epi16(product0, Ones256);
|
||||||
|
acc = _mm256_add_epi32(acc, product0);
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
[[maybe_unused]] auto m256_add_dpbusd_epi32x4 = [=](__m256i& acc, __m256i a0, __m256i b0, __m256i a1, __m256i b1,
|
[[maybe_unused]] auto m256_add_dpbusd_epi32x4 = [=](__m256i& acc, __m256i a0, __m256i b0, __m256i a1, __m256i b1,
|
||||||
__m256i a2, __m256i b2, __m256i a3, __m256i b3) {
|
__m256i a2, __m256i b2, __m256i a3, __m256i b3) {
|
||||||
#if defined (USE_VNNI)
|
#if defined (USE_VNNI)
|
||||||
@@ -206,12 +291,27 @@ namespace Stockfish::Eval::NNUE::Layers {
|
|||||||
return _mm_cvtsi128_si32(sum) + bias;
|
return _mm_cvtsi128_si32(sum) + bias;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
[[maybe_unused]] auto m128_haddx4 = [](__m128i sum0, __m128i sum1, __m128i sum2, __m128i sum3, __m128i bias) -> __m128i {
|
||||||
|
sum0 = _mm_hadd_epi32(sum0, sum1);
|
||||||
|
sum2 = _mm_hadd_epi32(sum2, sum3);
|
||||||
|
sum0 = _mm_hadd_epi32(sum0, sum2);
|
||||||
|
return _mm_add_epi32(sum0, bias);
|
||||||
|
};
|
||||||
|
|
||||||
[[maybe_unused]] auto m128_add_dpbusd_epi32 = [=](__m128i& acc, __m128i a, __m128i b) {
|
[[maybe_unused]] auto m128_add_dpbusd_epi32 = [=](__m128i& acc, __m128i a, __m128i b) {
|
||||||
__m128i product0 = _mm_maddubs_epi16(a, b);
|
__m128i product0 = _mm_maddubs_epi16(a, b);
|
||||||
product0 = _mm_madd_epi16(product0, Ones128);
|
product0 = _mm_madd_epi16(product0, Ones128);
|
||||||
acc = _mm_add_epi32(acc, product0);
|
acc = _mm_add_epi32(acc, product0);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
[[maybe_unused]] auto m128_add_dpbusd_epi32x2 = [=](__m128i& acc, __m128i a0, __m128i b0, __m128i a1, __m128i b1) {
|
||||||
|
__m128i product0 = _mm_maddubs_epi16(a0, b0);
|
||||||
|
__m128i product1 = _mm_maddubs_epi16(a1, b1);
|
||||||
|
product0 = _mm_adds_epi16(product0, product1);
|
||||||
|
product0 = _mm_madd_epi16(product0, Ones128);
|
||||||
|
acc = _mm_add_epi32(acc, product0);
|
||||||
|
};
|
||||||
|
|
||||||
[[maybe_unused]] auto m128_add_dpbusd_epi32x4 = [=](__m128i& acc, __m128i a0, __m128i b0, __m128i a1, __m128i b1,
|
[[maybe_unused]] auto m128_add_dpbusd_epi32x4 = [=](__m128i& acc, __m128i a0, __m128i b0, __m128i a1, __m128i b1,
|
||||||
__m128i a2, __m128i b2, __m128i a3, __m128i b3) {
|
__m128i a2, __m128i b2, __m128i a3, __m128i b3) {
|
||||||
__m128i product0 = _mm_maddubs_epi16(a0, b0);
|
__m128i product0 = _mm_maddubs_epi16(a0, b0);
|
||||||
@@ -231,33 +331,116 @@ namespace Stockfish::Eval::NNUE::Layers {
|
|||||||
using vec_t = __m512i;
|
using vec_t = __m512i;
|
||||||
#define vec_setzero _mm512_setzero_si512
|
#define vec_setzero _mm512_setzero_si512
|
||||||
#define vec_set_32 _mm512_set1_epi32
|
#define vec_set_32 _mm512_set1_epi32
|
||||||
auto& vec_add_dpbusd_32 = m512_add_dpbusd_epi32;
|
[[maybe_unused]] auto& vec_add_dpbusd_32 = m512_add_dpbusd_epi32;
|
||||||
auto& vec_add_dpbusd_32x4 = m512_add_dpbusd_epi32x4;
|
[[maybe_unused]] auto& vec_add_dpbusd_32x2 = m512_add_dpbusd_epi32x2;
|
||||||
auto& vec_hadd = m512_hadd;
|
[[maybe_unused]] auto& vec_add_dpbusd_32x4 = m512_add_dpbusd_epi32x4;
|
||||||
|
[[maybe_unused]] auto& vec_hadd = m512_hadd;
|
||||||
|
[[maybe_unused]] auto& vec_haddx4 = m512_haddx4;
|
||||||
#elif defined (USE_AVX2)
|
#elif defined (USE_AVX2)
|
||||||
using vec_t = __m256i;
|
using vec_t = __m256i;
|
||||||
#define vec_setzero _mm256_setzero_si256
|
#define vec_setzero _mm256_setzero_si256
|
||||||
#define vec_set_32 _mm256_set1_epi32
|
#define vec_set_32 _mm256_set1_epi32
|
||||||
auto& vec_add_dpbusd_32 = m256_add_dpbusd_epi32;
|
[[maybe_unused]] auto& vec_add_dpbusd_32 = m256_add_dpbusd_epi32;
|
||||||
auto& vec_add_dpbusd_32x4 = m256_add_dpbusd_epi32x4;
|
[[maybe_unused]] auto& vec_add_dpbusd_32x2 = m256_add_dpbusd_epi32x2;
|
||||||
auto& vec_hadd = m256_hadd;
|
[[maybe_unused]] auto& vec_add_dpbusd_32x4 = m256_add_dpbusd_epi32x4;
|
||||||
|
[[maybe_unused]] auto& vec_hadd = m256_hadd;
|
||||||
|
[[maybe_unused]] auto& vec_haddx4 = m256_haddx4;
|
||||||
#elif defined (USE_SSSE3)
|
#elif defined (USE_SSSE3)
|
||||||
using vec_t = __m128i;
|
using vec_t = __m128i;
|
||||||
#define vec_setzero _mm_setzero_si128
|
#define vec_setzero _mm_setzero_si128
|
||||||
#define vec_set_32 _mm_set1_epi32
|
#define vec_set_32 _mm_set1_epi32
|
||||||
auto& vec_add_dpbusd_32 = m128_add_dpbusd_epi32;
|
[[maybe_unused]] auto& vec_add_dpbusd_32 = m128_add_dpbusd_epi32;
|
||||||
auto& vec_add_dpbusd_32x4 = m128_add_dpbusd_epi32x4;
|
[[maybe_unused]] auto& vec_add_dpbusd_32x2 = m128_add_dpbusd_epi32x2;
|
||||||
auto& vec_hadd = m128_hadd;
|
[[maybe_unused]] auto& vec_add_dpbusd_32x4 = m128_add_dpbusd_epi32x4;
|
||||||
|
[[maybe_unused]] auto& vec_hadd = m128_hadd;
|
||||||
|
[[maybe_unused]] auto& vec_haddx4 = m128_haddx4;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined (USE_SSSE3)
|
#if defined (USE_SSSE3)
|
||||||
const auto output = reinterpret_cast<OutputType*>(buffer);
|
const auto output = reinterpret_cast<OutputType*>(buffer);
|
||||||
const auto inputVector = reinterpret_cast<const vec_t*>(input);
|
const auto inputVector = reinterpret_cast<const vec_t*>(input);
|
||||||
|
#endif
|
||||||
|
|
||||||
static_assert(OutputDimensions % OutputSimdWidth == 0 || OutputDimensions == 1);
|
#if defined (USE_VNNI) || defined (USE_AVX512)
|
||||||
|
|
||||||
|
static_assert(OutputDimensions == 1 || OutputDimensions % 4 == 0);
|
||||||
|
|
||||||
// OutputDimensions is either 1 or a multiple of SimdWidth
|
// OutputDimensions is either 1 or a multiple of SimdWidth
|
||||||
// because then it is also an input dimension.
|
// because then it is also an input dimension.
|
||||||
|
if constexpr (OutputDimensions <= 8 && OutputDimensions != 1)
|
||||||
|
{
|
||||||
|
constexpr IndexType NumChunks = PaddedInputDimensions / InputSimdWidth;
|
||||||
|
|
||||||
|
static_assert(NumChunks % 2 == 0);
|
||||||
|
|
||||||
|
const auto input_vec = reinterpret_cast<const vec_t*>(input);
|
||||||
|
const auto bias_vec = reinterpret_cast<const __m128i*>(biases);
|
||||||
|
auto out_vec = reinterpret_cast<__m128i*>(output);
|
||||||
|
|
||||||
|
vec_t regs[OutputDimensions];
|
||||||
|
for (IndexType k = 0; k < OutputDimensions; ++k)
|
||||||
|
regs[k] = vec_setzero();
|
||||||
|
|
||||||
|
for (IndexType i = 0; i < NumChunks / 2; ++i)
|
||||||
|
{
|
||||||
|
const vec_t in0 = input_vec[i * 2 + 0];
|
||||||
|
const vec_t in1 = input_vec[i * 2 + 1];
|
||||||
|
for (IndexType k = 0; k < OutputDimensions; ++k)
|
||||||
|
{
|
||||||
|
const vec_t w0 = reinterpret_cast<const vec_t*>(&weights[k * PaddedInputDimensions])[i * 2 + 0];
|
||||||
|
const vec_t w1 = reinterpret_cast<const vec_t*>(&weights[k * PaddedInputDimensions])[i * 2 + 1];
|
||||||
|
vec_add_dpbusd_32(regs[k], in0, w0);
|
||||||
|
vec_add_dpbusd_32(regs[k], in1, w1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (IndexType k = 0; k < OutputDimensions / 4; ++k)
|
||||||
|
{
|
||||||
|
out_vec[k] = vec_haddx4(
|
||||||
|
regs[k * 4 + 0],
|
||||||
|
regs[k * 4 + 1],
|
||||||
|
regs[k * 4 + 2],
|
||||||
|
regs[k * 4 + 3],
|
||||||
|
bias_vec[k]
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if constexpr (InputDimensions == 8)
|
||||||
|
{
|
||||||
|
const auto input32 = reinterpret_cast<const std::int32_t*>(input);
|
||||||
|
__m256i* outptr = reinterpret_cast<__m256i*>(output);
|
||||||
|
std::memcpy(output, biases, OutputDimensions * sizeof(OutputType));
|
||||||
|
|
||||||
|
const __m256i in0 = _mm256_set1_epi32(input32[0]);
|
||||||
|
const __m256i in1 = _mm256_set1_epi32(input32[1]);
|
||||||
|
const auto col0 = reinterpret_cast<const __m256i*>(&weights[0]);
|
||||||
|
const auto col1 = reinterpret_cast<const __m256i*>(&weights[OutputDimensions * 4]);
|
||||||
|
for (IndexType j = 0; j * 8 < OutputDimensions; ++j)
|
||||||
|
m256_add_dpbusd_epi32x2(outptr[j], in0, col0[j], in1, col1[j]);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
|
||||||
|
#elif defined (USE_SSSE3)
|
||||||
|
|
||||||
|
if constexpr (OutputDimensions % OutputSimdWidth == 0 && InputDimensions == 8)
|
||||||
|
{
|
||||||
|
const auto input32 = reinterpret_cast<const std::int32_t*>(input);
|
||||||
|
vec_t* outptr = reinterpret_cast<vec_t*>(output);
|
||||||
|
std::memcpy(output, biases, OutputDimensions * sizeof(OutputType));
|
||||||
|
|
||||||
|
const vec_t in0 = vec_set_32(input32[0]);
|
||||||
|
const vec_t in1 = vec_set_32(input32[1]);
|
||||||
|
const auto col0 = reinterpret_cast<const vec_t*>(&weights[0]);
|
||||||
|
const auto col1 = reinterpret_cast<const vec_t*>(&weights[OutputDimensions * 4]);
|
||||||
|
for (IndexType j = 0; j * OutputSimdWidth < OutputDimensions; ++j)
|
||||||
|
vec_add_dpbusd_32x2(outptr[j], in0, col0[j], in1, col1[j]);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined (USE_SSSE3)
|
||||||
|
|
||||||
if constexpr (OutputDimensions % OutputSimdWidth == 0)
|
if constexpr (OutputDimensions % OutputSimdWidth == 0)
|
||||||
{
|
{
|
||||||
static_assert(InputDimensions % 16 == 0);
|
static_assert(InputDimensions % 16 == 0);
|
||||||
@@ -337,8 +520,8 @@ namespace Stockfish::Eval::NNUE::Layers {
|
|||||||
|
|
||||||
#if defined(USE_SSE2)
|
#if defined(USE_SSE2)
|
||||||
// At least a multiple of 16, with SSE2.
|
// At least a multiple of 16, with SSE2.
|
||||||
static_assert(InputDimensions % SimdWidth == 0);
|
static_assert(PaddedInputDimensions % SimdWidth == 0);
|
||||||
constexpr IndexType NumChunks = InputDimensions / SimdWidth;
|
constexpr IndexType NumChunks = PaddedInputDimensions / SimdWidth;
|
||||||
const __m128i Zeros = _mm_setzero_si128();
|
const __m128i Zeros = _mm_setzero_si128();
|
||||||
const auto inputVector = reinterpret_cast<const __m128i*>(input);
|
const auto inputVector = reinterpret_cast<const __m128i*>(input);
|
||||||
|
|
||||||
@@ -349,8 +532,8 @@ namespace Stockfish::Eval::NNUE::Layers {
|
|||||||
const auto inputVector = reinterpret_cast<const __m64*>(input);
|
const auto inputVector = reinterpret_cast<const __m64*>(input);
|
||||||
|
|
||||||
#elif defined(USE_NEON)
|
#elif defined(USE_NEON)
|
||||||
static_assert(InputDimensions % SimdWidth == 0);
|
static_assert(PaddedInputDimensions % SimdWidth == 0);
|
||||||
constexpr IndexType NumChunks = InputDimensions / SimdWidth;
|
constexpr IndexType NumChunks = PaddedInputDimensions / SimdWidth;
|
||||||
const auto inputVector = reinterpret_cast<const int8x8_t*>(input);
|
const auto inputVector = reinterpret_cast<const int8x8_t*>(input);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@@ -423,6 +606,13 @@ namespace Stockfish::Eval::NNUE::Layers {
|
|||||||
_mm_empty();
|
_mm_empty();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if (!defined (USE_SSSE3) && defined (USE_SSE2)) || defined (USE_NEON)
|
||||||
|
static_assert(SimdWidth <= 16, "Otherwise we run outside of the padding for the output.");
|
||||||
|
if constexpr (SimdWidth > OutputDimensions && OutputDimensions != 1)
|
||||||
|
for (IndexType i = OutputDimensions; i < SimdWidth; ++i)
|
||||||
|
output[i] = 0;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return output;
|
return output;
|
||||||
|
|||||||
@@ -23,7 +23,7 @@
|
|||||||
|
|
||||||
#include "nnue_common.h"
|
#include "nnue_common.h"
|
||||||
|
|
||||||
#include "features/half_ka_v2.h"
|
#include "features/half_ka_v2_hm.h"
|
||||||
|
|
||||||
#include "layers/input_slice.h"
|
#include "layers/input_slice.h"
|
||||||
#include "layers/affine_transform.h"
|
#include "layers/affine_transform.h"
|
||||||
@@ -32,10 +32,10 @@
|
|||||||
namespace Stockfish::Eval::NNUE {
|
namespace Stockfish::Eval::NNUE {
|
||||||
|
|
||||||
// Input features used in evaluation function
|
// Input features used in evaluation function
|
||||||
using FeatureSet = Features::HalfKAv2;
|
using FeatureSet = Features::HalfKAv2_hm;
|
||||||
|
|
||||||
// Number of input feature dimensions after conversion
|
// Number of input feature dimensions after conversion
|
||||||
constexpr IndexType TransformedFeatureDimensions = 512;
|
constexpr IndexType TransformedFeatureDimensions = 1024;
|
||||||
constexpr IndexType PSQTBuckets = 8;
|
constexpr IndexType PSQTBuckets = 8;
|
||||||
constexpr IndexType LayerStacks = 8;
|
constexpr IndexType LayerStacks = 8;
|
||||||
|
|
||||||
@@ -43,7 +43,7 @@ namespace Stockfish::Eval::NNUE {
|
|||||||
|
|
||||||
// Define network structure
|
// Define network structure
|
||||||
using InputLayer = InputSlice<TransformedFeatureDimensions * 2>;
|
using InputLayer = InputSlice<TransformedFeatureDimensions * 2>;
|
||||||
using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 16>>;
|
using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 8>>;
|
||||||
using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
|
using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
|
||||||
using OutputLayer = AffineTransform<HiddenLayer2, 1>;
|
using OutputLayer = AffineTransform<HiddenLayer2, 1>;
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user