Merge remote-tracking branch 'remotes/official/master' into merge

This commit is contained in:
noobpwnftw
2020-11-28 06:19:16 +08:00
16 changed files with 1086 additions and 988 deletions
+2 -1
View File
@@ -44,6 +44,7 @@ Daniel Dugovic (ddugovic)
Dariusz Orzechowski (dorzechowski) Dariusz Orzechowski (dorzechowski)
David Zar David Zar
Daylen Yang (daylen) Daylen Yang (daylen)
Deshawn Mohan-Smith (GoldenRare)
DiscanX DiscanX
Dominik Schlösser (domschl) Dominik Schlösser (domschl)
double-beep double-beep
@@ -64,7 +65,6 @@ Gary Heckman (gheckman)
George Sobala (gsobala) George Sobala (gsobala)
gguliash gguliash
Gian-Carlo Pascutto (gcp) Gian-Carlo Pascutto (gcp)
Deshawn Mohan-Smith (GoldenRare)
Gontran Lemaire (gonlem) Gontran Lemaire (gonlem)
Goodkov Vasiliy Aleksandrovich (goodkov) Goodkov Vasiliy Aleksandrovich (goodkov)
Gregor Cramer Gregor Cramer
@@ -112,6 +112,7 @@ Mark Tenzer (31m059)
marotear marotear
Matthew Lai (matthewlai) Matthew Lai (matthewlai)
Matthew Sullivan (Matt14916) Matthew Sullivan (Matt14916)
Maxim Molchanov (Maxim)
Michael An (man) Michael An (man)
Michael Byrne (MichaelB7) Michael Byrne (MichaelB7)
Michael Chaly (Vizvezdenec) Michael Chaly (Vizvezdenec)
+1 -1
View File
@@ -41,7 +41,7 @@ BINDIR = $(PREFIX)/bin
### Built-in benchmark for pgo-builds ### Built-in benchmark for pgo-builds
PGO_TRAINING_DATA_FILE = pgo_training_data.bin PGO_TRAINING_DATA_FILE = pgo_training_data.bin
PGOBENCH = ./$(EXE) bench PGOBENCH = ./$(EXE) bench
PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 1000 output_file_name $(PGO_TRAINING_DATA_FILE) PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 1000 sfen_format bin output_file_name $(PGO_TRAINING_DATA_FILE)
### Source and object files ### Source and object files
SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \
+11 -9
View File
@@ -84,11 +84,11 @@ using namespace Trace;
namespace { namespace {
// Threshold for lazy and space evaluation // Threshold for lazy and space evaluation
constexpr Value LazyThreshold1 = Value(1400); constexpr Value LazyThreshold1 = Value(1565);
constexpr Value LazyThreshold2 = Value(1300); constexpr Value LazyThreshold2 = Value(1102);
constexpr Value SpaceThreshold = Value(12222); constexpr Value SpaceThreshold = Value(11551);
constexpr Value NNUEThreshold1 = Value(550); constexpr Value NNUEThreshold1 = Value(682);
constexpr Value NNUEThreshold2 = Value(150); constexpr Value NNUEThreshold2 = Value(176);
// KingAttackWeights[PieceType] contains king attack weights by piece type // KingAttackWeights[PieceType] contains king attack weights by piece type
constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 81, 52, 44, 10 }; constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 81, 52, 44, 10 };
@@ -930,7 +930,7 @@ Value Eval::evaluate(const Position& pos) {
{ {
// Scale and shift NNUE for compatibility with search and classical evaluation // Scale and shift NNUE for compatibility with search and classical evaluation
auto adjusted_NNUE = [&](){ auto adjusted_NNUE = [&](){
int mat = pos.non_pawn_material() + PieceValue[MG][PAWN] * pos.count<PAWN>(); int mat = pos.non_pawn_material() + PawnValueMg * pos.count<PAWN>();
return NNUE::evaluate(pos) * (720 + mat / 32) / 1024 + Tempo; return NNUE::evaluate(pos) * (720 + mat / 32) / 1024 + Tempo;
}; };
@@ -940,13 +940,15 @@ Value Eval::evaluate(const Position& pos) {
bool largePsq = psq * 16 > (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50; bool largePsq = psq * 16 > (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50;
bool classical = largePsq || (psq > PawnValueMg / 4 && !(pos.this_thread()->nodes & 0xB)); bool classical = largePsq || (psq > PawnValueMg / 4 && !(pos.this_thread()->nodes & 0xB));
v = classical ? Evaluation<NO_TRACE>(pos).value() : adjusted_NNUE(); bool strongClassical = pos.non_pawn_material() < 2 * RookValueMg && pos.count<PAWN>() < 2;
v = classical || strongClassical ? Evaluation<NO_TRACE>(pos).value() : adjusted_NNUE();
// If the classical eval is small and imbalance large, use NNUE nevertheless. // If the classical eval is small and imbalance large, use NNUE nevertheless.
// For the case of opposite colored bishops, switch to NNUE eval with // For the case of opposite colored bishops, switch to NNUE eval with
// small probability if the classical eval is less than the threshold. // small probability if the classical eval is less than the threshold.
if ( largePsq if ( largePsq && !strongClassical
&& (abs(v) * 16 < NNUEThreshold2 * r50 && ( abs(v) * 16 < NNUEThreshold2 * r50
|| ( pos.opposite_bishops() || ( pos.opposite_bishops()
&& abs(v) * 16 < (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50 && abs(v) * 16 < (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50
&& !(pos.this_thread()->nodes & 0xB)))) && !(pos.this_thread()->nodes & 0xB))))
+1 -2
View File
@@ -585,11 +585,10 @@ namespace CommandLine {
string argv0; // path+name of the executable binary, as given by argv[0] string argv0; // path+name of the executable binary, as given by argv[0]
string binaryDirectory; // path of the executable directory string binaryDirectory; // path of the executable directory
string workingDirectory; // path of the working directory string workingDirectory; // path of the working directory
string pathSeparator; // Separator for our current OS
void init(int argc, char* argv[]) { void init(int argc, char* argv[]) {
(void)argc; (void)argc;
string separator; string pathSeparator;
// extract the path+name of the executable binary // extract the path+name of the executable binary
argv0 = argv[0]; argv0 = argv[0];
+21 -38
View File
@@ -102,7 +102,6 @@ namespace Eval::NNUE {
void initialize(LargePagePtr<T>& pointer) { void initialize(LargePagePtr<T>& pointer) {
static_assert(alignof(T) <= 4096, "aligned_large_pages_alloc() may fail for such a big alignment requirement of T"); static_assert(alignof(T) <= 4096, "aligned_large_pages_alloc() may fail for such a big alignment requirement of T");
pointer.reset(reinterpret_cast<T*>(aligned_large_pages_alloc(sizeof(T)))); pointer.reset(reinterpret_cast<T*>(aligned_large_pages_alloc(sizeof(T))));
std::memset(pointer.get(), 0, sizeof(T)); std::memset(pointer.get(), 0, sizeof(T));
} }
@@ -113,10 +112,7 @@ namespace Eval::NNUE {
std::uint32_t header; std::uint32_t header;
header = read_little_endian<std::uint32_t>(stream); header = read_little_endian<std::uint32_t>(stream);
if (!stream || header != T::GetHashValue()) return false;
if (!stream || header != T::GetHashValue())
return false;
return reference.ReadParameters(stream); return reference.ReadParameters(stream);
} }
@@ -155,13 +151,9 @@ namespace Eval::NNUE {
version = read_little_endian<std::uint32_t>(stream); version = read_little_endian<std::uint32_t>(stream);
*hash_value = read_little_endian<std::uint32_t>(stream); *hash_value = read_little_endian<std::uint32_t>(stream);
size = read_little_endian<std::uint32_t>(stream); size = read_little_endian<std::uint32_t>(stream);
if (!stream || version != kVersion) return false;
if (!stream || version != kVersion)
return false;
architecture->resize(size); architecture->resize(size);
stream.read(&(*architecture)[0], size); stream.read(&(*architecture)[0], size);
return !stream.fail(); return !stream.fail();
} }
@@ -185,20 +177,13 @@ namespace Eval::NNUE {
std::uint32_t hash_value; std::uint32_t hash_value;
std::string architecture; std::string architecture;
if (!read_header(stream, &hash_value, &architecture)) if (!read_header(stream, &hash_value, &architecture)) return false;
return false; if (hash_value != kHashValue) return false;
if (!Detail::ReadParameters(stream, *feature_transformer)) return false;
if (hash_value != kHashValue) if (!Detail::ReadParameters(stream, *network)) return false;
return false;
if (!Detail::ReadParameters(stream, *feature_transformer))
return false;
if (!Detail::ReadParameters(stream, *network))
return false;
return stream && stream.peek() == std::ios::traits_type::eof(); return stream && stream.peek() == std::ios::traits_type::eof();
} }
// write evaluation function parameters // write evaluation function parameters
bool WriteParameters(std::ostream& stream) { bool WriteParameters(std::ostream& stream) {
@@ -212,7 +197,8 @@ namespace Eval::NNUE {
return false; return false;
return !stream.fail(); return !stream.fail();
} }
// Evaluation function. Perform differential calculation. // Evaluation function. Perform differential calculation.
Value evaluate(const Position& pos) { Value evaluate(const Position& pos) {
@@ -238,8 +224,6 @@ namespace Eval::NNUE {
ASSERT_ALIGNED(buffer, alignment); ASSERT_ALIGNED(buffer, alignment);
feature_transformer->Transform(pos, transformed_features); feature_transformer->Transform(pos, transformed_features);
const auto output = network->Propagate(transformed_features, buffer); const auto output = network->Propagate(transformed_features, buffer);
return static_cast<Value>(output[0] / FV_SCALE); return static_cast<Value>(output[0] / FV_SCALE);
@@ -249,13 +233,12 @@ namespace Eval::NNUE {
bool load_eval(std::string name, std::istream& stream) { bool load_eval(std::string name, std::istream& stream) {
initialize(); initialize();
fileName = name; fileName = name;
return ReadParameters(stream); return ReadParameters(stream);
} }
static UseNNUEMode nnue_mode_from_option(const UCI::Option& mode) static UseNNUEMode nnue_mode_from_option(const UCI::Option& mode)
{ {
if (mode == "false") if (mode == "false")
return UseNNUEMode::False; return UseNNUEMode::False;
else if (mode == "true") else if (mode == "true")
@@ -264,9 +247,9 @@ namespace Eval::NNUE {
return UseNNUEMode::Pure; return UseNNUEMode::Pure;
return UseNNUEMode::False; return UseNNUEMode::False;
} }
void init() { void init() {
useNNUE = nnue_mode_from_option(Options["Use NNUE"]); useNNUE = nnue_mode_from_option(Options["Use NNUE"]);
@@ -306,10 +289,10 @@ namespace Eval::NNUE {
#undef stringify2 #undef stringify2
#undef stringify #undef stringify
} }
/// NNUE::verify() verifies that the last net used was loaded successfully /// NNUE::verify() verifies that the last net used was loaded successfully
void verify_eval_file_loaded() { void verify_eval_file_loaded() {
std::string eval_file = std::string(Options["EvalFile"]); std::string eval_file = std::string(Options["EvalFile"]);
@@ -337,10 +320,10 @@ namespace Eval::NNUE {
sync_cout << "info string NNUE evaluation using " << eval_file << " enabled" << sync_endl; sync_cout << "info string NNUE evaluation using " << eval_file << " enabled" << sync_endl;
else else
sync_cout << "info string classical evaluation enabled" << sync_endl; sync_cout << "info string classical evaluation enabled" << sync_endl;
} }
/// In training we override eval file so this is useful. /// In training we override eval file so this is useful.
void verify_any_net_loaded() { void verify_any_net_loaded() {
if (!Options["SkipLoadingEval"] && useNNUE != UseNNUEMode::False && eval_file_loaded.empty()) if (!Options["SkipLoadingEval"] && useNNUE != UseNNUEMode::False && eval_file_loaded.empty())
{ {
@@ -364,6 +347,6 @@ namespace Eval::NNUE {
sync_cout << "info string NNUE evaluation using " << eval_file_loaded << " enabled" << sync_endl; sync_cout << "info string NNUE evaluation using " << eval_file_loaded << " enabled" << sync_endl;
else else
sync_cout << "info string classical evaluation enabled" << sync_endl; sync_cout << "info string classical evaluation enabled" << sync_endl;
} }
} // namespace Eval::NNUE } // namespace Eval::NNUE
+2 -1
View File
@@ -16,6 +16,8 @@
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
// header used in NNUE evaluation function
#ifndef NNUE_EVALUATE_NNUE_H_INCLUDED #ifndef NNUE_EVALUATE_NNUE_H_INCLUDED
#define NNUE_EVALUATE_NNUE_H_INCLUDED #define NNUE_EVALUATE_NNUE_H_INCLUDED
@@ -25,7 +27,6 @@
#include <memory> #include <memory>
// header used in NNUE evaluation function
namespace Eval::NNUE { namespace Eval::NNUE {
enum struct UseNNUEMode enum struct UseNNUEMode
-1
View File
@@ -22,7 +22,6 @@
#define NNUE_FEATURE_SET_H_INCLUDED #define NNUE_FEATURE_SET_H_INCLUDED
#include "features_common.h" #include "features_common.h"
#include <array> #include <array>
namespace Eval::NNUE::Features { namespace Eval::NNUE::Features {
+2 -3
View File
@@ -21,9 +21,8 @@
#ifndef NNUE_FEATURES_COMMON_H_INCLUDED #ifndef NNUE_FEATURES_COMMON_H_INCLUDED
#define NNUE_FEATURES_COMMON_H_INCLUDED #define NNUE_FEATURES_COMMON_H_INCLUDED
#include "evaluate.h" #include "../../evaluate.h"
#include "../nnue_common.h"
#include "nnue/nnue_common.h"
namespace Eval::NNUE::Features { namespace Eval::NNUE::Features {
+5 -5
View File
@@ -21,9 +21,8 @@
#ifndef NNUE_FEATURES_INDEX_LIST_H_INCLUDED #ifndef NNUE_FEATURES_INDEX_LIST_H_INCLUDED
#define NNUE_FEATURES_INDEX_LIST_H_INCLUDED #define NNUE_FEATURES_INDEX_LIST_H_INCLUDED
#include "position.h" #include "../../position.h"
#include "../nnue_architecture.h"
#include "nnue/nnue_architecture.h"
namespace Eval::NNUE::Features { namespace Eval::NNUE::Features {
@@ -51,12 +50,13 @@ namespace Eval::NNUE::Features {
} }
private: private:
T values_[MaxSize] = {}; T values_[MaxSize];
std::size_t size_ = 0; std::size_t size_ = 0;
}; };
//Type of feature index list //Type of feature index list
class IndexList : public ValueList<IndexType, RawFeatures::kMaxActiveDimensions> { class IndexList
: public ValueList<IndexType, RawFeatures::kMaxActiveDimensions> {
}; };
} // namespace Eval::NNUE::Features } // namespace Eval::NNUE::Features
+146 -63
View File
@@ -223,13 +223,13 @@ namespace Eval::NNUE::Layers {
return _mm512_add_epi32(_mm512_permutexvar_epi32(indices, x), bias); return _mm512_add_epi32(_mm512_permutexvar_epi32(indices, x), bias);
}; };
[[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) {
#if defined (USE_VNNI) #if defined (USE_VNNI)
[[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) {
acc = _mm512_dpbusd_epi32(acc, a, b); acc = _mm512_dpbusd_epi32(acc, a, b);
#else #else
[[maybe_unused]] auto m512_dpbusd_epi32 = [=](__m512i a, __m512i b) -> __m512i {
__m512i product0 = _mm512_maddubs_epi16(a, b); __m512i product0 = _mm512_maddubs_epi16(a, b);
product0 = _mm512_madd_epi16(product0, kOnes512); return _mm512_madd_epi16(product0, kOnes512);
acc = _mm512_add_epi32(acc, product0);
#endif #endif
}; };
@@ -256,14 +256,13 @@ namespace Eval::NNUE::Layers {
return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias); return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias);
}; };
[[maybe_unused]] auto m256_add_dpbusd_epi32 = [=](__m256i& acc, __m256i a, __m256i b) {
#if defined (USE_VNNI) #if defined (USE_VNNI)
[[maybe_unused]] auto m256_add_dpbusd_epi32 = [=](__m256i& acc, __m256i a, __m256i b) {
acc = _mm256_dpbusd_epi32(acc, a, b); acc = _mm256_dpbusd_epi32(acc, a, b);
#else #else
[[maybe_unused]] auto m256_dpbusd_epi32 = [=](__m256i a, __m256i b) -> __m256i {
__m256i product0 = _mm256_maddubs_epi16(a, b); __m256i product0 = _mm256_maddubs_epi16(a, b);
product0 = _mm256_madd_epi16(product0, kOnes256); return _mm256_madd_epi16(product0, kOnes256);
acc = _mm256_add_epi32(acc, product0);
#endif #endif
}; };
@@ -288,10 +287,9 @@ namespace Eval::NNUE::Layers {
return _mm_add_epi32(sum0, bias); return _mm_add_epi32(sum0, bias);
}; };
[[maybe_unused]] auto m128_add_dpbusd_epi32 = [=](__m128i& acc, __m128i a, __m128i b) { [[maybe_unused]] auto m128_dpbusd_epi32 = [=](__m128i a, __m128i b) -> __m128i {
__m128i product0 = _mm_maddubs_epi16(a, b); __m128i product0 = _mm_maddubs_epi16(a, b);
product0 = _mm_madd_epi16(product0, kOnes128); return _mm_madd_epi16(product0, kOnes128);
acc = _mm_add_epi32(acc, product0);
}; };
#endif #endif
@@ -335,15 +333,6 @@ namespace Eval::NNUE::Layers {
const __m512i bias = *reinterpret_cast<const __m512i*>(&biases_[i]); const __m512i bias = *reinterpret_cast<const __m512i*>(&biases_[i]);
__m512i* outptr = reinterpret_cast<__m512i*>(&output[i]); __m512i* outptr = reinterpret_cast<__m512i*>(&output[i]);
__m512i sum01a = _mm512_setzero_si512();
__m512i sum23a = _mm512_setzero_si512();
__m512i sum45a = _mm512_setzero_si512();
__m512i sum67a = _mm512_setzero_si512();
__m512i sum01b = _mm512_setzero_si512();
__m512i sum23b = _mm512_setzero_si512();
__m512i sum45b = _mm512_setzero_si512();
__m512i sum67b = _mm512_setzero_si512();
const auto row01a = *reinterpret_cast<const __m512i*>(&weights_[offset01a]); const auto row01a = *reinterpret_cast<const __m512i*>(&weights_[offset01a]);
const auto row23a = *reinterpret_cast<const __m512i*>(&weights_[offset23a]); const auto row23a = *reinterpret_cast<const __m512i*>(&weights_[offset23a]);
const auto row45a = *reinterpret_cast<const __m512i*>(&weights_[offset45a]); const auto row45a = *reinterpret_cast<const __m512i*>(&weights_[offset45a]);
@@ -356,6 +345,16 @@ namespace Eval::NNUE::Layers {
const __m256i in256 = input_vector256[0]; const __m256i in256 = input_vector256[0];
const __m512i in = _mm512_inserti64x4(_mm512_castsi256_si512(in256), in256, 1); const __m512i in = _mm512_inserti64x4(_mm512_castsi256_si512(in256), in256, 1);
#if defined (USE_VNNI)
__m512i sum01a = _mm512_setzero_si512();
__m512i sum23a = _mm512_setzero_si512();
__m512i sum45a = _mm512_setzero_si512();
__m512i sum67a = _mm512_setzero_si512();
__m512i sum01b = _mm512_setzero_si512();
__m512i sum23b = _mm512_setzero_si512();
__m512i sum45b = _mm512_setzero_si512();
__m512i sum67b = _mm512_setzero_si512();
m512_add_dpbusd_epi32(sum01a, in, row01a); m512_add_dpbusd_epi32(sum01a, in, row01a);
m512_add_dpbusd_epi32(sum23a, in, row23a); m512_add_dpbusd_epi32(sum23a, in, row23a);
m512_add_dpbusd_epi32(sum45a, in, row45a); m512_add_dpbusd_epi32(sum45a, in, row45a);
@@ -364,6 +363,16 @@ namespace Eval::NNUE::Layers {
m512_add_dpbusd_epi32(sum23b, in, row23b); m512_add_dpbusd_epi32(sum23b, in, row23b);
m512_add_dpbusd_epi32(sum45b, in, row45b); m512_add_dpbusd_epi32(sum45b, in, row45b);
m512_add_dpbusd_epi32(sum67b, in, row67b); m512_add_dpbusd_epi32(sum67b, in, row67b);
#else
__m512i sum01a = m512_dpbusd_epi32(in, row01a);
__m512i sum23a = m512_dpbusd_epi32(in, row23a);
__m512i sum45a = m512_dpbusd_epi32(in, row45a);
__m512i sum67a = m512_dpbusd_epi32(in, row67a);
__m512i sum01b = m512_dpbusd_epi32(in, row01b);
__m512i sum23b = m512_dpbusd_epi32(in, row23b);
__m512i sum45b = m512_dpbusd_epi32(in, row45b);
__m512i sum67b = m512_dpbusd_epi32(in, row67b);
#endif
*outptr = m512_hadd256x16( *outptr = m512_hadd256x16(
sum01a, sum23a, sum45a, sum67a, sum01a, sum23a, sum45a, sum67a,
@@ -384,48 +393,80 @@ namespace Eval::NNUE::Layers {
if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0) if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0)
{ {
__m512i sum0 = _mm512_setzero_si512();
__m512i sum1 = _mm512_setzero_si512();
__m512i sum2 = _mm512_setzero_si512();
__m512i sum3 = _mm512_setzero_si512();
const auto row0 = reinterpret_cast<const __m512i*>(&weights_[offset0]); const auto row0 = reinterpret_cast<const __m512i*>(&weights_[offset0]);
const auto row1 = reinterpret_cast<const __m512i*>(&weights_[offset1]); const auto row1 = reinterpret_cast<const __m512i*>(&weights_[offset1]);
const auto row2 = reinterpret_cast<const __m512i*>(&weights_[offset2]); const auto row2 = reinterpret_cast<const __m512i*>(&weights_[offset2]);
const auto row3 = reinterpret_cast<const __m512i*>(&weights_[offset3]); const auto row3 = reinterpret_cast<const __m512i*>(&weights_[offset3]);
for (IndexType j = 0; j < kNumChunks512; ++j) #if defined (USE_VNNI)
__m512i sum0 = _mm512_setzero_si512();
__m512i sum1 = _mm512_setzero_si512();
__m512i sum2 = _mm512_setzero_si512();
__m512i sum3 = _mm512_setzero_si512();
const IndexType kStart = 0;
#else
__m512i sum0 = m512_dpbusd_epi32(input_vector512[0], row0[0]);
__m512i sum1 = m512_dpbusd_epi32(input_vector512[0], row1[0]);
__m512i sum2 = m512_dpbusd_epi32(input_vector512[0], row2[0]);
__m512i sum3 = m512_dpbusd_epi32(input_vector512[0], row3[0]);
const IndexType kStart = 1;
#endif
for (IndexType j = kStart; j < kNumChunks512; ++j)
{ {
const __m512i in = input_vector512[j]; const __m512i in = input_vector512[j];
#if defined (USE_VNNI)
m512_add_dpbusd_epi32(sum0, in, row0[j]); m512_add_dpbusd_epi32(sum0, in, row0[j]);
m512_add_dpbusd_epi32(sum1, in, row1[j]); m512_add_dpbusd_epi32(sum1, in, row1[j]);
m512_add_dpbusd_epi32(sum2, in, row2[j]); m512_add_dpbusd_epi32(sum2, in, row2[j]);
m512_add_dpbusd_epi32(sum3, in, row3[j]); m512_add_dpbusd_epi32(sum3, in, row3[j]);
#else
sum0 = _mm512_add_epi32(sum0, m512_dpbusd_epi32(in, row0[j]));
sum1 = _mm512_add_epi32(sum1, m512_dpbusd_epi32(in, row1[j]));
sum2 = _mm512_add_epi32(sum2, m512_dpbusd_epi32(in, row2[j]));
sum3 = _mm512_add_epi32(sum3, m512_dpbusd_epi32(in, row3[j]));
#endif
} }
*outptr = m512_haddx4(sum0, sum1, sum2, sum3, bias); *outptr = m512_haddx4(sum0, sum1, sum2, sum3, bias);
} }
else else
{ {
__m256i sum0 = _mm256_setzero_si256();
__m256i sum1 = _mm256_setzero_si256();
__m256i sum2 = _mm256_setzero_si256();
__m256i sum3 = _mm256_setzero_si256();
const auto row0 = reinterpret_cast<const __m256i*>(&weights_[offset0]); const auto row0 = reinterpret_cast<const __m256i*>(&weights_[offset0]);
const auto row1 = reinterpret_cast<const __m256i*>(&weights_[offset1]); const auto row1 = reinterpret_cast<const __m256i*>(&weights_[offset1]);
const auto row2 = reinterpret_cast<const __m256i*>(&weights_[offset2]); const auto row2 = reinterpret_cast<const __m256i*>(&weights_[offset2]);
const auto row3 = reinterpret_cast<const __m256i*>(&weights_[offset3]); const auto row3 = reinterpret_cast<const __m256i*>(&weights_[offset3]);
for (IndexType j = 0; j < kNumChunks256; ++j) #if defined (USE_VNNI)
__m256i sum0 = _mm256_setzero_si256();
__m256i sum1 = _mm256_setzero_si256();
__m256i sum2 = _mm256_setzero_si256();
__m256i sum3 = _mm256_setzero_si256();
const IndexType kStart = 0;
#else
__m256i sum0 = m256_dpbusd_epi32(input_vector256[0], row0[0]);
__m256i sum1 = m256_dpbusd_epi32(input_vector256[0], row1[0]);
__m256i sum2 = m256_dpbusd_epi32(input_vector256[0], row2[0]);
__m256i sum3 = m256_dpbusd_epi32(input_vector256[0], row3[0]);
const IndexType kStart = 1;
#endif
for (IndexType j = kStart; j < kNumChunks256; ++j)
{ {
const __m256i in = input_vector256[j]; const __m256i in = input_vector256[j];
#if defined (USE_VNNI)
m256_add_dpbusd_epi32(sum0, in, row0[j]); m256_add_dpbusd_epi32(sum0, in, row0[j]);
m256_add_dpbusd_epi32(sum1, in, row1[j]); m256_add_dpbusd_epi32(sum1, in, row1[j]);
m256_add_dpbusd_epi32(sum2, in, row2[j]); m256_add_dpbusd_epi32(sum2, in, row2[j]);
m256_add_dpbusd_epi32(sum3, in, row3[j]); m256_add_dpbusd_epi32(sum3, in, row3[j]);
#else
sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
sum1 = _mm256_add_epi32(sum1, m256_dpbusd_epi32(in, row1[j]));
sum2 = _mm256_add_epi32(sum2, m256_dpbusd_epi32(in, row2[j]));
sum3 = _mm256_add_epi32(sum3, m256_dpbusd_epi32(in, row3[j]));
#endif
} }
*outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias); *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias);
@@ -436,30 +477,50 @@ namespace Eval::NNUE::Layers {
{ {
if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0) if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0)
{ {
__m512i sum0 = _mm512_setzero_si512();
const auto row0 = reinterpret_cast<const __m512i*>(&weights_[0]); const auto row0 = reinterpret_cast<const __m512i*>(&weights_[0]);
for (IndexType j = 0; j < kNumChunks512; ++j) #if defined (USE_VNNI)
__m512i sum0 = _mm512_setzero_si512();
const IndexType kStart = 0;
#else
__m512i sum0 = m512_dpbusd_epi32(input_vector512[0], row0[0]);
const IndexType kStart = 1;
#endif
for (IndexType j = kStart; j < kNumChunks512; ++j)
{ {
const __m512i in = input_vector512[j]; const __m512i in = input_vector512[j];
#if defined (USE_VNNI)
m512_add_dpbusd_epi32(sum0, in, row0[j]); m512_add_dpbusd_epi32(sum0, in, row0[j]);
#else
sum0 = _mm512_add_epi32(sum0, m512_dpbusd_epi32(in, row0[j]));
#endif
} }
output[0] = m512_hadd(sum0, biases_[0]); output[0] = m512_hadd(sum0, biases_[0]);
} }
else else
{ {
__m256i sum0 = _mm256_setzero_si256();
const auto row0 = reinterpret_cast<const __m256i*>(&weights_[0]); const auto row0 = reinterpret_cast<const __m256i*>(&weights_[0]);
for (IndexType j = 0; j < kNumChunks256; ++j) #if defined (USE_VNNI)
__m256i sum0 = _mm256_setzero_si256();
const IndexType kStart = 0;
#else
__m256i sum0 = m256_dpbusd_epi32(input_vector256[0], row0[0]);
const IndexType kStart = 1;
#endif
for (IndexType j = kStart; j < kNumChunks256; ++j)
{ {
const __m256i in = input_vector256[j]; const __m256i in = input_vector256[j];
#if defined (USE_VNNI)
m256_add_dpbusd_epi32(sum0, in, row0[j]); m256_add_dpbusd_epi32(sum0, in, row0[j]);
#else
sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
#endif
} }
output[0] = m256_hadd(sum0, biases_[0]); output[0] = m256_hadd(sum0, biases_[0]);
@@ -493,24 +554,40 @@ namespace Eval::NNUE::Layers {
const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]); const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]);
__m128i* outptr = reinterpret_cast<__m128i*>(&output[i]); __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]);
__m256i sum0 = _mm256_setzero_si256();
__m256i sum1 = _mm256_setzero_si256();
__m256i sum2 = _mm256_setzero_si256();
__m256i sum3 = _mm256_setzero_si256();
const auto row0 = reinterpret_cast<const __m256i*>(&weights_[offset0]); const auto row0 = reinterpret_cast<const __m256i*>(&weights_[offset0]);
const auto row1 = reinterpret_cast<const __m256i*>(&weights_[offset1]); const auto row1 = reinterpret_cast<const __m256i*>(&weights_[offset1]);
const auto row2 = reinterpret_cast<const __m256i*>(&weights_[offset2]); const auto row2 = reinterpret_cast<const __m256i*>(&weights_[offset2]);
const auto row3 = reinterpret_cast<const __m256i*>(&weights_[offset3]); const auto row3 = reinterpret_cast<const __m256i*>(&weights_[offset3]);
for (IndexType j = 0; j < kNumChunks; ++j) #if defined (USE_VNNI)
__m256i sum0 = _mm256_setzero_si256();
__m256i sum1 = _mm256_setzero_si256();
__m256i sum2 = _mm256_setzero_si256();
__m256i sum3 = _mm256_setzero_si256();
const IndexType kStart = 0;
#else
__m256i sum0 = m256_dpbusd_epi32(input_vector[0], row0[0]);
__m256i sum1 = m256_dpbusd_epi32(input_vector[0], row1[0]);
__m256i sum2 = m256_dpbusd_epi32(input_vector[0], row2[0]);
__m256i sum3 = m256_dpbusd_epi32(input_vector[0], row3[0]);
const IndexType kStart = 1;
#endif
for (IndexType j = kStart; j < kNumChunks; ++j)
{ {
const __m256i in = input_vector[j]; const __m256i in = input_vector[j];
#if defined (USE_VNNI)
m256_add_dpbusd_epi32(sum0, in, row0[j]); m256_add_dpbusd_epi32(sum0, in, row0[j]);
m256_add_dpbusd_epi32(sum1, in, row1[j]); m256_add_dpbusd_epi32(sum1, in, row1[j]);
m256_add_dpbusd_epi32(sum2, in, row2[j]); m256_add_dpbusd_epi32(sum2, in, row2[j]);
m256_add_dpbusd_epi32(sum3, in, row3[j]); m256_add_dpbusd_epi32(sum3, in, row3[j]);
#else
sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
sum1 = _mm256_add_epi32(sum1, m256_dpbusd_epi32(in, row1[j]));
sum2 = _mm256_add_epi32(sum2, m256_dpbusd_epi32(in, row2[j]));
sum3 = _mm256_add_epi32(sum3, m256_dpbusd_epi32(in, row3[j]));
#endif
} }
*outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias); *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias);
@@ -518,15 +595,25 @@ namespace Eval::NNUE::Layers {
} }
else if constexpr (kOutputDimensions == 1) else if constexpr (kOutputDimensions == 1)
{ {
__m256i sum0 = _mm256_setzero_si256();
const auto row0 = reinterpret_cast<const __m256i*>(&weights_[0]); const auto row0 = reinterpret_cast<const __m256i*>(&weights_[0]);
for (IndexType j = 0; j < kNumChunks; ++j) #if defined (USE_VNNI)
__m256i sum0 = _mm256_setzero_si256();
const IndexType kStart = 0;
#else
__m256i sum0 = m256_dpbusd_epi32(input_vector[0], row0[0]);
const IndexType kStart = 1;
#endif
for (IndexType j = kStart; j < kNumChunks; ++j)
{ {
const __m256i in = input_vector[j]; const __m256i in = input_vector[j];
#if defined (USE_VNNI)
m256_add_dpbusd_epi32(sum0, in, row0[j]); m256_add_dpbusd_epi32(sum0, in, row0[j]);
#else
sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
#endif
} }
output[0] = m256_hadd(sum0, biases_[0]); output[0] = m256_hadd(sum0, biases_[0]);
@@ -559,24 +646,24 @@ namespace Eval::NNUE::Layers {
const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]); const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]);
__m128i* outptr = reinterpret_cast<__m128i*>(&output[i]); __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]);
__m128i sum0 = _mm_setzero_si128();
__m128i sum1 = _mm_setzero_si128();
__m128i sum2 = _mm_setzero_si128();
__m128i sum3 = _mm_setzero_si128();
const auto row0 = reinterpret_cast<const __m128i*>(&weights_[offset0]); const auto row0 = reinterpret_cast<const __m128i*>(&weights_[offset0]);
const auto row1 = reinterpret_cast<const __m128i*>(&weights_[offset1]); const auto row1 = reinterpret_cast<const __m128i*>(&weights_[offset1]);
const auto row2 = reinterpret_cast<const __m128i*>(&weights_[offset2]); const auto row2 = reinterpret_cast<const __m128i*>(&weights_[offset2]);
const auto row3 = reinterpret_cast<const __m128i*>(&weights_[offset3]); const auto row3 = reinterpret_cast<const __m128i*>(&weights_[offset3]);
for (int j = 0; j < (int)kNumChunks; j += 1) __m128i sum0 = m128_dpbusd_epi32(input_vector[0], row0[0]);
__m128i sum1 = m128_dpbusd_epi32(input_vector[0], row1[0]);
__m128i sum2 = m128_dpbusd_epi32(input_vector[0], row2[0]);
__m128i sum3 = m128_dpbusd_epi32(input_vector[0], row3[0]);
for (int j = 1; j < (int)kNumChunks; ++j)
{ {
const __m128i in = input_vector[j]; const __m128i in = input_vector[j];
m128_add_dpbusd_epi32(sum0, in, row0[j]); sum0 = _mm_add_epi32(sum0, m128_dpbusd_epi32(in, row0[j]));
m128_add_dpbusd_epi32(sum1, in, row1[j]); sum1 = _mm_add_epi32(sum1, m128_dpbusd_epi32(in, row1[j]));
m128_add_dpbusd_epi32(sum2, in, row2[j]); sum2 = _mm_add_epi32(sum2, m128_dpbusd_epi32(in, row2[j]));
m128_add_dpbusd_epi32(sum3, in, row3[j]); sum3 = _mm_add_epi32(sum3, m128_dpbusd_epi32(in, row3[j]));
} }
*outptr = m128_haddx4(sum0, sum1, sum2, sum3, bias); *outptr = m128_haddx4(sum0, sum1, sum2, sum3, bias);
@@ -584,16 +671,12 @@ namespace Eval::NNUE::Layers {
} }
else if constexpr (kOutputDimensions == 1) else if constexpr (kOutputDimensions == 1)
{ {
__m128i sum0 = _mm_setzero_si128();
const auto row0 = reinterpret_cast<const __m128i*>(&weights_[0]); const auto row0 = reinterpret_cast<const __m128i*>(&weights_[0]);
for (int j = 0; j < (int)kNumChunks; j += 1) __m128i sum0 = m128_dpbusd_epi32(input_vector[0], row0[0]);
{
const __m128i in = input_vector[j];
m128_add_dpbusd_epi32(sum0, in, row0[j]); for (int j = 1; j < (int)kNumChunks; ++j)
} sum0 = _mm_add_epi32(sum0, m128_dpbusd_epi32(input_vector[j], row0[j]));
output[0] = m128_hadd(sum0, biases_[0]); output[0] = m128_hadd(sum0, biases_[0]);
} }
+2 -1
View File
@@ -16,12 +16,13 @@
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
// Class for difference calculation of NNUE evaluation function
#ifndef NNUE_ACCUMULATOR_H_INCLUDED #ifndef NNUE_ACCUMULATOR_H_INCLUDED
#define NNUE_ACCUMULATOR_H_INCLUDED #define NNUE_ACCUMULATOR_H_INCLUDED
#include "nnue_architecture.h" #include "nnue_architecture.h"
// Class for difference calculation of NNUE evaluation function
namespace Eval::NNUE { namespace Eval::NNUE {
// Class that holds the result of affine transformation of input features // Class that holds the result of affine transformation of input features
+2 -1
View File
@@ -16,13 +16,14 @@
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
// Input features and network structure used in NNUE evaluation function
#ifndef NNUE_ARCHITECTURE_H_INCLUDED #ifndef NNUE_ARCHITECTURE_H_INCLUDED
#define NNUE_ARCHITECTURE_H_INCLUDED #define NNUE_ARCHITECTURE_H_INCLUDED
// Defines the network structure // Defines the network structure
#include "architectures/halfkp_256x2-32-32.h" #include "architectures/halfkp_256x2-32-32.h"
// Input features and network structure used in NNUE evaluation function
namespace Eval::NNUE { namespace Eval::NNUE {
static_assert(kTransformedFeatureDimensions % kMaxSimdWidth == 0, ""); static_assert(kTransformedFeatureDimensions % kMaxSimdWidth == 0, "");
+99 -70
View File
@@ -23,7 +23,6 @@
#include "nnue_common.h" #include "nnue_common.h"
#include "nnue_architecture.h" #include "nnue_architecture.h"
#include "features/index_list.h" #include "features/index_list.h"
#include <cstring> #include <cstring>
@@ -34,57 +33,57 @@ namespace Eval::NNUE {
// If vector instructions are enabled, we update and refresh the // If vector instructions are enabled, we update and refresh the
// accumulator tile by tile such that each tile fits in the CPU's // accumulator tile by tile such that each tile fits in the CPU's
// vector registers. // vector registers.
#define TILING #define VECTOR
#ifdef USE_AVX512 #ifdef USE_AVX512
typedef __m512i vec_t; typedef __m512i vec_t;
#define vec_load(a) _mm512_load_si512(a) #define vec_load(a) _mm512_load_si512(a)
#define vec_store(a,b) _mm512_store_si512(a,b) #define vec_store(a,b) _mm512_store_si512(a,b)
#define vec_add_16(a,b) _mm512_add_epi16(a,b) #define vec_add_16(a,b) _mm512_add_epi16(a,b)
#define vec_sub_16(a,b) _mm512_sub_epi16(a,b) #define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
#define vec_zero _mm512_setzero_si512() #define vec_zero _mm512_setzero_si512()
static constexpr IndexType kNumRegs = 8; // only 8 are needed static constexpr IndexType kNumRegs = 8; // only 8 are needed
#elif USE_AVX2 #elif USE_AVX2
typedef __m256i vec_t; typedef __m256i vec_t;
#define vec_load(a) _mm256_load_si256(a) #define vec_load(a) _mm256_load_si256(a)
#define vec_store(a,b) _mm256_store_si256(a,b) #define vec_store(a,b) _mm256_store_si256(a,b)
#define vec_add_16(a,b) _mm256_add_epi16(a,b) #define vec_add_16(a,b) _mm256_add_epi16(a,b)
#define vec_sub_16(a,b) _mm256_sub_epi16(a,b) #define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
#define vec_zero _mm256_setzero_si256() #define vec_zero _mm256_setzero_si256()
static constexpr IndexType kNumRegs = 16; static constexpr IndexType kNumRegs = 16;
#elif USE_SSE2 #elif USE_SSE2
typedef __m128i vec_t; typedef __m128i vec_t;
#define vec_load(a) (*(a)) #define vec_load(a) (*(a))
#define vec_store(a,b) *(a)=(b) #define vec_store(a,b) *(a)=(b)
#define vec_add_16(a,b) _mm_add_epi16(a,b) #define vec_add_16(a,b) _mm_add_epi16(a,b)
#define vec_sub_16(a,b) _mm_sub_epi16(a,b) #define vec_sub_16(a,b) _mm_sub_epi16(a,b)
#define vec_zero _mm_setzero_si128() #define vec_zero _mm_setzero_si128()
static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8; static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8;
#elif USE_MMX #elif USE_MMX
typedef __m64 vec_t; typedef __m64 vec_t;
#define vec_load(a) (*(a)) #define vec_load(a) (*(a))
#define vec_store(a,b) *(a)=(b) #define vec_store(a,b) *(a)=(b)
#define vec_add_16(a,b) _mm_add_pi16(a,b) #define vec_add_16(a,b) _mm_add_pi16(a,b)
#define vec_sub_16(a,b) _mm_sub_pi16(a,b) #define vec_sub_16(a,b) _mm_sub_pi16(a,b)
#define vec_zero _mm_setzero_si64() #define vec_zero _mm_setzero_si64()
static constexpr IndexType kNumRegs = 8; static constexpr IndexType kNumRegs = 8;
#elif USE_NEON #elif USE_NEON
typedef int16x8_t vec_t; typedef int16x8_t vec_t;
#define vec_load(a) (*(a)) #define vec_load(a) (*(a))
#define vec_store(a,b) *(a)=(b) #define vec_store(a,b) *(a)=(b)
#define vec_add_16(a,b) vaddq_s16(a,b) #define vec_add_16(a,b) vaddq_s16(a,b)
#define vec_sub_16(a,b) vsubq_s16(a,b) #define vec_sub_16(a,b) vsubq_s16(a,b)
#define vec_zero {0} #define vec_zero {0}
static constexpr IndexType kNumRegs = 16; static constexpr IndexType kNumRegs = 16;
#else #else
#undef TILING #undef VECTOR
#endif #endif
// Input feature converter // Input feature converter
class FeatureTransformer { class FeatureTransformer {
@@ -93,10 +92,10 @@ namespace Eval::NNUE {
// Number of output dimensions for one side // Number of output dimensions for one side
static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions; static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;
#ifdef TILING #ifdef VECTOR
static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2; static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2;
static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions"); static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions");
#endif #endif
public: public:
// Output type // Output type
@@ -142,10 +141,8 @@ namespace Eval::NNUE {
for (std::size_t i = 0; i < kHalfDimensions; ++i) for (std::size_t i = 0; i < kHalfDimensions; ++i)
biases_[i] = read_little_endian<BiasType>(stream); biases_[i] = read_little_endian<BiasType>(stream);
for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i) for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i)
weights_[i] = read_little_endian<WeightType>(stream); weights_[i] = read_little_endian<WeightType>(stream);
return !stream.fail(); return !stream.fail();
} }
@@ -184,34 +181,58 @@ namespace Eval::NNUE {
const auto& accumulation = pos.state()->accumulator.accumulation; const auto& accumulation = pos.state()->accumulator.accumulation;
#if defined(USE_AVX2) #if defined(USE_AVX512)
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth * 2);
static_assert(kHalfDimensions % (kSimdWidth * 2) == 0);
const __m512i kControl = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
const __m512i kZero = _mm512_setzero_si512();
#elif defined(USE_AVX2)
constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth; constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
constexpr int kControl = 0b11011000; constexpr int kControl = 0b11011000;
const __m256i kZero = _mm256_setzero_si256(); const __m256i kZero = _mm256_setzero_si256();
#elif defined(USE_SSE2) #elif defined(USE_SSE2)
constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth; constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
#ifdef USE_SSE41 #ifdef USE_SSE41
const __m128i kZero = _mm_setzero_si128(); const __m128i kZero = _mm_setzero_si128();
#else #else
const __m128i k0x80s = _mm_set1_epi8(-128); const __m128i k0x80s = _mm_set1_epi8(-128);
#endif #endif
#elif defined(USE_MMX) #elif defined(USE_MMX)
constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth; constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
const __m64 k0x80s = _mm_set1_pi8(-128); const __m64 k0x80s = _mm_set1_pi8(-128);
#elif defined(USE_NEON) #elif defined(USE_NEON)
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
const int8x8_t kZero = {0}; const int8x8_t kZero = {0};
#endif #endif
const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()}; const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
for (IndexType p = 0; p < 2; ++p) { for (IndexType p = 0; p < 2; ++p) {
const IndexType offset = kHalfDimensions * p; const IndexType offset = kHalfDimensions * p;
#if defined(USE_AVX2) #if defined(USE_AVX512)
auto out = reinterpret_cast<__m512i*>(&output[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) {
__m512i sum0 = _mm512_load_si512(
&reinterpret_cast<const __m512i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
__m512i sum1 = _mm512_load_si512(
&reinterpret_cast<const __m512i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
sum0 = _mm512_add_epi16(sum0, reinterpret_cast<const __m512i*>(
accumulation[perspectives[p]][i])[j * 2 + 0]);
sum1 = _mm512_add_epi16(sum1, reinterpret_cast<const __m512i*>(
accumulation[perspectives[p]][i])[j * 2 + 1]);
}
_mm512_store_si512(&out[j], _mm512_permutexvar_epi64(kControl,
_mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), kZero)));
}
#elif defined(USE_AVX2)
auto out = reinterpret_cast<__m256i*>(&output[offset]); auto out = reinterpret_cast<__m256i*>(&output[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) { for (IndexType j = 0; j < kNumChunks; ++j) {
__m256i sum0 = _mm256_load_si256( __m256i sum0 = _mm256_load_si256(
@@ -229,7 +250,7 @@ namespace Eval::NNUE {
_mm256_packs_epi16(sum0, sum1), kZero), kControl)); _mm256_packs_epi16(sum0, sum1), kZero), kControl));
} }
#elif defined(USE_SSE2) #elif defined(USE_SSE2)
auto out = reinterpret_cast<__m128i*>(&output[offset]); auto out = reinterpret_cast<__m128i*>(&output[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) { for (IndexType j = 0; j < kNumChunks; ++j) {
__m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>( __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
@@ -247,16 +268,16 @@ namespace Eval::NNUE {
_mm_store_si128(&out[j], _mm_store_si128(&out[j],
#ifdef USE_SSE41 #ifdef USE_SSE41
_mm_max_epi8(packedbytes, kZero) _mm_max_epi8(packedbytes, kZero)
#else #else
_mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s) _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
#endif #endif
); );
} }
#elif defined(USE_MMX) #elif defined(USE_MMX)
auto out = reinterpret_cast<__m64*>(&output[offset]); auto out = reinterpret_cast<__m64*>(&output[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) { for (IndexType j = 0; j < kNumChunks; ++j) {
__m64 sum0 = *(&reinterpret_cast<const __m64*>( __m64 sum0 = *(&reinterpret_cast<const __m64*>(
@@ -274,7 +295,7 @@ namespace Eval::NNUE {
out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s); out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
} }
#elif defined(USE_NEON) #elif defined(USE_NEON)
const auto out = reinterpret_cast<int8x8_t*>(&output[offset]); const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) { for (IndexType j = 0; j < kNumChunks; ++j) {
int16x8_t sum = reinterpret_cast<const int16x8_t*>( int16x8_t sum = reinterpret_cast<const int16x8_t*>(
@@ -288,7 +309,7 @@ namespace Eval::NNUE {
out[j] = vmax_s8(vqmovn_s16(sum), kZero); out[j] = vmax_s8(vqmovn_s16(sum), kZero);
} }
#else #else
for (IndexType j = 0; j < kHalfDimensions; ++j) { for (IndexType j = 0; j < kHalfDimensions; ++j) {
BiasType sum = accumulation[static_cast<int>(perspectives[p])][0][j]; BiasType sum = accumulation[static_cast<int>(perspectives[p])][0][j];
for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) { for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
@@ -298,37 +319,41 @@ namespace Eval::NNUE {
output[offset + j] = static_cast<OutputType>( output[offset + j] = static_cast<OutputType>(
std::max<int>(0, std::min<int>(127, sum))); std::max<int>(0, std::min<int>(127, sum)));
} }
#endif #endif
} }
#if defined(USE_MMX) #if defined(USE_MMX)
_mm_empty(); _mm_empty();
#endif #endif
} }
private: private:
// Calculate cumulative value without using difference calculation // Calculate cumulative value without using difference calculation
void refresh_accumulator(const Position& pos) const { void refresh_accumulator(const Position& pos) const {
#ifdef VECTOR
// Gcc-10.2 unnecessarily spills AVX2 registers if this array
// is defined in the VECTOR code below, once in each branch
vec_t acc[kNumRegs];
#endif
auto& accumulator = pos.state()->accumulator; auto& accumulator = pos.state()->accumulator;
for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) { for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
Features::IndexList active_indices[2]; Features::IndexList active_indices[2];
RawFeatures::append_active_indices(pos, kRefreshTriggers[i], RawFeatures::append_active_indices(pos, kRefreshTriggers[i],
active_indices); active_indices);
for (Color perspective : { WHITE, BLACK }) { for (Color perspective : { WHITE, BLACK }) {
#ifdef TILING #ifdef VECTOR
for (unsigned j = 0; j < kHalfDimensions / kTileHeight; ++j) { for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
auto accTile = reinterpret_cast<vec_t*>( auto accTile = reinterpret_cast<vec_t*>(
&accumulator.accumulation[perspective][i][j * kTileHeight]); &accumulator.accumulation[perspective][i][j * kTileHeight]);
vec_t acc[kNumRegs];
if (i == 0) { if (i == 0) {
auto biasesTile = reinterpret_cast<const vec_t*>( auto biasesTile = reinterpret_cast<const vec_t*>(
&biases_[j * kTileHeight]); &biases_[j * kTileHeight]);
for (unsigned k = 0; k < kNumRegs; ++k) for (IndexType k = 0; k < kNumRegs; ++k)
acc[k] = biasesTile[k]; acc[k] = biasesTile[k];
} else { } else {
for (unsigned k = 0; k < kNumRegs; ++k) for (IndexType k = 0; k < kNumRegs; ++k)
acc[k] = vec_zero; acc[k] = vec_zero;
} }
@@ -336,11 +361,11 @@ namespace Eval::NNUE {
const IndexType offset = kHalfDimensions * index + j * kTileHeight; const IndexType offset = kHalfDimensions * index + j * kTileHeight;
auto column = reinterpret_cast<const vec_t*>(&weights_[offset]); auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
for (unsigned k = 0; k < kNumRegs; ++k) for (IndexType k = 0; k < kNumRegs; ++k)
acc[k] = vec_add_16(acc[k], column[k]); acc[k] = vec_add_16(acc[k], column[k]);
} }
for (unsigned k = 0; k < kNumRegs; k++) for (IndexType k = 0; k < kNumRegs; k++)
vec_store(&accTile[k], acc[k]); vec_store(&accTile[k], acc[k]);
} }
#else #else
@@ -373,6 +398,11 @@ namespace Eval::NNUE {
// Calculate cumulative value using difference calculation // Calculate cumulative value using difference calculation
void update_accumulator(const Position& pos) const { void update_accumulator(const Position& pos) const {
#ifdef VECTOR
// Gcc-10.2 unnecessarily spills AVX2 registers if this array
// is defined in the VECTOR code below, once in each branch
vec_t acc[kNumRegs];
#endif
const auto& prev_accumulator = pos.state()->previous->accumulator; const auto& prev_accumulator = pos.state()->previous->accumulator;
auto& accumulator = pos.state()->accumulator; auto& accumulator = pos.state()->accumulator;
for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) { for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
@@ -381,21 +411,20 @@ namespace Eval::NNUE {
RawFeatures::append_changed_indices(pos, kRefreshTriggers[i], RawFeatures::append_changed_indices(pos, kRefreshTriggers[i],
removed_indices, added_indices, reset); removed_indices, added_indices, reset);
#ifdef TILING #ifdef VECTOR
for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) { for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
for (Color perspective : { WHITE, BLACK }) { for (Color perspective : { WHITE, BLACK }) {
auto accTile = reinterpret_cast<vec_t*>( auto accTile = reinterpret_cast<vec_t*>(
&accumulator.accumulation[perspective][i][j * kTileHeight]); &accumulator.accumulation[perspective][i][j * kTileHeight]);
vec_t acc[kNumRegs];
if (reset[perspective]) { if (reset[perspective]) {
if (i == 0) { if (i == 0) {
auto biasesTile = reinterpret_cast<const vec_t*>( auto biasesTile = reinterpret_cast<const vec_t*>(
&biases_[j * kTileHeight]); &biases_[j * kTileHeight]);
for (unsigned k = 0; k < kNumRegs; ++k) for (IndexType k = 0; k < kNumRegs; ++k)
acc[k] = biasesTile[k]; acc[k] = biasesTile[k];
} else { } else {
for (unsigned k = 0; k < kNumRegs; ++k) for (IndexType k = 0; k < kNumRegs; ++k)
acc[k] = vec_zero; acc[k] = vec_zero;
} }
} else { } else {
@@ -483,4 +512,4 @@ namespace Eval::NNUE {
} // namespace Eval::NNUE } // namespace Eval::NNUE
#endif //#ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED #endif // #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
+2 -2
View File
@@ -176,8 +176,8 @@ namespace {
score -= Doubled * doubled score -= Doubled * doubled
+ WeakLever * more_than_one(lever); + WeakLever * more_than_one(lever);
if (blocked && r > RANK_4) if (blocked && r >= RANK_5)
score += BlockedPawn[r-4]; score += BlockedPawn[r - RANK_5];
} }
return score; return score;
+18 -18
View File
@@ -59,7 +59,7 @@ namespace {
// Razor and futility margins // Razor and futility margins
constexpr int RazorMargin = 510; constexpr int RazorMargin = 510;
Value futility_margin(Depth d, bool improving) { Value futility_margin(Depth d, bool improving) {
return Value(223 * (d - improving)); return Value(234 * (d - improving));
} }
// Reductions lookup table, initialized at startup // Reductions lookup table, initialized at startup
@@ -67,7 +67,7 @@ namespace {
Depth reduction(bool i, Depth d, int mn) { Depth reduction(bool i, Depth d, int mn) {
int r = Reductions[d] * Reductions[mn]; int r = Reductions[d] * Reductions[mn];
return (r + 509) / 1024 + (!i && r > 894); return (r + 503) / 1024 + (!i && r > 915);
} }
constexpr int futility_move_count(bool improving, Depth depth) { constexpr int futility_move_count(bool improving, Depth depth) {
@@ -188,7 +188,7 @@ namespace {
void Search::init() { void Search::init() {
for (int i = 1; i < MAX_MOVES; ++i) for (int i = 1; i < MAX_MOVES; ++i)
Reductions[i] = int((22.0 + 2 * std::log(Threads.size())) * std::log(i + 0.25 * std::log(i))); Reductions[i] = int((21.3 + 2 * std::log(Threads.size())) * std::log(i + 0.25 * std::log(i)));
} }
@@ -404,7 +404,7 @@ void Thread::search() {
beta = std::min(prev + delta, VALUE_INFINITE); beta = std::min(prev + delta, VALUE_INFINITE);
// Adjust contempt based on root move's previousScore (dynamic contempt) // Adjust contempt based on root move's previousScore (dynamic contempt)
int dct = ct + (105 - ct / 2) * prev / (abs(prev) + 149); int dct = ct + (113 - ct / 2) * prev / (abs(prev) + 147);
contempt = (us == WHITE ? make_score(dct, dct / 2) contempt = (us == WHITE ? make_score(dct, dct / 2)
: -make_score(dct, dct / 2)); : -make_score(dct, dct / 2));
@@ -824,7 +824,7 @@ namespace {
&& (ss-1)->statScore < 22977 && (ss-1)->statScore < 22977
&& eval >= beta && eval >= beta
&& eval >= ss->staticEval && eval >= ss->staticEval
&& ss->staticEval >= beta - 30 * depth - 28 * improving + 84 * ss->ttPv + 182 && ss->staticEval >= beta - 30 * depth - 28 * improving + 84 * ss->ttPv + 168
&& !excludedMove && !excludedMove
&& pos.non_pawn_material(us) && pos.non_pawn_material(us)
&& (ss->ply >= thisThread->nmpMinPly || us != thisThread->nmpColor)) && (ss->ply >= thisThread->nmpMinPly || us != thisThread->nmpColor))
@@ -832,7 +832,7 @@ namespace {
assert(eval - beta >= 0); assert(eval - beta >= 0);
// Null move dynamic reduction based on depth and value // Null move dynamic reduction based on depth and value
Depth R = (982 + 85 * depth) / 256 + std::min(int(eval - beta) / 192, 3); Depth R = (1015 + 85 * depth) / 256 + std::min(int(eval - beta) / 191, 3);
ss->currentMove = MOVE_NULL; ss->currentMove = MOVE_NULL;
ss->continuationHistory = &thisThread->continuationHistory[0][0][NO_PIECE][0]; ss->continuationHistory = &thisThread->continuationHistory[0][0][NO_PIECE][0];
@@ -849,7 +849,7 @@ namespace {
if (nullValue >= VALUE_TB_WIN_IN_MAX_PLY) if (nullValue >= VALUE_TB_WIN_IN_MAX_PLY)
nullValue = beta; nullValue = beta;
if (thisThread->nmpMinPly || (abs(beta) < VALUE_KNOWN_WIN && depth < 13)) if (thisThread->nmpMinPly || (abs(beta) < VALUE_KNOWN_WIN && depth < 14))
return nullValue; return nullValue;
assert(!thisThread->nmpMinPly); // Recursive verification is not allowed assert(!thisThread->nmpMinPly); // Recursive verification is not allowed
@@ -868,7 +868,7 @@ namespace {
} }
} }
probCutBeta = beta + 176 - 49 * improving; probCutBeta = beta + 183 - 49 * improving;
// Step 10. ProbCut (~10 Elo) // Step 10. ProbCut (~10 Elo)
// If we have a good enough capture and a reduced search returns a value // If we have a good enough capture and a reduced search returns a value
@@ -1036,7 +1036,7 @@ moves_loop: // When in check, search starts from here
// Futility pruning: parent node (~5 Elo) // Futility pruning: parent node (~5 Elo)
if ( lmrDepth < 7 if ( lmrDepth < 7
&& !ss->inCheck && !ss->inCheck
&& ss->staticEval + 283 + 170 * lmrDepth <= alpha && ss->staticEval + 266 + 170 * lmrDepth <= alpha
&& (*contHist[0])[movedPiece][to_sq(move)] && (*contHist[0])[movedPiece][to_sq(move)]
+ (*contHist[1])[movedPiece][to_sq(move)] + (*contHist[1])[movedPiece][to_sq(move)]
+ (*contHist[3])[movedPiece][to_sq(move)] + (*contHist[3])[movedPiece][to_sq(move)]
@@ -1044,7 +1044,7 @@ moves_loop: // When in check, search starts from here
continue; continue;
// Prune moves with negative SEE (~20 Elo) // Prune moves with negative SEE (~20 Elo)
if (!pos.see_ge(move, Value(-(29 - std::min(lmrDepth, 18)) * lmrDepth * lmrDepth))) if (!pos.see_ge(move, Value(-(30 - std::min(lmrDepth, 18)) * lmrDepth * lmrDepth)))
continue; continue;
} }
else else
@@ -1055,8 +1055,8 @@ moves_loop: // When in check, search starts from here
&& captureHistory[movedPiece][to_sq(move)][type_of(pos.piece_on(to_sq(move)))] < 0) && captureHistory[movedPiece][to_sq(move)][type_of(pos.piece_on(to_sq(move)))] < 0)
continue; continue;
// See based pruning // SEE based pruning
if (!pos.see_ge(move, Value(-221) * depth)) // (~25 Elo) if (!pos.see_ge(move, Value(-213) * depth)) // (~25 Elo)
continue; continue;
} }
} }
@@ -1150,12 +1150,12 @@ moves_loop: // When in check, search starts from here
|| moveCountPruning || moveCountPruning
|| ss->staticEval + PieceValue[EG][pos.captured_piece()] <= alpha || ss->staticEval + PieceValue[EG][pos.captured_piece()] <= alpha
|| cutNode || cutNode
|| thisThread->ttHitAverage < 427 * TtHitAverageResolution * TtHitAverageWindow / 1024)) || thisThread->ttHitAverage < 432 * TtHitAverageResolution * TtHitAverageWindow / 1024))
{ {
Depth r = reduction(improving, depth, moveCount); Depth r = reduction(improving, depth, moveCount);
// Decrease reduction if the ttHit running average is large // Decrease reduction if the ttHit running average is large
if (thisThread->ttHitAverage > 509 * TtHitAverageResolution * TtHitAverageWindow / 1024) if (thisThread->ttHitAverage > 537 * TtHitAverageResolution * TtHitAverageWindow / 1024)
r--; r--;
// Increase reduction if other threads are searching this position // Increase reduction if other threads are searching this position
@@ -1208,10 +1208,10 @@ moves_loop: // When in check, search starts from here
- 5287; - 5287;
// Decrease/increase reduction by comparing opponent's stat score (~10 Elo) // Decrease/increase reduction by comparing opponent's stat score (~10 Elo)
if (ss->statScore >= -106 && (ss-1)->statScore < -104) if (ss->statScore >= -105 && (ss-1)->statScore < -103)
r--; r--;
else if ((ss-1)->statScore >= -119 && ss->statScore < -140) else if ((ss-1)->statScore >= -122 && ss->statScore < -129)
r++; r++;
// Decrease/increase reduction for moves with a good/bad history (~30 Elo) // Decrease/increase reduction for moves with a good/bad history (~30 Elo)
@@ -1225,7 +1225,7 @@ moves_loop: // When in check, search starts from here
// Unless giving check, this capture is likely bad // Unless giving check, this capture is likely bad
if ( !givesCheck if ( !givesCheck
&& ss->staticEval + PieceValue[EG][pos.captured_piece()] + 213 * depth <= alpha) && ss->staticEval + PieceValue[EG][pos.captured_piece()] + 210 * depth <= alpha)
r++; r++;
} }
@@ -1499,7 +1499,7 @@ moves_loop: // When in check, search starts from here
if (PvNode && bestValue > alpha) if (PvNode && bestValue > alpha)
alpha = bestValue; alpha = bestValue;
futilityBase = bestValue + 145; futilityBase = bestValue + 155;
} }
const PieceToHistory* contHist[] = { (ss-1)->continuationHistory, (ss-2)->continuationHistory, const PieceToHistory* contHist[] = { (ss-1)->continuationHistory, (ss-2)->continuationHistory,
+2 -2
View File
@@ -204,8 +204,8 @@ enum PieceType {
enum Piece { enum Piece {
NO_PIECE, NO_PIECE,
W_PAWN = 1, W_KNIGHT, W_BISHOP, W_ROOK, W_QUEEN, W_KING, W_PAWN = PAWN, W_KNIGHT, W_BISHOP, W_ROOK, W_QUEEN, W_KING,
B_PAWN = 9, B_KNIGHT, B_BISHOP, B_ROOK, B_QUEEN, B_KING, B_PAWN = PAWN + 8, B_KNIGHT, B_BISHOP, B_ROOK, B_QUEEN, B_KING,
PIECE_NB = 16 PIECE_NB = 16
}; };