Merge remote-tracking branch 'remotes/official/master' into merge

This commit is contained in:
noobpwnftw
2020-11-28 06:19:16 +08:00
16 changed files with 1086 additions and 988 deletions
+2 -1
View File
@@ -44,6 +44,7 @@ Daniel Dugovic (ddugovic)
Dariusz Orzechowski (dorzechowski) Dariusz Orzechowski (dorzechowski)
David Zar David Zar
Daylen Yang (daylen) Daylen Yang (daylen)
Deshawn Mohan-Smith (GoldenRare)
DiscanX DiscanX
Dominik Schlösser (domschl) Dominik Schlösser (domschl)
double-beep double-beep
@@ -64,7 +65,6 @@ Gary Heckman (gheckman)
George Sobala (gsobala) George Sobala (gsobala)
gguliash gguliash
Gian-Carlo Pascutto (gcp) Gian-Carlo Pascutto (gcp)
Deshawn Mohan-Smith (GoldenRare)
Gontran Lemaire (gonlem) Gontran Lemaire (gonlem)
Goodkov Vasiliy Aleksandrovich (goodkov) Goodkov Vasiliy Aleksandrovich (goodkov)
Gregor Cramer Gregor Cramer
@@ -112,6 +112,7 @@ Mark Tenzer (31m059)
marotear marotear
Matthew Lai (matthewlai) Matthew Lai (matthewlai)
Matthew Sullivan (Matt14916) Matthew Sullivan (Matt14916)
Maxim Molchanov (Maxim)
Michael An (man) Michael An (man)
Michael Byrne (MichaelB7) Michael Byrne (MichaelB7)
Michael Chaly (Vizvezdenec) Michael Chaly (Vizvezdenec)
+1 -1
View File
@@ -41,7 +41,7 @@ BINDIR = $(PREFIX)/bin
### Built-in benchmark for pgo-builds ### Built-in benchmark for pgo-builds
PGO_TRAINING_DATA_FILE = pgo_training_data.bin PGO_TRAINING_DATA_FILE = pgo_training_data.bin
PGOBENCH = ./$(EXE) bench PGOBENCH = ./$(EXE) bench
PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 1000 output_file_name $(PGO_TRAINING_DATA_FILE) PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 1000 sfen_format bin output_file_name $(PGO_TRAINING_DATA_FILE)
### Source and object files ### Source and object files
SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \
+14 -12
View File
@@ -84,11 +84,11 @@ using namespace Trace;
namespace { namespace {
// Threshold for lazy and space evaluation // Threshold for lazy and space evaluation
constexpr Value LazyThreshold1 = Value(1400); constexpr Value LazyThreshold1 = Value(1565);
constexpr Value LazyThreshold2 = Value(1300); constexpr Value LazyThreshold2 = Value(1102);
constexpr Value SpaceThreshold = Value(12222); constexpr Value SpaceThreshold = Value(11551);
constexpr Value NNUEThreshold1 = Value(550); constexpr Value NNUEThreshold1 = Value(682);
constexpr Value NNUEThreshold2 = Value(150); constexpr Value NNUEThreshold2 = Value(176);
// KingAttackWeights[PieceType] contains king attack weights by piece type // KingAttackWeights[PieceType] contains king attack weights by piece type
constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 81, 52, 44, 10 }; constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 81, 52, 44, 10 };
@@ -930,7 +930,7 @@ Value Eval::evaluate(const Position& pos) {
{ {
// Scale and shift NNUE for compatibility with search and classical evaluation // Scale and shift NNUE for compatibility with search and classical evaluation
auto adjusted_NNUE = [&](){ auto adjusted_NNUE = [&](){
int mat = pos.non_pawn_material() + PieceValue[MG][PAWN] * pos.count<PAWN>(); int mat = pos.non_pawn_material() + PawnValueMg * pos.count<PAWN>();
return NNUE::evaluate(pos) * (720 + mat / 32) / 1024 + Tempo; return NNUE::evaluate(pos) * (720 + mat / 32) / 1024 + Tempo;
}; };
@@ -940,16 +940,18 @@ Value Eval::evaluate(const Position& pos) {
bool largePsq = psq * 16 > (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50; bool largePsq = psq * 16 > (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50;
bool classical = largePsq || (psq > PawnValueMg / 4 && !(pos.this_thread()->nodes & 0xB)); bool classical = largePsq || (psq > PawnValueMg / 4 && !(pos.this_thread()->nodes & 0xB));
v = classical ? Evaluation<NO_TRACE>(pos).value() : adjusted_NNUE(); bool strongClassical = pos.non_pawn_material() < 2 * RookValueMg && pos.count<PAWN>() < 2;
v = classical || strongClassical ? Evaluation<NO_TRACE>(pos).value() : adjusted_NNUE();
// If the classical eval is small and imbalance large, use NNUE nevertheless. // If the classical eval is small and imbalance large, use NNUE nevertheless.
// For the case of opposite colored bishops, switch to NNUE eval with // For the case of opposite colored bishops, switch to NNUE eval with
// small probability if the classical eval is less than the threshold. // small probability if the classical eval is less than the threshold.
if ( largePsq if ( largePsq && !strongClassical
&& (abs(v) * 16 < NNUEThreshold2 * r50 && ( abs(v) * 16 < NNUEThreshold2 * r50
|| ( pos.opposite_bishops() || ( pos.opposite_bishops()
&& abs(v) * 16 < (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50 && abs(v) * 16 < (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50
&& !(pos.this_thread()->nodes & 0xB)))) && !(pos.this_thread()->nodes & 0xB))))
v = adjusted_NNUE(); v = adjusted_NNUE();
} }
+1 -2
View File
@@ -585,11 +585,10 @@ namespace CommandLine {
string argv0; // path+name of the executable binary, as given by argv[0] string argv0; // path+name of the executable binary, as given by argv[0]
string binaryDirectory; // path of the executable directory string binaryDirectory; // path of the executable directory
string workingDirectory; // path of the working directory string workingDirectory; // path of the working directory
string pathSeparator; // Separator for our current OS
void init(int argc, char* argv[]) { void init(int argc, char* argv[]) {
(void)argc; (void)argc;
string separator; string pathSeparator;
// extract the path+name of the executable binary // extract the path+name of the executable binary
argv0 = argv[0]; argv0 = argv[0];
+242 -259
View File
@@ -1,19 +1,19 @@
/* /*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1 Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file) Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or the Free Software Foundation, either version 3 of the License, or
(at your option) any later version. (at your option) any later version.
Stockfish is distributed in the hope that it will be useful, Stockfish is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details. GNU General Public License for more details.
You should have received a copy of the GNU General Public License You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
// Code for calculating NNUE evaluation function // Code for calculating NNUE evaluation function
@@ -40,330 +40,313 @@
namespace Eval::NNUE { namespace Eval::NNUE {
const uint32_t kpp_board_index[PIECE_NB][COLOR_NB] = { const uint32_t kpp_board_index[PIECE_NB][COLOR_NB] = {
// convention: W - us, B - them // convention: W - us, B - them
// viewed from other side, W and B are reversed // viewed from other side, W and B are reversed
{ PS_NONE, PS_NONE }, { PS_NONE, PS_NONE },
{ PS_W_PAWN, PS_B_PAWN }, { PS_W_PAWN, PS_B_PAWN },
{ PS_W_KNIGHT, PS_B_KNIGHT }, { PS_W_KNIGHT, PS_B_KNIGHT },
{ PS_W_BISHOP, PS_B_BISHOP }, { PS_W_BISHOP, PS_B_BISHOP },
{ PS_W_ROOK, PS_B_ROOK }, { PS_W_ROOK, PS_B_ROOK },
{ PS_W_QUEEN, PS_B_QUEEN }, { PS_W_QUEEN, PS_B_QUEEN },
{ PS_W_KING, PS_B_KING }, { PS_W_KING, PS_B_KING },
{ PS_NONE, PS_NONE }, { PS_NONE, PS_NONE },
{ PS_NONE, PS_NONE }, { PS_NONE, PS_NONE },
{ PS_B_PAWN, PS_W_PAWN }, { PS_B_PAWN, PS_W_PAWN },
{ PS_B_KNIGHT, PS_W_KNIGHT }, { PS_B_KNIGHT, PS_W_KNIGHT },
{ PS_B_BISHOP, PS_W_BISHOP }, { PS_B_BISHOP, PS_W_BISHOP },
{ PS_B_ROOK, PS_W_ROOK }, { PS_B_ROOK, PS_W_ROOK },
{ PS_B_QUEEN, PS_W_QUEEN }, { PS_B_QUEEN, PS_W_QUEEN },
{ PS_B_KING, PS_W_KING }, { PS_B_KING, PS_W_KING },
{ PS_NONE, PS_NONE } { PS_NONE, PS_NONE }
}; };
// Input feature converter // Input feature converter
LargePagePtr<FeatureTransformer> feature_transformer; LargePagePtr<FeatureTransformer> feature_transformer;
// Evaluation function // Evaluation function
AlignedPtr<Network> network; AlignedPtr<Network> network;
// Evaluation function file name // Evaluation function file name
std::string fileName; std::string fileName;
// Saved evaluation function file name // Saved evaluation function file name
std::string savedfileName = "nn.bin"; std::string savedfileName = "nn.bin";
// Get a string that represents the structure of the evaluation function // Get a string that represents the structure of the evaluation function
std::string get_architecture_string() { std::string get_architecture_string() {
return "Features=" + FeatureTransformer::get_structure_string() + return "Features=" + FeatureTransformer::get_structure_string() +
",Network=" + Network::get_structure_string(); ",Network=" + Network::get_structure_string();
} }
std::string get_layers_info() { std::string get_layers_info() {
return return
FeatureTransformer::get_layers_info() FeatureTransformer::get_layers_info()
+ '\n' + Network::get_layers_info(); + '\n' + Network::get_layers_info();
} }
UseNNUEMode useNNUE; UseNNUEMode useNNUE;
std::string eval_file_loaded = "None"; std::string eval_file_loaded = "None";
namespace Detail { namespace Detail {
// Initialize the evaluation function parameters // Initialize the evaluation function parameters
template <typename T> template <typename T>
void initialize(AlignedPtr<T>& pointer) { void initialize(AlignedPtr<T>& pointer) {
pointer.reset(reinterpret_cast<T*>(std_aligned_alloc(alignof(T), sizeof(T)))); pointer.reset(reinterpret_cast<T*>(std_aligned_alloc(alignof(T), sizeof(T))));
std::memset(pointer.get(), 0, sizeof(T)); std::memset(pointer.get(), 0, sizeof(T));
} }
template <typename T> template <typename T>
void initialize(LargePagePtr<T>& pointer) { void initialize(LargePagePtr<T>& pointer) {
static_assert(alignof(T) <= 4096, "aligned_large_pages_alloc() may fail for such a big alignment requirement of T"); static_assert(alignof(T) <= 4096, "aligned_large_pages_alloc() may fail for such a big alignment requirement of T");
pointer.reset(reinterpret_cast<T*>(aligned_large_pages_alloc(sizeof(T))));
std::memset(pointer.get(), 0, sizeof(T));
}
pointer.reset(reinterpret_cast<T*>(aligned_large_pages_alloc(sizeof(T)))); // Read evaluation function parameters
std::memset(pointer.get(), 0, sizeof(T)); template <typename T>
} bool ReadParameters(std::istream& stream, T& reference) {
// Read evaluation function parameters std::uint32_t header;
template <typename T> header = read_little_endian<std::uint32_t>(stream);
bool ReadParameters(std::istream& stream, T& reference) { if (!stream || header != T::GetHashValue()) return false;
return reference.ReadParameters(stream);
}
std::uint32_t header; // write evaluation function parameters
header = read_little_endian<std::uint32_t>(stream); template <typename T>
bool WriteParameters(std::ostream& stream, const AlignedPtr<T>& pointer) {
constexpr std::uint32_t header = T::GetHashValue();
if (!stream || header != T::GetHashValue()) stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
return false;
return reference.ReadParameters(stream); return pointer->WriteParameters(stream);
} }
// write evaluation function parameters template <typename T>
template <typename T> bool WriteParameters(std::ostream& stream, const LargePagePtr<T>& pointer) {
bool WriteParameters(std::ostream& stream, const AlignedPtr<T>& pointer) { constexpr std::uint32_t header = T::GetHashValue();
constexpr std::uint32_t header = T::GetHashValue();
stream.write(reinterpret_cast<const char*>(&header), sizeof(header)); stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
return pointer->WriteParameters(stream); return pointer->WriteParameters(stream);
} }
} // namespace Detail
template <typename T> // Initialize the evaluation function parameters
bool WriteParameters(std::ostream& stream, const LargePagePtr<T>& pointer) { void initialize() {
constexpr std::uint32_t header = T::GetHashValue();
stream.write(reinterpret_cast<const char*>(&header), sizeof(header)); Detail::initialize(feature_transformer);
Detail::initialize(network);
}
return pointer->WriteParameters(stream); // Read network header
} bool read_header(std::istream& stream, std::uint32_t* hash_value, std::string* architecture)
} // namespace Detail {
std::uint32_t version, size;
// Initialize the evaluation function parameters version = read_little_endian<std::uint32_t>(stream);
void initialize() { *hash_value = read_little_endian<std::uint32_t>(stream);
size = read_little_endian<std::uint32_t>(stream);
if (!stream || version != kVersion) return false;
architecture->resize(size);
stream.read(&(*architecture)[0], size);
return !stream.fail();
}
Detail::initialize(feature_transformer); // write the header
Detail::initialize(network); bool write_header(std::ostream& stream,
} std::uint32_t hash_value, const std::string& architecture) {
// Read network header stream.write(reinterpret_cast<const char*>(&kVersion), sizeof(kVersion));
bool read_header(std::istream& stream, std::uint32_t* hash_value, std::string* architecture) stream.write(reinterpret_cast<const char*>(&hash_value), sizeof(hash_value));
{
std::uint32_t version, size;
version = read_little_endian<std::uint32_t>(stream); const std::uint32_t size = static_cast<std::uint32_t>(architecture.size());
*hash_value = read_little_endian<std::uint32_t>(stream);
size = read_little_endian<std::uint32_t>(stream);
if (!stream || version != kVersion) stream.write(reinterpret_cast<const char*>(&size), sizeof(size));
return false; stream.write(architecture.data(), size);
architecture->resize(size); return !stream.fail();
stream.read(&(*architecture)[0], size); }
return !stream.fail(); // Read network parameters
} bool ReadParameters(std::istream& stream) {
// write the header std::uint32_t hash_value;
bool write_header(std::ostream& stream, std::string architecture;
std::uint32_t hash_value, const std::string& architecture) { if (!read_header(stream, &hash_value, &architecture)) return false;
if (hash_value != kHashValue) return false;
if (!Detail::ReadParameters(stream, *feature_transformer)) return false;
if (!Detail::ReadParameters(stream, *network)) return false;
return stream && stream.peek() == std::ios::traits_type::eof();
}
stream.write(reinterpret_cast<const char*>(&kVersion), sizeof(kVersion)); // write evaluation function parameters
stream.write(reinterpret_cast<const char*>(&hash_value), sizeof(hash_value)); bool WriteParameters(std::ostream& stream) {
const std::uint32_t size = static_cast<std::uint32_t>(architecture.size()); if (!write_header(stream, kHashValue, get_architecture_string()))
return false;
stream.write(reinterpret_cast<const char*>(&size), sizeof(size)); if (!Detail::WriteParameters(stream, feature_transformer))
stream.write(architecture.data(), size); return false;
return !stream.fail(); if (!Detail::WriteParameters(stream, network))
} return false;
// Read network parameters return !stream.fail();
bool ReadParameters(std::istream& stream) { }
std::uint32_t hash_value; // Evaluation function. Perform differential calculation.
std::string architecture; Value evaluate(const Position& pos) {
if (!read_header(stream, &hash_value, &architecture))
return false;
if (hash_value != kHashValue) // We manually align the arrays on the stack because with gcc < 9.3
return false; // overaligning stack variables with alignas() doesn't work correctly.
if (!Detail::ReadParameters(stream, *feature_transformer)) constexpr uint64_t alignment = kCacheLineSize;
return false;
if (!Detail::ReadParameters(stream, *network))
return false;
return stream && stream.peek() == std::ios::traits_type::eof();
}
// write evaluation function parameters
bool WriteParameters(std::ostream& stream) {
if (!write_header(stream, kHashValue, get_architecture_string()))
return false;
if (!Detail::WriteParameters(stream, feature_transformer))
return false;
if (!Detail::WriteParameters(stream, network))
return false;
return !stream.fail();
}
// Evaluation function. Perform differential calculation.
Value evaluate(const Position& pos) {
// We manually align the arrays on the stack because with gcc < 9.3
// overaligning stack variables with alignas() doesn't work correctly.
constexpr uint64_t alignment = kCacheLineSize;
#if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN) #if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN)
TransformedFeatureType transformed_features_unaligned[ TransformedFeatureType transformed_features_unaligned[
FeatureTransformer::kBufferSize + alignment / sizeof(TransformedFeatureType)]; FeatureTransformer::kBufferSize + alignment / sizeof(TransformedFeatureType)];
char buffer_unaligned[Network::kBufferSize + alignment]; char buffer_unaligned[Network::kBufferSize + alignment];
auto* transformed_features = align_ptr_up<alignment>(&transformed_features_unaligned[0]); auto* transformed_features = align_ptr_up<alignment>(&transformed_features_unaligned[0]);
auto* buffer = align_ptr_up<alignment>(&buffer_unaligned[0]); auto* buffer = align_ptr_up<alignment>(&buffer_unaligned[0]);
#else #else
alignas(alignment) alignas(alignment)
TransformedFeatureType transformed_features[FeatureTransformer::kBufferSize]; TransformedFeatureType transformed_features[FeatureTransformer::kBufferSize];
alignas(alignment) char buffer[Network::kBufferSize]; alignas(alignment) char buffer[Network::kBufferSize];
#endif #endif
ASSERT_ALIGNED(transformed_features, alignment); ASSERT_ALIGNED(transformed_features, alignment);
ASSERT_ALIGNED(buffer, alignment); ASSERT_ALIGNED(buffer, alignment);
feature_transformer->Transform(pos, transformed_features); feature_transformer->Transform(pos, transformed_features);
const auto output = network->Propagate(transformed_features, buffer);
return static_cast<Value>(output[0] / FV_SCALE);
}
const auto output = network->Propagate(transformed_features, buffer); // Load eval, from a file stream or a memory stream
bool load_eval(std::string name, std::istream& stream) {
return static_cast<Value>(output[0] / FV_SCALE); initialize();
} fileName = name;
return ReadParameters(stream);
}
// Load eval, from a file stream or a memory stream static UseNNUEMode nnue_mode_from_option(const UCI::Option& mode)
bool load_eval(std::string name, std::istream& stream) { {
if (mode == "false")
return UseNNUEMode::False;
else if (mode == "true")
return UseNNUEMode::True;
else if (mode == "pure")
return UseNNUEMode::Pure;
initialize(); return UseNNUEMode::False;
}
fileName = name; void init() {
return ReadParameters(stream);
}
static UseNNUEMode nnue_mode_from_option(const UCI::Option& mode) useNNUE = nnue_mode_from_option(Options["Use NNUE"]);
{
if (mode == "false")
return UseNNUEMode::False;
else if (mode == "true")
return UseNNUEMode::True;
else if (mode == "pure")
return UseNNUEMode::Pure;
return UseNNUEMode::False; if (Options["SkipLoadingEval"] || useNNUE == UseNNUEMode::False)
} {
eval_file_loaded.clear();
return;
}
void init() { std::string eval_file = std::string(Options["EvalFile"]);
useNNUE = nnue_mode_from_option(Options["Use NNUE"]);
if (Options["SkipLoadingEval"] || useNNUE == UseNNUEMode::False)
{
eval_file_loaded.clear();
return;
}
std::string eval_file = std::string(Options["EvalFile"]);
#if defined(DEFAULT_NNUE_DIRECTORY) #if defined(DEFAULT_NNUE_DIRECTORY)
#define stringify2(x) #x #define stringify2(x) #x
#define stringify(x) stringify2(x) #define stringify(x) stringify2(x)
std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory , stringify(DEFAULT_NNUE_DIRECTORY) }; std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory , stringify(DEFAULT_NNUE_DIRECTORY) };
#else #else
std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory }; std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory };
#endif #endif
for (std::string directory : dirs) for (std::string directory : dirs)
{ {
if (eval_file_loaded != eval_file) if (eval_file_loaded != eval_file)
{ {
std::ifstream stream(directory + eval_file, std::ios::binary); std::ifstream stream(directory + eval_file, std::ios::binary);
if (load_eval(eval_file, stream)) if (load_eval(eval_file, stream))
{ {
sync_cout << "info string Loaded eval file " << directory + eval_file << sync_endl; sync_cout << "info string Loaded eval file " << directory + eval_file << sync_endl;
eval_file_loaded = eval_file; eval_file_loaded = eval_file;
} }
else else
{ {
sync_cout << "info string ERROR: failed to load eval file " << directory + eval_file << sync_endl; sync_cout << "info string ERROR: failed to load eval file " << directory + eval_file << sync_endl;
eval_file_loaded.clear(); eval_file_loaded.clear();
} }
} }
} }
#undef stringify2 #undef stringify2
#undef stringify #undef stringify
} }
/// NNUE::verify() verifies that the last net used was loaded successfully /// NNUE::verify() verifies that the last net used was loaded successfully
void verify_eval_file_loaded() { void verify_eval_file_loaded() {
std::string eval_file = std::string(Options["EvalFile"]); std::string eval_file = std::string(Options["EvalFile"]);
if (useNNUE != UseNNUEMode::False && eval_file_loaded != eval_file) if (useNNUE != UseNNUEMode::False && eval_file_loaded != eval_file)
{ {
UCI::OptionsMap defaults; UCI::OptionsMap defaults;
UCI::init(defaults); UCI::init(defaults);
std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available."; std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
std::string msg2 = "The option is set to true, but the network file " + eval_file + " was not loaded successfully."; std::string msg2 = "The option is set to true, but the network file " + eval_file + " was not loaded successfully.";
std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file."; std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
std::string msg4 = "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/" + std::string(defaults["EvalFile"]); std::string msg4 = "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/" + std::string(defaults["EvalFile"]);
std::string msg5 = "The engine will be terminated now."; std::string msg5 = "The engine will be terminated now.";
sync_cout << "info string ERROR: " << msg1 << sync_endl; sync_cout << "info string ERROR: " << msg1 << sync_endl;
sync_cout << "info string ERROR: " << msg2 << sync_endl; sync_cout << "info string ERROR: " << msg2 << sync_endl;
sync_cout << "info string ERROR: " << msg3 << sync_endl; sync_cout << "info string ERROR: " << msg3 << sync_endl;
sync_cout << "info string ERROR: " << msg4 << sync_endl; sync_cout << "info string ERROR: " << msg4 << sync_endl;
sync_cout << "info string ERROR: " << msg5 << sync_endl; sync_cout << "info string ERROR: " << msg5 << sync_endl;
std::exit(EXIT_FAILURE); std::exit(EXIT_FAILURE);
} }
if (useNNUE != UseNNUEMode::False) if (useNNUE != UseNNUEMode::False)
sync_cout << "info string NNUE evaluation using " << eval_file << " enabled" << sync_endl; sync_cout << "info string NNUE evaluation using " << eval_file << " enabled" << sync_endl;
else else
sync_cout << "info string classical evaluation enabled" << sync_endl; sync_cout << "info string classical evaluation enabled" << sync_endl;
} }
/// In training we override eval file so this is useful. /// In training we override eval file so this is useful.
void verify_any_net_loaded() { void verify_any_net_loaded() {
if (!Options["SkipLoadingEval"] && useNNUE != UseNNUEMode::False && eval_file_loaded.empty()) if (!Options["SkipLoadingEval"] && useNNUE != UseNNUEMode::False && eval_file_loaded.empty())
{ {
UCI::OptionsMap defaults; UCI::OptionsMap defaults;
UCI::init(defaults); UCI::init(defaults);
std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available."; std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
std::string msg2 = "The option is set to true, but the network file was not loaded successfully."; std::string msg2 = "The option is set to true, but the network file was not loaded successfully.";
std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file."; std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
std::string msg5 = "The engine will be terminated now."; std::string msg5 = "The engine will be terminated now.";
sync_cout << "info string ERROR: " << msg1 << sync_endl; sync_cout << "info string ERROR: " << msg1 << sync_endl;
sync_cout << "info string ERROR: " << msg2 << sync_endl; sync_cout << "info string ERROR: " << msg2 << sync_endl;
sync_cout << "info string ERROR: " << msg3 << sync_endl; sync_cout << "info string ERROR: " << msg3 << sync_endl;
sync_cout << "info string ERROR: " << msg5 << sync_endl; sync_cout << "info string ERROR: " << msg5 << sync_endl;
std::exit(EXIT_FAILURE); std::exit(EXIT_FAILURE);
} }
if (useNNUE != UseNNUEMode::False) if (useNNUE != UseNNUEMode::False)
sync_cout << "info string NNUE evaluation using " << eval_file_loaded << " enabled" << sync_endl; sync_cout << "info string NNUE evaluation using " << eval_file_loaded << " enabled" << sync_endl;
else else
sync_cout << "info string classical evaluation enabled" << sync_endl; sync_cout << "info string classical evaluation enabled" << sync_endl;
} }
} // namespace Eval::NNUE } // namespace Eval::NNUE
+70 -69
View File
@@ -1,21 +1,23 @@
/* /*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1 Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file) Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or the Free Software Foundation, either version 3 of the License, or
(at your option) any later version. (at your option) any later version.
Stockfish is distributed in the hope that it will be useful, Stockfish is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details. GNU General Public License for more details.
You should have received a copy of the GNU General Public License You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
// header used in NNUE evaluation function
#ifndef NNUE_EVALUATE_NNUE_H_INCLUDED #ifndef NNUE_EVALUATE_NNUE_H_INCLUDED
#define NNUE_EVALUATE_NNUE_H_INCLUDED #define NNUE_EVALUATE_NNUE_H_INCLUDED
@@ -25,84 +27,83 @@
#include <memory> #include <memory>
// header used in NNUE evaluation function
namespace Eval::NNUE { namespace Eval::NNUE {
enum struct UseNNUEMode enum struct UseNNUEMode
{ {
False, False,
True, True,
Pure Pure
}; };
// Hash value of evaluation function structure // Hash value of evaluation function structure
constexpr std::uint32_t kHashValue = constexpr std::uint32_t kHashValue =
FeatureTransformer::GetHashValue() ^ Network::GetHashValue(); FeatureTransformer::GetHashValue() ^ Network::GetHashValue();
// Deleter for automating release of memory area // Deleter for automating release of memory area
template <typename T> template <typename T>
struct AlignedDeleter { struct AlignedDeleter {
void operator()(T* ptr) const { void operator()(T* ptr) const {
ptr->~T(); ptr->~T();
std_aligned_free(ptr); std_aligned_free(ptr);
} }
}; };
template <typename T> template <typename T>
struct LargePageDeleter { struct LargePageDeleter {
void operator()(T* ptr) const { void operator()(T* ptr) const {
ptr->~T(); ptr->~T();
aligned_large_pages_free(ptr); aligned_large_pages_free(ptr);
} }
}; };
template <typename T> template <typename T>
using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>; using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>;
template <typename T> template <typename T>
using LargePagePtr = std::unique_ptr<T, LargePageDeleter<T>>; using LargePagePtr = std::unique_ptr<T, LargePageDeleter<T>>;
// Input feature converter // Input feature converter
extern LargePagePtr<FeatureTransformer> feature_transformer; extern LargePagePtr<FeatureTransformer> feature_transformer;
// Evaluation function // Evaluation function
extern AlignedPtr<Network> network; extern AlignedPtr<Network> network;
// Evaluation function file name // Evaluation function file name
extern std::string fileName; extern std::string fileName;
// Saved evaluation function file name // Saved evaluation function file name
extern std::string savedfileName; extern std::string savedfileName;
extern UseNNUEMode useNNUE; extern UseNNUEMode useNNUE;
extern std::string eval_file_loaded; extern std::string eval_file_loaded;
// Get a string that represents the structure of the evaluation function // Get a string that represents the structure of the evaluation function
std::string get_architecture_string(); std::string get_architecture_string();
std::string get_layers_info(); std::string get_layers_info();
// read the header // read the header
bool read_header(std::istream& stream, bool read_header(std::istream& stream,
std::uint32_t* hash_value, std::string* architecture); std::uint32_t* hash_value, std::string* architecture);
// write the header // write the header
bool write_header(std::ostream& stream, bool write_header(std::ostream& stream,
std::uint32_t hash_value, const std::string& architecture); std::uint32_t hash_value, const std::string& architecture);
// read evaluation function parameters // read evaluation function parameters
bool ReadParameters(std::istream& stream); bool ReadParameters(std::istream& stream);
// write evaluation function parameters // write evaluation function parameters
bool WriteParameters(std::ostream& stream); bool WriteParameters(std::ostream& stream);
Value evaluate(const Position& pos); Value evaluate(const Position& pos);
bool load_eval(std::string name, std::istream& stream); bool load_eval(std::string name, std::istream& stream);
void init(); void init();
void verify_eval_file_loaded(); void verify_eval_file_loaded();
void verify_any_net_loaded(); void verify_any_net_loaded();
} // namespace Eval::NNUE } // namespace Eval::NNUE
+12 -13
View File
@@ -1,19 +1,19 @@
/* /*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1 Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file) Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or the Free Software Foundation, either version 3 of the License, or
(at your option) any later version. (at your option) any later version.
Stockfish is distributed in the hope that it will be useful, Stockfish is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details. GNU General Public License for more details.
You should have received a copy of the GNU General Public License You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
// A class template that represents the input feature set of the NNUE evaluation function // A class template that represents the input feature set of the NNUE evaluation function
@@ -22,7 +22,6 @@
#define NNUE_FEATURE_SET_H_INCLUDED #define NNUE_FEATURE_SET_H_INCLUDED
#include "features_common.h" #include "features_common.h"
#include <array> #include <array>
namespace Eval::NNUE::Features { namespace Eval::NNUE::Features {
+29 -30
View File
@@ -1,19 +1,19 @@
/* /*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1 Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file) Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or the Free Software Foundation, either version 3 of the License, or
(at your option) any later version. (at your option) any later version.
Stockfish is distributed in the hope that it will be useful, Stockfish is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details. GNU General Public License for more details.
You should have received a copy of the GNU General Public License You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
//Common header of input features of NNUE evaluation function //Common header of input features of NNUE evaluation function
@@ -21,30 +21,29 @@
#ifndef NNUE_FEATURES_COMMON_H_INCLUDED #ifndef NNUE_FEATURES_COMMON_H_INCLUDED
#define NNUE_FEATURES_COMMON_H_INCLUDED #define NNUE_FEATURES_COMMON_H_INCLUDED
#include "evaluate.h" #include "../../evaluate.h"
#include "../nnue_common.h"
#include "nnue/nnue_common.h"
namespace Eval::NNUE::Features { namespace Eval::NNUE::Features {
class IndexList; class IndexList;
template <typename... FeatureTypes> template <typename... FeatureTypes>
class FeatureSet; class FeatureSet;
// Trigger to perform full calculations instead of difference only // Trigger to perform full calculations instead of difference only
enum class TriggerEvent { enum class TriggerEvent {
kNone, // Calculate the difference whenever possible kNone, // Calculate the difference whenever possible
kFriendKingMoved, // calculate full evaluation when own king moves kFriendKingMoved, // calculate full evaluation when own king moves
kEnemyKingMoved, // calculate full evaluation when opponent king moves kEnemyKingMoved, // calculate full evaluation when opponent king moves
kAnyKingMoved, // calculate full evaluation when any king moves kAnyKingMoved, // calculate full evaluation when any king moves
kAnyPieceMoved, // always calculate full evaluation kAnyPieceMoved, // always calculate full evaluation
}; };
enum class Side { enum class Side {
kFriend, // side to move kFriend, // side to move
kEnemy, // opponent kEnemy, // opponent
}; };
} // namespace Eval::NNUE::Features } // namespace Eval::NNUE::Features
+42 -42
View File
@@ -1,19 +1,19 @@
/* /*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1 Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file) Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or the Free Software Foundation, either version 3 of the License, or
(at your option) any later version. (at your option) any later version.
Stockfish is distributed in the hope that it will be useful, Stockfish is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details. GNU General Public License for more details.
You should have received a copy of the GNU General Public License You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
// Definition of index list of input features // Definition of index list of input features
@@ -21,43 +21,43 @@
#ifndef NNUE_FEATURES_INDEX_LIST_H_INCLUDED #ifndef NNUE_FEATURES_INDEX_LIST_H_INCLUDED
#define NNUE_FEATURES_INDEX_LIST_H_INCLUDED #define NNUE_FEATURES_INDEX_LIST_H_INCLUDED
#include "position.h" #include "../../position.h"
#include "../nnue_architecture.h"
#include "nnue/nnue_architecture.h"
namespace Eval::NNUE::Features { namespace Eval::NNUE::Features {
// Class template used for feature index list // Class template used for feature index list
template <typename T, std::size_t MaxSize> template <typename T, std::size_t MaxSize>
class ValueList { class ValueList {
public: public:
std::size_t size() const { return size_; } std::size_t size() const { return size_; }
void resize(std::size_t size) { size_ = size; } void resize(std::size_t size) { size_ = size; }
void push_back(const T& value) { values_[size_++] = value; } void push_back(const T& value) { values_[size_++] = value; }
T& operator[](std::size_t index) { return values_[index]; } T& operator[](std::size_t index) { return values_[index]; }
T* begin() { return values_; } T* begin() { return values_; }
T* end() { return values_ + size_; } T* end() { return values_ + size_; }
const T& operator[](std::size_t index) const { return values_[index]; } const T& operator[](std::size_t index) const { return values_[index]; }
const T* begin() const { return values_; } const T* begin() const { return values_; }
const T* end() const { return values_ + size_; } const T* end() const { return values_ + size_; }
void swap(ValueList& other) { void swap(ValueList& other) {
const std::size_t max_size = std::max(size_, other.size_); const std::size_t max_size = std::max(size_, other.size_);
for (std::size_t i = 0; i < max_size; ++i) { for (std::size_t i = 0; i < max_size; ++i) {
std::swap(values_[i], other.values_[i]); std::swap(values_[i], other.values_[i]);
} }
std::swap(size_, other.size_); std::swap(size_, other.size_);
} }
private: private:
T values_[MaxSize] = {}; T values_[MaxSize];
std::size_t size_ = 0; std::size_t size_ = 0;
}; };
//Type of feature index list //Type of feature index list
class IndexList : public ValueList<IndexType, RawFeatures::kMaxActiveDimensions> { class IndexList
}; : public ValueList<IndexType, RawFeatures::kMaxActiveDimensions> {
};
} // namespace Eval::NNUE::Features } // namespace Eval::NNUE::Features
+147 -64
View File
@@ -223,13 +223,13 @@ namespace Eval::NNUE::Layers {
return _mm512_add_epi32(_mm512_permutexvar_epi32(indices, x), bias); return _mm512_add_epi32(_mm512_permutexvar_epi32(indices, x), bias);
}; };
[[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) {
#if defined (USE_VNNI) #if defined (USE_VNNI)
[[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) {
acc = _mm512_dpbusd_epi32(acc, a, b); acc = _mm512_dpbusd_epi32(acc, a, b);
#else #else
[[maybe_unused]] auto m512_dpbusd_epi32 = [=](__m512i a, __m512i b) -> __m512i {
__m512i product0 = _mm512_maddubs_epi16(a, b); __m512i product0 = _mm512_maddubs_epi16(a, b);
product0 = _mm512_madd_epi16(product0, kOnes512); return _mm512_madd_epi16(product0, kOnes512);
acc = _mm512_add_epi32(acc, product0);
#endif #endif
}; };
@@ -256,14 +256,13 @@ namespace Eval::NNUE::Layers {
return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias); return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias);
}; };
[[maybe_unused]] auto m256_add_dpbusd_epi32 = [=](__m256i& acc, __m256i a, __m256i b) {
#if defined (USE_VNNI) #if defined (USE_VNNI)
[[maybe_unused]] auto m256_add_dpbusd_epi32 = [=](__m256i& acc, __m256i a, __m256i b) {
acc = _mm256_dpbusd_epi32(acc, a, b); acc = _mm256_dpbusd_epi32(acc, a, b);
#else #else
[[maybe_unused]] auto m256_dpbusd_epi32 = [=](__m256i a, __m256i b) -> __m256i {
__m256i product0 = _mm256_maddubs_epi16(a, b); __m256i product0 = _mm256_maddubs_epi16(a, b);
product0 = _mm256_madd_epi16(product0, kOnes256); return _mm256_madd_epi16(product0, kOnes256);
acc = _mm256_add_epi32(acc, product0);
#endif #endif
}; };
@@ -288,10 +287,9 @@ namespace Eval::NNUE::Layers {
return _mm_add_epi32(sum0, bias); return _mm_add_epi32(sum0, bias);
}; };
[[maybe_unused]] auto m128_add_dpbusd_epi32 = [=](__m128i& acc, __m128i a, __m128i b) { [[maybe_unused]] auto m128_dpbusd_epi32 = [=](__m128i a, __m128i b) -> __m128i {
__m128i product0 = _mm_maddubs_epi16(a, b); __m128i product0 = _mm_maddubs_epi16(a, b);
product0 = _mm_madd_epi16(product0, kOnes128); return _mm_madd_epi16(product0, kOnes128);
acc = _mm_add_epi32(acc, product0);
}; };
#endif #endif
@@ -335,15 +333,6 @@ namespace Eval::NNUE::Layers {
const __m512i bias = *reinterpret_cast<const __m512i*>(&biases_[i]); const __m512i bias = *reinterpret_cast<const __m512i*>(&biases_[i]);
__m512i* outptr = reinterpret_cast<__m512i*>(&output[i]); __m512i* outptr = reinterpret_cast<__m512i*>(&output[i]);
__m512i sum01a = _mm512_setzero_si512();
__m512i sum23a = _mm512_setzero_si512();
__m512i sum45a = _mm512_setzero_si512();
__m512i sum67a = _mm512_setzero_si512();
__m512i sum01b = _mm512_setzero_si512();
__m512i sum23b = _mm512_setzero_si512();
__m512i sum45b = _mm512_setzero_si512();
__m512i sum67b = _mm512_setzero_si512();
const auto row01a = *reinterpret_cast<const __m512i*>(&weights_[offset01a]); const auto row01a = *reinterpret_cast<const __m512i*>(&weights_[offset01a]);
const auto row23a = *reinterpret_cast<const __m512i*>(&weights_[offset23a]); const auto row23a = *reinterpret_cast<const __m512i*>(&weights_[offset23a]);
const auto row45a = *reinterpret_cast<const __m512i*>(&weights_[offset45a]); const auto row45a = *reinterpret_cast<const __m512i*>(&weights_[offset45a]);
@@ -356,6 +345,16 @@ namespace Eval::NNUE::Layers {
const __m256i in256 = input_vector256[0]; const __m256i in256 = input_vector256[0];
const __m512i in = _mm512_inserti64x4(_mm512_castsi256_si512(in256), in256, 1); const __m512i in = _mm512_inserti64x4(_mm512_castsi256_si512(in256), in256, 1);
#if defined (USE_VNNI)
__m512i sum01a = _mm512_setzero_si512();
__m512i sum23a = _mm512_setzero_si512();
__m512i sum45a = _mm512_setzero_si512();
__m512i sum67a = _mm512_setzero_si512();
__m512i sum01b = _mm512_setzero_si512();
__m512i sum23b = _mm512_setzero_si512();
__m512i sum45b = _mm512_setzero_si512();
__m512i sum67b = _mm512_setzero_si512();
m512_add_dpbusd_epi32(sum01a, in, row01a); m512_add_dpbusd_epi32(sum01a, in, row01a);
m512_add_dpbusd_epi32(sum23a, in, row23a); m512_add_dpbusd_epi32(sum23a, in, row23a);
m512_add_dpbusd_epi32(sum45a, in, row45a); m512_add_dpbusd_epi32(sum45a, in, row45a);
@@ -364,6 +363,16 @@ namespace Eval::NNUE::Layers {
m512_add_dpbusd_epi32(sum23b, in, row23b); m512_add_dpbusd_epi32(sum23b, in, row23b);
m512_add_dpbusd_epi32(sum45b, in, row45b); m512_add_dpbusd_epi32(sum45b, in, row45b);
m512_add_dpbusd_epi32(sum67b, in, row67b); m512_add_dpbusd_epi32(sum67b, in, row67b);
#else
__m512i sum01a = m512_dpbusd_epi32(in, row01a);
__m512i sum23a = m512_dpbusd_epi32(in, row23a);
__m512i sum45a = m512_dpbusd_epi32(in, row45a);
__m512i sum67a = m512_dpbusd_epi32(in, row67a);
__m512i sum01b = m512_dpbusd_epi32(in, row01b);
__m512i sum23b = m512_dpbusd_epi32(in, row23b);
__m512i sum45b = m512_dpbusd_epi32(in, row45b);
__m512i sum67b = m512_dpbusd_epi32(in, row67b);
#endif
*outptr = m512_hadd256x16( *outptr = m512_hadd256x16(
sum01a, sum23a, sum45a, sum67a, sum01a, sum23a, sum45a, sum67a,
@@ -384,48 +393,80 @@ namespace Eval::NNUE::Layers {
if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0) if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0)
{ {
__m512i sum0 = _mm512_setzero_si512();
__m512i sum1 = _mm512_setzero_si512();
__m512i sum2 = _mm512_setzero_si512();
__m512i sum3 = _mm512_setzero_si512();
const auto row0 = reinterpret_cast<const __m512i*>(&weights_[offset0]); const auto row0 = reinterpret_cast<const __m512i*>(&weights_[offset0]);
const auto row1 = reinterpret_cast<const __m512i*>(&weights_[offset1]); const auto row1 = reinterpret_cast<const __m512i*>(&weights_[offset1]);
const auto row2 = reinterpret_cast<const __m512i*>(&weights_[offset2]); const auto row2 = reinterpret_cast<const __m512i*>(&weights_[offset2]);
const auto row3 = reinterpret_cast<const __m512i*>(&weights_[offset3]); const auto row3 = reinterpret_cast<const __m512i*>(&weights_[offset3]);
for (IndexType j = 0; j < kNumChunks512; ++j) #if defined (USE_VNNI)
__m512i sum0 = _mm512_setzero_si512();
__m512i sum1 = _mm512_setzero_si512();
__m512i sum2 = _mm512_setzero_si512();
__m512i sum3 = _mm512_setzero_si512();
const IndexType kStart = 0;
#else
__m512i sum0 = m512_dpbusd_epi32(input_vector512[0], row0[0]);
__m512i sum1 = m512_dpbusd_epi32(input_vector512[0], row1[0]);
__m512i sum2 = m512_dpbusd_epi32(input_vector512[0], row2[0]);
__m512i sum3 = m512_dpbusd_epi32(input_vector512[0], row3[0]);
const IndexType kStart = 1;
#endif
for (IndexType j = kStart; j < kNumChunks512; ++j)
{ {
const __m512i in = input_vector512[j]; const __m512i in = input_vector512[j];
#if defined (USE_VNNI)
m512_add_dpbusd_epi32(sum0, in, row0[j]); m512_add_dpbusd_epi32(sum0, in, row0[j]);
m512_add_dpbusd_epi32(sum1, in, row1[j]); m512_add_dpbusd_epi32(sum1, in, row1[j]);
m512_add_dpbusd_epi32(sum2, in, row2[j]); m512_add_dpbusd_epi32(sum2, in, row2[j]);
m512_add_dpbusd_epi32(sum3, in, row3[j]); m512_add_dpbusd_epi32(sum3, in, row3[j]);
#else
sum0 = _mm512_add_epi32(sum0, m512_dpbusd_epi32(in, row0[j]));
sum1 = _mm512_add_epi32(sum1, m512_dpbusd_epi32(in, row1[j]));
sum2 = _mm512_add_epi32(sum2, m512_dpbusd_epi32(in, row2[j]));
sum3 = _mm512_add_epi32(sum3, m512_dpbusd_epi32(in, row3[j]));
#endif
} }
*outptr = m512_haddx4(sum0, sum1, sum2, sum3, bias); *outptr = m512_haddx4(sum0, sum1, sum2, sum3, bias);
} }
else else
{ {
__m256i sum0 = _mm256_setzero_si256();
__m256i sum1 = _mm256_setzero_si256();
__m256i sum2 = _mm256_setzero_si256();
__m256i sum3 = _mm256_setzero_si256();
const auto row0 = reinterpret_cast<const __m256i*>(&weights_[offset0]); const auto row0 = reinterpret_cast<const __m256i*>(&weights_[offset0]);
const auto row1 = reinterpret_cast<const __m256i*>(&weights_[offset1]); const auto row1 = reinterpret_cast<const __m256i*>(&weights_[offset1]);
const auto row2 = reinterpret_cast<const __m256i*>(&weights_[offset2]); const auto row2 = reinterpret_cast<const __m256i*>(&weights_[offset2]);
const auto row3 = reinterpret_cast<const __m256i*>(&weights_[offset3]); const auto row3 = reinterpret_cast<const __m256i*>(&weights_[offset3]);
for (IndexType j = 0; j < kNumChunks256; ++j) #if defined (USE_VNNI)
__m256i sum0 = _mm256_setzero_si256();
__m256i sum1 = _mm256_setzero_si256();
__m256i sum2 = _mm256_setzero_si256();
__m256i sum3 = _mm256_setzero_si256();
const IndexType kStart = 0;
#else
__m256i sum0 = m256_dpbusd_epi32(input_vector256[0], row0[0]);
__m256i sum1 = m256_dpbusd_epi32(input_vector256[0], row1[0]);
__m256i sum2 = m256_dpbusd_epi32(input_vector256[0], row2[0]);
__m256i sum3 = m256_dpbusd_epi32(input_vector256[0], row3[0]);
const IndexType kStart = 1;
#endif
for (IndexType j = kStart; j < kNumChunks256; ++j)
{ {
const __m256i in = input_vector256[j]; const __m256i in = input_vector256[j];
#if defined (USE_VNNI)
m256_add_dpbusd_epi32(sum0, in, row0[j]); m256_add_dpbusd_epi32(sum0, in, row0[j]);
m256_add_dpbusd_epi32(sum1, in, row1[j]); m256_add_dpbusd_epi32(sum1, in, row1[j]);
m256_add_dpbusd_epi32(sum2, in, row2[j]); m256_add_dpbusd_epi32(sum2, in, row2[j]);
m256_add_dpbusd_epi32(sum3, in, row3[j]); m256_add_dpbusd_epi32(sum3, in, row3[j]);
#else
sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
sum1 = _mm256_add_epi32(sum1, m256_dpbusd_epi32(in, row1[j]));
sum2 = _mm256_add_epi32(sum2, m256_dpbusd_epi32(in, row2[j]));
sum3 = _mm256_add_epi32(sum3, m256_dpbusd_epi32(in, row3[j]));
#endif
} }
*outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias); *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias);
@@ -436,30 +477,50 @@ namespace Eval::NNUE::Layers {
{ {
if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0) if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0)
{ {
__m512i sum0 = _mm512_setzero_si512();
const auto row0 = reinterpret_cast<const __m512i*>(&weights_[0]); const auto row0 = reinterpret_cast<const __m512i*>(&weights_[0]);
for (IndexType j = 0; j < kNumChunks512; ++j) #if defined (USE_VNNI)
__m512i sum0 = _mm512_setzero_si512();
const IndexType kStart = 0;
#else
__m512i sum0 = m512_dpbusd_epi32(input_vector512[0], row0[0]);
const IndexType kStart = 1;
#endif
for (IndexType j = kStart; j < kNumChunks512; ++j)
{ {
const __m512i in = input_vector512[j]; const __m512i in = input_vector512[j];
#if defined (USE_VNNI)
m512_add_dpbusd_epi32(sum0, in, row0[j]); m512_add_dpbusd_epi32(sum0, in, row0[j]);
#else
sum0 = _mm512_add_epi32(sum0, m512_dpbusd_epi32(in, row0[j]));
#endif
} }
output[0] = m512_hadd(sum0, biases_[0]); output[0] = m512_hadd(sum0, biases_[0]);
} }
else else
{ {
__m256i sum0 = _mm256_setzero_si256();
const auto row0 = reinterpret_cast<const __m256i*>(&weights_[0]); const auto row0 = reinterpret_cast<const __m256i*>(&weights_[0]);
for (IndexType j = 0; j < kNumChunks256; ++j) #if defined (USE_VNNI)
__m256i sum0 = _mm256_setzero_si256();
const IndexType kStart = 0;
#else
__m256i sum0 = m256_dpbusd_epi32(input_vector256[0], row0[0]);
const IndexType kStart = 1;
#endif
for (IndexType j = kStart; j < kNumChunks256; ++j)
{ {
const __m256i in = input_vector256[j]; const __m256i in = input_vector256[j];
#if defined (USE_VNNI)
m256_add_dpbusd_epi32(sum0, in, row0[j]); m256_add_dpbusd_epi32(sum0, in, row0[j]);
#else
sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
#endif
} }
output[0] = m256_hadd(sum0, biases_[0]); output[0] = m256_hadd(sum0, biases_[0]);
@@ -493,24 +554,40 @@ namespace Eval::NNUE::Layers {
const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]); const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]);
__m128i* outptr = reinterpret_cast<__m128i*>(&output[i]); __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]);
__m256i sum0 = _mm256_setzero_si256();
__m256i sum1 = _mm256_setzero_si256();
__m256i sum2 = _mm256_setzero_si256();
__m256i sum3 = _mm256_setzero_si256();
const auto row0 = reinterpret_cast<const __m256i*>(&weights_[offset0]); const auto row0 = reinterpret_cast<const __m256i*>(&weights_[offset0]);
const auto row1 = reinterpret_cast<const __m256i*>(&weights_[offset1]); const auto row1 = reinterpret_cast<const __m256i*>(&weights_[offset1]);
const auto row2 = reinterpret_cast<const __m256i*>(&weights_[offset2]); const auto row2 = reinterpret_cast<const __m256i*>(&weights_[offset2]);
const auto row3 = reinterpret_cast<const __m256i*>(&weights_[offset3]); const auto row3 = reinterpret_cast<const __m256i*>(&weights_[offset3]);
for (IndexType j = 0; j < kNumChunks; ++j) #if defined (USE_VNNI)
__m256i sum0 = _mm256_setzero_si256();
__m256i sum1 = _mm256_setzero_si256();
__m256i sum2 = _mm256_setzero_si256();
__m256i sum3 = _mm256_setzero_si256();
const IndexType kStart = 0;
#else
__m256i sum0 = m256_dpbusd_epi32(input_vector[0], row0[0]);
__m256i sum1 = m256_dpbusd_epi32(input_vector[0], row1[0]);
__m256i sum2 = m256_dpbusd_epi32(input_vector[0], row2[0]);
__m256i sum3 = m256_dpbusd_epi32(input_vector[0], row3[0]);
const IndexType kStart = 1;
#endif
for (IndexType j = kStart; j < kNumChunks; ++j)
{ {
const __m256i in = input_vector[j]; const __m256i in = input_vector[j];
#if defined (USE_VNNI)
m256_add_dpbusd_epi32(sum0, in, row0[j]); m256_add_dpbusd_epi32(sum0, in, row0[j]);
m256_add_dpbusd_epi32(sum1, in, row1[j]); m256_add_dpbusd_epi32(sum1, in, row1[j]);
m256_add_dpbusd_epi32(sum2, in, row2[j]); m256_add_dpbusd_epi32(sum2, in, row2[j]);
m256_add_dpbusd_epi32(sum3, in, row3[j]); m256_add_dpbusd_epi32(sum3, in, row3[j]);
#else
sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
sum1 = _mm256_add_epi32(sum1, m256_dpbusd_epi32(in, row1[j]));
sum2 = _mm256_add_epi32(sum2, m256_dpbusd_epi32(in, row2[j]));
sum3 = _mm256_add_epi32(sum3, m256_dpbusd_epi32(in, row3[j]));
#endif
} }
*outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias); *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias);
@@ -518,15 +595,25 @@ namespace Eval::NNUE::Layers {
} }
else if constexpr (kOutputDimensions == 1) else if constexpr (kOutputDimensions == 1)
{ {
__m256i sum0 = _mm256_setzero_si256();
const auto row0 = reinterpret_cast<const __m256i*>(&weights_[0]); const auto row0 = reinterpret_cast<const __m256i*>(&weights_[0]);
for (IndexType j = 0; j < kNumChunks; ++j) #if defined (USE_VNNI)
__m256i sum0 = _mm256_setzero_si256();
const IndexType kStart = 0;
#else
__m256i sum0 = m256_dpbusd_epi32(input_vector[0], row0[0]);
const IndexType kStart = 1;
#endif
for (IndexType j = kStart; j < kNumChunks; ++j)
{ {
const __m256i in = input_vector[j]; const __m256i in = input_vector[j];
m256_add_dpbusd_epi32(sum0, in, row0[j]); #if defined (USE_VNNI)
m256_add_dpbusd_epi32(sum0, in, row0[j]);
#else
sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
#endif
} }
output[0] = m256_hadd(sum0, biases_[0]); output[0] = m256_hadd(sum0, biases_[0]);
@@ -559,24 +646,24 @@ namespace Eval::NNUE::Layers {
const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]); const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]);
__m128i* outptr = reinterpret_cast<__m128i*>(&output[i]); __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]);
__m128i sum0 = _mm_setzero_si128();
__m128i sum1 = _mm_setzero_si128();
__m128i sum2 = _mm_setzero_si128();
__m128i sum3 = _mm_setzero_si128();
const auto row0 = reinterpret_cast<const __m128i*>(&weights_[offset0]); const auto row0 = reinterpret_cast<const __m128i*>(&weights_[offset0]);
const auto row1 = reinterpret_cast<const __m128i*>(&weights_[offset1]); const auto row1 = reinterpret_cast<const __m128i*>(&weights_[offset1]);
const auto row2 = reinterpret_cast<const __m128i*>(&weights_[offset2]); const auto row2 = reinterpret_cast<const __m128i*>(&weights_[offset2]);
const auto row3 = reinterpret_cast<const __m128i*>(&weights_[offset3]); const auto row3 = reinterpret_cast<const __m128i*>(&weights_[offset3]);
for (int j = 0; j < (int)kNumChunks; j += 1) __m128i sum0 = m128_dpbusd_epi32(input_vector[0], row0[0]);
__m128i sum1 = m128_dpbusd_epi32(input_vector[0], row1[0]);
__m128i sum2 = m128_dpbusd_epi32(input_vector[0], row2[0]);
__m128i sum3 = m128_dpbusd_epi32(input_vector[0], row3[0]);
for (int j = 1; j < (int)kNumChunks; ++j)
{ {
const __m128i in = input_vector[j]; const __m128i in = input_vector[j];
m128_add_dpbusd_epi32(sum0, in, row0[j]); sum0 = _mm_add_epi32(sum0, m128_dpbusd_epi32(in, row0[j]));
m128_add_dpbusd_epi32(sum1, in, row1[j]); sum1 = _mm_add_epi32(sum1, m128_dpbusd_epi32(in, row1[j]));
m128_add_dpbusd_epi32(sum2, in, row2[j]); sum2 = _mm_add_epi32(sum2, m128_dpbusd_epi32(in, row2[j]));
m128_add_dpbusd_epi32(sum3, in, row3[j]); sum3 = _mm_add_epi32(sum3, m128_dpbusd_epi32(in, row3[j]));
} }
*outptr = m128_haddx4(sum0, sum1, sum2, sum3, bias); *outptr = m128_haddx4(sum0, sum1, sum2, sum3, bias);
@@ -584,16 +671,12 @@ namespace Eval::NNUE::Layers {
} }
else if constexpr (kOutputDimensions == 1) else if constexpr (kOutputDimensions == 1)
{ {
__m128i sum0 = _mm_setzero_si128();
const auto row0 = reinterpret_cast<const __m128i*>(&weights_[0]); const auto row0 = reinterpret_cast<const __m128i*>(&weights_[0]);
for (int j = 0; j < (int)kNumChunks; j += 1) __m128i sum0 = m128_dpbusd_epi32(input_vector[0], row0[0]);
{
const __m128i in = input_vector[j];
m128_add_dpbusd_epi32(sum0, in, row0[j]); for (int j = 1; j < (int)kNumChunks; ++j)
} sum0 = _mm_add_epi32(sum0, m128_dpbusd_epi32(input_vector[j], row0[j]));
output[0] = m128_hadd(sum0, biases_[0]); output[0] = m128_hadd(sum0, biases_[0]);
} }
+19 -18
View File
@@ -1,34 +1,35 @@
/* /*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1 Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file) Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or the Free Software Foundation, either version 3 of the License, or
(at your option) any later version. (at your option) any later version.
Stockfish is distributed in the hope that it will be useful, Stockfish is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details. GNU General Public License for more details.
You should have received a copy of the GNU General Public License You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
// Class for difference calculation of NNUE evaluation function
#ifndef NNUE_ACCUMULATOR_H_INCLUDED #ifndef NNUE_ACCUMULATOR_H_INCLUDED
#define NNUE_ACCUMULATOR_H_INCLUDED #define NNUE_ACCUMULATOR_H_INCLUDED
#include "nnue_architecture.h" #include "nnue_architecture.h"
// Class for difference calculation of NNUE evaluation function
namespace Eval::NNUE { namespace Eval::NNUE {
// Class that holds the result of affine transformation of input features // Class that holds the result of affine transformation of input features
struct alignas(kCacheLineSize) Accumulator { struct alignas(kCacheLineSize) Accumulator {
std::int16_t accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions]; std::int16_t accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];
bool computed_accumulation; bool computed_accumulation;
}; };
} // namespace Eval::NNUE } // namespace Eval::NNUE
+19 -18
View File
@@ -1,36 +1,37 @@
/* /*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1 Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file) Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or the Free Software Foundation, either version 3 of the License, or
(at your option) any later version. (at your option) any later version.
Stockfish is distributed in the hope that it will be useful, Stockfish is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details. GNU General Public License for more details.
You should have received a copy of the GNU General Public License You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
// Input features and network structure used in NNUE evaluation function
#ifndef NNUE_ARCHITECTURE_H_INCLUDED #ifndef NNUE_ARCHITECTURE_H_INCLUDED
#define NNUE_ARCHITECTURE_H_INCLUDED #define NNUE_ARCHITECTURE_H_INCLUDED
// Defines the network structure // Defines the network structure
#include "architectures/halfkp_256x2-32-32.h" #include "architectures/halfkp_256x2-32-32.h"
// Input features and network structure used in NNUE evaluation function
namespace Eval::NNUE { namespace Eval::NNUE {
static_assert(kTransformedFeatureDimensions % kMaxSimdWidth == 0, ""); static_assert(kTransformedFeatureDimensions % kMaxSimdWidth == 0, "");
static_assert(Network::kOutputDimensions == 1, ""); static_assert(Network::kOutputDimensions == 1, "");
static_assert(std::is_same<Network::OutputType, std::int32_t>::value, ""); static_assert(std::is_same<Network::OutputType, std::int32_t>::value, "");
// Trigger for full calculation instead of difference calculation // Trigger for full calculation instead of difference calculation
constexpr auto kRefreshTriggers = RawFeatures::kRefreshTriggers; constexpr auto kRefreshTriggers = RawFeatures::kRefreshTriggers;
} // namespace Eval::NNUE } // namespace Eval::NNUE
+466 -437
View File
@@ -1,19 +1,19 @@
/* /*
Stockfish, a UCI chess playing engine derived from Glaurung 2.1 Stockfish, a UCI chess playing engine derived from Glaurung 2.1
Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file) Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
Stockfish is free software: you can redistribute it and/or modify Stockfish is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or the Free Software Foundation, either version 3 of the License, or
(at your option) any later version. (at your option) any later version.
Stockfish is distributed in the hope that it will be useful, Stockfish is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details. GNU General Public License for more details.
You should have received a copy of the GNU General Public License You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
// A class that converts the input features of the NNUE evaluation function // A class that converts the input features of the NNUE evaluation function
@@ -23,7 +23,6 @@
#include "nnue_common.h" #include "nnue_common.h"
#include "nnue_architecture.h" #include "nnue_architecture.h"
#include "features/index_list.h" #include "features/index_list.h"
#include <cstring> #include <cstring>
@@ -31,456 +30,486 @@
namespace Eval::NNUE { namespace Eval::NNUE {
// If vector instructions are enabled, we update and refresh the // If vector instructions are enabled, we update and refresh the
// accumulator tile by tile such that each tile fits in the CPU's // accumulator tile by tile such that each tile fits in the CPU's
// vector registers. // vector registers.
#define TILING #define VECTOR
#ifdef USE_AVX512 #ifdef USE_AVX512
typedef __m512i vec_t; typedef __m512i vec_t;
#define vec_load(a) _mm512_load_si512(a) #define vec_load(a) _mm512_load_si512(a)
#define vec_store(a,b) _mm512_store_si512(a,b) #define vec_store(a,b) _mm512_store_si512(a,b)
#define vec_add_16(a,b) _mm512_add_epi16(a,b) #define vec_add_16(a,b) _mm512_add_epi16(a,b)
#define vec_sub_16(a,b) _mm512_sub_epi16(a,b) #define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
#define vec_zero _mm512_setzero_si512() #define vec_zero _mm512_setzero_si512()
static constexpr IndexType kNumRegs = 8; // only 8 are needed static constexpr IndexType kNumRegs = 8; // only 8 are needed
#elif USE_AVX2 #elif USE_AVX2
typedef __m256i vec_t; typedef __m256i vec_t;
#define vec_load(a) _mm256_load_si256(a) #define vec_load(a) _mm256_load_si256(a)
#define vec_store(a,b) _mm256_store_si256(a,b) #define vec_store(a,b) _mm256_store_si256(a,b)
#define vec_add_16(a,b) _mm256_add_epi16(a,b) #define vec_add_16(a,b) _mm256_add_epi16(a,b)
#define vec_sub_16(a,b) _mm256_sub_epi16(a,b) #define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
#define vec_zero _mm256_setzero_si256() #define vec_zero _mm256_setzero_si256()
static constexpr IndexType kNumRegs = 16;
#elif USE_SSE2
typedef __m128i vec_t;
#define vec_load(a) (*(a))
#define vec_store(a,b) *(a)=(b)
#define vec_add_16(a,b) _mm_add_epi16(a,b)
#define vec_sub_16(a,b) _mm_sub_epi16(a,b)
#define vec_zero _mm_setzero_si128()
static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8;
#elif USE_MMX
typedef __m64 vec_t;
#define vec_load(a) (*(a))
#define vec_store(a,b) *(a)=(b)
#define vec_add_16(a,b) _mm_add_pi16(a,b)
#define vec_sub_16(a,b) _mm_sub_pi16(a,b)
#define vec_zero _mm_setzero_si64()
static constexpr IndexType kNumRegs = 8;
#elif USE_NEON
typedef int16x8_t vec_t;
#define vec_load(a) (*(a))
#define vec_store(a,b) *(a)=(b)
#define vec_add_16(a,b) vaddq_s16(a,b)
#define vec_sub_16(a,b) vsubq_s16(a,b)
#define vec_zero {0}
static constexpr IndexType kNumRegs = 16; static constexpr IndexType kNumRegs = 16;
#elif USE_SSE2
typedef __m128i vec_t;
#define vec_load(a) (*(a))
#define vec_store(a,b) *(a)=(b)
#define vec_add_16(a,b) _mm_add_epi16(a,b)
#define vec_sub_16(a,b) _mm_sub_epi16(a,b)
#define vec_zero _mm_setzero_si128()
static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8;
#elif USE_MMX
typedef __m64 vec_t;
#define vec_load(a) (*(a))
#define vec_store(a,b) *(a)=(b)
#define vec_add_16(a,b) _mm_add_pi16(a,b)
#define vec_sub_16(a,b) _mm_sub_pi16(a,b)
#define vec_zero _mm_setzero_si64()
static constexpr IndexType kNumRegs = 8;
#elif USE_NEON
typedef int16x8_t vec_t;
#define vec_load(a) (*(a))
#define vec_store(a,b) *(a)=(b)
#define vec_add_16(a,b) vaddq_s16(a,b)
#define vec_sub_16(a,b) vsubq_s16(a,b)
#define vec_zero {0}
static constexpr IndexType kNumRegs = 16;
#else
#undef VECTOR
#endif
// Input feature converter
class FeatureTransformer {
private:
// Number of output dimensions for one side
static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;
#ifdef VECTOR
static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2;
static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions");
#endif
public:
// Output type
using OutputType = TransformedFeatureType;
// Number of input/output dimensions
static constexpr IndexType kInputDimensions = RawFeatures::kDimensions;
static constexpr IndexType kOutputDimensions = kHalfDimensions * 2;
// Size of forward propagation buffer
static constexpr std::size_t kBufferSize =
kOutputDimensions * sizeof(OutputType);
static constexpr int kLayerIndex = 0;
// Hash value stored in the evaluation file for this layer: the total output
// width (both halves) XOR-ed with the hash of the raw feature set.
static constexpr std::uint32_t GetHashValue() {
  return kOutputDimensions ^ RawFeatures::kHashValue;
}
// Name of this layer, formatted as
// "<raw feature name>[<input dimensions>-><half dimensions>x2]".
static std::string get_name() {
  std::string name = RawFeatures::get_name();
  name += "[";
  name += std::to_string(kInputDimensions);
  name += "->";
  name += std::to_string(kHalfDimensions);
  name += "x2]";
  return name;
}
// String describing the structure of this layer; it is exactly the layer
// name returned by get_name().
static std::string get_structure_string() {
  std::string structure = get_name();
  return structure;
}
// One-line summary of this layer: " - <layer index> - <layer name>".
static std::string get_layers_info() {
  return " - " + std::to_string(kLayerIndex) + " - " + get_name();
}
// Read network parameters
bool ReadParameters(std::istream& stream) {
for (std::size_t i = 0; i < kHalfDimensions; ++i)
biases_[i] = read_little_endian<BiasType>(stream);
for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i)
weights_[i] = read_little_endian<WeightType>(stream);
return !stream.fail();
}
// Dump the network parameters to `stream`: the bias array followed by the
// weight array, each written as one raw block of bytes.
// Returns false if the stream is in a failed state afterwards.
// NOTE(review): the arrays are written in host byte order, whereas
// ReadParameters() decodes little-endian values - confirm this is only
// ever used on little-endian hosts.
bool WriteParameters(std::ostream& stream) const {
  const auto* bias_bytes = reinterpret_cast<const char*>(biases_);
  const auto bias_size = kHalfDimensions * sizeof(BiasType);

  const auto* weight_bytes = reinterpret_cast<const char*>(weights_);
  const auto weight_size =
      kHalfDimensions * kInputDimensions * sizeof(WeightType);

  stream.write(bias_bytes, bias_size);
  stream.write(weight_bytes, weight_size);
  return !stream.fail();
}
// Make sure the accumulator of the current state is available without a
// full refresh. Returns true when it was already computed, or when the
// previous state's accumulator was computed and the difference calculation
// has been run from it; returns false otherwise.
bool update_accumulator_if_possible(const Position& pos) const {
  const auto current = pos.state();

  if (!current->accumulator.computed_accumulation) {
    const auto parent = current->previous;
    if (!parent || !parent->accumulator.computed_accumulation)
      return false;

    update_accumulator(pos);
  }

  return true;
}
// Convert input features
//
// Brings the accumulator of the current position up to date (difference
// calculation when possible, full refresh otherwise) and then writes the
// transformed features to `output`. For each of the two perspectives the
// accumulation rows of all refresh triggers are summed element-wise and the
// 16-bit sums are narrowed to OutputType values that are clamped at zero
// from below. `output` receives kHalfDimensions values for the side to move
// followed by kHalfDimensions values for the other side.
// NOTE(review): the SIMD paths use aligned loads/stores on `output` and the
// accumulation buffers - presumably the callers guarantee the required
// alignment; confirm.
void Transform(const Position& pos, OutputType* output) const {
// Reuse or incrementally update the accumulator; fall back to a full
// recomputation when that is not possible.
if (!update_accumulator_if_possible(pos))
refresh_accumulator(pos);
const auto& accumulation = pos.state()->accumulator.accumulation;
// Per-instruction-set constants: the number of output vectors per
// perspective (kNumChunks) and the helper values used for clamping and
// for reordering after the pack instructions.
#if defined(USE_AVX512)
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth * 2);
static_assert(kHalfDimensions % (kSimdWidth * 2) == 0);
const __m512i kControl = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
const __m512i kZero = _mm512_setzero_si512();
#elif defined(USE_AVX2)
constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
constexpr int kControl = 0b11011000;
const __m256i kZero = _mm256_setzero_si256();
#elif defined(USE_SSE2)
constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
#ifdef USE_SSE41
const __m128i kZero = _mm_setzero_si128();
#else
const __m128i k0x80s = _mm_set1_epi8(-128);
#endif
#elif defined(USE_MMX)
constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
const __m64 k0x80s = _mm_set1_pi8(-128);
#elif defined(USE_NEON)
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
const int8x8_t kZero = {0};
#endif
// Index 0 is the side to move, index 1 the opposite side.
const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
for (IndexType p = 0; p < 2; ++p) {
// Each perspective fills its own half of the output buffer.
const IndexType offset = kHalfDimensions * p;
#if defined(USE_AVX512)
// Two 16-bit input vectors are summed over all refresh triggers, packed
// into one 8-bit vector with signed saturation and clamped at zero.
// The 64-bit permutation reorders the packed result; presumably it undoes
// the per-lane interleaving of the pack instruction.
auto out = reinterpret_cast<__m512i*>(&output[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) {
__m512i sum0 = _mm512_load_si512(
&reinterpret_cast<const __m512i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
__m512i sum1 = _mm512_load_si512(
&reinterpret_cast<const __m512i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
sum0 = _mm512_add_epi16(sum0, reinterpret_cast<const __m512i*>(
accumulation[perspectives[p]][i])[j * 2 + 0]);
sum1 = _mm512_add_epi16(sum1, reinterpret_cast<const __m512i*>(
accumulation[perspectives[p]][i])[j * 2 + 1]);
}
_mm512_store_si512(&out[j], _mm512_permutexvar_epi64(kControl,
_mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), kZero)));
}
#elif defined(USE_AVX2)
// Same scheme as the AVX-512 path, using 256-bit vectors.
auto out = reinterpret_cast<__m256i*>(&output[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) {
__m256i sum0 = _mm256_load_si256(
&reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
__m256i sum1 = _mm256_load_si256(
&reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
sum0 = _mm256_add_epi16(sum0, reinterpret_cast<const __m256i*>(
accumulation[perspectives[p]][i])[j * 2 + 0]);
sum1 = _mm256_add_epi16(sum1, reinterpret_cast<const __m256i*>(
accumulation[perspectives[p]][i])[j * 2 + 1]);
}
_mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
_mm256_packs_epi16(sum0, sum1), kZero), kControl));
}
#elif defined(USE_SSE2)
// 128-bit path; no reordering is applied after the pack here.
auto out = reinterpret_cast<__m128i*>(&output[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) {
__m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
accumulation[perspectives[p]][0])[j * 2 + 0]);
__m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
accumulation[perspectives[p]][0])[j * 2 + 1]);
for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
sum0 = _mm_add_epi16(sum0, reinterpret_cast<const __m128i*>(
accumulation[perspectives[p]][i])[j * 2 + 0]);
sum1 = _mm_add_epi16(sum1, reinterpret_cast<const __m128i*>(
accumulation[perspectives[p]][i])[j * 2 + 1]);
}
const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
// With SSE4.1 the clamp at zero is a signed 8-bit max. Otherwise it is
// emulated: a saturating add of -128 followed by a saturating subtract
// of -128 maps negative bytes to 0 and leaves the others unchanged.
_mm_store_si128(&out[j],
#ifdef USE_SSE41
_mm_max_epi8(packedbytes, kZero)
#else
_mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
#endif
);
}
#elif defined(USE_MMX)
// 64-bit MMX path; the clamp at zero uses the same saturating add/subtract
// emulation as the SSE2 path without SSE4.1.
auto out = reinterpret_cast<__m64*>(&output[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) {
__m64 sum0 = *(&reinterpret_cast<const __m64*>(
accumulation[perspectives[p]][0])[j * 2 + 0]);
__m64 sum1 = *(&reinterpret_cast<const __m64*>(
accumulation[perspectives[p]][0])[j * 2 + 1]);
for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
sum0 = _mm_add_pi16(sum0, reinterpret_cast<const __m64*>(
accumulation[perspectives[p]][i])[j * 2 + 0]);
sum1 = _mm_add_pi16(sum1, reinterpret_cast<const __m64*>(
accumulation[perspectives[p]][i])[j * 2 + 1]);
}
const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
}
#elif defined(USE_NEON)
// NEON path: one 16-bit vector per chunk is summed over all refresh
// triggers, narrowed to 8 bits with saturation and clamped at zero.
const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) {
int16x8_t sum = reinterpret_cast<const int16x8_t*>(
accumulation[perspectives[p]][0])[j];
for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
sum = vaddq_s16(sum, reinterpret_cast<const int16x8_t*>(
accumulation[perspectives[p]][i])[j]);
}
out[j] = vmax_s8(vqmovn_s16(sum), kZero);
}
#else
// Portable scalar path: sum over all refresh triggers and clamp the
// result to the range [0, 127].
for (IndexType j = 0; j < kHalfDimensions; ++j) {
BiasType sum = accumulation[static_cast<int>(perspectives[p])][0][j];
for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
sum += accumulation[static_cast<int>(perspectives[p])][i][j];
}
output[offset + j] = static_cast<OutputType>(
std::max<int>(0, std::min<int>(127, sum)));
}
#endif
}
// After using MMX instructions, empty the MMX state.
#if defined(USE_MMX)
_mm_empty();
#endif
}
private:
// Calculate cumulative value without using difference calculation
void refresh_accumulator(const Position& pos) const {
#ifdef VECTOR
// Gcc-10.2 unnecessarily spills AVX2 registers if this array
// is defined in the VECTOR code below, once in each branch
vec_t acc[kNumRegs];
#endif
auto& accumulator = pos.state()->accumulator;
for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
Features::IndexList active_indices[2];
RawFeatures::append_active_indices(pos, kRefreshTriggers[i],
active_indices);
for (Color perspective : { WHITE, BLACK }) {
#ifdef VECTOR
for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
auto accTile = reinterpret_cast<vec_t*>(
&accumulator.accumulation[perspective][i][j * kTileHeight]);
if (i == 0) {
auto biasesTile = reinterpret_cast<const vec_t*>(
&biases_[j * kTileHeight]);
for (IndexType k = 0; k < kNumRegs; ++k)
acc[k] = biasesTile[k];
} else {
for (IndexType k = 0; k < kNumRegs; ++k)
acc[k] = vec_zero;
}
for (const auto index : active_indices[perspective]) {
const IndexType offset = kHalfDimensions * index + j * kTileHeight;
auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
for (IndexType k = 0; k < kNumRegs; ++k)
acc[k] = vec_add_16(acc[k], column[k]);
}
for (IndexType k = 0; k < kNumRegs; k++)
vec_store(&accTile[k], acc[k]);
}
#else #else
#undef TILING if (i == 0) {
std::memcpy(accumulator.accumulation[perspective][i], biases_,
#endif kHalfDimensions * sizeof(BiasType));
} else {
// Input feature converter std::memset(accumulator.accumulation[perspective][i], 0,
class FeatureTransformer { kHalfDimensions * sizeof(BiasType));
private:
// Number of output dimensions for one side
static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;
#ifdef TILING
static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2;
static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions");
#endif
public:
// Output type
using OutputType = TransformedFeatureType;
// Number of input/output dimensions
static constexpr IndexType kInputDimensions = RawFeatures::kDimensions;
static constexpr IndexType kOutputDimensions = kHalfDimensions * 2;
// Size of forward propagation buffer
static constexpr std::size_t kBufferSize =
kOutputDimensions * sizeof(OutputType);
static constexpr int kLayerIndex = 0;
// Hash value embedded in the evaluation file
static constexpr std::uint32_t GetHashValue() {
return RawFeatures::kHashValue ^ kOutputDimensions;
}
static std::string get_name() {
return RawFeatures::get_name() + "[" +
std::to_string(kInputDimensions) + "->" +
std::to_string(kHalfDimensions) + "x2]";
}
// a string representing the structure
static std::string get_structure_string() {
return get_name();
}
static std::string get_layers_info() {
std::string info = " - ";
info += std::to_string(kLayerIndex);
info += " - ";
info += get_name();
return info;
}
// Read network parameters
bool ReadParameters(std::istream& stream) {
for (std::size_t i = 0; i < kHalfDimensions; ++i)
biases_[i] = read_little_endian<BiasType>(stream);
for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i)
weights_[i] = read_little_endian<WeightType>(stream);
return !stream.fail();
}
// write parameters
bool WriteParameters(std::ostream& stream) const {
stream.write(reinterpret_cast<const char*>(biases_),
kHalfDimensions * sizeof(BiasType));
stream.write(reinterpret_cast<const char*>(weights_),
kHalfDimensions * kInputDimensions * sizeof(WeightType));
return !stream.fail();
}
// Proceed with the difference calculation if possible
bool update_accumulator_if_possible(const Position& pos) const {
const auto now = pos.state();
if (now->accumulator.computed_accumulation)
return true;
const auto prev = now->previous;
if (prev && prev->accumulator.computed_accumulation) {
update_accumulator(pos);
return true;
} }
return false; for (const auto index : active_indices[perspective]) {
} const IndexType offset = kHalfDimensions * index;
// Convert input features
void Transform(const Position& pos, OutputType* output) const {
if (!update_accumulator_if_possible(pos))
refresh_accumulator(pos);
const auto& accumulation = pos.state()->accumulator.accumulation;
#if defined(USE_AVX2)
constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
constexpr int kControl = 0b11011000;
const __m256i kZero = _mm256_setzero_si256();
#elif defined(USE_SSE2)
constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
#ifdef USE_SSE41
const __m128i kZero = _mm_setzero_si128();
#else
const __m128i k0x80s = _mm_set1_epi8(-128);
#endif
#elif defined(USE_MMX)
constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
const __m64 k0x80s = _mm_set1_pi8(-128);
#elif defined(USE_NEON)
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
const int8x8_t kZero = {0};
#endif
const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
for (IndexType p = 0; p < 2; ++p) {
const IndexType offset = kHalfDimensions * p;
#if defined(USE_AVX2)
auto out = reinterpret_cast<__m256i*>(&output[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) {
__m256i sum0 = _mm256_load_si256(
&reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
__m256i sum1 = _mm256_load_si256(
&reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
sum0 = _mm256_add_epi16(sum0, reinterpret_cast<const __m256i*>(
accumulation[perspectives[p]][i])[j * 2 + 0]);
sum1 = _mm256_add_epi16(sum1, reinterpret_cast<const __m256i*>(
accumulation[perspectives[p]][i])[j * 2 + 1]);
}
_mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
_mm256_packs_epi16(sum0, sum1), kZero), kControl));
}
#elif defined(USE_SSE2)
auto out = reinterpret_cast<__m128i*>(&output[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) {
__m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
accumulation[perspectives[p]][0])[j * 2 + 0]);
__m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
accumulation[perspectives[p]][0])[j * 2 + 1]);
for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
sum0 = _mm_add_epi16(sum0, reinterpret_cast<const __m128i*>(
accumulation[perspectives[p]][i])[j * 2 + 0]);
sum1 = _mm_add_epi16(sum1, reinterpret_cast<const __m128i*>(
accumulation[perspectives[p]][i])[j * 2 + 1]);
}
const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
_mm_store_si128(&out[j],
#ifdef USE_SSE41
_mm_max_epi8(packedbytes, kZero)
#else
_mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
#endif
);
}
#elif defined(USE_MMX)
auto out = reinterpret_cast<__m64*>(&output[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) {
__m64 sum0 = *(&reinterpret_cast<const __m64*>(
accumulation[perspectives[p]][0])[j * 2 + 0]);
__m64 sum1 = *(&reinterpret_cast<const __m64*>(
accumulation[perspectives[p]][0])[j * 2 + 1]);
for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
sum0 = _mm_add_pi16(sum0, reinterpret_cast<const __m64*>(
accumulation[perspectives[p]][i])[j * 2 + 0]);
sum1 = _mm_add_pi16(sum1, reinterpret_cast<const __m64*>(
accumulation[perspectives[p]][i])[j * 2 + 1]);
}
const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
}
#elif defined(USE_NEON)
const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) {
int16x8_t sum = reinterpret_cast<const int16x8_t*>(
accumulation[perspectives[p]][0])[j];
for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
sum = vaddq_s16(sum, reinterpret_cast<const int16x8_t*>(
accumulation[perspectives[p]][i])[j]);
}
out[j] = vmax_s8(vqmovn_s16(sum), kZero);
}
#else
for (IndexType j = 0; j < kHalfDimensions; ++j) {
BiasType sum = accumulation[static_cast<int>(perspectives[p])][0][j];
for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
sum += accumulation[static_cast<int>(perspectives[p])][i][j];
}
output[offset + j] = static_cast<OutputType>(
std::max<int>(0, std::min<int>(127, sum)));
}
#endif
for (IndexType j = 0; j < kHalfDimensions; ++j)
accumulator.accumulation[perspective][i][j] += weights_[offset + j];
} }
#if defined(USE_MMX)
_mm_empty();
#endif #endif
}
} }
private:
// Calculate cumulative value without using difference calculation
void refresh_accumulator(const Position& pos) const {
auto& accumulator = pos.state()->accumulator;
for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
Features::IndexList active_indices[2];
RawFeatures::append_active_indices(pos, kRefreshTriggers[i],
active_indices);
for (Color perspective : { WHITE, BLACK }) {
#ifdef TILING
for (unsigned j = 0; j < kHalfDimensions / kTileHeight; ++j) {
auto accTile = reinterpret_cast<vec_t*>(
&accumulator.accumulation[perspective][i][j * kTileHeight]);
vec_t acc[kNumRegs];
if (i == 0) {
auto biasesTile = reinterpret_cast<const vec_t*>(
&biases_[j * kTileHeight]);
for (unsigned k = 0; k < kNumRegs; ++k)
acc[k] = biasesTile[k];
} else {
for (unsigned k = 0; k < kNumRegs; ++k)
acc[k] = vec_zero;
}
for (const auto index : active_indices[perspective]) {
const IndexType offset = kHalfDimensions * index + j * kTileHeight;
auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
for (unsigned k = 0; k < kNumRegs; ++k)
acc[k] = vec_add_16(acc[k], column[k]);
}
for (unsigned k = 0; k < kNumRegs; k++)
vec_store(&accTile[k], acc[k]);
}
#else
if (i == 0) {
std::memcpy(accumulator.accumulation[perspective][i], biases_,
kHalfDimensions * sizeof(BiasType));
} else {
std::memset(accumulator.accumulation[perspective][i], 0,
kHalfDimensions * sizeof(BiasType));
}
for (const auto index : active_indices[perspective]) {
const IndexType offset = kHalfDimensions * index;
for (IndexType j = 0; j < kHalfDimensions; ++j)
accumulator.accumulation[perspective][i][j] += weights_[offset + j];
}
#endif
}
}
#if defined(USE_MMX) #if defined(USE_MMX)
_mm_empty(); _mm_empty();
#endif #endif
accumulator.computed_accumulation = true; accumulator.computed_accumulation = true;
}
// Calculate cumulative value using difference calculation
void update_accumulator(const Position& pos) const {
#ifdef VECTOR
// Gcc-10.2 unnecessarily spills AVX2 registers if this array
// is defined in the VECTOR code below, once in each branch
vec_t acc[kNumRegs];
#endif
const auto& prev_accumulator = pos.state()->previous->accumulator;
auto& accumulator = pos.state()->accumulator;
for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
Features::IndexList removed_indices[2], added_indices[2];
bool reset[2] = { false, false };
RawFeatures::append_changed_indices(pos, kRefreshTriggers[i],
removed_indices, added_indices, reset);
#ifdef VECTOR
for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
for (Color perspective : { WHITE, BLACK }) {
auto accTile = reinterpret_cast<vec_t*>(
&accumulator.accumulation[perspective][i][j * kTileHeight]);
if (reset[perspective]) {
if (i == 0) {
auto biasesTile = reinterpret_cast<const vec_t*>(
&biases_[j * kTileHeight]);
for (IndexType k = 0; k < kNumRegs; ++k)
acc[k] = biasesTile[k];
} else {
for (IndexType k = 0; k < kNumRegs; ++k)
acc[k] = vec_zero;
}
} else {
auto prevAccTile = reinterpret_cast<const vec_t*>(
&prev_accumulator.accumulation[perspective][i][j * kTileHeight]);
for (IndexType k = 0; k < kNumRegs; ++k)
acc[k] = vec_load(&prevAccTile[k]);
// Difference calculation for the deactivated features
for (const auto index : removed_indices[perspective]) {
const IndexType offset = kHalfDimensions * index + j * kTileHeight;
auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
for (IndexType k = 0; k < kNumRegs; ++k)
acc[k] = vec_sub_16(acc[k], column[k]);
}
}
{ // Difference calculation for the activated features
for (const auto index : added_indices[perspective]) {
const IndexType offset = kHalfDimensions * index + j * kTileHeight;
auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
for (IndexType k = 0; k < kNumRegs; ++k)
acc[k] = vec_add_16(acc[k], column[k]);
}
}
for (IndexType k = 0; k < kNumRegs; ++k)
vec_store(&accTile[k], acc[k]);
} }
}
// Calculate cumulative value using difference calculation
void update_accumulator(const Position& pos) const {
const auto& prev_accumulator = pos.state()->previous->accumulator;
auto& accumulator = pos.state()->accumulator;
for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
Features::IndexList removed_indices[2], added_indices[2];
bool reset[2] = { false, false };
RawFeatures::append_changed_indices(pos, kRefreshTriggers[i],
removed_indices, added_indices, reset);
#ifdef TILING
for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
for (Color perspective : { WHITE, BLACK }) {
auto accTile = reinterpret_cast<vec_t*>(
&accumulator.accumulation[perspective][i][j * kTileHeight]);
vec_t acc[kNumRegs];
if (reset[perspective]) {
if (i == 0) {
auto biasesTile = reinterpret_cast<const vec_t*>(
&biases_[j * kTileHeight]);
for (unsigned k = 0; k < kNumRegs; ++k)
acc[k] = biasesTile[k];
} else {
for (unsigned k = 0; k < kNumRegs; ++k)
acc[k] = vec_zero;
}
} else {
auto prevAccTile = reinterpret_cast<const vec_t*>(
&prev_accumulator.accumulation[perspective][i][j * kTileHeight]);
for (IndexType k = 0; k < kNumRegs; ++k)
acc[k] = vec_load(&prevAccTile[k]);
// Difference calculation for the deactivated features
for (const auto index : removed_indices[perspective]) {
const IndexType offset = kHalfDimensions * index + j * kTileHeight;
auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
for (IndexType k = 0; k < kNumRegs; ++k)
acc[k] = vec_sub_16(acc[k], column[k]);
}
}
{ // Difference calculation for the activated features
for (const auto index : added_indices[perspective]) {
const IndexType offset = kHalfDimensions * index + j * kTileHeight;
auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
for (IndexType k = 0; k < kNumRegs; ++k)
acc[k] = vec_add_16(acc[k], column[k]);
}
}
for (IndexType k = 0; k < kNumRegs; ++k)
vec_store(&accTile[k], acc[k]);
}
}
#if defined(USE_MMX) #if defined(USE_MMX)
_mm_empty(); _mm_empty();
#endif #endif
#else #else
for (Color perspective : { WHITE, BLACK }) { for (Color perspective : { WHITE, BLACK }) {
if (reset[perspective]) { if (reset[perspective]) {
if (i == 0) { if (i == 0) {
std::memcpy(accumulator.accumulation[perspective][i], biases_, std::memcpy(accumulator.accumulation[perspective][i], biases_,
kHalfDimensions * sizeof(BiasType)); kHalfDimensions * sizeof(BiasType));
} else { } else {
std::memset(accumulator.accumulation[perspective][i], 0, std::memset(accumulator.accumulation[perspective][i], 0,
kHalfDimensions * sizeof(BiasType)); kHalfDimensions * sizeof(BiasType));
} }
} else { } else {
std::memcpy(accumulator.accumulation[perspective][i], std::memcpy(accumulator.accumulation[perspective][i],
prev_accumulator.accumulation[perspective][i], prev_accumulator.accumulation[perspective][i],
kHalfDimensions * sizeof(BiasType)); kHalfDimensions * sizeof(BiasType));
// Difference calculation for the deactivated features // Difference calculation for the deactivated features
for (const auto index : removed_indices[perspective]) { for (const auto index : removed_indices[perspective]) {
const IndexType offset = kHalfDimensions * index; const IndexType offset = kHalfDimensions * index;
for (IndexType j = 0; j < kHalfDimensions; ++j) for (IndexType j = 0; j < kHalfDimensions; ++j)
accumulator.accumulation[perspective][i][j] -= weights_[offset + j]; accumulator.accumulation[perspective][i][j] -= weights_[offset + j];
} }
}
{ // Difference calculation for the activated features
for (const auto index : added_indices[perspective]) {
const IndexType offset = kHalfDimensions * index;
for (IndexType j = 0; j < kHalfDimensions; ++j)
accumulator.accumulation[perspective][i][j] += weights_[offset + j];
}
}
}
#endif
}
accumulator.computed_accumulation = true;
} }
{ // Difference calculation for the activated features
for (const auto index : added_indices[perspective]) {
const IndexType offset = kHalfDimensions * index;
using BiasType = std::int16_t; for (IndexType j = 0; j < kHalfDimensions; ++j)
using WeightType = std::int16_t; accumulator.accumulation[perspective][i][j] += weights_[offset + j];
}
}
}
#endif
}
accumulator.computed_accumulation = true;
}
// Make the learning class a friend using BiasType = std::int16_t;
friend class Trainer<FeatureTransformer>; using WeightType = std::int16_t;
alignas(kCacheLineSize) BiasType biases_[kHalfDimensions]; // Make the learning class a friend
alignas(kCacheLineSize) friend class Trainer<FeatureTransformer>;
WeightType weights_[kHalfDimensions * kInputDimensions];
}; alignas(kCacheLineSize) BiasType biases_[kHalfDimensions];
alignas(kCacheLineSize)
WeightType weights_[kHalfDimensions * kInputDimensions];
};
} // namespace Eval::NNUE } // namespace Eval::NNUE
#endif //#ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED #endif // #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
+2 -2
View File
@@ -176,8 +176,8 @@ namespace {
score -= Doubled * doubled score -= Doubled * doubled
+ WeakLever * more_than_one(lever); + WeakLever * more_than_one(lever);
if (blocked && r > RANK_4) if (blocked && r >= RANK_5)
score += BlockedPawn[r-4]; score += BlockedPawn[r - RANK_5];
} }
return score; return score;
+18 -18
View File
@@ -59,7 +59,7 @@ namespace {
// Razor and futility margins // Razor and futility margins
constexpr int RazorMargin = 510; constexpr int RazorMargin = 510;
Value futility_margin(Depth d, bool improving) { Value futility_margin(Depth d, bool improving) {
return Value(223 * (d - improving)); return Value(234 * (d - improving));
} }
// Reductions lookup table, initialized at startup // Reductions lookup table, initialized at startup
@@ -67,7 +67,7 @@ namespace {
Depth reduction(bool i, Depth d, int mn) { Depth reduction(bool i, Depth d, int mn) {
int r = Reductions[d] * Reductions[mn]; int r = Reductions[d] * Reductions[mn];
return (r + 509) / 1024 + (!i && r > 894); return (r + 503) / 1024 + (!i && r > 915);
} }
constexpr int futility_move_count(bool improving, Depth depth) { constexpr int futility_move_count(bool improving, Depth depth) {
@@ -188,7 +188,7 @@ namespace {
void Search::init() { void Search::init() {
for (int i = 1; i < MAX_MOVES; ++i) for (int i = 1; i < MAX_MOVES; ++i)
Reductions[i] = int((22.0 + 2 * std::log(Threads.size())) * std::log(i + 0.25 * std::log(i))); Reductions[i] = int((21.3 + 2 * std::log(Threads.size())) * std::log(i + 0.25 * std::log(i)));
} }
@@ -404,7 +404,7 @@ void Thread::search() {
beta = std::min(prev + delta, VALUE_INFINITE); beta = std::min(prev + delta, VALUE_INFINITE);
// Adjust contempt based on root move's previousScore (dynamic contempt) // Adjust contempt based on root move's previousScore (dynamic contempt)
int dct = ct + (105 - ct / 2) * prev / (abs(prev) + 149); int dct = ct + (113 - ct / 2) * prev / (abs(prev) + 147);
contempt = (us == WHITE ? make_score(dct, dct / 2) contempt = (us == WHITE ? make_score(dct, dct / 2)
: -make_score(dct, dct / 2)); : -make_score(dct, dct / 2));
@@ -824,7 +824,7 @@ namespace {
&& (ss-1)->statScore < 22977 && (ss-1)->statScore < 22977
&& eval >= beta && eval >= beta
&& eval >= ss->staticEval && eval >= ss->staticEval
&& ss->staticEval >= beta - 30 * depth - 28 * improving + 84 * ss->ttPv + 182 && ss->staticEval >= beta - 30 * depth - 28 * improving + 84 * ss->ttPv + 168
&& !excludedMove && !excludedMove
&& pos.non_pawn_material(us) && pos.non_pawn_material(us)
&& (ss->ply >= thisThread->nmpMinPly || us != thisThread->nmpColor)) && (ss->ply >= thisThread->nmpMinPly || us != thisThread->nmpColor))
@@ -832,7 +832,7 @@ namespace {
assert(eval - beta >= 0); assert(eval - beta >= 0);
// Null move dynamic reduction based on depth and value // Null move dynamic reduction based on depth and value
Depth R = (982 + 85 * depth) / 256 + std::min(int(eval - beta) / 192, 3); Depth R = (1015 + 85 * depth) / 256 + std::min(int(eval - beta) / 191, 3);
ss->currentMove = MOVE_NULL; ss->currentMove = MOVE_NULL;
ss->continuationHistory = &thisThread->continuationHistory[0][0][NO_PIECE][0]; ss->continuationHistory = &thisThread->continuationHistory[0][0][NO_PIECE][0];
@@ -849,7 +849,7 @@ namespace {
if (nullValue >= VALUE_TB_WIN_IN_MAX_PLY) if (nullValue >= VALUE_TB_WIN_IN_MAX_PLY)
nullValue = beta; nullValue = beta;
if (thisThread->nmpMinPly || (abs(beta) < VALUE_KNOWN_WIN && depth < 13)) if (thisThread->nmpMinPly || (abs(beta) < VALUE_KNOWN_WIN && depth < 14))
return nullValue; return nullValue;
assert(!thisThread->nmpMinPly); // Recursive verification is not allowed assert(!thisThread->nmpMinPly); // Recursive verification is not allowed
@@ -868,7 +868,7 @@ namespace {
} }
} }
probCutBeta = beta + 176 - 49 * improving; probCutBeta = beta + 183 - 49 * improving;
// Step 10. ProbCut (~10 Elo) // Step 10. ProbCut (~10 Elo)
// If we have a good enough capture and a reduced search returns a value // If we have a good enough capture and a reduced search returns a value
@@ -1036,7 +1036,7 @@ moves_loop: // When in check, search starts from here
// Futility pruning: parent node (~5 Elo) // Futility pruning: parent node (~5 Elo)
if ( lmrDepth < 7 if ( lmrDepth < 7
&& !ss->inCheck && !ss->inCheck
&& ss->staticEval + 283 + 170 * lmrDepth <= alpha && ss->staticEval + 266 + 170 * lmrDepth <= alpha
&& (*contHist[0])[movedPiece][to_sq(move)] && (*contHist[0])[movedPiece][to_sq(move)]
+ (*contHist[1])[movedPiece][to_sq(move)] + (*contHist[1])[movedPiece][to_sq(move)]
+ (*contHist[3])[movedPiece][to_sq(move)] + (*contHist[3])[movedPiece][to_sq(move)]
@@ -1044,7 +1044,7 @@ moves_loop: // When in check, search starts from here
continue; continue;
// Prune moves with negative SEE (~20 Elo) // Prune moves with negative SEE (~20 Elo)
if (!pos.see_ge(move, Value(-(29 - std::min(lmrDepth, 18)) * lmrDepth * lmrDepth))) if (!pos.see_ge(move, Value(-(30 - std::min(lmrDepth, 18)) * lmrDepth * lmrDepth)))
continue; continue;
} }
else else
@@ -1055,8 +1055,8 @@ moves_loop: // When in check, search starts from here
&& captureHistory[movedPiece][to_sq(move)][type_of(pos.piece_on(to_sq(move)))] < 0) && captureHistory[movedPiece][to_sq(move)][type_of(pos.piece_on(to_sq(move)))] < 0)
continue; continue;
// See based pruning // SEE based pruning
if (!pos.see_ge(move, Value(-221) * depth)) // (~25 Elo) if (!pos.see_ge(move, Value(-213) * depth)) // (~25 Elo)
continue; continue;
} }
} }
@@ -1150,12 +1150,12 @@ moves_loop: // When in check, search starts from here
|| moveCountPruning || moveCountPruning
|| ss->staticEval + PieceValue[EG][pos.captured_piece()] <= alpha || ss->staticEval + PieceValue[EG][pos.captured_piece()] <= alpha
|| cutNode || cutNode
|| thisThread->ttHitAverage < 427 * TtHitAverageResolution * TtHitAverageWindow / 1024)) || thisThread->ttHitAverage < 432 * TtHitAverageResolution * TtHitAverageWindow / 1024))
{ {
Depth r = reduction(improving, depth, moveCount); Depth r = reduction(improving, depth, moveCount);
// Decrease reduction if the ttHit running average is large // Decrease reduction if the ttHit running average is large
if (thisThread->ttHitAverage > 509 * TtHitAverageResolution * TtHitAverageWindow / 1024) if (thisThread->ttHitAverage > 537 * TtHitAverageResolution * TtHitAverageWindow / 1024)
r--; r--;
// Increase reduction if other threads are searching this position // Increase reduction if other threads are searching this position
@@ -1208,10 +1208,10 @@ moves_loop: // When in check, search starts from here
- 5287; - 5287;
// Decrease/increase reduction by comparing opponent's stat score (~10 Elo) // Decrease/increase reduction by comparing opponent's stat score (~10 Elo)
if (ss->statScore >= -106 && (ss-1)->statScore < -104) if (ss->statScore >= -105 && (ss-1)->statScore < -103)
r--; r--;
else if ((ss-1)->statScore >= -119 && ss->statScore < -140) else if ((ss-1)->statScore >= -122 && ss->statScore < -129)
r++; r++;
// Decrease/increase reduction for moves with a good/bad history (~30 Elo) // Decrease/increase reduction for moves with a good/bad history (~30 Elo)
@@ -1225,7 +1225,7 @@ moves_loop: // When in check, search starts from here
// Unless giving check, this capture is likely bad // Unless giving check, this capture is likely bad
if ( !givesCheck if ( !givesCheck
&& ss->staticEval + PieceValue[EG][pos.captured_piece()] + 213 * depth <= alpha) && ss->staticEval + PieceValue[EG][pos.captured_piece()] + 210 * depth <= alpha)
r++; r++;
} }
@@ -1499,7 +1499,7 @@ moves_loop: // When in check, search starts from here
if (PvNode && bestValue > alpha) if (PvNode && bestValue > alpha)
alpha = bestValue; alpha = bestValue;
futilityBase = bestValue + 145; futilityBase = bestValue + 155;
} }
const PieceToHistory* contHist[] = { (ss-1)->continuationHistory, (ss-2)->continuationHistory, const PieceToHistory* contHist[] = { (ss-1)->continuationHistory, (ss-2)->continuationHistory,
+2 -2
View File
@@ -204,8 +204,8 @@ enum PieceType {
enum Piece { enum Piece {
NO_PIECE, NO_PIECE,
W_PAWN = 1, W_KNIGHT, W_BISHOP, W_ROOK, W_QUEEN, W_KING, W_PAWN = PAWN, W_KNIGHT, W_BISHOP, W_ROOK, W_QUEEN, W_KING,
B_PAWN = 9, B_KNIGHT, B_BISHOP, B_ROOK, B_QUEEN, B_KING, B_PAWN = PAWN + 8, B_KNIGHT, B_BISHOP, B_ROOK, B_QUEEN, B_KING,
PIECE_NB = 16 PIECE_NB = 16
}; };