Merge remote-tracking branch 'remotes/origin/master' into trainer

This commit is contained in:
noobpwnftw
2020-09-19 02:26:03 +08:00
7 changed files with 196 additions and 147 deletions
+1
View File
@@ -63,6 +63,7 @@ Gary Heckman (gheckman)
George Sobala (gsobala)
gguliash
Gian-Carlo Pascutto (gcp)
Deshawn Mohan-Smith (GoldenRare)
Gontran Lemaire (gonlem)
Goodkov Vasiliy Aleksandrovich (goodkov)
Gregor Cramer
+13 -19
View File
@@ -410,19 +410,6 @@ ifeq ($(COMP),clang)
endif
endif
ifeq ($(comp),icc)
profile_make = icc-profile-make
profile_use = icc-profile-use
else
ifeq ($(comp),clang)
profile_make = clang-profile-make
profile_use = clang-profile-use
else
profile_make = gcc-profile-make
profile_use = gcc-profile-use
endif
endif
ifeq ($(KERNEL),Darwin)
CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.14
LDFLAGS += -arch $(arch) -mmacosx-version-min=10.14
@@ -434,20 +421,30 @@ endif
# We don't know how to make PGO builds with the NDK yet.
ifeq ($(COMP),ndk)
CXXFLAGS += -stdlib=libc++ -fPIE
comp=clang
ifeq ($(arch),armv7)
comp=armv7a-linux-androideabi16-clang
CXX=armv7a-linux-androideabi16-clang++
CXXFLAGS += -mthumb -march=armv7-a -mfloat-abi=softfp -mfpu=neon
STRIP=arm-linux-androideabi-strip
endif
ifeq ($(arch),armv8)
comp=aarch64-linux-android21-clang
CXX=aarch64-linux-android21-clang++
STRIP=aarch64-linux-android-strip
endif
LDFLAGS += -static-libstdc++ -pie -lm -latomic
endif
### Select the profile-guided-optimization targets that match the compiler
### family held in $(comp): icc and clang get their own targets, and any
### other value falls back to the gcc targets.
ifeq ($(comp),icc)
profile_make = icc-profile-make
profile_use = icc-profile-use
else ifeq ($(comp),clang)
profile_make = clang-profile-make
profile_use = clang-profile-use
else
profile_make = gcc-profile-make
profile_use = gcc-profile-use
endif
### Travis CI script uses COMPILER to overwrite CXX
ifdef COMPILER
COMPCXX=$(COMPILER)
@@ -619,10 +616,7 @@ endif
### needs access to the optimization flags.
ifeq ($(optimize),yes)
ifeq ($(debug), no)
ifeq ($(COMP),ndk)
CXXFLAGS += -flto=thin
LDFLAGS += $(CXXFLAGS)
else ifeq ($(comp),clang)
ifeq ($(comp),clang)
CXXFLAGS += -flto=thin
ifneq ($(findstring MINGW,$(KERNEL)),)
CXXFLAGS += -fuse-ld=lld
+125 -108
View File
@@ -29,6 +29,56 @@
namespace Eval::NNUE {
// If vector instructions are enabled, we update and refresh the
// accumulator tile by tile such that each tile fits in the CPU's
// vector registers.
//
// The first enabled instruction set in the chain below defines:
//   vec_t              - the vector register type
//   vec_load/vec_store - read/write one vec_t through a vec_t pointer
//   vec_add_16/sub_16  - the packed 16-bit add/subtract for that set
//   kNumRegs           - how many vec_t values make up one tile
// If none of the instruction sets is enabled, TILING is undefined again.
//
// Every branch tests whether the macro is *defined*, matching the
// `#if defined(USE_...)` checks used elsewhere in this file, so a macro
// defined without a value no longer breaks preprocessing.
#define TILING

#if defined(USE_AVX512)
using vec_t = __m512i;
#define vec_load(a) _mm512_loadA_si512(a)
#define vec_store(a,b) _mm512_storeA_si512(a,b)
#define vec_add_16(a,b) _mm512_add_epi16(a,b)
#define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
static constexpr IndexType kNumRegs = 8; // only 8 are needed

#elif defined(USE_AVX2)
using vec_t = __m256i;
#define vec_load(a) _mm256_loadA_si256(a)
#define vec_store(a,b) _mm256_storeA_si256(a,b)
#define vec_add_16(a,b) _mm256_add_epi16(a,b)
#define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
static constexpr IndexType kNumRegs = 16;

#elif defined(USE_SSE2)
using vec_t = __m128i;
#define vec_load(a) (*(a))
// Parenthesized so the assignment stays one expression wherever it expands.
#define vec_store(a,b) (*(a)=(b))
#define vec_add_16(a,b) _mm_add_epi16(a,b)
#define vec_sub_16(a,b) _mm_sub_epi16(a,b)
static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8;

#elif defined(USE_MMX)
using vec_t = __m64;
#define vec_load(a) (*(a))
#define vec_store(a,b) (*(a)=(b))
#define vec_add_16(a,b) _mm_add_pi16(a,b)
#define vec_sub_16(a,b) _mm_sub_pi16(a,b)
static constexpr IndexType kNumRegs = 8;

#elif defined(USE_NEON)
using vec_t = int16x8_t;
#define vec_load(a) (*(a))
#define vec_store(a,b) (*(a)=(b))
#define vec_add_16(a,b) vaddq_s16(a,b)
#define vec_sub_16(a,b) vsubq_s16(a,b)
static constexpr IndexType kNumRegs = 16;

#else
// No vector instruction set enabled: turn tiling off again.
#undef TILING

#endif
// Input feature converter
class FeatureTransformer {
@@ -36,6 +86,11 @@ namespace Eval::NNUE {
// Number of output dimensions for one side
static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;
#ifdef TILING
// Number of accumulator elements covered by one tile: kNumRegs vector
// registers of sizeof(vec_t) bytes each, divided by 2 (presumably the byte
// size of one 16-bit accumulator element; confirm against BiasType).
static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2;
// The tiled loops run kHalfDimensions / kTileHeight iterations, so the
// division has to be exact.
static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions");
#endif
public:
// Output type
using OutputType = TransformedFeatureType;
@@ -205,57 +260,41 @@ namespace Eval::NNUE {
RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
active_indices);
for (Color perspective : { WHITE, BLACK }) {
#ifdef TILING
for (unsigned j = 0; j < kHalfDimensions / kTileHeight; ++j) {
auto biasesTile = reinterpret_cast<const vec_t*>(
&biases_[j * kTileHeight]);
auto accTile = reinterpret_cast<vec_t*>(
&accumulator.accumulation[perspective][i][j * kTileHeight]);
vec_t acc[kNumRegs];
for (unsigned k = 0; k < kNumRegs; ++k)
acc[k] = biasesTile[k];
for (const auto index : active_indices[perspective]) {
const IndexType offset = kHalfDimensions * index + j * kTileHeight;
auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
for (unsigned k = 0; k < kNumRegs; ++k)
acc[k] = vec_add_16(acc[k], column[k]);
}
for (unsigned k = 0; k < kNumRegs; k++)
vec_store(&accTile[k], acc[k]);
}
#else
std::memcpy(accumulator.accumulation[perspective][i], biases_,
kHalfDimensions * sizeof(BiasType));
kHalfDimensions * sizeof(BiasType));
for (const auto index : active_indices[perspective]) {
const IndexType offset = kHalfDimensions * index;
#if defined(USE_AVX512)
auto accumulation = reinterpret_cast<__m512i*>(
&accumulator.accumulation[perspective][i][0]);
auto column = reinterpret_cast<const __m512i*>(&weights_[offset]);
constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
for (IndexType j = 0; j < kNumChunks; ++j)
_mm512_storeA_si512(&accumulation[j], _mm512_add_epi16(_mm512_loadA_si512(&accumulation[j]), column[j]));
#elif defined(USE_AVX2)
auto accumulation = reinterpret_cast<__m256i*>(
&accumulator.accumulation[perspective][i][0]);
auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
for (IndexType j = 0; j < kNumChunks; ++j)
_mm256_storeA_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadA_si256(&accumulation[j]), column[j]));
#elif defined(USE_SSE2)
auto accumulation = reinterpret_cast<__m128i*>(
&accumulator.accumulation[perspective][i][0]);
auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
for (IndexType j = 0; j < kNumChunks; ++j)
accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
#elif defined(USE_MMX)
auto accumulation = reinterpret_cast<__m64*>(
&accumulator.accumulation[perspective][i][0]);
auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
for (IndexType j = 0; j < kNumChunks; ++j)
accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
#elif defined(USE_NEON)
auto accumulation = reinterpret_cast<int16x8_t*>(
&accumulator.accumulation[perspective][i][0]);
auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
for (IndexType j = 0; j < kNumChunks; ++j)
accumulation[j] = vaddq_s16(accumulation[j], column[j]);
#else
for (IndexType j = 0; j < kHalfDimensions; ++j)
accumulator.accumulation[perspective][i][j] += weights_[offset + j];
#endif
}
#endif
}
#if defined(USE_MMX)
_mm_empty();
#endif
@@ -273,29 +312,55 @@ namespace Eval::NNUE {
bool reset[2];
RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
removed_indices, added_indices, reset);
for (Color perspective : { WHITE, BLACK }) {
#if defined(USE_AVX2)
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
auto accumulation = reinterpret_cast<__m256i*>(
&accumulator.accumulation[perspective][i][0]);
#ifdef TILING
for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
for (Color perspective : { WHITE, BLACK }) {
auto accTile = reinterpret_cast<vec_t*>(
&accumulator.accumulation[perspective][i][j * kTileHeight]);
vec_t acc[kNumRegs];
#elif defined(USE_SSE2)
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
auto accumulation = reinterpret_cast<__m128i*>(
&accumulator.accumulation[perspective][i][0]);
if (reset[perspective]) {
auto biasesTile = reinterpret_cast<const vec_t*>(
&biases_[j * kTileHeight]);
for (unsigned k = 0; k < kNumRegs; ++k)
acc[k] = biasesTile[k];
} else {
auto prevAccTile = reinterpret_cast<const vec_t*>(
&prev_accumulator.accumulation[perspective][i][j * kTileHeight]);
for (IndexType k = 0; k < kNumRegs; ++k)
acc[k] = vec_load(&prevAccTile[k]);
#elif defined(USE_MMX)
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
auto accumulation = reinterpret_cast<__m64*>(
&accumulator.accumulation[perspective][i][0]);
// Difference calculation for the deactivated features
for (const auto index : removed_indices[perspective]) {
const IndexType offset = kHalfDimensions * index + j * kTileHeight;
auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
#elif defined(USE_NEON)
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
auto accumulation = reinterpret_cast<int16x8_t*>(
&accumulator.accumulation[perspective][i][0]);
for (IndexType k = 0; k < kNumRegs; ++k)
acc[k] = vec_sub_16(acc[k], column[k]);
}
}
{ // Difference calculation for the activated features
for (const auto index : added_indices[perspective]) {
const IndexType offset = kHalfDimensions * index + j * kTileHeight;
auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
for (IndexType k = 0; k < kNumRegs; ++k)
acc[k] = vec_add_16(acc[k], column[k]);
}
}
for (IndexType k = 0; k < kNumRegs; ++k)
vec_store(&accTile[k], acc[k]);
}
}
#if defined(USE_MMX)
_mm_empty();
#endif
#else
for (Color perspective : { WHITE, BLACK }) {
if (reset[perspective]) {
std::memcpy(accumulator.accumulation[perspective][i], biases_,
kHalfDimensions * sizeof(BiasType));
@@ -307,67 +372,19 @@ namespace Eval::NNUE {
for (const auto index : removed_indices[perspective]) {
const IndexType offset = kHalfDimensions * index;
#if defined(USE_AVX2)
auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
for (IndexType j = 0; j < kNumChunks; ++j)
accumulation[j] = _mm256_sub_epi16(accumulation[j], column[j]);
#elif defined(USE_SSE2)
auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
for (IndexType j = 0; j < kNumChunks; ++j)
accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]);
#elif defined(USE_MMX)
auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
for (IndexType j = 0; j < kNumChunks; ++j)
accumulation[j] = _mm_sub_pi16(accumulation[j], column[j]);
#elif defined(USE_NEON)
auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
for (IndexType j = 0; j < kNumChunks; ++j)
accumulation[j] = vsubq_s16(accumulation[j], column[j]);
#else
for (IndexType j = 0; j < kHalfDimensions; ++j)
accumulator.accumulation[perspective][i][j] -= weights_[offset + j];
#endif
}
}
{ // Difference calculation for the activated features
for (const auto index : added_indices[perspective]) {
const IndexType offset = kHalfDimensions * index;
#if defined(USE_AVX2)
auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
for (IndexType j = 0; j < kNumChunks; ++j)
accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
#elif defined(USE_SSE2)
auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
for (IndexType j = 0; j < kNumChunks; ++j)
accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
#elif defined(USE_MMX)
auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
for (IndexType j = 0; j < kNumChunks; ++j)
accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
#elif defined(USE_NEON)
auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
for (IndexType j = 0; j < kNumChunks; ++j)
accumulation[j] = vaddq_s16(accumulation[j], column[j]);
#else
for (IndexType j = 0; j < kHalfDimensions; ++j)
accumulator.accumulation[perspective][i][j] += weights_[offset + j];
#endif
}
}
}
#if defined(USE_MMX)
_mm_empty();
#endif
accumulator.computed_accumulation = true;
+1
View File
@@ -194,6 +194,7 @@ public:
// Returns the square of the king of color c (the first entry in that
// king's piece list).
Square king_square(Color c) const { return pieceList[make_piece(c, KING)][0]; }
#endif // EVAL_LEARN
bool RootInTB;
private:
// Initialization helpers (used while setting up a position)
+9 -18
View File
@@ -43,7 +43,6 @@ namespace Search {
namespace Tablebases {
int Cardinality;
bool RootInTB;
bool UseRule50;
Depth ProbeDepth;
}
@@ -520,7 +519,7 @@ void Thread::search() {
totBestMoveChanges += th->bestMoveChanges;
th->bestMoveChanges = 0;
}
double bestMoveInstability = 1 + totBestMoveChanges / Threads.size();
double bestMoveInstability = 1 + 2 * totBestMoveChanges / Threads.size();
double totalTime = rootMoves.size() == 1 ? 0 :
Time.optimum() * fallingEval * reduction * bestMoveInstability;
@@ -654,9 +653,7 @@ namespace {
// starts with statScore = 0. Later grandchildren start with the last calculated
// statScore of the previous grandchild. This influences the reduction rules in
// LMR which are based on the statScore of parent position.
if (rootNode)
(ss+4)->statScore = 0;
else
if (!rootNode)
(ss+2)->statScore = 0;
// Step 4. Transposition table lookup. We don't want the score of a partial
@@ -1062,7 +1059,6 @@ moves_loop: // When in check, search starts from here
if ( !givesCheck
&& lmrDepth < 6
&& !(PvNode && abs(bestValue) < 2)
&& PieceValue[MG][type_of(movedPiece)] >= PieceValue[MG][type_of(pos.piece_on(to_sq(move)))]
&& !ss->inCheck
&& ss->staticEval + 169 + 244 * lmrDepth
+ PieceValue[MG][type_of(pos.piece_on(to_sq(move)))] <= alpha)
@@ -1133,11 +1129,6 @@ moves_loop: // When in check, search starts from here
&& pos.non_pawn_material() <= 2 * RookValueMg)
extension = 1;
// Castling extension
if ( type_of(move) == CASTLING
&& popcount(pos.pieces(us) & ~pos.pieces(PAWN) & (to_sq(move) & KingSide ? KingSide : QueenSide)) <= 2)
extension = 1;
// Late irreversible move extension
if ( move == ttMove
&& pos.rule50_count() > 80
@@ -1853,7 +1844,7 @@ string UCI::pv(const Position& pos, Depth depth, Value alpha, Value beta) {
size_t pvIdx = pos.this_thread()->pvIdx;
size_t multiPV = std::min((size_t)Options["MultiPV"], rootMoves.size());
uint64_t nodesSearched = Threads.nodes_searched();
uint64_t tbHits = Threads.tb_hits() + (TB::RootInTB ? rootMoves.size() : 0);
uint64_t tbHits = Threads.tb_hits() + (pos.RootInTB ? rootMoves.size() : 0);
for (size_t i = 0; i < multiPV; ++i)
{
@@ -1868,7 +1859,7 @@ string UCI::pv(const Position& pos, Depth depth, Value alpha, Value beta) {
if (v == -VALUE_INFINITE)
v = VALUE_ZERO;
bool tb = TB::RootInTB && abs(v) < VALUE_MATE_IN_MAX_PLY;
bool tb = pos.RootInTB && abs(v) < VALUE_MATE_IN_MAX_PLY;
v = tb ? rootMoves[i].tbScore : v;
if (ss.rdbuf()->in_avail()) // Not at first line
@@ -1935,7 +1926,7 @@ bool RootMove::extract_ponder_from_tt(Position& pos) {
void Tablebases::rank_root_moves(Position& pos, Search::RootMoves& rootMoves) {
RootInTB = false;
pos.RootInTB = false;
UseRule50 = bool(Options["Syzygy50MoveRule"]);
ProbeDepth = int(Options["SyzygyProbeDepth"]);
Cardinality = int(Options["SyzygyProbeLimit"]);
@@ -1952,17 +1943,17 @@ void Tablebases::rank_root_moves(Position& pos, Search::RootMoves& rootMoves) {
if (Cardinality >= popcount(pos.pieces()) && !pos.can_castle(ANY_CASTLING))
{
// Rank moves using DTZ tables
RootInTB = root_probe(pos, rootMoves);
pos.RootInTB = root_probe(pos, rootMoves);
if (!RootInTB)
if (!pos.RootInTB)
{
// DTZ tables are missing; try to rank moves using WDL tables
dtz_available = false;
RootInTB = root_probe_wdl(pos, rootMoves);
pos.RootInTB = root_probe_wdl(pos, rootMoves);
}
}
if (RootInTB)
if (pos.RootInTB)
{
// Sort moves according to TB rank
std::stable_sort(rootMoves.begin(), rootMoves.end(),
+46 -1
View File
@@ -32,7 +32,27 @@ TranspositionTable TT; // Our global transposition table
/// overwriting an old position. Update is not atomic and can be racy.
void TTEntry::save(Key k, Value v, bool pv, Bound b, Depth d, Move m, Value ev) {

  // Nothing is written while the "Training" option is set
  if (Options["Training"])
      return;

  // Only the low 16 bits of the key are kept in the entry
  const uint16_t lowKey = (uint16_t)k;
  const bool otherPosition = lowKey != key16;

  // Keep the stored move when the same position is saved without a move
  if (m || otherPosition)
      move16 = (uint16_t)m;

  // The rest of the entry is only replaced when the new data is more
  // valuable: an exact bound, a different position, or enough depth.
  const bool worthReplacing =   b == BOUND_EXACT
                             || otherPosition
                             || d - DEPTH_OFFSET > depth8 - 4;
  if (!worthReplacing)
      return;

  assert(d > DEPTH_OFFSET);
  assert(d < 256 + DEPTH_OFFSET);

  key16     = lowKey;
  depth8    = (uint8_t)(d - DEPTH_OFFSET);
  genBound8 = (uint8_t)(TT.generation8 | uint8_t(pv) << 2 | b);
  value16   = (int16_t)v;
  eval16    = (int16_t)ev;
}
@@ -97,7 +117,32 @@ void TranspositionTable::clear() {
/// TTEntry t2 if its replace value is greater than that of t2.
TTEntry* TranspositionTable::probe(const Key key, bool& found) const {

  // Looks up `key` in its cluster.
  //  - A slot whose 16-bit key matches, or an empty slot (depth8 == 0), gets
  //    its generation bits refreshed and is returned; `found` is true only
  //    when that slot actually holds data (depth8 != 0).
  //  - Otherwise `found` is false and the returned slot is the replacement
  //    candidate with the smallest depth-minus-age score.
  //
  // While the "Training" option is set the table is bypassed: a miss is
  // reported and the first entry of cluster 0 is handed back. This return
  // must stay guarded; an unconditional return here would make the whole
  // lookup below unreachable.
  if (Options["Training"])
      return found = false, first_entry(0);

  TTEntry* const tte = first_entry(key);
  const uint16_t key16 = (uint16_t)key;  // Use the low 16 bits as key inside the cluster

  for (int i = 0; i < ClusterSize; ++i)
      if (tte[i].key16 == key16 || !tte[i].depth8)
      {
          tte[i].genBound8 = uint8_t(generation8 | (tte[i].genBound8 & 0x7)); // Refresh

          return found = (bool)tte[i].depth8, &tte[i];
      }

  // Find an entry to be replaced according to the replacement strategy
  TTEntry* replace = tte;
  for (int i = 1; i < ClusterSize; ++i)
      // Due to our packed storage format for generation and its cyclic
      // nature we add 263 (256 is the modulus plus 7 to keep the unrelated
      // lowest three bits from affecting the result) to calculate the entry
      // age correctly even after generation8 overflows into the next cycle.
      if (  replace->depth8 - ((263 + generation8 - replace->genBound8) & 0xF8)
          >   tte[i].depth8 - ((263 + generation8 -   tte[i].genBound8) & 0xF8))
          replace = &tte[i];

  return found = false, replace;
}
+1 -1
View File
@@ -200,7 +200,7 @@ namespace UCI {
if (token == "go" || token == "eval")
{
cerr << "\nPosition: " << cnt++ << '/' << num << endl;
cerr << "\nPosition: " << cnt++ << '/' << num << " (" << pos.fen() << ")" << endl;
if (token == "go")
{
go(pos, is, states);