Merge remote-tracking branch 'remotes/origin/master' into trainer

2026-05-20 15:37:47 +00:00 · 2020-09-19 02:26:03 +08:00
parent a47a3bfc7c 8b8a510fd6
commit 26f63fe741
7 changed files with 196 additions and 147 deletions
@@ -63,6 +63,7 @@ Gary Heckman (gheckman)
 George Sobala (gsobala)
 gguliash
 Gian-Carlo Pascutto (gcp)
 Deshawn Mohan-Smith (GoldenRare)
 Gontran Lemaire (gonlem)
 Goodkov Vasiliy Aleksandrovich (goodkov)
 Gregor Cramer
@@ -410,19 +410,6 @@ ifeq ($(COMP),clang)
 	endif
 endif
 # Pick the profile-guided-optimisation make targets that match the
 # compiler family in $(comp): icc and clang each get their own pair,
 # every other value falls through to the gcc pair.
 ifeq ($(comp),icc)
 	profile_make = icc-profile-make
 	profile_use = icc-profile-use
 else
 ifeq ($(comp),clang)
 	profile_make = clang-profile-make
 	profile_use = clang-profile-use
 else
 	profile_make = gcc-profile-make
 	profile_use = gcc-profile-use
 endif
 endif
 ifeq ($(KERNEL),Darwin)
 	CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.14
 	LDFLAGS += -arch $(arch) -mmacosx-version-min=10.14
@@ -434,20 +421,30 @@ endif
 # Android NDK toolchain setup.
 # We do not yet know how to make PGO builds with the NDK.
 ifeq ($(COMP),ndk)
 	CXXFLAGS += -stdlib=libc++ -fPIE
 # Default to the clang family; the per-arch sections below replace comp,
 # CXX and STRIP with the target-prefixed NDK tool names.
 # NOTE(review): once comp is overridden for armv7/armv8, later tests of the
 # form `ifeq ($(comp),clang)` no longer match for those arches - confirm
 # that this is intended.
 	comp=clang
 	ifeq ($(arch),armv7)
 		comp=armv7a-linux-androideabi16-clang
 		CXX=armv7a-linux-androideabi16-clang++
 		CXXFLAGS += -mthumb -march=armv7-a -mfloat-abi=softfp -mfpu=neon
 		STRIP=arm-linux-androideabi-strip
 	endif
 	ifeq ($(arch),armv8)
 		comp=aarch64-linux-android21-clang
 		CXX=aarch64-linux-android21-clang++
 		STRIP=aarch64-linux-android-strip
 	endif
 	LDFLAGS += -static-libstdc++ -pie -lm -latomic
 endif
 # Pick the profile-guided-optimisation make targets that match the
 # compiler family in $(comp). Any value other than icc or clang
 # (including a comp overridden by an earlier section) uses the gcc pair.
 ifeq ($(comp),icc)
 	profile_make = icc-profile-make
 	profile_use = icc-profile-use
 else ifeq ($(comp),clang)
 	profile_make = clang-profile-make
 	profile_use = clang-profile-use
 else
 	profile_make = gcc-profile-make
 	profile_use = gcc-profile-use
 endif
 ### Travis CI script uses COMPILER to overwrite CXX
 ifdef COMPILER
 	COMPCXX=$(COMPILER)
@@ -619,10 +616,7 @@ endif
 ### needs access to the optimization flags.
 ifeq ($(optimize),yes)
 ifeq ($(debug), no)
-	ifeq ($(COMP),ndk)
+	ifeq ($(comp),clang)
 		CXXFLAGS += -flto=thin
 		LDFLAGS += $(CXXFLAGS)
 	else ifeq ($(comp),clang)
 		CXXFLAGS += -flto=thin
 		ifneq ($(findstring MINGW,$(KERNEL)),)
 			CXXFLAGS += -fuse-ld=lld
@@ -29,6 +29,56 @@
 namespace Eval::NNUE {
  // If vector instructions are enabled, we update and refresh the
  // accumulator tile by tile such that each tile fits in the CPU's
  // vector registers.
  //
  // Each branch below provides the same small abstraction for one
  // instruction set:
  //   vec_t                  - the native vector type
  //   vec_load / vec_store   - read / write one vec_t through a pointer
  //   vec_add_16 / vec_sub_16 - element-wise 16-bit add / subtract
  //   kNumRegs               - how many vec_t values make up one tile
  // TILING stays defined only when one of the branches is taken; the
  // final #else removes it again so the non-tiled code paths are used.
  #define TILING
  // NOTE(review): the _loadA_/_storeA_ names used by the AVX512 and AVX2
  // branches are not defined in this view - presumably project macros that
  // select the load/store flavour; confirm where they are defined.
  #ifdef USE_AVX512
  typedef __m512i vec_t;
  #define vec_load(a) _mm512_loadA_si512(a)
  #define vec_store(a,b) _mm512_storeA_si512(a,b)
  #define vec_add_16(a,b) _mm512_add_epi16(a,b)
  #define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
  static constexpr IndexType kNumRegs = 8; // only 8 are needed
  #elif USE_AVX2
  typedef __m256i vec_t;
  #define vec_load(a) _mm256_loadA_si256(a)
  #define vec_store(a,b) _mm256_storeA_si256(a,b)
  #define vec_add_16(a,b) _mm256_add_epi16(a,b)
  #define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
  static constexpr IndexType kNumRegs = 16;
  #elif USE_SSE2
  // From here on load/store are a plain dereference / assignment through
  // the vec_t pointer rather than an explicit intrinsic.
  typedef __m128i vec_t;
  #define vec_load(a) (*(a))
  #define vec_store(a,b) *(a)=(b)
  #define vec_add_16(a,b) _mm_add_epi16(a,b)
  #define vec_sub_16(a,b) _mm_sub_epi16(a,b)
  // Half as many registers are used when Is64Bit is false.
  static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8;
  #elif USE_MMX
  typedef __m64 vec_t;
  #define vec_load(a) (*(a))
  #define vec_store(a,b) *(a)=(b)
  #define vec_add_16(a,b) _mm_add_pi16(a,b)
  #define vec_sub_16(a,b) _mm_sub_pi16(a,b)
  static constexpr IndexType kNumRegs = 8;
  #elif USE_NEON
  typedef int16x8_t vec_t;
  #define vec_load(a) (*(a))
  #define vec_store(a,b) *(a)=(b)
  #define vec_add_16(a,b) vaddq_s16(a,b)
  #define vec_sub_16(a,b) vsubq_s16(a,b)
  static constexpr IndexType kNumRegs = 16;
  #else
  // No supported vector instruction set: disable the tiled code paths.
  #undef TILING
  #endif
  // Input feature converter
  class FeatureTransformer {
@@ -36,6 +86,11 @@ namespace Eval::NNUE {
    // Number of output dimensions for one side
    static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;
    #ifdef TILING
    // Number of accumulator elements covered by one tile: kNumRegs vectors
    // of sizeof(vec_t) bytes each, divided by 2 (presumably the byte size
    // of one 16-bit accumulator element - confirm against the element type).
    static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2;
    // The tiled loops run kHalfDimensions / kTileHeight iterations, so the
    // division has to be exact or trailing elements would be skipped.
    static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions");
    #endif
   public:
    // Output type
    using OutputType = TransformedFeatureType;
@@ -205,57 +260,41 @@ namespace Eval::NNUE {
      RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
                                       active_indices);
      for (Color perspective : { WHITE, BLACK }) {
  #ifdef TILING
        for (unsigned j = 0; j < kHalfDimensions / kTileHeight; ++j) {
          auto biasesTile = reinterpret_cast<const vec_t*>(
              &biases_[j * kTileHeight]);
          auto accTile = reinterpret_cast<vec_t*>(
              &accumulator.accumulation[perspective][i][j * kTileHeight]);
          vec_t acc[kNumRegs];
          for (unsigned k = 0; k < kNumRegs; ++k)
            acc[k] = biasesTile[k];
          for (const auto index : active_indices[perspective]) {
            const IndexType offset = kHalfDimensions * index + j * kTileHeight;
            auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
            for (unsigned k = 0; k < kNumRegs; ++k)
              acc[k] = vec_add_16(acc[k], column[k]);
          }
          for (unsigned k = 0; k < kNumRegs; k++)
            vec_store(&accTile[k], acc[k]);
        }
  #else
        std::memcpy(accumulator.accumulation[perspective][i], biases_,
-                   kHalfDimensions * sizeof(BiasType));
+            kHalfDimensions * sizeof(BiasType));
        for (const auto index : active_indices[perspective]) {
          const IndexType offset = kHalfDimensions * index;
  #if defined(USE_AVX512)
          auto accumulation = reinterpret_cast<__m512i*>(
              &accumulator.accumulation[perspective][i][0]);
          auto column = reinterpret_cast<const __m512i*>(&weights_[offset]);
          constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
          for (IndexType j = 0; j < kNumChunks; ++j)
            _mm512_storeA_si512(&accumulation[j], _mm512_add_epi16(_mm512_loadA_si512(&accumulation[j]), column[j]));
  #elif defined(USE_AVX2)
          auto accumulation = reinterpret_cast<__m256i*>(
              &accumulator.accumulation[perspective][i][0]);
          auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
          for (IndexType j = 0; j < kNumChunks; ++j)
            _mm256_storeA_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadA_si256(&accumulation[j]), column[j]));
  #elif defined(USE_SSE2)
          auto accumulation = reinterpret_cast<__m128i*>(
              &accumulator.accumulation[perspective][i][0]);
          auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
          for (IndexType j = 0; j < kNumChunks; ++j)
            accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
  #elif defined(USE_MMX)
          auto accumulation = reinterpret_cast<__m64*>(
              &accumulator.accumulation[perspective][i][0]);
          auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
          for (IndexType j = 0; j < kNumChunks; ++j)
            accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
  #elif defined(USE_NEON)
          auto accumulation = reinterpret_cast<int16x8_t*>(
              &accumulator.accumulation[perspective][i][0]);
          auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
          for (IndexType j = 0; j < kNumChunks; ++j)
            accumulation[j] = vaddq_s16(accumulation[j], column[j]);
  #else
          for (IndexType j = 0; j < kHalfDimensions; ++j)
            accumulator.accumulation[perspective][i][j] += weights_[offset + j];
  #endif
        }
  #endif
      }
  #if defined(USE_MMX)
      _mm_empty();
  #endif
@@ -273,29 +312,55 @@ namespace Eval::NNUE {
      bool reset[2];
      RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
                                        removed_indices, added_indices, reset);
      for (Color perspective : { WHITE, BLACK }) {
-  #if defined(USE_AVX2)
+  #ifdef TILING
-        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+      for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
-        auto accumulation = reinterpret_cast<__m256i*>(
+        for (Color perspective : { WHITE, BLACK }) {
-            &accumulator.accumulation[perspective][i][0]);
+          auto accTile = reinterpret_cast<vec_t*>(
              &accumulator.accumulation[perspective][i][j * kTileHeight]);
          vec_t acc[kNumRegs];
-  #elif defined(USE_SSE2)
+          if (reset[perspective]) {
-        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+            auto biasesTile = reinterpret_cast<const vec_t*>(
-        auto accumulation = reinterpret_cast<__m128i*>(
+                &biases_[j * kTileHeight]);
-            &accumulator.accumulation[perspective][i][0]);
+            for (unsigned k = 0; k < kNumRegs; ++k)
              acc[k] = biasesTile[k];
          } else {
            auto prevAccTile = reinterpret_cast<const vec_t*>(
                &prev_accumulator.accumulation[perspective][i][j * kTileHeight]);
            for (IndexType k = 0; k < kNumRegs; ++k)
              acc[k] = vec_load(&prevAccTile[k]);
-  #elif defined(USE_MMX)
+            // Difference calculation for the deactivated features
-        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+            for (const auto index : removed_indices[perspective]) {
-        auto accumulation = reinterpret_cast<__m64*>(
+              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
-            &accumulator.accumulation[perspective][i][0]);
+              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
-  #elif defined(USE_NEON)
+              for (IndexType k = 0; k < kNumRegs; ++k)
-        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+                acc[k] = vec_sub_16(acc[k], column[k]);
-        auto accumulation = reinterpret_cast<int16x8_t*>(
+            }
-            &accumulator.accumulation[perspective][i][0]);
+          }
          { // Difference calculation for the activated features
            for (const auto index : added_indices[perspective]) {
              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
              for (IndexType k = 0; k < kNumRegs; ++k)
                acc[k] = vec_add_16(acc[k], column[k]);
            }
          }
          for (IndexType k = 0; k < kNumRegs; ++k)
            vec_store(&accTile[k], acc[k]);
        }
      }
  #if defined(USE_MMX)
      _mm_empty();
  #endif
  #else
      for (Color perspective : { WHITE, BLACK }) {
        if (reset[perspective]) {
          std::memcpy(accumulator.accumulation[perspective][i], biases_,
                      kHalfDimensions * sizeof(BiasType));
@@ -307,67 +372,19 @@ namespace Eval::NNUE {
          for (const auto index : removed_indices[perspective]) {
            const IndexType offset = kHalfDimensions * index;
  #if defined(USE_AVX2)
            auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
            for (IndexType j = 0; j < kNumChunks; ++j)
              accumulation[j] = _mm256_sub_epi16(accumulation[j], column[j]);
  #elif defined(USE_SSE2)
            auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
            for (IndexType j = 0; j < kNumChunks; ++j)
              accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]);
  #elif defined(USE_MMX)
            auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
            for (IndexType j = 0; j < kNumChunks; ++j)
              accumulation[j] = _mm_sub_pi16(accumulation[j], column[j]);
  #elif defined(USE_NEON)
            auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
            for (IndexType j = 0; j < kNumChunks; ++j)
              accumulation[j] = vsubq_s16(accumulation[j], column[j]);
  #else
            for (IndexType j = 0; j < kHalfDimensions; ++j)
              accumulator.accumulation[perspective][i][j] -= weights_[offset + j];
  #endif
          }
        }
        { // Difference calculation for the activated features
          for (const auto index : added_indices[perspective]) {
            const IndexType offset = kHalfDimensions * index;
  #if defined(USE_AVX2)
            auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
            for (IndexType j = 0; j < kNumChunks; ++j)
              accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
  #elif defined(USE_SSE2)
            auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
            for (IndexType j = 0; j < kNumChunks; ++j)
              accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
  #elif defined(USE_MMX)
            auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
            for (IndexType j = 0; j < kNumChunks; ++j)
              accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
  #elif defined(USE_NEON)
            auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
            for (IndexType j = 0; j < kNumChunks; ++j)
              accumulation[j] = vaddq_s16(accumulation[j], column[j]);
  #else
            for (IndexType j = 0; j < kHalfDimensions; ++j)
              accumulator.accumulation[perspective][i][j] += weights_[offset + j];
  #endif
          }
        }
      }
  #if defined(USE_MMX)
      _mm_empty();
  #endif
      accumulator.computed_accumulation = true;
@@ -194,6 +194,7 @@ public:
  // Returns the square of the king of color c (the first entry in the
  // piece list of that king).
  Square king_square(Color c) const { return pieceList[make_piece(c, KING)][0]; }
 #endif // EVAL_LEARN
  bool RootInTB;
 private:
  // Initialization helpers (used while setting up a position)
@@ -43,7 +43,6 @@ namespace Search {
 // Tablebase probing settings shared across the search.
 namespace Tablebases {
  // Initialised in rank_root_moves() from Options["SyzygyProbeLimit"].
  int Cardinality;
  // NOTE(review): other hunks of this change read and write the flag as
  // pos.RootInTB instead - confirm whether this definition is meant to stay.
  bool RootInTB;
  // Initialised in rank_root_moves() from Options["Syzygy50MoveRule"].
  bool UseRule50;
  // Initialised in rank_root_moves() from Options["SyzygyProbeDepth"].
  Depth ProbeDepth;
 }
@@ -520,7 +519,7 @@ void Thread::search() {
              totBestMoveChanges += th->bestMoveChanges;
              th->bestMoveChanges = 0;
          }
-          double bestMoveInstability = 1 + totBestMoveChanges / Threads.size();
+          double bestMoveInstability = 1 + 2 * totBestMoveChanges / Threads.size();
          double totalTime = rootMoves.size() == 1 ? 0 :
                             Time.optimum() * fallingEval * reduction * bestMoveInstability;
@@ -654,9 +653,7 @@ namespace {
    // starts with statScore = 0. Later grandchildren start with the last calculated
    // statScore of the previous grandchild. This influences the reduction rules in
    // LMR which are based on the statScore of parent position.
-    if (rootNode)
+    if (!rootNode)
        (ss+4)->statScore = 0;
    else
        (ss+2)->statScore = 0;
    // Step 4. Transposition table lookup. We don't want the score of a partial
@@ -1062,7 +1059,6 @@ moves_loop: // When in check, search starts from here
              if (   !givesCheck
                  && lmrDepth < 6
                  && !(PvNode && abs(bestValue) < 2)
                  && PieceValue[MG][type_of(movedPiece)] >= PieceValue[MG][type_of(pos.piece_on(to_sq(move)))]
                  && !ss->inCheck
                  && ss->staticEval + 169 + 244 * lmrDepth
                     + PieceValue[MG][type_of(pos.piece_on(to_sq(move)))] <= alpha)
@@ -1133,11 +1129,6 @@ moves_loop: // When in check, search starts from here
               && pos.non_pawn_material() <= 2 * RookValueMg)
          extension = 1;
      // Castling extension
      if (   type_of(move) == CASTLING
          && popcount(pos.pieces(us) & ~pos.pieces(PAWN) & (to_sq(move) & KingSide ? KingSide : QueenSide)) <= 2)
          extension = 1;
      // Late irreversible move extension
      if (   move == ttMove
          && pos.rule50_count() > 80
@@ -1853,7 +1844,7 @@ string UCI::pv(const Position& pos, Depth depth, Value alpha, Value beta) {
  size_t pvIdx = pos.this_thread()->pvIdx;
  size_t multiPV = std::min((size_t)Options["MultiPV"], rootMoves.size());
  uint64_t nodesSearched = Threads.nodes_searched();
-  uint64_t tbHits = Threads.tb_hits() + (TB::RootInTB ? rootMoves.size() : 0);
+  uint64_t tbHits = Threads.tb_hits() + (pos.RootInTB ? rootMoves.size() : 0);
  for (size_t i = 0; i < multiPV; ++i)
  {
@@ -1868,7 +1859,7 @@ string UCI::pv(const Position& pos, Depth depth, Value alpha, Value beta) {
      if (v == -VALUE_INFINITE)
          v = VALUE_ZERO;
-      bool tb = TB::RootInTB && abs(v) < VALUE_MATE_IN_MAX_PLY;
+      bool tb = pos.RootInTB && abs(v) < VALUE_MATE_IN_MAX_PLY;
      v = tb ? rootMoves[i].tbScore : v;
      if (ss.rdbuf()->in_avail()) // Not at first line
@@ -1935,7 +1926,7 @@ bool RootMove::extract_ponder_from_tt(Position& pos) {
 void Tablebases::rank_root_moves(Position& pos, Search::RootMoves& rootMoves) {
-    RootInTB = false;
+    pos.RootInTB = false;
    UseRule50 = bool(Options["Syzygy50MoveRule"]);
    ProbeDepth = int(Options["SyzygyProbeDepth"]);
    Cardinality = int(Options["SyzygyProbeLimit"]);
@@ -1952,17 +1943,17 @@ void Tablebases::rank_root_moves(Position& pos, Search::RootMoves& rootMoves) {
    if (Cardinality >= popcount(pos.pieces()) && !pos.can_castle(ANY_CASTLING))
    {
        // Rank moves using DTZ tables
-        RootInTB = root_probe(pos, rootMoves);
+        pos.RootInTB = root_probe(pos, rootMoves);
-        if (!RootInTB)
+        if (!pos.RootInTB)
        {
            // DTZ tables are missing; try to rank moves using WDL tables
            dtz_available = false;
-            RootInTB = root_probe_wdl(pos, rootMoves);
+            pos.RootInTB = root_probe_wdl(pos, rootMoves);
        }
    }
-    if (RootInTB)
+    if (pos.RootInTB)
    {
        // Sort moves according to TB rank
        std::stable_sort(rootMoves.begin(), rootMoves.end(),
@@ -32,7 +32,27 @@ TranspositionTable TT; // Our global transposition table
 /// overwriting an old position. Update is not atomic and can be racy.
 void TTEntry::save(Key k, Value v, bool pv, Bound b, Depth d, Move m, Value ev) {
  // Writes the given search result into this entry in place, subject to
  // the replacement rules below. Nothing is returned.
  //
  // When the "Training" option is set the entry is never written.
  // NOTE(review): this performs an Options lookup on every call - confirm
  // the cost is acceptable on this path.
  if (Options["Training"])
    return;
  // Preserve any existing move for the same position
  // (the stored move is only replaced by a non-zero move, or when the
  // entry is about to describe a different key).
  if (m || (uint16_t)k != key16)
      move16 = (uint16_t)m;
  // Overwrite less valuable entries (cheapest checks first)
  // An exact bound, a different key, or a new depth greater than the
  // stored depth minus 4 all qualify.
  if (b == BOUND_EXACT
      || (uint16_t)k != key16
      || d - DEPTH_OFFSET > depth8 - 4)
  {
      // depth8 holds d - DEPTH_OFFSET in 8 bits, so d must lie strictly
      // between DEPTH_OFFSET and 256 + DEPTH_OFFSET.
      assert(d > DEPTH_OFFSET);
      assert(d < 256 + DEPTH_OFFSET);
      // Only the low 16 bits of the key are kept in the entry.
      key16     = (uint16_t)k;
      depth8    = (uint8_t)(d - DEPTH_OFFSET);
      // Packs the table generation, the pv flag (shifted to bit 2) and the
      // bound into one byte.
      genBound8 = (uint8_t)(TT.generation8 | uint8_t(pv) << 2 | b);
      value16   = (int16_t)v;
      eval16    = (int16_t)ev;
  }
 }
@@ -97,7 +117,32 @@ void TranspositionTable::clear() {
 /// TTEntry t2 if its replace value is greater than that of t2.
 TTEntry* TranspositionTable::probe(const Key key, bool& found) const {
  // Looks up key in its cluster. `found` is set to true only when an
  // occupied entry with a matching key16 is returned; otherwise the
  // returned pointer is the slot the caller may overwrite.
  //
  // With the "Training" option set the table is bypassed: the lookup
  // always reports a miss and hands back the first entry of cluster 0.
-  return found = false, first_entry(0);
+  if (Options["Training"])
    return found = false, first_entry(0);
  TTEntry* const tte = first_entry(key);
  const uint16_t key16 = (uint16_t)key;  // Use the low 16 bits as key inside the cluster
  // First pass: take the first slot that either carries this key16 or is
  // empty (depth8 == 0).
  for (int i = 0; i < ClusterSize; ++i)
      if (tte[i].key16 == key16 || !tte[i].depth8)
      {
          // Replace the generation bits with the current generation while
          // keeping the low three bits of genBound8 unchanged.
          tte[i].genBound8 = uint8_t(generation8 | (tte[i].genBound8 & 0x7)); // Refresh
          // An empty slot counts as a miss even though it is returned.
          return found = (bool)tte[i].depth8, &tte[i];
      }
  // Find an entry to be replaced according to the replacement strategy
  // (the entry with the smallest depth-minus-age value in the cluster wins).
  TTEntry* replace = tte;
  for (int i = 1; i < ClusterSize; ++i)
      // Due to our packed storage format for generation and its cyclic
      // nature we add 263 (256 is the modulus plus 7 to keep the unrelated
      // lowest three bits from affecting the result) to calculate the entry
      // age correctly even after generation8 overflows into the next cycle.
      if (  replace->depth8 - ((263 + generation8 - replace->genBound8) & 0xF8)
          >   tte[i].depth8 - ((263 + generation8 -   tte[i].genBound8) & 0xF8))
          replace = &tte[i];
  return found = false, replace;
 }
@@ -200,7 +200,7 @@ namespace UCI {
        if (token == "go" || token == "eval")
        {
-            cerr << "\nPosition: " << cnt++ << '/' << num << endl;
+            cerr << "\nPosition: " << cnt++ << '/' << num << " (" << pos.fen() << ")" << endl;
            if (token == "go")
            {
               go(pos, is, states);