From de24fcebc873ce2d65b30e039745dbc2e851f443 Mon Sep 17 00:00:00 2001
From: mstembera <MissingEmail@email>
Date: Fri, 26 Jun 2020 17:26:46 -0700
Subject: [PATCH 01/86] Fix fragile code to use proper random 64 bit keys.

This fixes an old issue: to make a position key unique we changed only a
small number of bits in the key instead of randomly changing all 64 of them.
This is fragile and can lead to non-uniqueness issues in the TT.

Key make_key(uint64_t seed) takes any integer and produces a unique pseudo-random
64-bit key. It is computationally efficient and is based on a linear congruential
pseudo-random number generator using well-tested constants by Donald Knuth
(see https://en.wikipedia.org/wiki/Linear_congruential_generator).

STC https://tests.stockfishchess.org/tests/view/5ef6c78f761b685b4c724bb6
LLR: 2.95 (-2.94,2.94) {-1.50,0.50}
Total: 154320 W: 29343 L: 29376 D: 95601
Ptnml(0-2): 2543, 18170, 35891, 17889, 2667

LTC https://tests.stockfishchess.org/tests/view/5ef7d1a9020eec13834a940e
LLR: 2.95 (-2.94,2.94) {-1.50,0.50}
Total: 53488 W: 6629 L: 6584 D: 40275
Ptnml(0-2): 372, 4878, 16183, 4955, 356

closes https://github.com/official-stockfish/Stockfish/pull/2773

bench: 4626776
---
 src/search.cpp | 2 +-
 src/types.h    | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/src/search.cpp b/src/search.cpp
index 1e2980cb..0fa39988 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -662,7 +662,7 @@ namespace {
     // search to overwrite a previous full search TT value, so we use a different
     // position key in case of an excluded move.
     excludedMove = ss->excludedMove;
-    posKey = pos.key() ^ (Key(excludedMove) << 48); // Isn't a very good hash
+    posKey = excludedMove == MOVE_NONE ? pos.key() : pos.key() ^ make_key(excludedMove);
     tte = TT.probe(posKey, ttHit);
     ttValue = ttHit ? value_from_tt(tte->value(), ss->ply, pos.rule50_count()) : VALUE_NONE;
     ttMove =  rootNode ? thisThread->rootMoves[thisThread->pvIdx].pv[0]
diff --git a/src/types.h b/src/types.h
index 0c512f5b..c1598561 100644
--- a/src/types.h
+++ b/src/types.h
@@ -455,6 +455,11 @@ constexpr bool is_ok(Move m) {
   return from_sq(m) != to_sq(m); // Catch MOVE_NULL and MOVE_NONE
 }
 
+/// Based on a congruential pseudo random number generator
+constexpr Key make_key(uint64_t seed) {
+  return seed * 6364136223846793005ULL + 1442695040888963407ULL;
+}
+
 #endif // #ifndef TYPES_H_INCLUDED
 
 #include "tune.h" // Global visibility to tuning setup

From 547c4a216a9931e4d5ff95414f146cb6eb877611 Mon Sep 17 00:00:00 2001
From: mstembera <MissingEmail@email>
Date: Thu, 25 Jun 2020 22:08:17 -0700
Subject: [PATCH 02/86] Remove old zobrist trick for castling rights

Removes an 8-year-old micro-optimization aimed at 32-bit architectures,
introduced because, back then, an xor of a Key could not be done in one
instruction. See the original commit 821e1c7.

STC https://tests.stockfishchess.org/tests/view/5ef5833dde213bf647527d0c
LLR: 2.94 (-2.94,2.94) {-1.50,0.50}
Total: 162648 W: 31053 L: 31097 D: 100498
Ptnml(0-2): 2841, 18966, 37715, 19000, 2802

LTC https://tests.stockfishchess.org/tests/view/5ef7b1bbf993893290cc1489
LLR: 2.93 (-2.94,2.94) {-1.50,0.50}
Total: 62360 W: 7617 L: 7586 D: 47157
Ptnml(0-2): 423, 5662, 18994, 5663, 438

closes https://github.com/official-stockfish/Stockfish/pull/2775

bench: 4591425
---
 src/position.cpp | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/src/position.cpp b/src/position.cpp
index 471ef01f..6ef7aedc 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -119,15 +119,7 @@ void Position::init() {
       Zobrist::enpassant[f] = rng.rand<Key>();
 
   for (int cr = NO_CASTLING; cr <= ANY_CASTLING; ++cr)
-  {
-      Zobrist::castling[cr] = 0;
-      Bitboard b = cr;
-      while (b)
-      {
-          Key k = Zobrist::castling[1ULL << pop_lsb(&b)];
-          Zobrist::castling[cr] ^= k ? k : rng.rand<Key>();
-      }
-  }
+      Zobrist::castling[cr] = rng.rand<Key>();
 
   Zobrist::side = rng.rand<Key>();
   Zobrist::noPawns = rng.rand<Key>();
@@ -780,9 +772,9 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
   // Update castling rights if needed
   if (st->castlingRights && (castlingRightsMask[from] | castlingRightsMask[to]))
   {
-      int cr = castlingRightsMask[from] | castlingRightsMask[to];
-      k ^= Zobrist::castling[st->castlingRights & cr];
-      st->castlingRights &= ~cr;
+      k ^= Zobrist::castling[st->castlingRights];
+      st->castlingRights &= ~(castlingRightsMask[from] | castlingRightsMask[to]);
+      k ^= Zobrist::castling[st->castlingRights];
   }
 
   // Move the piece. The tricky Chess960 castling is handled earlier

From 2810a1ea85b3fbe62095fcb24442c08306f00af3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ste=CC=81phane=20Nicolet?= <cassio@free.fr>
Date: Sun, 28 Jun 2020 06:00:28 +0200
Subject: [PATCH 03/86] Increase value of pawns on fifth rank

This patch increases the endgame value of pawns on the fifth rank.
The increase is very small (+1 evaluation point, about 0.005 pawn)
for the pawns on external columns (a-b-c-f-g-h) and a bit bigger
(+7 evaluation points, about 0.033 pawn) for the pawns on d5/e5.

STC:
LLR: 2.93 (-2.94,2.94) {-0.50,1.50}
Total: 79864 W: 15331 L: 15027 D: 49506
Ptnml(0-2): 1336, 9284, 18433, 9498, 1381
https://tests.stockfishchess.org/tests/view/5ef73e2ef993893290cc0c47

LTC:
LLR: 2.94 (-2.94,2.94) {0.25,1.75}
Total: 47240 W: 5927 L: 5630 D: 35683
Ptnml(0-2): 320, 4133, 14440, 4384, 343
https://tests.stockfishchess.org/tests/view/5ef7c0c4f993893290cc14b7

closes https://github.com/official-stockfish/Stockfish/pull/2776

Bench: 4794633
---
 src/psqt.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/psqt.cpp b/src/psqt.cpp
index c5da9785..5e8dd2c7 100644
--- a/src/psqt.cpp
+++ b/src/psqt.cpp
@@ -92,7 +92,7 @@ constexpr Score PBonus[RANK_NB][FILE_NB] =
    { S(  3,-10), S(  3, -6), S( 10, 10), S( 19,  0), S( 16, 14), S( 19,  7), S(  7, -5), S( -5,-19) },
    { S( -9,-10), S(-15,-10), S( 11,-10), S( 15,  4), S( 32,  4), S( 22,  3), S(  5, -6), S(-22, -4) },
    { S( -4,  6), S(-23, -2), S(  6, -8), S( 20, -4), S( 40,-13), S( 17,-12), S(  4,-10), S( -8, -9) },
-   { S( 13,  9), S(  0,  4), S(-13,  3), S(  1,-12), S( 11,-12), S( -2, -6), S(-13, 13), S(  5,  8) },
+   { S( 13, 10), S(  0,  5), S(-13,  4), S(  1, -5), S( 11, -5), S( -2, -5), S(-13, 14), S(  5,  9) },
    { S(  5, 28), S(-12, 20), S( -7, 21), S( 22, 28), S( -8, 30), S( -5,  7), S(-15,  6), S( -8, 13) },
    { S( -7,  0), S(  7,-11), S( -3, 12), S(-13, 21), S(  5, 25), S(-16, 19), S( 10,  4), S( -8,  7) }
   };

From 16836f39b295ec635c9883498400f7006ac2869f Mon Sep 17 00:00:00 2001
From: Stefan Geschwentner <stgeschwentner@gmail.com>
Date: Sun, 28 Jun 2020 16:28:55 +0200
Subject: [PATCH 04/86] Scale down eval for drawish rook endgames.

STC:
LLR: 2.96 (-2.94,2.94) {-0.50,1.50}
Total: 82136 W: 15694 L: 15407 D: 51035
Ptnml(0-2): 1076, 8960, 20767, 9131, 1134
https://tests.stockfishchess.org/tests/view/5ef86cf8020eec13834a94dd

LTC:
LLR: 2.93 (-2.94,2.94) {0.25,1.75}
Total: 70200 W: 8787 L: 8440 D: 52973
Ptnml(0-2): 325, 5983, 22170, 6264, 358
https://tests.stockfishchess.org/tests/view/5ef88225020eec13834a950a

closes https://github.com/official-stockfish/Stockfish/pull/2780

Bench: 4478869
---
 src/evaluate.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 60ec9c72..65f7bddc 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -782,6 +782,13 @@ namespace {
             else
                 sf = 22 + 3 * pos.count<ALL_PIECES>(strongSide);
         }
+        else if(   pos.non_pawn_material(WHITE) == RookValueMg
+                && pos.non_pawn_material(BLACK) == RookValueMg
+                && !pe->passed_pawns(strongSide)
+                && pos.count<PAWN>(strongSide) - pos.count<PAWN>(~strongSide) <= 1
+                && bool(KingSide & pos.pieces(strongSide, PAWN)) != bool(QueenSide & pos.pieces(strongSide, PAWN))
+                && (attacks_bb<KING>(pos.square<KING>(~strongSide)) & pos.pieces(~strongSide, PAWN)))
+            sf = 36;
         else
             sf = std::min(sf, 36 + 7 * pos.count<PAWN>(strongSide));
     }

From c7194bd924a606ab75d582d30cb41749312ea94e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ste=CC=81phane=20Nicolet?= <cassio@free.fr>
Date: Sun, 28 Jun 2020 22:24:57 +0200
Subject: [PATCH 05/86] Scale down eval for queen imbalance

We lower the endgame value of the evaluation when we detect that there
is only one queen left on the board (more precisely, we use a scale
factor of 37/64, or about 0.58, for the endgame part of the evaluation).
Hopefully this helps a little with the assessment of positions with a
queen imbalance, which is one of the well-known Stockfish weaknesses.

STC:
LLR: 2.94 (-2.94,2.94) {-0.50,1.50}
Total: 21600 W: 4176 L: 3955 D: 13469
Ptnml(0-2): 351, 2457, 5003, 2598, 391
https://tests.stockfishchess.org/tests/view/5ef871b6020eec13834a94e8

LTC:
LLR: 2.97 (-2.94,2.94) {0.25,1.75}
Total: 248328 W: 30596 L: 29720 D: 188012
Ptnml(0-2): 1544, 22345, 75665, 22911, 1699
https://tests.stockfishchess.org/tests/view/5ef87aec020eec13834a94fe

Closes https://github.com/official-stockfish/Stockfish/pull/2781

Bench: 4441323
---
 src/evaluate.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 65f7bddc..d19cf34e 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -767,7 +767,6 @@ namespace {
     eg += v;
 
     // Compute the scale factor for the winning side
-
     Color strongSide = eg > VALUE_DRAW ? WHITE : BLACK;
     int sf = me->scale_factor(pos, strongSide);
 
@@ -782,13 +781,15 @@ namespace {
             else
                 sf = 22 + 3 * pos.count<ALL_PIECES>(strongSide);
         }
-        else if(   pos.non_pawn_material(WHITE) == RookValueMg
+        else if (  pos.non_pawn_material(WHITE) == RookValueMg
                 && pos.non_pawn_material(BLACK) == RookValueMg
                 && !pe->passed_pawns(strongSide)
                 && pos.count<PAWN>(strongSide) - pos.count<PAWN>(~strongSide) <= 1
                 && bool(KingSide & pos.pieces(strongSide, PAWN)) != bool(QueenSide & pos.pieces(strongSide, PAWN))
                 && (attacks_bb<KING>(pos.square<KING>(~strongSide)) & pos.pieces(~strongSide, PAWN)))
             sf = 36;
+        else if (pos.count<QUEEN>() == 1)
+            sf = 37;
         else
             sf = std::min(sf, 36 + 7 * pos.count<PAWN>(strongSide));
     }

From 69d3be42a112645a9e599df615f730d61a5dca8c Mon Sep 17 00:00:00 2001
From: Stefan Geschwentner <stgeschwentner@gmail.com>
Date: Mon, 29 Jun 2020 19:35:24 +0200
Subject: [PATCH 06/86] Tweak single queen endgame scaling.

Increase the scale factor by 3 for each minor piece (bishop or knight)
of the side that does not have the queen.

STC:
LLR: 2.94 (-2.94,2.94) {-0.50,1.50}
Total: 14528 W: 2860 L: 2653 D: 9015
Ptnml(0-2): 217, 1632, 3408, 1741, 266
https://tests.stockfishchess.org/tests/view/5ef98384020eec13834a96a0

LTC:
LLR: 2.95 (-2.94,2.94) {0.25,1.75}
Total: 34584 W: 4371 L: 4111 D: 26102
Ptnml(0-2): 205, 3080, 10501, 3262, 244
https://tests.stockfishchess.org/tests/view/5ef99972020eec13834a96c9

closes https://github.com/official-stockfish/Stockfish/pull/2782

Bench: 4523573
---
 src/evaluate.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index d19cf34e..615df1ba 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -789,7 +789,8 @@ namespace {
                 && (attacks_bb<KING>(pos.square<KING>(~strongSide)) & pos.pieces(~strongSide, PAWN)))
             sf = 36;
         else if (pos.count<QUEEN>() == 1)
-            sf = 37;
+            sf = 37 + 3 * (pos.count<QUEEN>(WHITE) == 1 ? pos.count<BISHOP>(BLACK) + pos.count<KNIGHT>(BLACK)
+                                                        : pos.count<BISHOP>(WHITE) + pos.count<KNIGHT>(WHITE));
         else
             sf = std::min(sf, 36 + 7 * pos.count<PAWN>(strongSide));
     }

From 110068808b51344ac59f8c6a0846f5dfdf670392 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Sat, 27 Jun 2020 21:29:29 +0200
Subject: [PATCH 07/86] Provide WDL statistics

A number of engines, GUIs and tournaments have started to report WDL estimates
alongside or instead of scores. This patch enables reporting of those stats
in a more or less standard way (http://www.talkchess.com/forum3/viewtopic.php?t=72140).

The model used for this reporting is based on data derived from a few million
fishtest LTC games. Given a score and a game ply, it provides a win rate that
matches that data rather closely, especially in the intermediate range [0.05, 0.95].
Some data is shown at
https://github.com/glinscott/fishtest/wiki/UsefulData#win-loss-draw-statistics-of-ltc-games-on-fishtest
Making the conversion depend on the game ply is important for a good fit, and is in line
with the experience that a +1 score in the early midgame is more likely to be a win than
in the late endgame.

Even when enabled, the printing of the info causes no significant overhead.

Passed STC:
LLR: 2.94 (-2.94,2.94) {-1.50,0.50}
Total: 197112 W: 37226 L: 37347 D: 122539
Ptnml(0-2): 2591, 21025, 51464, 20866, 2610
https://tests.stockfishchess.org/tests/view/5ef79ef4f993893290cc146b

closes https://github.com/official-stockfish/Stockfish/pull/2778

No functional change
---
 Readme.md         |  5 +++++
 src/search.cpp    |  3 +++
 src/uci.cpp       | 39 +++++++++++++++++++++++++++++++++++++++
 src/uci.h         |  1 +
 src/ucioption.cpp |  1 +
 5 files changed, 49 insertions(+)

diff --git a/Readme.md b/Readme.md
index 2b1de86b..e60ac718 100644
--- a/Readme.md
+++ b/Readme.md
@@ -66,6 +66,11 @@ Currently, Stockfish has the following UCI options:
     If enabled by UCI_LimitStrength, aim for an engine strength of the given Elo.
     This Elo rating has been calibrated at a time control of 60s+0.6s and anchored to CCRL 40/4.
 
+  * #### UCI_ShowWDL
+    If enabled, show approximate WDL statistics as part of the engine output.
+    These WDL numbers model expected game outcomes for a given evaluation and
+    game ply for engine self-play at fishtest LTC conditions (60+0.6s per game).
+
   * #### Move Overhead
     Assume a time delay of x ms due to network and GUI overheads. This is useful to
     avoid losses on time in those cases.
diff --git a/src/search.cpp b/src/search.cpp
index 0fa39988..f14bdf77 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1835,6 +1835,9 @@ string UCI::pv(const Position& pos, Depth depth, Value alpha, Value beta) {
          << " multipv "  << i + 1
          << " score "    << UCI::value(v);
 
+      if (Options["UCI_ShowWDL"])
+          ss << UCI::wdl(v, pos.game_ply());
+
       if (!tb && i == pvIdx)
           ss << (v >= beta ? " lowerbound" : v <= alpha ? " upperbound" : "");
 
diff --git a/src/uci.cpp b/src/uci.cpp
index 11d5adc6..bb57c80b 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -19,6 +19,7 @@
 */
 
 #include <cassert>
+#include <cmath>
 #include <iostream>
 #include <sstream>
 #include <string>
@@ -182,6 +183,28 @@ namespace {
          << "\nNodes/second    : " << 1000 * nodes / elapsed << endl;
   }
 
+  // The win rate model returns the probability (per mille) of winning given an eval
+  // and a game-ply. The model fits rather accurately the LTC fishtest statistics.
+  int win_rate_model(Value v, int ply) {
+
+     // The model captures only up to 240 plies, so limit input (and rescale)
+     double m = std::min(240, ply) / 64.0;
+
+     // Coefficients of a 3rd order polynomial fit based on fishtest data
+     // for two parameters needed to transform eval to the argument of a
+     // logistic function.
+     double as[] = {-8.24404295, 64.23892342, -95.73056462, 153.86478679};
+     double bs[] = {-3.37154371, 28.44489198, -56.67657741,  72.05858751};
+     double a = (((as[0] * m + as[1]) * m + as[2]) * m) + as[3];
+     double b = (((bs[0] * m + bs[1]) * m + bs[2]) * m) + bs[3];
+
+     // Transform eval to centipawns with limited range
+     double x = Utility::clamp(double(100 * v) / PawnValueEg, -1000.0, 1000.0);
+
+     // Return win rate in per mille (rounded to nearest)
+     return int(0.5 + 1000 / (1 + std::exp((a - x) / b)));
+  }
+
 } // namespace
 
 
@@ -269,6 +292,22 @@ string UCI::value(Value v) {
 }
 
 
+/// UCI::wdl() report WDL statistics given an evaluation and a game ply, based on
+/// data gathered for fishtest LTC games.
+
+string UCI::wdl(Value v, int ply) {
+
+  stringstream ss;
+
+  int wdl_w = win_rate_model( v, ply);
+  int wdl_l = win_rate_model(-v, ply);
+  int wdl_d = 1000 - wdl_w - wdl_l;
+  ss << " wdl " << wdl_w << " " << wdl_d << " " << wdl_l;
+
+  return ss.str();
+}
+
+
 /// UCI::square() converts a Square to a string in algebraic notation (g1, a7, etc.)
 
 std::string UCI::square(Square s) {
diff --git a/src/uci.h b/src/uci.h
index b845889b..ad954d9f 100644
--- a/src/uci.h
+++ b/src/uci.h
@@ -73,6 +73,7 @@ std::string value(Value v);
 std::string square(Square s);
 std::string move(Move m, bool chess960);
 std::string pv(const Position& pos, Depth depth, Value alpha, Value beta);
+std::string wdl(Value v, int ply);
 Move to_move(const Position& pos, std::string& str);
 
 } // namespace UCI
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index c268c975..4befa6ac 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -74,6 +74,7 @@ void init(OptionsMap& o) {
   o["UCI_AnalyseMode"]       << Option(false);
   o["UCI_LimitStrength"]     << Option(false);
   o["UCI_Elo"]               << Option(1350, 1350, 2850);
+  o["UCI_ShowWDL"]           << Option(true);
   o["SyzygyPath"]            << Option("<empty>", on_tb_path);
   o["SyzygyProbeDepth"]      << Option(1, 1, 100);
   o["Syzygy50MoveRule"]      << Option(true);

From 268c00b648ba4a48be79a849dde5733e6705ddbf Mon Sep 17 00:00:00 2001
From: Alain SAVARD <support@multicim.com>
Date: Wed, 1 Jul 2020 02:12:59 -0400
Subject: [PATCH 08/86] Use arrays

for safe checks, outposts and king protectors in evaluate.cpp

Tested for non regression on the safe checks
https://tests.stockfishchess.org/tests/view/5ef8b75c020eec13834a9596
LLR: 2.95 (-2.94,2.94) {-1.50,0.50}
Total: 22256 W: 4283 L: 4143 D: 13830
Ptnml(0-2): 291, 2439, 5588, 2459, 351

Tested for non regression on the safe checks, outposts and king protectors
https://tests.stockfishchess.org/tests/view/5ef8e543020eec13834a95e7
LLR: 2.95 (-2.94,2.94) {-1.50,0.50}
Total: 28400 W: 5382 L: 5253 D: 17765
Ptnml(0-2): 394, 3078, 7119, 3223, 386

closes https://github.com/official-stockfish/Stockfish/pull/2785

No functional change
---
 src/evaluate.cpp | 76 ++++++++++++++++++++++--------------------------
 src/pawns.cpp    |  4 ++-
 2 files changed, 38 insertions(+), 42 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 615df1ba..48db2b3b 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -80,11 +80,11 @@ namespace {
   // KingAttackWeights[PieceType] contains king attack weights by piece type
   constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 81, 52, 44, 10 };
 
-  // Penalties for enemy's safe checks
-  constexpr int QueenSafeCheck  = 772;
-  constexpr int RookSafeCheck   = 1084;
-  constexpr int BishopSafeCheck = 645;
-  constexpr int KnightSafeCheck = 792;
+  // SafeCheck[PieceType][single/multiple] contains safe check bonus by piece type,
+  // higher if multiple safe checks are possible for that piece type.
+  constexpr int SafeCheck[][2] = {
+      {}, {}, {792, 1283}, {645, 967}, {1084, 1897}, {772, 1119}
+  };
 
 #define S(mg, eg) make_score(mg, eg)
 
@@ -106,6 +106,18 @@ namespace {
       S(110,182), S(114,182), S(114,192), S(116,219) }
   };
 
+  // KingProtector[knight/bishop] contains penalty for each distance unit to own king
+  constexpr Score KingProtector[] = { S(8, 9), S(6, 9) };
+
+  // Outpost[knight/bishop] contains bonuses for each knight or bishop occupying a
+  // pawn protected square on rank 4 to 6 which is also safe from a pawn attack.
+  constexpr Score Outpost[] = { S(56, 36), S(30, 23) };
+
+  // PassedRank[Rank] contains a bonus according to the rank of a passed pawn
+  constexpr Score PassedRank[RANK_NB] = {
+    S(0, 0), S(10, 28), S(17, 33), S(15, 41), S(62, 72), S(168, 177), S(276, 260)
+  };
+
   // RookOnFile[semiopen/open] contains bonuses for each rook when there is
   // no (friendly) pawn on the rook file.
   constexpr Score RookOnFile[] = { S(19, 7), S(48, 29) };
@@ -121,23 +133,14 @@ namespace {
     S(0, 0), S(3, 46), S(37, 68), S(42, 60), S(0, 38), S(58, 41)
   };
 
-  // PassedRank[Rank] contains a bonus according to the rank of a passed pawn
-  constexpr Score PassedRank[RANK_NB] = {
-    S(0, 0), S(10, 28), S(17, 33), S(15, 41), S(62, 72), S(168, 177), S(276, 260)
-  };
-
   // Assorted bonuses and penalties
-  constexpr Score BishopKingProtector = S(  6,  9);
   constexpr Score BishopOnKingRing    = S( 24,  0);
-  constexpr Score BishopOutpost       = S( 30, 23);
   constexpr Score BishopPawns         = S(  3,  7);
   constexpr Score BishopXRayPawns     = S(  4,  5);
   constexpr Score CorneredBishop      = S( 50, 50);
   constexpr Score FlankAttacks        = S(  8,  0);
   constexpr Score Hanging             = S( 69, 36);
-  constexpr Score KnightKingProtector = S(  8,  9);
   constexpr Score KnightOnQueen       = S( 16, 11);
-  constexpr Score KnightOutpost       = S( 56, 36);
   constexpr Score LongDiagonalBishop  = S( 45,  0);
   constexpr Score MinorBehindPawn     = S( 18,  3);
   constexpr Score PassedFile          = S( 11,  8);
@@ -308,7 +311,7 @@ namespace {
             // Bonus if piece is on an outpost square or can reach one
             bb = OutpostRanks & attackedBy[Us][PAWN] & ~pe->pawn_attacks_span(Them);
             if (bb & s)
-                score += (Pt == KNIGHT) ? KnightOutpost : BishopOutpost;
+                score += Outpost[Pt == BISHOP];
             else if (Pt == KNIGHT && bb & b & ~pos.pieces(Us))
                 score += ReachableOutpost;
 
@@ -317,8 +320,7 @@ namespace {
                 score += MinorBehindPawn;
 
             // Penalty if the piece is far from the king
-            score -= (Pt == KNIGHT ? KnightKingProtector
-                                   : BishopKingProtector) * distance(pos.square<KING>(Us), s);
+            score -= KingProtector[Pt == BISHOP] * distance(pos.square<KING>(Us), s);
 
             if (Pt == BISHOP)
             {
@@ -420,41 +422,33 @@ namespace {
     b2 = attacks_bb<BISHOP>(ksq, pos.pieces() ^ pos.pieces(Us, QUEEN));
 
     // Enemy rooks checks
-    rookChecks = b1 & safe & attackedBy[Them][ROOK];
+    rookChecks = b1 & attackedBy[Them][ROOK] & safe;
     if (rookChecks)
-        kingDanger += more_than_one(rookChecks) ? RookSafeCheck * 175/100
-                                                : RookSafeCheck;
+        kingDanger += SafeCheck[ROOK][more_than_one(rookChecks)];
     else
         unsafeChecks |= b1 & attackedBy[Them][ROOK];
 
-    // Enemy queen safe checks: we count them only if they are from squares from
-    // which we can't give a rook check, because rook checks are more valuable.
-    queenChecks =  (b1 | b2)
-                 & attackedBy[Them][QUEEN]
-                 & safe
-                 & ~attackedBy[Us][QUEEN]
-                 & ~rookChecks;
+    // Enemy queen safe checks: count them only if the checks are from squares from
+    // which opponent cannot give a rook check, because rook checks are more valuable.
+    queenChecks =  (b1 | b2) & attackedBy[Them][QUEEN] & safe
+                 & ~(attackedBy[Us][QUEEN] | rookChecks);
     if (queenChecks)
-        kingDanger += more_than_one(queenChecks) ? QueenSafeCheck * 145/100
-                                                 : QueenSafeCheck;
+        kingDanger += SafeCheck[QUEEN][more_than_one(queenChecks)];
 
-    // Enemy bishops checks: we count them only if they are from squares from
-    // which we can't give a queen check, because queen checks are more valuable.
-    bishopChecks =  b2
-                  & attackedBy[Them][BISHOP]
-                  & safe
+    // Enemy bishops checks: count them only if they are from squares from which
+    // opponent cannot give a queen check, because queen checks are more valuable.
+    bishopChecks =  b2 & attackedBy[Them][BISHOP] & safe
                   & ~queenChecks;
     if (bishopChecks)
-        kingDanger += more_than_one(bishopChecks) ? BishopSafeCheck * 3/2
-                                                  : BishopSafeCheck;
+        kingDanger += SafeCheck[BISHOP][more_than_one(bishopChecks)];
+
     else
         unsafeChecks |= b2 & attackedBy[Them][BISHOP];
 
     // Enemy knights checks
     knightChecks = attacks_bb<KNIGHT>(ksq) & attackedBy[Them][KNIGHT];
     if (knightChecks & safe)
-        kingDanger += more_than_one(knightChecks & safe) ? KnightSafeCheck * 162/100
-                                                         : KnightSafeCheck;
+        kingDanger += SafeCheck[KNIGHT][more_than_one(knightChecks & safe)];
     else
         unsafeChecks |= knightChecks;
 
@@ -464,7 +458,7 @@ namespace {
     b2 = b1 & attackedBy2[Them];
     b3 = attackedBy[Us][ALL_PIECES] & KingFlank[file_of(ksq)] & Camp;
 
-    int kingFlankAttack = popcount(b1) + popcount(b2);
+    int kingFlankAttack  = popcount(b1) + popcount(b2);
     int kingFlankDefense = popcount(b3);
 
     kingDanger +=        kingAttackersCount[Them] * kingAttackersWeight[Them]
@@ -741,8 +735,8 @@ namespace {
     bool almostUnwinnable =   outflanking < 0
                            && !pawnsOnBothFlanks;
 
-    bool infiltration = rank_of(pos.square<KING>(WHITE)) > RANK_4
-                     || rank_of(pos.square<KING>(BLACK)) < RANK_5;
+    bool infiltration =   rank_of(pos.square<KING>(WHITE)) > RANK_4
+                       || rank_of(pos.square<KING>(BLACK)) < RANK_5;
 
     // Compute the initiative bonus for the attacking side
     int complexity =   9 * pe->passed_count()
diff --git a/src/pawns.cpp b/src/pawns.cpp
index d741b2ef..d365ba12 100644
--- a/src/pawns.cpp
+++ b/src/pawns.cpp
@@ -38,7 +38,9 @@ namespace {
   constexpr Score WeakLever     = S( 0, 56);
   constexpr Score WeakUnopposed = S(13, 27);
 
-  constexpr Score BlockedStorm[RANK_NB]  = {S( 0, 0), S( 0, 0), S( 76, 78), S(-10, 15), S(-7, 10), S(-4, 6), S(-1, 2)};
+  constexpr Score BlockedStorm[RANK_NB] = {
+    S(0, 0), S(0, 0), S(76, 78), S(-10, 15), S(-7, 10), S(-4, 6), S(-1, 2)
+  };
 
   // Connected pawn bonus
   constexpr int Connected[RANK_NB] = { 0, 7, 8, 12, 29, 48, 86 };

From fb83da0892c183690ddeb1f7c3dbf6779b12707a Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Thu, 2 Jul 2020 18:58:37 +0200
Subject: [PATCH 09/86] Set UCI_ShowWDL by default to false

GUIs that don't know the UCI_ShowWDL option might not show it, yet crash on
the WDL output, effectively making it hard for users to turn it off and run
the engine. This sets it to false by default.

fixes https://github.com/official-stockfish/Stockfish/issues/2787

closes https://github.com/official-stockfish/Stockfish/pull/2788

No functional change.
---
 src/ucioption.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 4befa6ac..ef54ef4e 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -74,7 +74,7 @@ void init(OptionsMap& o) {
   o["UCI_AnalyseMode"]       << Option(false);
   o["UCI_LimitStrength"]     << Option(false);
   o["UCI_Elo"]               << Option(1350, 1350, 2850);
-  o["UCI_ShowWDL"]           << Option(true);
+  o["UCI_ShowWDL"]           << Option(false);
   o["SyzygyPath"]            << Option("<empty>", on_tb_path);
   o["SyzygyProbeDepth"]      << Option(1, 1, 100);
   o["Syzygy50MoveRule"]      << Option(true);

From 67818ee9481ba99369fa8a8d92e5c50428fb300e Mon Sep 17 00:00:00 2001
From: SFisGOD <jonathandumale@gmail.com>
Date: Thu, 2 Jul 2020 00:11:23 +0800
Subject: [PATCH 10/86] Remove passed pawn condition.

This will help scale down the relatively high eval in drawish rook endgames
with a passed pawn, as in TCEC S18 Superfinal Game 90.

Passed STC
LLR: 2.94 (-2.94,2.94) {-1.50,0.50}
Total: 50456 W: 9644 L: 9540 D: 31272
Ptnml(0-2): 760, 5637, 12332, 5737, 762
https://tests.stockfishchess.org/tests/view/5efcb76e59f6f035328940ed

Passed LTC
LLR: 2.94 (-2.94,2.94) {-1.50,0.50}
Total: 77264 W: 9518 L: 9518 D: 58228
Ptnml(0-2): 402, 6766, 24321, 6716, 427
https://tests.stockfishchess.org/tests/view/5efd2ad759f6f03532894143

closes https://github.com/official-stockfish/Stockfish/pull/2792

Bench: 4431626
---
 src/evaluate.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 48db2b3b..bb1724a4 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -777,7 +777,6 @@ namespace {
         }
         else if (  pos.non_pawn_material(WHITE) == RookValueMg
                 && pos.non_pawn_material(BLACK) == RookValueMg
-                && !pe->passed_pawns(strongSide)
                 && pos.count<PAWN>(strongSide) - pos.count<PAWN>(~strongSide) <= 1
                 && bool(KingSide & pos.pieces(strongSide, PAWN)) != bool(QueenSide & pos.pieces(strongSide, PAWN))
                 && (attacks_bb<KING>(pos.square<KING>(~strongSide)) & pos.pieces(~strongSide, PAWN)))

From c5b2a92cd17c65a639ec6739dd511767f65e188d Mon Sep 17 00:00:00 2001
From: protonspring <mike@whiteley.org>
Date: Tue, 30 Jun 2020 10:17:50 -0600
Subject: [PATCH 11/86] denormalize KRKP.

A non-functional code style change that denormalizes the KRKP endgame,
making it somewhat easier to read.

closes https://github.com/official-stockfish/Stockfish/pull/2786

No functional change
---
 src/endgame.cpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/endgame.cpp b/src/endgame.cpp
index be0755a8..40f49dce 100644
--- a/src/endgame.cpp
+++ b/src/endgame.cpp
@@ -181,15 +181,15 @@ Value Endgame<KRKP>::operator()(const Position& pos) const {
   assert(verify_material(pos, strongSide, RookValueMg, 0));
   assert(verify_material(pos, weakSide, VALUE_ZERO, 1));
 
-  Square strongKing = relative_square(strongSide, pos.square<KING>(strongSide));
-  Square weakKing   = relative_square(strongSide, pos.square<KING>(weakSide));
-  Square strongRook = relative_square(strongSide, pos.square<ROOK>(strongSide));
-  Square weakPawn   = relative_square(strongSide, pos.square<PAWN>(weakSide));
-  Square queeningSquare = make_square(file_of(weakPawn), RANK_1);
+  Square strongKing = pos.square<KING>(strongSide);
+  Square weakKing   = pos.square<KING>(weakSide);
+  Square strongRook = pos.square<ROOK>(strongSide);
+  Square weakPawn   = pos.square<PAWN>(weakSide);
+  Square queeningSquare = make_square(file_of(weakPawn), relative_rank(weakSide, RANK_8));
   Value result;
 
   // If the stronger side's king is in front of the pawn, it's a win
-  if (forward_file_bb(WHITE, strongKing) & weakPawn)
+  if (forward_file_bb(strongSide, strongKing) & weakPawn)
       result = RookValueEg - distance(strongKing, weakPawn);
 
   // If the weaker side's king is too far from the pawn and the rook,
@@ -200,15 +200,15 @@ Value Endgame<KRKP>::operator()(const Position& pos) const {
 
   // If the pawn is far advanced and supported by the defending king,
   // the position is drawish
-  else if (   rank_of(weakKing) <= RANK_3
+  else if (   relative_rank(strongSide, weakKing) <= RANK_3
            && distance(weakKing, weakPawn) == 1
-           && rank_of(strongKing) >= RANK_4
+           && relative_rank(strongSide, strongKing) >= RANK_4
            && distance(strongKing, weakPawn) > 2 + (pos.side_to_move() == strongSide))
       result = Value(80) - 8 * distance(strongKing, weakPawn);
 
   else
-      result =  Value(200) - 8 * (  distance(strongKing, weakPawn + SOUTH)
-                                  - distance(weakKing, weakPawn + SOUTH)
+      result =  Value(200) - 8 * (  distance(strongKing, weakPawn + pawn_push(weakSide))
+                                  - distance(weakKing, weakPawn + pawn_push(weakSide))
                                   - distance(weakPawn, queeningSquare));
 
   return strongSide == pos.side_to_move() ? result : -result;

From 7225d254f90c7b9d64d4adf85ec2d319c6cf75a0 Mon Sep 17 00:00:00 2001
From: Stefan Geschwentner <stgeschwentner@gmail.com>
Date: Mon, 6 Jul 2020 09:30:23 +0200
Subject: [PATCH 12/86] Add a rank based bonus for blocked pawns.

Fix for overevaluated blocked pawns on the 5th and 6th rank.
This is a rewrite of the original idea that uses only two parameters.
Thanks to rocky640 for pointing this out.

STC:
LLR: 2.94 (-2.94,2.94) {-0.50,1.50}
Total: 50800 W: 9707 L: 9446 D: 31647
Ptnml(0-2): 831, 5851, 11822, 6018, 878
https://tests.stockfishchess.org/tests/view/5f00b4f359f6f03532894304

LTC:
LLR: 2.93 (-2.94,2.94) {0.25,1.75}
Total: 52064 W: 6477 L: 6167 D: 39420
Ptnml(0-2): 331, 4628, 15834, 4878, 361
https://tests.stockfishchess.org/tests/view/5f0115fe59f6f03532894345

closes https://github.com/official-stockfish/Stockfish/pull/2794

Bench: 4882833
---
 src/pawns.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/pawns.cpp b/src/pawns.cpp
index d365ba12..f18e0315 100644
--- a/src/pawns.cpp
+++ b/src/pawns.cpp
@@ -38,6 +38,9 @@ namespace {
   constexpr Score WeakLever     = S( 0, 56);
   constexpr Score WeakUnopposed = S(13, 27);
 
+  // Bonus for blocked pawns at 5th or 6th rank
+  constexpr Score BlockedPawn[2] = { S(-10, -3), S(-3, 3) };
+
   constexpr Score BlockedStorm[RANK_NB] = {
     S(0, 0), S(0, 0), S(76, 78), S(-10, 15), S(-7, 10), S(-4, 6), S(-1, 2)
   };
@@ -169,6 +172,9 @@ namespace {
         if (!support)
             score -=  Doubled * doubled
                     + WeakLever * more_than_one(lever);
+
+        if (blocked && r > RANK_4)
+            score += BlockedPawn[r-4];
     }
 
     return score;

From 76a039027d14640852f60bda6d62ca16bdac3b9e Mon Sep 17 00:00:00 2001
From: Alain SAVARD <support@multicim.com>
Date: Mon, 6 Jul 2020 22:43:54 -0400
Subject: [PATCH 13/86] Clean-up en passant processing

The goal of this PR is to better document how we process
the ep square (if any) given in the position fen command, and to
output a more meaningful (and consistent) debug fen on the "d"
command. The implementation follows
https://en.wikipedia.org/wiki/X-FEN#Encoding_en-passant
Following X-FEN, it is "valid" to record ep even if ep would put the king en prise.

fixes #2784

closes https://github.com/official-stockfish/Stockfish/pull/2797

No functional change
---
 src/position.cpp | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/src/position.cpp b/src/position.cpp
index 6ef7aedc..396bff5f 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -178,9 +178,9 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th
 
    4) En passant target square (in algebraic notation). If there's no en passant
       target square, this is "-". If a pawn has just made a 2-square move, this
-      is the position "behind" the pawn. This is recorded only if there is a pawn
-      in position to make an en passant capture, and if there really is a pawn
-      that might have advanced two squares.
+      is the position "behind" the pawn. Following X-FEN standard, this is recorded only
+      if there is a pawn in position to make an en passant capture, and if there really
+      is a pawn that might have advanced two squares.
 
    5) Halfmove clock. This is the number of halfmoves since the last pawn advance
       or capture. This is used to determine if a draw can be claimed under the
@@ -251,17 +251,25 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th
       set_castling_right(c, rsq);
   }
 
-  // 4. En passant square. Ignore if no pawn capture is possible
+  // 4. En passant square.
+  // Ignore if square is invalid or not on side to move relative rank 6.
+  bool enpassant = false;
+
   if (   ((ss >> col) && (col >= 'a' && col <= 'h'))
-      && ((ss >> row) && (row == '3' || row == '6')))
+      && ((ss >> row) && (row == (sideToMove == WHITE ? '6' : '3'))))
   {
       st->epSquare = make_square(File(col - 'a'), Rank(row - '1'));
 
-      if (   !(attackers_to(st->epSquare) & pieces(sideToMove, PAWN))
-          || !(pieces(~sideToMove, PAWN) & (st->epSquare + pawn_push(~sideToMove))))
-          st->epSquare = SQ_NONE;
+      // En passant square will be considered only if
+      // a) side to move have a pawn threatening epSquare
+      // b) there is an enemy pawn in front of epSquare
+      // c) there is no piece on epSquare or behind epSquare
+      enpassant = pawn_attacks_bb(~sideToMove, st->epSquare) & pieces(sideToMove, PAWN)
+               && (pieces(~sideToMove, PAWN) & (st->epSquare + pawn_push(~sideToMove)))
+               && !(pieces() & (st->epSquare | (st->epSquare + pawn_push(sideToMove))));
   }
-  else
+
+  if (!enpassant)
       st->epSquare = SQ_NONE;
 
   // 5-6. Halfmove clock and fullmove number

From 804a29c738847b7ea5f8a4bff001964bd234d332 Mon Sep 17 00:00:00 2001
From: FauziAkram <fauzi.dabat@hotmail.com>
Date: Wed, 8 Jul 2020 01:29:03 +0300
Subject: [PATCH 14/86] Connected / blocked pawns simplification

There is no need to score blocked pawns at many places.
The idea originated from: Rocky
Tuning and testing by: Fauzi

Passed STC:
https://tests.stockfishchess.org/tests/view/5f04f8fd59f6f035328945d4
LLR: 2.94 (-2.94,2.94) {-1.50,0.50}
Total: 6352 W: 1299 L: 1118 D: 3935
Ptnml(0-2): 89, 695, 1469, 792, 131

Passed LTC:
https://tests.stockfishchess.org/tests/view/5f0527bd59f6f035328945e3
LLR: 2.94 (-2.94,2.94) {-1.50,0.50}
Total: 27648 W: 3517 L: 3433 D: 20698
Ptnml(0-2): 177, 2561, 8301, 2571, 214

closes https://github.com/official-stockfish/Stockfish/pull/2799

Bench: 4734746
---
 src/pawns.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pawns.cpp b/src/pawns.cpp
index f18e0315..7f8d451a 100644
--- a/src/pawns.cpp
+++ b/src/pawns.cpp
@@ -39,7 +39,7 @@ namespace {
   constexpr Score WeakUnopposed = S(13, 27);
 
   // Bonus for blocked pawns at 5th or 6th rank
-  constexpr Score BlockedPawn[2] = { S(-10, -3), S(-3, 3) };
+  constexpr Score BlockedPawn[2] = { S(-11, -4), S(-3, 4) };
 
   constexpr Score BlockedStorm[RANK_NB] = {
     S(0, 0), S(0, 0), S(76, 78), S(-10, 15), S(-7, 10), S(-4, 6), S(-1, 2)
@@ -148,7 +148,7 @@ namespace {
         // Score this pawn
         if (support | phalanx)
         {
-            int v =  Connected[r] * (4 + 2 * bool(phalanx) - 2 * bool(opposed) - bool(blocked)) / 2
+            int v =  Connected[r] * (2 + bool(phalanx) - bool(opposed))
                    + 21 * popcount(support);
 
             score += make_score(v, v * (r - 2) / 4);

From bf5ce1c214f8f8e3f98e5e3ac43db0dd28617e35 Mon Sep 17 00:00:00 2001
From: mstembera <MissingEmail@email>
Date: Sun, 5 Jul 2020 15:17:04 -0700
Subject: [PATCH 15/86] Simplify make_promotions()

Remove special case handling of QUIET_CHECKS in make_promotions()

STC https://tests.stockfishchess.org/tests/view/5f055dbb59f6f035328945fb
LLR: 2.98 (-2.94,2.94) {-1.50,0.50}
Total: 42808 W: 8177 L: 8054 D: 26577
Ptnml(0-2): 665, 4890, 10201, 4953, 695

LTC https://tests.stockfishchess.org/tests/view/5f06231a59f6f03532894661
LLR: 2.96 (-2.94,2.94) {-1.50,0.50}
Total: 9616 W: 1214 L: 1111 D: 7291
Ptnml(0-2): 53, 821, 2965, 908, 61

closes https://github.com/official-stockfish/Stockfish/pull/2800

Bench: 4576410
---
 src/movegen.cpp | 22 ++++++++++------------
 src/search.cpp  |  4 ++--
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/src/movegen.cpp b/src/movegen.cpp
index 17203a95..4ff12fc6 100644
--- a/src/movegen.cpp
+++ b/src/movegen.cpp
@@ -29,22 +29,20 @@ namespace {
   ExtMove* make_promotions(ExtMove* moveList, Square to, Square ksq) {
 
     if (Type == CAPTURES || Type == EVASIONS || Type == NON_EVASIONS)
+    {
         *moveList++ = make<PROMOTION>(to - D, to, QUEEN);
+        if (attacks_bb<KNIGHT>(to) & ksq)
+            *moveList++ = make<PROMOTION>(to - D, to, KNIGHT);
+    }
 
     if (Type == QUIETS || Type == EVASIONS || Type == NON_EVASIONS)
     {
         *moveList++ = make<PROMOTION>(to - D, to, ROOK);
         *moveList++ = make<PROMOTION>(to - D, to, BISHOP);
-        *moveList++ = make<PROMOTION>(to - D, to, KNIGHT);
+        if (!(attacks_bb<KNIGHT>(to) & ksq))
+            *moveList++ = make<PROMOTION>(to - D, to, KNIGHT);
     }
 
-    // Knight promotion is the only promotion that can give a direct check
-    // that's not already included in the queen promotion.
-    if (Type == QUIET_CHECKS && (attacks_bb<KNIGHT>(to) & ksq))
-        *moveList++ = make<PROMOTION>(to - D, to, KNIGHT);
-    else
-        (void)ksq; // Silence a warning under MSVC
-
     return moveList;
   }
 
@@ -263,8 +261,8 @@ namespace {
 } // namespace
 
 
-/// <CAPTURES>     Generates all pseudo-legal captures and queen promotions
-/// <QUIETS>       Generates all pseudo-legal non-captures and underpromotions
+/// <CAPTURES>     Generates all pseudo-legal captures plus queen and checking knight promotions
+/// <QUIETS>       Generates all pseudo-legal non-captures and underpromotions(except checking knight)
 /// <NON_EVASIONS> Generates all pseudo-legal captures and non-captures
 ///
 /// Returns a pointer to the end of the move list.
@@ -287,8 +285,8 @@ template ExtMove* generate<QUIETS>(const Position&, ExtMove*);
 template ExtMove* generate<NON_EVASIONS>(const Position&, ExtMove*);
 
 
-/// generate<QUIET_CHECKS> generates all pseudo-legal non-captures and knight
-/// underpromotions that give check. Returns a pointer to the end of the move list.
+/// generate<QUIET_CHECKS> generates all pseudo-legal non-captures.
+/// Returns a pointer to the end of the move list.
 template<>
 ExtMove* generate<QUIET_CHECKS>(const Position& pos, ExtMove* moveList) {
 
diff --git a/src/search.cpp b/src/search.cpp
index f14bdf77..1610c206 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1486,8 +1486,8 @@ moves_loop: // When in check, search starts from here
 
     // Initialize a MovePicker object for the current position, and prepare
     // to search the moves. Because the depth is <= 0 here, only captures,
-    // queen promotions and checks (only if depth >= DEPTH_QS_CHECKS) will
-    // be generated.
+    // queen and checking knight promotions, and other checks(only if depth >= DEPTH_QS_CHECKS)
+    // will be generated.
     MovePicker mp(pos, ttMove, depth, &thisThread->mainHistory,
                                       &thisThread->captureHistory,
                                       contHist,

From 4006f2c9132db034a27a94be33070d6aaab75b24 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Thu, 9 Jul 2020 22:01:06 +0200
Subject: [PATCH 16/86] Small cleanups

closes https://github.com/official-stockfish/Stockfish/pull/2772

No functional change
---
 Readme.md         |  3 ---
 src/benchmark.cpp |  2 +-
 src/bitboard.h    | 32 ++++++++++++++++----------------
 src/evaluate.cpp  | 10 +++++-----
 src/search.cpp    | 14 +++++++++-----
 5 files changed, 31 insertions(+), 30 deletions(-)

diff --git a/Readme.md b/Readme.md
index e60ac718..823518d1 100644
--- a/Readme.md
+++ b/Readme.md
@@ -75,9 +75,6 @@ Currently, Stockfish has the following UCI options:
     Assume a time delay of x ms due to network and GUI overheads. This is useful to
     avoid losses on time in those cases.
 
-  * #### Minimum Thinking Time
-    Search for at least x ms per move.
-
   * #### Slow Mover
     Lower values will make Stockfish take less time in games, higher values will
     make it think longer.
diff --git a/src/benchmark.cpp b/src/benchmark.cpp
index f338cdda..3299f373 100644
--- a/src/benchmark.cpp
+++ b/src/benchmark.cpp
@@ -88,7 +88,7 @@ const vector<string> Defaults = {
 
   // Chess 960
   "setoption name UCI_Chess960 value true",
-  "bbqnnrkr/pppppppp/8/8/8/8/PPPPPPPP/BBQNNRKR w KQkq - 0 1 moves g2g3 d7d5 d2d4 c8h3 c1g5 e8d6 g5e7 f7f6",
+  "bbqnnrkr/pppppppp/8/8/8/8/PPPPPPPP/BBQNNRKR w HFhf - 0 1 moves g2g3 d7d5 d2d4 c8h3 c1g5 e8d6 g5e7 f7f6",
   "setoption name UCI_Chess960 value false"
 };
 
diff --git a/src/bitboard.h b/src/bitboard.h
index 1c598108..afeb40ec 100644
--- a/src/bitboard.h
+++ b/src/bitboard.h
@@ -124,7 +124,7 @@ inline Bitboard  operator&(Square s, Bitboard b) { return b & s; }
 inline Bitboard  operator|(Square s, Bitboard b) { return b | s; }
 inline Bitboard  operator^(Square s, Bitboard b) { return b ^ s; }
 
-inline Bitboard  operator|(Square s, Square s2) { return square_bb(s) | s2; }
+inline Bitboard  operator|(Square s1, Square s2) { return square_bb(s1) | s2; }
 
 constexpr bool more_than_one(Bitboard b) {
   return b & (b - 1);
@@ -138,19 +138,19 @@ constexpr bool opposite_colors(Square s1, Square s2) {
 /// rank_bb() and file_bb() return a bitboard representing all the squares on
 /// the given file or rank.
 
-inline Bitboard rank_bb(Rank r) {
+constexpr Bitboard rank_bb(Rank r) {
   return Rank1BB << (8 * r);
 }
 
-inline Bitboard rank_bb(Square s) {
+constexpr Bitboard rank_bb(Square s) {
   return rank_bb(rank_of(s));
 }
 
-inline Bitboard file_bb(File f) {
+constexpr Bitboard file_bb(File f) {
   return FileABB << f;
 }
 
-inline Bitboard file_bb(Square s) {
+constexpr Bitboard file_bb(Square s) {
   return file_bb(file_of(s));
 }
 
@@ -195,16 +195,16 @@ constexpr Bitboard pawn_double_attacks_bb(Bitboard b) {
 
 
 /// adjacent_files_bb() returns a bitboard representing all the squares on the
-/// adjacent files of the given one.
+/// adjacent files of a given square.
 
-inline Bitboard adjacent_files_bb(Square s) {
+constexpr Bitboard adjacent_files_bb(Square s) {
   return shift<EAST>(file_bb(s)) | shift<WEST>(file_bb(s));
 }
 
 
-/// line_bb(Square, Square) returns a bitboard representing an entire line,
-/// from board edge to board edge, that intersects the given squares. If the
-/// given squares are not on a same file/rank/diagonal, returns 0. For instance,
+/// line_bb() returns a bitboard representing an entire line (from board edge
+/// to board edge) that intersects the two given squares. If the given squares
+/// are not on a same file/rank/diagonal, the function returns 0. For instance,
 /// line_bb(SQ_C4, SQ_F7) will return a bitboard with the A2-G8 diagonal.
 
 inline Bitboard line_bb(Square s1, Square s2) {
@@ -215,8 +215,8 @@ inline Bitboard line_bb(Square s1, Square s2) {
 
 
 /// between_bb() returns a bitboard representing squares that are linearly
-/// between the given squares (excluding the given squares). If the given
-/// squares are not on a same file/rank/diagonal, return 0. For instance,
+/// between the two given squares (excluding the given squares). If the given
+/// squares are not on a same file/rank/diagonal, we return 0. For instance,
 /// between_bb(SQ_C4, SQ_F7) will return a bitboard with squares D5 and E6.
 
 inline Bitboard between_bb(Square s1, Square s2) {
@@ -229,7 +229,7 @@ inline Bitboard between_bb(Square s1, Square s2) {
 /// in front of the given one, from the point of view of the given color. For instance,
 /// forward_ranks_bb(BLACK, SQ_D3) will return the 16 squares on ranks 1 and 2.
 
-inline Bitboard forward_ranks_bb(Color c, Square s) {
+constexpr Bitboard forward_ranks_bb(Color c, Square s) {
   return c == WHITE ? ~Rank1BB << 8 * relative_rank(WHITE, s)
                     : ~Rank8BB >> 8 * relative_rank(BLACK, s);
 }
@@ -238,7 +238,7 @@ inline Bitboard forward_ranks_bb(Color c, Square s) {
 /// forward_file_bb() returns a bitboard representing all the squares along the
 /// line in front of the given one, from the point of view of the given color.
 
-inline Bitboard forward_file_bb(Color c, Square s) {
+constexpr Bitboard forward_file_bb(Color c, Square s) {
   return forward_ranks_bb(c, s) & file_bb(s);
 }
 
@@ -247,7 +247,7 @@ inline Bitboard forward_file_bb(Color c, Square s) {
 /// be attacked by a pawn of the given color when it moves along its file, starting
 /// from the given square.
 
-inline Bitboard pawn_attack_span(Color c, Square s) {
+constexpr Bitboard pawn_attack_span(Color c, Square s) {
   return forward_ranks_bb(c, s) & adjacent_files_bb(s);
 }
 
@@ -255,7 +255,7 @@ inline Bitboard pawn_attack_span(Color c, Square s) {
 /// passed_pawn_span() returns a bitboard which can be used to test if a pawn of
 /// the given color and on the given square is a passed pawn.
 
-inline Bitboard passed_pawn_span(Color c, Square s) {
+constexpr Bitboard passed_pawn_span(Color c, Square s) {
   return pawn_attack_span(c, s) | forward_file_bb(c, s);
 }
 
diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index bb1724a4..6f2dd69b 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -719,9 +719,9 @@ namespace {
   }
 
 
-  // Evaluation::winnable() adjusts the mg and eg score components based on the
-  // known attacking/defending status of the players. A single value is derived
-  // by interpolation from the mg and eg values and returned.
+  // Evaluation::winnable() adjusts the midgame and endgame score components, based on
+  // the known attacking/defending status of the players. The final value is derived
+  // by interpolation from the midgame and endgame values.
 
   template<Tracing T>
   Value Evaluation<T>::winnable(Score score) const {
@@ -764,7 +764,7 @@ namespace {
     Color strongSide = eg > VALUE_DRAW ? WHITE : BLACK;
     int sf = me->scale_factor(pos, strongSide);
 
-    // If scale is not already specific, scale down the endgame via general heuristics
+    // If scale factor is not already specific, scale down via general heuristics
     if (sf == SCALE_FACTOR_NORMAL)
     {
         if (pos.opposite_bishops())
@@ -779,7 +779,7 @@ namespace {
                 && pos.non_pawn_material(BLACK) == RookValueMg
                 && pos.count<PAWN>(strongSide) - pos.count<PAWN>(~strongSide) <= 1
                 && bool(KingSide & pos.pieces(strongSide, PAWN)) != bool(QueenSide & pos.pieces(strongSide, PAWN))
-                && (attacks_bb<KING>(pos.square<KING>(~strongSide)) & pos.pieces(~strongSide, PAWN)))
+                && (attackedBy[~strongSide][KING] & pos.pieces(~strongSide, PAWN)))
             sf = 36;
         else if (pos.count<QUEEN>() == 1)
             sf = 37 + 3 * (pos.count<QUEEN>(WHITE) == 1 ? pos.count<BISHOP>(BLACK) + pos.count<KNIGHT>(BLACK)
diff --git a/src/search.cpp b/src/search.cpp
index 1610c206..720a9100 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -263,10 +263,10 @@ void MainThread::search() {
 
   Thread* bestThread = this;
 
-  if (int(Options["MultiPV"]) == 1 &&
-      !Limits.depth &&
-      !(Skill(Options["Skill Level"]).enabled() || int(Options["UCI_LimitStrength"])) &&
-      rootMoves[0].pv[0] != MOVE_NONE)
+  if (   int(Options["MultiPV"]) == 1
+      && !Limits.depth
+      && !(Skill(Options["Skill Level"]).enabled() || int(Options["UCI_LimitStrength"]))
+      && rootMoves[0].pv[0] != MOVE_NONE)
       bestThread = Threads.get_best_thread();
 
   bestPreviousScore = bestThread->rootMoves[0].score;
@@ -670,7 +670,11 @@ namespace {
     ttPv = PvNode || (ttHit && tte->is_pv());
     formerPv = ttPv && !PvNode;
 
-    if (ttPv && depth > 12 && ss->ply - 1 < MAX_LPH && !priorCapture && is_ok((ss-1)->currentMove))
+    if (   ttPv
+        && depth > 12
+        && ss->ply - 1 < MAX_LPH
+        && !priorCapture
+        && is_ok((ss-1)->currentMove))
         thisThread->lowPlyHistory[ss->ply - 1][from_to((ss-1)->currentMove)] << stat_bonus(depth - 5);
 
     // thisThread->ttHitAverage can be used to approximate the running average of ttHit

From 5e91c5dcc8066e9f346a10010ddce70f2d317ef6 Mon Sep 17 00:00:00 2001
From: Vizvezdenec <Vizvezdenec@gmail.com>
Date: Sat, 11 Jul 2020 00:06:55 +0300
Subject: [PATCH 17/86] Maximize usage of transposition table in probcut

Probcut is a heuristic that wasn't changed a lot in past years,
all attempts to change it using information / writing info to transposition table failed.

This patch has a number of differences that can be summarized as follows:

* For TT write/read we use depth - 3. Because probcut search is depth - 4 but we actually do the move prior to it so effectively we do depth - 3 search;
* Whenever the depth of the entry from the transposition table is >= depth - 3, we either produce a cutoff or refuse to even do the probcut search. This allows us to write probcut info to the transposition table, because we know that we wouldn't be overwriting some deeper data with our depth - 3 search - this is an important aspect of this patch;
* For some not really known reason this patch completely ignores tte->bound() - which was the case for the previous patch that made probcut interact with TT; maybe the second point above is the reason, although it's unproven.

A first version of this patch passed STC and LTC

passed STC
https://tests.stockfishchess.org/tests/view/5f05908a59f6f03532894613
LLR: 2.95 (-2.94,2.94) {-0.50,1.50}
Total: 95776 W: 18300 L: 17973 D: 59503
Ptnml(0-2): 1646, 10944, 22377, 11279, 1642

passed LTC
https://tests.stockfishchess.org/tests/view/5f06b54059f6f035328946bb
LLR: 2.94 (-2.94,2.94) {0.25,1.75}
Total: 57128 W: 7266 L: 6938 D: 42924
Ptnml(0-2): 372, 5163, 17217, 5389, 423

However, an additional bugfix was needed to avoid checking a condition on ttMove if it was not available. This passed non-regression bounds on top of the first version:

at STC
https://tests.stockfishchess.org/tests/view/5f080e5059f6f03532894766
LLR: 2.94 (-2.94,2.94) {-1.50,0.50}
Total: 14096 W: 2800 L: 2628 D: 8668
Ptnml(0-2): 225, 1620, 3238, 1688, 277

at LTC
https://tests.stockfishchess.org/tests/view/5f0836a559f6f0353289479c
LLR: 2.95 (-2.94,2.94) {-1.50,0.50}
Total: 25352 W: 3228 L: 3139 D: 18985
Ptnml(0-2): 175, 2350, 7549, 2415, 187

closes https://github.com/official-stockfish/Stockfish/pull/2804

Bench 4540940
---
 src/search.cpp | 41 ++++++++++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index 720a9100..6cf2f90d 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -596,7 +596,7 @@ namespace {
     Key posKey;
     Move ttMove, move, excludedMove, bestMove;
     Depth extension, newDepth;
-    Value bestValue, value, ttValue, eval, maxValue;
+    Value bestValue, value, ttValue, eval, maxValue, probcutBeta;
     bool ttHit, ttPv, formerPv, givesCheck, improving, didLMR, priorCapture;
     bool captureOrPromotion, doFullDepthSearch, moveCountPruning,
          ttCapture, singularQuietLMR;
@@ -871,23 +871,33 @@ namespace {
         }
     }
 
+    probcutBeta = beta + 176 - 49 * improving;
+
     // Step 10. ProbCut (~10 Elo)
     // If we have a good enough capture and a reduced search returns a value
     // much above beta, we can (almost) safely prune the previous move.
     if (   !PvNode
         &&  depth > 4
-        &&  abs(beta) < VALUE_TB_WIN_IN_MAX_PLY)
+        &&  abs(beta) < VALUE_TB_WIN_IN_MAX_PLY
+        && !(   ttHit
+             && tte->depth() >= depth - 3
+             && ttValue != VALUE_NONE
+             && ttValue < probcutBeta))
     {
-        Value raisedBeta = beta + 176 - 49 * improving;
-        assert(raisedBeta < VALUE_INFINITE);
-        MovePicker mp(pos, ttMove, raisedBeta - ss->staticEval, &captureHistory);
+        if (   ttHit
+            && tte->depth() >= depth - 3
+            && ttValue != VALUE_NONE
+            && ttValue >= probcutBeta
+            && ttMove
+            && pos.capture_or_promotion(ttMove))
+            return probcutBeta;
+
+        assert(probcutBeta < VALUE_INFINITE);
+        MovePicker mp(pos, ttMove, probcutBeta - ss->staticEval, &captureHistory);
         int probCutCount = 0;
 
         while (   (move = mp.next_move()) != MOVE_NONE
-               && probCutCount < 2 + 2 * cutNode
-               && !(   move == ttMove
-                    && tte->depth() >= depth - 4
-                    && ttValue < raisedBeta))
+               && probCutCount < 2 + 2 * cutNode)
             if (move != excludedMove && pos.legal(move))
             {
                 assert(pos.capture_or_promotion(move));
@@ -905,16 +915,21 @@ namespace {
                 pos.do_move(move, st);
 
                 // Perform a preliminary qsearch to verify that the move holds
-                value = -qsearch<NonPV>(pos, ss+1, -raisedBeta, -raisedBeta+1);
+                value = -qsearch<NonPV>(pos, ss+1, -probcutBeta, -probcutBeta+1);
 
                 // If the qsearch held, perform the regular search
-                if (value >= raisedBeta)
-                    value = -search<NonPV>(pos, ss+1, -raisedBeta, -raisedBeta+1, depth - 4, !cutNode);
+                if (value >= probcutBeta)
+                    value = -search<NonPV>(pos, ss+1, -probcutBeta, -probcutBeta+1, depth - 4, !cutNode);
 
                 pos.undo_move(move);
 
-                if (value >= raisedBeta)
+                if (value >= probcutBeta)
+                {
+                    tte->save(posKey, value_to_tt(value, ss->ply), ttPv,
+                        BOUND_LOWER,
+                        depth - 3, move, ss->staticEval);
                     return value;
+                }
             }
     }
 

From 1f3bd968bb194a1f42af661cca9ec445c13978e8 Mon Sep 17 00:00:00 2001
From: SFisGOD <jonathandumale@gmail.com>
Date: Wed, 8 Jul 2020 10:09:32 +0800
Subject: [PATCH 18/86] Introduce bad outpost penalty

In some French games, Stockfish likes to bring the Knight to a bad outpost spot. This is evident in TCEC S18 Superfinal Game 63, where there is a Knight outpost on the queenside that is actually useless. Stockfish is effectively playing a piece down while holding ground against Leela's break on the kingside.

This patch turns the +56 mg bonus for a Knight outpost into a -7 mg penalty if it satisfies the following conditions:

* The outpost square is not on the CenterFiles (i.e. not on files C,D,E and F)
* The knight is not attacking non pawn enemies.
* The side where the outpost is located contains only few enemies, with a particular conditional_more_than_two() implementation

Thank you to apospa...@gmail.com for bringing this to our attention and for providing insights.
See https://groups.google.com/forum/?fromgroups=#!topic/fishcooking/dEXNzSIBgZU
Reference game: https://tcec-chess.com/#div=sf&game=63&season=18

Passed STC:
LLR: 2.93 (-2.94,2.94) {-0.50,1.50}
Total: 6960 W: 1454 L: 1247 D: 4259
Ptnml(0-2): 115, 739, 1610, 856, 160
https://tests.stockfishchess.org/tests/view/5f08221059f6f0353289477e

Passed LTC:
LLR: 2.98 (-2.94,2.94) {0.25,1.75}
Total: 21440 W: 2767 L: 2543 D: 16130
Ptnml(0-2): 122, 1904, 6462, 2092, 140
https://tests.stockfishchess.org/tests/view/5f0838ed59f6f035328947a2

Various related tests show strong results, but so far no generalizations or simplifications of conditional_more_than_two() have been found. See PR for details.

closes https://github.com/official-stockfish/Stockfish/pull/2803

Bench: 4366686
---
 src/bitboard.h   | 7 +++++++
 src/evaluate.cpp | 9 ++++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/bitboard.h b/src/bitboard.h
index afeb40ec..15ec4153 100644
--- a/src/bitboard.h
+++ b/src/bitboard.h
@@ -130,6 +130,13 @@ constexpr bool more_than_one(Bitboard b) {
   return b & (b - 1);
 }
 
+/// Counts the occupation of the bitboard depending on the occupation of SQ_A1
+/// as in `b & (1ULL << SQ_A1) ? more_than_two(b) : more_than_one(b)`
+
+constexpr bool conditional_more_than_two(Bitboard b) {
+  return b & (b - 1) & (b - 2);
+}
+
 constexpr bool opposite_colors(Square s1, Square s2) {
   return (s1 + rank_of(s1) + s2 + rank_of(s2)) & 1;
 }
diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 6f2dd69b..ca6ea5c4 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -134,6 +134,7 @@ namespace {
   };
 
   // Assorted bonuses and penalties
+  constexpr Score BadOutpost          = S( -7, 36);
   constexpr Score BishopOnKingRing    = S( 24,  0);
   constexpr Score BishopPawns         = S(  3,  7);
   constexpr Score BishopXRayPawns     = S(  4,  5);
@@ -310,7 +311,13 @@ namespace {
         {
             // Bonus if piece is on an outpost square or can reach one
             bb = OutpostRanks & attackedBy[Us][PAWN] & ~pe->pawn_attacks_span(Them);
-            if (bb & s)
+            if (   Pt == KNIGHT
+                && bb & s & ~CenterFiles
+                && !(b & pos.pieces(Them) & ~pos.pieces(PAWN))
+                && !conditional_more_than_two(
+                      pos.pieces(Them) & ~pos.pieces(PAWN) & (s & QueenSide ? QueenSide : KingSide)))
+                score += BadOutpost;
+            else if (bb & s)
                 score += Outpost[Pt == BISHOP];
             else if (Pt == KNIGHT && bb & b & ~pos.pieces(Us))
                 score += ReachableOutpost;

From 6c197c3964ca0c637ff1f646dc7e6653b1bb4b45 Mon Sep 17 00:00:00 2001
From: syzygy1 <3028851+syzygy1@users.noreply.github.com>
Date: Sat, 11 Jul 2020 16:25:34 +0200
Subject: [PATCH 19/86] Corrects a functional change in a cleanup patch.

This corrects a functional change in
https://github.com/official-stockfish/Stockfish/commit/ddcbacd04d1c860e808202ce8c1206c8acdca627
changing evaluation of KPPvK. Bench remains unchanged at low depth

With this patch, 8/8/5k1p/8/7p/7K/8/8 b - - 1 11 is again correctly evaluated as a draw.

closes https://github.com/official-stockfish/Stockfish/pull/2807

Bench: 4366686
---
 src/endgame.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/endgame.cpp b/src/endgame.cpp
index 40f49dce..a8ceb648 100644
--- a/src/endgame.cpp
+++ b/src/endgame.cpp
@@ -589,8 +589,8 @@ ScaleFactor Endgame<KPsK>::operator()(const Position& pos) const {
   Bitboard strongPawns = pos.pieces(strongSide, PAWN);
 
   // If all pawns are ahead of the king on a single rook file, it's a draw.
-  if (!((strongPawns & ~FileABB) || (strongPawns & ~FileHBB)) &&
-      !(strongPawns & ~passed_pawn_span(weakSide, weakKing)))
+  if (   !(strongPawns & ~(FileABB | FileHBB))
+      && !(strongPawns & ~passed_pawn_span(weakSide, weakKing)))
       return SCALE_FACTOR_DRAW;
 
   return SCALE_FACTOR_NONE;

From c3092c54bc6fb837137365fc60eb57bd188deaca Mon Sep 17 00:00:00 2001
From: mstembera <MissingEmail@email>
Date: Sun, 12 Jul 2020 13:58:00 -0700
Subject: [PATCH 20/86] Multiple lazy stages.

An extension of the lazy eval idea: when the score is sufficiently large
we now skip more granular parts of the eval.

Inspired by an original patch by Moez Jellouli
https://tests.stockfishchess.org/tests/view/5f03b2a159f6f03532894529
Credit to him!

STC https://tests.stockfishchess.org/tests/view/5f0a862c59f6f03532894924
LLR: 2.95 (-2.94,2.94) {-0.50,1.50}
Total: 13504 W: 2684 L: 2472 D: 8348
Ptnml(0-2): 229, 1496, 3111, 1666, 250

LTC https://tests.stockfishchess.org/tests/view/5f0ac0e159f6f0353289495b
LLR: 2.94 (-2.94,2.94) {0.25,1.75}
Total: 31312 W: 3926 L: 3677 D: 23709
Ptnml(0-2): 185, 2773, 9509, 2986, 203

closes https://github.com/official-stockfish/Stockfish/pull/2814

bench: 4541608
---
 src/evaluate.cpp | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index ca6ea5c4..dbb725d4 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -74,7 +74,8 @@ using namespace Trace;
 namespace {
 
   // Threshold for lazy and space evaluation
-  constexpr Value LazyThreshold  = Value(1400);
+  constexpr Value LazyThreshold1  = Value(1400);
+  constexpr Value LazyThreshold2  = Value(1300);
   constexpr Value SpaceThreshold = Value(12222);
 
   // KingAttackWeights[PieceType] contains king attack weights by piece type
@@ -786,7 +787,7 @@ namespace {
                 && pos.non_pawn_material(BLACK) == RookValueMg
                 && pos.count<PAWN>(strongSide) - pos.count<PAWN>(~strongSide) <= 1
                 && bool(KingSide & pos.pieces(strongSide, PAWN)) != bool(QueenSide & pos.pieces(strongSide, PAWN))
-                && (attackedBy[~strongSide][KING] & pos.pieces(~strongSide, PAWN)))
+                && (attacks_bb<KING>(pos.square<KING>(~strongSide)) & pos.pieces(~strongSide, PAWN)))
             sf = 36;
         else if (pos.count<QUEEN>() == 1)
             sf = 37 + 3 * (pos.count<QUEEN>(WHITE) == 1 ? pos.count<BISHOP>(BLACK) + pos.count<KNIGHT>(BLACK)
@@ -837,9 +838,12 @@ namespace {
     score += pe->pawn_score(WHITE) - pe->pawn_score(BLACK);
 
     // Early exit if score is high
-    Value v = (mg_value(score) + eg_value(score)) / 2;
-    if (abs(v) > LazyThreshold + pos.non_pawn_material() / 64)
-       return pos.side_to_move() == WHITE ? v : -v;
+    auto lazy_skip = [&](Value lazyThreshold) {
+        return abs(mg_value(score) + eg_value(score)) / 2 > lazyThreshold + pos.non_pawn_material() / 64;
+    };
+
+    if (lazy_skip(LazyThreshold1))
+        goto make_v;
 
     // Main evaluation begins here
     initialize<WHITE>();
@@ -856,12 +860,17 @@ namespace {
 
     // More complex interactions that require fully populated attack bitboards
     score +=  king<   WHITE>() - king<   BLACK>()
-            + threats<WHITE>() - threats<BLACK>()
-            + passed< WHITE>() - passed< BLACK>()
+            + passed< WHITE>() - passed< BLACK>();
+
+    if (lazy_skip(LazyThreshold2))
+        goto make_v;
+
+    score +=  threats<WHITE>() - threats<BLACK>()
             + space<  WHITE>() - space<  BLACK>();
 
+make_v:
     // Derive single value from mg and eg parts of score
-    v = winnable(score);
+    Value v = winnable(score);
 
     // In case of tracing add all remaining individual evaluation terms
     if (T)

From d89730d5c8dcf10eb9e1d91a81f903d9fc3c949a Mon Sep 17 00:00:00 2001
From: Vizvezdenec <Vizvezdenec@gmail.com>
Date: Mon, 13 Jul 2020 20:30:58 +0300
Subject: [PATCH 21/86] Do not overwrite valuable TT data after probcut.

This patch allows an engine to write probcut data only in case
the probcut search depth is greater than transposition table depth.

passed STC
https://tests.stockfishchess.org/tests/view/5f0b52e959f6f035328949a6
LLR: 2.97 (-2.94,2.94) {-0.50,1.50}
Total: 52544 W: 10145 L: 9880 D: 32519
Ptnml(0-2): 853, 6097, 12121, 6334, 867

passed LTC
https://tests.stockfishchess.org/tests/view/5f0bd94c59f6f035328949f3
LLR: 2.93 (-2.94,2.94) {0.25,1.75}
Total: 49576 W: 6164 L: 5863 D: 37549
Ptnml(0-2): 297, 4371, 15218, 4538, 364

closes https://github.com/official-stockfish/Stockfish/pull/2815

bench 4578298
---
 src/search.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index 6cf2f90d..17ccab92 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -925,9 +925,12 @@ namespace {
 
                 if (value >= probcutBeta)
                 {
-                    tte->save(posKey, value_to_tt(value, ss->ply), ttPv,
-                        BOUND_LOWER,
-                        depth - 3, move, ss->staticEval);
+                    if ( !(ttHit
+                       && tte->depth() >= depth - 3
+                       && ttValue != VALUE_NONE))
+                        tte->save(posKey, value_to_tt(value, ss->ply), ttPv,
+                            BOUND_LOWER,
+                            depth - 3, move, ss->staticEval);
                     return value;
                 }
             }

From f0abde241d39ee4507778bf41b392492c5391652 Mon Sep 17 00:00:00 2001
From: protonspring <mike@whiteley.org>
Date: Sat, 25 Jul 2020 07:32:19 -0600
Subject: [PATCH 22/86] Remove conditional_more_than_two().

This is a functional simplification that removes the conditional_more_than_two()
function, which was quite strange and kooky. Note the very minor change to the bench
value.

See this thread for relevant comments on the passing branch:
protonspring/Stockfish@d89730d...ff35b50

STC
LLR: 2.95 (-2.94,2.94) {-1.50,0.50}
Total: 59760 W: 11411 L: 11311 D: 37038
Ptnml(0-2): 992, 6863, 14044, 7015, 966
https://tests.stockfishchess.org/tests/view/5f179988c09435d870cb9b9a

LTC
LLR: 2.93 (-2.94,2.94) {-1.50,0.50}
Total: 45208 W: 5553 L: 5497 D: 34158
Ptnml(0-2): 315, 4081, 13761, 4127, 320
https://tests.stockfishchess.org/tests/view/5f184847c09435d870cb9bee

closes https://github.com/official-stockfish/Stockfish/pull/2826

Bench: 4578290
---
 src/bitboard.h   |  6 ------
 src/evaluate.cpp | 12 +++++++-----
 2 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/src/bitboard.h b/src/bitboard.h
index 15ec4153..8c95de8c 100644
--- a/src/bitboard.h
+++ b/src/bitboard.h
@@ -130,12 +130,6 @@ constexpr bool more_than_one(Bitboard b) {
   return b & (b - 1);
 }
 
-/// Counts the occupation of the bitboard depending on the occupation of SQ_A1
-/// as in `b & (1ULL << SQ_A1) ? more_than_two(b) : more_than_one(b)`
-
-constexpr bool conditional_more_than_two(Bitboard b) {
-  return b & (b - 1) & (b - 2);
-}
 
 constexpr bool opposite_colors(Square s1, Square s2) {
   return (s1 + rank_of(s1) + s2 + rank_of(s2)) & 1;
diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index dbb725d4..d16648a8 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -310,13 +310,15 @@ namespace {
 
         if (Pt == BISHOP || Pt == KNIGHT)
         {
-            // Bonus if piece is on an outpost square or can reach one
+            // Bonus if the piece is on an outpost square or can reach one
+            // Reduced bonus for knights (BadOutpost) if few relevant targets
             bb = OutpostRanks & attackedBy[Us][PAWN] & ~pe->pawn_attacks_span(Them);
+            Bitboard targets = pos.pieces(Them) & ~pos.pieces(PAWN);
+
             if (   Pt == KNIGHT
-                && bb & s & ~CenterFiles
-                && !(b & pos.pieces(Them) & ~pos.pieces(PAWN))
-                && !conditional_more_than_two(
-                      pos.pieces(Them) & ~pos.pieces(PAWN) & (s & QueenSide ? QueenSide : KingSide)))
+                && bb & s & ~CenterFiles // on a side outpost
+                && !(b & targets)        // no relevant attacks
+                && (!more_than_one(targets & (s & QueenSide ? QueenSide : KingSide))))
                 score += BadOutpost;
             else if (bb & s)
                 score += Outpost[Pt == BISHOP];

From 62d3106caa2f5acf5ba32500cc19912b8f10612c Mon Sep 17 00:00:00 2001
From: UnaiCorzo <corzounai@gmail.com>
Date: Sat, 25 Jul 2020 22:30:05 +0200
Subject: [PATCH 23/86] Remove late irreversible move extension

We simplify away the late irreversible move extension, which
does not seem to be necessary in the current master.

STC
LLR: 2.93 (-2.94,2.94) {-1.50,0.50}
Total: 38584 W: 7464 L: 7342 D: 23778
Ptnml(0-2): 581, 4328, 9365, 4424, 594
https://tests.stockfishchess.org/tests/view/5f1c9669c09435d870cb9de9

LTC
LLR: 2.94 (-2.94,2.94) {-1.50,0.50}
Total: 27840 W: 3417 L: 3353 D: 21070
Ptnml(0-2): 120, 2315, 8994, 2363, 128
https://tests.stockfishchess.org/tests/view/5f1d2e22c09435d870cb9e21

closes https://github.com/official-stockfish/Stockfish/pull/2836

bench: 4829420
---
 src/search.cpp | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index 17ccab92..6ec4d803 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1067,7 +1067,7 @@ moves_loop: // When in check, search starts from here
       // search of (alpha-s, beta-s), and just one fails high on (alpha, beta),
       // then that move is singular and should be extended. To verify this we do
       // a reduced search on all the other moves but the ttMove and if the
-      // result is lower than ttValue minus a margin then we will extend the ttMove.
+      // result is lower than ttValue minus a margin, then we will extend the ttMove.
       if (    depth >= 6
           &&  move == ttMove
           && !rootNode
@@ -1131,12 +1131,6 @@ moves_loop: // When in check, search starts from here
       if (type_of(move) == CASTLING)
           extension = 1;
 
-      // Late irreversible move extension
-      if (   move == ttMove
-          && pos.rule50_count() > 80
-          && (captureOrPromotion || type_of(movedPiece) == PAWN))
-          extension = 2;
-
       // Add extension to new depth
       newDepth += extension;
 

From 33f3cfae0093b934563e1eca78486261f18e4650 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20Nicolet?= <Stephane.Nicolet@u-paris2.fr>
Date: Tue, 28 Jul 2020 10:08:09 +0200
Subject: [PATCH 24/86] Improve handling of queen imbalance

We double the bonus for potential threats by minors and rooks against
our queen, in case of "queen vs pieces imbalance". Hopefully this will
slightly improve the evaluation for this well-known Stockfish weakness.

passed STC:
LLR: 2.93 (-2.94,2.94) {-0.50,1.50}
Total: 72976 W: 14003 L: 13710 D: 45263
Ptnml(0-2): 1218, 8370, 17094, 8513, 1293
https://tests.stockfishchess.org/tests/view/5efa50eb020eec13834a977d

passed LTC:
LLR: 2.93 (-2.94,2.94) {0.25,1.75}
Total: 22232 W: 2779 L: 2560 D: 16893
Ptnml(0-2): 129, 1885, 6896, 2050, 156
https://tests.stockfishchess.org/tests/view/5f1fdd2dc09435d870cb9f13

closes https://github.com/official-stockfish/Stockfish/pull/2864

Bench: 4367349
---
 src/evaluate.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index d16648a8..b34d82f6 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -579,17 +579,21 @@ namespace {
     // Bonus for threats on the next moves against enemy queen
     if (pos.count<QUEEN>(Them) == 1)
     {
+        bool queenImbalance = pos.count<QUEEN>() == 1;
+
         Square s = pos.square<QUEEN>(Them);
-        safe = mobilityArea[Us] & ~stronglyProtected;
+        safe =   mobilityArea[Us]
+              & ~pos.pieces(Us, PAWN)
+              & ~stronglyProtected;
 
         b = attackedBy[Us][KNIGHT] & attacks_bb<KNIGHT>(s);
 
-        score += KnightOnQueen * popcount(b & safe);
+        score += KnightOnQueen * popcount(b & safe) * (1 + queenImbalance);
 
         b =  (attackedBy[Us][BISHOP] & attacks_bb<BISHOP>(s, pos.pieces()))
            | (attackedBy[Us][ROOK  ] & attacks_bb<ROOK  >(s, pos.pieces()));
 
-        score += SliderOnQueen * popcount(b & safe & attackedBy2[Us]);
+        score += SliderOnQueen * popcount(b & safe & attackedBy2[Us]) * (1 + queenImbalance);
     }
 
     if (T)

From 9587eeeb5ed29f834d4f956b92e0e732877c47a7 Mon Sep 17 00:00:00 2001
From: Stefan Geschwentner <stgeschwentner@gmail.com>
Date: Thu, 30 Jul 2020 18:56:11 +0200
Subject: [PATCH 25/86] Tweak cutnode reduction

Less reduction for second move at non-check CUT node with depth <= 10.

STC:
LLR: 2.94 (-2.94,2.94) {-0.50,1.50}
Total: 38680 W: 7490 L: 7245 D: 23945
Ptnml(0-2): 643, 4441, 8967, 4606, 683
https://tests.stockfishchess.org/tests/view/5f21e1782f7e63962b99f451

LTC:
LLR: 2.95 (-2.94,2.94) {0.25,1.75}
Total: 71976 W: 9003 L: 8636 D: 54337
Ptnml(0-2): 440, 6414, 21972, 6663, 499
https://tests.stockfishchess.org/tests/view/5f2245762f7e63962b99f4bd

closes https://github.com/official-stockfish/Stockfish/pull/2868

Bench: 4746616
---
 src/search.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/search.cpp b/src/search.cpp
index 6ec4d803..91ac60ad 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1167,6 +1167,13 @@ moves_loop: // When in check, search starts from here
       {
           Depth r = reduction(improving, depth, moveCount);
 
+          // Decrease reduction at non-check cut nodes for second move at low depths
+          if (   cutNode
+              && depth <= 10
+              && moveCount <= 2
+              && !ss->inCheck)
+              r--;
+
           // Decrease reduction if the ttHit running average is large
           if (thisThread->ttHitAverage > 473 * TtHitAverageResolution * TtHitAverageWindow / 1024)
               r--;

From 84f3e867903f62480c33243dd0ecbffd342796fc Mon Sep 17 00:00:00 2001
From: nodchip <nodchip@gmail.com>
Date: Wed, 5 Aug 2020 17:11:15 +0200
Subject: [PATCH 26/86] Add NNUE evaluation

This patch ports the efficiently updatable neural network (NNUE) evaluation to Stockfish.

Both the NNUE and the classical evaluations are available, and can be used to
assign a value to a position that is later used in alpha-beta (PVS) search to find the
best move. The classical evaluation computes this value as a function of various chess
concepts, handcrafted by experts, tested and tuned using fishtest. The NNUE evaluation
computes this value with a neural network based on basic inputs. The network is optimized
and trained on the evaluations of millions of positions at moderate search depth.

The NNUE evaluation was first introduced in shogi, and ported to Stockfish afterward.
It can be evaluated efficiently on CPUs, and exploits the fact that only parts
of the neural network need to be updated after a typical chess move.
[The nodchip repository](https://github.com/nodchip/Stockfish) provides additional
tools to train and develop the NNUE networks.

This patch is the result of contributions of various authors, from various communities,
including: nodchip, ynasu87, yaneurao (initial port and NNUE authors), domschl, FireFather,
rqs, xXH4CKST3RXx, tttak, zz4032, joergoster, mstembera, nguyenpham, erbsenzaehler,
dorzechowski, and vondele.

This new evaluation needed various changes to fishtest and the corresponding infrastructure,
for which tomtor, ppigazzini, noobpwnftw, daylen, and vondele are gratefully acknowledged.

The first networks have been provided by gekkehenker and sergiovieri, with the latter
net (nn-97f742aaefcd.nnue) being the current default.

The evaluation function can be selected at run time with the `Use NNUE` (true/false) UCI option,
provided the `EvalFile` option points to the network file (depending on the GUI, with full path).

The performance of the NNUE evaluation relative to the classical evaluation depends somewhat on
the hardware, and is expected to improve quickly, but is currently > 80 Elo stronger on fishtest:

60000 @ 10+0.1 th 1
https://tests.stockfishchess.org/tests/view/5f28fe6ea5abc164f05e4c4c
ELO: 92.77 +-2.1 (95%) LOS: 100.0%
Total: 60000 W: 24193 L: 8543 D: 27264
Ptnml(0-2): 609, 3850, 9708, 10948, 4885

40000 @ 20+0.2 th 8
https://tests.stockfishchess.org/tests/view/5f290229a5abc164f05e4c58
ELO: 89.47 +-2.0 (95%) LOS: 100.0%
Total: 40000 W: 12756 L: 2677 D: 24567
Ptnml(0-2): 74, 1583, 8550, 7776, 2017

At the same time, the impact on the classical evaluation remains minimal, causing no significant
regression:

sprt @ 10+0.1 th 1
https://tests.stockfishchess.org/tests/view/5f2906a2a5abc164f05e4c5b
LLR: 2.94 (-2.94,2.94) {-6.00,-4.00}
Total: 34936 W: 6502 L: 6825 D: 21609
Ptnml(0-2): 571, 4082, 8434, 3861, 520

sprt @ 60+0.6 th 1
https://tests.stockfishchess.org/tests/view/5f2906cfa5abc164f05e4c5d
LLR: 2.93 (-2.94,2.94) {-6.00,-4.00}
Total: 10088 W: 1232 L: 1265 D: 7591
Ptnml(0-2): 49, 914, 3170, 843, 68

The needed networks can be found at https://tests.stockfishchess.org/nns
It is recommended to use the default one as indicated by the `EvalFile` UCI option.

Guidelines for testing new nets can be found at
https://github.com/glinscott/fishtest/wiki/Creating-my-first-test#nnue-net-tests

Integration has been discussed in various issues:
https://github.com/official-stockfish/Stockfish/issues/2823
https://github.com/official-stockfish/Stockfish/issues/2728

The integration branch will be closed after the merge:
https://github.com/official-stockfish/Stockfish/pull/2825
https://github.com/official-stockfish/Stockfish/tree/nnue-player-wip

closes https://github.com/official-stockfish/Stockfish/pull/2912

This will be an exciting time for computer chess, looking forward to seeing the evolution of
this approach.

Bench: 4746616
---
 .travis.yml                                 |  31 +-
 AUTHORS                                     |  17 +-
 Readme.md => README.md                      | 129 ++++---
 appveyor.yml                                |  17 +-
 src/Makefile                                | 227 +++++++++++--
 src/benchmark.cpp                           |   4 +-
 src/bitbase.cpp                             |   4 +-
 src/bitboard.cpp                            |   4 +-
 src/bitboard.h                              |   4 +-
 src/endgame.cpp                             |   4 +-
 src/endgame.h                               |   4 +-
 src/evaluate.cpp                            | 114 +++++--
 src/evaluate.h                              |  24 +-
 src/main.cpp                                |   5 +-
 src/material.cpp                            |   4 +-
 src/material.h                              |   4 +-
 src/misc.cpp                                |  62 +++-
 src/misc.h                                  |   6 +-
 src/movegen.cpp                             |   4 +-
 src/movegen.h                               |   4 +-
 src/movepick.cpp                            |   4 +-
 src/movepick.h                              |   4 +-
 src/nnue/architectures/halfkp_256x2-32-32.h |  54 +++
 src/nnue/evaluate_nnue.cpp                  | 178 ++++++++++
 src/nnue/evaluate_nnue.h                    |  48 +++
 src/nnue/features/feature_set.h             | 135 ++++++++
 src/nnue/features/features_common.h         |  45 +++
 src/nnue/features/half_kp.cpp               |  92 +++++
 src/nnue/features/half_kp.h                 |  67 ++++
 src/nnue/features/index_list.h              |  64 ++++
 src/nnue/layers/affine_transform.h          | 215 ++++++++++++
 src/nnue/layers/clipped_relu.h              | 186 ++++++++++
 src/nnue/layers/input_slice.h               |  68 ++++
 src/nnue/nnue_accumulator.h                 |  39 +++
 src/nnue/nnue_architecture.h                |  38 +++
 src/nnue/nnue_common.h                      |  77 +++++
 src/nnue/nnue_feature_transformer.h         | 355 ++++++++++++++++++++
 src/pawns.cpp                               |   4 +-
 src/pawns.h                                 |   4 +-
 src/position.cpp                            | 108 +++++-
 src/position.h                              |  42 ++-
 src/psqt.cpp                                |   4 +-
 src/search.cpp                              |   6 +-
 src/search.h                                |   4 +-
 src/syzygy/tbprobe.cpp                      |   3 +-
 src/syzygy/tbprobe.h                        |   3 +-
 src/thread.cpp                              |   4 +-
 src/thread.h                                |   4 +-
 src/thread_win32_osx.h                      |   4 +-
 src/timeman.cpp                             |   4 +-
 src/timeman.h                               |   4 +-
 src/tt.cpp                                  |   4 +-
 src/tt.h                                    |   4 +-
 src/tune.cpp                                |   4 +-
 src/tune.h                                  |   4 +-
 src/types.h                                 | 129 ++++++-
 src/uci.cpp                                 |  22 +-
 src/uci.h                                   |   4 +-
 src/ucioption.cpp                           |   9 +-
 59 files changed, 2474 insertions(+), 245 deletions(-)
 rename Readme.md => README.md (79%)
 create mode 100644 src/nnue/architectures/halfkp_256x2-32-32.h
 create mode 100644 src/nnue/evaluate_nnue.cpp
 create mode 100644 src/nnue/evaluate_nnue.h
 create mode 100644 src/nnue/features/feature_set.h
 create mode 100644 src/nnue/features/features_common.h
 create mode 100644 src/nnue/features/half_kp.cpp
 create mode 100644 src/nnue/features/half_kp.h
 create mode 100644 src/nnue/features/index_list.h
 create mode 100644 src/nnue/layers/affine_transform.h
 create mode 100644 src/nnue/layers/clipped_relu.h
 create mode 100644 src/nnue/layers/input_slice.h
 create mode 100644 src/nnue/nnue_accumulator.h
 create mode 100644 src/nnue/nnue_architecture.h
 create mode 100644 src/nnue/nnue_common.h
 create mode 100644 src/nnue/nnue_feature_transformer.h

diff --git a/.travis.yml b/.travis.yml
index e2ae61be..d563a1e1 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,5 +1,5 @@
 language: cpp
-dist: xenial
+dist: bionic
 
 matrix:
   include:
@@ -7,7 +7,6 @@ matrix:
       compiler: gcc
       addons:
         apt:
-          sources: ['ubuntu-toolchain-r-test']
           packages: ['g++-8', 'g++-8-multilib', 'g++-multilib', 'valgrind', 'expect', 'curl']
       env:
         - COMPILER=g++-8
@@ -17,23 +16,23 @@ matrix:
       compiler: clang
       addons:
         apt:
-          sources: ['ubuntu-toolchain-r-test', 'llvm-toolchain-xenial-6.0']
-          packages: ['clang-6.0', 'llvm-6.0-dev', 'g++-multilib', 'valgrind', 'expect', 'curl']
+          packages: ['clang-10', 'llvm-10-dev', 'g++-multilib', 'valgrind', 'expect', 'curl']
       env:
-        - COMPILER=clang++-6.0
+        - COMPILER=clang++-10
         - COMP=clang
-        - LDFLAGS=-fuse-ld=lld
 
     - os: osx
+      osx_image: xcode12
       compiler: gcc
       env:
         - COMPILER=g++
         - COMP=gcc
 
     - os: osx
+      osx_image: xcode12
       compiler: clang
       env:
-        - COMPILER=clang++ V='Apple LLVM 9.4.1' # Apple LLVM version 9.1.0 (clang-902.0.39.2)
+        - COMPILER=clang++
         - COMP=clang
 
 branches:
@@ -48,26 +47,34 @@ script:
   - git log HEAD | grep "\b[Bb]ench[ :]\+[0-9]\{7\}" | head -n 1 | sed "s/[^0-9]*\([0-9]*\).*/\1/g" > git_sig
   - export benchref=$(cat git_sig)
   - echo "Reference bench:" $benchref
+
+  #
+  # Compiler version string
+  - $COMPILER -v
+
   #
   # Verify bench number against various builds
   - export CXXFLAGS="-Werror -D_GLIBCXX_DEBUG"
   - make clean && make -j2 ARCH=x86-64 optimize=no debug=yes build && ../tests/signature.sh $benchref
-  - make clean && make -j2 ARCH=x86-32 optimize=no debug=yes build && ../tests/signature.sh $benchref
-  - make clean && make -j2 ARCH=x86-32 build && ../tests/signature.sh $benchref
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 optimize=no debug=yes build && ../tests/signature.sh $benchref; fi
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 build && ../tests/signature.sh $benchref; fi
 
   #
   # Check perft and reproducible search
+  - export CXXFLAGS="-Werror"
+  - make clean && make -j2 ARCH=x86-64 build
   - ../tests/perft.sh
   - ../tests/reprosearch.sh
+
   #
   # Valgrind
   #
   - export CXXFLAGS="-O1 -fno-inline"
   - if [ -x "$(command -v valgrind )" ]; then make clean && make -j2 ARCH=x86-64 debug=yes optimize=no build > /dev/null && ../tests/instrumented.sh --valgrind; fi
   - if [ -x "$(command -v valgrind )" ]; then ../tests/instrumented.sh --valgrind-thread; fi
+
   #
   # Sanitizer
   #
-  # Use g++-8 as a proxy for having sanitizers, might need revision as they become available for more recent versions of clang/gcc
-  - if [[ "$COMPILER" == "g++-8" ]]; then make clean && make -j2 ARCH=x86-64 sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-undefined; fi
-  - if [[ "$COMPILER" == "g++-8" ]]; then make clean && make -j2 ARCH=x86-64 sanitize=thread    optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-thread; fi
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64 sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-undefined; fi
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64 sanitize=thread    optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-thread; fi
diff --git a/AUTHORS b/AUTHORS
index f08d71d3..2e080e61 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,10 +1,17 @@
-# List of authors for Stockfish, as of March 30, 2020
+# List of authors for Stockfish, as of August 4, 2020
 
+# Founders of the Stockfish project and fishtest infrastructure
 Tord Romstad (romstad)
 Marco Costalba (mcostalba)
 Joona Kiiski (zamar)
 Gary Linscott (glinscott)
 
+# Authors and inventors of NNUE, training, NNUE port
+Yu Nasu (ynasu87)
+Motohiro Isozaki (yaneurao)
+Hisayori Noda (nodchip)
+
+# all other authors of the code in alphabetical order
 Aditya (absimaldata)
 Adrian Petrescu (apetresc)
 Ajith Chandy Jose (ajithcj)
@@ -36,6 +43,7 @@ Dariusz Orzechowski
 David Zar
 Daylen Yang (daylen)
 DiscanX
+Dominik Schlösser (domschl)
 double-beep
 Eduardo Cáceres (eduherminio)
 Eelco de Groot (KingDefender)
@@ -115,7 +123,8 @@ Nick Pelling (nickpelling)
 Nicklas Persson (NicklasPersson)
 Niklas Fiekas (niklasf)
 Nikolay Kostov (NikolayIT)
-Nguyen Pham
+Nguyen Pham (nguyenpham)
+Norman Schmidt (FireFather)
 Ondrej Mosnáček (WOnder93)
 Oskar Werkelin Ahlin
 Pablo Vazquez
@@ -135,6 +144,7 @@ Richard Lloyd
 Rodrigo Exterckötter Tjäder
 Ron Britvich (Britvich)
 Ronald de Man (syzygy1, syzygy)
+rqs
 Ryan Schmitt
 Ryan Takker
 Sami Kiminki (skiminki)
@@ -143,6 +153,7 @@ Sergei Antonov (saproj)
 Sergei Ivanov (svivanov72)
 sf-x
 Shane Booth (shane31)
+Shawn Varghese (xXH4CKST3RXx)
 Stefan Geschwentner (locutus2)
 Stefano Cardanobile (Stefano80)
 Steinar Gunderson (sesse)
@@ -155,9 +166,11 @@ Tom Vijlbrief (tomtor)
 Tomasz Sobczyk (Sopel97)
 Torsten Franz (torfranz, tfranzer)
 Tracey Emery (basepr1me)
+tttak
 Unai Corzo (unaiic)
 Uri Blass (uriblass)
 Vince Negri (cuddlestmonkey)
+zz4032
 
 
 # Additionally, we acknowledge the authors and maintainers of fishtest,
diff --git a/Readme.md b/README.md
similarity index 79%
rename from Readme.md
rename to README.md
index 823518d1..f71a8b34 100644
--- a/Readme.md
+++ b/README.md
@@ -4,7 +4,13 @@
 [![Build Status](https://ci.appveyor.com/api/projects/status/github/official-stockfish/Stockfish?branch=master&svg=true)](https://ci.appveyor.com/project/mcostalba/stockfish/branch/master)
 
 [Stockfish](https://stockfishchess.org) is a free, powerful UCI chess engine
-derived from Glaurung 2.1. It is not a complete chess program and requires a
+derived from Glaurung 2.1. It features two evaluation functions, the classical
+evaluation based on handcrafted terms, and the NNUE evaluation based on
+efficiently updateable neural networks. The classical evaluation runs efficiently
+on most 64bit CPU architectures, while the NNUE evaluation benefits strongly from the
+vector intrinsics available on modern CPUs (avx2 or similar).
+
+Stockfish is not a complete chess program and requires a
 UCI-compatible GUI (e.g. XBoard with PolyGlot, Scid, Cute Chess, eboard, Arena,
 Sigma Chess, Shredder, Chess Partner or Fritz) in order to be used comfortably.
 Read the documentation for your GUI of choice for information about how to use
@@ -22,21 +28,20 @@ This distribution of Stockfish consists of the following files:
   * src, a subdirectory containing the full source code, including a Makefile
     that can be used to compile Stockfish on Unix-like systems.
 
+To use the NNUE evaluation an additional data file with neural network parameters
+needs to be downloaded. The filename for the default set can be found as the default
+value of the `EvalFile` UCI option, with the format
+`nn-[SHA256 first 12 digits].nnue` (e.g. nn-c157e0a5755b.nnue). This file can be downloaded from
+```
+https://tests.stockfishchess.org/api/nn/[filename]
+```
+replacing `[filename]` as needed.
 
-## UCI parameters
+
+## UCI options
 
 Currently, Stockfish has the following UCI options:
 
-  * #### Debug Log File
-    Write all communication to and from the engine into a text file.
-
-  * #### Contempt
-    A positive value for contempt favors middle game positions and avoids draws.
-
-  * #### Analysis Contempt
-    By default, contempt is set to prefer the side to move. Set this option to "White"
-    or "Black" to analyse with contempt for that side, or "Off" to disable contempt.
-
   * #### Threads
     The number of CPU threads used for searching a position. For best performance, set
     this equal to the number of CPU cores available.
@@ -44,9 +49,6 @@ Currently, Stockfish has the following UCI options:
   * #### Hash
     The size of the hash table in MB. It is recommended to set Hash after setting Threads.
 
-  * #### Clear Hash
-    Clear the hash table.
-
   * #### Ponder
     Let Stockfish ponder its next move while the opponent is thinking.
 
@@ -54,10 +56,32 @@ Currently, Stockfish has the following UCI options:
     Output the N best lines (principal variations, PVs) when searching.
     Leave at 1 for best performance.
 
-  * #### Skill Level
-    Lower the Skill Level in order to make Stockfish play weaker (see also UCI_LimitStrength).
-    Internally, MultiPV is enabled, and with a certain probability depending on the Skill Level a
-    weaker move will be played.
+  * #### Use NNUE
+    Toggle between the NNUE and classical evaluation functions. If set to "true",
+    the network parameters must be availabe to load from file (see also EvalFile).
+
+  * #### EvalFile
+    The name of the file of the NNUE evaluation parameters. Depending on the GUI the
+    filename should include the full path to the folder/directory that contains the file.
+
+  * #### Contempt
+    A positive value for contempt favors middle game positions and avoids draws,
+    effective for the classical evaluation only.
+
+  * #### Analysis Contempt
+    By default, contempt is set to prefer the side to move. Set this option to "White"
+    or "Black" to analyse with contempt for that side, or "Off" to disable contempt.
+
+  * #### UCI_AnalyseMode
+    An option handled by your GUI.
+
+  * #### UCI_Chess960
+    An option handled by your GUI. If true, Stockfish will play Chess960.
+
+  * #### UCI_ShowWDL
+    If enabled, show approximate WDL statistics as part of the engine output.
+    These WDL numbers model expected game outcomes for a given evaluation and
+    game ply for engine self-play at fishtest LTC conditions (60+0.6s per game).
 
   * #### UCI_LimitStrength
     Enable weaker play aiming for an Elo rating as set by UCI_Elo. This option overrides Skill Level.
@@ -66,28 +90,10 @@ Currently, Stockfish has the following UCI options:
     If enabled by UCI_LimitStrength, aim for an engine strength of the given Elo.
     This Elo rating has been calibrated at a time control of 60s+0.6s and anchored to CCRL 40/4.
 
-  * #### UCI_ShowWDL
-    If enabled, show approximate WDL statistics as part of the engine output.
-    These WDL numbers model expected game outcomes for a given evaluation and
-    game ply for engine self-play at fishtest LTC conditions (60+0.6s per game).
-
-  * #### Move Overhead
-    Assume a time delay of x ms due to network and GUI overheads. This is useful to
-    avoid losses on time in those cases.
-
-  * #### Slow Mover
-    Lower values will make Stockfish take less time in games, higher values will
-    make it think longer.
-
-  * #### nodestime
-    Tells the engine to use nodes searched instead of wall time to account for
-    elapsed time. Useful for engine testing.
-
-  * #### UCI_Chess960
-    An option handled by your GUI. If true, Stockfish will play Chess960.
-
-  * #### UCI_AnalyseMode
-    An option handled by your GUI.
+  * #### Skill Level
+    Lower the Skill Level in order to make Stockfish play weaker (see also UCI_LimitStrength).
+    Internally, MultiPV is enabled, and with a certain probability depending on the Skill Level a
+    weaker move will be played.
 
   * #### SyzygyPath
     Path to the folders/directories storing the Syzygy tablebase files. Multiple
@@ -114,6 +120,47 @@ Currently, Stockfish has the following UCI options:
     Limit Syzygy tablebase probing to positions with at most this many pieces left
     (including kings and pawns).
 
+  * #### Move Overhead
+    Assume a time delay of x ms due to network and GUI overheads. This is useful to
+    avoid losses on time in those cases.
+
+  * #### Slow Mover
+    Lower values will make Stockfish take less time in games, higher values will
+    make it think longer.
+
+  * #### nodestime
+    Tells the engine to use nodes searched instead of wall time to account for
+    elapsed time. Useful for engine testing.
+
+  * #### Clear Hash
+    Clear the hash table.
+
+  * #### Debug Log File
+    Write all communication to and from the engine into a text file.
+
+## Classical and NNUE evaluation
+
+Both approaches assign a value to a position that is used in alpha-beta (PVS) search
+to find the best move. The classical evaluation computes this value as a function
+of various chess concepts, handcrafted by experts, tested and tuned using fishtest.
+The NNUE evaluation computes this value with a neural network based on basic
+inputs (e.g. piece positions only). The network is optimized and trained
+on the evaluations of millions of positions at moderate search depth.
+
+The NNUE evaluation was first introduced in shogi, and ported to Stockfish afterward.
+It can be evaluated efficiently on CPUs, and exploits the fact that only parts
+of the neural network need to be updated after a typical chess move.
+[The nodchip repository](https://github.com/nodchip/Stockfish) provides additional
+tools to train and develop the NNUE networks.
+
+On CPUs supporting modern vector instructions (avx2 and similar), the NNUE evaluation
+results in stronger playing strength, even if the nodes per second computed by the engine
+is somewhat lower (roughly 60% of nps is typical).
+
+Note that the NNUE evaluation depends on the Stockfish binary and the network parameter
+file (see EvalFile). Not every parameter file is compatible with a given Stockfish binary.
+The default value of the EvalFile UCI option is the name of a network that is guaranteed
+to be compatible with that binary.
 
 ## What to expect from Syzygybases?
 
diff --git a/appveyor.yml b/appveyor.yml
index 21f3bbe3..d356ba2f 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -4,10 +4,9 @@ clone_depth: 50
 branches:
   only:
     - master
-    - appveyor
 
 # Operating system (build VM template)
-os: Visual Studio 2017
+os: Visual Studio 2019
 
 # Build platform, i.e. x86, x64, AnyCPU. This setting is optional.
 platform:
@@ -36,8 +35,11 @@ before_build:
       $src = $src.Replace("\", "/")
 
       # Build CMakeLists.txt
-      $t = 'cmake_minimum_required(VERSION 3.8)',
+      $t = 'cmake_minimum_required(VERSION 3.17)',
            'project(Stockfish)',
+           'set(CMAKE_CXX_STANDARD 17)',
+           'set(CMAKE_CXX_STANDARD_REQUIRED ON)',
+           'set (CMAKE_CXX_EXTENSIONS OFF)',
            'set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/src)',
            'set(source_files', $src, ')',
            'add_executable(stockfish ${source_files})'
@@ -51,10 +53,11 @@ before_build:
       $b = git log HEAD | sls "\b[Bb]ench[ :]+[0-9]{7}" | select -first 1
       $bench = $b -match '\D+(\d+)' | % { $matches[1] }
       Write-Host "Reference bench:" $bench
-      $g = "Visual Studio 15 2017"
-      If (${env:PLATFORM} -eq 'x64') { $g = $g + ' Win64' }
-      cmake -G "${g}" .
-      Write-Host "Generated files for: " $g
+      $g = "Visual Studio 16 2019"
+      If (${env:PLATFORM} -eq 'x64') { $a = "x64" }
+      If (${env:PLATFORM} -eq 'x86') { $a = "Win32" }
+      cmake -G "${g}" -A ${a} .
+      Write-Host "Generated files for: " $g $a
 
 build_script:
   - cmake --build . --config %CONFIGURATION% -- /verbosity:minimal
diff --git a/src/Makefile b/src/Makefile
index c3660a20..4741e722 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -38,11 +38,12 @@ PGOBENCH = ./$(EXE) bench
 ### Source and object files
 SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \
 	material.cpp misc.cpp movegen.cpp movepick.cpp pawns.cpp position.cpp psqt.cpp \
-	search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp
+	search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
+	nnue/evaluate_nnue.cpp nnue/features/half_kp.cpp
 
 OBJS = $(notdir $(SRCS:.cpp=.o))
 
-VPATH = syzygy
+VPATH = syzygy:nnue:nnue/features
 
 ### Establish the operating system name
 KERNEL = $(shell uname -s)
@@ -67,7 +68,14 @@ endif
 # prefetch = yes/no   --- -DUSE_PREFETCH   --- Use prefetch asm-instruction
 # popcnt = yes/no     --- -DUSE_POPCNT     --- Use popcnt asm-instruction
 # sse = yes/no        --- -msse            --- Use Intel Streaming SIMD Extensions
+# sse3 = yes/no       --- -msse3           --- Use Intel Streaming SIMD Extensions 3
+# ssse3 = yes/no      --- -mssse3          --- Use Intel Supplemental Streaming SIMD Extensions 3
+# sse41 = yes/no      --- -msse4.1         --- Use Intel Streaming SIMD Extensions 4.1
+# sse42 = yes/no      --- -msse4.2         --- Use Intel Streaming SIMD Extensions 4.2
+# avx2 = yes/no       --- -mavx2           --- Use Intel Advanced Vector Extensions 2
 # pext = yes/no       --- -DUSE_PEXT       --- Use pext x86_64 asm-instruction
+# avx512 = yes/no     --- -mavx512bw       --- Use Intel Advanced Vector Extensions 512
+# neon = yes/no       --- -DUSE_NEON       --- Use ARM SIMD architecture
 #
 # Note that Makefile is space sensitive, so when adding new architectures
 # or modifying existing flags, you have to make sure there are no extra spaces
@@ -81,7 +89,15 @@ bits = 64
 prefetch = no
 popcnt = no
 sse = no
+sse3 = no
+ssse3 = no
+sse41 = no
+sse42 = no
+avx2 = no
 pext = no
+avx512 = no
+neon = no
+ARCH = x86-64-modern
 
 ### 2.2 Architecture specific
 ifeq ($(ARCH),general-32)
@@ -111,11 +127,70 @@ ifeq ($(ARCH),x86-64)
 	sse = yes
 endif
 
+ifeq ($(ARCH),x86-64-sse3)
+	arch = x86_64
+	prefetch = yes
+	sse = yes
+	sse3 = yes
+endif
+
+ifeq ($(ARCH),x86-64-sse3-popcnt)
+	arch = x86_64
+	prefetch = yes
+	sse = yes
+	sse3 = yes
+	popcnt = yes
+endif
+
+ifeq ($(ARCH),x86-64-ssse3)
+	arch = x86_64
+	prefetch = yes
+	sse = yes
+	sse3 = yes
+	ssse3 = yes
+endif
+
+ifeq ($(ARCH),x86-64-sse41)
+	arch = x86_64
+	prefetch = yes
+	popcnt = yes
+	sse = yes
+	sse3 = yes
+	ssse3 = yes
+	sse41 = yes
+endif
+
 ifeq ($(ARCH),x86-64-modern)
 	arch = x86_64
 	prefetch = yes
 	popcnt = yes
 	sse = yes
+	sse3 = yes
+	ssse3 = yes
+	sse41 = yes
+endif
+
+ifeq ($(ARCH),x86-64-sse42)
+	arch = x86_64
+	prefetch = yes
+	popcnt = yes
+	sse = yes
+	sse3 = yes
+	ssse3 = yes
+	sse41 = yes
+	sse42 = yes
+endif
+
+ifeq ($(ARCH),x86-64-avx2)
+	arch = x86_64
+	prefetch = yes
+	popcnt = yes
+	sse = yes
+	sse3 = yes
+	ssse3 = yes
+	sse41 = yes
+	sse42 = yes
+	avx2 = yes
 endif
 
 ifeq ($(ARCH),x86-64-bmi2)
@@ -123,9 +198,28 @@ ifeq ($(ARCH),x86-64-bmi2)
 	prefetch = yes
 	popcnt = yes
 	sse = yes
+	sse3 = yes
+	ssse3 = yes
+	sse41 = yes
+	sse42 = yes
+	avx2 = yes
 	pext = yes
 endif
 
+ifeq ($(ARCH),x86-64-avx512)
+	arch = x86_64
+	prefetch = yes
+	popcnt = yes
+	sse = yes
+	sse3 = yes
+	ssse3 = yes
+	sse41 = yes
+	sse42 = yes
+	avx2 = yes
+	pext = yes
+	avx512 = yes
+endif
+
 ifeq ($(ARCH),armv7)
 	arch = armv7
 	prefetch = yes
@@ -136,6 +230,14 @@ ifeq ($(ARCH),armv8)
 	arch = armv8-a
 	prefetch = yes
 	popcnt = yes
+	neon = yes
+endif
+
+ifeq ($(ARCH),apple-silicon)
+	arch = arm64
+	prefetch = yes
+	popcnt = yes
+	neon = yes
 endif
 
 ifeq ($(ARCH),ppc-32)
@@ -154,8 +256,8 @@ endif
 ### ==========================================================================
 
 ### 3.1 Selecting compiler (default = gcc)
-CXXFLAGS += -Wall -Wcast-qual -fno-exceptions -std=c++11 $(EXTRACXXFLAGS)
-DEPENDFLAGS += -std=c++11
+CXXFLAGS += -Wall -Wcast-qual -fno-exceptions -std=c++17 $(EXTRACXXFLAGS)
+DEPENDFLAGS += -std=c++17
 LDFLAGS += $(EXTRALDFLAGS)
 
 ifeq ($(COMP),)
@@ -249,8 +351,8 @@ endif
 endif
 
 ifeq ($(KERNEL),Darwin)
-	CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.9
-	LDFLAGS += -arch $(arch) -mmacosx-version-min=10.9
+	CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.15
+	LDFLAGS += -arch $(arch) -mmacosx-version-min=10.15
 endif
 
 ### Travis CI script uses COMPILER to overwrite CXX
@@ -283,8 +385,8 @@ endif
 
 ### 3.2.2 Debugging with undefined behavior sanitizers
 ifneq ($(sanitize),no)
-        CXXFLAGS += -g3 -fsanitize=$(sanitize) -fuse-ld=gold
-        LDFLAGS += -fsanitize=$(sanitize) -fuse-ld=gold
+        CXXFLAGS += -g3 -fsanitize=$(sanitize)
+        LDFLAGS += -fsanitize=$(sanitize)
 endif
 
 ### 3.3 Optimization
@@ -322,7 +424,7 @@ endif
 
 ### 3.6 popcnt
 ifeq ($(popcnt),yes)
-	ifeq ($(arch),$(filter $(arch),ppc64 armv8-a))
+	ifeq ($(arch),$(filter $(arch),ppc64 armv8-a arm64))
 		CXXFLAGS += -DUSE_POPCNT
 	else ifeq ($(comp),icc)
 		CXXFLAGS += -msse3 -DUSE_POPCNT
@@ -331,11 +433,61 @@ ifeq ($(popcnt),yes)
 	endif
 endif
 
+ifeq ($(avx2),yes)
+	CXXFLAGS += -DUSE_AVX2
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
+		CXXFLAGS += -mavx2
+	endif
+endif
+
+ifeq ($(avx512),yes)
+	CXXFLAGS += -DUSE_AVX512
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
+		CXXFLAGS += -mavx512bw
+	endif
+endif
+
+ifeq ($(sse42),yes)
+	CXXFLAGS += -DUSE_SSE42
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
+		CXXFLAGS += -msse4.2
+	endif
+endif
+
+ifeq ($(sse41),yes)
+	CXXFLAGS += -DUSE_SSE41
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
+		CXXFLAGS += -msse4.1
+	endif
+endif
+
+ifeq ($(ssse3),yes)
+	CXXFLAGS += -DUSE_SSSE3
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
+		CXXFLAGS += -mssse3
+	endif
+endif
+
+ifeq ($(sse3),yes)
+	CXXFLAGS += -DUSE_SSE3
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
+		CXXFLAGS += -msse3
+	endif
+endif
+
+ifeq ($(neon),yes)
+	CXXFLAGS += -DUSE_NEON
+endif
+
+ifeq ($(arch),x86_64)
+	CXXFLAGS += -DUSE_SSE2
+endif
+
 ### 3.7 pext
 ifeq ($(pext),yes)
 	CXXFLAGS += -DUSE_PEXT
 	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
-		CXXFLAGS += -msse4 -mbmi2
+		CXXFLAGS += -mbmi2
 	endif
 endif
 
@@ -381,15 +533,23 @@ help:
 	@echo "Supported targets:"
 	@echo ""
 	@echo "build                   > Standard build"
-	@echo "profile-build           > PGO build"
+	@echo "profile-build           > Standard build with PGO"
 	@echo "strip                   > Strip executable"
 	@echo "install                 > Install executable"
 	@echo "clean                   > Clean up"
+	@echo "net                     > Download the default nnue net"
 	@echo ""
 	@echo "Supported archs:"
 	@echo ""
-	@echo "x86-64-bmi2             > x86 64-bit with pext support (also enables SSE4)"
-	@echo "x86-64-modern           > x86 64-bit with popcnt support (also enables SSE3)"
+	@echo "x86-64-avx512           > x86 64-bit with avx512 support"
+	@echo "x86-64-bmi2             > x86 64-bit with bmi2 support"
+	@echo "x86-64-avx2             > x86 64-bit with avx2 support"
+	@echo "x86-64-sse42            > x86 64-bit with sse42 support"
+	@echo "x86-64-modern           > x86 64-bit with sse41 support (x86-64-sse41)"
+	@echo "x86-64-sse41            > x86 64-bit with sse41 support"
+	@echo "x86-64-ssse3            > x86 64-bit with ssse3 support"
+	@echo "x86-64-sse3-popcnt      > x86 64-bit with sse3 and popcnt support"
+	@echo "x86-64-sse3             > x86 64-bit with sse3 support"
 	@echo "x86-64                  > x86 64-bit generic"
 	@echo "x86-32                  > x86 32-bit (also enables SSE)"
 	@echo "x86-32-old              > x86 32-bit fall back for old hardware"
@@ -397,6 +557,7 @@ help:
 	@echo "ppc-32                  > PPC 32-bit"
 	@echo "armv7                   > ARMv7 32-bit"
 	@echo "armv8                   > ARMv8 64-bit"
+	@echo "apple-silicon           > Apple silicon ARM64"
 	@echo "general-64              > unspecified 64-bit"
 	@echo "general-32              > unspecified 32-bit"
 	@echo ""
@@ -409,17 +570,20 @@ help:
 	@echo ""
 	@echo "Simple examples. If you don't know what to do, you likely want to run: "
 	@echo ""
-	@echo "make build ARCH=x86-64    (This is for 64-bit systems)"
-	@echo "make build ARCH=x86-32    (This is for 32-bit systems)"
+	@echo "make -j build ARCH=x86-64    (This is for 64-bit systems)"
+	@echo "make -j build ARCH=x86-32    (This is for 32-bit systems)"
 	@echo ""
 	@echo "Advanced examples, for experienced users: "
 	@echo ""
-	@echo "make build ARCH=x86-64 COMP=clang"
-	@echo "make profile-build ARCH=x86-64-bmi2 COMP=gcc COMPCXX=g++-4.8"
+	@echo "make -j build ARCH=x86-64-modern COMP=clang"
+	@echo "make -j profile-build ARCH=x86-64-bmi2 COMP=gcc COMPCXX=g++-4.8"
 	@echo ""
+	@echo "The selected architecture $(ARCH) enables the following configuration: "
+	@echo ""
+	@$(MAKE) ARCH=$(ARCH) COMP=$(COMP) config-sanity
 
 
-.PHONY: help build profile-build strip install clean objclean profileclean \
+.PHONY: help build profile-build strip install clean net objclean profileclean \
         config-sanity icc-profile-use icc-profile-make gcc-profile-use gcc-profile-make \
         clang-profile-use clang-profile-make
 
@@ -453,14 +617,21 @@ install:
 clean: objclean profileclean
 	@rm -f .depend *~ core
 
+# Fetch the default net named in ucioption.cpp; fail loudly (and leave no empty file) on errors
+net:
+	$(eval nnuenet := $(shell grep EvalFile ucioption.cpp | grep Option | sed 's/.*\(nn-[a-z0-9]\{12\}.nnue\).*/\1/'))
+	$(eval nnuedownloadurl := https://tests.stockfishchess.org/api/nn/$(nnuenet))
+	$(eval curl_or_wget := $(shell if hash curl 2>/dev/null; then echo "curl -sL"; elif hash wget 2>/dev/null; then echo "wget -qO-"; fi))
+	@echo "Default net: $(nnuenet)"; if test -f "$(nnuenet)"; then echo "Already available."; elif test -z "$(curl_or_wget)"; then echo "Download failed: neither curl nor wget is installed."; exit 1; else echo "Downloading $(nnuedownloadurl)"; $(curl_or_wget) $(nnuedownloadurl) > $(nnuenet) || { rm -f $(nnuenet); exit 1; }; fi
+
 # clean binaries and objects
 objclean:
-	@rm -f $(EXE) *.o ./syzygy/*.o
+	@rm -f $(EXE) *.o ./syzygy/*.o ./nnue/*.o ./nnue/features/*.o
 
 # clean auxiliary profiling files
 profileclean:
 	@rm -rf profdir
-	@rm -f bench.txt *.gcda *.gcno
+	@rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda
 	@rm -f stockfish.profdata *.profraw
 
 default:
@@ -485,7 +656,14 @@ config-sanity:
 	@echo "prefetch: '$(prefetch)'"
 	@echo "popcnt: '$(popcnt)'"
 	@echo "sse: '$(sse)'"
+	@echo "sse3: '$(sse3)'"
+	@echo "ssse3: '$(ssse3)'"
+	@echo "sse41: '$(sse41)'"
+	@echo "sse42: '$(sse42)'"
+	@echo "avx2: '$(avx2)'"
 	@echo "pext: '$(pext)'"
+	@echo "avx512: '$(avx512)'"
+	@echo "neon: '$(neon)'"
 	@echo ""
 	@echo "Flags:"
 	@echo "CXX: $(CXX)"
@@ -499,12 +677,19 @@ config-sanity:
 	@test "$(optimize)" = "yes" || test "$(optimize)" = "no"
 	@test "$(arch)" = "any" || test "$(arch)" = "x86_64" || test "$(arch)" = "i386" || \
 	 test "$(arch)" = "ppc64" || test "$(arch)" = "ppc" || \
-	 test "$(arch)" = "armv7" || test "$(arch)" = "armv8-a"
+	 test "$(arch)" = "armv7" || test "$(arch)" = "armv8-a" || test "$(arch)" = "arm64"
 	@test "$(bits)" = "32" || test "$(bits)" = "64"
 	@test "$(prefetch)" = "yes" || test "$(prefetch)" = "no"
 	@test "$(popcnt)" = "yes" || test "$(popcnt)" = "no"
 	@test "$(sse)" = "yes" || test "$(sse)" = "no"
+	@test "$(sse3)" = "yes" || test "$(sse3)" = "no"
+	@test "$(ssse3)" = "yes" || test "$(ssse3)" = "no"
+	@test "$(sse41)" = "yes" || test "$(sse41)" = "no"
+	@test "$(sse42)" = "yes" || test "$(sse42)" = "no"
+	@test "$(avx2)" = "yes" || test "$(avx2)" = "no"
 	@test "$(pext)" = "yes" || test "$(pext)" = "no"
+	@test "$(avx512)" = "yes" || test "$(avx512)" = "no"
+	@test "$(neon)" = "yes" || test "$(neon)" = "no"
 	@test "$(comp)" = "gcc" || test "$(comp)" = "icc" || test "$(comp)" = "mingw" || test "$(comp)" = "clang"
 
 $(EXE): $(OBJS)
diff --git a/src/benchmark.cpp b/src/benchmark.cpp
index 3299f373..6041d642 100644
--- a/src/benchmark.cpp
+++ b/src/benchmark.cpp
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/src/bitbase.cpp b/src/bitbase.cpp
index 7e27eb96..bbe8e9a7 100644
--- a/src/bitbase.cpp
+++ b/src/bitbase.cpp
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/src/bitboard.cpp b/src/bitboard.cpp
index 0bf7eef9..f531010c 100644
--- a/src/bitboard.cpp
+++ b/src/bitboard.cpp
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/src/bitboard.h b/src/bitboard.h
index 8c95de8c..a899d879 100644
--- a/src/bitboard.h
+++ b/src/bitboard.h
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/src/endgame.cpp b/src/endgame.cpp
index a8ceb648..c8be2198 100644
--- a/src/endgame.cpp
+++ b/src/endgame.cpp
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/src/endgame.h b/src/endgame.h
index fd1aba2d..1351d88a 100644
--- a/src/endgame.h
+++ b/src/endgame.h
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index b34d82f6..f43c62d6 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -20,15 +18,50 @@
 
 #include <algorithm>
 #include <cassert>
+#include <cstdlib>
 #include <cstring>   // For std::memset
 #include <iomanip>
 #include <sstream>
+#include <iostream>
 
 #include "bitboard.h"
 #include "evaluate.h"
 #include "material.h"
 #include "pawns.h"
 #include "thread.h"
+#include "uci.h"
+
+namespace Eval {
+  // useNNUE is set from the "Use NNUE" option; eval_file_loaded names the last net loaded
+  bool useNNUE;
+  std::string eval_file_loaded="None";
+  /// init_NNUE() reads the "Use NNUE" and "EvalFile" options and loads the net when needed
+  void init_NNUE() {
+    // Load only if NNUE is on and the requested file differs from the one already loaded
+    useNNUE = Options["Use NNUE"];
+    std::string eval_file = std::string(Options["EvalFile"]);
+    if (useNNUE && eval_file_loaded != eval_file)
+        if (Eval::NNUE::load_eval_file(eval_file))
+            eval_file_loaded = eval_file;
+  }
+  /// verify_NNUE() exits the program if NNUE is on but the requested net is not the loaded one
+  void verify_NNUE() {
+    // eval_file_loaded is only updated when load_eval_file() returns true, so a mismatch is fatal
+    std::string eval_file = std::string(Options["EvalFile"]);
+    if (useNNUE && eval_file_loaded != eval_file)
+    {
+        std::cerr << "Use of NNUE evaluation, but the file " << eval_file << " was not loaded successfully. "
+                  << "These network evaluation parameters must be available, compatible with this version of the code. "
+                  << "The UCI option EvalFile might need to specify the full path, including the directory/folder name, to the file." << std::endl;
+        std::exit(EXIT_FAILURE);
+    }
+    // Report which evaluation is in use as an "info string" line
+    if (useNNUE)
+        sync_cout << "info string NNUE evaluation using " << eval_file << " enabled." << sync_endl;
+    else
+        sync_cout << "info string classical evaluation enabled." << sync_endl;
+  }
+}
 
 namespace Trace {
 
@@ -906,47 +939,62 @@ make_v:
 /// evaluation of the position from the point of view of the side to move.
 
 Value Eval::evaluate(const Position& pos) {
-  return Evaluation<NO_TRACE>(pos).value();
-}
 
+  if (Eval::useNNUE)
+      return NNUE::evaluate(pos);
+  else
+      return Evaluation<NO_TRACE>(pos).value();
+}
 
 /// trace() is like evaluate(), but instead of returning a value, it returns
 /// a string (suitable for outputting to stdout) that contains the detailed
 /// descriptions and values of each evaluation term. Useful for debugging.
+/// Trace scores are from white's point of view
 
 std::string Eval::trace(const Position& pos) {
 
   if (pos.checkers())
-      return "Total evaluation: none (in check)";
-
-  std::memset(scores, 0, sizeof(scores));
-
-  pos.this_thread()->contempt = SCORE_ZERO; // Reset any dynamic contempt
-
-  Value v = Evaluation<TRACE>(pos).value();
-
-  v = pos.side_to_move() == WHITE ? v : -v; // Trace scores are from white's point of view
+      return "Final evaluation: none (in check)";
 
   std::stringstream ss;
-  ss << std::showpoint << std::noshowpos << std::fixed << std::setprecision(2)
-     << "     Term    |    White    |    Black    |    Total   \n"
-     << "             |   MG    EG  |   MG    EG  |   MG    EG \n"
-     << " ------------+-------------+-------------+------------\n"
-     << "    Material | " << Term(MATERIAL)
-     << "   Imbalance | " << Term(IMBALANCE)
-     << "       Pawns | " << Term(PAWN)
-     << "     Knights | " << Term(KNIGHT)
-     << "     Bishops | " << Term(BISHOP)
-     << "       Rooks | " << Term(ROOK)
-     << "      Queens | " << Term(QUEEN)
-     << "    Mobility | " << Term(MOBILITY)
-     << " King safety | " << Term(KING)
-     << "     Threats | " << Term(THREAT)
-     << "      Passed | " << Term(PASSED)
-     << "       Space | " << Term(SPACE)
-     << "    Winnable | " << Term(WINNABLE)
-     << " ------------+-------------+-------------+------------\n"
-     << "       Total | " << Term(TOTAL);
+  ss << std::showpoint << std::noshowpos << std::fixed << std::setprecision(2);
+
+  Value v;
+
+  if (Eval::useNNUE)
+  {
+      v = NNUE::evaluate(pos);
+  }
+  else
+  {
+      std::memset(scores, 0, sizeof(scores));
+
+      pos.this_thread()->contempt = SCORE_ZERO; // Reset any dynamic contempt
+
+      v = Evaluation<TRACE>(pos).value();
+
+      ss << std::showpoint << std::noshowpos << std::fixed << std::setprecision(2)
+         << "     Term    |    White    |    Black    |    Total   \n"
+         << "             |   MG    EG  |   MG    EG  |   MG    EG \n"
+         << " ------------+-------------+-------------+------------\n"
+         << "    Material | " << Term(MATERIAL)
+         << "   Imbalance | " << Term(IMBALANCE)
+         << "       Pawns | " << Term(PAWN)
+         << "     Knights | " << Term(KNIGHT)
+         << "     Bishops | " << Term(BISHOP)
+         << "       Rooks | " << Term(ROOK)
+         << "      Queens | " << Term(QUEEN)
+         << "    Mobility | " << Term(MOBILITY)
+         << " King safety | " << Term(KING)
+         << "     Threats | " << Term(THREAT)
+         << "      Passed | " << Term(PASSED)
+         << "       Space | " << Term(SPACE)
+         << "    Winnable | " << Term(WINNABLE)
+         << " ------------+-------------+-------------+------------\n"
+         << "       Total | " << Term(TOTAL);
+  }
+
+  v = pos.side_to_move() == WHITE ? v : -v;
 
   ss << "\nFinal evaluation: " << to_cp(v) << " (white side)\n";
 
diff --git a/src/evaluate.h b/src/evaluate.h
index 7c8a2a6f..e808068d 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -29,9 +27,23 @@ class Position;
 
 namespace Eval {
 
-std::string trace(const Position& pos);
+  std::string trace(const Position& pos);
+  Value evaluate(const Position& pos);
 
-Value evaluate(const Position& pos);
-}
+  extern bool useNNUE;
+  extern std::string eval_file_loaded;
+  void init_NNUE();
+  void verify_NNUE();
+
+  namespace NNUE {
+
+    Value evaluate(const Position& pos);
+    Value compute_eval(const Position& pos);
+    void  update_eval(const Position& pos);
+    bool  load_eval_file(const std::string& evalFile);
+
+  } // namespace NNUE
+
+} // namespace Eval
 
 #endif // #ifndef EVALUATE_H_INCLUDED
diff --git a/src/main.cpp b/src/main.cpp
index fafefee2..fbad6622 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -46,6 +44,7 @@ int main(int argc, char* argv[]) {
   Endgames::init();
   Threads.set(size_t(Options["Threads"]));
   Search::clear(); // After threads are up
+  Eval::init_NNUE();
 
   UCI::loop(argc, argv);
 
diff --git a/src/material.cpp b/src/material.cpp
index bb25d3ca..0ef9926f 100644
--- a/src/material.cpp
+++ b/src/material.cpp
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/src/material.h b/src/material.h
index 21647f23..80d01655 100644
--- a/src/material.h
+++ b/src/material.h
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/src/misc.cpp b/src/misc.cpp
index 2bc05c5b..3d7c75e5 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -46,6 +44,7 @@ typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY);
 #include <iostream>
 #include <sstream>
 #include <vector>
+#include <cstdlib>
 
 #if defined(__linux__) && !defined(__ANDROID__)
 #include <stdlib.h>
@@ -147,10 +146,8 @@ const string engine_info(bool to_uci) {
       ss << setw(2) << day << setw(2) << (1 + months.find(month) / 4) << year.substr(2);
   }
 
-  ss << (Is64Bit ? " 64" : "")
-     << (HasPext ? " BMI2" : (HasPopCnt ? " POPCNT" : ""))
-     << (to_uci  ? "\nid author ": " by ")
-     << "T. Romstad, M. Costalba, J. Kiiski, G. Linscott";
+  ss << (to_uci  ? "\nid author ": " by ")
+     << "the Stockfish developers (see AUTHORS file)";
 
   return ss.str();
 }
@@ -215,7 +212,33 @@ const std::string compiler_info() {
      compiler += " on unknown system";
   #endif
 
-  compiler += "\n __VERSION__ macro expands to: ";
+  compiler += "\nCompilation settings include: ";
+  compiler += (Is64Bit ? " 64bit" : " 32bit");
+  #if defined(USE_AVX512)
+    compiler += " AVX512";
+  #endif
+  #if defined(USE_AVX2)
+    compiler += " AVX2";
+  #endif
+  #if defined(USE_SSE42)
+    compiler += " SSE42";
+  #endif
+  #if defined(USE_SSE41)
+    compiler += " SSE41";
+  #endif
+  #if defined(USE_SSSE3)
+    compiler += " SSSE3";
+  #endif
+  #if defined(USE_SSE3)
+    compiler += " SSE3";
+  #endif
+    compiler += (HasPext ? " BMI2" : "");
+    compiler += (HasPopCnt ? " POPCNT" : "");
+  #if !defined(NDEBUG)
+    compiler += " DEBUG";
+  #endif
+
+  compiler += "\n__VERSION__ macro expands to: ";
   #ifdef __VERSION__
      compiler += __VERSION__;
   #else
@@ -293,6 +316,29 @@ void prefetch(void* addr) {
 
 #endif
 
+/// Wrappers for systems where the C++17 implementation doesn't guarantee the availability of aligned_alloc.
+/// Memory allocated with std_aligned_alloc must be freed with std_aligned_free.
+/// Note the argument order: alignment comes first here, while _mm_malloc takes the size first.
+
+void* std_aligned_alloc(size_t alignment, size_t size) {
+#if defined(__APPLE__)
+  return aligned_alloc(alignment, size);
+#elif defined(_WIN32)
+  return _mm_malloc(size, alignment);
+#else
+  return std::aligned_alloc(alignment, size);
+#endif
+}
+
+/// Releases memory obtained from std_aligned_alloc(). Only the Windows build needs a
+/// dedicated deallocator (_mm_free); every other platform hands the pointer to free().
+void std_aligned_free(void* ptr) {
+#if defined(_WIN32)
+  _mm_free(ptr);
+#else
+  free(ptr);
+#endif
+}
 
 /// aligned_ttmem_alloc() will return suitably aligned memory, and if possible use large pages.
 /// The returned pointer is the aligned one, while the mem argument is the one that needs
diff --git a/src/misc.h b/src/misc.h
index 373f1b77..eb4e05c0 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -33,6 +31,8 @@ const std::string engine_info(bool to_uci = false);
 const std::string compiler_info();
 void prefetch(void* addr);
 void start_logger(const std::string& fname);
+void* std_aligned_alloc(size_t alignment, size_t size);
+void std_aligned_free(void* ptr);
 void* aligned_ttmem_alloc(size_t size, void*& mem);
 void aligned_ttmem_free(void* mem); // nop if mem == nullptr
 
diff --git a/src/movegen.cpp b/src/movegen.cpp
index 4ff12fc6..d74df4c3 100644
--- a/src/movegen.cpp
+++ b/src/movegen.cpp
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/src/movegen.h b/src/movegen.h
index c2e7c3f1..fb616d00 100644
--- a/src/movegen.h
+++ b/src/movegen.h
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/src/movepick.cpp b/src/movepick.cpp
index 5775f810..96a44449 100644
--- a/src/movepick.cpp
+++ b/src/movepick.cpp
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/src/movepick.h b/src/movepick.h
index aaff388f..f080935a 100644
--- a/src/movepick.h
+++ b/src/movepick.h
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/src/nnue/architectures/halfkp_256x2-32-32.h b/src/nnue/architectures/halfkp_256x2-32-32.h
new file mode 100644
index 00000000..9216bd41
--- /dev/null
+++ b/src/nnue/architectures/halfkp_256x2-32-32.h
@@ -0,0 +1,54 @@
+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+// Definition of input features and network structure used in NNUE evaluation function
+
+#ifndef NNUE_HALFKP_256X2_32_32_H_INCLUDED
+#define NNUE_HALFKP_256X2_32_32_H_INCLUDED
+
+#include "../features/feature_set.h"
+#include "../features/half_kp.h"
+
+#include "../layers/input_slice.h"
+#include "../layers/affine_transform.h"
+#include "../layers/clipped_relu.h"
+
+namespace Eval::NNUE {
+
+// Input feature set: a single HalfKP feature whose associated king is Side::kFriend
+using RawFeatures = Features::FeatureSet<
+    Features::HalfKP<Features::Side::kFriend>>;
+
+// Number of input feature dimensions after conversion; InputSlice below is given twice this
+constexpr IndexType kTransformedFeatureDimensions = 256;
+
+namespace Layers {
+
+// Network structure: each layer takes the previous layer type as its template argument
+using InputLayer = InputSlice<kTransformedFeatureDimensions * 2>;
+using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;   // 32 output dimensions
+using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>; // 32 output dimensions
+using OutputLayer = AffineTransform<HiddenLayer2, 1>;                // 1 output dimension
+
+}  // namespace Layers
+
+using Network = Layers::OutputLayer; // the outermost layer type stands for the whole network
+
+}  // namespace Eval::NNUE
+
+#endif // #ifndef NNUE_HALFKP_256X2_32_32_H_INCLUDED
diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
new file mode 100644
index 00000000..af0894b2
--- /dev/null
+++ b/src/nnue/evaluate_nnue.cpp
@@ -0,0 +1,178 @@
+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+// Code for calculating NNUE evaluation function
+
+#include <fstream>
+#include <iostream>
+#include <set>
+
+#include "../evaluate.h"
+#include "../position.h"
+#include "../misc.h"
+#include "../uci.h"
+
+#include "evaluate_nnue.h"
+
+ExtPieceSquare kpp_board_index[PIECE_NB] = {
+ // Convention: W - us, B - them. Each row holds a pair of PS_* values in which
+ // the second one is viewed from the other side, so W and B are reversed.
+    { PS_NONE,     PS_NONE     }, // NOTE(review): rows appear to be indexed by Piece value - confirm
+    { PS_W_PAWN,   PS_B_PAWN   },
+    { PS_W_KNIGHT, PS_B_KNIGHT },
+    { PS_W_BISHOP, PS_B_BISHOP },
+    { PS_W_ROOK,   PS_B_ROOK   },
+    { PS_W_QUEEN,  PS_B_QUEEN  },
+    { PS_W_KING,   PS_B_KING   },
+    { PS_NONE,     PS_NONE     },
+    { PS_NONE,     PS_NONE     },
+    { PS_B_PAWN,   PS_W_PAWN   },
+    { PS_B_KNIGHT, PS_W_KNIGHT },
+    { PS_B_BISHOP, PS_W_BISHOP },
+    { PS_B_ROOK,   PS_W_ROOK   },
+    { PS_B_QUEEN,  PS_W_QUEEN  },
+    { PS_B_KING,   PS_W_KING   },
+    { PS_NONE,     PS_NONE     }
+};
+
+
+namespace Eval::NNUE {
+
+  // Input feature converter (allocated by Initialize(), filled in by ReadParameters())
+  AlignedPtr<FeatureTransformer> feature_transformer;
+
+  // Network that propagates the transformed features to the evaluation output
+  AlignedPtr<Network> network;
+
+  // Name of the evaluation file most recently passed to load_eval_file()
+  std::string fileName;
+
+  namespace Detail {
+
+  // Allocate aligned storage for one T, hand ownership to `pointer`, and zero-fill it
+  template <typename T>
+  void Initialize(AlignedPtr<T>& pointer) {
+
+    pointer.reset(reinterpret_cast<T*>(std_aligned_alloc(alignof(T), sizeof(T))));
+    std::memset(pointer.get(), 0, sizeof(T)); // NOTE(review): allocation result is not checked for nullptr
+  }
+
+  // Read one component: verify its 32-bit hash header, then let the object read its own parameters
+  template <typename T>
+  bool ReadParameters(std::istream& stream, const AlignedPtr<T>& pointer) {
+
+    std::uint32_t header;
+    stream.read(reinterpret_cast<char*>(&header), sizeof(header));
+    if (!stream || header != T::GetHashValue()) return false; // read failure or hash mismatch
+    return pointer->ReadParameters(stream);
+  }
+
+  }  // namespace Detail
+
+  // Allocate and zero-fill both the feature transformer and the network
+  void Initialize() {
+
+    Detail::Initialize(feature_transformer);
+    Detail::Initialize(network);
+  }
+
+  // Read network header: version, hash value, then a length-prefixed architecture string
+  bool ReadHeader(std::istream& stream,
+    std::uint32_t* hash_value, std::string* architecture) {
+
+    std::uint32_t version, size;
+    stream.read(reinterpret_cast<char*>(&version), sizeof(version));
+    stream.read(reinterpret_cast<char*>(hash_value), sizeof(*hash_value));
+    stream.read(reinterpret_cast<char*>(&size), sizeof(size));
+    if (!stream || version != kVersion) return false; // short read or unsupported file version
+    architecture->resize(size); // NOTE(review): size comes straight from the file and is not bounded
+    stream.read(&(*architecture)[0], size);
+    return !stream.fail();
+  }
+
+  // Read header, feature transformer and network in that order; the stream must then be at EOF
+  bool ReadParameters(std::istream& stream) {
+
+    std::uint32_t hash_value;
+    std::string architecture;
+    if (!ReadHeader(stream, &hash_value, &architecture)) return false;
+    if (hash_value != kHashValue) return false; // file hash must equal the compiled-in kHashValue
+    if (!Detail::ReadParameters(stream, feature_transformer)) return false;
+    if (!Detail::ReadParameters(stream, network)) return false;
+    return stream && stream.peek() == std::ios::traits_type::eof(); // trailing bytes are rejected
+  }
+
+  // Proceed with the difference calculation if possible (delegates to the feature transformer)
+  static void UpdateAccumulatorIfPossible(const Position& pos) {
+
+    feature_transformer->UpdateAccumulatorIfPossible(pos);
+  }
+
+  // Calculate the evaluation value, reusing the score stored in the accumulator unless refresh is set
+  static Value ComputeScore(const Position& pos, bool refresh) {
+
+    auto& accumulator = pos.state()->accumulator;
+    if (!refresh && accumulator.computed_score) { // a score was already stored for this state
+      return accumulator.score;
+    }
+
+    alignas(kCacheLineSize) TransformedFeatureType
+        transformed_features[FeatureTransformer::kBufferSize];
+    feature_transformer->Transform(pos, transformed_features, refresh);
+    alignas(kCacheLineSize) char buffer[Network::kBufferSize]; // scratch space handed to Propagate()
+    const auto output = network->Propagate(transformed_features, buffer);
+
+    auto score = static_cast<Value>(output[0] / FV_SCALE); // first output element divided by FV_SCALE
+
+    accumulator.score = score; // store the result so later non-refresh calls can return it
+    accumulator.computed_score = true;
+    return accumulator.score;
+  }
+
+  // Load the evaluation function file; returns whether ReadParameters() accepted its contents
+  bool load_eval_file(const std::string& evalFile) {
+
+    Initialize(); // parameters are re-allocated and zeroed before the file is even opened
+    fileName = evalFile;
+
+    std::ifstream stream(evalFile, std::ios::binary);
+
+    const bool result = ReadParameters(stream); // a stream that failed to open makes this false
+
+    return result;
+  }
+
+  // Evaluation function. Perform differential calculation (refresh = false) and clamp the result.
+  Value evaluate(const Position& pos) {
+    Value v = ComputeScore(pos, false);
+    v = Utility::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1); // stay strictly between the two bounds
+
+    return v;
+  }
+
+  // Evaluation function. Perform full calculation (refresh = true); the result is not clamped.
+  Value compute_eval(const Position& pos) {
+    return ComputeScore(pos, true);
+  }
+
+  // Public entry point: proceed with the difference calculation if possible
+  void update_eval(const Position& pos) {
+    UpdateAccumulatorIfPossible(pos);
+  }
+
+} // namespace Eval::NNUE
diff --git a/src/nnue/evaluate_nnue.h b/src/nnue/evaluate_nnue.h
new file mode 100644
index 00000000..5f0d1855
--- /dev/null
+++ b/src/nnue/evaluate_nnue.h
@@ -0,0 +1,48 @@
+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+// header used in NNUE evaluation function
+
+#ifndef NNUE_EVALUATE_NNUE_H_INCLUDED
+#define NNUE_EVALUATE_NNUE_H_INCLUDED
+
+#include "nnue_feature_transformer.h"
+
+#include <memory>
+
+namespace Eval::NNUE {
+
+  // Hash value of evaluation function structure: XOR of the transformer and network hashes
+  constexpr std::uint32_t kHashValue =
+      FeatureTransformer::GetHashValue() ^ Network::GetHashValue();
+
+  // Deleter for AlignedPtr: runs the destructor, then releases the memory area
+  template <typename T>
+  struct AlignedDeleter {
+    void operator()(T* ptr) const {
+      ptr->~T();
+      std_aligned_free(ptr); // storage is expected to come from std_aligned_alloc()
+    }
+  };
+
+  template <typename T>
+  using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>; // owning pointer released via AlignedDeleter
+
+}  // namespace Eval::NNUE
+
+#endif // #ifndef NNUE_EVALUATE_NNUE_H_INCLUDED
diff --git a/src/nnue/features/feature_set.h b/src/nnue/features/feature_set.h
new file mode 100644
index 00000000..79ca83ae
--- /dev/null
+++ b/src/nnue/features/feature_set.h
@@ -0,0 +1,135 @@
+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+// A class template that represents the input feature set of the NNUE evaluation function
+
+#ifndef NNUE_FEATURE_SET_H_INCLUDED
+#define NNUE_FEATURE_SET_H_INCLUDED
+
+#include "features_common.h"
+#include <array>
+
+namespace Eval::NNUE::Features {
+
+  // Class template that represents a compile-time list of values of type T
+  template <typename T, T... Values>
+  struct CompileTimeList;
+
+  template <typename T, T First, T... Remaining>
+  struct CompileTimeList<T, First, Remaining...> {
+    static constexpr bool Contains(T value) {
+      return value == First || CompileTimeList<T, Remaining...>::Contains(value); // NOTE(review): no empty-list specialization is visible here - confirm Contains() is usable
+    }
+    static constexpr std::array<T, sizeof...(Remaining) + 1>
+        kValues = {{First, Remaining...}}; // all list elements, in declaration order
+  };
+
+  // Base class of feature set; forwards to the static Collect* functions of Derived
+  template <typename Derived>
+  class FeatureSetBase {
+
+   public:
+    // Append the indices of active features, one list per perspective (WHITE, BLACK)
+    template <typename IndexListType>
+    static void AppendActiveIndices(
+        const Position& pos, TriggerEvent trigger, IndexListType active[2]) {
+
+      for (Color perspective : { WHITE, BLACK }) {
+        Derived::CollectActiveIndices(
+            pos, trigger, perspective, &active[perspective]);
+      }
+    }
+
+    // Append recently changed feature indices per perspective; reset[p] marks a full re-collection
+    template <typename PositionType, typename IndexListType>
+    static void AppendChangedIndices(
+        const PositionType& pos, TriggerEvent trigger,
+        IndexListType removed[2], IndexListType added[2], bool reset[2]) {
+
+      const auto& dp = pos.state()->dirtyPiece;
+      if (dp.dirty_num == 0) return; // NOTE(review): reset[] is left untouched on this path
+
+      for (Color perspective : { WHITE, BLACK }) {
+        reset[perspective] = false;
+        switch (trigger) {
+          case TriggerEvent::kFriendKingMoved:
+            reset[perspective] =
+                dp.pieceId[0] == PIECE_ID_KING + perspective; // first dirty piece is this perspective's king id
+            break;
+          default:
+            assert(false); // no other trigger is handled here
+            break;
+        }
+        if (reset[perspective]) {
+          Derived::CollectActiveIndices(
+              pos, trigger, perspective, &added[perspective]); // full list goes into `added`
+        } else {
+          Derived::CollectChangedIndices(
+              pos, trigger, perspective,
+              &removed[perspective], &added[perspective]);
+        }
+      }
+    }
+  };
+
+  // Specialization of FeatureSet for exactly one feature type; forwards everything to it
+  template <typename FeatureType>
+  class FeatureSet<FeatureType> : public FeatureSetBase<FeatureSet<FeatureType>> {
+
+   public:
+    // Hash value embedded in the evaluation file
+    static constexpr std::uint32_t kHashValue = FeatureType::kHashValue;
+    // Number of feature dimensions
+    static constexpr IndexType kDimensions = FeatureType::kDimensions;
+    // Maximum number of simultaneously active features
+    static constexpr IndexType kMaxActiveDimensions =
+        FeatureType::kMaxActiveDimensions;
+    // Triggers for full calculation instead of difference calculation (a one-element list here)
+    using SortedTriggerSet =
+        CompileTimeList<TriggerEvent, FeatureType::kRefreshTrigger>;
+    static constexpr auto kRefreshTriggers = SortedTriggerSet::kValues;
+
+   private:
+    // Append active feature indices, but only when trigger is this feature's refresh trigger
+    static void CollectActiveIndices(
+        const Position& pos, const TriggerEvent trigger, const Color perspective,
+        IndexList* const active) {
+      if (FeatureType::kRefreshTrigger == trigger) {
+        FeatureType::AppendActiveIndices(pos, perspective, active);
+      }
+    }
+
+    // Append changed feature indices, but only when trigger is this feature's refresh trigger
+    static void CollectChangedIndices(
+        const Position& pos, const TriggerEvent trigger, const Color perspective,
+        IndexList* const removed, IndexList* const added) {
+
+      if (FeatureType::kRefreshTrigger == trigger) {
+        FeatureType::AppendChangedIndices(pos, perspective, removed, added);
+      }
+    }
+
+    // Befriend the base class (it calls the private Collect* functions) and all FeatureSet instantiations
+    friend class FeatureSetBase<FeatureSet>;
+    template <typename... FeatureTypes>
+    friend class FeatureSet;
+  };
+
+}  // namespace Eval::NNUE::Features
+
+#endif // #ifndef NNUE_FEATURE_SET_H_INCLUDED
diff --git a/src/nnue/features/features_common.h b/src/nnue/features/features_common.h
new file mode 100644
index 00000000..d00a35df
--- /dev/null
+++ b/src/nnue/features/features_common.h
@@ -0,0 +1,45 @@
+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+// Common header for the input features of the NNUE evaluation function
+
+#ifndef NNUE_FEATURES_COMMON_H_INCLUDED
+#define NNUE_FEATURES_COMMON_H_INCLUDED
+
+#include "../../evaluate.h"
+#include "../nnue_common.h"
+
+namespace Eval::NNUE::Features {
+
+  class IndexList; // forward declaration only
+
+  template <typename... FeatureTypes>
+  class FeatureSet; // primary template, declared but not defined here
+
+  // Trigger to perform full calculations instead of difference only
+  enum class TriggerEvent {
+    kFriendKingMoved // calculate full evaluation when own king moves
+  };
+
+  enum class Side {
+    kFriend // own side; HalfKP::GetPieces maps this to the king of the given perspective
+  };
+
+}  // namespace Eval::NNUE::Features
+
+#endif // #ifndef NNUE_FEATURES_COMMON_H_INCLUDED
diff --git a/src/nnue/features/half_kp.cpp b/src/nnue/features/half_kp.cpp
new file mode 100644
index 00000000..628add6e
--- /dev/null
+++ b/src/nnue/features/half_kp.cpp
@@ -0,0 +1,92 @@
+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+// Definition of the HalfKP input features of the NNUE evaluation function
+
+#include "half_kp.h"
+#include "index_list.h"
+
+namespace Eval::NNUE::Features {
+
+  // Feature index for king square sq_k and PieceSquare p, computed as PS_END * sq_k + p
+  template <Side AssociatedKing>
+  inline IndexType HalfKP<AssociatedKing>::MakeIndex(Square sq_k, PieceSquare p) {
+    return static_cast<IndexType>(PS_END) * static_cast<IndexType>(sq_k) + p;
+  }
+
+  // Output the piece list for the perspective and the target king's square derived from that list
+  template <Side AssociatedKing>
+  inline void HalfKP<AssociatedKing>::GetPieces(
+      const Position& pos, Color perspective,
+      PieceSquare** pieces, Square* sq_target_k) {
+
+    *pieces = (perspective == BLACK) ?
+        pos.eval_list()->piece_list_fb() :
+        pos.eval_list()->piece_list_fw();
+    const PieceId target = (AssociatedKing == Side::kFriend) ?
+        static_cast<PieceId>(PIECE_ID_KING + perspective) :
+        static_cast<PieceId>(PIECE_ID_KING + ~perspective); // the other colour's king id
+    *sq_target_k = static_cast<Square>(((*pieces)[target] - PS_W_KING) % SQUARE_NB);
+  }
+
+  // Append one feature index per non-empty piece-list entry below PIECE_ID_KING
+  template <Side AssociatedKing>
+  void HalfKP<AssociatedKing>::AppendActiveIndices(
+      const Position& pos, Color perspective, IndexList* active) {
+
+    // Do nothing if the IndexList capacity is too small; this also avoids a compiler warning
+    if (RawFeatures::kMaxActiveDimensions < kMaxActiveDimensions) return;
+
+    PieceSquare* pieces;
+    Square sq_target_k;
+    GetPieces(pos, perspective, &pieces, &sq_target_k);
+    for (PieceId i = PIECE_ID_ZERO; i < PIECE_ID_KING; ++i) { // ids from PIECE_ID_KING upwards are not visited
+      if (pieces[i] != PS_NONE) {
+        active->push_back(MakeIndex(sq_target_k, pieces[i]));
+      }
+    }
+  }
+
+  // Append removed/added feature indices for every dirty piece recorded in the position state
+  template <Side AssociatedKing>
+  void HalfKP<AssociatedKing>::AppendChangedIndices(
+      const Position& pos, Color perspective,
+      IndexList* removed, IndexList* added) {
+
+    PieceSquare* pieces;
+    Square sq_target_k;
+    GetPieces(pos, perspective, &pieces, &sq_target_k);
+    const auto& dp = pos.state()->dirtyPiece;
+    for (int i = 0; i < dp.dirty_num; ++i) {
+      if (dp.pieceId[i] >= PIECE_ID_KING) continue; // ids at or above PIECE_ID_KING produce no feature
+      const auto old_p = static_cast<PieceSquare>(
+          dp.old_piece[i].from[perspective]);
+      if (old_p != PS_NONE) { // a PS_NONE old value means there is nothing to remove
+        removed->push_back(MakeIndex(sq_target_k, old_p));
+      }
+      const auto new_p = static_cast<PieceSquare>(
+          dp.new_piece[i].from[perspective]);
+      if (new_p != PS_NONE) { // a PS_NONE new value means there is nothing to add
+        added->push_back(MakeIndex(sq_target_k, new_p));
+      }
+    }
+  }
+
+  template class HalfKP<Side::kFriend>; // explicit instantiation of the members defined in this file
+
+}  // namespace Eval::NNUE::Features
diff --git a/src/nnue/features/half_kp.h b/src/nnue/features/half_kp.h
new file mode 100644
index 00000000..99842eea
--- /dev/null
+++ b/src/nnue/features/half_kp.h
@@ -0,0 +1,67 @@
+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+// Definition of the HalfKP input features of the NNUE evaluation function
+
+#ifndef NNUE_FEATURES_HALF_KP_H_INCLUDED
+#define NNUE_FEATURES_HALF_KP_H_INCLUDED
+
+#include "../../evaluate.h"
+#include "features_common.h"
+
+namespace Eval::NNUE::Features {
+
+  // Feature HalfKP: combination of the position of the associated king
+  // and the position of pieces other than kings
+  template <Side AssociatedKing>
+  class HalfKP {
+
+   public:
+    // Feature name
+    static constexpr const char* kName = "HalfKP(Friend)"; // NOTE(review): same string for every AssociatedKing
+    // Hash value embedded in the evaluation file
+    static constexpr std::uint32_t kHashValue =
+        0x5D69D5B9u ^ (AssociatedKing == Side::kFriend);
+    // Number of feature dimensions: one per (king square, PieceSquare value) pair
+    static constexpr IndexType kDimensions =
+        static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(PS_END);
+    // Maximum number of simultaneously active features
+    static constexpr IndexType kMaxActiveDimensions = PIECE_ID_KING;
+    // Trigger for full calculation instead of difference calculation
+    static constexpr TriggerEvent kRefreshTrigger = TriggerEvent::kFriendKingMoved;
+
+    // Append the indices of the active features for the given perspective to `active`
+    static void AppendActiveIndices(const Position& pos, Color perspective,
+                                    IndexList* active);
+
+    // Append the indices of recently changed features to `removed` and `added`
+    static void AppendChangedIndices(const Position& pos, Color perspective,
+                                     IndexList* removed, IndexList* added);
+
+    // Index of a feature for a given king position and another piece on some square
+    static IndexType MakeIndex(Square sq_k, PieceSquare p);
+
+   private:
+    // Output the piece list for the perspective and the square of the target king
+    static void GetPieces(const Position& pos, Color perspective,
+                          PieceSquare** pieces, Square* sq_target_k);
+  };
+
+}  // namespace Eval::NNUE::Features
+
+#endif // #ifndef NNUE_FEATURES_HALF_KP_H_INCLUDED
diff --git a/src/nnue/features/index_list.h b/src/nnue/features/index_list.h
new file mode 100644
index 00000000..d9ad680a
--- /dev/null
+++ b/src/nnue/features/index_list.h
@@ -0,0 +1,64 @@
+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+// Definition of index list of input features
+
+#ifndef NNUE_FEATURES_INDEX_LIST_H_INCLUDED
+#define NNUE_FEATURES_INDEX_LIST_H_INCLUDED
+
+#include "../../position.h"
+#include "../nnue_architecture.h"
+
+namespace Eval::NNUE::Features {
+
+  // Fixed-capacity list (at most MaxSize elements of T) used for feature index lists
+  template <typename T, std::size_t MaxSize>
+  class ValueList {
+
+   public:
+    std::size_t size() const { return size_; }
+    void resize(std::size_t size) { size_ = size; } // only changes the count; elements are untouched
+    void push_back(const T& value) { values_[size_++] = value; } // no bounds check against MaxSize
+    T& operator[](std::size_t index) { return values_[index]; }
+    T* begin() { return values_; }
+    T* end() { return values_ + size_; }
+    const T& operator[](std::size_t index) const { return values_[index]; }
+    const T* begin() const { return values_; }
+    const T* end() const { return values_ + size_; }
+
+    void swap(ValueList& other) {
+      const std::size_t max_size = std::max(size_, other.size_); // only the used prefix needs swapping
+      for (std::size_t i = 0; i < max_size; ++i) {
+        std::swap(values_[i], other.values_[i]);
+      }
+      std::swap(size_, other.size_);
+    }
+
+   private:
+    T values_[MaxSize]; // in-object storage
+    std::size_t size_ = 0; // number of elements currently in use
+  };
+
+  // Type of feature index list: a ValueList of IndexType with room for kMaxActiveDimensions entries
+  class IndexList
+      : public ValueList<IndexType, RawFeatures::kMaxActiveDimensions> {
+  };
+
+}  // namespace Eval::NNUE::Features
+
+#endif // NNUE_FEATURES_INDEX_LIST_H_INCLUDED
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
new file mode 100644
index 00000000..b585bc87
--- /dev/null
+++ b/src/nnue/layers/affine_transform.h
@@ -0,0 +1,215 @@
+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+// Definition of layer AffineTransform of NNUE evaluation function
+
+#ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
+#define NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
+
+#include <iostream>
+#include "../nnue_common.h"
+
+namespace Eval::NNUE::Layers {
+
+  // Affine transformation layer: output = weights * input + biases
+  template <typename PreviousLayer, IndexType OutputDimensions>
+  class AffineTransform {
+   public:
+    // Input/output type (unsigned 8-bit inputs, signed 32-bit outputs)
+    using InputType = typename PreviousLayer::OutputType;
+    using OutputType = std::int32_t;
+    static_assert(std::is_same<InputType, std::uint8_t>::value, "");
+
+    // Number of input/output dimensions (rows padded to a multiple of kMaxSimdWidth)
+    static constexpr IndexType kInputDimensions =
+        PreviousLayer::kOutputDimensions;
+    static constexpr IndexType kOutputDimensions = OutputDimensions;
+    static constexpr IndexType kPaddedInputDimensions =
+        CeilToMultiple<IndexType>(kInputDimensions, kMaxSimdWidth);
+
+    // Size of forward propagation buffer used in this layer (whole cache lines)
+    static constexpr std::size_t kSelfBufferSize =
+        CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+
+    // Size of the forward propagation buffer used from the input layer to this layer
+    static constexpr std::size_t kBufferSize =
+        PreviousLayer::kBufferSize + kSelfBufferSize;
+
+    // Hash value embedded in the evaluation file (depends on the previous layer's hash)
+    static constexpr std::uint32_t GetHashValue() {
+      std::uint32_t hash_value = 0xCC03DAE4u;
+      hash_value += kOutputDimensions;
+      hash_value ^= PreviousLayer::GetHashValue() >> 1;
+      hash_value ^= PreviousLayer::GetHashValue() << 31;
+      return hash_value;
+    }
+
+    // Read network parameters: previous layer first, then biases_, then weights_
+    bool ReadParameters(std::istream& stream) {
+      if (!previous_layer_.ReadParameters(stream)) return false;
+      stream.read(reinterpret_cast<char*>(biases_),
+                  kOutputDimensions * sizeof(BiasType));
+      stream.read(reinterpret_cast<char*>(weights_),
+                  kOutputDimensions * kPaddedInputDimensions *
+                  sizeof(WeightType));
+      return !stream.fail();
+    }
+
+    // Forward propagation: outputs go to the start of buffer, the rest is for earlier layers
+    const OutputType* Propagate(
+        const TransformedFeatureType* transformed_features, char* buffer) const {
+      const auto input = previous_layer_.Propagate(
+          transformed_features, buffer + kSelfBufferSize);
+      const auto output = reinterpret_cast<OutputType*>(buffer);
+
+  #if defined(USE_AVX512)
+      constexpr IndexType kNumChunks = kPaddedInputDimensions / (kSimdWidth * 2);
+      const __m512i kOnes = _mm512_set1_epi16(1);  // madd by 1 widens 16-bit pairs to 32 bits
+      const auto input_vector = reinterpret_cast<const __m512i*>(input);
+
+  #elif defined(USE_AVX2)
+      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+      const __m256i kOnes = _mm256_set1_epi16(1);
+      const auto input_vector = reinterpret_cast<const __m256i*>(input);
+
+  #elif defined(USE_SSSE3)
+      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+      const __m128i kOnes = _mm_set1_epi16(1);
+      const auto input_vector = reinterpret_cast<const __m128i*>(input);
+
+  #elif defined(USE_NEON)
+      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+      const auto input_vector = reinterpret_cast<const int8x8_t*>(input);
+  #endif
+
+      for (IndexType i = 0; i < kOutputDimensions; ++i) {  // one weight row per output
+        const IndexType offset = i * kPaddedInputDimensions;
+
+  #if defined(USE_AVX512)
+        __m512i sum = _mm512_setzero_si512();
+        const auto row = reinterpret_cast<const __m512i*>(&weights_[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+            __m512i product = _mm512_maddubs_epi16(_mm512_loadu_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
+  #else
+            __m512i product = _mm512_maddubs_epi16(_mm512_load_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
+  #endif
+
+            product = _mm512_madd_epi16(product, kOnes);
+            sum = _mm512_add_epi32(sum, product);
+        }
+        output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];
+
+        // Note: Changing kMaxSimdWidth from 32 to 64 breaks loading existing networks.
+        // As a result kPaddedInputDimensions may not be an even multiple of 64(512bit)
+        // and we have to do one more 256bit chunk.
+        if (kPaddedInputDimensions != kNumChunks * kSimdWidth * 2)
+        {
+            const auto iv_256  = reinterpret_cast<const __m256i*>(input);
+            const auto row_256 = reinterpret_cast<const __m256i*>(&weights_[offset]);
+            int j = kNumChunks * 2;  // first 256-bit chunk after the 512-bit ones
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)  // See HACK comment below in AVX2.
+            __m256i sum256 = _mm256_maddubs_epi16(_mm256_loadu_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
+  #else
+            __m256i sum256 = _mm256_maddubs_epi16(_mm256_load_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
+  #endif
+
+            sum256 = _mm256_madd_epi16(sum256, _mm256_set1_epi16(1));
+            sum256 = _mm256_hadd_epi32(sum256, sum256);
+            sum256 = _mm256_hadd_epi32(sum256, sum256);
+            const __m128i lo = _mm256_extracti128_si256(sum256, 0);
+            const __m128i hi = _mm256_extracti128_si256(sum256, 1);
+            output[i] += _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi);
+        }
+
+  #elif defined(USE_AVX2)
+        __m256i sum = _mm256_setzero_si256();
+        const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m256i product = _mm256_maddubs_epi16(
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+            // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
+            //       compiled with g++ in MSYS2 crashes here because the output memory is not aligned
+            //       even though alignas is specified.
+            _mm256_loadu_si256
+  #else
+            _mm256_load_si256
+  #endif
+
+            (&input_vector[j]), _mm256_load_si256(&row[j]));
+          product = _mm256_madd_epi16(product, kOnes);
+          sum = _mm256_add_epi32(sum, product);
+        }
+        sum = _mm256_hadd_epi32(sum, sum);
+        sum = _mm256_hadd_epi32(sum, sum);
+        const __m128i lo = _mm256_extracti128_si256(sum, 0);
+        const __m128i hi = _mm256_extracti128_si256(sum, 1);
+        output[i] = _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi) + biases_[i];
+
+  #elif defined(USE_SSSE3)
+        __m128i sum = _mm_cvtsi32_si128(biases_[i]);  // bias seeds the lowest lane
+        const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m128i product = _mm_maddubs_epi16(
+              _mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
+          product = _mm_madd_epi16(product, kOnes);
+          sum = _mm_add_epi32(sum, product);
+        }
+        sum = _mm_hadd_epi32(sum, sum);
+        sum = _mm_hadd_epi32(sum, sum);
+        output[i] = _mm_cvtsi128_si32(sum);
+
+  #elif defined(USE_NEON)
+        int32x4_t sum = {biases_[i]};
+        const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          int16x8_t product = vmull_s8(input_vector[j * 2], row[j * 2]);
+          product = vmlal_s8(product, input_vector[j * 2 + 1], row[j * 2 + 1]);
+          sum = vpadalq_s16(sum, product);
+        }
+        output[i] = sum[0] + sum[1] + sum[2] + sum[3];
+
+  #else
+        OutputType sum = biases_[i];
+        for (IndexType j = 0; j < kInputDimensions; ++j) {  // padding columns are skipped
+          sum += weights_[offset + j] * input[j];
+        }
+        output[i] = sum;
+  #endif
+
+      }
+      return output;
+    }
+
+   private:
+    using BiasType = OutputType;
+    using WeightType = std::int8_t;
+
+    PreviousLayer previous_layer_;  // the layer feeding this one, held by value
+
+    alignas(kCacheLineSize) BiasType biases_[kOutputDimensions];
+    alignas(kCacheLineSize)
+        WeightType weights_[kOutputDimensions * kPaddedInputDimensions];
+  };
+
+}  // namespace Eval::NNUE::Layers
+
+#endif // #ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h
new file mode 100644
index 00000000..7ade598f
--- /dev/null
+++ b/src/nnue/layers/clipped_relu.h
@@ -0,0 +1,186 @@
+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+// Definition of layer ClippedReLU of NNUE evaluation function
+
+#ifndef NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
+#define NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
+
+#include "../nnue_common.h"
+
+namespace Eval::NNUE::Layers {
+
+  // Clipped ReLU: output[i] = clamp(input[i] >> kWeightScaleBits, 0, 127)
+  template <typename PreviousLayer>
+  class ClippedReLU {
+   public:
+    // Input/output type (signed 32-bit inputs, unsigned 8-bit outputs)
+    using InputType = typename PreviousLayer::OutputType;
+    using OutputType = std::uint8_t;
+    static_assert(std::is_same<InputType, std::int32_t>::value, "");
+
+    // Number of input/output dimensions (the layer is element-wise)
+    static constexpr IndexType kInputDimensions =
+        PreviousLayer::kOutputDimensions;
+    static constexpr IndexType kOutputDimensions = kInputDimensions;
+
+    // Size of forward propagation buffer used in this layer (whole cache lines)
+    static constexpr std::size_t kSelfBufferSize =
+        CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+
+    // Size of the forward propagation buffer used from the input layer to this layer
+    static constexpr std::size_t kBufferSize =
+        PreviousLayer::kBufferSize + kSelfBufferSize;
+
+    // Hash value embedded in the evaluation file (depends on the previous layer's hash)
+    static constexpr std::uint32_t GetHashValue() {
+      std::uint32_t hash_value = 0x538D24C7u;
+      hash_value += PreviousLayer::GetHashValue();
+      return hash_value;
+    }
+
+    // Read network parameters (this layer has none; only the previous layer reads)
+    bool ReadParameters(std::istream& stream) {
+      return previous_layer_.ReadParameters(stream);
+    }
+
+    // Forward propagation: outputs go to the start of buffer, the rest is for earlier layers
+    const OutputType* Propagate(
+        const TransformedFeatureType* transformed_features, char* buffer) const {
+      const auto input = previous_layer_.Propagate(
+          transformed_features, buffer + kSelfBufferSize);
+      const auto output = reinterpret_cast<OutputType*>(buffer);
+
+  #if defined(USE_AVX2)
+      constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
+      const __m256i kZero = _mm256_setzero_si256();
+      const __m256i kOffsets = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);  // reorders the packed result
+      const auto in = reinterpret_cast<const __m256i*>(input);
+      const auto out = reinterpret_cast<__m256i*>(output);
+      for (IndexType i = 0; i < kNumChunks; ++i) {
+        const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+          // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
+          //       compiled with g++ in MSYS2 crashes here because the output memory is not aligned
+          //       even though alignas is specified.
+          _mm256_loadu_si256
+  #else
+          _mm256_load_si256
+  #endif
+
+          (&in[i * 4 + 0]),
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+          _mm256_loadu_si256
+  #else
+          _mm256_load_si256
+  #endif
+
+          (&in[i * 4 + 1])), kWeightScaleBits);
+        const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+          _mm256_loadu_si256
+  #else
+          _mm256_load_si256
+  #endif
+
+          (&in[i * 4 + 2]),
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+          _mm256_loadu_si256
+  #else
+          _mm256_load_si256
+  #endif
+
+          (&in[i * 4 + 3])), kWeightScaleBits);
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+        _mm256_storeu_si256
+  #else
+        _mm256_store_si256
+  #endif
+
+          (&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
+            _mm256_packs_epi16(words0, words1), kZero), kOffsets));
+      }
+      constexpr IndexType kStart = kNumChunks * kSimdWidth;  // first element left for the scalar loop
+
+  #elif defined(USE_SSSE3)
+      constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
+
+  #ifdef USE_SSE41
+      const __m128i kZero = _mm_setzero_si128();
+  #else
+      const __m128i k0x80s = _mm_set1_epi8(-128);  // used to emulate max(x, 0) without SSE4.1
+  #endif
+
+      const auto in = reinterpret_cast<const __m128i*>(input);
+      const auto out = reinterpret_cast<__m128i*>(output);
+      for (IndexType i = 0; i < kNumChunks; ++i) {
+        const __m128i words0 = _mm_srai_epi16(_mm_packs_epi32(
+            _mm_load_si128(&in[i * 4 + 0]),
+            _mm_load_si128(&in[i * 4 + 1])), kWeightScaleBits);
+        const __m128i words1 = _mm_srai_epi16(_mm_packs_epi32(
+            _mm_load_si128(&in[i * 4 + 2]),
+            _mm_load_si128(&in[i * 4 + 3])), kWeightScaleBits);
+        const __m128i packedbytes = _mm_packs_epi16(words0, words1);
+        _mm_store_si128(&out[i],
+
+  #ifdef USE_SSE41
+          _mm_max_epi8(packedbytes, kZero)
+  #else
+          _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
+  #endif
+
+        );
+      }
+      constexpr IndexType kStart = kNumChunks * kSimdWidth;  // first element left for the scalar loop
+
+  #elif defined(USE_NEON)
+      constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2);
+      const int8x8_t kZero = {0};
+      const auto in = reinterpret_cast<const int32x4_t*>(input);
+      const auto out = reinterpret_cast<int8x8_t*>(output);
+      for (IndexType i = 0; i < kNumChunks; ++i) {
+        int16x8_t shifted;
+        const auto pack = reinterpret_cast<int16x4_t*>(&shifted);  // fill shifted as two halves
+        pack[0] = vqshrn_n_s32(in[i * 2 + 0], kWeightScaleBits);
+        pack[1] = vqshrn_n_s32(in[i * 2 + 1], kWeightScaleBits);
+        out[i] = vmax_s8(vqmovn_s16(shifted), kZero);
+      }
+      constexpr IndexType kStart = kNumChunks * (kSimdWidth / 2);  // first element left for the scalar loop
+  #else
+      constexpr IndexType kStart = 0;  // no SIMD path: the scalar loop handles everything
+  #endif
+
+      for (IndexType i = kStart; i < kInputDimensions; ++i) {  // scalar path for the remaining elements
+        output[i] = static_cast<OutputType>(
+            std::max(0, std::min(127, input[i] >> kWeightScaleBits)));
+      }
+      return output;
+    }
+
+   private:
+    PreviousLayer previous_layer_;  // the layer feeding this one, held by value
+  };
+
+}  // namespace Eval::NNUE::Layers
+
+#endif // NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
diff --git a/src/nnue/layers/input_slice.h b/src/nnue/layers/input_slice.h
new file mode 100644
index 00000000..afca14c8
--- /dev/null
+++ b/src/nnue/layers/input_slice.h
@@ -0,0 +1,68 @@
+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+// Definition of layer InputSlice of NNUE evaluation function
+
+#ifndef NNUE_LAYERS_INPUT_SLICE_H_INCLUDED
+#define NNUE_LAYERS_INPUT_SLICE_H_INCLUDED
+
+#include "../nnue_common.h"
+
+namespace Eval::NNUE::Layers {
+
+// Input layer: exposes the transformed features starting at element Offset
+template <IndexType OutputDimensions, IndexType Offset = 0>
+class InputSlice {
+ public:
+  // Offset must be a multiple of kMaxSimdWidth to maintain alignment
+  static_assert(Offset % kMaxSimdWidth == 0, "");
+
+  // Element type handed to the next layer
+  using OutputType = TransformedFeatureType;
+
+  // Number of elements exposed to the next layer
+  static constexpr IndexType kOutputDimensions = OutputDimensions;
+
+  // This layer needs no forward propagation buffer of its own
+  static constexpr std::size_t kBufferSize = 0;
+
+  // Hash value embedded in the evaluation file
+  static constexpr std::uint32_t GetHashValue() {
+    constexpr std::uint32_t kSeed = 0xEC42E90Du;
+    constexpr std::uint32_t kShape = kOutputDimensions ^ (Offset << 10);
+    return kSeed ^ kShape;
+  }
+
+  // Read network parameters (nothing to read; always succeeds)
+  bool ReadParameters(std::istream& /*stream*/) {
+    return true;
+  }
+
+  // Forward propagation: points Offset elements into the transformed features
+  const OutputType* Propagate(
+      const TransformedFeatureType* transformed_features,
+      char* /*buffer*/) const {
+    return &transformed_features[Offset];
+  }
+
+ private:
+};
+
+}  // namespace Eval::NNUE::Layers
+
+#endif // #ifndef NNUE_LAYERS_INPUT_SLICE_H_INCLUDED
diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h
new file mode 100644
index 00000000..2a354a3c
--- /dev/null
+++ b/src/nnue/nnue_accumulator.h
@@ -0,0 +1,39 @@
+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+// Class for difference calculation of NNUE evaluation function
+
+#ifndef NNUE_ACCUMULATOR_H_INCLUDED
+#define NNUE_ACCUMULATOR_H_INCLUDED
+
+#include "nnue_architecture.h"
+
+namespace Eval::NNUE {
+
+  // Per-position cache of the feature transformer's affine output, plus a cached score
+  struct alignas(32) Accumulator {
+    std::int16_t  // indexed [perspective][refresh trigger][transformed feature]
+        accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];
+    Value score;
+    bool computed_accumulation;  // set once accumulation has been filled in
+    bool computed_score;  // cleared when accumulation is refreshed; presumably guards score
+  };
+
+}  // namespace Eval::NNUE
+
+#endif // NNUE_ACCUMULATOR_H_INCLUDED
diff --git a/src/nnue/nnue_architecture.h b/src/nnue/nnue_architecture.h
new file mode 100644
index 00000000..91cdc4bd
--- /dev/null
+++ b/src/nnue/nnue_architecture.h
@@ -0,0 +1,38 @@
+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+// Input features and network structure used in NNUE evaluation function
+
+#ifndef NNUE_ARCHITECTURE_H_INCLUDED
+#define NNUE_ARCHITECTURE_H_INCLUDED
+
+// Defines the network structure
+#include "architectures/halfkp_256x2-32-32.h"
+
+namespace Eval::NNUE {
+
+  static_assert(std::is_same_v<Network::OutputType, std::int32_t>, "");
+  static_assert(Network::kOutputDimensions == 1, "");  // exactly one output value
+  static_assert(kTransformedFeatureDimensions % kMaxSimdWidth == 0, "");
+
+  // Feature-set triggers that call for a full calculation rather than a difference update
+  constexpr auto kRefreshTriggers = RawFeatures::kRefreshTriggers;
+
+}  // namespace Eval::NNUE
+
+#endif // #ifndef NNUE_ARCHITECTURE_H_INCLUDED
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
new file mode 100644
index 00000000..972ef3e5
--- /dev/null
+++ b/src/nnue/nnue_common.h
@@ -0,0 +1,77 @@
+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+// Constants used in NNUE evaluation function
+
+#ifndef NNUE_COMMON_H_INCLUDED
+#define NNUE_COMMON_H_INCLUDED
+
+#if defined(USE_AVX2)
+#include <immintrin.h>
+
+#elif defined(USE_SSE41)
+#include <smmintrin.h>
+
+#elif defined(USE_SSSE3)
+#include <tmmintrin.h>
+
+#elif defined(USE_SSE2)
+#include <emmintrin.h>
+
+#elif defined(USE_NEON)
+#include <arm_neon.h>
+#endif
+
+namespace Eval::NNUE {
+
+  // Version of the evaluation file
+  constexpr std::uint32_t kVersion = 0x7AF32F16u;
+
+  // Constants used in evaluation value calculation
+  constexpr int FV_SCALE = 16;
+  constexpr int kWeightScaleBits = 6;  // right-shift applied to layer outputs in ClippedReLU
+
+  // Size of cache line (in bytes)
+  constexpr std::size_t kCacheLineSize = 64;
+
+  // SIMD width (in bytes); not defined when none of the macros below is set
+  #if defined(USE_AVX2)
+  constexpr std::size_t kSimdWidth = 32;
+
+  #elif defined(USE_SSE2)
+  constexpr std::size_t kSimdWidth = 16;
+
+  #elif defined(USE_NEON)
+  constexpr std::size_t kSimdWidth = 16;
+  #endif
+
+  constexpr std::size_t kMaxSimdWidth = 32;  // padding/alignment unit, same on every target
+
+  // Type of input feature after conversion, and the index type used for dimensions
+  using TransformedFeatureType = std::uint8_t;
+  using IndexType = std::uint32_t;
+
+  // Round n up to the nearest multiple of base
+  template <typename IntType>
+  constexpr IntType CeilToMultiple(IntType n, IntType base) {
+    return (n + base - 1) - (n + base - 1) % base;
+  }
+
+}  // namespace Eval::NNUE
+
+#endif // #ifndef NNUE_COMMON_H_INCLUDED
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
new file mode 100644
index 00000000..1cfebbe4
--- /dev/null
+++ b/src/nnue/nnue_feature_transformer.h
@@ -0,0 +1,355 @@
+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+// A class that converts the input features of the NNUE evaluation function
+
+#ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
+#define NNUE_FEATURE_TRANSFORMER_H_INCLUDED
+
+#include "nnue_common.h"
+#include "nnue_architecture.h"
+#include "features/index_list.h"
+
+#include <cstring> // std::memcpy()
+
+namespace Eval::NNUE {
+
+  // Input feature converter
+  class FeatureTransformer {
+
+   private:
+    // Number of output dimensions for one side (one perspective)
+    static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;
+
+   public:
+    // Output type
+    using OutputType = TransformedFeatureType;
+
+    // Number of input/output dimensions (the output holds both perspectives)
+    static constexpr IndexType kInputDimensions = RawFeatures::kDimensions;
+    static constexpr IndexType kOutputDimensions = kHalfDimensions * 2;
+
+    // Size of forward propagation buffer (in bytes)
+    static constexpr std::size_t kBufferSize =
+        kOutputDimensions * sizeof(OutputType);
+
+    // Hash value embedded in the evaluation file: feature hash XOR output size
+    static constexpr std::uint32_t GetHashValue() {
+      return kOutputDimensions ^ RawFeatures::kHashValue;
+    }
+
+    // Read network parameters: biases_ then weights_, as raw bytes; false if a read failed
+    bool ReadParameters(std::istream& stream) {
+      constexpr std::size_t kBiasBytes = kHalfDimensions * sizeof(BiasType);
+      constexpr std::size_t kWeightBytes =
+          kHalfDimensions * kInputDimensions * sizeof(WeightType);
+      stream.read(reinterpret_cast<char*>(biases_), kBiasBytes);
+      return !stream.read(reinterpret_cast<char*>(weights_), kWeightBytes).fail();
+    }
+
+    // Returns true if the current accumulator is already computed, or after
+    // calling UpdateAccumulator() when the previous state's one is computed
+    bool UpdateAccumulatorIfPossible(const Position& pos) const {
+      const auto current = pos.state();
+      if (current->accumulator.computed_accumulation)
+        return true;
+
+      const auto parent = current->previous;
+      const bool can_update = parent && parent->accumulator.computed_accumulation;
+      if (can_update)
+        UpdateAccumulator(pos);
+      return can_update;
+    }
+
+    // Convert input features: clamp the accumulator to [0, 127], side to move first
+    void Transform(const Position& pos, OutputType* output, bool refresh) const {
+      if (refresh || !UpdateAccumulatorIfPossible(pos)) {  // no incremental update available
+        RefreshAccumulator(pos);
+      }
+      const auto& accumulation = pos.state()->accumulator.accumulation;
+
+  #if defined(USE_AVX2)
+      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+      constexpr int kControl = 0b11011000;  // selects 64-bit lanes in the order 0, 2, 1, 3
+      const __m256i kZero = _mm256_setzero_si256();
+
+  #elif defined(USE_SSSE3)
+      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+
+  #ifdef USE_SSE41
+      const __m128i kZero = _mm_setzero_si128();
+  #else
+      const __m128i k0x80s = _mm_set1_epi8(-128);  // used to emulate max(x, 0) without SSE4.1
+  #endif
+
+  #elif defined(USE_NEON)
+      constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+      const int8x8_t kZero = {0};
+  #endif
+
+      const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
+      for (IndexType p = 0; p < 2; ++p) {
+        const IndexType offset = kHalfDimensions * p;  // second perspective follows the first
+
+  #if defined(USE_AVX2)
+        auto out = reinterpret_cast<__m256i*>(&output[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m256i sum0 =
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+            // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
+            //       compiled with g++ in MSYS2 crashes here because the output memory is not aligned
+            //       even though alignas is specified.
+            _mm256_loadu_si256
+  #else
+            _mm256_load_si256
+  #endif
+
+            (&reinterpret_cast<const __m256i*>(
+              accumulation[perspectives[p]][0])[j * 2 + 0]);
+          __m256i sum1 =
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+            _mm256_loadu_si256
+  #else
+            _mm256_load_si256
+  #endif
+
+            (&reinterpret_cast<const __m256i*>(
+              accumulation[perspectives[p]][0])[j * 2 + 1]);
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+          _mm256_storeu_si256
+  #else
+          _mm256_store_si256
+  #endif
+
+          (&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
+              _mm256_packs_epi16(sum0, sum1), kZero), kControl));
+        }
+
+  #elif defined(USE_SSSE3)
+        auto out = reinterpret_cast<__m128i*>(&output[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
+              accumulation[perspectives[p]][0])[j * 2 + 0]);
+          __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
+              accumulation[perspectives[p]][0])[j * 2 + 1]);
+      const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
+
+          _mm_store_si128(&out[j],
+
+  #ifdef USE_SSE41
+            _mm_max_epi8(packedbytes, kZero)
+  #else
+            _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
+  #endif
+
+          );
+        }
+
+  #elif defined(USE_NEON)
+        const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          int16x8_t sum = reinterpret_cast<const int16x8_t*>(
+              accumulation[perspectives[p]][0])[j];
+          out[j] = vmax_s8(vqmovn_s16(sum), kZero);
+        }
+
+  #else
+        for (IndexType j = 0; j < kHalfDimensions; ++j) {  // scalar fallback
+          BiasType sum = accumulation[static_cast<int>(perspectives[p])][0][j];
+          output[offset + j] = static_cast<OutputType>(
+              std::max<int>(0, std::min<int>(127, sum)));
+        }
+  #endif
+
+      }
+    }
+
+   private:
+    // Calculate cumulative value from scratch: biases_ plus the weights of each active feature
+    void RefreshAccumulator(const Position& pos) const {
+      auto& accumulator = pos.state()->accumulator;
+      IndexType i = 0;  // refresh trigger index; only the first trigger is used here
+      Features::IndexList active_indices[2];
+      RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
+                                       active_indices);
+      for (Color perspective : { WHITE, BLACK }) {
+        std::memcpy(accumulator.accumulation[perspective][i], biases_,
+                   kHalfDimensions * sizeof(BiasType));
+        for (const auto index : active_indices[perspective]) {
+          const IndexType offset = kHalfDimensions * index;  // start of this feature's weights
+
+  #if defined(USE_AVX2)
+          auto accumulation = reinterpret_cast<__m256i*>(
+              &accumulator.accumulation[perspective][i][0]);
+          auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
+          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+          for (IndexType j = 0; j < kNumChunks; ++j) {
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+            _mm256_storeu_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadu_si256(&accumulation[j]), column[j]));
+  #else
+            accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
+  #endif
+          }
+
+  #elif defined(USE_SSE2)
+          auto accumulation = reinterpret_cast<__m128i*>(
+              &accumulator.accumulation[perspective][i][0]);
+          auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
+          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+          for (IndexType j = 0; j < kNumChunks; ++j) {
+            accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
+          }
+
+  #elif defined(USE_NEON)
+          auto accumulation = reinterpret_cast<int16x8_t*>(
+              &accumulator.accumulation[perspective][i][0]);
+          auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
+          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+          for (IndexType j = 0; j < kNumChunks; ++j) {
+            accumulation[j] = vaddq_s16(accumulation[j], column[j]);
+          }
+
+  #else
+          for (IndexType j = 0; j < kHalfDimensions; ++j) {  // scalar fallback
+            accumulator.accumulation[perspective][i][j] += weights_[offset + j];
+          }
+  #endif
+
+        }
+      }
+
+      accumulator.computed_accumulation = true;
+      accumulator.computed_score = false;  // any cached score no longer matches
+    }
+
+    // Update the accumulator incrementally from the previous state's accumulator
+    void UpdateAccumulator(const Position& pos) const {
+      const auto& prev_accumulator = pos.state()->previous->accumulator;  // read-only: bind, don't copy
+      auto& accumulator = pos.state()->accumulator;
+      IndexType i = 0;
+      Features::IndexList removed_indices[2], added_indices[2];
+      bool reset[2];
+      RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
+                                        removed_indices, added_indices, reset);
+      for (Color perspective : { WHITE, BLACK }) {
+
+  #if defined(USE_AVX2)
+        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+        auto accumulation = reinterpret_cast<__m256i*>(
+            &accumulator.accumulation[perspective][i][0]);
+
+  #elif defined(USE_SSE2)
+        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+        auto accumulation = reinterpret_cast<__m128i*>(
+            &accumulator.accumulation[perspective][i][0]);
+
+  #elif defined(USE_NEON)
+        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+        auto accumulation = reinterpret_cast<int16x8_t*>(
+            &accumulator.accumulation[perspective][i][0]);
+  #endif
+        // On a reset start from the biases; otherwise start from the previous sums
+        if (reset[perspective]) {
+          std::memcpy(accumulator.accumulation[perspective][i], biases_,
+                      kHalfDimensions * sizeof(BiasType));
+        } else {
+          std::memcpy(accumulator.accumulation[perspective][i],
+                      prev_accumulator.accumulation[perspective][i],
+                      kHalfDimensions * sizeof(BiasType));
+          // Difference calculation for the deactivated features
+          for (const auto index : removed_indices[perspective]) {
+            const IndexType offset = kHalfDimensions * index;
+            // NOTE(review): no MinGW loadu/storeu variant here, unlike RefreshAccumulator() - confirm
+  #if defined(USE_AVX2)
+            auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = _mm256_sub_epi16(accumulation[j], column[j]);
+            }
+
+  #elif defined(USE_SSE2)
+            auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]);
+            }
+
+  #elif defined(USE_NEON)
+            auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = vsubq_s16(accumulation[j], column[j]);
+            }
+
+  #else
+            for (IndexType j = 0; j < kHalfDimensions; ++j) {
+              accumulator.accumulation[perspective][i][j] -=
+                  weights_[offset + j];
+            }
+  #endif
+
+          }
+        }
+        { // Difference calculation for the activated features
+          for (const auto index : added_indices[perspective]) {
+            const IndexType offset = kHalfDimensions * index;
+
+  #if defined(USE_AVX2)
+            auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
+            }
+
+  #elif defined(USE_SSE2)
+            auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
+            }
+
+  #elif defined(USE_NEON)
+            auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = vaddq_s16(accumulation[j], column[j]);
+            }
+
+  #else
+            for (IndexType j = 0; j < kHalfDimensions; ++j) {
+              accumulator.accumulation[perspective][i][j] +=
+                  weights_[offset + j];
+            }
+  #endif
+
+          }
+        }
+      }
+      // Flag the accumulation as computed and the score as not computed
+      accumulator.computed_accumulation = true;
+      accumulator.computed_score = false;
+    }
+
+    using BiasType = std::int16_t;
+    using WeightType = std::int16_t;
+
+    alignas(kCacheLineSize) BiasType biases_[kHalfDimensions];
+    alignas(kCacheLineSize)
+        WeightType weights_[kHalfDimensions * kInputDimensions];
+  };
+
+}  // namespace Eval::NNUE
+
+#endif // #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
diff --git a/src/pawns.cpp b/src/pawns.cpp
index 7f8d451a..73682529 100644
--- a/src/pawns.cpp
+++ b/src/pawns.cpp
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/src/pawns.h b/src/pawns.h
index e6098069..5499826e 100644
--- a/src/pawns.h
+++ b/src/pawns.h
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/src/position.cpp b/src/position.cpp
index 396bff5f..46e5d78b 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -200,6 +198,9 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th
   std::fill_n(&pieceList[0][0], sizeof(pieceList) / sizeof(Square), SQ_NONE);
   st = si;
 
+  // Each piece on board gets a unique ID used to track the piece later
+  PieceId piece_id, next_piece_id = PIECE_ID_ZERO;
+
   ss >> std::noskipws;
 
   // 1. Piece placement
@@ -213,7 +214,19 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th
 
       else if ((idx = PieceToChar.find(token)) != string::npos)
       {
-          put_piece(Piece(idx), sq);
+          auto pc = Piece(idx);
+          put_piece(pc, sq);
+          // Mirror the placement in the NNUE piece list when Eval::useNNUE is set
+          if (Eval::useNNUE)
+          {
+              // Kings get fixed IDs; every other piece takes the next free ID in placement order
+              piece_id =
+                (idx == W_KING) ? PIECE_ID_WKING :
+                (idx == B_KING) ? PIECE_ID_BKING :
+                next_piece_id++;
+              evalList.put_piece(piece_id, sq, pc);
+          }
+
           ++sq;
       }
   }
@@ -705,6 +718,14 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
   ++st->rule50;
   ++st->pliesFromNull;
 
+  // Used by NNUE: mark accumulation and score as not computed, reset dirty-piece bookkeeping
+  st->accumulator.computed_accumulation = false;
+  st->accumulator.computed_score = false;
+  PieceId dp0 = PIECE_ID_NONE;
+  PieceId dp1 = PIECE_ID_NONE;
+  auto& dp = st->dirtyPiece;
+  dp.dirty_num = 1;  // raised to 2 by the capture and castling paths
+
   Color us = sideToMove;
   Color them = ~us;
   Square from = from_sq(m);
@@ -752,6 +773,16 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
       else
           st->nonPawnMaterial[them] -= PieceValue[MG][captured];
 
+      if (Eval::useNNUE)
+      {
+          dp.dirty_num = 2; // 2 pieces changed: the moving piece and the captured one
+          dp1 = piece_id_on(capsq);  // read before remove_piece(capsq) below
+          dp.pieceId[1] = dp1;
+          dp.old_piece[1] = evalList.piece_with_id(dp1);
+          evalList.put_piece(dp1, capsq, NO_PIECE);  // NO_PIECE sets the ID to PS_NONE in both lists
+          dp.new_piece[1] = evalList.piece_with_id(dp1);
+      }
+
       // Update board and piece lists
       remove_piece(capsq);
 
@@ -787,7 +818,18 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
 
   // Move the piece. The tricky Chess960 castling is handled earlier
   if (type_of(m) != CASTLING)
+  {
+      if (Eval::useNNUE)
+      {
+          dp0 = piece_id_on(from);  // piece_id_on() asserts the square is occupied, so read it before move_piece()
+          dp.pieceId[0] = dp0;
+          dp.old_piece[0] = evalList.piece_with_id(dp0);  // snapshot before the update
+          evalList.put_piece(dp0, to, pc);
+          dp.new_piece[0] = evalList.piece_with_id(dp0);  // snapshot after the update
+      }
+
       move_piece(from, to);
+  }
 
   // If the moving piece is a pawn do some special extra work
   if (type_of(pc) == PAWN)
@@ -810,6 +852,13 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
           remove_piece(to);
           put_piece(promotion, to);
 
+          if (Eval::useNNUE)
+          {
+              dp0 = piece_id_on(to);  // the promoted piece keeps the ID stored on `to`
+              evalList.put_piece(dp0, to, promotion);
+              dp.new_piece[0] = evalList.piece_with_id(dp0);  // only new_piece is overwritten here
+          }
+
           // Update hash keys
           k ^= Zobrist::psq[pc][to] ^ Zobrist::psq[promotion][to];
           st->pawnKey ^= Zobrist::psq[pc][to];
@@ -901,6 +950,12 @@ void Position::undo_move(Move m) {
   {
       move_piece(to, from); // Put the piece back at the source square
 
+      if (Eval::useNNUE)
+      {
+          PieceId dp0 = st->dirtyPiece.pieceId[0];  // ID recorded in this state's dirtyPiece
+          evalList.put_piece(dp0, from, pc);  // put that ID back on the source square
+      }
+
       if (st->capturedPiece)
       {
           Square capsq = to;
@@ -917,6 +972,14 @@ void Position::undo_move(Move m) {
           }
 
           put_piece(st->capturedPiece, capsq); // Restore the captured piece
+
+          if (Eval::useNNUE)
+          {
+              PieceId dp1 = st->dirtyPiece.pieceId[1];  // ID of the captured piece
+              assert(evalList.piece_with_id(dp1).from[WHITE] == PS_NONE);  // must currently be off the board
+              assert(evalList.piece_with_id(dp1).from[BLACK] == PS_NONE);
+              evalList.put_piece(dp1, capsq, st->capturedPiece);  // bring it back on capsq
+          }
       }
   }
 
@@ -938,6 +1001,34 @@ void Position::do_castling(Color us, Square from, Square& to, Square& rfrom, Squ
   rto = relative_square(us, kingSide ? SQ_F1 : SQ_D1);
   to = relative_square(us, kingSide ? SQ_G1 : SQ_C1);
 
+  if (Eval::useNNUE)
+  {
+      PieceId dp0, dp1;
+      auto& dp = st->dirtyPiece;
+      dp.dirty_num = 2; // 2 pieces moved: king and rook
+      // Both IDs are read before any put_piece(), which overwrites piece_id_list[sq]
+      if (Do)
+      {
+          dp0 = piece_id_on(from);
+          dp1 = piece_id_on(rfrom);
+          dp.pieceId[0] = dp0;  // slot 0: king
+          dp.old_piece[0] = evalList.piece_with_id(dp0);
+          evalList.put_piece(dp0, to, make_piece(us, KING));
+          dp.new_piece[0] = evalList.piece_with_id(dp0);
+          dp.pieceId[1] = dp1;  // slot 1: rook
+          dp.old_piece[1] = evalList.piece_with_id(dp1);
+          evalList.put_piece(dp1, rto, make_piece(us, ROOK));
+          dp.new_piece[1] = evalList.piece_with_id(dp1);
+      }
+      else
+      {
+          dp0 = piece_id_on(to);  // undo: king is on `to`, rook on `rto`
+          dp1 = piece_id_on(rto);
+          evalList.put_piece(dp0, from, make_piece(us, KING));
+          evalList.put_piece(dp1, rfrom, make_piece(us, ROOK));
+      }
+  }
+
   // Remove both pieces first since squares could overlap in Chess960
   remove_piece(Do ? from : to);
   remove_piece(Do ? rfrom : rto);
@@ -955,7 +1046,14 @@ void Position::do_null_move(StateInfo& newSt) {
   assert(!checkers());
   assert(&newSt != st);
 
-  std::memcpy(&newSt, st, sizeof(StateInfo));
+  if (Eval::useNNUE)
+  {
+      std::memcpy(&newSt, st, sizeof(StateInfo));  // full copy, accumulator included
+      newSt.accumulator.computed_score = false;  // clear the flag on the copy; st still points at the old state here
+  }
+  else
+      std::memcpy(&newSt, st, offsetof(StateInfo, accumulator));  // skip the NNUE members at the tail
+
   newSt.previous = st;
   st = &newSt;
 
diff --git a/src/position.h b/src/position.h
index 8cfa3920..a77050eb 100644
--- a/src/position.h
+++ b/src/position.h
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -27,8 +25,11 @@
 #include <string>
 
 #include "bitboard.h"
+#include "evaluate.h"
 #include "types.h"
 
+#include "nnue/nnue_accumulator.h"
+
 
 /// StateInfo struct stores information needed to restore a Position object to
 /// its previous state when we retract a move. Whenever a move is made on the
@@ -54,6 +55,10 @@ struct StateInfo {
   Bitboard   pinners[COLOR_NB];
   Bitboard   checkSquares[PIECE_TYPE_NB];
   int        repetition;
+
+  // Used by NNUE
+  Eval::NNUE::Accumulator accumulator;
+  DirtyPiece dirtyPiece;
 };
 
 
@@ -163,6 +168,10 @@ public:
   bool pos_is_ok() const;
   void flip();
 
+  // Used by NNUE
+  StateInfo* state() const;
+  const EvalList* eval_list() const;
+
 private:
   // Initialization helpers (used while setting up a position)
   void set_castling_right(Color c, Square rfrom);
@@ -176,6 +185,9 @@ private:
   template<bool Do>
   void do_castling(Color us, Square from, Square& to, Square& rfrom, Square& rto);
 
+  // ID of a piece on a given square
+  PieceId piece_id_on(Square sq) const;
+
   // Data members
   Piece board[SQUARE_NB];
   Bitboard byTypeBB[PIECE_TYPE_NB];
@@ -192,6 +204,9 @@ private:
   Thread* thisThread;
   StateInfo* st;
   bool chess960;
+
+  // List of pieces used in NNUE evaluation function
+  EvalList evalList;
 };
 
 namespace PSQT {
@@ -426,4 +441,25 @@ inline void Position::do_move(Move m, StateInfo& newSt) {
   do_move(m, newSt, gives_check(m));
 }
 
+inline StateInfo* Position::state() const {
+  // Mutable pointer from a const method: the NNUE code writes st->accumulator through it
+  return st;
+}
+
+inline const EvalList* Position::eval_list() const {
+  // Read-only pointer to this position's EvalList member
+  return &evalList;
+}
+
+// Returns the ID stored for square sq; the square must hold a piece
+inline PieceId Position::piece_id_on(Square sq) const {
+
+  assert(piece_on(sq) != NO_PIECE);
+
+  const PieceId id = evalList.piece_id_list[sq];
+  assert(is_ok(id));
+
+  return id;
+}
+
 #endif // #ifndef POSITION_H_INCLUDED
diff --git a/src/psqt.cpp b/src/psqt.cpp
index 5e8dd2c7..eb36e75e 100644
--- a/src/psqt.cpp
+++ b/src/psqt.cpp
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/src/search.cpp b/src/search.cpp
index 91ac60ad..d1dc4489 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -227,6 +225,8 @@ void MainThread::search() {
   Time.init(Limits, us, rootPos.game_ply());
   TT.new_search();
 
+  Eval::verify_NNUE();
+
   if (rootMoves.empty())
   {
       rootMoves.emplace_back(MOVE_NONE);
diff --git a/src/search.h b/src/search.h
index 3e855c8b..2554f3fb 100644
--- a/src/search.h
+++ b/src/search.h
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/src/syzygy/tbprobe.cpp b/src/syzygy/tbprobe.cpp
index 95d58945..20215b96 100644
--- a/src/syzygy/tbprobe.cpp
+++ b/src/syzygy/tbprobe.cpp
@@ -1,7 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (c) 2013 Ronald de Man
-  Copyright (C) 2016-2020 Marco Costalba, Lucas Braesch
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/src/syzygy/tbprobe.h b/src/syzygy/tbprobe.h
index df3ca4fe..b998989b 100644
--- a/src/syzygy/tbprobe.h
+++ b/src/syzygy/tbprobe.h
@@ -1,7 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (c) 2013 Ronald de Man
-  Copyright (C) 2016-2020 Marco Costalba, Lucas Braesch
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/src/thread.cpp b/src/thread.cpp
index a0ee2b25..44aea14e 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/src/thread.h b/src/thread.h
index a69e1d10..46da1e34 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/src/thread_win32_osx.h b/src/thread_win32_osx.h
index 0ef5c981..c4b55a48 100644
--- a/src/thread_win32_osx.h
+++ b/src/thread_win32_osx.h
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/src/timeman.cpp b/src/timeman.cpp
index 546eadd2..df4ba9b2 100644
--- a/src/timeman.cpp
+++ b/src/timeman.cpp
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/src/timeman.h b/src/timeman.h
index 9301dc94..5ad72b32 100644
--- a/src/timeman.h
+++ b/src/timeman.h
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/src/tt.cpp b/src/tt.cpp
index 34590903..d494c27d 100644
--- a/src/tt.cpp
+++ b/src/tt.cpp
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/src/tt.h b/src/tt.h
index e18db8ce..c177ca52 100644
--- a/src/tt.h
+++ b/src/tt.h
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/src/tune.cpp b/src/tune.cpp
index c1b1c76b..e94f67f8 100644
--- a/src/tune.cpp
+++ b/src/tune.cpp
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/src/tune.h b/src/tune.h
index 27c3f961..1489fa32 100644
--- a/src/tune.h
+++ b/src/tune.h
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2017 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2018 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/src/types.h b/src/types.h
index c1598561..379859f7 100644
--- a/src/types.h
+++ b/src/types.h
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -203,6 +201,22 @@ enum Piece {
   PIECE_NB = 16
 };
 
+// An ID used to track the pieces. Max. 32 pieces on board.
+enum PieceId {
+  PIECE_ID_ZERO   = 0,   // first ID handed out to non-king pieces
+  PIECE_ID_KING   = 30,  // same value as PIECE_ID_WKING
+  PIECE_ID_WKING  = 30,
+  PIECE_ID_BKING  = 31,
+  PIECE_ID_NONE   = 32   // is_ok(PieceId) requires a value below this one
+};
+
+inline PieceId operator++(PieceId& d, int) {
+  // Post-increment: remember the old value, advance d by one, return the old value
+  const PieceId previous = d;
+  d = PieceId(int(d) + 1);
+  return previous;
+}
+
 constexpr Value PieceValue[PHASE_NB][PIECE_NB] = {
   { VALUE_ZERO, PawnValueMg, KnightValueMg, BishopValueMg, RookValueMg, QueenValueMg, VALUE_ZERO, VALUE_ZERO,
     VALUE_ZERO, PawnValueMg, KnightValueMg, BishopValueMg, RookValueMg, QueenValueMg, VALUE_ZERO, VALUE_ZERO },
@@ -232,7 +246,8 @@ enum Square : int {
   SQ_A8, SQ_B8, SQ_C8, SQ_D8, SQ_E8, SQ_F8, SQ_G8, SQ_H8,
   SQ_NONE,
 
-  SQUARE_NB = 64
+  SQUARE_ZERO = 0,
+  SQUARE_NB   = 64
 };
 
 enum Direction : int {
@@ -255,6 +270,94 @@ enum Rank : int {
   RANK_1, RANK_2, RANK_3, RANK_4, RANK_5, RANK_6, RANK_7, RANK_8, RANK_NB
 };
 
+// Unique number for each piece type on each square (constants are spaced SQUARE_NB apart)
+enum PieceSquare : uint32_t {
+  PS_NONE     =  0,  // value EvalList::put_piece() stores for NO_PIECE
+  PS_W_PAWN   =  1,
+  PS_B_PAWN   =  1 * SQUARE_NB + 1,
+  PS_W_KNIGHT =  2 * SQUARE_NB + 1,
+  PS_B_KNIGHT =  3 * SQUARE_NB + 1,
+  PS_W_BISHOP =  4 * SQUARE_NB + 1,
+  PS_B_BISHOP =  5 * SQUARE_NB + 1,
+  PS_W_ROOK   =  6 * SQUARE_NB + 1,
+  PS_B_ROOK   =  7 * SQUARE_NB + 1,
+  PS_W_QUEEN  =  8 * SQUARE_NB + 1,
+  PS_B_QUEEN  =  9 * SQUARE_NB + 1,
+  PS_W_KING   = 10 * SQUARE_NB + 1,
+  PS_END      = PS_W_KING, // pieces without kings (pawns included)
+  PS_B_KING   = 11 * SQUARE_NB + 1,
+  PS_END2     = 12 * SQUARE_NB + 1  // one block past PS_B_KING
+};
+
+struct ExtPieceSquare {
+  PieceSquare from[COLOR_NB];  // one entry per Color; EvalList fills [WHITE] and [BLACK]
+};
+
+// Array for finding the PieceSquare corresponding to the piece on the board
+extern ExtPieceSquare kpp_board_index[PIECE_NB];
+
+constexpr bool is_ok(PieceId pid);
+constexpr Square rotate180(Square sq);
+
+// Keeps track of where (PieceSquare) each tracked piece (PieceId) currently is
+class EvalList {
+
+public:
+  // Max. number of pieces without kings is 30 but must be a multiple of 4 in AVX2
+  static const int MAX_LENGTH = 32;
+
+  // For every board square, the piece id that put_piece() last stored there
+  PieceId piece_id_list[SQUARE_NB];
+
+  // Raw access to the two piece lists (White's and Black's point of view)
+  PieceSquare* piece_list_fw() const { return const_cast<PieceSquare*>(pieceListFw); }
+  PieceSquare* piece_list_fb() const { return const_cast<PieceSquare*>(pieceListFb); }
+
+  // Record piece pc, tracked as piece_id, on square sq. With pc == NO_PIECE the
+  // id is set to PS_NONE in both lists; sq is tagged with the id in either case.
+  void put_piece(PieceId piece_id, Square sq, Piece pc)
+  {
+      assert(is_ok(piece_id));
+
+      const bool present = pc != NO_PIECE;
+
+      pieceListFw[piece_id] =
+          present ? PieceSquare(kpp_board_index[pc].from[WHITE] + sq) : PS_NONE;
+      pieceListFb[piece_id] =
+          present ? PieceSquare(kpp_board_index[pc].from[BLACK] + rotate180(sq)) : PS_NONE;
+      piece_id_list[sq] = piece_id;
+  }
+
+  // Return the White-POV and Black-POV entries of piece_id as one ExtPieceSquare
+  ExtPieceSquare piece_with_id(PieceId piece_id) const
+  {
+      ExtPieceSquare both;
+      both.from[BLACK] = pieceListFb[piece_id];
+      both.from[WHITE] = pieceListFw[piece_id];
+      return both;
+  }
+
+private:
+  // One entry per piece id. Fw is filled from kpp_board_index[pc].from[WHITE] + sq,
+  // Fb from kpp_board_index[pc].from[BLACK] + rotate180(sq).
+  PieceSquare pieceListFw[MAX_LENGTH];
+  PieceSquare pieceListFb[MAX_LENGTH];
+};
+
+// For differential evaluation of pieces that changed since last turn
+struct DirtyPiece {
+
+  // Number of changed pieces (do_move() sets 1, or 2 for captures and castling)
+  int dirty_num;
+
+  // The ids of changed pieces, max. 2 pieces can change in one move
+  PieceId pieceId[2];
+
+  // Piece list entries of each changed piece before (old) and after (new) the move
+  ExtPieceSquare old_piece[2];
+  ExtPieceSquare new_piece[2];
+};
 
 /// Score enum stores a middlegame and an endgame value in a single integer (enum).
 /// The least significant 16 bits are used to store the middlegame value and the
@@ -280,10 +383,10 @@ inline Value mg_value(Score s) {
 }
 
 #define ENABLE_BASE_OPERATORS_ON(T)                                \
-constexpr T operator+(T d1, int d2) { return T(int(d1) + d2); } \
-constexpr T operator-(T d1, int d2) { return T(int(d1) - d2); } \
+constexpr T operator+(T d1, int d2) { return T(int(d1) + d2); }    \
+constexpr T operator-(T d1, int d2) { return T(int(d1) - d2); }    \
 constexpr T operator-(T d) { return T(-int(d)); }                  \
-inline T& operator+=(T& d1, int d2) { return d1 = d1 + d2; }         \
+inline T& operator+=(T& d1, int d2) { return d1 = d1 + d2; }       \
 inline T& operator-=(T& d1, int d2) { return d1 = d1 - d2; }
 
 #define ENABLE_INCR_OPERATORS_ON(T)                                \
@@ -302,6 +405,9 @@ inline T& operator/=(T& d, int i) { return d = T(int(d) / i); }
 ENABLE_FULL_OPERATORS_ON(Value)
 ENABLE_FULL_OPERATORS_ON(Direction)
 
+ENABLE_INCR_OPERATORS_ON(Piece)
+ENABLE_INCR_OPERATORS_ON(PieceSquare)
+ENABLE_INCR_OPERATORS_ON(PieceId)
 ENABLE_INCR_OPERATORS_ON(PieceType)
 ENABLE_INCR_OPERATORS_ON(Square)
 ENABLE_INCR_OPERATORS_ON(File)
@@ -390,6 +496,10 @@ inline Color color_of(Piece pc) {
   return Color(pc >> 3);
 }
 
+constexpr bool is_ok(PieceId pid) {
+  return pid < PIECE_ID_NONE;  // only the upper bound is checked
+}
+
 constexpr bool is_ok(Square s) {
   return s >= SQ_A1 && s <= SQ_H8;
 }
@@ -426,6 +536,11 @@ constexpr Square to_sq(Move m) {
   return Square(m & 0x3F);
 }
 
+// Square seen after turning the board 180 degrees: all six index bits are flipped
+constexpr Square rotate180(Square sq) {
+  return Square(sq ^ 0x3F);
+}
+
 constexpr int from_to(Move m) {
  return m & 0xFFF;
 }
diff --git a/src/uci.cpp b/src/uci.cpp
index bb57c80b..d6486320 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -78,6 +76,20 @@ namespace {
     }
   }
 
+  // trace_eval() prints the evaluation for the current position, consistent with the UCI
+  // options set so far.
+
+  void trace_eval(Position& pos) {
+    // Evaluate a fresh Position rebuilt from pos's FEN rather than pos itself
+    StateListPtr states(new std::deque<StateInfo>(1));
+    Position p;
+    p.set(pos.fen(), Options["UCI_Chess960"], &states->back(), Threads.main());
+
+    Eval::verify_NNUE();
+
+    sync_cout << "\n" << Eval::trace(p) << sync_endl;
+  }
+
 
   // setoption() is called when engine receives the "setoption" UCI command. The
   // function updates the UCI option ("name") to the given value ("value").
@@ -166,7 +178,7 @@ namespace {
                nodes += Threads.nodes_searched();
             }
             else
-               sync_cout << "\n" << Eval::trace(pos) << sync_endl;
+               trace_eval(pos);
         }
         else if (token == "setoption")  setoption(is);
         else if (token == "position")   position(pos, is, states);
@@ -261,7 +273,7 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "flip")     pos.flip();
       else if (token == "bench")    bench(pos, is, states);
       else if (token == "d")        sync_cout << pos << sync_endl;
-      else if (token == "eval")     sync_cout << Eval::trace(pos) << sync_endl;
+      else if (token == "eval")     trace_eval(pos);
       else if (token == "compiler") sync_cout << compiler_info() << sync_endl;
       else
           sync_cout << "Unknown command: " << cmd << sync_endl;
diff --git a/src/uci.h b/src/uci.h
index ad954d9f..eb0b390b 100644
--- a/src/uci.h
+++ b/src/uci.h
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index ef54ef4e..788aed17 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -1,8 +1,6 @@
 /*
   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
-  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
-  Copyright (C) 2015-2020 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
 
   Stockfish is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -42,7 +40,8 @@ void on_hash_size(const Option& o) { TT.resize(size_t(o)); }
 void on_logger(const Option& o) { start_logger(o); }
 void on_threads(const Option& o) { Threads.set(size_t(o)); }
 void on_tb_path(const Option& o) { Tablebases::init(o); }
-
+void on_use_NNUE(const Option& ) { Eval::init_NNUE(); }
+void on_eval_file(const Option& ) { Eval::init_NNUE(); }
 
 /// Our case insensitive less() function as required by UCI protocol
 bool CaseInsensitiveLess::operator() (const string& s1, const string& s2) const {
@@ -79,6 +78,8 @@ void init(OptionsMap& o) {
   o["SyzygyProbeDepth"]      << Option(1, 1, 100);
   o["Syzygy50MoveRule"]      << Option(true);
   o["SyzygyProbeLimit"]      << Option(7, 0, 7);
+  o["Use NNUE"]              << Option(false, on_use_NNUE);
+  o["EvalFile"]              << Option("nn-97f742aaefcd.nnue", on_eval_file);
 }
 
 

From 3dca13a958cd0dfea1cdea91da230c5aac9e322f Mon Sep 17 00:00:00 2001
From: MJZ1977 <37274752+MJZ1977@users.noreply.github.com>
Date: Thu, 6 Aug 2020 17:39:10 +0200
Subject: [PATCH 27/86] NNUE evaluation threshold

The idea is to use NNUE only on positions with fairly balanced material. This brings a big search speedup, since NNUE eval is slower than classical eval on most hardware, especially on unbalanced positions with LazyEval.

STC: https://tests.stockfishchess.org/tests/view/5f2c2680b3ebe5cbfee85b61
LLR: 2.95 (-2.94,2.94) {-0.50,1.50}
Total: 3168 W: 560 L: 400 D: 2208
Ptnml(0-2): 21, 294, 819, 404, 46

LTC: https://tests.stockfishchess.org/tests/view/5f2c2ca6b3ebe5cbfee85b69
LLR: 2.98 (-2.94,2.94) {0.25,1.75}
Total: 3200 W: 287 L: 183 D: 2730
Ptnml(0-2): 4, 149, 1191, 251, 5

closes https://github.com/official-stockfish/Stockfish/pull/2916

Bench 4746616
---
 src/evaluate.cpp | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index f43c62d6..09496fdc 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -107,9 +107,10 @@ using namespace Trace;
 namespace {
 
   // Threshold for lazy and space evaluation
-  constexpr Value LazyThreshold1  = Value(1400);
-  constexpr Value LazyThreshold2  = Value(1300);
+  constexpr Value LazyThreshold1 =  Value(1400);
+  constexpr Value LazyThreshold2 =  Value(1300);
   constexpr Value SpaceThreshold = Value(12222);
+  constexpr Value NNUEThreshold  =   Value(500);
 
   // KingAttackWeights[PieceType] contains king attack weights by piece type
   constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 81, 52, 44, 10 };
@@ -941,9 +942,14 @@ make_v:
 Value Eval::evaluate(const Position& pos) {
 
   if (Eval::useNNUE)
-      return NNUE::evaluate(pos);
-  else
-      return Evaluation<NO_TRACE>(pos).value();
+  {
+      Value balance = pos.non_pawn_material(WHITE) - pos.non_pawn_material(BLACK);
+      balance += 200 * (pos.count<PAWN>(WHITE) - pos.count<PAWN>(BLACK));
+      // Take NNUE eval only on balanced positions
+      if (abs(balance) < NNUEThreshold)
+         return NNUE::evaluate(pos);
+  }
+  return Evaluation<NO_TRACE>(pos).value();
 }
 
 /// trace() is like evaluate(), but instead of returning a value, it returns

From 8b8412ef87825d8e341e160585307dc89843b7f6 Mon Sep 17 00:00:00 2001
From: Stefan Geschwentner <stgeschwentner@gmail.com>
Date: Fri, 7 Aug 2020 01:08:15 +0200
Subject: [PATCH 28/86] Add tempo also to NNUE eval.

STC:
LLR: 2.93 (-2.94,2.94) {-0.50,1.50}
Total: 10608 W: 1507 L: 1358 D: 7743
Ptnml(0-2): 94, 945, 3074, 1100, 91
https://tests.stockfishchess.org/tests/view/5f2c5921b3ebe5cbfee85b8b

LTC:
LLR: 2.94 (-2.94,2.94) {0.25,1.75}
Total: 7536 W: 556 L: 448 D: 6532
Ptnml(0-2): 9, 383, 2881, 481, 14
https://tests.stockfishchess.org/tests/view/5f2c6f4461e3b6af64881e95

closes https://github.com/official-stockfish/Stockfish/pull/2919

Bench: 4746616
---
 src/evaluate.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 09496fdc..015efa48 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -947,7 +947,7 @@ Value Eval::evaluate(const Position& pos) {
       balance += 200 * (pos.count<PAWN>(WHITE) - pos.count<PAWN>(BLACK));
       // Take NNUE eval only on balanced positions
       if (abs(balance) < NNUEThreshold)
-         return NNUE::evaluate(pos);
+         return NNUE::evaluate(pos) + Tempo;
   }
   return Evaluation<NO_TRACE>(pos).value();
 }

From af935365e3e528f445c1c0f48bb43b8cf685719c Mon Sep 17 00:00:00 2001
From: FauziAkram <fauzi.dabat@hotmail.com>
Date: Thu, 6 Aug 2020 17:37:54 -0700
Subject: [PATCH 29/86] Tuned pawn values

Passed STC:
https://tests.stockfishchess.org/tests/view/5f2aa49fa5abc164f05e4d1b
LLR: 2.95 (-2.94,2.94) {-0.50,1.50}
Total: 40888 W: 7977 L: 7726 D: 25185
Ptnml(0-2): 665, 4806, 9333, 4893, 747

Passed LTC:
https://tests.stockfishchess.org/tests/view/5f2b1059b3ebe5cbfee85ae7
LLR: 2.98 (-2.94,2.94) {0.25,1.75}
Total: 51264 W: 6445 L: 6134 D: 38685
Ptnml(0-2): 328, 4564, 15580, 4789, 371

closes https://github.com/official-stockfish/Stockfish/pull/2920

bench: 4314943
---
 src/pawns.cpp | 14 +++++++-------
 src/types.h   |  2 +-
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/pawns.cpp b/src/pawns.cpp
index 73682529..868d0c8e 100644
--- a/src/pawns.cpp
+++ b/src/pawns.cpp
@@ -30,21 +30,21 @@ namespace {
   #define S(mg, eg) make_score(mg, eg)
 
   // Pawn penalties
-  constexpr Score Backward      = S( 9, 24);
-  constexpr Score Doubled       = S(11, 56);
-  constexpr Score Isolated      = S( 5, 15);
-  constexpr Score WeakLever     = S( 0, 56);
-  constexpr Score WeakUnopposed = S(13, 27);
+  constexpr Score Backward      = S( 8, 27);
+  constexpr Score Doubled       = S(11, 55);
+  constexpr Score Isolated      = S( 5, 17);
+  constexpr Score WeakLever     = S( 2, 54);
+  constexpr Score WeakUnopposed = S(15, 25);
 
   // Bonus for blocked pawns at 5th or 6th rank
-  constexpr Score BlockedPawn[2] = { S(-11, -4), S(-3, 4) };
+  constexpr Score BlockedPawn[2] = { S(-13, -4), S(-4, 3) };
 
   constexpr Score BlockedStorm[RANK_NB] = {
     S(0, 0), S(0, 0), S(76, 78), S(-10, 15), S(-7, 10), S(-4, 6), S(-1, 2)
   };
 
   // Connected pawn bonus
-  constexpr int Connected[RANK_NB] = { 0, 7, 8, 12, 29, 48, 86 };
+  constexpr int Connected[RANK_NB] = { 0, 7, 8, 11, 24, 45, 85 };
 
   // Strength of pawn shelter for our king by [distance from edge][rank].
   // RANK_1 = 0 is used for files where we have no pawn, or pawn is behind our king.
diff --git a/src/types.h b/src/types.h
index 379859f7..73da41e2 100644
--- a/src/types.h
+++ b/src/types.h
@@ -178,7 +178,7 @@ enum Value : int {
   VALUE_MATE_IN_MAX_PLY  =  VALUE_MATE - MAX_PLY,
   VALUE_MATED_IN_MAX_PLY = -VALUE_MATE_IN_MAX_PLY,
 
-  PawnValueMg   = 124,   PawnValueEg   = 206,
+  PawnValueMg   = 126,   PawnValueEg   = 208,
   KnightValueMg = 781,   KnightValueEg = 854,
   BishopValueMg = 825,   BishopValueEg = 915,
   RookValueMg   = 1276,  RookValueEg   = 1380,

From 7f336dd59b3b1365943d73ee706a9610e18108bb Mon Sep 17 00:00:00 2001
From: UnaiCorzo <corzounai@gmail.com>
Date: Tue, 4 Aug 2020 14:32:52 +0200
Subject: [PATCH 30/86] Remove QueenInfiltration

STC https://tests.stockfishchess.org/tests/view/5f2955b1a5abc164f05e4c85
LLR: 2.96 (-2.94,2.94) {-1.50,0.50}
Total: 29216 W: 5560 L: 5416 D: 18240
Ptnml(0-2): 466, 3329, 6902, 3417, 494

LTC https://tests.stockfishchess.org/tests/view/5f299154a5abc164f05e4ca1
LLR: 2.92 (-2.94,2.94) {-1.50,0.50}
Total: 54144 W: 6635 L: 6594 D: 40915
Ptnml(0-2): 372, 4859, 16536, 4966, 339

closes https://github.com/official-stockfish/Stockfish/pull/2910

Bench: 4609008
---
 src/evaluate.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 015efa48..d20c7b70 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -181,7 +181,6 @@ namespace {
   constexpr Score MinorBehindPawn     = S( 18,  3);
   constexpr Score PassedFile          = S( 11,  8);
   constexpr Score PawnlessFlank       = S( 17, 95);
-  constexpr Score QueenInfiltration   = S( -2, 14);
   constexpr Score ReachableOutpost    = S( 31, 22);
   constexpr Score RestrictedPiece     = S(  7,  7);
   constexpr Score RookOnKingRing      = S( 16,  0);
@@ -423,10 +422,6 @@ namespace {
             Bitboard queenPinners;
             if (pos.slider_blockers(pos.pieces(Them, ROOK, BISHOP), s, queenPinners))
                 score -= WeakQueen;
-
-            // Bonus for queen on weak square in enemy camp
-            if (relative_rank(Us, s) > RANK_4 && (~pe->pawn_attacks_span(Them) & s))
-                score += QueenInfiltration;
         }
     }
     if (T)

From 615d98da2447e79ceceae205e0cd4e878115acc3 Mon Sep 17 00:00:00 2001
From: Stefan Geschwentner <stgeschwentner@gmail.com>
Date: Wed, 5 Aug 2020 09:29:27 +0200
Subject: [PATCH 31/86] Do move legality check before pruning.

This allows us to simplify the code, because the move counter no longer has
to be decremented later if a move isn't legal. As a side effect, illegal
pruned moves are no longer included in the move counter, so slightly less
pruning and reduction is done.

STC:
LLR: 2.94 (-2.94,2.94) {-1.50,0.50}
Total: 111016 W: 21106 L: 21077 D: 68833
Ptnml(0-2): 1830, 13083, 25736, 12946, 1913
https://tests.stockfishchess.org/tests/view/5f28816fa5abc164f05e4c26

LTC:
LLR: 2.94 (-2.94,2.94) {-1.50,0.50}
Total: 39264 W: 4909 L: 4843 D: 29512
Ptnml(0-2): 263, 3601, 11854, 3635, 279
https://tests.stockfishchess.org/tests/view/5f297902a5abc164f05e4c8e

closes https://github.com/official-stockfish/Stockfish/pull/2906

Bench: 4390086
---
 src/search.cpp | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index d1dc4489..2f83f4f4 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -986,6 +986,10 @@ moves_loop: // When in check, search starts from here
                                   thisThread->rootMoves.begin() + thisThread->pvLast, move))
           continue;
 
+      // Check for legality
+      if (!rootNode && !pos.legal(move))
+          continue;
+
       ss->moveCount = ++moveCount;
 
       if (rootNode && thisThread == Threads.main() && Time.elapsed() > 3000)
@@ -1137,13 +1141,6 @@ moves_loop: // When in check, search starts from here
       // Speculative prefetch as early as possible
       prefetch(TT.first_entry(pos.key_after(move)));
 
-      // Check for legality just before making the move
-      if (!rootNode && !pos.legal(move))
-      {
-          ss->moveCount = --moveCount;
-          continue;
-      }
-
       // Update the current move (this must be done after singular extension search)
       ss->currentMove = move;
       ss->continuationHistory = &thisThread->continuationHistory[ss->inCheck]

From 857e045ced9e20f202e15d825e47b3ab8241dcef Mon Sep 17 00:00:00 2001
From: Sergio Vieri <sergio.vieri.hp@gmail.com>
Date: Fri, 7 Aug 2020 15:15:04 +0800
Subject: [PATCH 32/86] Update default net to nn-9931db908a9b.nnue

Net created at 20200806-1802

passed STC:
https://tests.stockfishchess.org/tests/view/5f2d00b461e3b6af64881f21
LLR: 2.94 (-2.94,2.94) {-0.50,1.50}
Total: 6672 W: 1052 L: 898 D: 4722
Ptnml(0-2): 63, 600, 1868, 730, 75

passed LTC:
https://tests.stockfishchess.org/tests/view/5f2d052a61e3b6af64881f29
LLR: 2.96 (-2.94,2.94) {0.25,1.75}
Total: 7576 W: 573 L: 463 D: 6540
Ptnml(0-2): 8, 392, 2889, 480, 19

closes https://github.com/official-stockfish/Stockfish/pull/2923

Bench: 4390086
---
 AUTHORS           | 1 +
 src/ucioption.cpp | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/AUTHORS b/AUTHORS
index 2e080e61..07e07297 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -151,6 +151,7 @@ Sami Kiminki (skiminki)
 Sebastian Buchwald (UniQP)
 Sergei Antonov (saproj)
 Sergei Ivanov (svivanov72)
+Sergio Vieri (sergiovieri)
 sf-x
 Shane Booth (shane31)
 Shawn Varghese (xXH4CKST3RXx)
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 788aed17..faeb78ae 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -79,7 +79,7 @@ void init(OptionsMap& o) {
   o["Syzygy50MoveRule"]      << Option(true);
   o["SyzygyProbeLimit"]      << Option(7, 0, 7);
   o["Use NNUE"]              << Option(false, on_use_NNUE);
-  o["EvalFile"]              << Option("nn-97f742aaefcd.nnue", on_eval_file);
+  o["EvalFile"]              << Option("nn-9931db908a9b.nnue", on_eval_file);
 }
 
 

From dc5af66eadf3cbe3c3ef106657e561c1aa8ac97f Mon Sep 17 00:00:00 2001
From: Unai Corzo <corzounai@gmail.com>
Date: Sat, 8 Aug 2020 08:24:20 +0200
Subject: [PATCH 33/86] Tweak futility pruning depth.

STC https://tests.stockfishchess.org/tests/view/5f2d237161e3b6af64881f43
LLR: 2.96 (-2.94,2.94) {-0.50,1.50}
Total: 12712 W: 1823 L: 1664 D: 9225
Ptnml(0-2): 122, 1166, 3627, 1313, 128

LTC https://tests.stockfishchess.org/tests/view/5f2d473061e3b6af64881f6f
LLR: 2.96 (-2.94,2.94) {0.25,1.75}
Total: 12104 W: 912 L: 788 D: 10404
Ptnml(0-2): 13, 665, 4582, 769, 23

closes https://github.com/official-stockfish/Stockfish/pull/2930

bench: 4271421
---
 src/search.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/search.cpp b/src/search.cpp
index 2f83f4f4..886ed52c 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -816,7 +816,7 @@ namespace {
 
     // Step 8. Futility pruning: child node (~50 Elo)
     if (   !PvNode
-        &&  depth < 6
+        &&  depth < 8
         &&  eval - futility_margin(depth, improving) >= beta
         &&  eval < VALUE_KNOWN_WIN) // Do not return unproven wins
         return eval;

From 5ccff25df2e8fcbee3d4c1428bbc101afa88e700 Mon Sep 17 00:00:00 2001
From: Lolligerhans <lolligerhans@gmx.de>
Date: Fri, 7 Aug 2020 11:24:37 +0200
Subject: [PATCH 34/86] Expand outposts to minors shielded by pawns

Allow any pawn in front of a minor piece to replace the pawn protection
requirement for outposts.

  +-------+  +-------+
  | . . o |  | o . . |    o  Their pawns
  | . o x |  | o . . |    x  Our pawns
  | o N . |  | x o B |  N,B  New (reachable) outpost
  | . . . |  | . _ . |    _  Reachable square behind a pawn
  +-------+  +-------+
  N outpost  B reaches
               outpost

  We want outposts to be secured by pawns against major pieces. If
a minor is shielded by any pawn from above, it is rarely at the same
time protected by our pawn attacks from below. However, the pawn shield
in itself offers some degree of protection.
  A pawn shield will now suffice to replace the pawn protection for the
outpost (and reachable outpost) bonus.

This effect stacks with the existing "minor behind pawn" bonus.

STC
https://tests.stockfishchess.org/tests/view/5f2bcd14b3ebe5cbfee85b2c
LLR: 2.94 (-2.94,2.94) {-0.50,1.50}
Total: 27248 W: 5353 L: 5119 D: 16776
Ptnml(0-2): 462, 3174, 6185, 3274, 529

LTC
https://tests.stockfishchess.org/tests/view/5f2bfef5b3ebe5cbfee85b5a
LLR: 2.96 (-2.94,2.94) {0.25,1.75}
Total: 99432 W: 12580 L: 12130 D: 74722
Ptnml(0-2): 696, 8903, 30049, 9391, 677

Closes #2935

Bench: 4143673
---
 src/evaluate.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index d20c7b70..47b84ee6 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -345,7 +345,8 @@ namespace {
         {
             // Bonus if the piece is on an outpost square or can reach one
             // Reduced bonus for knights (BadOutpost) if few relevant targets
-            bb = OutpostRanks & attackedBy[Us][PAWN] & ~pe->pawn_attacks_span(Them);
+            bb = OutpostRanks & (attackedBy[Us][PAWN] | shift<Down>(pos.pieces(PAWN)))
+                              & ~pe->pawn_attacks_span(Them);
             Bitboard targets = pos.pieces(Them) & ~pos.pieces(PAWN);
 
             if (   Pt == KNIGHT

From f4c27cda1a6874550fcbf6cf991b0b9abe43ff39 Mon Sep 17 00:00:00 2001
From: SFisGOD <jonathandumale@gmail.com>
Date: Sat, 8 Aug 2020 03:45:08 +0800
Subject: [PATCH 35/86] Reintroduce late irreversible move extension

Reintroduce vondele's late irreversible move extension for fortress keeping.
This was removed when we only had classical eval.
Now that we have the NNUE net, it seems that this is useful again.

STC:
LLR: 2.93 (-2.94,2.94) {-0.50,1.50}
Total: 5352 W: 787 L: 653 D: 3912
Ptnml(0-2): 34, 451, 1579, 571, 41
https://tests.stockfishchess.org/tests/view/5f2dc8ad61e3b6af64881ff0

LTC:
LLR: 2.94 (-2.94,2.94) {0.25,1.75}
Total: 14416 W: 1013 L: 891 D: 12512
Ptnml(0-2): 15, 722, 5623, 822, 26
https://tests.stockfishchess.org/tests/view/5f2e0e3661e3b6af6488201e

closes https://github.com/official-stockfish/Stockfish/pull/2936

Bench: 4154696
---
 src/search.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/search.cpp b/src/search.cpp
index 886ed52c..8be96e29 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1134,6 +1134,12 @@ moves_loop: // When in check, search starts from here
       // Castling extension
       if (type_of(move) == CASTLING)
           extension = 1;
+	  
+      // Late irreversible move extension
+      if (   move == ttMove
+          && pos.rule50_count() > 80
+          && (captureOrPromotion || type_of(movedPiece) == PAWN))
+          extension = 2;
 
       // Add extension to new depth
       newDepth += extension;

From 910f779eb1f432c3f90fc19c7824840e02cac837 Mon Sep 17 00:00:00 2001
From: Vizvezdenec <Vizvezdenec@gmail.com>
Date: Sat, 8 Aug 2020 05:51:26 +0300
Subject: [PATCH 36/86] Do more futility pruning for parent nodes.

This patch increases the lmrDepth threshold for futility pruning at parent nodes so it can apply more often.
With the radical change to the evaluation approach, it seems that search is really far from an optimal state, especially the parts that use the static evaluation of the position.

passed STC
https://tests.stockfishchess.org/tests/view/5f2da75661e3b6af64881fd0
LLR: 2.93 (-2.94,2.94) {-0.50,1.50}
Total: 8744 W: 1305 L: 1156 D: 6283
Ptnml(0-2): 75, 789, 2500, 928, 80

passed LTC
https://tests.stockfishchess.org/tests/view/5f2dcb2a61e3b6af64881ff3
LLR: 2.98 (-2.94,2.94) {0.25,1.75}
Total: 17728 W: 1256 L: 1117 D: 15355
Ptnml(0-2): 22, 961, 6774, 1070, 37

Bench: 4067325
---
 src/search.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/search.cpp b/src/search.cpp
index 8be96e29..4a9bd7de 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1028,7 +1028,7 @@ moves_loop: // When in check, search starts from here
                   continue;
 
               // Futility pruning: parent node (~5 Elo)
-              if (   lmrDepth < 6
+              if (   lmrDepth < 8
                   && !ss->inCheck
                   && ss->staticEval + 284 + 188 * lmrDepth <= alpha
                   &&  (*contHist[0])[movedPiece][to_sq(move)]

From 23ecf3d5c6ffbcfbe45acd2afcf503929474a4db Mon Sep 17 00:00:00 2001
From: "U-DESKTOP-3900\\Mark" <jjoshua2@gmail.com>
Date: Fri, 7 Aug 2020 19:53:18 -0400
Subject: [PATCH 37/86] simplified and increased threshold to switch between
 NNUE and classical

STC https://tests.stockfishchess.org/tests/view/5f2deb1661e3b6af6488200f
LLR: 2.96 (-2.94,2.94) {-1.50,0.50}
Total: 10376 W: 1481 L: 1359 D: 7536
Ptnml(0-2): 91, 953, 2981, 1069, 94

LTC: https://tests.stockfishchess.org/html/live_elo.html?5f2e0a0461e3b6af64882019
LLR: 2.99 (-2.94,2.94) {-1.50,0.50}
Total: 5040 W: 375 L: 315 D: 4350
Ptnml(0-2): 7, 263, 1926, 311, 13

closes https://github.com/official-stockfish/Stockfish/pull/2934

Bench: 4067325
---
 src/evaluate.cpp | 7 +++----
 src/search.cpp   | 2 +-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 47b84ee6..1ae6cb3a 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -110,7 +110,7 @@ namespace {
   constexpr Value LazyThreshold1 =  Value(1400);
   constexpr Value LazyThreshold2 =  Value(1300);
   constexpr Value SpaceThreshold = Value(12222);
-  constexpr Value NNUEThreshold  =   Value(500);
+  constexpr Value NNUEThreshold  =   Value(520);
 
   // KingAttackWeights[PieceType] contains king attack weights by piece type
   constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 81, 52, 44, 10 };
@@ -939,10 +939,9 @@ Value Eval::evaluate(const Position& pos) {
 
   if (Eval::useNNUE)
   {
-      Value balance = pos.non_pawn_material(WHITE) - pos.non_pawn_material(BLACK);
-      balance += 200 * (pos.count<PAWN>(WHITE) - pos.count<PAWN>(BLACK));
+      Value v = eg_value(pos.psq_score());
       // Take NNUE eval only on balanced positions
-      if (abs(balance) < NNUEThreshold)
+      if (abs(v) < NNUEThreshold)
          return NNUE::evaluate(pos) + Tempo;
   }
   return Evaluation<NO_TRACE>(pos).value();
diff --git a/src/search.cpp b/src/search.cpp
index 4a9bd7de..4a993b01 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1134,7 +1134,7 @@ moves_loop: // When in check, search starts from here
       // Castling extension
       if (type_of(move) == CASTLING)
           extension = 1;
-	  
+
       // Late irreversible move extension
       if (   move == ttMove
           && pos.rule50_count() > 80

From 450b60a303b0c59b0cc5dd22d95b9a983dfc4f96 Mon Sep 17 00:00:00 2001
From: mckx00 <mckx00@gmail.com>
Date: Sat, 8 Aug 2020 03:07:07 -0700
Subject: [PATCH 38/86] Remove unnecessary legality check

Possible after the recent reordering of the pos.legal(move) check

https://github.com/official-stockfish/Stockfish/pull/2941

No functional change.
---
 AUTHORS        | 1 +
 src/search.cpp | 3 +--
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index 07e07297..21ef3e50 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -79,6 +79,7 @@ Jean Gauthier (OuaisBla)
 Jean-Francois Romang (jromang)
 Jekaa
 Jerry Donald Watson (jerrydonaldwatson)
+jjoshua2
 Jonathan Calovski (Mysseno)
 Jonathan Dumale (SFisGOD)
 Joost VandeVondele (vondele)
diff --git a/src/search.cpp b/src/search.cpp
index 4a993b01..e5d18f77 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1079,8 +1079,7 @@ moves_loop: // When in check, search starts from here
        /* &&  ttValue != VALUE_NONE Already implicit in the next condition */
           &&  abs(ttValue) < VALUE_KNOWN_WIN
           && (tte->bound() & BOUND_LOWER)
-          &&  tte->depth() >= depth - 3
-          &&  pos.legal(move))
+          &&  tte->depth() >= depth - 3)
       {
           Value singularBeta = ttValue - ((formerPv + 4) * depth) / 2;
           Depth singularDepth = (depth - 1 + 3 * formerPv) / 2;

From 3368d0328591b2741ca32e57cfa0a35a7144fdd1 Mon Sep 17 00:00:00 2001
From: Moez Jellouli <37274752+MJZ1977@users.noreply.github.com>
Date: Sat, 8 Aug 2020 12:35:34 +0200
Subject: [PATCH 39/86] update Null Move Pruning parameters

STC: https://tests.stockfishchess.org/tests/view/5f2dc38561e3b6af64881fec
LLR: 2.99 (-2.94,2.94) {-0.50,1.50}
Total: 6120 W: 903 L: 758 D: 4459
Ptnml(0-2): 44, 535, 1775, 644, 62

LTC: https://tests.stockfishchess.org/tests/view/5f2dd55f61e3b6af64882003
LLR: 2.95 (-2.94,2.94) {0.25,1.75}
Total: 7424 W: 577 L: 463 D: 6384
Ptnml(0-2): 16, 375, 2824, 473, 24

closes https://github.com/official-stockfish/Stockfish/pull/2942

bench 4107833
---
 src/search.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/search.cpp b/src/search.cpp
index e5d18f77..9cdc7046 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -827,7 +827,7 @@ namespace {
         && (ss-1)->statScore < 23824
         &&  eval >= beta
         &&  eval >= ss->staticEval
-        &&  ss->staticEval >= beta - 33 * depth - 33 * improving + 112 * ttPv + 311
+        &&  ss->staticEval >= beta - 28 * depth - 28 * improving + 94 * ttPv + 200
         && !excludedMove
         &&  pos.non_pawn_material(us)
         && (ss->ply >= thisThread->nmpMinPly || us != thisThread->nmpColor))

From e663bc533020183c0c52eaf877a91422c9c80742 Mon Sep 17 00:00:00 2001
From: Vizvezdenec <Vizvezdenec@gmail.com>
Date: Sat, 8 Aug 2020 17:43:41 +0300
Subject: [PATCH 40/86] Do more aggressive futility pruning for captures

This patch is in line with other patches that use the better eval to produce more aggressive cutoffs based on the static evaluation of the position. It allows more aggressive futility pruning for captures: the pruning now triggers at higher static evaluations, and therefore more often.

passed STC
https://tests.stockfishchess.org/tests/view/5f2da79e61e3b6af64881fd2
LLR: 3.87 (-2.94,2.94) {-0.50,1.50}
Total: 27256 W: 3809 L: 3593 D: 19854
Ptnml(0-2): 221, 2578, 7830, 2762, 237

passed LTC
https://tests.stockfishchess.org/tests/view/5f2df92061e3b6af64882012
LLR: 4.97 (-2.94,2.94) {0.25,1.75}
Total: 43624 W: 3095 L: 2820 D: 37709
Ptnml(0-2): 66, 2410, 16608, 2639, 89

closes https://github.com/official-stockfish/Stockfish/pull/2946

Bench: 4272280
---
 src/search.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/search.cpp b/src/search.cpp
index 9cdc7046..201cd974 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1055,7 +1055,7 @@ moves_loop: // When in check, search starts from here
                   && !(PvNode && abs(bestValue) < 2)
                   && PieceValue[MG][type_of(movedPiece)] >= PieceValue[MG][type_of(pos.piece_on(to_sq(move)))]
                   && !ss->inCheck
-                  && ss->staticEval + 267 + 391 * lmrDepth
+                  && ss->staticEval + 178 + 261 * lmrDepth
                      + PieceValue[MG][type_of(pos.piece_on(to_sq(move)))] <= alpha)
                   continue;
 

From 6d6267c378aa0aa354e203e5025361d9a4e0d449 Mon Sep 17 00:00:00 2001
From: Guy Vreuls <guyvreuls@gmail.com>
Date: Sat, 8 Aug 2020 12:45:10 +0200
Subject: [PATCH 41/86] Parallelize Link Time Optimization for GCC, CLANG and
 MINGW

This patch tries to run multiple LTO threads in parallel, speeding up
the build process of optimized builds if the -j make parameter is used.
This mitigates the longer linking times of optimized builds since the
integration of the NNUE code. Roughly 2x build speedup.

I tried a similar patch some two years ago, but it ran into trouble
with old compiler versions at the time. Since we're on the C++17 standard now,
these old compilers should be obsolete.

closes https://github.com/official-stockfish/Stockfish/pull/2943

No functional change.
---
 src/Makefile | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 4741e722..cab7a7e5 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -282,6 +282,9 @@ ifeq ($(COMP),gcc)
 	ifneq ($(KERNEL),Darwin)
 	   LDFLAGS += -Wl,--no-as-needed
 	endif
+	
+	gccversion = $(shell $(CXX) --version)
+	gccisclang = $(findstring clang,$(gccversion))
 endif
 
 ifeq ($(COMP),mingw)
@@ -496,18 +499,28 @@ endif
 ### needs access to the optimization flags.
 ifeq ($(optimize),yes)
 ifeq ($(debug), no)
-	ifeq ($(comp),$(filter $(comp),gcc clang))
+	ifeq ($(comp),clang)
+		CXXFLAGS += -flto=thin
+		LDFLAGS += $(CXXFLAGS)
+
+# GCC and CLANG use different methods for parallelizing LTO and CLANG pretends to be
+# GCC on some systems.
+	else ifeq ($(comp),gcc)
+	ifeq ($(gccisclang),)
 		CXXFLAGS += -flto
+		LDFLAGS += $(CXXFLAGS) -flto=jobserver
+	else
+		CXXFLAGS += -flto=thin
 		LDFLAGS += $(CXXFLAGS)
 	endif
 
 # To use LTO and static linking on windows, the tool chain requires a recent gcc:
 # gcc version 10.1 in msys2 or TDM-GCC version 9.2 are know to work, older might not.
 # So, only enable it for a cross from Linux by default.
-	ifeq ($(comp),mingw)
+	else ifeq ($(comp),mingw)
 	ifeq ($(KERNEL),Linux)
 		CXXFLAGS += -flto
-		LDFLAGS += $(CXXFLAGS)
+		LDFLAGS += $(CXXFLAGS) -flto=jobserver
 	endif
 	endif
 endif
@@ -693,7 +706,7 @@ config-sanity:
 	@test "$(comp)" = "gcc" || test "$(comp)" = "icc" || test "$(comp)" = "mingw" || test "$(comp)" = "clang"
 
 $(EXE): $(OBJS)
-	$(CXX) -o $@ $(OBJS) $(LDFLAGS)
+	+$(CXX) -o $@ $(OBJS) $(LDFLAGS)
 
 clang-profile-make:
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \

From 1949eb8604853e2ad8f85400590e6a1e2ce7e451 Mon Sep 17 00:00:00 2001
From: Unai Corzo <corzounai@gmail.com>
Date: Sat, 8 Aug 2020 22:03:37 +0200
Subject: [PATCH 42/86] Singular extension search tweak

Tweak depth.

STC https://tests.stockfishchess.org/tests/view/5f2d22ec61e3b6af64881f40
LLR: 2.94 (-2.94,2.94) {-0.50,1.50}
Total: 17984 W: 2603 L: 2441 D: 12940
Ptnml(0-2): 133, 1751, 5094, 1849, 165

LTC https://tests.stockfishchess.org/tests/view/5f2d5a6a61e3b6af64881f7f
LLR: 2.95 (-2.94,2.94) {0.25,1.75}
Total: 85808 W: 5956 L: 5621 D: 74231
Ptnml(0-2): 149, 4748, 32785, 5063, 159

closes https://github.com/official-stockfish/Stockfish/pull/2950

fixes two README.md typos:
fixes https://github.com/official-stockfish/Stockfish/issues/2932

bench: 4022669
---
 README.md      | 4 ++--
 src/search.cpp | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index f71a8b34..7b6ddf4c 100644
--- a/README.md
+++ b/README.md
@@ -58,7 +58,7 @@ Currently, Stockfish has the following UCI options:
 
   * #### Use NNUE
     Toggle between the NNUE and classical evaluation functions. If set to "true",
-    the network parameters must be availabe to load from file (see also EvalFile).
+    the network parameters must be available to load from file (see also EvalFile).
 
   * #### EvalFile
     The name of the file of the NNUE evaluation parameters. Depending on the GUI the
@@ -138,7 +138,7 @@ Currently, Stockfish has the following UCI options:
   * #### Debug Log File
     Write all communication to and from the engine into a text file.
 
-## classical and NNUE evaluation
+## Classical and NNUE evaluation
 
 Both approaches assign a value to a position that is used in alpha-beta (PVS) search
 to find the best move. The classical evaluation computes this value as a function
diff --git a/src/search.cpp b/src/search.cpp
index 201cd974..37e3ff22 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1072,7 +1072,7 @@ moves_loop: // When in check, search starts from here
       // then that move is singular and should be extended. To verify this we do
       // a reduced search on all the other moves but the ttMove and if the
       // result is lower than ttValue minus a margin, then we will extend the ttMove.
-      if (    depth >= 6
+      if (    depth >= 7
           &&  move == ttMove
           && !rootNode
           && !excludedMove // Avoid recursive singular search

From add890a10b8fe03e091520cd0af7383615c6c386 Mon Sep 17 00:00:00 2001
From: Unai Corzo <corzounai@gmail.com>
Date: Sat, 8 Aug 2020 22:08:40 +0200
Subject: [PATCH 43/86] LMR search tweak

All credit to Vizvezdenec, the original author of the idea.

STC https://tests.stockfishchess.org/tests/view/5f2d606a61e3b6af64881f88
LLR: 2.95 (-2.94,2.94) {-0.50,1.50}
Total: 8440 W: 1191 L: 1048 D: 6201
Ptnml(0-2): 59, 754, 2467, 865, 75

LTC https://tests.stockfishchess.org/tests/view/5f2d84ad61e3b6af64881fbd
LLR: 2.95 (-2.94,2.94) {0.25,1.75}
Total: 21896 W: 1557 L: 1406 D: 18933
Ptnml(0-2): 33, 1185, 8378, 1302, 50

closes https://github.com/official-stockfish/Stockfish/pull/2951

bench: 4084753
---
 src/search.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/search.cpp b/src/search.cpp
index 37e3ff22..0a2519b6 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1159,7 +1159,7 @@ moves_loop: // When in check, search starts from here
       // Step 16. Reduced depth search (LMR, ~200 Elo). If the move fails high it will be
       // re-searched at full depth.
       if (    depth >= 3
-          &&  moveCount > 1 + 2 * rootNode
+          &&  moveCount > 1 + 2 * rootNode + 2 * (PvNode && abs(bestValue) < 2)
           && (!rootNode || thisThread->best_move_count(move) == 0)
           && (  !captureOrPromotion
               || moveCountPruning

From d7a26899a973536ab9d3ce4771d8276d1a4dc55c Mon Sep 17 00:00:00 2001
From: Daniel Dugovic <dandydand@gmail.com>
Date: Sat, 8 Aug 2020 15:39:29 -0500
Subject: [PATCH 44/86] Use fallback implementation for C++ aligned_alloc

fixes https://github.com/official-stockfish/Stockfish/issues/2921

closes https://github.com/official-stockfish/Stockfish/pull/2927

No functional change
---
 src/Makefile | 4 ++--
 src/misc.cpp | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index cab7a7e5..b7585a17 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -354,8 +354,8 @@ endif
 endif
 
 ifeq ($(KERNEL),Darwin)
-	CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.15
-	LDFLAGS += -arch $(arch) -mmacosx-version-min=10.15
+	CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.13
+	LDFLAGS += -arch $(arch) -mmacosx-version-min=10.13
 endif
 
 ### Travis CI script uses COMPILER to overwrite CXX
diff --git a/src/misc.cpp b/src/misc.cpp
index 3d7c75e5..05f79b45 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -321,9 +321,9 @@ void prefetch(void* addr) {
 ///
 
 void* std_aligned_alloc(size_t alignment, size_t size) {
-#if defined(__APPLE__)
+#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) )
   return aligned_alloc(alignment, size);
-#elif defined(_WIN32)
+#elif (defined(_WIN32) || (defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES)))
   return _mm_malloc(size, alignment);
 #else
   return std::aligned_alloc(alignment, size);
@@ -331,9 +331,9 @@ void* std_aligned_alloc(size_t alignment, size_t size) {
 }
 
 void std_aligned_free(void* ptr) {
-#if defined(__APPLE__)
+#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) )
   free(ptr);
-#elif defined(_WIN32)
+#elif (defined(_WIN32) || (defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES)))
   _mm_free(ptr);
 #else
   free(ptr);

From 320fa1b2f082a7db67363e468e7e241d7cedcc64 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Sun, 9 Aug 2020 11:05:07 +0200
Subject: [PATCH 45/86] Improve error message on missing net.

small rewording, but also print the download url for the default net.

closes https://github.com/official-stockfish/Stockfish/pull/2954

No functional change
---
 src/evaluate.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 1ae6cb3a..a642357e 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -50,9 +50,13 @@ namespace Eval {
     std::string eval_file = std::string(Options["EvalFile"]);
     if (useNNUE && eval_file_loaded != eval_file)
     {
-        std::cerr << "Use of NNUE evaluation, but the file " << eval_file << " was not loaded successfully. "
-                  << "These network evaluation parameters must be available, compatible with this version of the code. "
-                  << "The UCI option EvalFile might need to specify the full path, including the directory/folder name, to the file." << std::endl;
+        UCI::OptionsMap defaults;
+        UCI::init(defaults);
+
+        std::cerr << "NNUE evaluation used, but the network file " << eval_file << " was not loaded successfully. "
+                  << "These network evaluation parameters must be available, and compatible with this version of the code. "
+                  << "The UCI option EvalFile might need to specify the full path, including the directory/folder name, to the file. "
+                  << "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/"+std::string(defaults["EvalFile"]) << std::endl;
         std::exit(EXIT_FAILURE);
     }
 

From cd1bb27dd452f336d434a45131bfbe43f8a8c5b3 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Sun, 9 Aug 2020 19:08:47 +0200
Subject: [PATCH 46/86] Fix aligned_alloc on MinGW

introduced with d7a26899a973536ab9d3ce4771d8276d1a4dc55c

closes https://github.com/official-stockfish/Stockfish/pull/2959

No functional change.
---
 src/misc.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/misc.cpp b/src/misc.cpp
index 05f79b45..bdd7bccb 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -321,7 +321,7 @@ void prefetch(void* addr) {
 ///
 
 void* std_aligned_alloc(size_t alignment, size_t size) {
-#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) )
+#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32))
   return aligned_alloc(alignment, size);
 #elif (defined(_WIN32) || (defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES)))
   return _mm_malloc(size, alignment);
@@ -331,7 +331,7 @@ void* std_aligned_alloc(size_t alignment, size_t size) {
 }
 
 void std_aligned_free(void* ptr) {
-#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) )
+#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32))
   free(ptr);
 #elif (defined(_WIN32) || (defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES)))
   _mm_free(ptr);

From 2bfde5542919c2ed624b5b62883616e325ccb942 Mon Sep 17 00:00:00 2001
From: Vizvezdenec <Vizvezdenec@gmail.com>
Date: Sun, 9 Aug 2020 21:39:46 +0300
Subject: [PATCH 47/86] Adjust NNUE usage based on number of pawns in position

The idea of this patch is that positions are usually more complex and harder to evaluate when there are more pawns.
This patch adjusts NNUE threshold usage depending on number of pawns in position, if pawn count is <3 we use the
classical evaluation more often, for pawn count = 3 the patch is non-functional,
with pawn count > 3 NNUE evaluation is used more often.

passed STC
https://tests.stockfishchess.org/tests/view/5f2f02d09081672066536b1f
LLR: 2.96 (-2.94,2.94) {-0.50,1.50}
Total: 36520 W: 5011 L: 4823 D: 26686
Ptnml(0-2): 299, 3482, 10548, 3594, 337

passed LTC
https://tests.stockfishchess.org/tests/view/5f2f4c329081672066536b5c
LLR: 2.98 (-2.94,2.94) {0.25,1.75}
Total: 39272 W: 2630 L: 2433 D: 34209
Ptnml(0-2): 53, 2066, 15218, 2229, 70

closes https://github.com/official-stockfish/Stockfish/pull/2960

bench 4084753
---
 src/evaluate.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index a642357e..ce35c630 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -114,7 +114,7 @@ namespace {
   constexpr Value LazyThreshold1 =  Value(1400);
   constexpr Value LazyThreshold2 =  Value(1300);
   constexpr Value SpaceThreshold = Value(12222);
-  constexpr Value NNUEThreshold  =   Value(520);
+  constexpr Value NNUEThreshold  =   Value(460);
 
   // KingAttackWeights[PieceType] contains king attack weights by piece type
   constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 81, 52, 44, 10 };
@@ -945,7 +945,7 @@ Value Eval::evaluate(const Position& pos) {
   {
       Value v = eg_value(pos.psq_score());
       // Take NNUE eval only on balanced positions
-      if (abs(v) < NNUEThreshold)
+      if (abs(v) < NNUEThreshold + 20 * pos.count<PAWN>())
          return NNUE::evaluate(pos) + Tempo;
   }
   return Evaluation<NO_TRACE>(pos).value();

From a6e89293df5af35931b61d86b6de3872a981c100 Mon Sep 17 00:00:00 2001
From: Dariusz Orzechowski <dariusz.orzechowski@gmail.com>
Date: Sun, 9 Aug 2020 14:32:24 -0700
Subject: [PATCH 48/86] Avoid special casing for MinGW

after some testing, no version of MinGW/gcc has been found where this code is still necessary.
Probably older code (pre-c++17?)

closes https://github.com/official-stockfish/Stockfish/pull/2891

No functional change
---
 src/nnue/layers/affine_transform.h  | 29 +++--------------
 src/nnue/layers/clipped_relu.h      | 49 ++++-------------------------
 src/nnue/nnue_feature_transformer.h | 34 ++------------------
 3 files changed, 14 insertions(+), 98 deletions(-)

diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index b585bc87..ecc3008a 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -104,13 +104,8 @@ namespace Eval::NNUE::Layers {
         __m512i sum = _mm512_setzero_si512();
         const auto row = reinterpret_cast<const __m512i*>(&weights_[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-            __m512i product = _mm512_maddubs_epi16(_mm512_loadu_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
-  #else
-            __m512i product = _mm512_maddubs_epi16(_mm512_load_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
-  #endif
-
+            __m512i product = _mm512_maddubs_epi16(
+              _mm512_load_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
             product = _mm512_madd_epi16(product, kOnes);
             sum = _mm512_add_epi32(sum, product);
         }
@@ -125,12 +120,8 @@ namespace Eval::NNUE::Layers {
             const auto row_256 = reinterpret_cast<const __m256i*>(&weights_[offset]);
             int j = kNumChunks * 2;
 
-  #if defined(__MINGW32__) || defined(__MINGW64__)  // See HACK comment below in AVX2.
-            __m256i sum256 = _mm256_maddubs_epi16(_mm256_loadu_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
-  #else
-            __m256i sum256 = _mm256_maddubs_epi16(_mm256_load_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
-  #endif
-
+            __m256i sum256 = _mm256_maddubs_epi16(
+              _mm256_load_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
             sum256 = _mm256_madd_epi16(sum256, _mm256_set1_epi16(1));
             sum256 = _mm256_hadd_epi32(sum256, sum256);
             sum256 = _mm256_hadd_epi32(sum256, sum256);
@@ -144,17 +135,7 @@ namespace Eval::NNUE::Layers {
         const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
           __m256i product = _mm256_maddubs_epi16(
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-            // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
-            //       compiled with g++ in MSYS2 crashes here because the output memory is not aligned
-            //       even though alignas is specified.
-            _mm256_loadu_si256
-  #else
-            _mm256_load_si256
-  #endif
-
-            (&input_vector[j]), _mm256_load_si256(&row[j]));
+            _mm256_load_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
           product = _mm256_madd_epi16(product, kOnes);
           sum = _mm256_add_epi32(sum, product);
         }
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h
index 7ade598f..7e5fcf4a 100644
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -74,50 +74,13 @@ namespace Eval::NNUE::Layers {
       const auto out = reinterpret_cast<__m256i*>(output);
       for (IndexType i = 0; i < kNumChunks; ++i) {
         const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-          // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
-          //       compiled with g++ in MSYS2 crashes here because the output memory is not aligned
-          //       even though alignas is specified.
-          _mm256_loadu_si256
-  #else
-          _mm256_load_si256
-  #endif
-
-          (&in[i * 4 + 0]),
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-          _mm256_loadu_si256
-  #else
-          _mm256_load_si256
-  #endif
-
-          (&in[i * 4 + 1])), kWeightScaleBits);
+          _mm256_load_si256(&in[i * 4 + 0]),
+          _mm256_load_si256(&in[i * 4 + 1])), kWeightScaleBits);
         const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-          _mm256_loadu_si256
-  #else
-          _mm256_load_si256
-  #endif
-
-          (&in[i * 4 + 2]),
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-          _mm256_loadu_si256
-  #else
-          _mm256_load_si256
-  #endif
-
-          (&in[i * 4 + 3])), kWeightScaleBits);
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-        _mm256_storeu_si256
-  #else
-        _mm256_store_si256
-  #endif
-
-          (&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
+          _mm256_load_si256(&in[i * 4 + 2]),
+          _mm256_load_si256(&in[i * 4 + 3])), kWeightScaleBits);
+        _mm256_store_si256(
+            &out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
             _mm256_packs_epi16(words0, words1), kZero), kOffsets));
       }
       constexpr IndexType kStart = kNumChunks * kSimdWidth;
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 1cfebbe4..f899d761 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -110,36 +110,12 @@ namespace Eval::NNUE {
         auto out = reinterpret_cast<__m256i*>(&output[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
           __m256i sum0 =
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-            // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
-            //       compiled with g++ in MSYS2 crashes here because the output memory is not aligned
-            //       even though alignas is specified.
-            _mm256_loadu_si256
-  #else
-            _mm256_load_si256
-  #endif
-
-            (&reinterpret_cast<const __m256i*>(
+            _mm256_load_si256(&reinterpret_cast<const __m256i*>(
               accumulation[perspectives[p]][0])[j * 2 + 0]);
           __m256i sum1 =
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-            _mm256_loadu_si256
-  #else
-            _mm256_load_si256
-  #endif
-
-            (&reinterpret_cast<const __m256i*>(
+            _mm256_load_si256(&reinterpret_cast<const __m256i*>(
               accumulation[perspectives[p]][0])[j * 2 + 1]);
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-          _mm256_storeu_si256
-  #else
-          _mm256_store_si256
-  #endif
-
-          (&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
+          _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
               _mm256_packs_epi16(sum0, sum1), kZero), kControl));
         }
 
@@ -202,11 +178,7 @@ namespace Eval::NNUE {
           auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
           constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
           for (IndexType j = 0; j < kNumChunks; ++j) {
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-            _mm256_storeu_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadu_si256(&accumulation[j]), column[j]));
-  #else
             accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
-  #endif
           }
 
   #elif defined(USE_SSE2)

From 27b593a94477a821f80a041320683f805114d4a3 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Sun, 9 Aug 2020 18:11:38 +0200
Subject: [PATCH 49/86] Fix a data race for NNUE

the stateInfo at the rootPos is no longer read-only, as the NNUE accumulator is part of it.
Threads can thus not share this object and need their own copy.

tested for no regression
https://tests.stockfishchess.org/tests/view/5f3022239081672066536bce
LLR: 2.96 (-2.94,2.94) {-1.50,0.50}
Total: 52800 W: 6843 L: 6802 D: 39155
Ptnml(0-2): 336, 4646, 16399, 4679, 340

closes https://github.com/official-stockfish/Stockfish/pull/2957

fixes https://github.com/official-stockfish/Stockfish/issues/2933

No functional change
---
 src/Makefile   |  4 ++--
 src/thread.cpp | 13 +++++--------
 src/thread.h   |  1 +
 3 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index b7585a17..571172b2 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -354,8 +354,8 @@ endif
 endif
 
 ifeq ($(KERNEL),Darwin)
-	CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.13
-	LDFLAGS += -arch $(arch) -mmacosx-version-min=10.13
+	CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.14
+	LDFLAGS += -arch $(arch) -mmacosx-version-min=10.14
 endif
 
 ### Travis CI script uses COMPILER to overwrite CXX
diff --git a/src/thread.cpp b/src/thread.cpp
index 44aea14e..1aa66a81 100644
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -204,21 +204,18 @@ void ThreadPool::start_thinking(Position& pos, StateListPtr& states,
 
   // We use Position::set() to set root position across threads. But there are
   // some StateInfo fields (previous, pliesFromNull, capturedPiece) that cannot
-  // be deduced from a fen string, so set() clears them and to not lose the info
-  // we need to backup and later restore setupStates->back(). Note that setupStates
-  // is shared by threads but is accessed in read-only mode.
-  StateInfo tmp = setupStates->back();
-
+  // be deduced from a fen string, so set() clears them and they are set from
+  // setupStates->back() later. The rootState is per thread, earlier states are shared
+  // since they are read-only.
   for (Thread* th : *this)
   {
       th->nodes = th->tbHits = th->nmpMinPly = th->bestMoveChanges = 0;
       th->rootDepth = th->completedDepth = 0;
       th->rootMoves = rootMoves;
-      th->rootPos.set(pos.fen(), pos.is_chess960(), &setupStates->back(), th);
+      th->rootPos.set(pos.fen(), pos.is_chess960(), &th->rootState, th);
+      th->rootState = setupStates->back();
   }
 
-  setupStates->back() = tmp;
-
   main()->start_searching();
 }
 
diff --git a/src/thread.h b/src/thread.h
index 46da1e34..042bc2e9 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -65,6 +65,7 @@ public:
   std::atomic<uint64_t> nodes, tbHits, bestMoveChanges;
 
   Position rootPos;
+  StateInfo rootState;
   Search::RootMoves rootMoves;
   Depth rootDepth, completedDepth;
   CounterMoveHistory counterMoves;

From 651ec3b31ee68db50f38ccd8fcdedbd6673cd9ed Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Mon, 10 Aug 2020 07:18:15 +0200
Subject: [PATCH 50/86] Revert "Avoid special casing for MinGW"

This reverts commit a6e89293df5af35931b61d86b6de3872a981c100.

The offending setup has been found as gcc/mingw 7.3 (on Ubuntu 18.04).

fixes https://github.com/official-stockfish/Stockfish/issues/2963

closes https://github.com/official-stockfish/Stockfish/issues/2968

No functional change.
---
 src/nnue/layers/affine_transform.h  | 29 ++++++++++++++---
 src/nnue/layers/clipped_relu.h      | 49 +++++++++++++++++++++++++----
 src/nnue/nnue_feature_transformer.h | 34 ++++++++++++++++++--
 3 files changed, 98 insertions(+), 14 deletions(-)

diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index ecc3008a..b585bc87 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -104,8 +104,13 @@ namespace Eval::NNUE::Layers {
         __m512i sum = _mm512_setzero_si512();
         const auto row = reinterpret_cast<const __m512i*>(&weights_[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
-            __m512i product = _mm512_maddubs_epi16(
-              _mm512_load_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+            __m512i product = _mm512_maddubs_epi16(_mm512_loadu_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
+  #else
+            __m512i product = _mm512_maddubs_epi16(_mm512_load_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
+  #endif
+
             product = _mm512_madd_epi16(product, kOnes);
             sum = _mm512_add_epi32(sum, product);
         }
@@ -120,8 +125,12 @@ namespace Eval::NNUE::Layers {
             const auto row_256 = reinterpret_cast<const __m256i*>(&weights_[offset]);
             int j = kNumChunks * 2;
 
-            __m256i sum256 = _mm256_maddubs_epi16(
-              _mm256_load_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
+  #if defined(__MINGW32__) || defined(__MINGW64__)  // See HACK comment below in AVX2.
+            __m256i sum256 = _mm256_maddubs_epi16(_mm256_loadu_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
+  #else
+            __m256i sum256 = _mm256_maddubs_epi16(_mm256_load_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
+  #endif
+
             sum256 = _mm256_madd_epi16(sum256, _mm256_set1_epi16(1));
             sum256 = _mm256_hadd_epi32(sum256, sum256);
             sum256 = _mm256_hadd_epi32(sum256, sum256);
@@ -135,7 +144,17 @@ namespace Eval::NNUE::Layers {
         const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
           __m256i product = _mm256_maddubs_epi16(
-            _mm256_load_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+            // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
+            //       compiled with g++ in MSYS2 crashes here because the output memory is not aligned
+            //       even though alignas is specified.
+            _mm256_loadu_si256
+  #else
+            _mm256_load_si256
+  #endif
+
+            (&input_vector[j]), _mm256_load_si256(&row[j]));
           product = _mm256_madd_epi16(product, kOnes);
           sum = _mm256_add_epi32(sum, product);
         }
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h
index 7e5fcf4a..7ade598f 100644
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -74,13 +74,50 @@ namespace Eval::NNUE::Layers {
       const auto out = reinterpret_cast<__m256i*>(output);
       for (IndexType i = 0; i < kNumChunks; ++i) {
         const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
-          _mm256_load_si256(&in[i * 4 + 0]),
-          _mm256_load_si256(&in[i * 4 + 1])), kWeightScaleBits);
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+          // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
+          //       compiled with g++ in MSYS2 crashes here because the output memory is not aligned
+          //       even though alignas is specified.
+          _mm256_loadu_si256
+  #else
+          _mm256_load_si256
+  #endif
+
+          (&in[i * 4 + 0]),
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+          _mm256_loadu_si256
+  #else
+          _mm256_load_si256
+  #endif
+
+          (&in[i * 4 + 1])), kWeightScaleBits);
         const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
-          _mm256_load_si256(&in[i * 4 + 2]),
-          _mm256_load_si256(&in[i * 4 + 3])), kWeightScaleBits);
-        _mm256_store_si256(
-            &out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+          _mm256_loadu_si256
+  #else
+          _mm256_load_si256
+  #endif
+
+          (&in[i * 4 + 2]),
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+          _mm256_loadu_si256
+  #else
+          _mm256_load_si256
+  #endif
+
+          (&in[i * 4 + 3])), kWeightScaleBits);
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+        _mm256_storeu_si256
+  #else
+        _mm256_store_si256
+  #endif
+
+          (&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
             _mm256_packs_epi16(words0, words1), kZero), kOffsets));
       }
       constexpr IndexType kStart = kNumChunks * kSimdWidth;
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index f899d761..1cfebbe4 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -110,12 +110,36 @@ namespace Eval::NNUE {
         auto out = reinterpret_cast<__m256i*>(&output[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
           __m256i sum0 =
-            _mm256_load_si256(&reinterpret_cast<const __m256i*>(
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+            // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
+            //       compiled with g++ in MSYS2 crashes here because the output memory is not aligned
+            //       even though alignas is specified.
+            _mm256_loadu_si256
+  #else
+            _mm256_load_si256
+  #endif
+
+            (&reinterpret_cast<const __m256i*>(
               accumulation[perspectives[p]][0])[j * 2 + 0]);
           __m256i sum1 =
-            _mm256_load_si256(&reinterpret_cast<const __m256i*>(
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+            _mm256_loadu_si256
+  #else
+            _mm256_load_si256
+  #endif
+
+            (&reinterpret_cast<const __m256i*>(
               accumulation[perspectives[p]][0])[j * 2 + 1]);
-          _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
+
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+          _mm256_storeu_si256
+  #else
+          _mm256_store_si256
+  #endif
+
+          (&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
               _mm256_packs_epi16(sum0, sum1), kZero), kControl));
         }
 
@@ -178,7 +202,11 @@ namespace Eval::NNUE {
           auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
           constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
           for (IndexType j = 0; j < kNumChunks; ++j) {
+  #if defined(__MINGW32__) || defined(__MINGW64__)
+            _mm256_storeu_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadu_si256(&accumulation[j]), column[j]));
+  #else
             accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
+  #endif
           }
 
   #elif defined(USE_SSE2)

From bcdf41dadc8a5f8a23116236a0f449a08b46dc6b Mon Sep 17 00:00:00 2001
From: Sergio Vieri <sergio.vieri.hp@gmail.com>
Date: Mon, 10 Aug 2020 08:47:52 +0800
Subject: [PATCH 51/86] Update default net to nn-112bb1c8cdb5.nnue

First trained net using search eval instead of pv leaf static eval.

Net created at: 20200810-0744

passed STC: https://tests.stockfishchess.org/tests/view/5f30995d90816720665373f8
LLR: 2.93 (-2.94,2.94) {-0.50,1.50}
Total: 15416 W: 2071 L: 1920 D: 11425
Ptnml(0-2): 123, 1376, 4563, 1519, 127

passed LTC: https://tests.stockfishchess.org/tests/view/5f30a104908167206653742b
LLR: 2.93 (-2.94,2.94) {0.25,1.75}
Total: 29792 W: 2003 L: 1834 D: 25955
Ptnml(0-2): 50, 1541, 11550, 1700, 55

closes https://github.com/official-stockfish/Stockfish/pull/2966

Bench: 4084753
---
 src/ucioption.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index faeb78ae..b0689d6d 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -79,7 +79,7 @@ void init(OptionsMap& o) {
   o["Syzygy50MoveRule"]      << Option(true);
   o["SyzygyProbeLimit"]      << Option(7, 0, 7);
   o["Use NNUE"]              << Option(false, on_use_NNUE);
-  o["EvalFile"]              << Option("nn-9931db908a9b.nnue", on_eval_file);
+  o["EvalFile"]              << Option("nn-112bb1c8cdb5.nnue", on_eval_file);
 }
 
 

From a54f9011c3bf3581fe7daffef6be2d586e6662c1 Mon Sep 17 00:00:00 2001
From: jjoshua2 <jjoshua2@gmail.com>
Date: Sun, 9 Aug 2020 16:16:04 -0400
Subject: [PATCH 52/86] simplifying hybrid condition

STC https://tests.stockfishchess.org/tests/view/5f3059d1908167206653736b:
LLR: 2.94 (-2.94,2.94) {-1.50,0.50}
Total: 12520 W: 766 L: 727 D: 11027
Ptnml(0-2): 13, 624, 4949, 659, 15

LTC: https://tests.stockfishchess.org/tests/view/5f30863a90816720665373d1
LLR: 2.94 (-2.94,2.94) {-1.50,0.50}
Total: 12520 W: 766 L: 727 D: 11027
Ptnml(0-2): 13, 624, 4949, 659, 15

closes: https://github.com/official-stockfish/Stockfish/pull/2965

Bench: 4084753
---
 src/evaluate.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index ce35c630..caab2979 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -114,7 +114,7 @@ namespace {
   constexpr Value LazyThreshold1 =  Value(1400);
   constexpr Value LazyThreshold2 =  Value(1300);
   constexpr Value SpaceThreshold = Value(12222);
-  constexpr Value NNUEThreshold  =   Value(460);
+  constexpr Value NNUEThreshold  =   Value(575);
 
   // KingAttackWeights[PieceType] contains king attack weights by piece type
   constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 81, 52, 44, 10 };
@@ -945,7 +945,7 @@ Value Eval::evaluate(const Position& pos) {
   {
       Value v = eg_value(pos.psq_score());
       // Take NNUE eval only on balanced positions
-      if (abs(v) < NNUEThreshold + 20 * pos.count<PAWN>())
+      if (abs(v) < NNUEThreshold)
          return NNUE::evaluate(pos) + Tempo;
   }
   return Evaluation<NO_TRACE>(pos).value();

From 875183b310a8249922c2155e82cb4cecfae2097e Mon Sep 17 00:00:00 2001
From: mstembera <MissingEmail@email>
Date: Sun, 9 Aug 2020 23:50:59 -0700
Subject: [PATCH 53/86] Workaround using unaligned loads for gcc < 9

despite usage of alignas, the (avx2/avx512) code generated by older gcc needs to use
unaligned loads (e.g. confirmed crash with gcc 7.3/mingw on abrok).

Better performance thus requires gcc >= 9 on hardware supporting avx2/avx512

closes https://github.com/official-stockfish/Stockfish/pull/2969

No functional change
---
 src/nnue/layers/affine_transform.h  | 32 +++----------------
 src/nnue/layers/clipped_relu.h      | 48 +++--------------------------
 src/nnue/nnue_common.h              | 21 +++++++++++++
 src/nnue/nnue_feature_transformer.h | 42 ++++---------------------
 4 files changed, 36 insertions(+), 107 deletions(-)

diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index b585bc87..20ec2f12 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -104,13 +104,7 @@ namespace Eval::NNUE::Layers {
         __m512i sum = _mm512_setzero_si512();
         const auto row = reinterpret_cast<const __m512i*>(&weights_[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-            __m512i product = _mm512_maddubs_epi16(_mm512_loadu_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
-  #else
-            __m512i product = _mm512_maddubs_epi16(_mm512_load_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
-  #endif
-
+            __m512i product = _mm512_maddubs_epi16(_mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
             product = _mm512_madd_epi16(product, kOnes);
             sum = _mm512_add_epi32(sum, product);
         }
@@ -124,13 +118,7 @@ namespace Eval::NNUE::Layers {
             const auto iv_256  = reinterpret_cast<const __m256i*>(input);
             const auto row_256 = reinterpret_cast<const __m256i*>(&weights_[offset]);
             int j = kNumChunks * 2;
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)  // See HACK comment below in AVX2.
-            __m256i sum256 = _mm256_maddubs_epi16(_mm256_loadu_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
-  #else
-            __m256i sum256 = _mm256_maddubs_epi16(_mm256_load_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
-  #endif
-
+            __m256i sum256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
             sum256 = _mm256_madd_epi16(sum256, _mm256_set1_epi16(1));
             sum256 = _mm256_hadd_epi32(sum256, sum256);
             sum256 = _mm256_hadd_epi32(sum256, sum256);
@@ -143,18 +131,7 @@ namespace Eval::NNUE::Layers {
         __m256i sum = _mm256_setzero_si256();
         const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m256i product = _mm256_maddubs_epi16(
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-            // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
-            //       compiled with g++ in MSYS2 crashes here because the output memory is not aligned
-            //       even though alignas is specified.
-            _mm256_loadu_si256
-  #else
-            _mm256_load_si256
-  #endif
-
-            (&input_vector[j]), _mm256_load_si256(&row[j]));
+          __m256i product = _mm256_maddubs_epi16(_mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
           product = _mm256_madd_epi16(product, kOnes);
           sum = _mm256_add_epi32(sum, product);
         }
@@ -168,8 +145,7 @@ namespace Eval::NNUE::Layers {
         __m128i sum = _mm_cvtsi32_si128(biases_[i]);
         const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m128i product = _mm_maddubs_epi16(
-              _mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
+          __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
           product = _mm_madd_epi16(product, kOnes);
           sum = _mm_add_epi32(sum, product);
         }
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h
index 7ade598f..13196ec2 100644
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -74,50 +74,12 @@ namespace Eval::NNUE::Layers {
       const auto out = reinterpret_cast<__m256i*>(output);
       for (IndexType i = 0; i < kNumChunks; ++i) {
         const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-          // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
-          //       compiled with g++ in MSYS2 crashes here because the output memory is not aligned
-          //       even though alignas is specified.
-          _mm256_loadu_si256
-  #else
-          _mm256_load_si256
-  #endif
-
-          (&in[i * 4 + 0]),
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-          _mm256_loadu_si256
-  #else
-          _mm256_load_si256
-  #endif
-
-          (&in[i * 4 + 1])), kWeightScaleBits);
+            _mm256_loadA_si256(&in[i * 4 + 0]),
+            _mm256_loadA_si256(&in[i * 4 + 1])), kWeightScaleBits);
         const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-          _mm256_loadu_si256
-  #else
-          _mm256_load_si256
-  #endif
-
-          (&in[i * 4 + 2]),
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-          _mm256_loadu_si256
-  #else
-          _mm256_load_si256
-  #endif
-
-          (&in[i * 4 + 3])), kWeightScaleBits);
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-        _mm256_storeu_si256
-  #else
-        _mm256_store_si256
-  #endif
-
-          (&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
+            _mm256_loadA_si256(&in[i * 4 + 2]),
+            _mm256_loadA_si256(&in[i * 4 + 3])), kWeightScaleBits);
+        _mm256_storeA_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
             _mm256_packs_epi16(words0, words1), kZero), kOffsets));
       }
       constexpr IndexType kStart = kNumChunks * kSimdWidth;
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index 972ef3e5..e7ce84f7 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -37,6 +37,27 @@
 #include <arm_neon.h>
 #endif
 
+// HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Otherwise a binary
+//       compiled with older g++ crashes because the output memory is not aligned
+//       even though alignas is specified.
+#if defined(USE_AVX2)
+#if defined(__GNUC__ ) && (__GNUC__ < 9)
+#define _mm256_loadA_si256  _mm256_loadu_si256
+#define _mm256_storeA_si256 _mm256_storeu_si256
+#else
+#define _mm256_loadA_si256  _mm256_load_si256
+#define _mm256_storeA_si256 _mm256_store_si256
+#endif
+#endif
+
+#if defined(USE_AVX512)
+#if defined(__GNUC__ ) && (__GNUC__ < 9)
+#define _mm512_loadA_si512  _mm512_loadu_si512
+#else
+#define _mm512_loadA_si512  _mm512_load_si512
+#endif
+#endif
+
 namespace Eval::NNUE {
 
   // Version of the evaluation file
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 1cfebbe4..cbcc26f3 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -109,37 +109,11 @@ namespace Eval::NNUE {
   #if defined(USE_AVX2)
         auto out = reinterpret_cast<__m256i*>(&output[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m256i sum0 =
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-            // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
-            //       compiled with g++ in MSYS2 crashes here because the output memory is not aligned
-            //       even though alignas is specified.
-            _mm256_loadu_si256
-  #else
-            _mm256_load_si256
-  #endif
-
-            (&reinterpret_cast<const __m256i*>(
-              accumulation[perspectives[p]][0])[j * 2 + 0]);
-          __m256i sum1 =
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-            _mm256_loadu_si256
-  #else
-            _mm256_load_si256
-  #endif
-
-            (&reinterpret_cast<const __m256i*>(
-              accumulation[perspectives[p]][0])[j * 2 + 1]);
-
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-          _mm256_storeu_si256
-  #else
-          _mm256_store_si256
-  #endif
-
-          (&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
+          __m256i sum0 = _mm256_loadA_si256(
+              &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
+          __m256i sum1 = _mm256_loadA_si256(
+            &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
+          _mm256_storeA_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
               _mm256_packs_epi16(sum0, sum1), kZero), kControl));
         }
 
@@ -202,11 +176,7 @@ namespace Eval::NNUE {
           auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
           constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
           for (IndexType j = 0; j < kNumChunks; ++j) {
-  #if defined(__MINGW32__) || defined(__MINGW64__)
-            _mm256_storeu_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadu_si256(&accumulation[j]), column[j]));
-  #else
-            accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
-  #endif
+            _mm256_storeA_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadA_si256(&accumulation[j]), column[j]));
           }
 
   #elif defined(USE_SSE2)

From ad2ad4c65706c18a5383506d361f1f23fc6a26ab Mon Sep 17 00:00:00 2001
From: SFisGOD <jonathandumale@gmail.com>
Date: Mon, 10 Aug 2020 15:39:22 +0800
Subject: [PATCH 54/86] Modify castling extension

Extend castling only if there are few friendly pieces on the castling side.

Inspired by silversolver1's (Rahul Dsilva) test
https://tests.stockfishchess.org/tests/view/5f0fef560640035f9d2978cf

STC:
LLR: 2.94 (-2.94,2.94) {-0.50,1.50}
Total: 7096 W: 947 L: 818 D: 5331
Ptnml(0-2): 32, 604, 2181, 665, 66
https://tests.stockfishchess.org/tests/view/5f309f729081672066537426

LTC:
LLR: 2.96 (-2.94,2.94) {0.25,1.75}
Total: 4712 W: 300 L: 215 D: 4197
Ptnml(0-2): 2, 190, 1895, 259, 10
https://tests.stockfishchess.org/tests/view/5f30a2039081672066537430

closes https://github.com/official-stockfish/Stockfish/pull/2970

Bench: 4094850
---
 src/search.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/search.cpp b/src/search.cpp
index 0a2519b6..3d2bb422 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1131,7 +1131,8 @@ moves_loop: // When in check, search starts from here
           extension = 1;
 
       // Castling extension
-      if (type_of(move) == CASTLING)
+      if (   type_of(move) == CASTLING
+          && popcount(pos.pieces(us) & ~pos.pieces(PAWN) & (to_sq(move) & KingSide ? KingSide : QueenSide)) <= 3)
           extension = 1;
 
       // Late irreversible move extension

From cb0504028e8830dbc71be53cbd701d78c3d562a1 Mon Sep 17 00:00:00 2001
From: sf-x <sf-x@users.noreply.github.com>
Date: Sun, 9 Aug 2020 18:01:18 +0300
Subject: [PATCH 55/86] Makefile rework/cleanup

Makefile targets x86-64-sse42 and x86-64-sse3 are removed; x86-64-sse41
is renamed to x86-64-sse41-popcnt (it did enable popcnt).

Makefile variables sse3, sse42, their associated compilation flags
and code in misc.cpp are removed.

closes https://github.com/official-stockfish/Stockfish/pull/2922

No functional change
---
 src/Makefile | 58 +++-------------------------------------------------
 src/misc.cpp |  6 ------
 2 files changed, 3 insertions(+), 61 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 571172b2..a48e7dcb 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -68,10 +68,8 @@ endif
 # prefetch = yes/no   --- -DUSE_PREFETCH   --- Use prefetch asm-instruction
 # popcnt = yes/no     --- -DUSE_POPCNT     --- Use popcnt asm-instruction
 # sse = yes/no        --- -msse            --- Use Intel Streaming SIMD Extensions
-# sse3 = yes/no       --- -msse3           --- Use Intel Streaming SIMD Extensions 3
 # ssse3 = yes/no      --- -mssse3          --- Use Intel Supplemental Streaming SIMD Extensions 3
 # sse41 = yes/no      --- -msse4.1         --- Use Intel Streaming SIMD Extensions 4.1
-# sse42 = yes/no      --- -msse4.2         --- Use Intel Streaming SIMD Extensions 4.2
 # avx2 = yes/no       --- -mavx2           --- Use Intel Advanced Vector Extensions 2
 # pext = yes/no       --- -DUSE_PEXT       --- Use pext x86_64 asm-instruction
 # avx512 = yes/no     --- -mavx512bw       --- Use Intel Advanced Vector Extensions 512
@@ -89,10 +87,8 @@ bits = 64
 prefetch = no
 popcnt = no
 sse = no
-sse3 = no
 ssse3 = no
 sse41 = no
-sse42 = no
 avx2 = no
 pext = no
 avx512 = no
@@ -127,18 +123,10 @@ ifeq ($(ARCH),x86-64)
 	sse = yes
 endif
 
-ifeq ($(ARCH),x86-64-sse3)
-	arch = x86_64
-	prefetch = yes
-	sse = yes
-	sse3 = yes
-endif
-
 ifeq ($(ARCH),x86-64-sse3-popcnt)
 	arch = x86_64
 	prefetch = yes
 	sse = yes
-	sse3 = yes
 	popcnt = yes
 endif
 
@@ -146,39 +134,25 @@ ifeq ($(ARCH),x86-64-ssse3)
 	arch = x86_64
 	prefetch = yes
 	sse = yes
-	sse3 = yes
 	ssse3 = yes
 endif
 
-ifeq ($(ARCH),x86-64-sse41)
-	arch = x86_64
-	prefetch = yes
-	popcnt = yes
-	sse = yes
-	sse3 = yes
-	ssse3 = yes
-	sse41 = yes
-endif
-
 ifeq ($(ARCH),x86-64-modern)
 	arch = x86_64
 	prefetch = yes
 	popcnt = yes
 	sse = yes
-	sse3 = yes
 	ssse3 = yes
 	sse41 = yes
 endif
 
-ifeq ($(ARCH),x86-64-sse42)
+ifeq ($(ARCH),x86-64-sse41-popcnt)
 	arch = x86_64
 	prefetch = yes
 	popcnt = yes
 	sse = yes
-	sse3 = yes
 	ssse3 = yes
 	sse41 = yes
-	sse42 = yes
 endif
 
 ifeq ($(ARCH),x86-64-avx2)
@@ -186,10 +160,8 @@ ifeq ($(ARCH),x86-64-avx2)
 	prefetch = yes
 	popcnt = yes
 	sse = yes
-	sse3 = yes
 	ssse3 = yes
 	sse41 = yes
-	sse42 = yes
 	avx2 = yes
 endif
 
@@ -198,10 +170,8 @@ ifeq ($(ARCH),x86-64-bmi2)
 	prefetch = yes
 	popcnt = yes
 	sse = yes
-	sse3 = yes
 	ssse3 = yes
 	sse41 = yes
-	sse42 = yes
 	avx2 = yes
 	pext = yes
 endif
@@ -211,10 +181,8 @@ ifeq ($(ARCH),x86-64-avx512)
 	prefetch = yes
 	popcnt = yes
 	sse = yes
-	sse3 = yes
 	ssse3 = yes
 	sse41 = yes
-	sse42 = yes
 	avx2 = yes
 	pext = yes
 	avx512 = yes
@@ -450,13 +418,6 @@ ifeq ($(avx512),yes)
 	endif
 endif
 
-ifeq ($(sse42),yes)
-	CXXFLAGS += -DUSE_SSE42
-	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
-		CXXFLAGS += -msse4.2
-	endif
-endif
-
 ifeq ($(sse41),yes)
 	CXXFLAGS += -DUSE_SSE41
 	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
@@ -471,13 +432,6 @@ ifeq ($(ssse3),yes)
 	endif
 endif
 
-ifeq ($(sse3),yes)
-	CXXFLAGS += -DUSE_SSE3
-	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
-		CXXFLAGS += -msse3
-	endif
-endif
-
 ifeq ($(neon),yes)
 	CXXFLAGS += -DUSE_NEON
 endif
@@ -557,12 +511,10 @@ help:
 	@echo "x86-64-avx512           > x86 64-bit with avx512 support"
 	@echo "x86-64-bmi2             > x86 64-bit with bmi2 support"
 	@echo "x86-64-avx2             > x86 64-bit with avx2 support"
-	@echo "x86-64-sse42            > x86 64-bit with sse42 support"
-	@echo "x86-64-modern           > x86 64-bit with sse41 support (x86-64-sse41)"
-	@echo "x86-64-sse41            > x86 64-bit with sse41 support"
+	@echo "x86-64-sse41-popcnt     > x86 64-bit with sse41 and popcnt support"
+	@echo "x86-64-modern           > the same as previous (x86-64-sse41-popcnt)"
 	@echo "x86-64-ssse3            > x86 64-bit with ssse3 support"
 	@echo "x86-64-sse3-popcnt      > x86 64-bit with sse3 and popcnt support"
-	@echo "x86-64-sse3             > x86 64-bit with sse3 support"
 	@echo "x86-64                  > x86 64-bit generic"
 	@echo "x86-32                  > x86 32-bit (also enables SSE)"
 	@echo "x86-32-old              > x86 32-bit fall back for old hardware"
@@ -669,10 +621,8 @@ config-sanity:
 	@echo "prefetch: '$(prefetch)'"
 	@echo "popcnt: '$(popcnt)'"
 	@echo "sse: '$(sse)'"
-	@echo "sse3: '$(sse3)'"
 	@echo "ssse3: '$(ssse3)'"
 	@echo "sse41: '$(sse41)'"
-	@echo "sse42: '$(sse42)'"
 	@echo "avx2: '$(avx2)'"
 	@echo "pext: '$(pext)'"
 	@echo "avx512: '$(avx512)'"
@@ -695,10 +645,8 @@ config-sanity:
 	@test "$(prefetch)" = "yes" || test "$(prefetch)" = "no"
 	@test "$(popcnt)" = "yes" || test "$(popcnt)" = "no"
 	@test "$(sse)" = "yes" || test "$(sse)" = "no"
-	@test "$(sse3)" = "yes" || test "$(sse3)" = "no"
 	@test "$(ssse3)" = "yes" || test "$(ssse3)" = "no"
 	@test "$(sse41)" = "yes" || test "$(sse41)" = "no"
-	@test "$(sse42)" = "yes" || test "$(sse42)" = "no"
 	@test "$(avx2)" = "yes" || test "$(avx2)" = "no"
 	@test "$(pext)" = "yes" || test "$(pext)" = "no"
 	@test "$(avx512)" = "yes" || test "$(avx512)" = "no"
diff --git a/src/misc.cpp b/src/misc.cpp
index bdd7bccb..5061ae13 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -220,17 +220,11 @@ const std::string compiler_info() {
   #if defined(USE_AVX2)
     compiler += " AVX2";
   #endif
-  #if defined(USE_SSE42)
-    compiler += " SSE42";
-  #endif
   #if defined(USE_SSE41)
     compiler += " SSE41";
   #endif
   #if defined(USE_SSSE3)
     compiler += " SSSE3";
-  #endif
-  #if defined(USE_SSE3)
-    compiler += " SSE3";
   #endif
     compiler += (HasPext ? " BMI2" : "");
     compiler += (HasPopCnt ? " POPCNT" : "");

From f948cd008d3a289ebbadc463271f84888e8069ba Mon Sep 17 00:00:00 2001
From: mstembera <MissingEmail@email>
Date: Sun, 9 Aug 2020 16:23:33 -0700
Subject: [PATCH 56/86] Cleanup and optimize SSE/AVX code

AVX512 +4% faster
AVX2 +1% faster
SSSE3 +5% faster

passed non-regression STC:
STC https://tests.stockfishchess.org/tests/view/5f31249f90816720665374f6
LLR: 2.96 (-2.94,2.94) {-1.50,0.50}
Total: 17576 W: 2344 L: 2245 D: 12987
Ptnml(0-2): 127, 1570, 5292, 1675, 124

closes https://github.com/official-stockfish/Stockfish/pull/2962

No functional change
---
 src/nnue/layers/affine_transform.h  | 46 +++++++++++++++--------------
 src/nnue/nnue_accumulator.h         |  2 +-
 src/nnue/nnue_common.h              |  6 ++--
 src/nnue/nnue_feature_transformer.h | 21 +++++++------
 4 files changed, 41 insertions(+), 34 deletions(-)

diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index 20ec2f12..89cfaad7 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -108,24 +108,19 @@ namespace Eval::NNUE::Layers {
             product = _mm512_madd_epi16(product, kOnes);
             sum = _mm512_add_epi32(sum, product);
         }
-        output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];
 
         // Note: Changing kMaxSimdWidth from 32 to 64 breaks loading existing networks.
         // As a result kPaddedInputDimensions may not be an even multiple of 64(512bit)
         // and we have to do one more 256bit chunk.
         if (kPaddedInputDimensions != kNumChunks * kSimdWidth * 2)
         {
-            const auto iv_256  = reinterpret_cast<const __m256i*>(input);
-            const auto row_256 = reinterpret_cast<const __m256i*>(&weights_[offset]);
-            int j = kNumChunks * 2;
-            __m256i sum256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
-            sum256 = _mm256_madd_epi16(sum256, _mm256_set1_epi16(1));
-            sum256 = _mm256_hadd_epi32(sum256, sum256);
-            sum256 = _mm256_hadd_epi32(sum256, sum256);
-            const __m128i lo = _mm256_extracti128_si256(sum256, 0);
-            const __m128i hi = _mm256_extracti128_si256(sum256, 1);
-            output[i] += _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi);
+            const auto iv256  = reinterpret_cast<const __m256i*>(&input_vector[kNumChunks]);
+            const auto row256 = reinterpret_cast<const __m256i*>(&row[kNumChunks]);
+            __m256i product256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
+            product256 = _mm256_madd_epi16(product256, _mm256_set1_epi16(1));
+            sum = _mm512_add_epi32(sum, _mm512_zextsi256_si512(product256));
         }
+        output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];
 
   #elif defined(USE_AVX2)
         __m256i sum = _mm256_setzero_si256();
@@ -135,23 +130,30 @@ namespace Eval::NNUE::Layers {
           product = _mm256_madd_epi16(product, kOnes);
           sum = _mm256_add_epi32(sum, product);
         }
-        sum = _mm256_hadd_epi32(sum, sum);
-        sum = _mm256_hadd_epi32(sum, sum);
-        const __m128i lo = _mm256_extracti128_si256(sum, 0);
-        const __m128i hi = _mm256_extracti128_si256(sum, 1);
-        output[i] = _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi) + biases_[i];
+        __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
+        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
+        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB));
+        output[i] = _mm_cvtsi128_si32(sum128) + biases_[i];
 
   #elif defined(USE_SSSE3)
-        __m128i sum = _mm_cvtsi32_si128(biases_[i]);
+        __m128i sum = _mm_setzero_si128();
         const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
+        for (int j = 0; j < (int)kNumChunks - 1; j += 2) {
+          __m128i product0 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
+          product0 = _mm_madd_epi16(product0, kOnes);
+          sum = _mm_add_epi32(sum, product0);
+          __m128i product1 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j+1]), _mm_load_si128(&row[j+1]));
+          product1 = _mm_madd_epi16(product1, kOnes);
+          sum = _mm_add_epi32(sum, product1);
+        }
+        if (kNumChunks & 0x1) {
+          __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[kNumChunks-1]), _mm_load_si128(&row[kNumChunks-1]));
           product = _mm_madd_epi16(product, kOnes);
           sum = _mm_add_epi32(sum, product);
         }
-        sum = _mm_hadd_epi32(sum, sum);
-        sum = _mm_hadd_epi32(sum, sum);
-        output[i] = _mm_cvtsi128_si32(sum);
+        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC
+        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
+        output[i] = _mm_cvtsi128_si32(sum) + biases_[i];
 
   #elif defined(USE_NEON)
         int32x4_t sum = {biases_[i]};
diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h
index 2a354a3c..69dfaad2 100644
--- a/src/nnue/nnue_accumulator.h
+++ b/src/nnue/nnue_accumulator.h
@@ -26,7 +26,7 @@
 namespace Eval::NNUE {
 
   // Class that holds the result of affine transformation of input features
-  struct alignas(32) Accumulator {
+  struct alignas(kCacheLineSize) Accumulator {
     std::int16_t
         accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];
     Value score;
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index e7ce84f7..ff33cc79 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -52,9 +52,11 @@
 
 #if defined(USE_AVX512)
 #if defined(__GNUC__ ) && (__GNUC__ < 9)
-#define _mm512_loadA_si512  _mm512_loadu_si512
+#define _mm512_loadA_si512   _mm512_loadu_si512
+#define _mm512_storeA_si512  _mm512_storeu_si512
 #else
-#define _mm512_loadA_si512  _mm512_load_si512
+#define _mm512_loadA_si512   _mm512_load_si512
+#define _mm512_storeA_si512  _mm512_store_si512
 #endif
 #endif
 
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index cbcc26f3..3818e444 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -169,38 +169,41 @@ namespace Eval::NNUE {
                    kHalfDimensions * sizeof(BiasType));
         for (const auto index : active_indices[perspective]) {
           const IndexType offset = kHalfDimensions * index;
+  #if defined(USE_AVX512)
+          auto accumulation = reinterpret_cast<__m512i*>(
+              &accumulator.accumulation[perspective][i][0]);
+          auto column = reinterpret_cast<const __m512i*>(&weights_[offset]);
+          constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+          for (IndexType j = 0; j < kNumChunks; ++j)
+            _mm512_storeA_si512(&accumulation[j], _mm512_add_epi16(_mm512_loadA_si512(&accumulation[j]), column[j]));
 
-  #if defined(USE_AVX2)
+  #elif defined(USE_AVX2)
           auto accumulation = reinterpret_cast<__m256i*>(
               &accumulator.accumulation[perspective][i][0]);
           auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
           constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j) {
+          for (IndexType j = 0; j < kNumChunks; ++j)
             _mm256_storeA_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadA_si256(&accumulation[j]), column[j]));
-          }
 
   #elif defined(USE_SSE2)
           auto accumulation = reinterpret_cast<__m128i*>(
               &accumulator.accumulation[perspective][i][0]);
           auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
           constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j) {
+          for (IndexType j = 0; j < kNumChunks; ++j)
             accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
-          }
 
   #elif defined(USE_NEON)
           auto accumulation = reinterpret_cast<int16x8_t*>(
               &accumulator.accumulation[perspective][i][0]);
           auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
           constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-          for (IndexType j = 0; j < kNumChunks; ++j) {
+          for (IndexType j = 0; j < kNumChunks; ++j)
             accumulation[j] = vaddq_s16(accumulation[j], column[j]);
-          }
 
   #else
-          for (IndexType j = 0; j < kHalfDimensions; ++j) {
+          for (IndexType j = 0; j < kHalfDimensions; ++j)
             accumulator.accumulation[perspective][i][j] += weights_[offset + j];
-          }
   #endif
 
         }

From 21df37d7fd4dcc9b4a9c319382cc43685c0259c8 Mon Sep 17 00:00:00 2001
From: Fanael Linithien <fanael4@gmail.com>
Date: Sun, 9 Aug 2020 16:20:45 +0200
Subject: [PATCH 57/86] Provide vectorized NNUE code for SSE2 and MMX targets

This patch allows old x86 CPUs, from AMD K8 (which the x86-64 baseline
targets) all the way down to the Pentium MMX, to benefit from NNUE with
a performance hit versus hand-written eval comparable to that seen on
more modern processors.

NPS of the bench with NNUE enabled on a Pentium III 1.13 GHz (using the
MMX code):
  master: 38951
  this patch: 80586

NPS of the bench with NNUE enabled using baseline x86-64 arch, which is
how linux distros are likely to package stockfish, on a modern CPU
(using the SSE2 code):
  master: 882584
  this patch: 1203945

closes https://github.com/official-stockfish/Stockfish/pull/2956

No functional change.
---
 AUTHORS                             |  1 +
 src/Makefile                        | 13 ++++++-
 src/misc.cpp                        |  3 ++
 src/nnue/layers/affine_transform.h  | 59 ++++++++++++++++++++++++++++-
 src/nnue/layers/clipped_relu.h      | 20 +++++++++-
 src/nnue/nnue_common.h              |  6 +++
 src/nnue/nnue_feature_transformer.h | 54 +++++++++++++++++++++++++-
 7 files changed, 150 insertions(+), 6 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index 21ef3e50..41b89705 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -53,6 +53,7 @@ Ernesto Gatti
 Linmiao Xu (linrock)
 Fabian Beuke (madnight)
 Fabian Fichter (ianfab)
+Fanael Linithien (Fanael)
 fanon
 Fauzi Akram Dabat (FauziAkram)
 Felix Wittmann
diff --git a/src/Makefile b/src/Makefile
index a48e7dcb..3d84f482 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -86,6 +86,7 @@ sanitize = no
 bits = 64
 prefetch = no
 popcnt = no
+mmx = no
 sse = no
 ssse3 = no
 sse41 = no
@@ -110,6 +111,7 @@ ifeq ($(ARCH),x86-32)
 	arch = i386
 	bits = 32
 	prefetch = yes
+	mmx = yes
 	sse = yes
 endif
 
@@ -250,7 +252,7 @@ ifeq ($(COMP),gcc)
 	ifneq ($(KERNEL),Darwin)
 	   LDFLAGS += -Wl,--no-as-needed
 	endif
-	
+
 	gccversion = $(shell $(CXX) --version)
 	gccisclang = $(findstring clang,$(gccversion))
 endif
@@ -432,6 +434,13 @@ ifeq ($(ssse3),yes)
 	endif
 endif
 
+ifeq ($(mmx),yes)
+	CXXFLAGS += -DUSE_MMX
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
+		CXXFLAGS += -mmmx
+	endif
+endif
+
 ifeq ($(neon),yes)
 	CXXFLAGS += -DUSE_NEON
 endif
@@ -516,7 +525,7 @@ help:
 	@echo "x86-64-ssse3            > x86 64-bit with ssse3 support"
 	@echo "x86-64-sse3-popcnt      > x86 64-bit with sse3 and popcnt support"
 	@echo "x86-64                  > x86 64-bit generic"
-	@echo "x86-32                  > x86 32-bit (also enables SSE)"
+	@echo "x86-32                  > x86 32-bit (also enables MMX and SSE)"
 	@echo "x86-32-old              > x86 32-bit fall back for old hardware"
 	@echo "ppc-64                  > PPC 64-bit"
 	@echo "ppc-32                  > PPC 32-bit"
diff --git a/src/misc.cpp b/src/misc.cpp
index 5061ae13..401a6505 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -228,6 +228,9 @@ const std::string compiler_info() {
   #endif
     compiler += (HasPext ? " BMI2" : "");
     compiler += (HasPopCnt ? " POPCNT" : "");
+  #if defined(USE_MMX)
+    compiler += " MMX";
+  #endif
   #if !defined(NDEBUG)
     compiler += " DEBUG";
   #endif
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index 89cfaad7..985ee71a 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -87,11 +87,20 @@ namespace Eval::NNUE::Layers {
       const __m256i kOnes = _mm256_set1_epi16(1);
       const auto input_vector = reinterpret_cast<const __m256i*>(input);
 
-  #elif defined(USE_SSSE3)
+  #elif defined(USE_SSE2)
       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+  #ifndef USE_SSSE3
+      const __m128i kZeros = _mm_setzero_si128();
+  #else
       const __m128i kOnes = _mm_set1_epi16(1);
+  #endif
       const auto input_vector = reinterpret_cast<const __m128i*>(input);
 
+  #elif defined(USE_MMX)
+      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+      const __m64 kZeros = _mm_setzero_si64();
+      const auto input_vector = reinterpret_cast<const __m64*>(input);
+
   #elif defined(USE_NEON)
       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
       const auto input_vector = reinterpret_cast<const int8x8_t*>(input);
@@ -155,6 +164,51 @@ namespace Eval::NNUE::Layers {
         sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
         output[i] = _mm_cvtsi128_si32(sum) + biases_[i];
 
+  #elif defined(USE_SSE2)
+        __m128i sum_lo = _mm_cvtsi32_si128(biases_[i]);
+        __m128i sum_hi = kZeros;
+        const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m128i row_j = _mm_load_si128(&row[j]);
+          __m128i input_j = _mm_load_si128(&input_vector[j]);
+          __m128i row_signs = _mm_cmpgt_epi8(kZeros, row_j);
+          __m128i extended_row_lo = _mm_unpacklo_epi8(row_j, row_signs);
+          __m128i extended_row_hi = _mm_unpackhi_epi8(row_j, row_signs);
+          __m128i extended_input_lo = _mm_unpacklo_epi8(input_j, kZeros);
+          __m128i extended_input_hi = _mm_unpackhi_epi8(input_j, kZeros);
+          __m128i product_lo = _mm_madd_epi16(extended_row_lo, extended_input_lo);
+          __m128i product_hi = _mm_madd_epi16(extended_row_hi, extended_input_hi);
+          sum_lo = _mm_add_epi32(sum_lo, product_lo);
+          sum_hi = _mm_add_epi32(sum_hi, product_hi);
+        }
+        __m128i sum = _mm_add_epi32(sum_lo, sum_hi);
+        __m128i sum_high_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2));
+        sum = _mm_add_epi32(sum, sum_high_64);
+        __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2));
+        sum = _mm_add_epi32(sum, sum_second_32);
+        output[i] = _mm_cvtsi128_si32(sum);
+
+  #elif defined(USE_MMX)
+        __m64 sum_lo = _mm_cvtsi32_si64(biases_[i]);
+        __m64 sum_hi = kZeros;
+        const auto row = reinterpret_cast<const __m64*>(&weights_[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m64 row_j = row[j];
+          __m64 input_j = input_vector[j];
+          __m64 row_signs = _mm_cmpgt_pi8(kZeros, row_j);
+          __m64 extended_row_lo = _mm_unpacklo_pi8(row_j, row_signs);
+          __m64 extended_row_hi = _mm_unpackhi_pi8(row_j, row_signs);
+          __m64 extended_input_lo = _mm_unpacklo_pi8(input_j, kZeros);
+          __m64 extended_input_hi = _mm_unpackhi_pi8(input_j, kZeros);
+          __m64 product_lo = _mm_madd_pi16(extended_row_lo, extended_input_lo);
+          __m64 product_hi = _mm_madd_pi16(extended_row_hi, extended_input_hi);
+          sum_lo = _mm_add_pi32(sum_lo, product_lo);
+          sum_hi = _mm_add_pi32(sum_hi, product_hi);
+        }
+        __m64 sum = _mm_add_pi32(sum_lo, sum_hi);
+        sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum));
+        output[i] = _mm_cvtsi64_si32(sum);
+
   #elif defined(USE_NEON)
         int32x4_t sum = {biases_[i]};
         const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]);
@@ -174,6 +228,9 @@ namespace Eval::NNUE::Layers {
   #endif
 
       }
+  #if defined(USE_MMX)
+      _mm_empty();
+  #endif
       return output;
     }
 
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h
index 13196ec2..44d8a7de 100644
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -84,7 +84,7 @@ namespace Eval::NNUE::Layers {
       }
       constexpr IndexType kStart = kNumChunks * kSimdWidth;
 
-  #elif defined(USE_SSSE3)
+  #elif defined(USE_SSE2)
       constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
 
   #ifdef USE_SSE41
@@ -115,6 +115,24 @@ namespace Eval::NNUE::Layers {
       }
       constexpr IndexType kStart = kNumChunks * kSimdWidth;
 
+  #elif defined(USE_MMX)
+      constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
+      const __m64 k0x80s = _mm_set1_pi8(-128);
+      const auto in = reinterpret_cast<const __m64*>(input);
+      const auto out = reinterpret_cast<__m64*>(output);
+      for (IndexType i = 0; i < kNumChunks; ++i) {
+        const __m64 words0 = _mm_srai_pi16(
+            _mm_packs_pi32(in[i * 4 + 0], in[i * 4 + 1]),
+            kWeightScaleBits);
+        const __m64 words1 = _mm_srai_pi16(
+            _mm_packs_pi32(in[i * 4 + 2], in[i * 4 + 3]),
+            kWeightScaleBits);
+        const __m64 packedbytes = _mm_packs_pi16(words0, words1);
+        out[i] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
+      }
+      _mm_empty();
+      constexpr IndexType kStart = kNumChunks * kSimdWidth;
+
   #elif defined(USE_NEON)
       constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2);
       const int8x8_t kZero = {0};
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index ff33cc79..cb1251c5 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -33,6 +33,9 @@
 #elif defined(USE_SSE2)
 #include <emmintrin.h>
 
+#elif defined(USE_MMX)
+#include <mmintrin.h>
+
 #elif defined(USE_NEON)
 #include <arm_neon.h>
 #endif
@@ -79,6 +82,9 @@ namespace Eval::NNUE {
   #elif defined(USE_SSE2)
   constexpr std::size_t kSimdWidth = 16;
 
+  #elif defined(USE_MMX)
+  constexpr std::size_t kSimdWidth = 8;
+
   #elif defined(USE_NEON)
   constexpr std::size_t kSimdWidth = 16;
   #endif
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 3818e444..40f2603d 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -88,7 +88,7 @@ namespace Eval::NNUE {
       constexpr int kControl = 0b11011000;
       const __m256i kZero = _mm256_setzero_si256();
 
-  #elif defined(USE_SSSE3)
+  #elif defined(USE_SSE2)
       constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
 
   #ifdef USE_SSE41
@@ -97,6 +97,10 @@ namespace Eval::NNUE {
       const __m128i k0x80s = _mm_set1_epi8(-128);
   #endif
 
+  #elif defined(USE_MMX)
+      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+      const __m64 k0x80s = _mm_set1_pi8(-128);
+
   #elif defined(USE_NEON)
       constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
       const int8x8_t kZero = {0};
@@ -117,7 +121,7 @@ namespace Eval::NNUE {
               _mm256_packs_epi16(sum0, sum1), kZero), kControl));
         }
 
-  #elif defined(USE_SSSE3)
+  #elif defined(USE_SSE2)
         auto out = reinterpret_cast<__m128i*>(&output[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
           __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
@@ -137,6 +141,17 @@ namespace Eval::NNUE {
           );
         }
 
+  #elif defined(USE_MMX)
+        auto out = reinterpret_cast<__m64*>(&output[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m64 sum0 = *(&reinterpret_cast<const __m64*>(
+              accumulation[perspectives[p]][0])[j * 2 + 0]);
+          __m64 sum1 = *(&reinterpret_cast<const __m64*>(
+              accumulation[perspectives[p]][0])[j * 2 + 1]);
+          const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
+          out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
+        }
+
   #elif defined(USE_NEON)
         const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
@@ -154,6 +169,9 @@ namespace Eval::NNUE {
   #endif
 
       }
+  #if defined(USE_MMX)
+      _mm_empty();
+  #endif
     }
 
    private:
@@ -193,6 +211,15 @@ namespace Eval::NNUE {
           for (IndexType j = 0; j < kNumChunks; ++j)
             accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
 
+  #elif defined(USE_MMX)
+          auto accumulation = reinterpret_cast<__m64*>(
+              &accumulator.accumulation[perspective][i][0]);
+          auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
+          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+          for (IndexType j = 0; j < kNumChunks; ++j) {
+            accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
+          }
+
   #elif defined(USE_NEON)
           auto accumulation = reinterpret_cast<int16x8_t*>(
               &accumulator.accumulation[perspective][i][0]);
@@ -208,6 +235,9 @@ namespace Eval::NNUE {
 
         }
       }
+  #if defined(USE_MMX)
+      _mm_empty();
+  #endif
 
       accumulator.computed_accumulation = true;
       accumulator.computed_score = false;
@@ -234,6 +264,11 @@ namespace Eval::NNUE {
         auto accumulation = reinterpret_cast<__m128i*>(
             &accumulator.accumulation[perspective][i][0]);
 
+  #elif defined(USE_MMX)
+        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+        auto accumulation = reinterpret_cast<__m64*>(
+            &accumulator.accumulation[perspective][i][0]);
+
   #elif defined(USE_NEON)
         constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
         auto accumulation = reinterpret_cast<int16x8_t*>(
@@ -263,6 +298,12 @@ namespace Eval::NNUE {
               accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]);
             }
 
+  #elif defined(USE_MMX)
+            auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = _mm_sub_pi16(accumulation[j], column[j]);
+            }
+
   #elif defined(USE_NEON)
             auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
             for (IndexType j = 0; j < kNumChunks; ++j) {
@@ -294,6 +335,12 @@ namespace Eval::NNUE {
               accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
             }
 
+  #elif defined(USE_MMX)
+            auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
+            }
+
   #elif defined(USE_NEON)
             auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
             for (IndexType j = 0; j < kNumChunks; ++j) {
@@ -310,6 +357,9 @@ namespace Eval::NNUE {
           }
         }
       }
+  #if defined(USE_MMX)
+      _mm_empty();
+  #endif
 
       accumulator.computed_accumulation = true;
       accumulator.computed_score = false;

From 220ef1d27d9cd006a30b07ab726999c8181d10f0 Mon Sep 17 00:00:00 2001
From: Unai Corzo <corzounai@gmail.com>
Date: Mon, 10 Aug 2020 15:38:44 +0200
Subject: [PATCH 58/86] Assorted search parameter tune

STC https://tests.stockfishchess.org/tests/view/5f31219090816720665374ec
LLR: 2.96 (-2.94,2.94) {-0.50,1.50}
Total: 3376 W: 487 L: 359 D: 2530
Ptnml(0-2): 17, 253, 1042, 337, 39

LTC https://tests.stockfishchess.org/tests/view/5f3127f79081672066537502
LLR: 2.93 (-2.94,2.94) {0.25,1.75}
Total: 8360 W: 581 L: 475 D: 7304
Ptnml(0-2): 11, 407, 3238, 513, 11

closes https://github.com/official-stockfish/Stockfish/pull/2971

bench: 4733874
---
 src/search.cpp | 60 +++++++++++++++++++++++++-------------------------
 1 file changed, 30 insertions(+), 30 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index 3d2bb422..676427f7 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -63,9 +63,9 @@ namespace {
   constexpr uint64_t TtHitAverageResolution = 1024;
 
   // Razor and futility margins
-  constexpr int RazorMargin = 527;
+  constexpr int RazorMargin = 510;
   Value futility_margin(Depth d, bool improving) {
-    return Value(227 * (d - improving));
+    return Value(223 * (d - improving));
   }
 
   // Reductions lookup table, initialized at startup
@@ -73,7 +73,7 @@ namespace {
 
   Depth reduction(bool i, Depth d, int mn) {
     int r = Reductions[d] * Reductions[mn];
-    return (r + 570) / 1024 + (!i && r > 1018);
+    return (r + 509) / 1024 + (!i && r > 894);
   }
 
   constexpr int futility_move_count(bool improving, Depth depth) {
@@ -82,7 +82,7 @@ namespace {
 
   // History and stats update bonus, based on depth
   int stat_bonus(Depth d) {
-    return d > 15 ? 27 : 17 * d * d + 133 * d - 134;
+    return d > 13 ? 29 : 17 * d * d + 134 * d - 134;
   }
 
   // Add a small random component to draw evaluations to avoid 3fold-blindness
@@ -192,7 +192,7 @@ namespace {
 void Search::init() {
 
   for (int i = 1; i < MAX_MOVES; ++i)
-      Reductions[i] = int((24.8 + std::log(Threads.size())) * std::log(i));
+      Reductions[i] = int((22.0 + std::log(Threads.size())) * std::log(i));
 }
 
 
@@ -403,12 +403,12 @@ void Thread::search() {
           if (rootDepth >= 4)
           {
               Value prev = rootMoves[pvIdx].previousScore;
-              delta = Value(19);
+              delta = Value(17);
               alpha = std::max(prev - delta,-VALUE_INFINITE);
               beta  = std::min(prev + delta, VALUE_INFINITE);
 
               // Adjust contempt based on root move's previousScore (dynamic contempt)
-              int dct = ct + (110 - ct / 2) * prev / (abs(prev) + 140);
+              int dct = ct + (105 - ct / 2) * prev / (abs(prev) + 149);
 
               contempt = (us == WHITE ?  make_score(dct, dct / 2)
                                       : -make_score(dct, dct / 2));
@@ -506,13 +506,13 @@ void Thread::search() {
           && !Threads.stop
           && !mainThread->stopOnPonderhit)
       {
-          double fallingEval = (296 + 6 * (mainThread->bestPreviousScore - bestValue)
-                                    + 6 * (mainThread->iterValue[iterIdx] - bestValue)) / 725.0;
+          double fallingEval = (318 + 6 * (mainThread->bestPreviousScore - bestValue)
+                                    + 6 * (mainThread->iterValue[iterIdx] - bestValue)) / 825.0;
           fallingEval = Utility::clamp(fallingEval, 0.5, 1.5);
 
           // If the bestMove is stable over several iterations, reduce time accordingly
-          timeReduction = lastBestMoveDepth + 10 < completedDepth ? 1.92 : 0.95;
-          double reduction = (1.47 + mainThread->previousTimeReduction) / (2.22 * timeReduction);
+          timeReduction = lastBestMoveDepth + 9 < completedDepth ? 1.92 : 0.95;
+          double reduction = (1.47 + mainThread->previousTimeReduction) / (2.32 * timeReduction);
 
           // Use part of the gained time from a previous stable move for the current move
           for (Thread* th : Threads)
@@ -537,7 +537,7 @@ void Thread::search() {
           }
           else if (   Threads.increaseDepth
                    && !mainThread->ponder
-                   && Time.elapsed() > totalTime * 0.56)
+                   && Time.elapsed() > totalTime * 0.58)
                    Threads.increaseDepth = false;
           else
                    Threads.increaseDepth = true;
@@ -824,10 +824,10 @@ namespace {
     // Step 9. Null move search with verification search (~40 Elo)
     if (   !PvNode
         && (ss-1)->currentMove != MOVE_NULL
-        && (ss-1)->statScore < 23824
+        && (ss-1)->statScore < 22977
         &&  eval >= beta
         &&  eval >= ss->staticEval
-        &&  ss->staticEval >= beta - 28 * depth - 28 * improving + 94 * ttPv + 200
+        &&  ss->staticEval >= beta - 30 * depth - 28 * improving + 84 * ttPv + 182
         && !excludedMove
         &&  pos.non_pawn_material(us)
         && (ss->ply >= thisThread->nmpMinPly || us != thisThread->nmpColor))
@@ -835,7 +835,7 @@ namespace {
         assert(eval - beta >= 0);
 
         // Null move dynamic reduction based on depth and value
-        Depth R = (737 + 77 * depth) / 246 + std::min(int(eval - beta) / 192, 3);
+        Depth R = (817 + 71 * depth) / 213 + std::min(int(eval - beta) / 192, 3);
 
         ss->currentMove = MOVE_NULL;
         ss->continuationHistory = &thisThread->continuationHistory[0][0][NO_PIECE][0];
@@ -1028,17 +1028,17 @@ moves_loop: // When in check, search starts from here
                   continue;
 
               // Futility pruning: parent node (~5 Elo)
-              if (   lmrDepth < 8
+              if (   lmrDepth < 7
                   && !ss->inCheck
-                  && ss->staticEval + 284 + 188 * lmrDepth <= alpha
+                  && ss->staticEval + 283 + 170 * lmrDepth <= alpha
                   &&  (*contHist[0])[movedPiece][to_sq(move)]
                     + (*contHist[1])[movedPiece][to_sq(move)]
                     + (*contHist[3])[movedPiece][to_sq(move)]
-                    + (*contHist[5])[movedPiece][to_sq(move)] / 2 < 28388)
+                    + (*contHist[5])[movedPiece][to_sq(move)] / 2 < 27376)
                   continue;
 
               // Prune moves with negative SEE (~20 Elo)
-              if (!pos.see_ge(move, Value(-(29 - std::min(lmrDepth, 17)) * lmrDepth * lmrDepth)))
+              if (!pos.see_ge(move, Value(-(29 - std::min(lmrDepth, 18)) * lmrDepth * lmrDepth)))
                   continue;
           }
           else
@@ -1055,12 +1055,12 @@ moves_loop: // When in check, search starts from here
                   && !(PvNode && abs(bestValue) < 2)
                   && PieceValue[MG][type_of(movedPiece)] >= PieceValue[MG][type_of(pos.piece_on(to_sq(move)))]
                   && !ss->inCheck
-                  && ss->staticEval + 178 + 261 * lmrDepth
+                  && ss->staticEval + 169 + 244 * lmrDepth
                      + PieceValue[MG][type_of(pos.piece_on(to_sq(move)))] <= alpha)
                   continue;
 
               // See based pruning
-              if (!pos.see_ge(move, Value(-202) * depth)) // (~25 Elo)
+              if (!pos.see_ge(move, Value(-221) * depth)) // (~25 Elo)
                   continue;
           }
       }
@@ -1166,7 +1166,7 @@ moves_loop: // When in check, search starts from here
               || moveCountPruning
               || ss->staticEval + PieceValue[EG][pos.captured_piece()] <= alpha
               || cutNode
-              || thisThread->ttHitAverage < 415 * TtHitAverageResolution * TtHitAverageWindow / 1024))
+              || thisThread->ttHitAverage < 427 * TtHitAverageResolution * TtHitAverageWindow / 1024))
       {
           Depth r = reduction(improving, depth, moveCount);
 
@@ -1178,7 +1178,7 @@ moves_loop: // When in check, search starts from here
               r--;
 
           // Decrease reduction if the ttHit running average is large
-          if (thisThread->ttHitAverage > 473 * TtHitAverageResolution * TtHitAverageWindow / 1024)
+          if (thisThread->ttHitAverage > 509 * TtHitAverageResolution * TtHitAverageWindow / 1024)
               r--;
 
           // Reduction if other threads are searching this position
@@ -1221,17 +1221,17 @@ moves_loop: // When in check, search starts from here
                              + (*contHist[0])[movedPiece][to_sq(move)]
                              + (*contHist[1])[movedPiece][to_sq(move)]
                              + (*contHist[3])[movedPiece][to_sq(move)]
-                             - 4826;
+                             - 5287;
 
               // Decrease/increase reduction by comparing opponent's stat score (~10 Elo)
-              if (ss->statScore >= -100 && (ss-1)->statScore < -112)
+              if (ss->statScore >= -106 && (ss-1)->statScore < -104)
                   r--;
 
-              else if ((ss-1)->statScore >= -125 && ss->statScore < -138)
+              else if ((ss-1)->statScore >= -119 && ss->statScore < -140)
                   r++;
 
               // Decrease/increase reduction for moves with a good/bad history (~30 Elo)
-              r -= ss->statScore / 14615;
+              r -= ss->statScore / 14884;
           }
           else
           {
@@ -1241,7 +1241,7 @@ moves_loop: // When in check, search starts from here
 
             // Unless giving check, this capture is likely bad
             if (   !givesCheck
-                && ss->staticEval + PieceValue[EG][pos.captured_piece()] + 211 * depth <= alpha)
+                && ss->staticEval + PieceValue[EG][pos.captured_piece()] + 213 * depth <= alpha)
                 r++;
           }
 
@@ -1503,7 +1503,7 @@ moves_loop: // When in check, search starts from here
         if (PvNode && bestValue > alpha)
             alpha = bestValue;
 
-        futilityBase = bestValue + 141;
+        futilityBase = bestValue + 145;
     }
 
     const PieceToHistory* contHist[] = { (ss-1)->continuationHistory, (ss-2)->continuationHistory,
@@ -1754,7 +1754,7 @@ moves_loop: // When in check, search starts from here
     }
 
     if (depth > 11 && ss->ply < MAX_LPH)
-        thisThread->lowPlyHistory[ss->ply][from_to(move)] << stat_bonus(depth - 6);
+        thisThread->lowPlyHistory[ss->ply][from_to(move)] << stat_bonus(depth - 7);
   }
 
   // When playing with strength handicap, choose best move among a set of RootMoves

From a72cec1ff854a77a92452c2afe2001e05f06e6d4 Mon Sep 17 00:00:00 2001
From: Vizvezdenec <Vizvezdenec@gmail.com>
Date: Sat, 18 Jul 2020 16:30:00 +0300
Subject: [PATCH 59/86] Add comments to probCut code

and rename the variable probcutBeta to probCutBeta

closes https://github.com/official-stockfish/Stockfish/pull/2819

No functional change
---
 src/search.cpp | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index 676427f7..ef47fd22 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -596,7 +596,7 @@ namespace {
     Key posKey;
     Move ttMove, move, excludedMove, bestMove;
     Depth extension, newDepth;
-    Value bestValue, value, ttValue, eval, maxValue, probcutBeta;
+    Value bestValue, value, ttValue, eval, maxValue, probCutBeta;
     bool ttHit, ttPv, formerPv, givesCheck, improving, didLMR, priorCapture;
     bool captureOrPromotion, doFullDepthSearch, moveCountPruning,
          ttCapture, singularQuietLMR;
@@ -871,7 +871,7 @@ namespace {
         }
     }
 
-    probcutBeta = beta + 176 - 49 * improving;
+    probCutBeta = beta + 176 - 49 * improving;
 
     // Step 10. ProbCut (~10 Elo)
     // If we have a good enough capture and a reduced search returns a value
@@ -879,21 +879,27 @@ namespace {
     if (   !PvNode
         &&  depth > 4
         &&  abs(beta) < VALUE_TB_WIN_IN_MAX_PLY
-        && !(   ttHit
-             && tte->depth() >= depth - 3
+        // if value from transposition table is lower than probCutBeta, don't attempt probCut
+        // there and in further interactions with transposition table cutoff depth is set to depth - 3
+        // because probCut search has depth set to depth - 4 but we also do a move before it
+        // so effective depth is equal to depth - 3
+        && !(   ttHit 
+             && tte->depth() >= depth - 3 
              && ttValue != VALUE_NONE
-             && ttValue < probcutBeta))
+             && ttValue < probCutBeta))
     {
+        // if ttMove is a capture and value from transposition table is good enough produce probCut
+        // cutoff without digging into actual probCut search
         if (   ttHit
             && tte->depth() >= depth - 3
             && ttValue != VALUE_NONE
-            && ttValue >= probcutBeta
+            && ttValue >= probCutBeta
             && ttMove
             && pos.capture_or_promotion(ttMove))
-            return probcutBeta;
+            return probCutBeta;
 
-        assert(probcutBeta < VALUE_INFINITE);
-        MovePicker mp(pos, ttMove, probcutBeta - ss->staticEval, &captureHistory);
+        assert(probCutBeta < VALUE_INFINITE);
+        MovePicker mp(pos, ttMove, probCutBeta - ss->staticEval, &captureHistory);
         int probCutCount = 0;
 
         while (   (move = mp.next_move()) != MOVE_NONE
@@ -915,16 +921,17 @@ namespace {
                 pos.do_move(move, st);
 
                 // Perform a preliminary qsearch to verify that the move holds
-                value = -qsearch<NonPV>(pos, ss+1, -probcutBeta, -probcutBeta+1);
+                value = -qsearch<NonPV>(pos, ss+1, -probCutBeta, -probCutBeta+1);
 
                 // If the qsearch held, perform the regular search
-                if (value >= probcutBeta)
-                    value = -search<NonPV>(pos, ss+1, -probcutBeta, -probcutBeta+1, depth - 4, !cutNode);
+                if (value >= probCutBeta)
+                    value = -search<NonPV>(pos, ss+1, -probCutBeta, -probCutBeta+1, depth - 4, !cutNode);
 
                 pos.undo_move(move);
 
-                if (value >= probcutBeta)
+                if (value >= probCutBeta)
                 {
+                    // if transposition table doesn't have equal or more deep info write probCut data into it
                     if ( !(ttHit
                        && tte->depth() >= depth - 3
                        && ttValue != VALUE_NONE))

From 4ab8b0b738fe4ae58588efb421fd7b1643b2ef66 Mon Sep 17 00:00:00 2001
From: Guy Vreuls <guyvreuls@gmail.com>
Date: Tue, 11 Aug 2020 04:38:38 +0200
Subject: [PATCH 60/86] Fix parallel LTO issues on Windows

This adds -save-temps to the linker flags when parallel LTO is used on
MinGW/MSYS.

fixes #2977

closes https://github.com/official-stockfish/Stockfish/pull/2978

No functional change.
---
 src/Makefile | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 3d84f482..fd2618a4 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -472,6 +472,11 @@ ifeq ($(debug), no)
 	ifeq ($(gccisclang),)
 		CXXFLAGS += -flto
 		LDFLAGS += $(CXXFLAGS) -flto=jobserver
+		ifneq ($(findstring MINGW,$(KERNEL)),)
+			LDFLAGS += -save-temps
+		else ifneq ($(findstring MSYS,$(KERNEL)),)
+			LDFLAGS += -save-temps
+		endif
 	else
 		CXXFLAGS += -flto=thin
 		LDFLAGS += $(CXXFLAGS)
@@ -605,7 +610,7 @@ objclean:
 # clean auxiliary profiling files
 profileclean:
 	@rm -rf profdir
-	@rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda
+	@rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda *.s
 	@rm -f stockfish.profdata *.profraw
 
 default:

From 399cddf444666cf1671c5281f7a8e78887b4f400 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Mon, 10 Aug 2020 16:14:17 +0200
Subject: [PATCH 61/86] More aligned_alloc changes to support Android

Move to posix_memalign() on those platforms, in particular Android,
that do not fully support C++17 std::aligned_alloc() (and are not Windows).

see https://github.com/official-stockfish/Stockfish/issues/2860

closes https://github.com/official-stockfish/Stockfish/pull/2973

No functional change
---
 src/misc.cpp | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/misc.cpp b/src/misc.cpp
index 401a6505..fc3746cf 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -51,6 +51,11 @@ typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY);
 #include <sys/mman.h>
 #endif
 
+#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32))
+#define POSIXALIGNEDALLOC
+#include <stdlib.h>
+#endif
+
 #include "misc.h"
 #include "thread.h"
 
@@ -318,8 +323,11 @@ void prefetch(void* addr) {
 ///
 
 void* std_aligned_alloc(size_t alignment, size_t size) {
-#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32))
-  return aligned_alloc(alignment, size);
+#if defined(POSIXALIGNEDALLOC)
+  void *pointer;
+  if(posix_memalign(&pointer, alignment, size) == 0)
+      return pointer;
+  return nullptr;
 #elif (defined(_WIN32) || (defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES)))
   return _mm_malloc(size, alignment);
 #else
@@ -328,7 +336,7 @@ void* std_aligned_alloc(size_t alignment, size_t size) {
 }
 
 void std_aligned_free(void* ptr) {
-#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32))
+#if defined(POSIXALIGNEDALLOC)
   free(ptr);
 #elif (defined(_WIN32) || (defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES)))
   _mm_free(ptr);

From f46c73040c16a078b884825c203feee6b0a8850b Mon Sep 17 00:00:00 2001
From: mstembera <MissingEmail@email>
Date: Mon, 10 Aug 2020 12:52:46 -0700
Subject: [PATCH 62/86] Fix AVX512 build with older compilers

Avoids an intrinsic that is missing in gcc < 10.

For this target, the build might still trigger another gcc bug on Windows,
which is only avoided by an up-to-date gcc 8, 9, or 10, or by using clang.

Fixes https://github.com/official-stockfish/Stockfish/issues/2975

closes https://github.com/official-stockfish/Stockfish/pull/2976

No functional change
---
 src/Makefile                       | 2 +-
 src/nnue/layers/affine_transform.h | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index fd2618a4..e34fbf61 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -416,7 +416,7 @@ endif
 ifeq ($(avx512),yes)
 	CXXFLAGS += -DUSE_AVX512
 	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
-		CXXFLAGS += -mavx512bw
+		CXXFLAGS += -mavx512f -mavx512bw
 	endif
 endif
 
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index 985ee71a..8d2acd18 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -126,8 +126,7 @@ namespace Eval::NNUE::Layers {
             const auto iv256  = reinterpret_cast<const __m256i*>(&input_vector[kNumChunks]);
             const auto row256 = reinterpret_cast<const __m256i*>(&row[kNumChunks]);
             __m256i product256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
-            product256 = _mm256_madd_epi16(product256, _mm256_set1_epi16(1));
-            sum = _mm512_add_epi32(sum, _mm512_zextsi256_si512(product256));
+            sum = _mm512_add_epi32(sum, _mm512_cvtepi16_epi32(product256));
         }
         output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];
 

From ea6220f3813e5b76b444a02905eaf2c556bdb368 Mon Sep 17 00:00:00 2001
From: Guy Vreuls <guyvreuls@gmail.com>
Date: Fri, 7 Aug 2020 17:07:46 +0200
Subject: [PATCH 63/86] This commit enables a mixed bench, to improve CI and
 allow for PGO (profile-build) of the NNUE part of the code.

Joint work gvreuls / vondele

* Download the default NNUE net in AppVeyor
* Download the net in Travis CI with `make net`
* Adjust tests to cover more archs, speedup instrumented testing
* Introduce 'mixed' bench as default, with further options:

classical, NNUE, mixed.

mixed (default) and NNUE require the default net to be present,
which can be obtained with

```
make net
```

Further examples (first is equivalent to `./stockfish bench`):

```
./stockfish bench 16 1 13 default depth mixed
./stockfish bench 16 1 13 default depth classical
./stockfish bench 16 1 13 default depth NNUE
```

The net is now downloaded automatically if needed for `profile-build`
(usual `build` works fine without net present)

PGO gives a nice speedup on fishtest:

passed STC:
LLR: 2.93 (-2.94,2.94) {-0.50,1.50}
Total: 3360 W: 469 L: 343 D: 2548
Ptnml(0-2): 20, 246, 1030, 356, 28
https://tests.stockfishchess.org/tests/view/5f31b5499081672066537569

passed LTC:
LLR: 2.97 (-2.94,2.94) {0.25,1.75}
Total: 8824 W: 609 L: 502 D: 7713
Ptnml(0-2): 8, 430, 3438, 519, 17
https://tests.stockfishchess.org/tests/view/5f31c87b908167206653757c

closes https://github.com/official-stockfish/Stockfish/pull/2931

fixes https://github.com/official-stockfish/Stockfish/issues/2907

requires fishtest updates before commit

Bench: 4290577
---
 .travis.yml           | 27 +++++++++++++++++++++------
 appveyor.yml          | 14 ++++++++++++++
 src/Makefile          |  2 +-
 src/benchmark.cpp     | 13 +++++++++++--
 tests/instrumented.sh |  8 ++++----
 5 files changed, 51 insertions(+), 13 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index d563a1e1..0dd38047 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -43,6 +43,9 @@ before_script:
   - cd src
 
 script:
+  # Download net
+  - make net
+
   # Obtain bench reference from git log
   - git log HEAD | grep "\b[Bb]ench[ :]\+[0-9]\{7\}" | head -n 1 | sed "s/[^0-9]*\([0-9]*\).*/\1/g" > git_sig
   - export benchref=$(cat git_sig)
@@ -55,14 +58,26 @@ script:
   #
   # Verify bench number against various builds
   - export CXXFLAGS="-Werror -D_GLIBCXX_DEBUG"
-  - make clean && make -j2 ARCH=x86-64 optimize=no debug=yes build && ../tests/signature.sh $benchref
+  - make clean && make -j2 ARCH=x86-64-modern optimize=no debug=yes build && ../tests/signature.sh $benchref
+  - export CXXFLAGS="-Werror"
+  - make clean && make -j2 ARCH=x86-64-modern build && ../tests/signature.sh $benchref
+  - make clean && make -j2 ARCH=x86-64-ssse3 build && ../tests/signature.sh $benchref
+  - make clean && make -j2 ARCH=x86-64-sse3-popcnt build && ../tests/signature.sh $benchref
+  - make clean && make -j2 ARCH=x86-64 build && ../tests/signature.sh $benchref
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 optimize=no debug=yes build && ../tests/signature.sh $benchref; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 build && ../tests/signature.sh $benchref; fi
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-old build && ../tests/signature.sh $benchref; fi
+  - if [[ "$TRAVIS_OS_NAME" == "linux" && "$COMP" == "gcc" ]]; then make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref; fi
+
+  # compile only for some more advanced architectures (might not run in travis)
+  - make clean && make -j2 ARCH=x86-64-avx2 build
+  - make clean && make -j2 ARCH=x86-64-bmi2 build
+  # needs gcc 10 to compile
+  - if [[ "$COMPILER" != "g++-8" ]]; then make clean && make -j2 ARCH=x86-64-avx512 build; fi
 
   #
   # Check perft and reproducible search
-  - export CXXFLAGS="-Werror"
-  - make clean && make -j2 ARCH=x86-64 build
+  - make clean && make -j2 ARCH=x86-64-modern build
   - ../tests/perft.sh
   - ../tests/reprosearch.sh
 
@@ -70,11 +85,11 @@ script:
   # Valgrind
   #
   - export CXXFLAGS="-O1 -fno-inline"
-  - if [ -x "$(command -v valgrind )" ]; then make clean && make -j2 ARCH=x86-64 debug=yes optimize=no build > /dev/null && ../tests/instrumented.sh --valgrind; fi
+  - if [ -x "$(command -v valgrind )" ]; then make clean && make -j2 ARCH=x86-64-modern debug=yes optimize=no build > /dev/null && ../tests/instrumented.sh --valgrind; fi
   - if [ -x "$(command -v valgrind )" ]; then ../tests/instrumented.sh --valgrind-thread; fi
 
   #
   # Sanitizer
   #
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64 sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-undefined; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64 sanitize=thread    optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-thread; fi
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-undefined; fi
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-64-modern sanitize=thread    optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-thread; fi
diff --git a/appveyor.yml b/appveyor.yml
index d356ba2f..a3732a23 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -61,6 +61,20 @@ before_build:
 
 build_script:
   - cmake --build . --config %CONFIGURATION% -- /verbosity:minimal
+  - ps: |
+      # Download default NNUE net from fishtest
+      $nnuenet = Get-Content -Path src\ucioption.cpp | Select-String -CaseSensitive -Pattern "Option" | Select-String -CaseSensitive -Pattern "nn-[a-z0-9]{12}.nnue"
+      $dummy = $nnuenet -match "(?<nnuenet>nn-[a-z0-9]{12}.nnue)"
+      $nnuenet = $Matches.nnuenet
+      Write-Host "Default net:" $nnuenet
+      $nnuedownloadurl = "https://tests.stockfishchess.org/api/nn/$nnuenet"
+      $nnuefilepath = "src\${env:CONFIGURATION}\$nnuenet"
+      if (Test-Path -Path $nnuefilepath) {
+            Write-Host "Already available."
+      } else {
+            Write-Host "Downloading $nnuedownloadurl to $nnuefilepath"
+            Invoke-WebRequest -Uri $nnuedownloadurl -OutFile $nnuefilepath
+      }
 
 before_test:
   - cd src/%CONFIGURATION%
diff --git a/src/Makefile b/src/Makefile
index e34fbf61..c00b60b5 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -569,7 +569,7 @@ help:
 build: config-sanity
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) all
 
-profile-build: config-sanity objclean profileclean
+profile-build: config-sanity objclean profileclean net
 	@echo ""
 	@echo "Step 1/4. Building instrumented executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make)
diff --git a/src/benchmark.cpp b/src/benchmark.cpp
index 6041d642..806e9840 100644
--- a/src/benchmark.cpp
+++ b/src/benchmark.cpp
@@ -95,8 +95,9 @@ const vector<string> Defaults = {
 /// setup_bench() builds a list of UCI commands to be run by bench. There
 /// are five parameters: TT size in MB, number of search threads that
 /// should be used, the limit value spent for each position, a file name
-/// where to look for positions in FEN format and the type of the limit:
-/// depth, perft, nodes and movetime (in millisecs).
+/// where to look for positions in FEN format, the type of the limit:
+/// depth, perft, nodes and movetime (in millisecs), and evaluation type
+/// mixed (default), classical, NNUE.
 ///
 /// bench -> search default positions up to depth 13
 /// bench 64 1 15 -> search default positions up to depth 15 (TT = 64MB)
@@ -115,6 +116,7 @@ vector<string> setup_bench(const Position& current, istream& is) {
   string limit     = (is >> token) ? token : "13";
   string fenFile   = (is >> token) ? token : "default";
   string limitType = (is >> token) ? token : "depth";
+  string evalType  = (is >> token) ? token : "mixed";
 
   go = limitType == "eval" ? "eval" : "go " + limitType + " " + limit;
 
@@ -146,13 +148,20 @@ vector<string> setup_bench(const Position& current, istream& is) {
   list.emplace_back("setoption name Hash value " + ttSize);
   list.emplace_back("ucinewgame");
 
+  size_t posCounter = 0;
+
   for (const string& fen : fens)
       if (fen.find("setoption") != string::npos)
           list.emplace_back(fen);
       else
       {
+          if (evalType == "classical" || (evalType == "mixed" && posCounter % 2 == 0))
+              list.emplace_back("setoption name Use NNUE value false");
+          else if (evalType == "NNUE" || (evalType == "mixed" && posCounter % 2 != 0))
+              list.emplace_back("setoption name Use NNUE value true");
           list.emplace_back("position fen " + fen);
           list.emplace_back(go);
+          ++posCounter;
       }
 
   return list;
diff --git a/tests/instrumented.sh b/tests/instrumented.sh
index ae6d5c4b..03ded74a 100755
--- a/tests/instrumented.sh
+++ b/tests/instrumented.sh
@@ -70,7 +70,7 @@ for args in "eval" \
             "go depth 10" \
             "go movetime 1000" \
             "go wtime 8000 btime 8000 winc 500 binc 500" \
-            "bench 128 $threads 10 default depth"
+            "bench 128 $threads 8 default depth"
 do
 
    echo "$prefix $exeprefix ./stockfish $args $postfix"
@@ -80,7 +80,7 @@ done
 
 # more general testing, following an uci protocol exchange
 cat << EOF > game.exp
- set timeout 10
+ set timeout 240
  spawn $exeprefix ./stockfish
 
  send "uci\n"
@@ -98,7 +98,7 @@ cat << EOF > game.exp
  expect "bestmove"
 
  send "position fen 5rk1/1K4p1/8/8/3B4/8/8/8 b - - 0 1\n"
- send "go depth 30\n"
+ send "go depth 20\n"
  expect "bestmove"
 
  send "quit\n"
@@ -121,7 +121,7 @@ cat << EOF > syzygy.exp
  send "uci\n"
  send "setoption name SyzygyPath value ../tests/syzygy/\n"
  expect "info string Found 35 tablebases" {} timeout {exit 1}
- send "bench 128 1 10 default depth\n"
+ send "bench 128 1 8 default depth\n"
  send "quit\n"
  expect eof
 

From ee060464129f8d3af184efa013177a4ef387a394 Mon Sep 17 00:00:00 2001
From: SFisGOD <jonathandumale@gmail.com>
Date: Mon, 10 Aug 2020 21:13:56 +0800
Subject: [PATCH 64/86] Tweak castling extension

Change condition from three friendly pieces to two. This now means that we only extend castling on the king side if there are no other friendly pieces aside from king and rook. For the queen side, we only extend if there is only a rook and another friendly piece or if there is only a single rook and no other friendly piece but this is very rare.

STC:
LLR: 3.20 (-2.94,2.94) {-0.50,1.50}
Total: 31144 W: 4086 L: 3903 D: 23155
Ptnml(0-2): 227, 2843, 9278, 2968, 256
https://tests.stockfishchess.org/tests/view/5f31487f9081672066537516

LTC:
LLR: 2.93 (-2.94,2.94) {0.25,1.75}
Total: 57816 W: 3786 L: 3538 D: 50492
Ptnml(0-2): 92, 2991, 22488, 3251, 86
https://tests.stockfishchess.org/tests/view/5f3167c3908167206653753d

closes https://github.com/official-stockfish/Stockfish/pull/2980

Bench: 4244812
---
 src/search.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/search.cpp b/src/search.cpp
index ef47fd22..c5b4332f 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1139,7 +1139,7 @@ moves_loop: // When in check, search starts from here
 
       // Castling extension
       if (   type_of(move) == CASTLING
-          && popcount(pos.pieces(us) & ~pos.pieces(PAWN) & (to_sq(move) & KingSide ? KingSide : QueenSide)) <= 3)
+          && popcount(pos.pieces(us) & ~pos.pieces(PAWN) & (to_sq(move) & KingSide ? KingSide : QueenSide)) <= 2)
           extension = 1;
 
       // Late irreversible move extension

From 992f549ae7f4f73b025429c44bdbbc65de917f6c Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Tue, 11 Aug 2020 21:11:17 +0200
Subject: [PATCH 65/86] Restrict avx2 hack to windows target

The problem this workaround addresses is possibly specific to Windows with gcc. See e.g.
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412#c25

on Linux with gcc 8 this patch brings roughly an 8% speedup.
However, probably needs some testing in the wild.

includes a workaround for an old msys make (3.81) installation (fixes #2984)

No functional change
---
 src/Makefile           | 2 +-
 src/nnue/nnue_common.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index c00b60b5..e82b066b 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -569,7 +569,7 @@ help:
 build: config-sanity
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) all
 
-profile-build: config-sanity objclean profileclean net
+profile-build: net config-sanity objclean profileclean
 	@echo ""
 	@echo "Step 1/4. Building instrumented executable ..."
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make)
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index cb1251c5..eab7d258 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -44,7 +44,7 @@
 //       compiled with older g++ crashes because the output memory is not aligned
 //       even though alignas is specified.
 #if defined(USE_AVX2)
-#if defined(__GNUC__ ) && (__GNUC__ < 9)
+#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32)
 #define _mm256_loadA_si256  _mm256_loadu_si256
 #define _mm256_storeA_si256 _mm256_storeu_si256
 #else
@@ -54,7 +54,7 @@
 #endif
 
 #if defined(USE_AVX512)
-#if defined(__GNUC__ ) && (__GNUC__ < 9)
+#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32)
 #define _mm512_loadA_si512   _mm512_loadu_si512
 #define _mm512_storeA_si512  _mm512_storeu_si512
 #else

From 6bc0256292cf51d390fee0cb78963da884dc2677 Mon Sep 17 00:00:00 2001
From: Daylen Yang <services@daylenyang.com>
Date: Tue, 11 Aug 2020 12:02:48 -0700
Subject: [PATCH 66/86] Use posix_memalign for Apple Silicon instead of
 _mm_malloc

The build fails on that target because the Intel intrinsics (_mm_malloc) are missing.
macOS has posix_memalign() since ~2014 so we can simplify the code and just use that for all Apple platforms.

closes https://github.com/official-stockfish/Stockfish/pull/2985

No functional change.
---
 src/misc.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/misc.cpp b/src/misc.cpp
index fc3746cf..aeb3c912 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -51,7 +51,7 @@ typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY);
 #include <sys/mman.h>
 #endif
 
-#if (defined(__APPLE__) && defined(_LIBCPP_HAS_C11_FEATURES)) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32))
+#if defined(__APPLE__) || defined(__ANDROID__) || defined(__OpenBSD__) || (defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC) && !defined(_WIN32))
 #define POSIXALIGNEDALLOC
 #include <stdlib.h>
 #endif
@@ -328,7 +328,7 @@ void* std_aligned_alloc(size_t alignment, size_t size) {
   if(posix_memalign(&pointer, alignment, size) == 0)
       return pointer;
   return nullptr;
-#elif (defined(_WIN32) || (defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES)))
+#elif defined(_WIN32)
   return _mm_malloc(size, alignment);
 #else
   return std::aligned_alloc(alignment, size);
@@ -338,7 +338,7 @@ void* std_aligned_alloc(size_t alignment, size_t size) {
 void std_aligned_free(void* ptr) {
 #if defined(POSIXALIGNEDALLOC)
   free(ptr);
-#elif (defined(_WIN32) || (defined(__APPLE__) && !defined(_LIBCPP_HAS_C11_FEATURES)))
+#elif defined(_WIN32)
   _mm_free(ptr);
 #else
   free(ptr);

From dd63b98fb06e050aa961fbad6fd1f9316f2b17df Mon Sep 17 00:00:00 2001
From: mstembera <MissingEmail@email>
Date: Tue, 11 Aug 2020 12:59:39 -0700
Subject: [PATCH 67/86] Add support for VNNI

Adds support for Vector Neural Network Instructions (avx512), as available on Intel Cascade Lake

The _mm512_dpbusd_epi32() intrinsic (vpdpbusd instruction) is tailor-made for NNUE.

on a cascade lake CPU (AWS C5.24x.large, gcc 10) NNUE eval is at roughly 78% nps of classical
(single core test)

bench 1024 1 24 default depth:
target 	classical 	NNUE 	ratio
vnni 	2207232 	1725987 	78.20
avx512 	2216789 	1671734 	75.41
avx2 	2194006 	1611263 	73.44
modern 	2185001 	1352469 	61.90

closes https://github.com/official-stockfish/Stockfish/pull/2987

No functional change
---
 src/Makefile                       | 25 +++++++++++++++++++++++++
 src/misc.cpp                       |  3 +++
 src/nnue/layers/affine_transform.h | 14 +++++++++++++-
 3 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index e82b066b..0804cdd5 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -73,6 +73,7 @@ endif
 # avx2 = yes/no       --- -mavx2           --- Use Intel Advanced Vector Extensions 2
 # pext = yes/no       --- -DUSE_PEXT       --- Use pext x86_64 asm-instruction
 # avx512 = yes/no     --- -mavx512bw       --- Use Intel Advanced Vector Extensions 512
+# vnni = yes/no       --- -mavx512vnni     --- Use Intel Vector Neural Network Instructions 512
 # neon = yes/no       --- -DUSE_NEON       --- Use ARM SIMD architecture
 #
 # Note that Makefile is space sensitive, so when adding new architectures
@@ -93,6 +94,7 @@ sse41 = no
 avx2 = no
 pext = no
 avx512 = no
+vnni = no
 neon = no
 ARCH = x86-64-modern
 
@@ -190,6 +192,19 @@ ifeq ($(ARCH),x86-64-avx512)
 	avx512 = yes
 endif
 
+ifeq ($(ARCH),x86-64-vnni)
+	arch = x86_64
+	prefetch = yes
+	popcnt = yes
+	sse = yes
+	ssse3 = yes
+	sse41 = yes
+	avx2 = yes
+	pext = yes
+	avx512 = yes
+	vnni = yes
+endif
+
 ifeq ($(ARCH),armv7)
 	arch = armv7
 	prefetch = yes
@@ -420,6 +435,13 @@ ifeq ($(avx512),yes)
 	endif
 endif
 
+ifeq ($(vnni),yes)
+	CXXFLAGS += -DUSE_VNNI
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
+		CXXFLAGS += -mavx512vnni -mavx512dq -mavx512vl
+	endif
+endif
+
 ifeq ($(sse41),yes)
 	CXXFLAGS += -DUSE_SSE41
 	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
@@ -522,6 +544,7 @@ help:
 	@echo ""
 	@echo "Supported archs:"
 	@echo ""
+	@echo "x86-64-vnni             > x86 64-bit with vnni support"
 	@echo "x86-64-avx512           > x86 64-bit with avx512 support"
 	@echo "x86-64-bmi2             > x86 64-bit with bmi2 support"
 	@echo "x86-64-avx2             > x86 64-bit with avx2 support"
@@ -640,6 +663,7 @@ config-sanity:
 	@echo "avx2: '$(avx2)'"
 	@echo "pext: '$(pext)'"
 	@echo "avx512: '$(avx512)'"
+	@echo "vnni: '$(vnni)'"
 	@echo "neon: '$(neon)'"
 	@echo ""
 	@echo "Flags:"
@@ -664,6 +688,7 @@ config-sanity:
 	@test "$(avx2)" = "yes" || test "$(avx2)" = "no"
 	@test "$(pext)" = "yes" || test "$(pext)" = "no"
 	@test "$(avx512)" = "yes" || test "$(avx512)" = "no"
+	@test "$(vnni)" = "yes" || test "$(vnni)" = "no"
 	@test "$(neon)" = "yes" || test "$(neon)" = "no"
 	@test "$(comp)" = "gcc" || test "$(comp)" = "icc" || test "$(comp)" = "mingw" || test "$(comp)" = "clang"
 
diff --git a/src/misc.cpp b/src/misc.cpp
index aeb3c912..ab52d30b 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -219,6 +219,9 @@ const std::string compiler_info() {
 
   compiler += "\nCompilation settings include: ";
   compiler += (Is64Bit ? " 64bit" : " 32bit");
+  #if defined(USE_VNNI)
+    compiler += " VNNI";
+  #endif
   #if defined(USE_AVX512)
     compiler += " AVX512";
   #endif
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index 8d2acd18..322e3240 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -79,8 +79,10 @@ namespace Eval::NNUE::Layers {
 
   #if defined(USE_AVX512)
       constexpr IndexType kNumChunks = kPaddedInputDimensions / (kSimdWidth * 2);
-      const __m512i kOnes = _mm512_set1_epi16(1);
       const auto input_vector = reinterpret_cast<const __m512i*>(input);
+  #if !defined(USE_VNNI)
+      const __m512i kOnes = _mm512_set1_epi16(1);
+  #endif
 
   #elif defined(USE_AVX2)
       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
@@ -113,9 +115,13 @@ namespace Eval::NNUE::Layers {
         __m512i sum = _mm512_setzero_si512();
         const auto row = reinterpret_cast<const __m512i*>(&weights_[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
+  #if defined(USE_VNNI)
+            sum = _mm512_dpbusd_epi32(sum, _mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
+  #else
             __m512i product = _mm512_maddubs_epi16(_mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
             product = _mm512_madd_epi16(product, kOnes);
             sum = _mm512_add_epi32(sum, product);
+  #endif
         }
 
         // Note: Changing kMaxSimdWidth from 32 to 64 breaks loading existing networks.
@@ -125,8 +131,14 @@ namespace Eval::NNUE::Layers {
         {
             const auto iv256  = reinterpret_cast<const __m256i*>(&input_vector[kNumChunks]);
             const auto row256 = reinterpret_cast<const __m256i*>(&row[kNumChunks]);
+  #if defined(USE_VNNI)
+            __m256i product256 = _mm256_dpbusd_epi32(
+                _mm512_castsi512_si256(sum), _mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
+            sum = _mm512_inserti32x8(sum, product256, 0);
+  #else
             __m256i product256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
             sum = _mm512_add_epi32(sum, _mm512_cvtepi16_epi32(product256));
+  #endif
         }
         output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];
 

From 69cfe28f315b559cb1a07c0806266aa2850b5d4b Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Wed, 12 Aug 2020 17:21:12 +0200
Subject: [PATCH 68/86] Output the SSE2 flag in compiler_info

was missing in the list of outputs, slightly reorder flags.
explicitly add -msse2 if USE_SSE2 (is implicit already, -msse -m64).

closes https://github.com/official-stockfish/Stockfish/pull/2990

No functional change.
---
 src/Makefile | 2 +-
 src/misc.cpp | 8 ++++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 0804cdd5..027cc3e3 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -468,7 +468,7 @@ ifeq ($(neon),yes)
 endif
 
 ifeq ($(arch),x86_64)
-	CXXFLAGS += -DUSE_SSE2
+	CXXFLAGS += -msse2 -DUSE_SSE2
 endif
 
 ### 3.7 pext
diff --git a/src/misc.cpp b/src/misc.cpp
index ab52d30b..1cee4726 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -225,6 +225,7 @@ const std::string compiler_info() {
   #if defined(USE_AVX512)
     compiler += " AVX512";
   #endif
+  compiler += (HasPext ? " BMI2" : "");
   #if defined(USE_AVX2)
     compiler += " AVX2";
   #endif
@@ -234,11 +235,14 @@ const std::string compiler_info() {
   #if defined(USE_SSSE3)
     compiler += " SSSE3";
   #endif
-    compiler += (HasPext ? " BMI2" : "");
-    compiler += (HasPopCnt ? " POPCNT" : "");
+  #if defined(USE_SSE2)
+    compiler += " SSE2";
+  #endif
+  compiler += (HasPopCnt ? " POPCNT" : "");
   #if defined(USE_MMX)
     compiler += " MMX";
   #endif
+
   #if !defined(NDEBUG)
     compiler += " DEBUG";
   #endif

From 67e48418afd58dd69708dcd67dea6161f61ef76f Mon Sep 17 00:00:00 2001
From: Sergio Vieri <sergio.vieri.hp@gmail.com>
Date: Wed, 12 Aug 2020 23:21:21 +0800
Subject: [PATCH 69/86] Update default net to nn-82215d0fd0df.nnue

Net created at: 20200812-2257

passed STC: https://tests.stockfishchess.org/tests/view/5f340ca99e5f2effc089da17
LLR: 2.96 (-2.94,2.94) {-0.50,1.50}
Total: 5744 W: 756 L: 627 D: 4361
Ptnml(0-2): 28, 485, 1731, 586, 42

passed LTC: https://tests.stockfishchess.org/tests/view/5f341eba9e5f2effc089da23
LLR: 2.94 (-2.94,2.94) {0.25,1.75}
Total: 17136 W: 1041 L: 917 D: 15178
Ptnml(0-2): 13, 813, 6807, 907, 28

closes https://github.com/official-stockfish/Stockfish/pull/2992

Bench: 3935117
---
 src/ucioption.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index b0689d6d..0a35d01b 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -79,7 +79,7 @@ void init(OptionsMap& o) {
   o["Syzygy50MoveRule"]      << Option(true);
   o["SyzygyProbeLimit"]      << Option(7, 0, 7);
   o["Use NNUE"]              << Option(false, on_use_NNUE);
-  o["EvalFile"]              << Option("nn-112bb1c8cdb5.nnue", on_eval_file);
+  o["EvalFile"]              << Option("nn-82215d0fd0df.nnue", on_eval_file);
 }
 
 

From e8ea215a13e009b78a148fda831392eb3224107e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ste=CC=81phane=20Nicolet?= <cassio@free.fr>
Date: Thu, 13 Aug 2020 13:40:06 +0200
Subject: [PATCH 70/86] Clean-up Makefile help

Do not show the details of the default architecture for a simple "make help"
invocation, as the details are most likely to confuse beginners. Instead we
make it clear which architecture is the default and put an example at the end
of the Makefile as an incentive to use "make help ARCH=blah" to discover
the flags used by the different architectures.

```
    make help
    make help ARCH=x86-64-ssse3
```

Also clean-up and modernize a bit the Makefile examples while at it.

closes https://github.com/official-stockfish/Stockfish/pull/2996

No functional change
---
 src/Makefile | 41 +++++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 027cc3e3..a9fb7b81 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -81,6 +81,11 @@ endif
 # at the end of the line for flag values.
 
 ### 2.1. General and architecture defaults
+
+ifeq ($(ARCH),)
+    empty_arch = yes
+endif
+
 optimize = yes
 debug = no
 sanitize = no
@@ -99,6 +104,7 @@ neon = no
 ARCH = x86-64-modern
 
 ### 2.2 Architecture specific
+
 ifeq ($(ARCH),general-32)
 	arch = any
 	bits = 32
@@ -141,16 +147,7 @@ ifeq ($(ARCH),x86-64-ssse3)
 	ssse3 = yes
 endif
 
-ifeq ($(ARCH),x86-64-modern)
-	arch = x86_64
-	prefetch = yes
-	popcnt = yes
-	sse = yes
-	ssse3 = yes
-	sse41 = yes
-endif
-
-ifeq ($(ARCH),x86-64-sse41-popcnt)
+ifeq ($(ARCH),$(filter $(ARCH),x86-64-sse41-popcnt x86-64-modern))
 	arch = x86_64
 	prefetch = yes
 	popcnt = yes
@@ -535,12 +532,13 @@ help:
 	@echo ""
 	@echo "Supported targets:"
 	@echo ""
+	@echo "help                    > Display architecture details"
 	@echo "build                   > Standard build"
-	@echo "profile-build           > Standard build with PGO"
+	@echo "net                     > Download the default nnue net"
+	@echo "profile-build           > Faster build (with profile-guided optimization)"
 	@echo "strip                   > Strip executable"
 	@echo "install                 > Install executable"
 	@echo "clean                   > Clean up"
-	@echo "net                     > Download the default nnue net"
 	@echo ""
 	@echo "Supported archs:"
 	@echo ""
@@ -549,7 +547,7 @@ help:
 	@echo "x86-64-bmi2             > x86 64-bit with bmi2 support"
 	@echo "x86-64-avx2             > x86 64-bit with avx2 support"
 	@echo "x86-64-sse41-popcnt     > x86 64-bit with sse41 and popcnt support"
-	@echo "x86-64-modern           > the same as previous (x86-64-sse41-popcnt)"
+	@echo "x86-64-modern           > common modern CPU, currently x86-64-sse41-popcnt"
 	@echo "x86-64-ssse3            > x86 64-bit with ssse3 support"
 	@echo "x86-64-sse3-popcnt      > x86 64-bit with sse3 and popcnt support"
 	@echo "x86-64                  > x86 64-bit generic"
@@ -572,17 +570,20 @@ help:
 	@echo ""
 	@echo "Simple examples. If you don't know what to do, you likely want to run: "
 	@echo ""
-	@echo "make -j build ARCH=x86-64    (This is for 64-bit systems)"
-	@echo "make -j build ARCH=x86-32    (This is for 32-bit systems)"
+	@echo "make -j build ARCH=x86-64  (A portable, slow compile for 64-bit systems)"
+	@echo "make -j build ARCH=x86-32  (A portable, slow compile for 32-bit systems)"
 	@echo ""
-	@echo "Advanced examples, for experienced users: "
+	@echo "Advanced examples, for experienced users looking for performance: "
 	@echo ""
-	@echo "make -j build ARCH=x86-64-modern COMP=clang"
-	@echo "make -j profile-build ARCH=x86-64-bmi2 COMP=gcc COMPCXX=g++-4.8"
-	@echo ""
-	@echo "The selected architecture $(ARCH) enables the following configuration: "
+	@echo "make    help  ARCH=x86-64-bmi2"
+	@echo "make -j profile-build ARCH=x86-64-bmi2 COMP=gcc COMPCXX=g++-9.0"
+	@echo "make -j build ARCH=x86-64-ssse3 COMP=clang"
 	@echo ""
+ifneq ($(empty_arch), yes)
+	@echo "-------------------------------\n"
+	@echo "The selected architecture $(ARCH) will enable the following configuration: "
 	@$(MAKE) ARCH=$(ARCH) COMP=$(COMP) config-sanity
+endif
 
 
 .PHONY: help build profile-build strip install clean net objclean profileclean \

From ce009ea1aaecc577bbdf208cef8e61dd1827a18e Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Thu, 13 Aug 2020 22:54:13 +0200
Subject: [PATCH 71/86] Verify SHA of downloaded net file

check SHA of the available and downloaded file.

Document the format requirement on the default net.

Also allow curl to make possibly insecure connections, as needed for old curl.

fixes https://github.com/official-stockfish/Stockfish/issues/2998

closes https://github.com/official-stockfish/Stockfish/pull/3000

No functional change.
---
 src/Makefile      | 4 +++-
 src/ucioption.cpp | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index a9fb7b81..38f607cb 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -624,8 +624,10 @@ net:
 	$(eval nnuenet := $(shell grep EvalFile ucioption.cpp | grep Option | sed 's/.*\(nn-[a-z0-9]\{12\}.nnue\).*/\1/'))
 	@echo "Default net: $(nnuenet)"
 	$(eval nnuedownloadurl := https://tests.stockfishchess.org/api/nn/$(nnuenet))
-	$(eval curl_or_wget := $(shell if hash curl 2>/dev/null; then echo "curl -sL"; elif hash wget 2>/dev/null; then echo "wget -qO-"; fi))
+	$(eval curl_or_wget := $(shell if hash curl 2>/dev/null; then echo "curl -skL"; elif hash wget 2>/dev/null; then echo "wget -qO-"; fi))
 	@if test -f "$(nnuenet)"; then echo "Already available."; else echo "Downloading $(nnuedownloadurl)"; $(curl_or_wget) $(nnuedownloadurl) > $(nnuenet); fi
+	$(eval shasum_command := $(shell if hash shasum 2>/dev/null; then echo "shasum -a 256 "; elif hash sha256sum 2>/dev/null; then echo "sha256sum "; fi))
+	@if [ "$(nnuenet)" != "nn-"`$(shasum_command) $(nnuenet) | cut -c1-12`".nnue" ]; then echo "Failed download or $(nnuenet) corrupted, please delete!"; exit 1; fi
 
 # clean binaries and objects
 objclean:
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index 0a35d01b..2b66a475 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -79,6 +79,8 @@ void init(OptionsMap& o) {
   o["Syzygy50MoveRule"]      << Option(true);
   o["SyzygyProbeLimit"]      << Option(7, 0, 7);
   o["Use NNUE"]              << Option(false, on_use_NNUE);
+  // The default must follow the format nn-[SHA256 first 12 digits].nnue
+  // for the build process (profile-build and fishtest) to work.
   o["EvalFile"]              << Option("nn-82215d0fd0df.nnue", on_eval_file);
 }
 

From e5f450cf0bfe5a34dd4ea51a5592a71be4514601 Mon Sep 17 00:00:00 2001
From: Miguel Lahoz <miguel_lahoz@protonmail.com>
Date: Mon, 10 Aug 2020 22:57:11 +0800
Subject: [PATCH 72/86] Also dampen NNUE eval with 50 move rule

Move the existing dampening function last so that NNUE evaluations are
also handled as we approach the 50 move rule.

STC:
LLR: 2.95 (-2.94,2.94) {-0.50,1.50}
Total: 4792 W: 695 L: 561 D: 3536
Ptnml(0-2): 19, 420, 1422, 478, 57
https://tests.stockfishchess.org/tests/view/5f3164179081672066537534

LTC:
LLR: 8.62 (-2.94,2.94) {0.25,1.75}
Total: 286744 W: 18494 L: 17430 D: 250820
Ptnml(0-2): 418, 14886, 111745, 15860, 463
https://tests.stockfishchess.org/tests/view/5f316b039081672066537541

closes https://github.com/official-stockfish/Stockfish/pull/3004

Bench: 4001800
---
 src/evaluate.cpp | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index caab2979..00fd2005 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -927,9 +927,6 @@ make_v:
     // Side to move point of view
     v = (pos.side_to_move() == WHITE ? v : -v) + Tempo;
 
-    // Damp down the evaluation linearly when shuffling
-    v = v * (100 - pos.rule50_count()) / 100;
-
     return v;
   }
 
@@ -941,14 +938,15 @@ make_v:
 
 Value Eval::evaluate(const Position& pos) {
 
-  if (Eval::useNNUE)
-  {
-      Value v = eg_value(pos.psq_score());
-      // Take NNUE eval only on balanced positions
-      if (abs(v) < NNUEThreshold)
-         return NNUE::evaluate(pos) + Tempo;
-  }
-  return Evaluation<NO_TRACE>(pos).value();
+  bool classical = !Eval::useNNUE
+                ||  abs(eg_value(pos.psq_score())) >= NNUEThreshold;
+  Value v = classical ? Evaluation<NO_TRACE>(pos).value()
+                      : NNUE::evaluate(pos) + Tempo;
+
+  // Damp down the evaluation linearly when shuffling
+  v = v * (100 - pos.rule50_count()) / 100;
+
+  return v;
 }
 
 /// trace() is like evaluate(), but instead of returning a value, it returns

From 6eb186c97e9d808970d0b1369bcd7aca60612e26 Mon Sep 17 00:00:00 2001
From: mstembera <MissingEmail@email>
Date: Fri, 14 Aug 2020 04:49:33 -0700
Subject: [PATCH 73/86] Try to match relative magnitude of NNUE eval to
 classical

The idea is that since we are mixing NNUE and classical evals matching their magnitudes closer allows for better comparisons.

STC https://tests.stockfishchess.org/tests/view/5f35a65411a9b1a1dbf18e2b
LLR: 2.94 (-2.94,2.94) {-0.50,1.50}
Total: 9840 W: 1150 L: 1027 D: 7663
Ptnml(0-2): 49, 772, 3175, 855, 69

LTC https://tests.stockfishchess.org/tests/view/5f35bcbe11a9b1a1dbf18e47
LLR: 2.93 (-2.94,2.94) {0.25,1.75}
Total: 44424 W: 2492 L: 2294 D: 39638
Ptnml(0-2): 42, 2015, 17915, 2183, 57

also corrects the location to clamp the evaluation (non-functional on bench).

closes https://github.com/official-stockfish/Stockfish/pull/3003

bench: 3905447
---
 src/evaluate.cpp           | 5 ++++-
 src/nnue/evaluate_nnue.cpp | 5 +----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 00fd2005..a453fa0f 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -941,11 +941,14 @@ Value Eval::evaluate(const Position& pos) {
   bool classical = !Eval::useNNUE
                 ||  abs(eg_value(pos.psq_score())) >= NNUEThreshold;
   Value v = classical ? Evaluation<NO_TRACE>(pos).value()
-                      : NNUE::evaluate(pos) + Tempo;
+                      : NNUE::evaluate(pos) * 5 / 4 + Tempo;
 
   // Damp down the evaluation linearly when shuffling
   v = v * (100 - pos.rule50_count()) / 100;
 
+  // Guarantee evalution outside of TB range
+  v = Utility::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1);
+
   return v;
 }
 
diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index af0894b2..a6ece8e2 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -159,10 +159,7 @@ namespace Eval::NNUE {
 
   // Evaluation function. Perform differential calculation.
   Value evaluate(const Position& pos) {
-    Value v = ComputeScore(pos, false);
-    v = Utility::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1);
-
-    return v;
+    return ComputeScore(pos, false);
   }
 
   // Evaluation function. Perform full calculation.

From cd0b8b4cf28208fffef931322749205a0ddc6066 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Fri, 14 Aug 2020 22:18:12 +0200
Subject: [PATCH 74/86] Use NNUE more for fortresses

Increases the use of NNUE evaluation in positions without captures/pawn moves,
by increasing the NNUEThreshold threshold with rule50_count.

This patch will force Stockfish to use NNUE eval more and more in materially
unbalanced positions, when it seems that the classical eval is struggling to
win and only manages to shuffle. This will ask the (slower) NNUE eval to
double-check the potential fortress branches of the search tree, but only
when necessary.

passed STC:
https://tests.stockfishchess.org/tests/view/5f36f1bf11a9b1a1dbf192d8
LLR: 2.93 (-2.94,2.94) {-0.50,1.50}
Total: 51824 W: 5836 L: 5653 D: 40335
Ptnml(0-2): 264, 4356, 16512, 4493, 287

passed LTC:
https://tests.stockfishchess.org/tests/view/5f37836111a9b1a1dbf1936d
LLR: 2.93 (-2.94,2.94) {0.25,1.75}
Total: 29768 W: 1747 L: 1590 D: 26431
Ptnml(0-2): 33, 1347, 11977, 1484, 43

closes https://github.com/official-stockfish/Stockfish/pull/3011

Bench: 4173967
---
 src/evaluate.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index a453fa0f..3a620a78 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -939,7 +939,7 @@ make_v:
 Value Eval::evaluate(const Position& pos) {
 
   bool classical = !Eval::useNNUE
-                ||  abs(eg_value(pos.psq_score())) >= NNUEThreshold;
+                ||  abs(eg_value(pos.psq_score())) >= NNUEThreshold * (16 + pos.rule50_count()) / 16;
   Value v = classical ? Evaluation<NO_TRACE>(pos).value()
                       : NNUE::evaluate(pos) * 5 / 4 + Tempo;
 

From 8cf43c6317665295eece747ed1589ee33a435d2c Mon Sep 17 00:00:00 2001
From: Daylen Yang <services@daylenyang.com>
Date: Fri, 14 Aug 2020 19:53:46 -0700
Subject: [PATCH 75/86] Display NEON in compiler string

if NEON intrinsics are being used and USE_NEON is defined.

closes https://github.com/official-stockfish/Stockfish/pull/3008

No functional change
---
 src/misc.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/misc.cpp b/src/misc.cpp
index 1cee4726..459ea100 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -242,6 +242,9 @@ const std::string compiler_info() {
   #if defined(USE_MMX)
     compiler += " MMX";
   #endif
+  #if defined(USE_NEON)
+    compiler += " NEON";
+  #endif
 
   #if !defined(NDEBUG)
     compiler += " DEBUG";

From 72dc7a5c54554a8c7c4bf68aa7de2d4de05f3294 Mon Sep 17 00:00:00 2001
From: syzygy1 <3028851+syzygy1@users.noreply.github.com>
Date: Sat, 15 Aug 2020 16:50:39 +0200
Subject: [PATCH 76/86] Assume network file is in little-endian byte order

This patch fixes the byte order when reading 16- and 32-bit values from the network file on a big-endian machine.

Bytes are ordered in read_le() using unsigned arithmetic, which doesn't need tricks to determine the endianness of the machine. Unfortunately the compiler doesn't seem to be able to optimise the ordering operation, but reading in the weights is not a time-critical operation and the extra time it takes should not be noticeable.

Big endian systems are still untested with NNUE.

fixes #3007

closes https://github.com/official-stockfish/Stockfish/pull/3009

No functional change.
---
 src/nnue/evaluate_nnue.cpp          |  8 ++++----
 src/nnue/layers/affine_transform.h  |  9 ++++-----
 src/nnue/nnue_common.h              | 19 +++++++++++++++++++
 src/nnue/nnue_feature_transformer.h |  8 ++++----
 4 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index a6ece8e2..3aa85943 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -77,7 +77,7 @@ namespace Eval::NNUE {
   bool ReadParameters(std::istream& stream, const AlignedPtr<T>& pointer) {
 
     std::uint32_t header;
-    stream.read(reinterpret_cast<char*>(&header), sizeof(header));
+    header = read_le<std::uint32_t>(stream);
     if (!stream || header != T::GetHashValue()) return false;
     return pointer->ReadParameters(stream);
   }
@@ -96,9 +96,9 @@ namespace Eval::NNUE {
     std::uint32_t* hash_value, std::string* architecture) {
 
     std::uint32_t version, size;
-    stream.read(reinterpret_cast<char*>(&version), sizeof(version));
-    stream.read(reinterpret_cast<char*>(hash_value), sizeof(*hash_value));
-    stream.read(reinterpret_cast<char*>(&size), sizeof(size));
+    version = read_le<std::uint32_t>(stream);
+    *hash_value = read_le<std::uint32_t>(stream);
+    size = read_le<std::uint32_t>(stream);
     if (!stream || version != kVersion) return false;
     architecture->resize(size);
     stream.read(&(*architecture)[0], size);
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index 322e3240..bac258e8 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -62,11 +62,10 @@ namespace Eval::NNUE::Layers {
    // Read network parameters
     bool ReadParameters(std::istream& stream) {
       if (!previous_layer_.ReadParameters(stream)) return false;
-      stream.read(reinterpret_cast<char*>(biases_),
-                  kOutputDimensions * sizeof(BiasType));
-      stream.read(reinterpret_cast<char*>(weights_),
-                  kOutputDimensions * kPaddedInputDimensions *
-                  sizeof(WeightType));
+      for (std::size_t i = 0; i < kOutputDimensions; ++i)
+        biases_[i] = read_le<BiasType>(stream);
+      for (std::size_t i = 0; i < kOutputDimensions * kPaddedInputDimensions; ++i)
+        weights_[i] = read_le<WeightType>(stream);
       return !stream.fail();
     }
 
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index eab7d258..61f18aee 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -21,6 +21,9 @@
 #ifndef NNUE_COMMON_H_INCLUDED
 #define NNUE_COMMON_H_INCLUDED
 
+#include <cstring>
+#include <iostream>
+
 #if defined(USE_AVX2)
 #include <immintrin.h>
 
@@ -101,6 +104,22 @@ namespace Eval::NNUE {
     return (n + base - 1) / base * base;
   }
 
+  // Read a signed or unsigned integer from  a stream in little-endian order
+  template <typename IntType>
+  inline IntType read_le(std::istream& stream) {
+    // Read the relevant bytes from the stream in little-endian order
+    std::uint8_t u[sizeof(IntType)];
+    stream.read(reinterpret_cast<char*>(u), sizeof(IntType));
+    // Use unsigned arithmetic to convert to machine order
+    typename std::make_unsigned<IntType>::type v = 0;
+    for (std::size_t i = 0; i < sizeof(IntType); ++i)
+      v = (v << 8) | u[sizeof(IntType) - i - 1];
+    // Copy the machine-ordered bytes into a potentially signed value
+    IntType w;
+    std::memcpy(&w, &v, sizeof(IntType));
+    return w;
+  }
+
 }  // namespace Eval::NNUE
 
 #endif // #ifndef NNUE_COMMON_H_INCLUDED
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 40f2603d..4db9be9f 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -55,10 +55,10 @@ namespace Eval::NNUE {
 
     // Read network parameters
     bool ReadParameters(std::istream& stream) {
-      stream.read(reinterpret_cast<char*>(biases_),
-                  kHalfDimensions * sizeof(BiasType));
-      stream.read(reinterpret_cast<char*>(weights_),
-                  kHalfDimensions * kInputDimensions * sizeof(WeightType));
+      for (std::size_t i = 0; i < kHalfDimensions; ++i)
+        biases_[i] = read_le<BiasType>(stream);
+      for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i)
+        weights_[i] = read_le<WeightType>(stream);
       return !stream.fail();
     }
 

From 65572de4a79ab017c19d85eacee865afe7bfc7c1 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Sun, 16 Aug 2020 13:21:07 +0200
Subject: [PATCH 77/86] Add further targets to travis testing

general-32, general-64 and help

closes https://github.com/official-stockfish/Stockfish/pull/3014

No functional change
---
 .travis.yml | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 0dd38047..45f1bd3d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -51,11 +51,12 @@ script:
   - export benchref=$(cat git_sig)
   - echo "Reference bench:" $benchref
 
-  #
   # Compiler version string
   - $COMPILER -v
 
-  #
+  # test help target
+  - make help
+
   # Verify bench number against various builds
   - export CXXFLAGS="-Werror -D_GLIBCXX_DEBUG"
   - make clean && make -j2 ARCH=x86-64-modern optimize=no debug=yes build && ../tests/signature.sh $benchref
@@ -64,8 +65,10 @@ script:
   - make clean && make -j2 ARCH=x86-64-ssse3 build && ../tests/signature.sh $benchref
   - make clean && make -j2 ARCH=x86-64-sse3-popcnt build && ../tests/signature.sh $benchref
   - make clean && make -j2 ARCH=x86-64 build && ../tests/signature.sh $benchref
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-64 build && ../tests/signature.sh $benchref; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 optimize=no debug=yes build && ../tests/signature.sh $benchref; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 build && ../tests/signature.sh $benchref; fi
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-32 build && ../tests/signature.sh $benchref; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-old build && ../tests/signature.sh $benchref; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" && "$COMP" == "gcc" ]]; then make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref; fi
 

From 81d716f5ccff3f0898ae985b9ef69f79d014bdc5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ste=CC=81phane=20Nicolet?= <cassio@free.fr>
Date: Sun, 16 Aug 2020 21:46:54 +0200
Subject: [PATCH 78/86] Reformat code in little-endian patch

Reformat code and rename the function to "read_little_endian()" in the recent
commit by Ronald de Man for support of big endian systems.

closes https://github.com/official-stockfish/Stockfish/pull/3016

No functional change
-----

Recommended net: https://tests.stockfishchess.org/api/nn/nn-82215d0fd0df.nnue
---
 src/nnue/evaluate_nnue.cpp          | 14 +++++++-------
 src/nnue/layers/affine_transform.h  |  4 ++--
 src/nnue/nnue_common.h              | 30 +++++++++++++++--------------
 src/nnue/nnue_feature_transformer.h |  4 ++--
 4 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index 3aa85943..dfbb1ac2 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -77,7 +77,7 @@ namespace Eval::NNUE {
   bool ReadParameters(std::istream& stream, const AlignedPtr<T>& pointer) {
 
     std::uint32_t header;
-    header = read_le<std::uint32_t>(stream);
+    header = read_little_endian<std::uint32_t>(stream);
     if (!stream || header != T::GetHashValue()) return false;
     return pointer->ReadParameters(stream);
   }
@@ -92,13 +92,13 @@ namespace Eval::NNUE {
   }
 
   // Read network header
-  bool ReadHeader(std::istream& stream,
-    std::uint32_t* hash_value, std::string* architecture) {
-
+  bool ReadHeader(std::istream& stream, std::uint32_t* hash_value, std::string* architecture)
+  {
     std::uint32_t version, size;
-    version = read_le<std::uint32_t>(stream);
-    *hash_value = read_le<std::uint32_t>(stream);
-    size = read_le<std::uint32_t>(stream);
+
+    version     = read_little_endian<std::uint32_t>(stream);
+    *hash_value = read_little_endian<std::uint32_t>(stream);
+    size        = read_little_endian<std::uint32_t>(stream);
     if (!stream || version != kVersion) return false;
     architecture->resize(size);
     stream.read(&(*architecture)[0], size);
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index bac258e8..7ac5a1c0 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -63,9 +63,9 @@ namespace Eval::NNUE::Layers {
     bool ReadParameters(std::istream& stream) {
       if (!previous_layer_.ReadParameters(stream)) return false;
       for (std::size_t i = 0; i < kOutputDimensions; ++i)
-        biases_[i] = read_le<BiasType>(stream);
+        biases_[i] = read_little_endian<BiasType>(stream);
       for (std::size_t i = 0; i < kOutputDimensions * kPaddedInputDimensions; ++i)
-        weights_[i] = read_le<WeightType>(stream);
+        weights_[i] = read_little_endian<WeightType>(stream);
       return !stream.fail();
     }
 
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index 61f18aee..4c93e3d1 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -101,23 +101,25 @@ namespace Eval::NNUE {
   // Round n up to be a multiple of base
   template <typename IntType>
   constexpr IntType CeilToMultiple(IntType n, IntType base) {
-    return (n + base - 1) / base * base;
+      return (n + base - 1) / base * base;
   }
 
-  // Read a signed or unsigned integer from  a stream in little-endian order
+  // read_little_endian() is our utility to read an integer (signed or unsigned, any size)
+  // from a stream in little-endian order. We swap the byte order after the read if
+  // necessary to return a result with the byte ordering of the compiling machine.
   template <typename IntType>
-  inline IntType read_le(std::istream& stream) {
-    // Read the relevant bytes from the stream in little-endian order
-    std::uint8_t u[sizeof(IntType)];
-    stream.read(reinterpret_cast<char*>(u), sizeof(IntType));
-    // Use unsigned arithmetic to convert to machine order
-    typename std::make_unsigned<IntType>::type v = 0;
-    for (std::size_t i = 0; i < sizeof(IntType); ++i)
-      v = (v << 8) | u[sizeof(IntType) - i - 1];
-    // Copy the machine-ordered bytes into a potentially signed value
-    IntType w;
-    std::memcpy(&w, &v, sizeof(IntType));
-    return w;
+  inline IntType read_little_endian(std::istream& stream) {
+
+      IntType result;
+      std::uint8_t u[sizeof(IntType)];
+      typename std::make_unsigned<IntType>::type v = 0;
+
+      stream.read(reinterpret_cast<char*>(u), sizeof(IntType));
+      for (std::size_t i = 0; i < sizeof(IntType); ++i)
+          v = (v << 8) | u[sizeof(IntType) - i - 1];
+
+      std::memcpy(&result, &v, sizeof(IntType));
+      return result;
   }
 
 }  // namespace Eval::NNUE
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 4db9be9f..43707610 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -56,9 +56,9 @@ namespace Eval::NNUE {
     // Read network parameters
     bool ReadParameters(std::istream& stream) {
       for (std::size_t i = 0; i < kHalfDimensions; ++i)
-        biases_[i] = read_le<BiasType>(stream);
+        biases_[i] = read_little_endian<BiasType>(stream);
       for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i)
-        weights_[i] = read_le<WeightType>(stream);
+        weights_[i] = read_little_endian<WeightType>(stream);
       return !stream.fail();
     }
 

From 0e17a89e4dee73bd46e496cf6bed467432f116e6 Mon Sep 17 00:00:00 2001
From: Unai Corzo <corzounai@gmail.com>
Date: Mon, 17 Aug 2020 09:22:15 +0200
Subject: [PATCH 79/86] Simplify away the passed pawn extension

STC https://tests.stockfishchess.org/tests/view/5f3955f0e98b6c64b3df41d7
LLR: 2.96 (-2.94,2.94) {-1.50,0.50}
Total: 31992 W: 3611 L: 3548 D: 24833
Ptnml(0-2): 174, 2658, 10273, 2713, 178

LTC https://tests.stockfishchess.org/tests/view/5f399e41e98b6c64b3df4210
LLR: 3.01 (-2.94,2.94) {-1.50,0.50}
Total: 29568 W: 1488 L: 1480 D: 26600
Ptnml(0-2): 40, 1272, 12142, 1300, 30

closes https://github.com/official-stockfish/Stockfish/pull/3017

bench: 3844671

-----

Recommended net: https://tests.stockfishchess.org/api/nn/nn-82215d0fd0df.nnue
---
 src/search.cpp | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index c5b4332f..83fb722f 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1126,12 +1126,6 @@ moves_loop: // When in check, search starts from here
                && (pos.is_discovery_check_on_king(~us, move) || pos.see_ge(move)))
           extension = 1;
 
-      // Passed pawn extension
-      else if (   move == ss->killers[0]
-               && pos.advanced_pawn_push(move)
-               && pos.pawn_passed(us, to_sq(move)))
-          extension = 1;
-
       // Last captures extension
       else if (   PieceValue[EG][pos.captured_piece()] > PawnValueEg
                && pos.non_pawn_material() <= 2 * RookValueMg)

From 65b976439f8867e81682c0b66da6796ad3176177 Mon Sep 17 00:00:00 2001
From: notruck <56622488+notruck@users.noreply.github.com>
Date: Sun, 16 Aug 2020 08:59:13 -0700
Subject: [PATCH 80/86] Support building for Android using NDK

The easiest way to use the NDK in conjunction with this Makefile (tested on linux-x86_64):

1. Download the latest NDK (r21d) from Google from https://developer.android.com/ndk/downloads
2. Place and unzip the NDK in $HOME/ndk folder
3. Export the path variable e.g., `export PATH=$PATH:$HOME/ndk/android-ndk-r21d/toolchains/llvm/prebuilt/linux-x86_64/bin`
4. cd to your Stockfish/src dir
5. Issue `make -j ARCH=armv8 COMP=ndk build`  (use `ARCH=armv7` or `ARCH=armv7-neon` for older CPUs)
6. Optionally `make -j ARCH=armv8 COMP=ndk strip`
7. That's all. Enjoy!

Improves support for Raspberry Pi (incomplete?) and compiling on ARM in general

closes https://github.com/official-stockfish/Stockfish/pull/3015

fixes https://github.com/official-stockfish/Stockfish/issues/2860

fixes https://github.com/official-stockfish/Stockfish/issues/2641

Support is still fragile as we're missing CI on these targets. Nevertheless tested with:

```bash
  # build crosses from ubuntu 20.04 on x86 to various arch/OS combos
  # tested with suitable packages installed
  # (build-essential, mingw-w64, g++-arm-linux-gnueabihf, NDK (r21d) from google)

  # cross to Android
  export PATH=$HOME/ndk/android-ndk-r21d/toolchains/llvm/prebuilt/linux-x86_64/bin:$PATH
  make clean && make -j build ARCH=armv7         COMP=ndk  && make -j build ARCH=armv7 COMP=ndk strip
  make clean && make -j build ARCH=armv7-neon    COMP=ndk  && make -j build ARCH=armv7-neon COMP=ndk strip
  make clean && make -j build ARCH=armv8         COMP=ndk  && make -j build ARCH=armv8 COMP=ndk strip

  # cross to Raspberry Pi
  make clean && make -j build ARCH=armv7         COMP=gcc COMPILER=arm-linux-gnueabihf-g++
  make clean && make -j build ARCH=armv7-neon    COMP=gcc COMPILER=arm-linux-gnueabihf-g++

  # cross to Windows
  make clean && make -j build ARCH=x86-64-modern COMP=mingw
```

No functional change
---
 AUTHORS      |  1 +
 src/Makefile | 65 ++++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 56 insertions(+), 10 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index 41b89705..d8f4d30e 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -127,6 +127,7 @@ Niklas Fiekas (niklasf)
 Nikolay Kostov (NikolayIT)
 Nguyen Pham (nguyenpham)
 Norman Schmidt (FireFather)
+notruck
 Ondrej Mosnáček (WOnder93)
 Oskar Werkelin Ahlin
 Pablo Vazquez
diff --git a/src/Makefile b/src/Makefile
index 38f607cb..0f458aa1 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -102,6 +102,7 @@ avx512 = no
 vnni = no
 neon = no
 ARCH = x86-64-modern
+STRIP = strip
 
 ### 2.2 Architecture specific
 
@@ -208,6 +209,14 @@ ifeq ($(ARCH),armv7)
 	bits = 32
 endif
 
+ifeq ($(ARCH),armv7-neon)
+	arch = armv7
+	prefetch = yes
+	popcnt = yes
+	neon = yes
+	bits = 32
+endif
+
 ifeq ($(ARCH),armv8)
 	arch = armv8-a
 	prefetch = yes
@@ -251,7 +260,7 @@ ifeq ($(COMP),gcc)
 	CXX=g++
 	CXXFLAGS += -pedantic -Wextra -Wshadow
 
-	ifeq ($(ARCH),$(filter $(ARCH),armv7 armv8))
+	ifeq ($(arch),$(filter $(arch),armv7 armv8-a))
 		ifeq ($(OS),Android)
 			CXXFLAGS += -m$(bits)
 			LDFLAGS += -m$(bits)
@@ -261,6 +270,10 @@ ifeq ($(COMP),gcc)
 		LDFLAGS += -m$(bits)
 	endif
 
+	ifeq ($(arch),$(filter $(arch),armv7))
+		LDFLAGS += -latomic
+	endif
+
 	ifneq ($(KERNEL),Darwin)
 	   LDFLAGS += -Wl,--no-as-needed
 	endif
@@ -311,7 +324,7 @@ ifeq ($(COMP),clang)
 	endif
 	endif
 
-	ifeq ($(ARCH),$(filter $(ARCH),armv7 armv8))
+	ifeq ($(arch),$(filter $(arch),armv7 armv8))
 		ifeq ($(OS),Android)
 			CXXFLAGS += -m$(bits)
 			LDFLAGS += -m$(bits)
@@ -340,6 +353,25 @@ ifeq ($(KERNEL),Darwin)
 	LDFLAGS += -arch $(arch) -mmacosx-version-min=10.14
 endif
 
+# To cross-compile for Android, NDK version r21 or later is recommended.
+# In earlier NDK versions, you'll need to pass -fno-addrsig if using GNU binutils.
+# Currently we don't know how to make PGO builds with the NDK yet.
+ifeq ($(COMP),ndk)
+	CXXFLAGS += -stdlib=libc++ -fPIE
+	ifeq ($(arch),armv7)
+		comp=armv7a-linux-androideabi16-clang
+		CXX=armv7a-linux-androideabi16-clang++
+		CXXFLAGS += -mthumb -march=armv7-a -mfloat-abi=softfp -mfpu=neon
+		STRIP=arm-linux-androideabi-strip
+	endif
+	ifeq ($(arch),armv8-a)
+		comp=aarch64-linux-android21-clang
+		CXX=aarch64-linux-android21-clang++
+		STRIP=aarch64-linux-android-strip
+	endif
+	LDFLAGS += -static-libstdc++ -pie -lm -latomic
+endif
+
 ### Travis CI script uses COMPILER to overwrite CXX
 ifdef COMPILER
 	COMPCXX=$(COMPILER)
@@ -356,7 +388,9 @@ ifneq ($(comp),mingw)
 	ifneq ($(OS),Android)
 		# Haiku has pthreads in its libroot, so only link it in on other platforms
 		ifneq ($(KERNEL),Haiku)
-			LDFLAGS += -lpthread
+			ifneq ($(COMP),ndk)
+				LDFLAGS += -lpthread
+			endif
 		endif
 	endif
 endif
@@ -401,7 +435,6 @@ endif
 ifeq ($(prefetch),yes)
 	ifeq ($(sse),yes)
 		CXXFLAGS += -msse
-		DEPENDFLAGS += -msse
 	endif
 else
 	CXXFLAGS += -DNO_PREFETCH
@@ -409,7 +442,7 @@ endif
 
 ### 3.6 popcnt
 ifeq ($(popcnt),yes)
-	ifeq ($(arch),$(filter $(arch),ppc64 armv8-a arm64))
+	ifeq ($(arch),$(filter $(arch),ppc64 armv7 armv8-a arm64))
 		CXXFLAGS += -DUSE_POPCNT
 	else ifeq ($(comp),icc)
 		CXXFLAGS += -msse3 -DUSE_POPCNT
@@ -418,6 +451,7 @@ ifeq ($(popcnt),yes)
 	endif
 endif
 
+
 ifeq ($(avx2),yes)
 	CXXFLAGS += -DUSE_AVX2
 	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
@@ -462,6 +496,11 @@ endif
 
 ifeq ($(neon),yes)
 	CXXFLAGS += -DUSE_NEON
+	ifeq ($(KERNEL),Linux)
+	ifneq ($(COMP),ndk)
+		CXXFLAGS += -mfpu=neon
+	endif
+	endif
 endif
 
 ifeq ($(arch),x86_64)
@@ -481,7 +520,10 @@ endif
 ### needs access to the optimization flags.
 ifeq ($(optimize),yes)
 ifeq ($(debug), no)
-	ifeq ($(comp),clang)
+	ifeq ($(COMP),ndk)
+		CXXFLAGS += -flto=thin
+		LDFLAGS += $(CXXFLAGS)
+	else ifeq ($(comp),clang)
 		CXXFLAGS += -flto=thin
 		LDFLAGS += $(CXXFLAGS)
 
@@ -502,7 +544,7 @@ ifeq ($(debug), no)
 	endif
 
 # To use LTO and static linking on windows, the tool chain requires a recent gcc:
-# gcc version 10.1 in msys2 or TDM-GCC version 9.2 are know to work, older might not.
+# gcc version 10.1 in msys2 or TDM-GCC version 9.2 are known to work, older might not.
 # So, only enable it for a cross from Linux by default.
 	else ifeq ($(comp),mingw)
 	ifeq ($(KERNEL),Linux)
@@ -556,7 +598,8 @@ help:
 	@echo "ppc-64                  > PPC 64-bit"
 	@echo "ppc-32                  > PPC 32-bit"
 	@echo "armv7                   > ARMv7 32-bit"
-	@echo "armv8                   > ARMv8 64-bit"
+	@echo "armv7-neon"             > ARMv7 32-bit with popcnt and neon"
+	@echo "armv8                   > ARMv8 64-bit with popcnt and neon"
 	@echo "apple-silicon           > Apple silicon ARM64"
 	@echo "general-64              > unspecified 64-bit"
 	@echo "general-32              > unspecified 32-bit"
@@ -567,6 +610,7 @@ help:
 	@echo "mingw                   > Gnu compiler with MinGW under Windows"
 	@echo "clang                   > LLVM Clang compiler"
 	@echo "icc                     > Intel compiler"
+	@echo "ndk                     > Google NDK to cross-compile for Android"
 	@echo ""
 	@echo "Simple examples. If you don't know what to do, you likely want to run: "
 	@echo ""
@@ -609,7 +653,7 @@ profile-build: net config-sanity objclean profileclean
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) profileclean
 
 strip:
-	strip $(EXE)
+	$(STRIP) $(EXE)
 
 install:
 	-mkdir -p -m 755 $(BINDIR)
@@ -693,7 +737,8 @@ config-sanity:
 	@test "$(avx512)" = "yes" || test "$(avx512)" = "no"
 	@test "$(vnni)" = "yes" || test "$(vnni)" = "no"
 	@test "$(neon)" = "yes" || test "$(neon)" = "no"
-	@test "$(comp)" = "gcc" || test "$(comp)" = "icc" || test "$(comp)" = "mingw" || test "$(comp)" = "clang"
+	@test "$(comp)" = "gcc" || test "$(comp)" = "icc" || test "$(comp)" = "mingw" || test "$(comp)" = "clang" \
+	|| test "$(comp)" = "armv7a-linux-androideabi16-clang"  || test "$(comp)" = "aarch64-linux-android21-clang"
 
 $(EXE): $(OBJS)
 	+$(CXX) -o $@ $(OBJS) $(LDFLAGS)

From 1c0b7bdf4f77b8160cebe8af96b28230e870a136 Mon Sep 17 00:00:00 2001
From: VoyagerOne <excelgeek@gmail.com>
Date: Mon, 17 Aug 2020 08:58:03 -0400
Subject: [PATCH 81/86] Remove history bonus from Eval

STC:
LLR: 2.92 (-2.94,2.94) {-1.50,0.50}
Total: 26776 W: 2787 L: 2725 D: 21264
https://tests.stockfishchess.org/tests/view/5f39d6beb38d442594aabd9b

LTC:
LLR: 2.93 (-2.94,2.94) {-1.50,0.50}
Total: 12968 W: 635 L: 608 D: 11725
https://tests.stockfishchess.org/tests/view/5f39decfb38d442594aabda7

closes https://github.com/official-stockfish/Stockfish/pull/3019

Bench:  4335100
---
 src/search.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index 83fb722f..7c839dfc 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -794,11 +794,7 @@ namespace {
     else
     {
         if ((ss-1)->currentMove != MOVE_NULL)
-        {
-            int bonus = -(ss-1)->statScore / 512;
-
-            ss->staticEval = eval = evaluate(pos) + bonus;
-        }
+            ss->staticEval = eval = evaluate(pos);
         else
             ss->staticEval = eval = -(ss-1)->staticEval + 2 * Tempo;
 

From 581b92e4a70b99fa5a22f7a1a38f2c8d2099769f Mon Sep 17 00:00:00 2001
From: Unai Corzo <corzounai@gmail.com>
Date: Mon, 17 Aug 2020 18:22:32 +0200
Subject: [PATCH 82/86] Remove last captures extension

STC https://tests.stockfishchess.org/tests/view/5f395657e98b6c64b3df41dd
LLR: 2.95 (-2.94,2.94) {-1.50,0.50}
Total: 144664 W: 15426 L: 15537 D: 113701
Ptnml(0-2): 612, 11341, 48537, 11230, 612

LTC https://tests.stockfishchess.org/tests/view/5f3a2ec7b38d442594aabdd7
LLR: 2.96 (-2.94,2.94) {-1.50,0.50}
Total: 22728 W: 1161 L: 1146 D: 20421
Ptnml(0-2): 21, 960, 9388, 973, 22

closes https://github.com/official-stockfish/Stockfish/pull/3020

bench: 3832662
---
 src/search.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index 7c839dfc..1d5bc5f7 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1122,11 +1122,6 @@ moves_loop: // When in check, search starts from here
                && (pos.is_discovery_check_on_king(~us, move) || pos.see_ge(move)))
           extension = 1;
 
-      // Last captures extension
-      else if (   PieceValue[EG][pos.captured_piece()] > PawnValueEg
-               && pos.non_pawn_material() <= 2 * RookValueMg)
-          extension = 1;
-
       // Castling extension
       if (   type_of(move) == CASTLING
           && popcount(pos.pieces(us) & ~pos.pieces(PAWN) & (to_sq(move) & KingSide ? KingSide : QueenSide)) <= 2)

From 1bcc981a5a70e3065b4ff588644f270136fd7e3c Mon Sep 17 00:00:00 2001
From: mstembera <MissingEmail@email>
Date: Sun, 16 Aug 2020 15:23:50 -0700
Subject: [PATCH 83/86] Fallback to NNUE

If the classical eval ends up much smaller than estimated, fall back to NNUE.
Also use multiply instead of divide for the threshold comparison for smoother transitions without rounding.

STC https://tests.stockfishchess.org/tests/view/5f3a5011b38d442594aabdfe
LLR: 2.96 (-2.94,2.94) {-0.50,1.50}
Total: 57352 W: 6325 L: 6135 D: 44892
Ptnml(0-2): 277, 4748, 18482, 4846, 323

LTC https://tests.stockfishchess.org/tests/view/5f3aee9db38d442594aabe82
LLR: 2.95 (-2.94,2.94) {0.25,1.75}
Total: 16232 W: 897 L: 781 D: 14554
Ptnml(0-2): 19, 679, 6616, 771, 31

closes https://github.com/official-stockfish/Stockfish/pull/3023

bench: 4026216

-----

Recommended net: https://tests.stockfishchess.org/api/nn/nn-82215d0fd0df.nnue
---
 src/evaluate.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 3a620a78..1bd89353 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -114,7 +114,8 @@ namespace {
   constexpr Value LazyThreshold1 =  Value(1400);
   constexpr Value LazyThreshold2 =  Value(1300);
   constexpr Value SpaceThreshold = Value(12222);
-  constexpr Value NNUEThreshold  =   Value(575);
+  constexpr Value NNUEThreshold1 =   Value(550);
+  constexpr Value NNUEThreshold2 =   Value(150);
 
   // KingAttackWeights[PieceType] contains king attack weights by piece type
   constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 81, 52, 44, 10 };
@@ -939,10 +940,13 @@ make_v:
 Value Eval::evaluate(const Position& pos) {
 
   bool classical = !Eval::useNNUE
-                ||  abs(eg_value(pos.psq_score())) >= NNUEThreshold * (16 + pos.rule50_count()) / 16;
+                ||  abs(eg_value(pos.psq_score())) * 16 > NNUEThreshold1 * (16 + pos.rule50_count());
   Value v = classical ? Evaluation<NO_TRACE>(pos).value()
                       : NNUE::evaluate(pos) * 5 / 4 + Tempo;
 
+  if (classical && Eval::useNNUE && abs(v) * 16 < NNUEThreshold2 * (16 + pos.rule50_count()))
+      v = NNUE::evaluate(pos) * 5 / 4 + Tempo;
+
   // Damp down the evaluation linearly when shuffling
   v = v * (100 - pos.rule50_count()) / 100;
 

From fbae5614eb1e82bccd37fbcfb0d2ca388b7a9a7d Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Tue, 18 Aug 2020 08:49:06 +0200
Subject: [PATCH 84/86] Fix Makefile typo

remove stray quote, shown with `make help`

No functional change
---
 src/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Makefile b/src/Makefile
index 0f458aa1..1f8ba455 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -598,7 +598,7 @@ help:
 	@echo "ppc-64                  > PPC 64-bit"
 	@echo "ppc-32                  > PPC 32-bit"
 	@echo "armv7                   > ARMv7 32-bit"
-	@echo "armv7-neon"             > ARMv7 32-bit with popcnt and neon"
+	@echo "armv7-neon              > ARMv7 32-bit with popcnt and neon"
 	@echo "armv8                   > ARMv8 64-bit with popcnt and neon"
 	@echo "apple-silicon           > Apple silicon ARM64"
 	@echo "general-64              > unspecified 64-bit"

From 384d6844841e9f2da8f5a913c7620440f9e05ab5 Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Tue, 18 Aug 2020 18:06:28 +0200
Subject: [PATCH 85/86] Better error message on missing curl/wget

provide clean error/warning message for missing curl/wget, sha256sum/shasum

fixes https://github.com/official-stockfish/Stockfish/issues/3025

closes https://github.com/official-stockfish/Stockfish/pull/3026

No functional change
---
 src/Makefile | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 1f8ba455..a3feb68e 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -669,9 +669,24 @@ net:
 	@echo "Default net: $(nnuenet)"
 	$(eval nnuedownloadurl := https://tests.stockfishchess.org/api/nn/$(nnuenet))
 	$(eval curl_or_wget := $(shell if hash curl 2>/dev/null; then echo "curl -skL"; elif hash wget 2>/dev/null; then echo "wget -qO-"; fi))
-	@if test -f "$(nnuenet)"; then echo "Already available."; else echo "Downloading $(nnuedownloadurl)"; $(curl_or_wget) $(nnuedownloadurl) > $(nnuenet); fi
+	@if test -f "$(nnuenet)"; then \
+            echo "Already available."; \
+         else \
+            if [ "x$(curl_or_wget)" = "x" ]; then \
+               echo "Automatic download failed: neither curl nor wget is installed. Install one of these tools or download the net manually"; exit 1; \
+            else \
+               echo "Downloading $(nnuedownloadurl)"; $(curl_or_wget) $(nnuedownloadurl) > $(nnuenet);\
+            fi; \
+        fi;
 	$(eval shasum_command := $(shell if hash shasum 2>/dev/null; then echo "shasum -a 256 "; elif hash sha256sum 2>/dev/null; then echo "sha256sum "; fi))
-	@if [ "$(nnuenet)" != "nn-"`$(shasum_command) $(nnuenet) | cut -c1-12`".nnue" ]; then echo "Failed download or $(nnuenet) corrupted, please delete!"; exit 1; fi
+	@if [ "x$(shasum_command)" != "x" ]; then \
+	    if [ "$(nnuenet)" != "nn-"`$(shasum_command) $(nnuenet) | cut -c1-12`".nnue" ]; then \
+                echo "Failed download or $(nnuenet) corrupted, please delete!"; exit 1; \
+            fi \
+         else \
+            echo "shasum / sha256sum not found, skipping net validation"; \
+        fi
+
 
 # clean binaries and objects
 objclean:

From 42e8789f0b3935b7ea389b3aa929e05e0a016872 Mon Sep 17 00:00:00 2001
From: syzygy1 <3028851+syzygy1@users.noreply.github.com>
Date: Tue, 18 Aug 2020 01:56:12 +0200
Subject: [PATCH 86/86] Expanded support for x86-32 architectures.

add new ARCH targets

x86-32-sse41-popcnt     > x86 32-bit with sse41 and popcnt support
x86-32-sse2             > x86 32-bit with sse2 support
x86-32                  > x86 32-bit generic (with mmx and sse support)

retire x86-32-old (use general-32)

closes https://github.com/official-stockfish/Stockfish/pull/3022

No functional change.
---
 .travis.yml  |   3 +-
 src/Makefile | 145 +++++++++++++++++++++++++++++++--------------------
 2 files changed, 91 insertions(+), 57 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 45f1bd3d..12596f1e 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -67,9 +67,10 @@ script:
   - make clean && make -j2 ARCH=x86-64 build && ../tests/signature.sh $benchref
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-64 build && ../tests/signature.sh $benchref; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 optimize=no debug=yes build && ../tests/signature.sh $benchref; fi
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-sse41-popcnt build && ../tests/signature.sh $benchref; fi
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-sse2 build && ../tests/signature.sh $benchref; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32 build && ../tests/signature.sh $benchref; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-32 build && ../tests/signature.sh $benchref; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=x86-32-old build && ../tests/signature.sh $benchref; fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" && "$COMP" == "gcc" ]]; then make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref; fi
 
   # compile only for some more advanced architectures (might not run in travis)
diff --git a/src/Makefile b/src/Makefile
index a3feb68e..79c7333a 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -67,11 +67,13 @@ endif
 # bits = 64/32        --- -DIS_64BIT       --- 64-/32-bit operating system
 # prefetch = yes/no   --- -DUSE_PREFETCH   --- Use prefetch asm-instruction
 # popcnt = yes/no     --- -DUSE_POPCNT     --- Use popcnt asm-instruction
+# pext = yes/no       --- -DUSE_PEXT       --- Use pext x86_64 asm-instruction
 # sse = yes/no        --- -msse            --- Use Intel Streaming SIMD Extensions
+# mmx = yes/no        --- -mmmx            --- Use Intel MMX instructions
+# sse2 = yes/no       --- -msse2           --- Use Intel Streaming SIMD Extensions 2
 # ssse3 = yes/no      --- -mssse3          --- Use Intel Supplemental Streaming SIMD Extensions 3
 # sse41 = yes/no      --- -msse4.1         --- Use Intel Streaming SIMD Extensions 4.1
 # avx2 = yes/no       --- -mavx2           --- Use Intel Advanced Vector Extensions 2
-# pext = yes/no       --- -DUSE_PEXT       --- Use pext x86_64 asm-instruction
 # avx512 = yes/no     --- -mavx512bw       --- Use Intel Advanced Vector Extensions 512
 # vnni = yes/no       --- -mavx512vnni     --- Use Intel Vector Neural Network Instructions 512
 # neon = yes/no       --- -DUSE_NEON       --- Use ARM SIMD architecture
@@ -92,12 +94,13 @@ sanitize = no
 bits = 64
 prefetch = no
 popcnt = no
-mmx = no
+pext = no
 sse = no
+mmx = no
+sse2 = no
 ssse3 = no
 sse41 = no
 avx2 = no
-pext = no
 avx512 = no
 vnni = no
 neon = no
@@ -106,83 +109,82 @@ STRIP = strip
 
 ### 2.2 Architecture specific
 
-ifeq ($(ARCH),general-32)
-	arch = any
-	bits = 32
-endif
+ifeq ($(findstring x86,$(ARCH)),x86)
 
-ifeq ($(ARCH),x86-32-old)
+# x86-32/64
+
+ifeq ($(findstring x86-32,$(ARCH)),x86-32)
 	arch = i386
 	bits = 32
-endif
-
-ifeq ($(ARCH),x86-32)
-	arch = i386
-	bits = 32
-	prefetch = yes
+	sse = yes
 	mmx = yes
-	sse = yes
-endif
-
-ifeq ($(ARCH),general-64)
-	arch = any
-endif
-
-ifeq ($(ARCH),x86-64)
+else
 	arch = x86_64
-	prefetch = yes
+	sse = yes
+	sse2 = yes
+endif
+
+ifeq ($(findstring -sse,$(ARCH)),-sse)
 	sse = yes
 endif
 
-ifeq ($(ARCH),x86-64-sse3-popcnt)
-	arch = x86_64
-	prefetch = yes
-	sse = yes
+ifeq ($(findstring -popcnt,$(ARCH)),-popcnt)
 	popcnt = yes
 endif
 
-ifeq ($(ARCH),x86-64-ssse3)
-	arch = x86_64
-	prefetch = yes
+ifeq ($(findstring -mmx,$(ARCH)),-mmx)
+	mmx = yes
+endif
+
+ifeq ($(findstring -sse2,$(ARCH)),-sse2)
 	sse = yes
+	sse2 = yes
+endif
+
+ifeq ($(findstring -ssse3,$(ARCH)),-ssse3)
+	sse = yes
+	sse2 = yes
 	ssse3 = yes
 endif
 
-ifeq ($(ARCH),$(filter $(ARCH),x86-64-sse41-popcnt x86-64-modern))
-	arch = x86_64
-	prefetch = yes
-	popcnt = yes
+ifeq ($(findstring -sse41,$(ARCH)),-sse41)
 	sse = yes
+	sse2 = yes
 	ssse3 = yes
 	sse41 = yes
 endif
 
-ifeq ($(ARCH),x86-64-avx2)
-	arch = x86_64
-	prefetch = yes
+ifeq ($(findstring -modern,$(ARCH)),-modern)
 	popcnt = yes
 	sse = yes
+	sse2 = yes
+	ssse3 = yes
+	sse41 = yes
+endif
+
+ifeq ($(findstring -avx2,$(ARCH)),-avx2)
+	popcnt = yes
+	sse = yes
+	sse2 = yes
 	ssse3 = yes
 	sse41 = yes
 	avx2 = yes
 endif
 
-ifeq ($(ARCH),x86-64-bmi2)
-	arch = x86_64
-	prefetch = yes
+ifeq ($(findstring -bmi2,$(ARCH)),-bmi2)
 	popcnt = yes
 	sse = yes
+	sse2 = yes
 	ssse3 = yes
 	sse41 = yes
 	avx2 = yes
 	pext = yes
 endif
 
-ifeq ($(ARCH),x86-64-avx512)
-	arch = x86_64
-	prefetch = yes
+ifeq ($(findstring -avx512,$(ARCH)),-avx512)
 	popcnt = yes
 	sse = yes
+	sse2 = yes
 	ssse3 = yes
 	sse41 = yes
 	avx2 = yes
@@ -190,11 +192,10 @@ ifeq ($(ARCH),x86-64-avx512)
 	avx512 = yes
 endif
 
-ifeq ($(ARCH),x86-64-vnni)
-	arch = x86_64
-	prefetch = yes
+ifeq ($(findstring -vnni,$(ARCH)),-vnni)
 	popcnt = yes
 	sse = yes
+	sse2 = yes
 	ssse3 = yes
 	sse41 = yes
 	avx2 = yes
@@ -203,6 +204,28 @@ ifeq ($(ARCH),x86-64-vnni)
 	vnni = yes
 endif
 
+ifeq ($(sse),yes)
+	prefetch = yes
+endif
+
+# 64-bit pext is not available on x86-32
+ifeq ($(bits),32)
+	pext = no
+endif
+
+else
+
+# all other architectures
+
+ifeq ($(ARCH),general-32)
+	arch = any
+	bits = 32
+endif
+
+ifeq ($(ARCH),general-64)
+	arch = any
+endif
+
 ifeq ($(ARCH),armv7)
 	arch = armv7
 	prefetch = yes
@@ -242,6 +265,8 @@ ifeq ($(ARCH),ppc-64)
 	prefetch = yes
 endif
 
+endif
+
 ### ==========================================================================
 ### Section 3. Low-level Configuration
 ### ==========================================================================
@@ -487,6 +512,13 @@ ifeq ($(ssse3),yes)
 	endif
 endif
 
+ifeq ($(sse2),yes)
+	CXXFLAGS += -DUSE_SSE2
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
+		CXXFLAGS += -msse2
+	endif
+endif
+
 ifeq ($(mmx),yes)
 	CXXFLAGS += -DUSE_MMX
 	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
@@ -503,10 +535,6 @@ ifeq ($(neon),yes)
 	endif
 endif
 
-ifeq ($(arch),x86_64)
-	CXXFLAGS += -msse2 -DUSE_SSE2
-endif
-
 ### 3.7 pext
 ifeq ($(pext),yes)
 	CXXFLAGS += -DUSE_PEXT
@@ -592,9 +620,10 @@ help:
 	@echo "x86-64-modern           > common modern CPU, currently x86-64-sse41-popcnt"
 	@echo "x86-64-ssse3            > x86 64-bit with ssse3 support"
 	@echo "x86-64-sse3-popcnt      > x86 64-bit with sse3 and popcnt support"
-	@echo "x86-64                  > x86 64-bit generic"
-	@echo "x86-32                  > x86 32-bit (also enables MMX and SSE)"
-	@echo "x86-32-old              > x86 32-bit fall back for old hardware"
+	@echo "x86-64                  > x86 64-bit generic (with sse2 support)"
+	@echo "x86-32-sse41-popcnt     > x86 32-bit with sse41 and popcnt support"
+	@echo "x86-32-sse2             > x86 32-bit with sse2 support"
+	@echo "x86-32                  > x86 32-bit generic (with mmx and sse support)"
 	@echo "ppc-64                  > PPC 64-bit"
 	@echo "ppc-32                  > PPC 32-bit"
 	@echo "armv7                   > ARMv7 32-bit"
@@ -624,7 +653,7 @@ help:
 	@echo "make -j build ARCH=x86-64-ssse3 COMP=clang"
 	@echo ""
 ifneq ($(empty_arch), yes)
-	@echo "-------------------------------\n"
+	@echo "-------------------------------"
 	@echo "The selected architecture $(ARCH) will enable the following configuration: "
 	@$(MAKE) ARCH=$(ARCH) COMP=$(COMP) config-sanity
 endif
@@ -719,11 +748,13 @@ config-sanity:
 	@echo "os: '$(OS)'"
 	@echo "prefetch: '$(prefetch)'"
 	@echo "popcnt: '$(popcnt)'"
+	@echo "pext: '$(pext)'"
 	@echo "sse: '$(sse)'"
+	@echo "mmx: '$(mmx)'"
+	@echo "sse2: '$(sse2)'"
 	@echo "ssse3: '$(ssse3)'"
 	@echo "sse41: '$(sse41)'"
 	@echo "avx2: '$(avx2)'"
-	@echo "pext: '$(pext)'"
 	@echo "avx512: '$(avx512)'"
 	@echo "vnni: '$(vnni)'"
 	@echo "neon: '$(neon)'"
@@ -744,11 +775,13 @@ config-sanity:
 	@test "$(bits)" = "32" || test "$(bits)" = "64"
 	@test "$(prefetch)" = "yes" || test "$(prefetch)" = "no"
 	@test "$(popcnt)" = "yes" || test "$(popcnt)" = "no"
+	@test "$(pext)" = "yes" || test "$(pext)" = "no"
 	@test "$(sse)" = "yes" || test "$(sse)" = "no"
+	@test "$(mmx)" = "yes" || test "$(mmx)" = "no"
+	@test "$(sse2)" = "yes" || test "$(sse2)" = "no"
 	@test "$(ssse3)" = "yes" || test "$(ssse3)" = "no"
 	@test "$(sse41)" = "yes" || test "$(sse41)" = "no"
 	@test "$(avx2)" = "yes" || test "$(avx2)" = "no"
-	@test "$(pext)" = "yes" || test "$(pext)" = "no"
 	@test "$(avx512)" = "yes" || test "$(avx512)" = "no"
 	@test "$(vnni)" = "yes" || test "$(vnni)" = "no"
 	@test "$(neon)" = "yes" || test "$(neon)" = "no"