diff --git a/.github/workflows/stockfish.yml b/.github/workflows/stockfish.yml new file mode 100644 index 00000000..e50930f8 --- /dev/null +++ b/.github/workflows/stockfish.yml @@ -0,0 +1,276 @@ +name: Stockfish +on: + push: + branches: + - master + - tools + - github_ci + pull_request: + branches: + - master + - tools +jobs: + Stockfish: + name: ${{ matrix.config.name }} + runs-on: ${{ matrix.config.os }} + env: + COMPILER: ${{ matrix.config.compiler }} + COMP: ${{ matrix.config.comp }} + strategy: + matrix: + config: + - { + name: "Ubuntu 20.04 GCC", + os: ubuntu-20.04, + compiler: g++, + comp: gcc, + run_expensive_tests: true, + run_32bit_tests: true, + run_64bit_tests: true, + shell: 'bash {0}' + } + - { + name: "Ubuntu 20.04 Clang", + os: ubuntu-20.04, + compiler: clang++, + comp: clang, + run_expensive_tests: false, + run_32bit_tests: true, + run_64bit_tests: true, + shell: 'bash {0}' + } + - { + name: "MacOS 10.15 Apple Clang", + os: macos-10.15, + compiler: clang++, + comp: clang, + run_expensive_tests: false, + run_32bit_tests: false, + run_64bit_tests: true, + shell: 'bash {0}' + } + - { + name: "MacOS 10.15 GCC 10", + os: macos-10.15, + compiler: g++-10, + comp: gcc, + run_expensive_tests: false, + run_32bit_tests: false, + run_64bit_tests: true, + shell: 'bash {0}' + } + - { + name: "Windows 2019 Mingw-w64 GCC x86_64", + os: windows-2019, + compiler: g++, + comp: gcc, + run_expensive_tests: false, + run_32bit_tests: false, + run_64bit_tests: true, + msys_sys: 'mingw64', + msys_env: 'x86_64', + shell: 'msys2 {0}' + } + - { + name: "Windows 2019 Mingw-w64 GCC i686", + os: windows-2019, + compiler: g++, + comp: gcc, + run_expensive_tests: false, + run_32bit_tests: true, + run_64bit_tests: false, + msys_sys: 'mingw32', + msys_env: 'i686', + shell: 'msys2 {0}' + } + + defaults: + run: + working-directory: src + shell: ${{ matrix.config.shell }} + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + + - name: Download required linux packages + if: 
runner.os == 'Linux' + run: | + sudo apt update + sudo apt install expect valgrind g++-multilib + + - name: Setup msys and install required packages + if: runner.os == 'Windows' + uses: msys2/setup-msys2@v2 + with: + msystem: ${{matrix.config.msys_sys}} + install: mingw-w64-${{matrix.config.msys_env}}-gcc make git expect + + - name: Download the used network from the fishtest framework + run: | + make net + + - name: Extract the bench number from the commit history + run: | + git log HEAD | grep "\b[Bb]ench[ :]\+[0-9]\{7\}" | head -n 1 | sed "s/[^0-9]*\([0-9]*\).*/\1/g" > git_sig + [ -s git_sig ] && echo "benchref=$(cat git_sig)" >> $GITHUB_ENV && echo "Reference bench:" $(cat git_sig) || echo "No bench found" + + - name: Check compiler + run: | + $COMPILER -v + + - name: Test help target + run: | + make help + + # x86-32 tests + + - name: Test debug x86-32 build + if: ${{ matrix.config.run_32bit_tests }} + run: | + export CXXFLAGS="-D_GLIBCXX_DEBUG" + make clean + make -j2 ARCH=x86-32 optimize=no debug=yes build + ../tests/signature.sh $benchref + + - name: Test x86-32 build + if: ${{ matrix.config.run_32bit_tests }} + run: | + make clean + make -j2 ARCH=x86-32 build + ../tests/signature.sh $benchref + + - name: Test x86-32-sse41-popcnt build + if: ${{ matrix.config.run_32bit_tests }} + run: | + make clean + make -j2 ARCH=x86-32-sse41-popcnt build + ../tests/signature.sh $benchref + + - name: Test x86-32-sse2 build + if: ${{ matrix.config.run_32bit_tests }} + run: | + make clean + make -j2 ARCH=x86-32-sse2 build + ../tests/signature.sh $benchref + + - name: Test general-32 build + if: ${{ matrix.config.run_32bit_tests }} + run: | + make clean + make -j2 ARCH=general-32 build + ../tests/signature.sh $benchref + + # x86-64 tests + + - name: Test debug x86-64-modern build + if: ${{ matrix.config.run_64bit_tests }} + run: | + export CXXFLAGS="-D_GLIBCXX_DEBUG" + make clean + make -j2 ARCH=x86-64-modern optimize=no debug=yes build + ../tests/signature.sh $benchref + + 
- name: Test x86-64-modern build + if: ${{ matrix.config.run_64bit_tests }} + run: | + make clean + make -j2 ARCH=x86-64-modern build + ../tests/signature.sh $benchref + + - name: Test x86-64-ssse3 build + if: ${{ matrix.config.run_64bit_tests }} + run: | + make clean + make -j2 ARCH=x86-64-ssse3 build + ../tests/signature.sh $benchref + + - name: Test x86-64-sse3-popcnt build + if: ${{ matrix.config.run_64bit_tests }} + run: | + make clean + make -j2 ARCH=x86-64-sse3-popcnt build + ../tests/signature.sh $benchref + + - name: Test x86-64 build + if: ${{ matrix.config.run_64bit_tests }} + run: | + make clean + make -j2 ARCH=x86-64 build + ../tests/signature.sh $benchref + + - name: Test general-64 build + if: matrix.config.run_64bit_tests + run: | + make clean + make -j2 ARCH=general-64 build + ../tests/signature.sh $benchref + + # x86-64 with newer extensions tests + + - name: Compile x86-64-avx2 build + if: ${{ matrix.config.run_64bit_tests }} + run: | + make clean + make -j2 ARCH=x86-64-avx2 build + + - name: Compile x86-64-bmi2 build + if: ${{ matrix.config.run_64bit_tests }} + run: | + make clean + make -j2 ARCH=x86-64-bmi2 build + + - name: Compile x86-64-avx512 build + if: ${{ matrix.config.run_64bit_tests }} + run: | + make clean + make -j2 ARCH=x86-64-avx512 build + + - name: Compile x86-64-vnni512 build + if: ${{ matrix.config.run_64bit_tests }} + run: | + make clean + make -j2 ARCH=x86-64-vnni512 build + + - name: Compile x86-64-vnni256 build + if: ${{ matrix.config.run_64bit_tests }} + run: | + make clean + make -j2 ARCH=x86-64-vnni256 build + + # Other tests + + - name: Check perft and search reproducibility + if: ${{ matrix.config.run_64bit_tests }} + run: | + make clean + make -j2 ARCH=x86-64-modern build + ../tests/perft.sh + ../tests/reprosearch.sh + + # Sanitizers + + - name: Run under valgrind + if: ${{ matrix.config.run_expensive_tests }} + run: | + export CXXFLAGS="-O1 -fno-inline" + make clean + make -j2 ARCH=x86-64-modern debug=yes optimize=no 
build > /dev/null + ../tests/instrumented.sh --valgrind + ../tests/instrumented.sh --valgrind-thread + + - name: Run with UB sanitizer + if: ${{ matrix.config.run_expensive_tests }} + run: | + export CXXFLAGS="-O1 -fno-inline" + make clean + make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=yes build > /dev/null + ../tests/instrumented.sh --sanitizer-undefined + + - name: Run with thread sanitizer + if: ${{ matrix.config.run_expensive_tests }} + run: | + export CXXFLAGS="-O1 -fno-inline" + make clean + make -j2 ARCH=x86-64-modern sanitize=thread optimize=no debug=yes build > /dev/null + ../tests/instrumented.sh --sanitizer-thread diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 377796f7..00000000 --- a/.travis.yml +++ /dev/null @@ -1,75 +0,0 @@ -language: cpp -dist: focal - -matrix: - include: - - os: linux - compiler: gcc - addons: - apt: - packages: ['g++-multilib', 'valgrind', 'expect', 'curl'] - env: - - COMPILER=g++ - - COMP=gcc - -branches: - only: - - master - -before_script: - - cd src - -script: - # Download net - - make net - - # Obtain bench reference from git log - - git log HEAD | grep "\b[Bb]ench[ :]\+[0-9]\{7\}" | head -n 1 | sed "s/[^0-9]*\([0-9]*\).*/\1/g" > git_sig - - export benchref=$(cat git_sig) - - echo "Reference bench:" $benchref - - # Compiler version string - - $COMPILER -v - - # test help target - - make help - - # Verify bench number against various builds - - export CXXFLAGS="-Werror -D_GLIBCXX_DEBUG" - - make clean && make -j2 ARCH=x86-64-modern optimize=no debug=yes build && ../tests/signature.sh $benchref - - export CXXFLAGS="-Werror" - - make clean && make -j2 ARCH=x86-64-modern build && ../tests/signature.sh $benchref - - make clean && make -j2 ARCH=x86-64-ssse3 build && ../tests/signature.sh $benchref - - make clean && make -j2 ARCH=x86-64-sse3-popcnt build && ../tests/signature.sh $benchref - - make clean && make -j2 ARCH=x86-64 build && ../tests/signature.sh $benchref - # TODO avoid 
_mm_malloc - # - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-64 build && ../tests/signature.sh $benchref; fi - - make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref - - # compile only for some more advanced architectures (might not run in travis) - - make clean && make -j2 ARCH=x86-64-avx2 blas=yes build - - - make clean && make -j2 ARCH=x86-64-avx2 build - - make clean && make -j2 ARCH=x86-64-bmi2 build - - make clean && make -j2 ARCH=x86-64-avx512 build - - make clean && make -j2 ARCH=x86-64-vnni512 build - - make clean && make -j2 ARCH=x86-64-vnni256 build - - # - # Check perft and reproducible search - - make clean && make -j2 ARCH=x86-64-modern build - - ../tests/perft.sh - - ../tests/reprosearch.sh - - # - # Valgrind - # - - export CXXFLAGS="-O1 -fno-inline" - - make clean && make -j2 ARCH=x86-64-modern debug=yes optimize=no build > /dev/null && ../tests/instrumented.sh --valgrind - - ../tests/instrumented.sh --valgrind-thread - - # - # Sanitizer - # - - make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-undefined - - make clean && make -j2 ARCH=x86-64-modern sanitize=thread optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-thread diff --git a/AUTHORS b/AUTHORS index 7165363f..4d72314f 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,4 +1,4 @@ -# List of authors for Stockfish, as of May 17, 2021 +# List of authors for Stockfish, as of June 14, 2021 # Founders of the Stockfish project and fishtest infrastructure Tord Romstad (romstad) @@ -69,6 +69,7 @@ gamander Gary Heckman (gheckman) George Sobala (gsobala) gguliash +Giacomo Lorenzetti (G-Lorenz) Gian-Carlo Pascutto (gcp) Gontran Lemaire (gonlem) Goodkov Vasiliy Aleksandrovich (goodkov) @@ -96,6 +97,7 @@ Joost VandeVondele (vondele) Jörg Oster (joergoster) Joseph Ellis (jhellis3) Joseph R. 
Prostko +Julian Willemer (NightlyKing) jundery Justin Blanchard (UncombedCoconut) Kelly Wilson @@ -106,6 +108,7 @@ Kojirion Krystian Kuzniarek (kuzkry) Leonardo Ljubičić (ICCF World Champion) Leonid Pechenik (lp--) +Liam Keegan (lkeegan) Linus Arver (listx) loco-loco Lub van den Berg (ElbertoOne) diff --git a/README.md b/README.md index 19d5a229..467dd3c3 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ ## Overview -[![Build Status](https://travis-ci.org/official-stockfish/Stockfish.svg?branch=master)](https://travis-ci.org/official-stockfish/Stockfish) +[![Build Status](https://github.com/official-stockfish/Stockfish/actions/workflows/stockfish.yml/badge.svg)](https://github.com/official-stockfish/Stockfish/actions) [![Build Status](https://ci.appveyor.com/api/projects/status/github/official-stockfish/Stockfish?branch=master&svg=true)](https://ci.appveyor.com/project/mcostalba/stockfish/branch/master) [Stockfish](https://stockfishchess.org) is a free, powerful UCI chess engine @@ -35,12 +35,14 @@ This distribution of Stockfish consists of the following files: ## The UCI protocol and available options -The Universal Chess Interface (UCI) is a standard protocol used to communicate with a chess engine, -and is the recommended way to do so for typical graphical user interfaces (GUI) or chess tools. +The Universal Chess Interface (UCI) is a standard protocol used to communicate with +a chess engine, and is the recommended way to do so for typical graphical user interfaces +(GUI) or chess tools. Stockfish implements the majority of its options as described +in [the UCI protocol](https://www.shredderchess.com/download/div/uci.zip). 
-Stockfish implements most commands as described in [the UCI protocol](https://www.shredderchess.com/download/div/uci.zip) - -For users, the following UCI options, which can typically be set via a GUI, are available in Stockfish: +Developers can see the default values for UCI options available in Stockfish by typing +`./stockfish uci` in a terminal, but the majority of users will typically see them and +change them via a chess GUI. This is a list of available UCI options in Stockfish: * #### Threads The number of CPU threads used for searching a position. For best performance, set @@ -118,14 +120,6 @@ For users, the following UCI options, which can typically be set via a GUI, are Limit Syzygy tablebase probing to positions with at most this many pieces left (including kings and pawns). - * #### Contempt - A positive value for contempt favors middle game positions and avoids draws, - effective for the classical evaluation only. - - * #### Analysis Contempt - By default, contempt is set to prefer the side to move. Set this option to "White" - or "Black" to analyse with contempt for that side, or "Off" to disable contempt. - * #### Move Overhead Assume a time delay of x ms due to network and GUI overheads. This is useful to avoid losses on time in those cases. @@ -143,9 +137,9 @@ For users, the following UCI options, which can typically be set via a GUI, are For developers the following non-standard commands might be of interest, mainly useful for debugging: - * #### bench ttSize threads limit fenFile limitType evalType - Performs a standard benchmark using various options. The signature or standard node - count is obtained using all defaults. `bench` is currently `bench 16 1 13 default depth mixed`. + * #### bench *ttSize threads limit fenFile limitType evalType* + Performs a standard benchmark using various options. The signature of a version (standard node + count) is obtained using all defaults. `bench` is currently `bench 16 1 13 default depth mixed`. 
* #### compiler Give information about the compiler and environment used for building a binary. diff --git a/Top CPU Contributors.txt b/Top CPU Contributors.txt index f5347ea1..dacc5781 100644 --- a/Top CPU Contributors.txt +++ b/Top CPU Contributors.txt @@ -1,189 +1,205 @@ -Contributors to Fishtest with >10,000 CPU hours, as of Feb 15, 2021. +Contributors to Fishtest with >10,000 CPU hours, as of Jun 29, 2021. Thank you! -Username CPU Hours Games played ----------------------------------------------------- -noobpwnftw 23930906 1560559941 -dew 1169948 70333008 -mlang 957168 61657446 -mibere 703840 46867607 -tvijlbrief 517888 33379462 -JojoM 515404 30334272 -cw 443276 29385549 -crunchy 427035 27344275 -grandphish2 425794 26347253 -fastgm 414133 24519696 -gvreuls 377843 24708884 -CSU_Dynasty 338718 23030006 -Fisherman 326795 21820747 -TueRens 313730 19490246 -ctoks 298442 20052551 -velislav 270519 17355456 -bcross 241064 17196165 -glinscott 217799 13780820 -nordlandia 211692 13484886 -bking_US 198894 11876016 -drabel 191096 13129722 -leszek 189170 11446821 -mgrabiak 187153 12013300 -robal 181389 11539242 -Thanar 179852 12365359 -vdv 175274 9889046 -spams 157128 10319326 -marrco 150292 9401741 -sqrt2 147963 9724586 -CoffeeOne 137086 5022516 -vdbergh 137041 8926915 -malala 136182 8002293 -mhoram 132780 8398229 -xoto 124729 8652088 -davar 122092 7960001 -dsmith 122059 7570238 -Data 113305 8220352 -BrunoBanani 112960 7436849 -pemo 109598 5036441 -Dantist 106768 6431396 -MaZePallas 102741 6630419 -ElbertoOne 99028 7023771 -brabos 92118 6186135 -linrock 90903 6708639 -psk 89957 5984901 -sunu 88614 6020673 -sterni1971 86948 5613788 -Vizvezdenec 83761 5344740 -BRAVONE 81239 5054681 -nssy 76497 5259388 -cuistot 76366 4370584 -racerschmacer 75753 5442626 -teddybaer 75125 5407666 -Pking_cda 73776 5293873 -0x3C33 73133 4670293 -jromang 72117 5054915 -solarlight 70517 5028306 -dv8silencer 70287 3883992 -Bobo1239 68515 4652287 -manap 66273 4121774 -tinker 64321 4268390 -robnjr 
57262 4053117 -Freja 56938 3733019 -ttruscott 56010 3680085 -rkl 54986 4150767 -renouve 53811 3501516 -finfish 51360 3370515 -eva42 51272 3599691 -rap 49985 3219146 -pb00067 49727 3298270 -amicic 49691 3042481 -ronaldjerum 47654 3240695 -bigpen0r 47278 3291647 -biffhero 46564 3111352 -VoyagerOne 45476 3452465 -eastorwest 45033 3071805 -speedycpu 43842 3003273 -jbwiebe 43305 2805433 -Antihistamine 41788 2761312 -mhunt 41735 2691355 -homyur 39893 2850481 -gri 39871 2515779 -oryx 38282 2944400 -Spprtr 38157 2470529 -SC 37290 2731014 -csnodgrass 36207 2688994 -jmdana 36157 2210661 -strelock 34716 2074055 -Garf 33800 2747562 -skiminki 33515 2055584 -EthanOConnor 33370 2090311 -slakovv 32915 2021889 -yurikvelo 32600 2255966 -Prcuvu 30377 2170122 -manapbk 30326 1770143 -anst 30301 2190091 -jkiiski 30136 1904470 -hyperbolic.tom 29840 2017394 -Pyafue 29650 1902349 -qurashee 27758 1509620 -OuaisBla 27636 1578800 -chriswk 26902 1868317 -achambord 26582 1767323 -Fifis 26376 1776853 -Patrick_G 26276 1801617 -yorkman 26193 1992080 -SFTUser 25182 1675689 -nabildanial 24942 1519409 -Sharaf_DG 24765 1786697 -ncfish1 24411 1520927 -agg177 23890 1395014 -JanErik 23408 1703875 -Isidor 23388 1680691 -Norabor 23164 1591830 -cisco2015 22895 1762069 -Zirie 22542 1472937 -team-oh 22272 1636708 -MazeOfGalious 21978 1629593 -sg4032 21945 1643065 -ianh2105 21725 1632562 -xor12 21628 1680365 -dex 21612 1467203 -nesoneg 21494 1463031 -jjoshua2 20997 1422689 -horst.prack 20878 1465656 -0xB00B1ES 20590 1208666 -sphinx 20515 1352368 -j3corre 20405 941444 -Adrian.Schmidt123 20316 1281436 -Ente 20017 1432602 -wei 19973 1745989 -rstoesser 19569 1293588 -eudhan 19274 1283717 -jundery 18445 1115855 -iisiraider 18247 1101015 -ville 17883 1384026 -chris 17698 1487385 -purplefishies 17595 1092533 -DMBK 17357 1279152 -DragonLord 17014 1162790 -dju 16515 929427 -IgorLeMasson 16064 1147232 -ako027ako 15671 1173203 -Nikolay.IT 15154 1068349 -Andrew Grant 15114 895539 -OssumOpossum 14857 1007129 -enedene 14476 
905279 -bpfliegel 14298 884523 -jpulman 13982 870599 -joster 13794 950160 -Nesa92 13786 1114691 -crocogoat 13753 1114622 -Hjax 13535 915487 -Dark_wizzie 13422 1007152 -mpx86 12941 693640 -mabichito 12903 749391 -thijsk 12886 722107 -AdrianSA 12860 804972 -Flopzee 12698 894821 -fatmurphy 12547 853210 -scuzzi 12511 845761 -Karby 12429 735880 -SapphireBrand 12416 969604 -modolief 12386 896470 -pgontarz 12151 848794 -stocky 11954 699440 -mschmidt 11941 803401 -infinity 11470 727027 -torbjo 11395 729145 -Thomas A. Anderson 11372 732094 -d64 11263 789184 -Maxim 11129 804704 -snicolet 11106 869170 -MooTheCow 11008 694942 -savage84 10965 641068 -Rudolphous 10915 741268 -Wolfgang 10809 580032 -rpngn 10712 688203 -basepi 10637 744851 -michaelrpg 10409 735127 -dzjp 10343 732529 -ali-al-zhrani 10324 726502 -ols 10259 570669 -lbraesch 10252 647825 +Username CPU Hours Games played +----------------------------------------------------- +noobpwnftw 27649494 1834734733 +mlang 1426107 89454622 +dew 1380910 82831648 +mibere 703840 46867607 +grandphish2 692707 41737913 +tvijlbrief 669642 42371594 +JojoM 597778 35297180 +TueRens 519226 31823562 +cw 458421 30307421 +fastgm 439667 25950040 +gvreuls 436599 28177460 +crunchy 427035 27344275 +CSU_Dynasty 374765 25106278 +Fisherman 326901 21822979 +ctoks 325477 21767943 +velislav 295343 18844324 +linrock 292789 10624427 +bcross 278584 19488961 +okrout 262818 13803272 +pemo 245982 11376085 +glinscott 217799 13780820 +leszek 212346 12959025 +nordlandia 211692 13484886 +bking_US 198894 11876016 +drabel 196463 13450602 +robal 195473 12375650 +mgrabiak 187226 12016564 +Dantist 183202 10990484 +Thanar 179852 12365359 +vdv 175274 9889046 +spams 157128 10319326 +marrco 150295 9402141 +sqrt2 147963 9724586 +mhoram 141278 8901241 +CoffeeOne 137100 5024116 +vdbergh 137041 8926915 +malala 136182 8002293 +xoto 133702 9156676 +davar 122092 7960001 +dsmith 122059 7570238 +Data 113305 8220352 +BrunoBanani 112960 7436849 +MaZePallas 102823 6633619 
+sterni1971 100532 5880772 +ElbertoOne 99028 7023771 +brabos 92118 6186135 +oz 92100 6486640 +psk 89957 5984901 +amicic 89156 5392305 +sunu 88851 6028873 +Vizvezdenec 83761 5344740 +0x3C33 82614 5271253 +BRAVONE 81239 5054681 +racerschmacer 80899 5759262 +cuistot 80300 4606144 +nssy 76497 5259388 +teddybaer 75125 5407666 +Pking_cda 73776 5293873 +jromang 72192 5057715 +solarlight 70517 5028306 +dv8silencer 70287 3883992 +Bobo1239 68515 4652287 +manap 66273 4121774 +skiminki 65088 4023328 +tinker 64333 4268790 +sschnee 60767 3500800 +qurashee 57344 3168264 +robnjr 57262 4053117 +Freja 56938 3733019 +ttruscott 56010 3680085 +rkl 55132 4164467 +renouve 53811 3501516 +finfish 51360 3370515 +eva42 51272 3599691 +rap 49985 3219146 +pb00067 49727 3298270 +ronaldjerum 47654 3240695 +bigpen0r 47653 3335327 +eastorwest 47585 3221629 +biffhero 46564 3111352 +VoyagerOne 45476 3452465 +yurikvelo 44834 3034550 +speedycpu 43842 3003273 +jbwiebe 43305 2805433 +Spprtr 42279 2680153 +DesolatedDodo 42007 2447516 +Antihistamine 41788 2761312 +mhunt 41735 2691355 +homyur 39893 2850481 +gri 39871 2515779 +Fifis 38776 2529121 +oryx 38724 2966648 +SC 37290 2731014 +csnodgrass 36207 2688994 +jmdana 36157 2210661 +strelock 34716 2074055 +rpngn 33951 2057395 +Garf 33922 2751802 +EthanOConnor 33370 2090311 +slakovv 32915 2021889 +manapbk 30987 1810399 +Prcuvu 30377 2170122 +anst 30301 2190091 +jkiiski 30136 1904470 +hyperbolic.tom 29840 2017394 +Pyafue 29650 1902349 +Wolfgang 29260 1658936 +zeryl 28156 1579911 +OuaisBla 27636 1578800 +DMBK 27051 1999456 +chriswk 26902 1868317 +achambord 26582 1767323 +Patrick_G 26276 1801617 +yorkman 26193 1992080 +SFTUser 25182 1675689 +nabildanial 24942 1519409 +Sharaf_DG 24765 1786697 +ncfish1 24411 1520927 +rodneyc 24227 1409514 +agg177 23890 1395014 +JanErik 23408 1703875 +Isidor 23388 1680691 +Norabor 23164 1591830 +cisco2015 22897 1762669 +Zirie 22542 1472937 +team-oh 22272 1636708 +MazeOfGalious 21978 1629593 +sg4032 21947 1643265 +ianh2105 21725 
1632562 +xor12 21628 1680365 +dex 21612 1467203 +nesoneg 21494 1463031 +sphinx 21211 1384728 +jjoshua2 21001 1423089 +horst.prack 20878 1465656 +Ente 20865 1477066 +0xB00B1ES 20590 1208666 +j3corre 20405 941444 +Adrian.Schmidt123 20316 1281436 +wei 19973 1745989 +MaxKlaxxMiner 19850 1009176 +rstoesser 19569 1293588 +gopeto 19491 1174952 +eudhan 19274 1283717 +jundery 18445 1115855 +megaman7de 18377 1067540 +iisiraider 18247 1101015 +ville 17883 1384026 +chris 17698 1487385 +purplefishies 17595 1092533 +dju 17353 978595 +DragonLord 17014 1162790 +IgorLeMasson 16064 1147232 +ako027ako 15671 1173203 +chuckstablers 15289 891576 +Nikolay.IT 15154 1068349 +Andrew Grant 15114 895539 +OssumOpossum 14857 1007129 +Karby 14808 867120 +enedene 14476 905279 +bpfliegel 14298 884523 +mpx86 14019 759568 +jpulman 13982 870599 +crocogoat 13803 1117422 +joster 13794 950160 +Nesa92 13786 1114691 +Hjax 13535 915487 +jsys14 13459 785000 +Dark_wizzie 13422 1007152 +mabichito 12903 749391 +thijsk 12886 722107 +AdrianSA 12860 804972 +Flopzee 12698 894821 +fatmurphy 12547 853210 +Rudolphous 12520 832340 +scuzzi 12511 845761 +SapphireBrand 12416 969604 +modolief 12386 896470 +Machariel 12335 810784 +pgontarz 12151 848794 +stocky 11954 699440 +mschmidt 11941 803401 +Maxim 11543 836024 +infinity 11470 727027 +torbjo 11395 729145 +Thomas A. 
Anderson 11372 732094 +savage84 11358 670860 +d64 11263 789184 +MooTheCow 11237 720174 +snicolet 11106 869170 +ali-al-zhrani 11086 767926 +AndreasKrug 10875 887457 +pirt 10806 836519 +basepi 10637 744851 +michaelrpg 10508 739039 +dzjp 10343 732529 +aga 10302 622975 +ols 10259 570669 +lbraesch 10252 647825 +FormazChar 10059 757283 diff --git a/src/Makefile b/src/Makefile index d3cea8de..4cfefe77 100644 --- a/src/Makefile +++ b/src/Makefile @@ -50,7 +50,7 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp material.cpp misc.cpp movegen.cpp movepick.cpp pawns.cpp position.cpp psqt.cpp \ search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \ nnue/evaluate_nnue.cpp \ - nnue/features/half_ka_v2.cpp \ + nnue/features/half_ka_v2_hm.cpp \ tools/validate_training_data.cpp \ tools/sfen_packer.cpp \ tools/training_data_generator.cpp \ @@ -72,9 +72,11 @@ VPATH = syzygy:nnue:nnue/features:eval:extra:tools # ---------------------------------------------------------------------------- # # debug = yes/no --- -DNDEBUG --- Enable/Disable debug mode -# sanitize = undefined/thread/no (-fsanitize ) +# sanitize = none/ ... (-fsanitize ) # --- ( undefined ) --- enable undefined behavior checks -# --- ( thread ) --- enable threading error checks +# --- ( thread ) --- enable threading error checks +# --- ( address ) --- enable memory access checks +# --- ...etc... --- see compiler documentation for supported sanitizers # optimize = yes/no --- (-O3/-fast etc.) --- Enable/Disable optimizations # arch = (name) --- (-arch) --- Target architecture # bits = 64/32 --- -DIS_64BIT --- 64-/32-bit operating system @@ -95,6 +97,10 @@ VPATH = syzygy:nnue:nnue/features:eval:extra:tools # Note that Makefile is space sensitive, so when adding new architectures # or modifying existing flags, you have to make sure there are no extra spaces # at the end of the line for flag values. 
+# +# Example of use for these flags: +# make build ARCH=x86-64-avx512 debug=yes sanitize="address undefined" + ### 2.1. General and architecture defaults @@ -116,7 +122,7 @@ endif optimize = yes debug = no -sanitize = no +sanitize = none bits = 64 prefetch = no popcnt = no @@ -392,10 +398,12 @@ ifeq ($(COMP),clang) ifneq ($(KERNEL),Darwin) ifneq ($(KERNEL),OpenBSD) ifneq ($(KERNEL),FreeBSD) + ifneq ($(RTLIB),compiler-rt) LDFLAGS += -latomic endif endif endif + endif ifeq ($(arch),$(filter $(arch),armv7 armv8)) ifeq ($(OS),Android) @@ -409,8 +417,12 @@ ifeq ($(COMP),clang) endif ifeq ($(KERNEL),Darwin) - CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.15 - LDFLAGS += -arch $(arch) -mmacosx-version-min=10.15 + CXXFLAGS += -mmacosx-version-min=10.15 + LDFLAGS += -mmacosx-version-min=10.15 + ifneq ($(arch),any) + CXXFLAGS += -arch $(arch) + LDFLAGS += -arch $(arch) + endif XCRUN = xcrun endif @@ -484,10 +496,10 @@ else CXXFLAGS += -g endif -### 3.2.3 Debugging with undefined behavior sanitizers -ifneq ($(sanitize),no) - CXXFLAGS += -g3 -fsanitize=$(sanitize) - LDFLAGS += -fsanitize=$(sanitize) +### 3.2.2 Debugging with undefined behavior sanitizers +ifneq ($(sanitize),none) + CXXFLAGS += -g3 $(addprefix -fsanitize=,$(sanitize)) + LDFLAGS += $(addprefix -fsanitize=,$(sanitize)) endif ### 3.3 Optimization @@ -806,7 +818,9 @@ profileclean: @rm -rf profdir @rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda *.s ./tools/*.gcda ./extra/*.gcda ./eval/*.gcda @rm -f stockfish.profdata *.profraw - @rm -f $(PGO_TRAINING_DATA_FILE) + @rm -f stockfish.exe.lto_wrapper_args + @rm -f stockfish.exe.ltrans.out + @rm -f ./-lstdc++.res default: help @@ -849,7 +863,6 @@ config-sanity: net @echo "Testing config sanity. If this fails, try 'make help' ..." 
@echo "" @test "$(debug)" = "yes" || test "$(debug)" = "no" - @test "$(sanitize)" = "undefined" || test "$(sanitize)" = "thread" || test "$(sanitize)" = "address" || test "$(sanitize)" = "no" @test "$(optimize)" = "yes" || test "$(optimize)" = "no" @test "$(SUPPORTED_ARCH)" = "true" @test "$(arch)" = "any" || test "$(arch)" = "x86_64" || test "$(arch)" = "i386" || \ @@ -889,14 +902,15 @@ clang-profile-use: all gcc-profile-make: + @mkdir -p profdir $(MAKE) ARCH=$(ARCH) COMP=$(COMP) \ - EXTRACXXFLAGS='-fprofile-generate' \ + EXTRACXXFLAGS='-fprofile-generate=profdir' \ EXTRALDFLAGS='-lgcov' \ all gcc-profile-use: $(MAKE) ARCH=$(ARCH) COMP=$(COMP) \ - EXTRACXXFLAGS='-fprofile-use -fno-peel-loops -fno-tracer' \ + EXTRACXXFLAGS='-fprofile-use=profdir -fno-peel-loops -fno-tracer' \ EXTRALDFLAGS='-lgcov' \ all diff --git a/src/evaluate.cpp b/src/evaluate.cpp index ccb7436b..74474363 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -128,30 +128,6 @@ namespace Eval { } } - /// NNUE::export_net() exports the currently loaded network to a file - void NNUE::export_net(const std::optional& filename) { - std::string actualFilename; - - if (filename.has_value()) - actualFilename = filename.value(); - else - { - if (eval_file_loaded != EvalFileDefaultName) - { - sync_cout << "Failed to export a net. A non-embedded net can only be saved if the filename is specified." << sync_endl; - return; - } - actualFilename = EvalFileDefaultName; - } - - ofstream stream(actualFilename, std::ios_base::binary); - - if (save_eval(stream)) - sync_cout << "Network saved successfully to " << actualFilename << "." << sync_endl; - else - sync_cout << "Failed to export a net." 
<< sync_endl; - } - /// NNUE::verify() verifies that the last net used was loaded successfully void NNUE::verify() { @@ -218,7 +194,7 @@ namespace Trace { else os << scores[t][WHITE] << " | " << scores[t][BLACK]; - os << " | " << scores[t][WHITE] - scores[t][BLACK] << "\n"; + os << " | " << scores[t][WHITE] - scores[t][BLACK] << " |\n"; return os; } } @@ -228,11 +204,9 @@ using namespace Trace; namespace { // Threshold for lazy and space evaluation - constexpr Value LazyThreshold1 = Value(1565); - constexpr Value LazyThreshold2 = Value(1102); - constexpr Value SpaceThreshold = Value(11551); - constexpr Value NNUEThreshold1 = Value(682); - constexpr Value NNUEThreshold2 = Value(176); + constexpr Value LazyThreshold1 = Value(3130); + constexpr Value LazyThreshold2 = Value(2204); + constexpr Value SpaceThreshold = Value(11551); // KingAttackWeights[PieceType] contains king attack weights by piece type constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 81, 52, 44, 10 }; @@ -1018,7 +992,7 @@ namespace { // Initialize score by reading the incrementally updated scores included in // the position object (material + piece square tables) and the material // imbalance. Score is computed internally from the white point of view. 
- Score score = pos.psq_score() + me->imbalance() + pos.this_thread()->contempt; + Score score = pos.psq_score() + me->imbalance() + pos.this_thread()->trend; // Probe the pawn hash table pe = Pawns::probe(pos); @@ -1026,7 +1000,7 @@ namespace { // Early exit if score is high auto lazy_skip = [&](Value lazyThreshold) { - return abs(mg_value(score) + eg_value(score)) / 2 > lazyThreshold + pos.non_pawn_material() / 64; + return abs(mg_value(score) + eg_value(score)) > lazyThreshold + pos.non_pawn_material() / 32; }; if (lazy_skip(LazyThreshold1)) @@ -1140,8 +1114,9 @@ Value Eval::evaluate(const Position& pos) { // Scale and shift NNUE for compatibility with search and classical evaluation auto adjusted_NNUE = [&]() { - - int scale = 903 + 28 * pos.count() + 28 * pos.non_pawn_material() / 1024; + int scale = 883 + + 32 * pos.count() + + 32 * pos.non_pawn_material() / 1024; Value nnue = NNUE::evaluate(pos, true) * scale / 1024; @@ -1151,30 +1126,14 @@ Value Eval::evaluate(const Position& pos) { return nnue; }; - // If there is PSQ imbalance we use the classical eval. We also introduce - // a small probability of using the classical eval when PSQ imbalance is small. + // If there is PSQ imbalance we use the classical eval, but we switch to + // NNUE eval faster when shuffling or if the material on the board is high. + int r50 = pos.rule50_count(); Value psq = Value(abs(eg_value(pos.psq_score()))); - int r50 = 16 + pos.rule50_count(); - bool largePsq = psq * 16 > (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50; - bool classical = largePsq; + bool classical = psq * 5 > (850 + pos.non_pawn_material() / 64) * (5 + r50); - // Use classical evaluation for really low piece endgames. - // One critical case is the draw for bishop + A/H file pawn vs naked king. - bool lowPieceEndgame = pos.non_pawn_material() == BishopValueMg - || (pos.non_pawn_material() < 2 * RookValueMg && pos.count() < 2); - - v = classical || lowPieceEndgame ? 
Evaluation(pos).value() - : adjusted_NNUE(); - - // If the classical eval is small and imbalance large, use NNUE nevertheless. - // For the case of opposite colored bishops, switch to NNUE eval with small - // probability if the classical eval is less than the threshold. - if ( largePsq - && !lowPieceEndgame - && ( abs(v) * 16 < NNUEThreshold2 * r50 - || ( pos.opposite_bishops() - && abs(v) * 16 < (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50))) - v = adjusted_NNUE(); + v = classical ? Evaluation(pos).value() // classical + : adjusted_NNUE(); // NNUE } // Damp down the evaluation linearly when shuffling @@ -1191,7 +1150,7 @@ Value Eval::evaluate(const Position& pos) { /// descriptions and values of each evaluation term. Useful for debugging. /// Trace scores are from white's point of view -std::string Eval::trace(const Position& pos) { +std::string Eval::trace(Position& pos) { if (pos.checkers()) return "Final evaluation: none (in check)"; @@ -1203,44 +1162,53 @@ std::string Eval::trace(const Position& pos) { std::memset(scores, 0, sizeof(scores)); - pos.this_thread()->contempt = SCORE_ZERO; // Reset any dynamic contempt + pos.this_thread()->trend = SCORE_ZERO; // Reset any dynamic contempt v = Evaluation(pos).value(); ss << std::showpoint << std::noshowpos << std::fixed << std::setprecision(2) - << " Term | White | Black | Total \n" - << " | MG EG | MG EG | MG EG \n" - << " ------------+-------------+-------------+------------\n" - << " Material | " << Term(MATERIAL) - << " Imbalance | " << Term(IMBALANCE) - << " Pawns | " << Term(PAWN) - << " Knights | " << Term(KNIGHT) - << " Bishops | " << Term(BISHOP) - << " Rooks | " << Term(ROOK) - << " Queens | " << Term(QUEEN) - << " Mobility | " << Term(MOBILITY) - << " King safety | " << Term(KING) - << " Threats | " << Term(THREAT) - << " Passed | " << Term(PASSED) - << " Space | " << Term(SPACE) - << " Winnable | " << Term(WINNABLE) - << " ------------+-------------+-------------+------------\n" - << " Total | 
" << Term(TOTAL); - - v = pos.side_to_move() == WHITE ? v : -v; - - ss << "\nClassical evaluation: " << to_cp(v) << " (white side)\n"; + << " Contributing terms for the classical eval:\n" + << "+------------+-------------+-------------+-------------+\n" + << "| Term | White | Black | Total |\n" + << "| | MG EG | MG EG | MG EG |\n" + << "+------------+-------------+-------------+-------------+\n" + << "| Material | " << Term(MATERIAL) + << "| Imbalance | " << Term(IMBALANCE) + << "| Pawns | " << Term(PAWN) + << "| Knights | " << Term(KNIGHT) + << "| Bishops | " << Term(BISHOP) + << "| Rooks | " << Term(ROOK) + << "| Queens | " << Term(QUEEN) + << "| Mobility | " << Term(MOBILITY) + << "|King safety | " << Term(KING) + << "| Threats | " << Term(THREAT) + << "| Passed | " << Term(PASSED) + << "| Space | " << Term(SPACE) + << "| Winnable | " << Term(WINNABLE) + << "+------------+-------------+-------------+-------------+\n" + << "| Total | " << Term(TOTAL) + << "+------------+-------------+-------------+-------------+\n"; + if (NNUE::useNNUE != NNUE::UseNNUEMode::False) + ss << '\n' << NNUE::trace(pos) << '\n'; + + ss << std::showpoint << std::showpos << std::fixed << std::setprecision(2) << std::setw(15); + + v = pos.side_to_move() == WHITE ? v : -v; + ss << "\nClassical evaluation " << to_cp(v) << " (white side)\n"; if (NNUE::useNNUE != NNUE::UseNNUEMode::False) { - v = NNUE::evaluate(pos); + v = NNUE::evaluate(pos, false); v = pos.side_to_move() == WHITE ? v : -v; - ss << "\nNNUE evaluation: " << to_cp(v) << " (white side)\n"; + ss << "NNUE evaluation " << to_cp(v) << " (white side)\n"; } v = evaluate(pos); v = pos.side_to_move() == WHITE ? 
v : -v; - ss << "\nFinal evaluation: " << to_cp(v) << " (white side)\n"; + ss << "Final evaluation " << to_cp(v) << " (white side)"; + if (NNUE::useNNUE != NNUE::UseNNUEMode::False) + ss << " [with scaled NNUE, hybrid, ...]"; + ss << "\n"; return ss.str(); } diff --git a/src/evaluate.h b/src/evaluate.h index fa16a93d..8c91b807 100644 --- a/src/evaluate.h +++ b/src/evaluate.h @@ -30,13 +30,13 @@ class Position; namespace Eval { - std::string trace(const Position& pos); + std::string trace(Position& pos); Value evaluate(const Position& pos); // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue // for the build process (profile-build and fishtest) to work. Do not change the // name of the macro, as it is used in the Makefile. - #define EvalFileDefaultName "nn-7756374aaed3.nnue" + #define EvalFileDefaultName "nn-e8321e467bf6.nnue" namespace NNUE { enum struct UseNNUEMode @@ -49,13 +49,17 @@ namespace Eval { extern UseNNUEMode useNNUE; extern std::string eval_file_loaded; + std::string trace(Position& pos); Value evaluate(const Position& pos, bool adjusted = false); + + void init(); + void verify(); + bool load_eval(std::string name, std::istream& stream); bool save_eval(std::ostream& stream); - void init(); - void export_net(const std::optional& filename); - void verify(); - } + bool save_eval(const std::optional& filename); + + } // namespace NNUE } // namespace Eval diff --git a/src/misc.cpp b/src/misc.cpp index 7a5559ce..d17e5c7e 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -380,6 +380,7 @@ void std_aligned_free(void* ptr) { static void* aligned_large_pages_alloc_windows(size_t allocSize) { #if !defined(_WIN64) + (void)allocSize; // suppress unused-parameter compiler warning return nullptr; #else diff --git a/src/misc.h b/src/misc.h index 1a574c58..99b8c3bb 100644 --- a/src/misc.h +++ b/src/misc.h @@ -89,9 +89,10 @@ std::ostream& operator<<(std::ostream&, SyncCout); #define sync_cout std::cout << IO_LOCK #define sync_endl std::endl << 
IO_UNLOCK -// `ptr` must point to an array of size at least -// `sizeof(T) * N + alignment` bytes, where `N` is the -// number of elements in the array. + +// align_ptr_up() : get the first aligned element of an array. +// ptr must point to an array of size at least `sizeof(T) * N + alignment` bytes, +// where N is the number of elements in the array. template T* align_ptr_up(T* ptr) { @@ -101,6 +102,12 @@ T* align_ptr_up(T* ptr) return reinterpret_cast(reinterpret_cast((ptrint + (Alignment - 1)) / Alignment * Alignment)); } + +// IsLittleEndian : true if and only if the binary is compiled on a little endian machine +static inline const union { uint32_t i; char c[4]; } Le = { 0x01020304 }; +static inline const bool IsLittleEndian = (Le.c[0] == 4); + + template class ValueListInserter { public: diff --git a/src/movegen.cpp b/src/movegen.cpp index bb81aeac..5f3ba90a 100644 --- a/src/movegen.cpp +++ b/src/movegen.cpp @@ -26,21 +26,16 @@ namespace Stockfish { namespace { template - ExtMove* make_promotions(ExtMove* moveList, Square to, Square ksq) { + ExtMove* make_promotions(ExtMove* moveList, Square to) { if (Type == CAPTURES || Type == EVASIONS || Type == NON_EVASIONS) - { *moveList++ = make(to - D, to, QUEEN); - if (attacks_bb(to) & ksq) - *moveList++ = make(to - D, to, KNIGHT); - } if (Type == QUIETS || Type == EVASIONS || Type == NON_EVASIONS) { *moveList++ = make(to - D, to, ROOK); *moveList++ = make(to - D, to, BISHOP); - if (!(attacks_bb(to) & ksq)) - *moveList++ = make(to - D, to, KNIGHT); + *moveList++ = make(to - D, to, KNIGHT); } return moveList; @@ -57,7 +52,6 @@ namespace { constexpr Direction UpRight = (Us == WHITE ? NORTH_EAST : SOUTH_WEST); constexpr Direction UpLeft = (Us == WHITE ? NORTH_WEST : SOUTH_EAST); - const Square ksq = pos.square(Them); const Bitboard emptySquares = Type == QUIETS || Type == QUIET_CHECKS ? target : ~pos.pieces(); const Bitboard enemies = Type == EVASIONS ? pos.checkers() : Type == CAPTURES ? 
target : pos.pieces(Them); @@ -82,6 +76,7 @@ namespace { // To make a quiet check, you either make a direct check by pushing a pawn // or push a blocker pawn that is not on the same file as the enemy king. // Discovered check promotion has been already generated amongst the captures. + Square ksq = pos.square(Them); Bitboard dcCandidatePawns = pos.blockers_for_king(Them) & ~file_bb(ksq); b1 &= pawn_attacks_bb(Them, ksq) | shift< Up>(dcCandidatePawns); b2 &= pawn_attacks_bb(Them, ksq) | shift(dcCandidatePawns); @@ -111,13 +106,13 @@ namespace { b3 &= target; while (b1) - moveList = make_promotions(moveList, pop_lsb(b1), ksq); + moveList = make_promotions(moveList, pop_lsb(b1)); while (b2) - moveList = make_promotions(moveList, pop_lsb(b2), ksq); + moveList = make_promotions(moveList, pop_lsb(b2)); while (b3) - moveList = make_promotions(moveList, pop_lsb(b3), ksq); + moveList = make_promotions(moveList, pop_lsb(b3)); } // Standard and en passant captures @@ -206,6 +201,7 @@ namespace { moveList = generate_moves(pos, moveList, target); moveList = generate_moves(pos, moveList, target); } + if (!Checks || pos.blockers_for_king(~Us) & ksq) { Bitboard b = attacks_bb(ksq) & (Type == EVASIONS ? ~pos.pieces(Us) : target); @@ -227,10 +223,10 @@ namespace { } // namespace -/// Generates all pseudo-legal captures plus queen and checking knight promotions -/// Generates all pseudo-legal non-captures and underpromotions (except checking knight) +/// Generates all pseudo-legal captures plus queen promotions +/// Generates all pseudo-legal non-captures and underpromotions /// Generates all pseudo-legal check evasions when the side to move is in check -/// Generates all pseudo-legal non-captures giving check, except castling +/// Generates all pseudo-legal non-captures giving check, except castling and promotions /// Generates all pseudo-legal captures and non-captures /// /// Returns a pointer to the end of the move list. 
diff --git a/src/movepick.cpp b/src/movepick.cpp index 4ff4cff4..20640fe2 100644 --- a/src/movepick.cpp +++ b/src/movepick.cpp @@ -111,7 +111,7 @@ void MovePicker::score() { + (*continuationHistory[1])[pos.moved_piece(m)][to_sq(m)] + (*continuationHistory[3])[pos.moved_piece(m)][to_sq(m)] + (*continuationHistory[5])[pos.moved_piece(m)][to_sq(m)] - + (ply < MAX_LPH ? std::min(4, depth / 3) * (*lowPlyHistory)[ply][from_to(m)] : 0); + + (ply < MAX_LPH ? 6 * (*lowPlyHistory)[ply][from_to(m)] : 0); else // Type == EVASIONS { diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp index cee77fe9..891f8faa 100644 --- a/src/nnue/evaluate_nnue.cpp +++ b/src/nnue/evaluate_nnue.cpp @@ -20,6 +20,9 @@ #include #include +#include +#include +#include #include "../evaluate.h" #include "../position.h" @@ -158,29 +161,214 @@ namespace Stockfish::Eval::NNUE { ASSERT_ALIGNED(buffer, alignment); const std::size_t bucket = (pos.count() - 1) / 4; - const auto [psqt, lazy] = featureTransformer->transform(pos, transformedFeatures, bucket); + const auto psqt = featureTransformer->transform(pos, transformedFeatures, bucket); + const auto output = network[bucket]->propagate(transformedFeatures, buffer); - if (lazy) - return static_cast(psqt / OutputScale); - else - { + int materialist = psqt; + int positional = output[0]; + + int delta_npm = abs(pos.non_pawn_material(WHITE) - pos.non_pawn_material(BLACK)); + int entertainment = (adjusted && delta_npm <= BishopValueMg - KnightValueMg ? 
7 : 0); + + int A = 128 - entertainment; + int B = 128 + entertainment; + + int sum = (A * materialist + B * positional) / 128; + + return static_cast( sum / OutputScale ); + } + + struct NnueEvalTrace { + static_assert(LayerStacks == PSQTBuckets); + + Value psqt[LayerStacks]; + Value positional[LayerStacks]; + std::size_t correctBucket; + }; + + static NnueEvalTrace trace_evaluate(const Position& pos) { + + // We manually align the arrays on the stack because with gcc < 9.3 + // overaligning stack variables with alignas() doesn't work correctly. + + constexpr uint64_t alignment = CacheLineSize; + +#if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN) + TransformedFeatureType transformedFeaturesUnaligned[ + FeatureTransformer::BufferSize + alignment / sizeof(TransformedFeatureType)]; + char bufferUnaligned[Network::BufferSize + alignment]; + + auto* transformedFeatures = align_ptr_up(&transformedFeaturesUnaligned[0]); + auto* buffer = align_ptr_up(&bufferUnaligned[0]); +#else + alignas(alignment) + TransformedFeatureType transformedFeatures[FeatureTransformer::BufferSize]; + alignas(alignment) char buffer[Network::BufferSize]; +#endif + + ASSERT_ALIGNED(transformedFeatures, alignment); + ASSERT_ALIGNED(buffer, alignment); + + NnueEvalTrace t{}; + t.correctBucket = (pos.count() - 1) / 4; + for (std::size_t bucket = 0; bucket < LayerStacks; ++bucket) { + const auto psqt = featureTransformer->transform(pos, transformedFeatures, bucket); const auto output = network[bucket]->propagate(transformedFeatures, buffer); int materialist = psqt; int positional = output[0]; - int delta_npm = abs(pos.non_pawn_material(WHITE) - pos.non_pawn_material(BLACK)); - int entertainment = (adjusted && delta_npm <= BishopValueMg - KnightValueMg ? 
7 : 0); + t.psqt[bucket] = static_cast( materialist / OutputScale ); + t.positional[bucket] = static_cast( positional / OutputScale ); + } - int A = 128 - entertainment; - int B = 128 + entertainment; + return t; + } - int sum = (A * materialist + B * positional) / 128; + static const std::string PieceToChar(" PNBRQK pnbrqk"); - return static_cast( sum / OutputScale ); + + // format_cp_compact() converts a Value into (centi)pawns and writes it in a buffer. + // The buffer must have capacity for at least 5 chars. + static void format_cp_compact(Value v, char* buffer) { + + buffer[0] = (v < 0 ? '-' : v > 0 ? '+' : ' '); + + int cp = std::abs(100 * v / PawnValueEg); + if (cp >= 10000) + { + buffer[1] = '0' + cp / 10000; cp %= 10000; + buffer[2] = '0' + cp / 1000; cp %= 1000; + buffer[3] = '0' + cp / 100; cp %= 100; + buffer[4] = ' '; + } + else if (cp >= 1000) + { + buffer[1] = '0' + cp / 1000; cp %= 1000; + buffer[2] = '0' + cp / 100; cp %= 100; + buffer[3] = '.'; + buffer[4] = '0' + cp / 10; + } + else + { + buffer[1] = '0' + cp / 100; cp %= 100; + buffer[2] = '.'; + buffer[3] = '0' + cp / 10; cp %= 10; + buffer[4] = '0' + cp / 1; } } + + // format_cp_aligned_dot() converts a Value into (centi)pawns and writes it in a buffer, + // always keeping two decimals. The buffer must have capacity for at least 7 chars. + static void format_cp_aligned_dot(Value v, char* buffer) { + + buffer[0] = (v < 0 ? '-' : v > 0 ? '+' : ' '); + + double cp = 1.0 * std::abs(int(v)) / PawnValueEg; + sprintf(&buffer[1], "%6.2f", cp); + } + + + // trace() returns a string with the value of each piece on a board, + // and a table for (PSQT, Layers) values bucket by bucket. 
+ + std::string trace(Position& pos) { + + std::stringstream ss; + + char board[3*8+1][8*8+2]; + std::memset(board, ' ', sizeof(board)); + for (int row = 0; row < 3*8+1; ++row) + board[row][8*8+1] = '\0'; + + // A lambda to output one box of the board + auto writeSquare = [&board](File file, Rank rank, Piece pc, Value value) { + + const int x = ((int)file) * 8; + const int y = (7 - (int)rank) * 3; + for (int i = 1; i < 8; ++i) + board[y][x+i] = board[y+3][x+i] = '-'; + for (int i = 1; i < 3; ++i) + board[y+i][x] = board[y+i][x+8] = '|'; + board[y][x] = board[y][x+8] = board[y+3][x+8] = board[y+3][x] = '+'; + if (pc != NO_PIECE) + board[y+1][x+4] = PieceToChar[pc]; + if (value != VALUE_NONE) + format_cp_compact(value, &board[y+2][x+2]); + }; + + // We estimate the value of each piece by doing a differential evaluation from + // the current base eval, simulating the removal of the piece from its square. + Value base = evaluate(pos); + base = pos.side_to_move() == WHITE ? base : -base; + + for (File f = FILE_A; f <= FILE_H; ++f) + for (Rank r = RANK_1; r <= RANK_8; ++r) + { + Square sq = make_square(f, r); + Piece pc = pos.piece_on(sq); + Value v = VALUE_NONE; + + if (pc != NO_PIECE && type_of(pc) != KING) + { + auto st = pos.state(); + + pos.remove_piece(sq); + st->accumulator.computed[WHITE] = false; + st->accumulator.computed[BLACK] = false; + + Value eval = evaluate(pos); + eval = pos.side_to_move() == WHITE ? eval : -eval; + v = base - eval; + + pos.put_piece(pc, sq); + st->accumulator.computed[WHITE] = false; + st->accumulator.computed[BLACK] = false; + } + + writeSquare(f, r, pc, v); + } + + ss << " NNUE derived piece values:\n"; + for (int row = 0; row < 3*8+1; ++row) + ss << board[row] << '\n'; + ss << '\n'; + + auto t = trace_evaluate(pos); + + ss << " NNUE network contributions " + << (pos.side_to_move() == WHITE ? 
"(White to move)" : "(Black to move)") << std::endl + << "+------------+------------+------------+------------+\n" + << "| Bucket | Material | Positional | Total |\n" + << "| | (PSQT) | (Layers) | |\n" + << "+------------+------------+------------+------------+\n"; + + for (std::size_t bucket = 0; bucket < LayerStacks; ++bucket) + { + char buffer[3][8]; + std::memset(buffer, '\0', sizeof(buffer)); + + format_cp_aligned_dot(t.psqt[bucket], buffer[0]); + format_cp_aligned_dot(t.positional[bucket], buffer[1]); + format_cp_aligned_dot(t.psqt[bucket] + t.positional[bucket], buffer[2]); + + ss << "| " << bucket << " " + << " | " << buffer[0] << " " + << " | " << buffer[1] << " " + << " | " << buffer[2] << " " + << " |"; + if (bucket == t.correctBucket) + ss << " <-- this bucket is used"; + ss << '\n'; + } + + ss << "+------------+------------+------------+------------+\n"; + + return ss.str(); + } + + // Load eval, from a file stream or a memory stream bool load_eval(std::string name, std::istream& stream) { @@ -198,4 +386,35 @@ namespace Stockfish::Eval::NNUE { return write_parameters(stream); } + /// Save eval, to a file given by its name + bool save_eval(const std::optional& filename) { + + std::string actualFilename; + std::string msg; + + if (filename.has_value()) + actualFilename = filename.value(); + else + { + if (eval_file_loaded != EvalFileDefaultName) + { + msg = "Failed to export a net. A non-embedded net can only be saved if the filename is specified"; + + sync_cout << msg << sync_endl; + return false; + } + actualFilename = EvalFileDefaultName; + } + + std::ofstream stream(actualFilename, std::ios_base::binary); + bool saved = save_eval(stream); + + msg = saved ? 
"Network saved successfully to " + actualFilename + : "Failed to export a net"; + + sync_cout << msg << sync_endl; + return saved; + } + + } // namespace Stockfish::Eval::NNUE diff --git a/src/nnue/features/half_ka_v2.cpp b/src/nnue/features/half_ka_v2_hm.cpp similarity index 68% rename from src/nnue/features/half_ka_v2.cpp rename to src/nnue/features/half_ka_v2_hm.cpp index 57f43e50..098a6d60 100644 --- a/src/nnue/features/half_ka_v2.cpp +++ b/src/nnue/features/half_ka_v2_hm.cpp @@ -16,31 +16,32 @@ along with this program. If not, see . */ -//Definition of input features HalfKAv2 of NNUE evaluation function +//Definition of input features HalfKAv2_hm of NNUE evaluation function -#include "half_ka_v2.h" +#include "half_ka_v2_hm.h" #include "../../position.h" namespace Stockfish::Eval::NNUE::Features { // Orient a square according to perspective (rotates by 180 for black) - inline Square HalfKAv2::orient(Color perspective, Square s) { - return Square(int(s) ^ (bool(perspective) * 56)); + inline Square HalfKAv2_hm::orient(Color perspective, Square s, Square ksq) { + return Square(int(s) ^ (bool(perspective) * SQ_A8) ^ ((file_of(ksq) < FILE_E) * SQ_H1)); } // Index of a feature for a given king position and another piece on some square - inline IndexType HalfKAv2::make_index(Color perspective, Square s, Piece pc, Square ksq) { - return IndexType(orient(perspective, s) + PieceSquareIndex[perspective][pc] + PS_NB * ksq); + inline IndexType HalfKAv2_hm::make_index(Color perspective, Square s, Piece pc, Square ksq) { + Square o_ksq = orient(perspective, ksq, ksq); + return IndexType(orient(perspective, s, ksq) + PieceSquareIndex[perspective][pc] + PS_NB * KingBuckets[o_ksq]); } // Get a list of indices for active features - void HalfKAv2::append_active_indices( + void HalfKAv2_hm::append_active_indices( const Position& pos, Color perspective, ValueListInserter active ) { - Square ksq = orient(perspective, pos.square(perspective)); + Square ksq = pos.square(perspective); 
Bitboard bb = pos.pieces(); while (bb) { @@ -52,7 +53,7 @@ namespace Stockfish::Eval::NNUE::Features { // append_changed_indices() : get a list of indices for recently changed features - void HalfKAv2::append_changed_indices( + void HalfKAv2_hm::append_changed_indices( Square ksq, StateInfo* st, Color perspective, @@ -60,25 +61,24 @@ namespace Stockfish::Eval::NNUE::Features { ValueListInserter added ) { const auto& dp = st->dirtyPiece; - Square oriented_ksq = orient(perspective, ksq); for (int i = 0; i < dp.dirty_num; ++i) { Piece pc = dp.piece[i]; if (dp.from[i] != SQ_NONE) - removed.push_back(make_index(perspective, dp.from[i], pc, oriented_ksq)); + removed.push_back(make_index(perspective, dp.from[i], pc, ksq)); if (dp.to[i] != SQ_NONE) - added.push_back(make_index(perspective, dp.to[i], pc, oriented_ksq)); + added.push_back(make_index(perspective, dp.to[i], pc, ksq)); } } - int HalfKAv2::update_cost(StateInfo* st) { + int HalfKAv2_hm::update_cost(StateInfo* st) { return st->dirtyPiece.dirty_num; } - int HalfKAv2::refresh_cost(const Position& pos) { + int HalfKAv2_hm::refresh_cost(const Position& pos) { return pos.count(); } - bool HalfKAv2::requires_refresh(StateInfo* st, Color perspective) { + bool HalfKAv2_hm::requires_refresh(StateInfo* st, Color perspective) { return st->dirtyPiece.piece[0] == make_piece(perspective, KING); } diff --git a/src/nnue/features/half_ka_v2.h b/src/nnue/features/half_ka_v2_hm.h similarity index 80% rename from src/nnue/features/half_ka_v2.h rename to src/nnue/features/half_ka_v2_hm.h index e4b2edd9..2c1144f6 100644 --- a/src/nnue/features/half_ka_v2.h +++ b/src/nnue/features/half_ka_v2_hm.h @@ -18,8 +18,8 @@ //Definition of input features HalfKP of NNUE evaluation function -#ifndef NNUE_FEATURES_HALF_KA_V2_H_INCLUDED -#define NNUE_FEATURES_HALF_KA_V2_H_INCLUDED +#ifndef NNUE_FEATURES_HALF_KA_V2_HM_H_INCLUDED +#define NNUE_FEATURES_HALF_KA_V2_HM_H_INCLUDED #include "../nnue_common.h" @@ -32,9 +32,9 @@ namespace Stockfish { 
namespace Stockfish::Eval::NNUE::Features { - // Feature HalfKAv2: Combination of the position of own king - // and the position of pieces - class HalfKAv2 { + // Feature HalfKAv2_hm: Combination of the position of own king + // and the position of pieces. Position mirrored such that king always on e..h files. + class HalfKAv2_hm { // unique number for each piece type on each square enum { @@ -63,21 +63,32 @@ namespace Stockfish::Eval::NNUE::Features { }; // Orient a square according to perspective (rotates by 180 for black) - static Square orient(Color perspective, Square s); + static Square orient(Color perspective, Square s, Square ksq); // Index of a feature for a given king position and another piece on some square static IndexType make_index(Color perspective, Square s, Piece pc, Square ksq); public: // Feature name - static constexpr const char* Name = "HalfKAv2(Friend)"; + static constexpr const char* Name = "HalfKAv2_hm(Friend)"; // Hash value embedded in the evaluation file - static constexpr std::uint32_t HashValue = 0x5f234cb8u; + static constexpr std::uint32_t HashValue = 0x7f234cb8u; // Number of feature dimensions static constexpr IndexType Dimensions = - static_cast(SQUARE_NB) * static_cast(PS_NB); + static_cast(SQUARE_NB) * static_cast(PS_NB) / 2; + + static constexpr int KingBuckets[64] = { + -1, -1, -1, -1, 31, 30, 29, 28, + -1, -1, -1, -1, 27, 26, 25, 24, + -1, -1, -1, -1, 23, 22, 21, 20, + -1, -1, -1, -1, 19, 18, 17, 16, + -1, -1, -1, -1, 15, 14, 13, 12, + -1, -1, -1, -1, 11, 10, 9, 8, + -1, -1, -1, -1, 7, 6, 5, 4, + -1, -1, -1, -1, 3, 2, 1, 0 + }; // Maximum number of simultaneously active features. 
static constexpr IndexType MaxActiveDimensions = 32; @@ -108,4 +119,4 @@ namespace Stockfish::Eval::NNUE::Features { } // namespace Stockfish::Eval::NNUE::Features -#endif // #ifndef NNUE_FEATURES_HALF_KA_V2_H_INCLUDED +#endif // #ifndef NNUE_FEATURES_HALF_KA_V2_HM_H_INCLUDED diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index 9a3b778e..d1318368 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -46,6 +46,11 @@ namespace Stockfish::Eval::NNUE::Layers { #elif defined (USE_SSSE3) static constexpr const IndexType OutputSimdWidth = SimdWidth / 4; #endif +#if defined (USE_AVX512) + static constexpr const IndexType InputSimdWidth = SimdWidth * 2; +#elif defined (USE_SSSE3) + static constexpr const IndexType InputSimdWidth = SimdWidth; +#endif // Size of forward propagation buffer used in this layer static constexpr std::size_t SelfBufferSize = @@ -72,6 +77,15 @@ namespace Stockfish::Eval::NNUE::Layers { for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) #if !defined (USE_SSSE3) weights[i] = read_little_endian(stream); +#elif defined (USE_VNNI) || defined (USE_AVX512) + if constexpr (OutputDimensions <= 8 && OutputDimensions != 1) + weights[i] = read_little_endian(stream); + else + weights[ + (i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4 + + i / PaddedInputDimensions * 4 + + i % 4 + ] = read_little_endian(stream); #else weights[ (i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4 + @@ -108,7 +122,6 @@ namespace Stockfish::Eval::NNUE::Layers { return !stream.fail(); } - // Forward propagation const OutputType* propagate( const TransformedFeatureType* transformedFeatures, char* buffer) const { @@ -123,6 +136,40 @@ namespace Stockfish::Eval::NNUE::Layers { return _mm512_reduce_add_epi32(sum) + bias; }; + [[maybe_unused]] auto m512_hadd128x16_interleave = []( + __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3) -> __m512i { + + __m512i sum01a = 
_mm512_unpacklo_epi32(sum0, sum1); + __m512i sum01b = _mm512_unpackhi_epi32(sum0, sum1); + + __m512i sum23a = _mm512_unpacklo_epi32(sum2, sum3); + __m512i sum23b = _mm512_unpackhi_epi32(sum2, sum3); + + __m512i sum01 = _mm512_add_epi32(sum01a, sum01b); + __m512i sum23 = _mm512_add_epi32(sum23a, sum23b); + + __m512i sum0123a = _mm512_unpacklo_epi64(sum01, sum23); + __m512i sum0123b = _mm512_unpackhi_epi64(sum01, sum23); + + return _mm512_add_epi32(sum0123a, sum0123b); + }; + + [[maybe_unused]] auto m512_haddx4 = [m512_hadd128x16_interleave]( + __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3, __m128i bias) -> __m128i { + + __m512i sum = m512_hadd128x16_interleave(sum0, sum1, sum2, sum3); + + __m256i sum256lo = _mm512_castsi512_si256(sum); + __m256i sum256hi = _mm512_extracti64x4_epi64(sum, 1); + + sum256lo = _mm256_add_epi32(sum256lo, sum256hi); + + __m128i sum128lo = _mm256_castsi256_si128(sum256lo); + __m128i sum128hi = _mm256_extracti128_si256(sum256lo, 1); + + return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias); + }; + [[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) { #if defined (USE_VNNI) acc = _mm512_dpbusd_epi32(acc, a, b); @@ -133,6 +180,19 @@ namespace Stockfish::Eval::NNUE::Layers { #endif }; + [[maybe_unused]] auto m512_add_dpbusd_epi32x2 = [=](__m512i& acc, __m512i a0, __m512i b0, __m512i a1, __m512i b1) { +#if defined (USE_VNNI) + acc = _mm512_dpbusd_epi32(acc, a0, b0); + acc = _mm512_dpbusd_epi32(acc, a1, b1); +#else + __m512i product0 = _mm512_maddubs_epi16(a0, b0); + __m512i product1 = _mm512_maddubs_epi16(a1, b1); + product0 = _mm512_adds_epi16(product0, product1); + product0 = _mm512_madd_epi16(product0, Ones512); + acc = _mm512_add_epi32(acc, product0); +#endif + }; + [[maybe_unused]] auto m512_add_dpbusd_epi32x4 = [=](__m512i& acc, __m512i a0, __m512i b0, __m512i a1, __m512i b1, __m512i a2, __m512i b2, __m512i a3, __m512i b3) { #if defined (USE_VNNI) @@ -165,6 +225,18 @@ namespace 
Stockfish::Eval::NNUE::Layers { return _mm_cvtsi128_si32(sum128) + bias; }; + [[maybe_unused]] auto m256_haddx4 = [](__m256i sum0, __m256i sum1, __m256i sum2, __m256i sum3, __m128i bias) -> __m128i { + sum0 = _mm256_hadd_epi32(sum0, sum1); + sum2 = _mm256_hadd_epi32(sum2, sum3); + + sum0 = _mm256_hadd_epi32(sum0, sum2); + + __m128i sum128lo = _mm256_castsi256_si128(sum0); + __m128i sum128hi = _mm256_extracti128_si256(sum0, 1); + + return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias); + }; + [[maybe_unused]] auto m256_add_dpbusd_epi32 = [=](__m256i& acc, __m256i a, __m256i b) { #if defined (USE_VNNI) acc = _mm256_dpbusd_epi32(acc, a, b); @@ -175,6 +247,19 @@ namespace Stockfish::Eval::NNUE::Layers { #endif }; + [[maybe_unused]] auto m256_add_dpbusd_epi32x2 = [=](__m256i& acc, __m256i a0, __m256i b0, __m256i a1, __m256i b1) { +#if defined (USE_VNNI) + acc = _mm256_dpbusd_epi32(acc, a0, b0); + acc = _mm256_dpbusd_epi32(acc, a1, b1); +#else + __m256i product0 = _mm256_maddubs_epi16(a0, b0); + __m256i product1 = _mm256_maddubs_epi16(a1, b1); + product0 = _mm256_adds_epi16(product0, product1); + product0 = _mm256_madd_epi16(product0, Ones256); + acc = _mm256_add_epi32(acc, product0); +#endif + }; + [[maybe_unused]] auto m256_add_dpbusd_epi32x4 = [=](__m256i& acc, __m256i a0, __m256i b0, __m256i a1, __m256i b1, __m256i a2, __m256i b2, __m256i a3, __m256i b3) { #if defined (USE_VNNI) @@ -206,12 +291,27 @@ namespace Stockfish::Eval::NNUE::Layers { return _mm_cvtsi128_si32(sum) + bias; }; + [[maybe_unused]] auto m128_haddx4 = [](__m128i sum0, __m128i sum1, __m128i sum2, __m128i sum3, __m128i bias) -> __m128i { + sum0 = _mm_hadd_epi32(sum0, sum1); + sum2 = _mm_hadd_epi32(sum2, sum3); + sum0 = _mm_hadd_epi32(sum0, sum2); + return _mm_add_epi32(sum0, bias); + }; + [[maybe_unused]] auto m128_add_dpbusd_epi32 = [=](__m128i& acc, __m128i a, __m128i b) { __m128i product0 = _mm_maddubs_epi16(a, b); product0 = _mm_madd_epi16(product0, Ones128); acc = _mm_add_epi32(acc, 
product0); }; + [[maybe_unused]] auto m128_add_dpbusd_epi32x2 = [=](__m128i& acc, __m128i a0, __m128i b0, __m128i a1, __m128i b1) { + __m128i product0 = _mm_maddubs_epi16(a0, b0); + __m128i product1 = _mm_maddubs_epi16(a1, b1); + product0 = _mm_adds_epi16(product0, product1); + product0 = _mm_madd_epi16(product0, Ones128); + acc = _mm_add_epi32(acc, product0); + }; + [[maybe_unused]] auto m128_add_dpbusd_epi32x4 = [=](__m128i& acc, __m128i a0, __m128i b0, __m128i a1, __m128i b1, __m128i a2, __m128i b2, __m128i a3, __m128i b3) { __m128i product0 = _mm_maddubs_epi16(a0, b0); @@ -231,45 +331,130 @@ namespace Stockfish::Eval::NNUE::Layers { using vec_t = __m512i; #define vec_setzero _mm512_setzero_si512 #define vec_set_32 _mm512_set1_epi32 - auto& vec_add_dpbusd_32 = m512_add_dpbusd_epi32; - auto& vec_add_dpbusd_32x4 = m512_add_dpbusd_epi32x4; - auto& vec_hadd = m512_hadd; + [[maybe_unused]] auto& vec_add_dpbusd_32 = m512_add_dpbusd_epi32; + [[maybe_unused]] auto& vec_add_dpbusd_32x2 = m512_add_dpbusd_epi32x2; + [[maybe_unused]] auto& vec_add_dpbusd_32x4 = m512_add_dpbusd_epi32x4; + [[maybe_unused]] auto& vec_hadd = m512_hadd; + [[maybe_unused]] auto& vec_haddx4 = m512_haddx4; #elif defined (USE_AVX2) using vec_t = __m256i; #define vec_setzero _mm256_setzero_si256 #define vec_set_32 _mm256_set1_epi32 - auto& vec_add_dpbusd_32 = m256_add_dpbusd_epi32; - auto& vec_add_dpbusd_32x4 = m256_add_dpbusd_epi32x4; - auto& vec_hadd = m256_hadd; + [[maybe_unused]] auto& vec_add_dpbusd_32 = m256_add_dpbusd_epi32; + [[maybe_unused]] auto& vec_add_dpbusd_32x2 = m256_add_dpbusd_epi32x2; + [[maybe_unused]] auto& vec_add_dpbusd_32x4 = m256_add_dpbusd_epi32x4; + [[maybe_unused]] auto& vec_hadd = m256_hadd; + [[maybe_unused]] auto& vec_haddx4 = m256_haddx4; #elif defined (USE_SSSE3) using vec_t = __m128i; #define vec_setzero _mm_setzero_si128 #define vec_set_32 _mm_set1_epi32 - auto& vec_add_dpbusd_32 = m128_add_dpbusd_epi32; - auto& vec_add_dpbusd_32x4 = m128_add_dpbusd_epi32x4; - auto& 
vec_hadd = m128_hadd; + [[maybe_unused]] auto& vec_add_dpbusd_32 = m128_add_dpbusd_epi32; + [[maybe_unused]] auto& vec_add_dpbusd_32x2 = m128_add_dpbusd_epi32x2; + [[maybe_unused]] auto& vec_add_dpbusd_32x4 = m128_add_dpbusd_epi32x4; + [[maybe_unused]] auto& vec_hadd = m128_hadd; + [[maybe_unused]] auto& vec_haddx4 = m128_haddx4; #endif #if defined (USE_SSSE3) - // Different layout, we process 4 inputs at a time, always. - static_assert(InputDimensions % 4 == 0); - const auto output = reinterpret_cast(buffer); const auto inputVector = reinterpret_cast(input); +#endif - static_assert(OutputDimensions % OutputSimdWidth == 0 || OutputDimensions == 1); +#if defined (USE_VNNI) || defined (USE_AVX512) + + static_assert(OutputDimensions == 1 || OutputDimensions % 4 == 0); // OutputDimensions is either 1 or a multiple of SimdWidth // because then it is also an input dimension. - if constexpr (OutputDimensions % OutputSimdWidth == 0) + if constexpr (OutputDimensions <= 8 && OutputDimensions != 1) { - constexpr IndexType NumChunks = InputDimensions / 4; + constexpr IndexType NumChunks = PaddedInputDimensions / InputSimdWidth; + static_assert(NumChunks % 2 == 0); + + const auto input_vec = reinterpret_cast(input); + const auto bias_vec = reinterpret_cast(biases); + auto out_vec = reinterpret_cast<__m128i*>(output); + + vec_t regs[OutputDimensions]; + for (IndexType k = 0; k < OutputDimensions; ++k) + regs[k] = vec_setzero(); + + for (IndexType i = 0; i < NumChunks / 2; ++i) + { + const vec_t in0 = input_vec[i * 2 + 0]; + const vec_t in1 = input_vec[i * 2 + 1]; + for (IndexType k = 0; k < OutputDimensions; ++k) + { + const vec_t w0 = reinterpret_cast(&weights[k * PaddedInputDimensions])[i * 2 + 0]; + const vec_t w1 = reinterpret_cast(&weights[k * PaddedInputDimensions])[i * 2 + 1]; + vec_add_dpbusd_32(regs[k], in0, w0); + vec_add_dpbusd_32(regs[k], in1, w1); + } + } + + for (IndexType k = 0; k < OutputDimensions / 4; ++k) + { + out_vec[k] = vec_haddx4( + regs[k * 4 + 0], + 
regs[k * 4 + 1], + regs[k * 4 + 2], + regs[k * 4 + 3], + bias_vec[k] + ); + } + } + else if constexpr (InputDimensions == 8) + { + const auto input32 = reinterpret_cast(input); + __m256i* outptr = reinterpret_cast<__m256i*>(output); + std::memcpy(output, biases, OutputDimensions * sizeof(OutputType)); + + const __m256i in0 = _mm256_set1_epi32(input32[0]); + const __m256i in1 = _mm256_set1_epi32(input32[1]); + const auto col0 = reinterpret_cast(&weights[0]); + const auto col1 = reinterpret_cast(&weights[OutputDimensions * 4]); + for (IndexType j = 0; j * 8 < OutputDimensions; ++j) + m256_add_dpbusd_epi32x2(outptr[j], in0, col0[j], in1, col1[j]); + } + else + +#elif defined (USE_SSSE3) + + if constexpr (OutputDimensions % OutputSimdWidth == 0 && InputDimensions == 8) + { const auto input32 = reinterpret_cast(input); vec_t* outptr = reinterpret_cast(output); std::memcpy(output, biases, OutputDimensions * sizeof(OutputType)); - for (int i = 0; i < (int)NumChunks - 3; i += 4) + const vec_t in0 = vec_set_32(input32[0]); + const vec_t in1 = vec_set_32(input32[1]); + const auto col0 = reinterpret_cast(&weights[0]); + const auto col1 = reinterpret_cast(&weights[OutputDimensions * 4]); + for (IndexType j = 0; j * OutputSimdWidth < OutputDimensions; ++j) + vec_add_dpbusd_32x2(outptr[j], in0, col0[j], in1, col1[j]); + } + else + +#endif + +#if defined (USE_SSSE3) + + if constexpr (OutputDimensions % OutputSimdWidth == 0) + { + static_assert(InputDimensions % 16 == 0); + + constexpr IndexType NumChunks = InputDimensions / 4; + constexpr IndexType NumRegs = OutputDimensions / OutputSimdWidth; + + const auto input32 = reinterpret_cast(input); + const vec_t* biasvec = reinterpret_cast(biases); + vec_t outs[NumRegs]; + for (IndexType k = 0; k < NumRegs; ++k) + outs[k] = biasvec[k]; + + for (IndexType i = 0; i < NumChunks; i += 4) { const vec_t in0 = vec_set_32(input32[i + 0]); const vec_t in1 = vec_set_32(input32[i + 1]); @@ -279,12 +464,18 @@ namespace 
Stockfish::Eval::NNUE::Layers { const auto col1 = reinterpret_cast(&weights[(i + 1) * OutputDimensions * 4]); const auto col2 = reinterpret_cast(&weights[(i + 2) * OutputDimensions * 4]); const auto col3 = reinterpret_cast(&weights[(i + 3) * OutputDimensions * 4]); - for (int j = 0; j * OutputSimdWidth < OutputDimensions; ++j) - vec_add_dpbusd_32x4(outptr[j], in0, col0[j], in1, col1[j], in2, col2[j], in3, col3[j]); + for (IndexType k = 0; k < NumRegs; ++k) + vec_add_dpbusd_32x4(outs[k], in0, col0[k], in1, col1[k], in2, col2[k], in3, col3[k]); } + + vec_t* outptr = reinterpret_cast(output); + for (IndexType k = 0; k < NumRegs; ++k) + outptr[k] = outs[k]; } else if constexpr (OutputDimensions == 1) { + static_assert(InputDimensions % 4 == 0); + #if defined (USE_AVX512) if constexpr (PaddedInputDimensions % (SimdWidth * 2) != 0) { @@ -329,8 +520,8 @@ namespace Stockfish::Eval::NNUE::Layers { #if defined(USE_SSE2) // At least a multiple of 16, with SSE2. - static_assert(InputDimensions % SimdWidth == 0); - constexpr IndexType NumChunks = InputDimensions / SimdWidth; + static_assert(PaddedInputDimensions % SimdWidth == 0); + constexpr IndexType NumChunks = PaddedInputDimensions / SimdWidth; const __m128i Zeros = _mm_setzero_si128(); const auto inputVector = reinterpret_cast(input); @@ -341,8 +532,8 @@ namespace Stockfish::Eval::NNUE::Layers { const auto inputVector = reinterpret_cast(input); #elif defined(USE_NEON) - static_assert(InputDimensions % SimdWidth == 0); - constexpr IndexType NumChunks = InputDimensions / SimdWidth; + static_assert(PaddedInputDimensions % SimdWidth == 0); + constexpr IndexType NumChunks = PaddedInputDimensions / SimdWidth; const auto inputVector = reinterpret_cast(input); #endif @@ -415,6 +606,13 @@ namespace Stockfish::Eval::NNUE::Layers { _mm_empty(); #endif +#endif + +#if (!defined (USE_SSSE3) && defined (USE_SSE2)) || defined (USE_NEON) + static_assert(SimdWidth <= 16, "Otherwise we run outside of the padding for the output."); + if 
constexpr (SimdWidth > OutputDimensions && OutputDimensions != 1) + for (IndexType i = OutputDimensions; i < SimdWidth; ++i) + output[i] = 0; #endif return output; diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h index e24902c4..d41ecf95 100644 --- a/src/nnue/nnue_accumulator.h +++ b/src/nnue/nnue_accumulator.h @@ -25,14 +25,11 @@ namespace Stockfish::Eval::NNUE { - // The accumulator of a StateInfo without parent is set to the INIT state - enum AccumulatorState { EMPTY, COMPUTED, INIT }; - // Class that holds the result of affine transformation of input features struct alignas(CacheLineSize) Accumulator { std::int16_t accumulation[2][TransformedFeatureDimensions]; std::int32_t psqtAccumulation[2][PSQTBuckets]; - AccumulatorState state[2]; + bool computed[2]; }; } // namespace Stockfish::Eval::NNUE diff --git a/src/nnue/nnue_architecture.h b/src/nnue/nnue_architecture.h index 879a39cd..193a197d 100644 --- a/src/nnue/nnue_architecture.h +++ b/src/nnue/nnue_architecture.h @@ -23,7 +23,7 @@ #include "nnue_common.h" -#include "features/half_ka_v2.h" +#include "features/half_ka_v2_hm.h" #include "layers/input_slice.h" #include "layers/affine_transform.h" @@ -32,10 +32,10 @@ namespace Stockfish::Eval::NNUE { // Input features used in evaluation function - using FeatureSet = Features::HalfKAv2; + using FeatureSet = Features::HalfKAv2_hm; // Number of input feature dimensions after conversion - constexpr IndexType TransformedFeatureDimensions = 512; + constexpr IndexType TransformedFeatureDimensions = 1024; constexpr IndexType PSQTBuckets = 8; constexpr IndexType LayerStacks = 8; @@ -43,7 +43,7 @@ namespace Stockfish::Eval::NNUE { // Define network structure using InputLayer = InputSlice; - using HiddenLayer1 = ClippedReLU>; + using HiddenLayer1 = ClippedReLU>; using HiddenLayer2 = ClippedReLU>; using OutputLayer = AffineTransform; diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h index 26f7267f..efc33fb8 100644 --- a/src/nnue/nnue_common.h 
+++ b/src/nnue/nnue_common.h @@ -26,6 +26,8 @@ #include #include +#include "../misc.h" // for IsLittleEndian + #if defined(USE_AVX2) #include @@ -88,37 +90,77 @@ namespace Stockfish::Eval::NNUE { // necessary to return a result with the byte ordering of the compiling machine. template inline IntType read_little_endian(std::istream& stream) { - IntType result; - std::uint8_t u[sizeof(IntType)]; - typename std::make_unsigned::type v = 0; - stream.read(reinterpret_cast(u), sizeof(IntType)); - for (std::size_t i = 0; i < sizeof(IntType); ++i) - v = (v << 8) | u[sizeof(IntType) - i - 1]; + if (IsLittleEndian) + stream.read(reinterpret_cast(&result), sizeof(IntType)); + else + { + std::uint8_t u[sizeof(IntType)]; + typename std::make_unsigned::type v = 0; + + stream.read(reinterpret_cast(u), sizeof(IntType)); + for (std::size_t i = 0; i < sizeof(IntType); ++i) + v = (v << 8) | u[sizeof(IntType) - i - 1]; + + std::memcpy(&result, &v, sizeof(IntType)); + } - std::memcpy(&result, &v, sizeof(IntType)); return result; } + // write_little_endian() is our utility to write an integer (signed or unsigned, any size) + // to a stream in little-endian order. We swap the byte order before the write if + // necessary to always write in little endian order, independently of the byte + // ordering of the compiling machine.
template inline void write_little_endian(std::ostream& stream, IntType value) { - std::uint8_t u[sizeof(IntType)]; - typename std::make_unsigned::type v = value; + if (IsLittleEndian) + stream.write(reinterpret_cast(&value), sizeof(IntType)); + else + { + std::uint8_t u[sizeof(IntType)]; + typename std::make_unsigned::type v = value; - std::size_t i = 0; - // if constexpr to silence the warning about shift by 8 - if constexpr (sizeof(IntType) > 1) { - for (; i + 1 < sizeof(IntType); ++i) { - u[i] = v; - v >>= 8; - } + std::size_t i = 0; + // if constexpr to silence the warning about shift by 8 + if constexpr (sizeof(IntType) > 1) + { + for (; i + 1 < sizeof(IntType); ++i) + { + u[i] = v; + v >>= 8; + } + } + u[i] = v; + + stream.write(reinterpret_cast(u), sizeof(IntType)); } - u[i] = v; - - stream.write(reinterpret_cast(u), sizeof(IntType)); } + + // read_little_endian(s, out, N) : read integers in bulk from a little endian stream. + // This reads N integers from stream s and puts them in array out. + template + inline void read_little_endian(std::istream& stream, IntType* out, std::size_t count) { + if (IsLittleEndian) + stream.read(reinterpret_cast(out), sizeof(IntType) * count); + else + for (std::size_t i = 0; i < count; ++i) + out[i] = read_little_endian(stream); + } + + // write_little_endian(s, values, N) : write integers in bulk to a little endian stream. + // This takes N integers from array values and writes them on stream s.
+ template + inline void write_little_endian(std::ostream& stream, const IntType* values, std::size_t count) { + if (IsLittleEndian) + stream.write(reinterpret_cast(values), sizeof(IntType) * count); + else + for (std::size_t i = 0; i < count; ++i) + write_little_endian(stream, values[i]); + } + } // namespace Stockfish::Eval::NNUE #endif // #ifndef NNUE_COMMON_H_INCLUDED diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 85ab8481..47fe9c06 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -31,12 +31,17 @@ namespace Stockfish::Eval::NNUE { + using BiasType = std::int16_t; + using WeightType = std::int16_t; + using PSQTWeightType = std::int32_t; + // If vector instructions are enabled, we update and refresh the // accumulator tile by tile such that each tile fits in the CPU's // vector registers. #define VECTOR - static_assert(PSQTBuckets == 8, "Assumed by the current choice of constants."); + static_assert(PSQTBuckets % 8 == 0, + "Per feature PSQT values cannot be processed at granularity lower than 8 at a time."); #ifdef USE_AVX512 typedef __m512i vec_t; @@ -50,8 +55,7 @@ namespace Stockfish::Eval::NNUE { #define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b) #define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b) #define vec_zero_psqt() _mm256_setzero_si256() - static constexpr IndexType NumRegs = 8; // only 8 are needed - static constexpr IndexType NumPsqtRegs = 1; + #define NumRegistersSIMD 32 #elif USE_AVX2 typedef __m256i vec_t; @@ -65,8 +69,7 @@ namespace Stockfish::Eval::NNUE { #define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b) #define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b) #define vec_zero_psqt() _mm256_setzero_si256() - static constexpr IndexType NumRegs = 16; - static constexpr IndexType NumPsqtRegs = 1; + #define NumRegistersSIMD 16 #elif USE_SSE2 typedef __m128i vec_t; @@ -80,8 +83,7 @@ namespace Stockfish::Eval::NNUE { #define vec_add_psqt_32(a,b) _mm_add_epi32(a,b) 
#define vec_sub_psqt_32(a,b) _mm_sub_epi32(a,b) #define vec_zero_psqt() _mm_setzero_si128() - static constexpr IndexType NumRegs = Is64Bit ? 16 : 8; - static constexpr IndexType NumPsqtRegs = 2; + #define NumRegistersSIMD (Is64Bit ? 16 : 8) #elif USE_MMX typedef __m64 vec_t; @@ -95,8 +97,7 @@ namespace Stockfish::Eval::NNUE { #define vec_add_psqt_32(a,b) _mm_add_pi32(a,b) #define vec_sub_psqt_32(a,b) _mm_sub_pi32(a,b) #define vec_zero_psqt() _mm_setzero_si64() - static constexpr IndexType NumRegs = 8; - static constexpr IndexType NumPsqtRegs = 4; + #define NumRegistersSIMD 8 #elif USE_NEON typedef int16x8_t vec_t; @@ -110,14 +111,61 @@ namespace Stockfish::Eval::NNUE { #define vec_add_psqt_32(a,b) vaddq_s32(a,b) #define vec_sub_psqt_32(a,b) vsubq_s32(a,b) #define vec_zero_psqt() psqt_vec_t{0} - static constexpr IndexType NumRegs = 16; - static constexpr IndexType NumPsqtRegs = 2; + #define NumRegistersSIMD 16 #else #undef VECTOR #endif + + #ifdef VECTOR + + // Compute optimal SIMD register count for feature transformer accumulation. + + // We use __m* types as template arguments, which causes GCC to emit warnings + // about losing some attribute information. This is irrelevant to us as we + // only take their size, so the following pragma are harmless. 
+ #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wignored-attributes" + + template + static constexpr int BestRegisterCount() + { + #define RegisterSize sizeof(SIMDRegisterType) + #define LaneSize sizeof(LaneType) + + static_assert(RegisterSize >= LaneSize); + static_assert(MaxRegisters <= NumRegistersSIMD); + static_assert(MaxRegisters > 0); + static_assert(NumRegistersSIMD > 0); + static_assert(RegisterSize % LaneSize == 0); + static_assert((NumLanes * LaneSize) % RegisterSize == 0); + + const int ideal = (NumLanes * LaneSize) / RegisterSize; + if (ideal <= MaxRegisters) + return ideal; + + // Look for the largest divisor of the ideal register count that is smaller than MaxRegisters + for (int divisor = MaxRegisters; divisor > 1; --divisor) + if (ideal % divisor == 0) + return divisor; + + return 1; + } + + static constexpr int NumRegs = BestRegisterCount(); + static constexpr int NumPsqtRegs = BestRegisterCount(); + + #pragma GCC diagnostic pop + + #endif + + + // Input feature converter class FeatureTransformer { @@ -125,8 +173,6 @@ namespace Stockfish::Eval::NNUE { // Number of output dimensions for one side static constexpr IndexType HalfDimensions = TransformedFeatureDimensions; - static constexpr int LazyThreshold = 1400; - #ifdef VECTOR static constexpr IndexType TileHeight = NumRegs * sizeof(vec_t) / 2; static constexpr IndexType PsqtTileHeight = NumPsqtRegs * sizeof(psqt_vec_t) / 4; @@ -153,26 +199,26 @@ namespace Stockfish::Eval::NNUE { // Read network parameters bool read_parameters(std::istream& stream) { - for (std::size_t i = 0; i < HalfDimensions; ++i) - biases[i] = read_little_endian(stream); - for (std::size_t i = 0; i < HalfDimensions * InputDimensions; ++i) - weights[i] = read_little_endian(stream); - for (std::size_t i = 0; i < PSQTBuckets * InputDimensions; ++i) - psqtWeights[i] = read_little_endian(stream); + + read_little_endian(stream, biases , HalfDimensions ); + read_little_endian(stream, weights , HalfDimensions * 
InputDimensions); + read_little_endian(stream, psqtWeights, PSQTBuckets * InputDimensions); + return !stream.fail(); } // Write network parameters bool write_parameters(std::ostream& stream) const { - for (std::size_t i = 0; i < HalfDimensions; ++i) - write_little_endian(stream, biases[i]); - for (std::size_t i = 0; i < HalfDimensions * InputDimensions; ++i) - write_little_endian(stream, weights[i]); + + write_little_endian(stream, biases , HalfDimensions ); + write_little_endian(stream, weights , HalfDimensions * InputDimensions); + write_little_endian(stream, psqtWeights, PSQTBuckets * InputDimensions); + return !stream.fail(); } // Convert input features - std::pair transform(const Position& pos, OutputType* output, int bucket) const { + std::int32_t transform(const Position& pos, OutputType* output, int bucket) const { update_accumulator(pos, WHITE); update_accumulator(pos, BLACK); @@ -181,121 +227,144 @@ namespace Stockfish::Eval::NNUE { const auto& psqtAccumulation = pos.state()->accumulator.psqtAccumulation; const auto psqt = ( - psqtAccumulation[static_cast(perspectives[0])][bucket] - - psqtAccumulation[static_cast(perspectives[1])][bucket] + psqtAccumulation[perspectives[0]][bucket] + - psqtAccumulation[perspectives[1]][bucket] ) / 2; - if (abs(psqt) > LazyThreshold * OutputScale) - return { psqt, true }; #if defined(USE_AVX512) + constexpr IndexType NumChunks = HalfDimensions / (SimdWidth * 2); static_assert(HalfDimensions % (SimdWidth * 2) == 0); const __m512i Control = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); const __m512i Zero = _mm512_setzero_si512(); + for (IndexType p = 0; p < 2; ++p) + { + const IndexType offset = HalfDimensions * p; + auto out = reinterpret_cast<__m512i*>(&output[offset]); + for (IndexType j = 0; j < NumChunks; ++j) + { + __m512i sum0 = _mm512_load_si512(&reinterpret_cast + (accumulation[perspectives[p]])[j * 2 + 0]); + __m512i sum1 = _mm512_load_si512(&reinterpret_cast + (accumulation[perspectives[p]])[j * 2 + 1]); + + 
_mm512_store_si512(&out[j], _mm512_permutexvar_epi64(Control, + _mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), Zero))); + } + } + return psqt; + #elif defined(USE_AVX2) + constexpr IndexType NumChunks = HalfDimensions / SimdWidth; constexpr int Control = 0b11011000; const __m256i Zero = _mm256_setzero_si256(); - #elif defined(USE_SSE2) - constexpr IndexType NumChunks = HalfDimensions / SimdWidth; + for (IndexType p = 0; p < 2; ++p) + { + const IndexType offset = HalfDimensions * p; + auto out = reinterpret_cast<__m256i*>(&output[offset]); + for (IndexType j = 0; j < NumChunks; ++j) + { + __m256i sum0 = _mm256_load_si256(&reinterpret_cast + (accumulation[perspectives[p]])[j * 2 + 0]); + __m256i sum1 = _mm256_load_si256(&reinterpret_cast + (accumulation[perspectives[p]])[j * 2 + 1]); - #ifdef USE_SSE41 + _mm256_store_si256(&out[j], _mm256_permute4x64_epi64( + _mm256_max_epi8(_mm256_packs_epi16(sum0, sum1), Zero), Control)); + } + } + return psqt; + + #elif defined(USE_SSE2) + + #ifdef USE_SSE41 + constexpr IndexType NumChunks = HalfDimensions / SimdWidth; const __m128i Zero = _mm_setzero_si128(); - #else + #else + constexpr IndexType NumChunks = HalfDimensions / SimdWidth; const __m128i k0x80s = _mm_set1_epi8(-128); - #endif + #endif + + for (IndexType p = 0; p < 2; ++p) + { + const IndexType offset = HalfDimensions * p; + auto out = reinterpret_cast<__m128i*>(&output[offset]); + for (IndexType j = 0; j < NumChunks; ++j) + { + __m128i sum0 = _mm_load_si128(&reinterpret_cast + (accumulation[perspectives[p]])[j * 2 + 0]); + __m128i sum1 = _mm_load_si128(&reinterpret_cast + (accumulation[perspectives[p]])[j * 2 + 1]); + const __m128i packedbytes = _mm_packs_epi16(sum0, sum1); + + #ifdef USE_SSE41 + _mm_store_si128(&out[j], _mm_max_epi8(packedbytes, Zero)); + #else + _mm_store_si128(&out[j], _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)); + #endif + } + } + return psqt; #elif defined(USE_MMX) + constexpr IndexType NumChunks = HalfDimensions / SimdWidth; 
const __m64 k0x80s = _mm_set1_pi8(-128); + for (IndexType p = 0; p < 2; ++p) + { + const IndexType offset = HalfDimensions * p; + auto out = reinterpret_cast<__m64*>(&output[offset]); + for (IndexType j = 0; j < NumChunks; ++j) + { + __m64 sum0 = *(&reinterpret_cast(accumulation[perspectives[p]])[j * 2 + 0]); + __m64 sum1 = *(&reinterpret_cast(accumulation[perspectives[p]])[j * 2 + 1]); + const __m64 packedbytes = _mm_packs_pi16(sum0, sum1); + out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s); + } + } + _mm_empty(); + return psqt; + #elif defined(USE_NEON) + constexpr IndexType NumChunks = HalfDimensions / (SimdWidth / 2); const int8x8_t Zero = {0}; - #endif - - for (IndexType p = 0; p < 2; ++p) { - const IndexType offset = HalfDimensions * p; - - #if defined(USE_AVX512) - auto out = reinterpret_cast<__m512i*>(&output[offset]); - for (IndexType j = 0; j < NumChunks; ++j) { - __m512i sum0 = _mm512_load_si512( - &reinterpret_cast(accumulation[perspectives[p]])[j * 2 + 0]); - __m512i sum1 = _mm512_load_si512( - &reinterpret_cast(accumulation[perspectives[p]])[j * 2 + 1]); - _mm512_store_si512(&out[j], _mm512_permutexvar_epi64(Control, - _mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), Zero))); - } - - #elif defined(USE_AVX2) - auto out = reinterpret_cast<__m256i*>(&output[offset]); - for (IndexType j = 0; j < NumChunks; ++j) { - __m256i sum0 = _mm256_load_si256( - &reinterpret_cast(accumulation[perspectives[p]])[j * 2 + 0]); - __m256i sum1 = _mm256_load_si256( - &reinterpret_cast(accumulation[perspectives[p]])[j * 2 + 1]); - _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8( - _mm256_packs_epi16(sum0, sum1), Zero), Control)); - } - - #elif defined(USE_SSE2) - auto out = reinterpret_cast<__m128i*>(&output[offset]); - for (IndexType j = 0; j < NumChunks; ++j) { - __m128i sum0 = _mm_load_si128(&reinterpret_cast( - accumulation[perspectives[p]])[j * 2 + 0]); - __m128i sum1 = _mm_load_si128(&reinterpret_cast( - 
accumulation[perspectives[p]])[j * 2 + 1]); - const __m128i packedbytes = _mm_packs_epi16(sum0, sum1); - - _mm_store_si128(&out[j], - - #ifdef USE_SSE41 - _mm_max_epi8(packedbytes, Zero) - #else - _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s) - #endif - - ); - } - - #elif defined(USE_MMX) - auto out = reinterpret_cast<__m64*>(&output[offset]); - for (IndexType j = 0; j < NumChunks; ++j) { - __m64 sum0 = *(&reinterpret_cast( - accumulation[perspectives[p]])[j * 2 + 0]); - __m64 sum1 = *(&reinterpret_cast( - accumulation[perspectives[p]])[j * 2 + 1]); - const __m64 packedbytes = _mm_packs_pi16(sum0, sum1); - out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s); - } - - #elif defined(USE_NEON) - const auto out = reinterpret_cast(&output[offset]); - for (IndexType j = 0; j < NumChunks; ++j) { - int16x8_t sum = reinterpret_cast( - accumulation[perspectives[p]])[j]; - out[j] = vmax_s8(vqmovn_s16(sum), Zero); - } - - #else - for (IndexType j = 0; j < HalfDimensions; ++j) { - BiasType sum = accumulation[static_cast(perspectives[p])][j]; - output[offset + j] = static_cast( - std::max(0, std::min(127, sum))); - } - #endif + for (IndexType p = 0; p < 2; ++p) + { + const IndexType offset = HalfDimensions * p; + const auto out = reinterpret_cast(&output[offset]); + for (IndexType j = 0; j < NumChunks; ++j) + { + int16x8_t sum = reinterpret_cast(accumulation[perspectives[p]])[j]; + out[j] = vmax_s8(vqmovn_s16(sum), Zero); + } } - #if defined(USE_MMX) - _mm_empty(); + return psqt; + + #else + + for (IndexType p = 0; p < 2; ++p) + { + const IndexType offset = HalfDimensions * p; + for (IndexType j = 0; j < HalfDimensions; ++j) + { + BiasType sum = accumulation[perspectives[p]][j]; + output[offset + j] = static_cast(std::max(0, std::min(127, sum))); + } + } + return psqt; + #endif - return { psqt, false }; - } + } // end of function transform() + + private: void update_accumulator(const Position& pos, const Color perspective) const { @@ -317,7 +386,7 @@ 
namespace Stockfish::Eval::NNUE { // of the estimated gain in terms of features to be added/subtracted. StateInfo *st = pos.state(), *next = nullptr; int gain = FeatureSet::refresh_cost(pos); - while (st->accumulator.state[perspective] == EMPTY) + while (st->previous && !st->accumulator.computed[perspective]) { // This governs when a full feature refresh is needed and how many // updates are better than just one full refresh. @@ -328,7 +397,7 @@ namespace Stockfish::Eval::NNUE { st = st->previous; } - if (st->accumulator.state[perspective] == COMPUTED) + if (st->accumulator.computed[perspective]) { if (next == nullptr) return; @@ -346,8 +415,8 @@ namespace Stockfish::Eval::NNUE { ksq, st2, perspective, removed[1], added[1]); // Mark the accumulators as computed. - next->accumulator.state[perspective] = COMPUTED; - pos.state()->accumulator.state[perspective] = COMPUTED; + next->accumulator.computed[perspective] = true; + pos.state()->accumulator.computed[perspective] = true; // Now update the accumulators listed in states_to_update[], where the last element is a sentinel. 
StateInfo *states_to_update[3] = @@ -467,7 +536,7 @@ namespace Stockfish::Eval::NNUE { { // Refresh the accumulator auto& accumulator = pos.state()->accumulator; - accumulator.state[perspective] = COMPUTED; + accumulator.computed[perspective] = true; IndexList active; FeatureSet::append_active_indices(pos, perspective, active); @@ -539,10 +608,6 @@ namespace Stockfish::Eval::NNUE { #endif } - using BiasType = std::int16_t; - using WeightType = std::int16_t; - using PSQTWeightType = std::int32_t; - alignas(CacheLineSize) BiasType biases[HalfDimensions]; alignas(CacheLineSize) WeightType weights[HalfDimensions * InputDimensions]; alignas(CacheLineSize) PSQTWeightType psqtWeights[InputDimensions * PSQTBuckets]; diff --git a/src/position.cpp b/src/position.cpp index b497196d..6bb2edb4 100644 --- a/src/position.cpp +++ b/src/position.cpp @@ -256,8 +256,6 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th set_castling_right(c, rsq); } - set_state(st); - // 4. En passant square. // Ignore if square is invalid or not on side to move relative rank 6. 
bool enpassant = false; @@ -271,24 +269,12 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th // a) side to move have a pawn threatening epSquare // b) there is an enemy pawn in front of epSquare // c) there is no piece on epSquare or behind epSquare - // d) enemy pawn didn't block a check of its own color by moving forward enpassant = pawn_attacks_bb(~sideToMove, st->epSquare) & pieces(sideToMove, PAWN) && (pieces(~sideToMove, PAWN) & (st->epSquare + pawn_push(~sideToMove))) - && !(pieces() & (st->epSquare | (st->epSquare + pawn_push(sideToMove)))) - && ( file_of(square(sideToMove)) == file_of(st->epSquare) - || !(blockers_for_king(sideToMove) & (st->epSquare + pawn_push(~sideToMove)))); + && !(pieces() & (st->epSquare | (st->epSquare + pawn_push(sideToMove)))); } - // It's necessary for st->previous to be intialized in this way because legality check relies on its existence - if (enpassant) { - st->previous = new StateInfo(); - remove_piece(st->epSquare - pawn_push(sideToMove)); - st->previous->checkersBB = attackers_to(square(~sideToMove)) & pieces(sideToMove); - st->previous->blockersForKing[WHITE] = slider_blockers(pieces(BLACK), square(WHITE), st->previous->pinners[BLACK]); - st->previous->blockersForKing[BLACK] = slider_blockers(pieces(WHITE), square(BLACK), st->previous->pinners[WHITE]); - put_piece(make_piece(~sideToMove, PAWN), st->epSquare - pawn_push(sideToMove)); - } - else + if (!enpassant) st->epSquare = SQ_NONE; // 5-6. 
Halfmove clock and fullmove number @@ -300,8 +286,7 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th chess960 = isChess960; thisThread = th; - st->accumulator.state[WHITE] = Eval::NNUE::INIT; - st->accumulator.state[BLACK] = Eval::NNUE::INIT; + set_state(st); assert(pos_is_ok()); @@ -522,11 +507,23 @@ bool Position::legal(Move m) const { assert(color_of(moved_piece(m)) == us); assert(piece_on(square(us)) == make_piece(us, KING)); - // st->previous->blockersForKing consider capsq as empty. - // If pinned, it has to move along the king ray. + // En passant captures are a tricky special case. Because they are rather + // uncommon, we do it simply by testing whether the king is attacked after + // the move is made. if (type_of(m) == EN_PASSANT) - return !(st->previous->blockersForKing[sideToMove] & from) - || aligned(from, to, square(us)); + { + Square ksq = square(us); + Square capsq = to - pawn_push(us); + Bitboard occupied = (pieces() ^ from ^ capsq) | to; + + assert(to == ep_square()); + assert(moved_piece(m) == make_piece(us, PAWN)); + assert(piece_on(capsq) == make_piece(~us, PAWN)); + assert(piece_on(to) == NO_PIECE); + + return !(attacks_bb< ROOK>(ksq, occupied) & pieces(~us, QUEEN, ROOK)) + && !(attacks_bb(ksq, occupied) & pieces(~us, QUEEN, BISHOP)); + } // Castling moves generation does not check if the castling path is clear of // enemy attacks, it is delayed at a later time: now! @@ -659,15 +656,18 @@ bool Position::gives_check(Move m) const { case PROMOTION: return attacks_bb(promotion_type(m), to, pieces() ^ from) & square(~sideToMove); - // The double-pushed pawn blocked a check? En Passant will remove the blocker. - // The only discovery check that wasn't handle is through capsq and fromsq - // So the King must be in the same rank as fromsq to consider this possibility. - // st->previous->blockersForKing consider capsq as empty. + // En passant capture with check? 
We have already handled the case + // of direct checks and ordinary discovered check, so the only case we + // need to handle is the unusual case of a discovered check through + // the captured pawn. case EN_PASSANT: - return st->previous->checkersBB - || ( rank_of(square(~sideToMove)) == rank_of(from) - && st->previous->blockersForKing[~sideToMove] & from); + { + Square capsq = make_square(file_of(to), rank_of(from)); + Bitboard b = (pieces() ^ from ^ capsq) | to; + return (attacks_bb< ROOK>(square(~sideToMove), b) & pieces(sideToMove, QUEEN, ROOK)) + | (attacks_bb(square(~sideToMove), b) & pieces(sideToMove, QUEEN, BISHOP)); + } default: //CASTLING { // Castling is encoded as 'king captures the rook' @@ -707,8 +707,8 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) { ++st->pliesFromNull; // Used by NNUE - st->accumulator.state[WHITE] = Eval::NNUE::EMPTY; - st->accumulator.state[BLACK] = Eval::NNUE::EMPTY; + st->accumulator.computed[WHITE] = false; + st->accumulator.computed[BLACK] = false; auto& dp = st->dirtyPiece; dp.dirty_num = 1; @@ -1009,8 +1009,8 @@ void Position::do_null_move(StateInfo& newSt) { // Used by NNUE st->dirtyPiece.dirty_num = 0; st->dirtyPiece.piece[0] = NO_PIECE; // Avoid checks in UpdateAccumulator() - st->accumulator.state[WHITE] = Eval::NNUE::EMPTY; - st->accumulator.state[BLACK] = Eval::NNUE::EMPTY; + st->accumulator.computed[WHITE] = false; + st->accumulator.computed[BLACK] = false; if (st->epSquare != SQ_NONE) { @@ -1086,8 +1086,9 @@ bool Position::see_ge(Move m, Value threshold) const { if (swap <= 0) return true; + assert(color_of(piece_on(from)) == sideToMove); Bitboard occupied = pieces() ^ from ^ to; - Color stm = color_of(piece_on(from)); + Color stm = sideToMove; Bitboard attackers = attackers_to(to, occupied); Bitboard stmAttackers, bb; int res = 1; diff --git a/src/position.h b/src/position.h index c0193a9f..20f999bc 100644 --- a/src/position.h +++ b/src/position.h @@ -197,6 +197,9 @@ public: // Returns the 
position of the ball on the c side. Square king_square(Color c) const { return lsb(pieces(c, KING)); } + + void put_piece(Piece pc, Square s); + void remove_piece(Square s); private: // Initialization helpers (used while setting up a position) @@ -205,8 +208,6 @@ private: void set_check_info(StateInfo* si) const; // Other helpers - void put_piece(Piece pc, Square s); - void remove_piece(Square s); void move_piece(Square from, Square to); template void do_castling(Color us, Square from, Square& to, Square& rfrom, Square& rto); @@ -413,7 +414,7 @@ inline void Position::remove_piece(Square s) { byTypeBB[ALL_PIECES] ^= s; byTypeBB[type_of(pc)] ^= s; byColorBB[color_of(pc)] ^= s; - /* board[s] = NO_PIECE; Not needed, overwritten by the capturing one */ + board[s] = NO_PIECE; pieceCount[pc]--; pieceCount[make_piece(color_of(pc), ALL_PIECES)]--; psq -= PSQT::psq[pc][s]; diff --git a/src/search.cpp b/src/search.cpp index be137f33..b4b3303b 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -54,7 +54,7 @@ bool Search::prune_at_shallow_depth = true; namespace { // Different node types, used as a template parameter - enum NodeType { NonPV, PV }; + enum NodeType { NonPV, PV, Root }; constexpr uint64_t TtHitAverageWindow = 4096; constexpr uint64_t TtHitAverageResolution = 1024; @@ -97,10 +97,10 @@ namespace { Move best = MOVE_NONE; }; - template + template Value search(Position& pos, Stack* ss, Value alpha, Value beta, Depth depth, bool cutNode); - template + template Value qsearch(Position& pos, Stack* ss, Value alpha, Value beta, Depth depth = 0); Value value_to_tt(Value v, int ply); @@ -147,7 +147,7 @@ namespace { void Search::init() { for (int i = 1; i < MAX_MOVES; ++i) - Reductions[i] = int((21.3 + 2 * std::log(Threads.size())) * std::log(i + 0.25 * std::log(i))); + Reductions[i] = int(21.9 * std::log(i)); } @@ -248,7 +248,7 @@ void Thread::search() { // To allow access to (ss-7) up to (ss+2), the stack must be oversized. 
// The former is needed to allow update_continuation_histories(ss-1, ...), // which accesses its argument at ss-6, also near the root. - // The latter is needed for statScores and killer initialization. + // The latter is needed for statScore and killer initialization. Stack stack[MAX_PLY+10], *ss = stack+7; Move pv[MAX_PLY+1]; Value bestValue, alpha, beta, delta; @@ -263,6 +263,9 @@ void Thread::search() { for (int i = 7; i > 0; i--) (ss-i)->continuationHistory = &this->continuationHistory[0][0][NO_PIECE][0]; // Use as a sentinel + for (int i = 0; i <= MAX_PLY + 2; ++i) + (ss+i)->ply = i; + ss->pv = pv; bestValue = delta = alpha = -VALUE_INFINITE; @@ -307,19 +310,7 @@ void Thread::search() { multiPV = std::min(multiPV, rootMoves.size()); ttHitAverage = TtHitAverageWindow * TtHitAverageResolution / 2; - int ct = int(Options["Contempt"]) * PawnValueEg / 100; // From centipawns - - // In analysis mode, adjust contempt in accordance with user preference - if (Limits.infinite || Options["UCI_AnalyseMode"]) - ct = Options["Analysis Contempt"] == "Off" ? 0 - : Options["Analysis Contempt"] == "Both" ? ct - : Options["Analysis Contempt"] == "White" && us == BLACK ? -ct - : Options["Analysis Contempt"] == "Black" && us == WHITE ? -ct - : ct; - - // Evaluation score is from the white point of view - contempt = (us == WHITE ? make_score(ct, ct / 2) - : -make_score(ct, ct / 2)); + trend = SCORE_ZERO; int searchAgainCounter = 0; @@ -365,11 +356,11 @@ void Thread::search() { alpha = std::max(prev - delta,-VALUE_INFINITE); beta = std::min(prev + delta, VALUE_INFINITE); - // Adjust contempt based on root move's previousScore (dynamic contempt) - int dct = ct + (113 - ct / 2) * prev / (abs(prev) + 147); + // Adjust trend based on root move's previousScore (dynamic contempt) + int tr = 113 * prev / (abs(prev) + 147); - contempt = (us == WHITE ? make_score(dct, dct / 2) - : -make_score(dct, dct / 2)); + trend = (us == WHITE ? 
make_score(tr, tr / 2) + : -make_score(tr, tr / 2)); } // Start with a small aspiration window and, in the case of a fail @@ -379,7 +370,7 @@ void Thread::search() { while (true) { Depth adjustedDepth = std::max(1, rootDepth - failedHighCnt - searchAgainCounter); - bestValue = Stockfish::search(rootPos, ss, alpha, beta, adjustedDepth, false); + bestValue = Stockfish::search(rootPos, ss, alpha, beta, adjustedDepth, false); // Bring the best move to the front. It is critical that sorting // is done with a stable algorithm because all the values but the @@ -475,8 +466,8 @@ void Thread::search() { totBestMoveChanges += th->bestMoveChanges; th->bestMoveChanges = 0; } - double bestMoveInstability = 1 + 2 * totBestMoveChanges / Threads.size(); - + double bestMoveInstability = 1.073 + std::max(1.0, 2.25 - 9.9 / rootDepth) + * totBestMoveChanges / Threads.size(); double totalTime = Time.optimum() * fallingEval * reduction * bestMoveInstability; // Cap used time in case of a single legal move for a better viewer experience in tournaments @@ -522,18 +513,18 @@ namespace { // search<>() is the main search function for both PV and non-PV nodes - template + template Value search(Position& pos, Stack* ss, Value alpha, Value beta, Depth depth, bool cutNode) { - constexpr bool PvNode = NT == PV; - const bool rootNode = PvNode && ss->ply == 0; + constexpr bool PvNode = nodeType != NonPV; + constexpr bool rootNode = nodeType == Root; const Depth maxNextDepth = rootNode ? depth : depth + 1; // Check if we have an upcoming move which draws by repetition, or // if the opponent had an alternative move earlier to this position. 
- if ( pos.rule50_count() >= 3 + if ( !rootNode + && pos.rule50_count() >= 3 && alpha < VALUE_DRAW - && !rootNode && pos.has_game_cycle(ss->ply)) { alpha = value_draw(pos.this_thread()); @@ -543,7 +534,7 @@ namespace { // Dive into quiescence search when the depth reaches zero if (depth <= 0) - return qsearch(pos, ss, alpha, beta); + return qsearch(pos, ss, alpha, beta); assert(-VALUE_INFINITE <= alpha && alpha < beta && beta <= VALUE_INFINITE); assert(PvNode || (alpha == beta - 1)); @@ -559,7 +550,7 @@ namespace { Move ttMove, move, excludedMove, bestMove; Depth extension, newDepth; Value bestValue, value, ttValue, eval, maxValue, probCutBeta; - bool formerPv, givesCheck, improving, didLMR, priorCapture; + bool givesCheck, improving, didLMR, priorCapture; bool captureOrPromotion, doFullDepthSearch, moveCountPruning, ttCapture, singularQuietLMR; Piece movedPiece; @@ -605,11 +596,11 @@ namespace { assert(0 <= ss->ply && ss->ply < MAX_PLY); - (ss+1)->ply = ss->ply + 1; - (ss+1)->ttPv = false; + (ss+1)->ttPv = false; (ss+1)->excludedMove = bestMove = MOVE_NONE; - (ss+2)->killers[0] = (ss+2)->killers[1] = MOVE_NONE; - Square prevSq = to_sq((ss-1)->currentMove); + (ss+2)->killers[0] = (ss+2)->killers[1] = MOVE_NONE; + ss->doubleExtensions = (ss-1)->doubleExtensions; + Square prevSq = to_sq((ss-1)->currentMove); // Initialize statScore to zero for the grandchildren of the current position. // So statScore is shared between all grandchildren and only the first grandchild @@ -630,7 +621,6 @@ namespace { : ss->ttHit ? 
tte->move() : MOVE_NONE; if (!excludedMove) ss->ttPv = PvNode || (ss->ttHit && tte->is_pv()); - formerPv = ss->ttPv && !PvNode; // Update low ply history for previous move if we are near root and position is or has been in PV if ( ss->ttPv @@ -768,6 +758,7 @@ namespace { ss->staticEval = eval = -(ss-1)->staticEval; // Save static evaluation into transposition table + if(!excludedMove) tte->save(posKey, VALUE_NONE, ss->ttPv, BOUND_NONE, DEPTH_NONE, MOVE_NONE, eval); } @@ -786,7 +777,8 @@ namespace { ? ss->staticEval > (ss-4)->staticEval || (ss-4)->staticEval == VALUE_NONE : ss->staticEval > (ss-2)->staticEval; - // Step 7. Futility pruning: child node (~50 Elo) + // Step 7. Futility pruning: child node (~50 Elo). + // The depth condition is important for mate finding. if ( !PvNode && depth < 9 && eval - futility_margin(depth, improving) >= beta @@ -915,7 +907,7 @@ namespace { && !ttMove) depth -= 2; -moves_loop: // When in check, search starts from here +moves_loop: // When in check, search starts here ttCapture = ttMove && pos.capture_or_promotion(ttMove); @@ -950,6 +942,7 @@ moves_loop: // When in check, search starts from here value = bestValue; singularQuietLMR = moveCountPruning = false; + bool doubleExtension = false; // Indicate PvNodes that will probably fail low if the node was searched // at a depth equal or greater than the current depth, and the result of this search was a fail low. @@ -998,7 +991,7 @@ moves_loop: // When in check, search starts from here // Calculate new depth for this move newDepth = depth - 1; - // Step 13. Pruning at shallow depth (~200 Elo) + // Step 13. Pruning at shallow depth (~200 Elo). Depth conditions are important for mate finding. if ( !rootNode && (PvNode ? 
prune_at_shallow_depth : true) && pos.non_pawn_material(us) @@ -1027,22 +1020,18 @@ moves_loop: // When in check, search starts from here { // Continuation history based pruning (~20 Elo) if ( lmrDepth < 5 - && (*contHist[0])[movedPiece][to_sq(move)] < CounterMovePruneThreshold - && (*contHist[1])[movedPiece][to_sq(move)] < CounterMovePruneThreshold) + && (*contHist[0])[movedPiece][to_sq(move)] < 23 - 23 * depth * depth + && (*contHist[1])[movedPiece][to_sq(move)] < 23 - 23 * depth * depth) continue; // Futility pruning: parent node (~5 Elo) - if ( lmrDepth < 7 - && !ss->inCheck - && ss->staticEval + 174 + 157 * lmrDepth <= alpha - && (*contHist[0])[movedPiece][to_sq(move)] - + (*contHist[1])[movedPiece][to_sq(move)] - + (*contHist[3])[movedPiece][to_sq(move)] - + (*contHist[5])[movedPiece][to_sq(move)] / 3 < 28255) + if ( !ss->inCheck + && lmrDepth < 7 + && ss->staticEval + 174 + 157 * lmrDepth <= alpha) continue; // Prune moves with negative SEE (~20 Elo) - if (!pos.see_ge(move, Value(-(30 - std::min(lmrDepth, 18)) * lmrDepth * lmrDepth))) + if (!pos.see_ge(move, Value(-21 * lmrDepth * lmrDepth - 21 * lmrDepth))) continue; } } @@ -1054,17 +1043,17 @@ moves_loop: // When in check, search starts from here // then that move is singular and should be extended. To verify this we do // a reduced search on all the other moves but the ttMove and if the // result is lower than ttValue minus a margin, then we will extend the ttMove. 
- if ( depth >= 7 + if ( !rootNode + && depth >= 7 && move == ttMove - && !rootNode && !excludedMove // Avoid recursive singular search /* && ttValue != VALUE_NONE Already implicit in the next condition */ && abs(ttValue) < VALUE_KNOWN_WIN && (tte->bound() & BOUND_LOWER) && tte->depth() >= depth - 3) { - Value singularBeta = ttValue - ((formerPv + 4) * depth) / 2; - Depth singularDepth = (depth - 1 + 3 * formerPv) / 2; + Value singularBeta = ttValue - 2 * depth; + Depth singularDepth = (depth - 1) / 2; ss->excludedMove = move; value = search(pos, ss, singularBeta - 1, singularBeta, singularDepth, cutNode); @@ -1074,8 +1063,15 @@ moves_loop: // When in check, search starts from here { extension = 1; singularQuietLMR = !ttCapture; - if (!PvNode && value < singularBeta - 93) + + // Avoid search explosion by limiting the number of double extensions to at most 3 + if ( !PvNode + && value < singularBeta - 93 + && ss->doubleExtensions < 3) + { extension = 2; + doubleExtension = true; + } } // Multi-cut pruning @@ -1098,9 +1094,14 @@ moves_loop: // When in check, search starts from here return beta; } } + else if ( givesCheck + && depth > 6 + && abs(ss->staticEval) > Value(100)) + extension = 1; // Add extension to new depth newDepth += extension; + ss->doubleExtensions = (ss-1)->doubleExtensions + (extension == 2); // Speculative prefetch as early as possible prefetch(TT.first_entry(pos.key_after(move))); @@ -1122,12 +1123,15 @@ moves_loop: // When in check, search starts from here if ( depth >= 3 && moveCount > 1 + 2 * rootNode && ( !captureOrPromotion - || cutNode - || (!PvNode && !formerPv)) + || (cutNode && (ss-1)->moveCount > 1) + || !ss->ttPv) && (!PvNode || ss->ply > 1 || thisThread->id() % 4 != 3)) { Depth r = reduction(improving, depth, moveCount); + if (PvNode) + r--; + // Decrease reduction if the ttHit running average is large (~0 Elo) if (thisThread->ttHitAverage > 537 * TtHitAverageResolution * TtHitAverageWindow / 1024) r--; @@ -1140,7 +1144,6 @@ 
moves_loop: // When in check, search starts from here // Increase reduction at root and non-PV nodes when the best move does not change frequently if ( (rootNode || !PvNode) - && thisThread->rootDepth > 10 && thisThread->bestMoveChanges <= 2) r++; @@ -1152,31 +1155,27 @@ moves_loop: // When in check, search starts from here if (singularQuietLMR) r--; - if (!captureOrPromotion) - { - // Increase reduction if ttMove is a capture (~3 Elo) - if (ttCapture) - r++; + // Increase reduction for cut nodes (~3 Elo) + if (cutNode && move != ss->killers[0]) + r += 2; - // Increase reduction for cut nodes (~3 Elo) - if (cutNode) - r += 2; + // Increase reduction if ttMove is a capture (~3 Elo) + if (ttCapture) + r++; - ss->statScore = thisThread->mainHistory[us][from_to(move)] - + (*contHist[0])[movedPiece][to_sq(move)] - + (*contHist[1])[movedPiece][to_sq(move)] - + (*contHist[3])[movedPiece][to_sq(move)] - - 4923; + ss->statScore = thisThread->mainHistory[us][from_to(move)] + + (*contHist[0])[movedPiece][to_sq(move)] + + (*contHist[1])[movedPiece][to_sq(move)] + + (*contHist[3])[movedPiece][to_sq(move)] + - 4923; - // Decrease/increase reduction for moves with a good/bad history (~30 Elo) - if (!ss->inCheck) - r -= ss->statScore / 14721; - } + // Decrease/increase reduction for moves with a good/bad history (~30 Elo) + r -= ss->statScore / 14721; // In general we want to cap the LMR depth search at newDepth. But if // reductions are really negative and movecount is low, we allow this move - // to be searched deeper than the first move. - Depth d = std::clamp(newDepth - r, 1, newDepth + (r < -1 && moveCount <= 5)); + // to be searched deeper than the first move, unless ttMove was extended by 2. 
+ Depth d = std::clamp(newDepth - r, 1, newDepth + (r < -1 && moveCount <= 5 && !doubleExtension)); value = -search(pos, ss+1, -(alpha+1), -alpha, d, true); @@ -1274,7 +1273,6 @@ moves_loop: // When in check, search starts from here else { assert(value >= beta); // Fail high - ss->statScore = 0; break; } } @@ -1348,10 +1346,11 @@ moves_loop: // When in check, search starts from here // qsearch() is the quiescence search function, which is called by the main search // function with zero depth, or recursively with further decreasing depth per call. - template + template Value qsearch(Position& pos, Stack* ss, Value alpha, Value beta, Depth depth) { - constexpr bool PvNode = NT == PV; + static_assert(nodeType != Root); + constexpr bool PvNode = nodeType == PV; assert(alpha >= -VALUE_INFINITE && alpha < beta && beta <= VALUE_INFINITE); assert(PvNode || (alpha == beta - 1)); @@ -1377,7 +1376,6 @@ moves_loop: // When in check, search starts from here } Thread* thisThread = pos.this_thread(); - (ss+1)->ply = ss->ply + 1; bestMove = MOVE_NONE; ss->inCheck = pos.checkers(); moveCount = 0; @@ -1458,7 +1456,7 @@ moves_loop: // When in check, search starts from here // Initialize a MovePicker object for the current position, and prepare // to search the moves. Because the depth is <= 0 here, only captures, - // queen and checking knight promotions, and other checks(only if depth >= DEPTH_QS_CHECKS) + // queen promotions, and other checks (only if depth >= DEPTH_QS_CHECKS) // will be generated. 
MovePicker mp(pos, ttMove, depth, &thisThread->mainHistory, &thisThread->captureHistory, @@ -1470,6 +1468,10 @@ moves_loop: // When in check, search starts from here { assert(is_ok(move)); + // Check for legality + if (!pos.legal(move)) + continue; + givesCheck = pos.gives_check(move); captureOrPromotion = pos.capture_or_promotion(move); @@ -1508,13 +1510,6 @@ moves_loop: // When in check, search starts from here // Speculative prefetch as early as possible prefetch(TT.first_entry(pos.key_after(move))); - // Check for legality just before making the move - if (!pos.legal(move)) - { - moveCount--; - continue; - } - ss->currentMove = move; ss->continuationHistory = &thisThread->continuationHistory[ss->inCheck] [captureOrPromotion] @@ -1530,7 +1525,7 @@ moves_loop: // When in check, search starts from here // Make and search the move pos.do_move(move, st, givesCheck); - value = -qsearch(pos, ss+1, -beta, -alpha, depth - 1); + value = -qsearch(pos, ss+1, -beta, -alpha, depth - 1); pos.undo_move(move); assert(value > -VALUE_INFINITE && value < VALUE_INFINITE); @@ -1977,20 +1972,8 @@ namespace Search // th->clear(); - int ct = int(Options["Contempt"]) * PawnValueEg / 100; // From centipawns - Color us = pos.side_to_move(); - - // In analysis mode, adjust contempt in accordance with user preference - if (Limits.infinite || Options["UCI_AnalyseMode"]) - ct = Options["Analysis Contempt"] == "Off" ? 0 - : Options["Analysis Contempt"] == "Both" ? ct - : Options["Analysis Contempt"] == "White" && us == BLACK ? -ct - : Options["Analysis Contempt"] == "Black" && us == WHITE ? -ct - : ct; - // Evaluation score is from the white point of view - th->contempt = (us == WHITE ? 
make_score(ct, ct / 2) - : -make_score(ct, ct / 2)); + th->trend = make_score(0, 0); for (int i = 7; i > 0; i--) (ss - i)->continuationHistory = &th->continuationHistory[0][0][NO_PIECE][0]; // Use as a sentinel @@ -2153,7 +2136,7 @@ namespace Search while (true) { Depth adjustedDepth = std::max(1, rootDepth); - bestValue = Stockfish::search(pos, ss, alpha, beta, adjustedDepth, false); + bestValue = Stockfish::search(pos, ss, alpha, beta, adjustedDepth, false); stable_sort(rootMoves.begin() + pvIdx, rootMoves.end()); //my_stable_sort(pos.this_thread()->thread_id(),&rootMoves[0] + pvIdx, rootMoves.size() - pvIdx); @@ -3093,20 +3076,8 @@ namespace Search for (int i = 1; i <= MAX_PLY; ++i) (stack + i)->ply = i; - int ct = int(Options["Contempt"]) * PawnValueEg / 100; // From centipawns - Color us = pos.side_to_move(); - - // In analysis mode, adjust contempt in accordance with user preference - if (Limits.infinite || Options["UCI_AnalyseMode"]) - ct = Options["Analysis Contempt"] == "Off" ? 0 - : Options["Analysis Contempt"] == "Both" ? ct - : Options["Analysis Contempt"] == "White" && us == BLACK ? -ct - : Options["Analysis Contempt"] == "Black" && us == WHITE ? -ct - : ct; - // Evaluation score is from the white point of view - th->contempt = (us == WHITE ? 
make_score(ct, ct / 2) - : -make_score(ct, ct / 2)); + th->trend = make_score(0, 0); create_new_root(pos); diff --git a/src/search.h b/src/search.h index 609f19a7..36bcb18b 100644 --- a/src/search.h +++ b/src/search.h @@ -54,6 +54,7 @@ struct Stack { bool inCheck; bool ttPv; bool ttHit; + int doubleExtensions; }; diff --git a/src/syzygy/tbprobe.cpp b/src/syzygy/tbprobe.cpp index 57c9204b..f382edbc 100644 --- a/src/syzygy/tbprobe.cpp +++ b/src/syzygy/tbprobe.cpp @@ -105,9 +105,6 @@ template<> inline void swap_endian(uint8_t&) {} template T number(void* addr) { - static const union { uint32_t i; char c[4]; } Le = { 0x01020304 }; - static const bool IsLittleEndian = (Le.c[0] == 4); - T v; if ((uintptr_t)addr & (alignof(T) - 1)) // Unaligned pointer (very rare) @@ -1539,6 +1536,14 @@ bool Tablebases::root_probe(Position& pos, Search::RootMoves& rootMoves) { WDLScore wdl = -probe_wdl(pos, &result); dtz = dtz_before_zeroing(wdl); } + else if (pos.is_draw(1)) + { + // In case a root move leads to a draw by repetition or + // 50-move rule, we set dtz to zero. Note: since we are + // only 1 ply from the root, this must be a true 3-fold + // repetition inside the game history. 
+ dtz = 0; + } else { // Otherwise, take dtz for the new position and correct by 1 ply @@ -1589,6 +1594,7 @@ bool Tablebases::root_probe_wdl(Position& pos, Search::RootMoves& rootMoves) { ProbeState result; StateInfo st; + WDLScore wdl; bool rule50 = Options["Syzygy50MoveRule"]; @@ -1597,7 +1603,10 @@ bool Tablebases::root_probe_wdl(Position& pos, Search::RootMoves& rootMoves) { { pos.do_move(m.pv[0], st); - WDLScore wdl = -probe_wdl(pos, &result); + if (pos.is_draw(1)) + wdl = WDLDraw; + else + wdl = -probe_wdl(pos, &result); pos.undo_move(m.pv[0]); diff --git a/src/thread.h b/src/thread.h index 0989f4ba..c0218577 100644 --- a/src/thread.h +++ b/src/thread.h @@ -102,7 +102,7 @@ public: LowPlyHistory lowPlyHistory; CapturePieceToHistory captureHistory; ContinuationHistory continuationHistory[2][2]; - Score contempt; + Score trend; int failedHighCnt; bool rootInTB; int Cardinality; diff --git a/src/tools/sfen_packer.cpp b/src/tools/sfen_packer.cpp index 8182503c..7a6fb979 100644 --- a/src/tools/sfen_packer.cpp +++ b/src/tools/sfen_packer.cpp @@ -260,8 +260,8 @@ namespace Stockfish::Tools { pos.clear(); std::memset(si, 0, sizeof(StateInfo)); - si->accumulator.state[WHITE] = Eval::NNUE::INIT; - si->accumulator.state[BLACK] = Eval::NNUE::INIT; + si->accumulator.computed[WHITE] = false; + si->accumulator.computed[BLACK] = false; pos.st = si; // Active color diff --git a/src/tools/training_data_generator.cpp b/src/tools/training_data_generator.cpp index 45781dbb..0c4f8d82 100644 --- a/src/tools/training_data_generator.cpp +++ b/src/tools/training_data_generator.cpp @@ -812,10 +812,8 @@ namespace Stockfish::Tools is >> params.seed; else if (token == "set_recommended_uci_options") { - UCI::setoption("Contempt", "0"); UCI::setoption("Skill Level", "20"); UCI::setoption("UCI_Chess960", "false"); - UCI::setoption("UCI_AnalyseMode", "false"); UCI::setoption("UCI_LimitStrength", "false"); UCI::setoption("PruneAtShallowDepth", "false"); UCI::setoption("EnableTranspositionTable", 
"true"); diff --git a/src/tools/training_data_generator_nonpv.cpp b/src/tools/training_data_generator_nonpv.cpp index 278259c6..04bab4a2 100644 --- a/src/tools/training_data_generator_nonpv.cpp +++ b/src/tools/training_data_generator_nonpv.cpp @@ -434,10 +434,8 @@ namespace Stockfish::Tools params.smart_fen_skipping = true; else if (token == "set_recommended_uci_options") { - UCI::setoption("Contempt", "0"); UCI::setoption("Skill Level", "20"); UCI::setoption("UCI_Chess960", "false"); - UCI::setoption("UCI_AnalyseMode", "false"); UCI::setoption("UCI_LimitStrength", "false"); UCI::setoption("PruneAtShallowDepth", "false"); UCI::setoption("EnableTranspositionTable", "true"); diff --git a/src/uci.cpp b/src/uci.cpp index 887c8a21..b1d385d0 100644 --- a/src/uci.cpp +++ b/src/uci.cpp @@ -200,35 +200,39 @@ namespace { } // namespace -void UCI::setoption(const std::string& name, const std::string& value) -{ - if (Options.count(name)) - Options[name] = value; - else - sync_cout << "No such option: " << name << sync_endl; -} +namespace UCI { -// The win rate model returns the probability (per mille) of winning given an eval -// and a game-ply. The model fits rather accurately the LTC fishtest statistics. -int win_rate_model(Value v, int ply) { + void setoption(const std::string& name, const std::string& value) + { + if (Options.count(name)) + Options[name] = value; + else + sync_cout << "No such option: " << name << sync_endl; + } - // The model captures only up to 240 plies, so limit input (and rescale) - double m = std::min(240, ply) / 64.0; + // The win rate model returns the probability (per mille) of winning given an eval + // and a game-ply. The model fits rather accurately the LTC fishtest statistics. + int win_rate_model(Value v, int ply) { - // Coefficients of a 3rd order polynomial fit based on fishtest data - // for two parameters needed to transform eval to the argument of a - // logistic function. 
- double as[] = {-8.24404295, 64.23892342, -95.73056462, 153.86478679}; - double bs[] = {-3.37154371, 28.44489198, -56.67657741, 72.05858751}; - double a = (((as[0] * m + as[1]) * m + as[2]) * m) + as[3]; - double b = (((bs[0] * m + bs[1]) * m + bs[2]) * m) + bs[3]; + // The model captures only up to 240 plies, so limit input (and rescale) + double m = std::min(240, ply) / 64.0; - // Transform eval to centipawns with limited range - double x = std::clamp(double(100 * v) / PawnValueEg, -1000.0, 1000.0); + // Coefficients of a 3rd order polynomial fit based on fishtest data + // for two parameters needed to transform eval to the argument of a + // logistic function. + double as[] = {-3.68389304, 30.07065921, -60.52878723, 149.53378557}; + double bs[] = {-2.0181857, 15.85685038, -29.83452023, 47.59078827}; + double a = (((as[0] * m + as[1]) * m + as[2]) * m) + as[3]; + double b = (((bs[0] * m + bs[1]) * m + bs[2]) * m) + bs[3]; - // Return win rate in per mille (rounded to nearest) - return int(0.5 + 1000 / (1 + std::exp((a - x) / b))); -} + // Transform eval to centipawns with limited range + double x = std::clamp(double(100 * v) / PawnValueEg, -1000.0, 1000.0); + + // Return win rate in per mille (rounded to nearest) + return int(0.5 + 1000 / (1 + std::exp((a - x) / b))); + } + +} // namespace // -------------------- // Call qsearch(),search() directly for testing @@ -348,16 +352,16 @@ void UCI::loop(int argc, char* argv[]) { else if (token == "d") sync_cout << pos << sync_endl; else if (token == "eval") trace_eval(pos); else if (token == "compiler") sync_cout << compiler_info() << sync_endl; - else if (token == "export_net") { + else if (token == "export_net") + { std::optional filename; std::string f; - if (is >> skipws >> f) { - filename = f; - } - Eval::NNUE::export_net(filename); + if (is >> skipws >> f) + filename = f; + Eval::NNUE::save_eval(filename); } else if (token == "generate_training_data") Tools::generate_training_data(is); - else if (token == 
"generate_training_data") Tools::generate_training_data_nonpv(is); + else if (token == "generate_training_data_nonpv") Tools::generate_training_data_nonpv(is); else if (token == "convert") Tools::convert(is); else if (token == "validate_training_data") Tools::validate_training_data(is); else if (token == "convert_bin") Tools::convert_bin(is); diff --git a/src/ucioption.cpp b/src/ucioption.cpp index c42c38c9..5af78ec4 100644 --- a/src/ucioption.cpp +++ b/src/ucioption.cpp @@ -68,8 +68,6 @@ void init(OptionsMap& o) { constexpr int MaxHashMB = Is64Bit ? 33554432 : 2048; o["Debug Log File"] << Option("", on_logger); - o["Contempt"] << Option(24, -100, 100); - o["Analysis Contempt"] << Option("Both var Off var White var Black var Both", "Both"); o["Threads"] << Option(1, 1, 512, on_threads); o["Hash"] << Option(16, 1, MaxHashMB, on_hash_size); o["Clear Hash"] << Option(on_clear_hash); diff --git a/tests/instrumented.sh b/tests/instrumented.sh index 518d1087..545fb1c0 100755 --- a/tests/instrumented.sh +++ b/tests/instrumented.sh @@ -13,7 +13,7 @@ case $1 in --valgrind) echo "valgrind testing started" prefix='' - exeprefix='valgrind --error-exitcode=42' + exeprefix='valgrind --error-exitcode=42 --errors-for-leak-kinds=all --leak-check=full' postfix='1>/dev/null' threads="1" bench_depth=5 @@ -110,7 +110,7 @@ cat << EOF > game.exp expect "bestmove" send "position fen 5rk1/1K4p1/8/8/3B4/8/8/8 b - - 0 1\n" - send "go depth $go_depth\n" + send "go depth 10\n" expect "bestmove" send "quit\n" @@ -192,7 +192,7 @@ cat << EOF > data_generation02.exp exit \$value EOF -for exp in game.exp data_generation01.exe data_generation02.exp +for exp in game.exp data_generation01.exp data_generation02.exp do echo "$prefix expect $exp $postfix"