mirror of
https://github.com/opelly27/Stockfish.git
synced 2026-05-20 08:37:44 +00:00
Optimize FT activation and affine transform for NEON.
This patch optimizes the NEON implementation in two ways.
The activation layer after the feature transformer is rewritten to make it easier for the compiler to see through dependencies and unroll. This in itself is a minimal but positive improvement. Other architectures could benefit from this too in the future. This is not an algorithmic change.
The affine transform for large matrices (first layer after FT) on NEON now utilizes the same optimized code path as >=SSSE3, which makes the memory accesses more sequential and makes better use of the available registers, which allows for code that has longer dependency chains.
Benchmarks from Redshift#161, profile-build with apple clang
george@Georges-MacBook-Air nets % ./stockfish-b82d93 bench 2>&1 | tail -4 (current master)
===========================
Total time (ms) : 2167
Nodes searched : 4667742
Nodes/second : 2154011
george@Georges-MacBook-Air nets % ./stockfish-7377b8 bench 2>&1 | tail -4 (this patch)
===========================
Total time (ms) : 1842
Nodes searched : 4667742
Nodes/second : 2534061
This is a solid 18% improvement overall, larger in a bench with NNUE-only, not mixed.
An improvement of around 5% is also observed on armv7-neon (Raspberry Pi and older phones).
No changes for architectures other than NEON.
closes https://github.com/official-stockfish/Stockfish/pull/3837
No functional changes.
This commit is contained in:
committed by
Joost VandeVondele
parent
b82d93ece4
commit
4766dfc395
+39
@@ -343,6 +343,45 @@ namespace Stockfish::Simd {
|
||||
|
||||
#endif
|
||||
|
||||
#if defined (USE_NEON)
|
||||
|
||||
// Horizontal reduction: returns the sum of the four 32-bit lanes of `s`
// as a scalar int.
[[maybe_unused]] static int neon_m128_reduce_add_epi32(int32x4_t s) {
#if USE_NEON >= 8
    // With USE_NEON at 8 or above a single across-vector add intrinsic is used.
    const int total = vaddvq_s32(s);
#else
    // Otherwise the lanes are read individually and added as scalars.
    const int total = s[0] + s[1] + s[2] + s[3];
#endif
    return total;
}
|
||||
|
||||
// Returns the sum of all four lanes of `sum`, plus the scalar `bias`.
[[maybe_unused]] static int neon_m128_hadd(int32x4_t sum, int bias) {
    const int lanes_total = neon_m128_reduce_add_epi32(sum);
    return lanes_total + bias;
}
|
||||
|
||||
// Reduces each of the four input vectors to a single scalar, packs those four
// scalars into one vector (lane i holds the total of sum<i>), and returns that
// vector with `bias` added lane-wise.
[[maybe_unused]] static int32x4_t neon_m128_haddx4(
    int32x4_t sum0, int32x4_t sum1, int32x4_t sum2, int32x4_t sum3,
    int32x4_t bias) {

    // One horizontal total per input vector, computed in argument order.
    const int total0 = neon_m128_reduce_add_epi32(sum0);
    const int total1 = neon_m128_reduce_add_epi32(sum1);
    const int total2 = neon_m128_reduce_add_epi32(sum2);
    const int total3 = neon_m128_reduce_add_epi32(sum3);

    const int32x4_t hsums { total0, total1, total2, total3 };

    return vaddq_s32(hsums, bias);
}
|
||||
|
||||
// Accumulates two 8-lane int8 products into the 32-bit accumulator `acc`:
// a0*b0 and a1*b1 are multiplied lane-wise into 16-bit lanes and summed there,
// then adjacent 16-bit pairs are added and accumulated into the four 32-bit
// lanes of `acc` (updated in place through the reference).
// NOTE(review): the two products are summed in 16-bit lanes before widening;
// presumably callers keep operand ranges small enough that this cannot
// overflow - confirm against the call sites.
[[maybe_unused]] static void neon_m128_add_dpbusd_epi32x2(
    int32x4_t& acc,
    int8x8_t a0, int8x8_t b0,
    int8x8_t a1, int8x8_t b1) {

    // Widening multiply of the first operand pair.
    const int16x8_t first_product = vmull_s8(a0, b0);

    // Widening multiply-accumulate of the second pair on top of the first.
    const int16x8_t both_products = vmlal_s8(first_product, a1, b1);

    // Pairwise add of 16-bit lanes, accumulated into the 32-bit lanes of acc.
    acc = vpadalq_s16(acc, both_products);
}
|
||||
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
#endif // STOCKFISH_SIMD_H_INCLUDED
|
||||
|
||||
Reference in New Issue
Block a user