On Mon, Jul 26, 2021 at 10:20 AM Noah Goldstein wrote:
>
> On Mon, Jul 26, 2021 at 8:02 AM H.J. Lu via Libc-alpha wrote:
>>
>> commit 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5
>> Author: H.J. Lu
>> Date:   Sat Jan 25 14:19:40 2020 -0800
>>
>>     x86-64: Avoid rep movsb with short distance [BZ #27130]
>>
>> introduced some regressions on Intel processors without Fast Short REP
>> MOV (FSRM).  Add Avoid_Short_Distance_REP_MOVSB to avoid rep movsb with
>> short distance only on Intel processors with FSRM.  bench-memmove-large
>> on Skylake server shows that cycles of __memmove_evex_unaligned_erms are
>> improved for the following data sizes:
>>
>>                                     before    after   Improvement
>> length=4127, align1=3, align2=0:    479.38    343.00      28%
>> length=4223, align1=9, align2=5:    405.62    335.50      17%
>> length=8223, align1=3, align2=0:    786.12    495.00      37%
>> length=8319, align1=9, align2=5:    256.69    170.38      33%
>> length=16415, align1=3, align2=0:  1436.88    839.50      41%
>> length=16511, align1=9, align2=5:  1375.50    840.62      39%
>> length=32799, align1=3, align2=0:  2890.00   1850.62      36%
>> length=32895, align1=9, align2=5:  2891.38   1948.62      32%
>>
>> There are no regressions on Ice Lake server.
>
> On Tiger Lake I see some strange results for the random tests:
>
> "ifuncs": ["__memcpy_avx_unaligned", "__memcpy_avx_unaligned_erms",
> "__memcpy_evex_unaligned", "__memcpy_evex_unaligned_erms",
> "__memcpy_ssse3_back", "__memcpy_ssse3", "__memcpy_avx512_no_vzeroupper",
> "__memcpy_avx512_unaligned", "__memcpy_avx512_unaligned_erms",
> "__memcpy_sse2_unaligned", "__memcpy_sse2_unaligned_erms", "__memcpy_erms"],
>
> Without the patch:
> "length": 4096,
> "timings": [117793, 118814, 95009.2, 140061, 209016, 162007, 112210,
> 113011, 139953, 106604, 106483, 116845]
>
> With the patch:
> "length": 4096,
> "timings": [136386, 95256.7, 134947, 102466, 182687, 163942, 110546,
> 127766, 98344.5, 107647, 109190, 118613]
>
> It seems like some of the erms versions are heavily pessimized while the
> non-erms versions benefit significantly.  I think it has to do with the
> change in alignment of L(less_vec), though I am not certain.

I also saw it on Tiger Lake.  Please try this patch on top of my patch.

> Are you seeing the same performance changes on Skylake/Icelake server?

I will check it out.
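To make the intent of the assembly change easier to review, here is a rough,
self-contained C sketch of the decision the memmove-vec-unaligned-erms.S hunk
encodes.  Only __x86_string_control and
X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB come from the patch;
use_rep_movsb and main are purely illustrative, and the real code performs
the same test in assembly via the andl/jz and cmpl/jbe sequences added below.

/* Sketch only, not glibc code.  Shows when the forward-copy path would
   still use "rep movsb" once Avoid_Short_Distance_REP_MOVSB is in effect.  */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB (1 << 0)

/* Set at startup when Avoid_Short_Distance_REP_MOVSB is selected,
   i.e. on Intel processors with FSRM.  */
static int __x86_string_control;

static bool
use_rep_movsb (const void *dst, const void *src)
{
  if (!(__x86_string_control
        & X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB))
    return true;   /* Processors without FSRM keep the old behavior.  */

  /* The assembly compares only ECX, the low 32 bits of dst - src, so a
     distance of N*4GB + [1..63] with N >= 0 takes the vector path.  */
  uint32_t distance = (uint32_t) ((uintptr_t) dst - (uintptr_t) src);
  return distance > 63;
}

int
main (void)
{
  static char buf[8192];
  __x86_string_control |= X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB;
  printf ("distance 16:  rep movsb = %d\n", use_rep_movsb (buf + 16, buf));
  printf ("distance 128: rep movsb = %d\n", use_rep_movsb (buf + 128, buf));
  return 0;
}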
>> ---
>>  sysdeps/x86/cacheinfo.h                                     | 7 +++++++
>>  sysdeps/x86/cpu-features.c                                  | 5 +++++
>>  .../x86/include/cpu-features-preferred_feature_index_1.def  | 1 +
>>  sysdeps/x86/sysdep.h                                        | 3 +++
>>  sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S       | 5 +++++
>>  5 files changed, 21 insertions(+)
>>
>> diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
>> index eba8dbc4a6..174ea38f5b 100644
>> --- a/sysdeps/x86/cacheinfo.h
>> +++ b/sysdeps/x86/cacheinfo.h
>> @@ -49,6 +49,9 @@ long int __x86_rep_stosb_threshold attribute_hidden = 2048;
>>  /* Threshold to stop using Enhanced REP MOVSB.  */
>>  long int __x86_rep_movsb_stop_threshold attribute_hidden;
>>
>> +/* String/memory function control.  */
>> +int __x86_string_control attribute_hidden;
>> +
>>  static void
>>  init_cacheinfo (void)
>>  {
>> @@ -71,5 +74,9 @@ init_cacheinfo (void)
>>    __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
>>    __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
>>    __x86_rep_movsb_stop_threshold = cpu_features->rep_movsb_stop_threshold;
>> +
>> +  if (CPU_FEATURES_ARCH_P (cpu_features, Avoid_Short_Distance_REP_MOVSB))
>> +    __x86_string_control
>> +      |= X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB;
>>  }
>>  #endif
>> diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
>> index 706a172ba9..645bba6314 100644
>> --- a/sysdeps/x86/cpu-features.c
>> +++ b/sysdeps/x86/cpu-features.c
>> @@ -555,6 +555,11 @@ init_cpu_features (struct cpu_features *cpu_features)
>>  	  cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
>>  	    |= bit_arch_Prefer_AVX2_STRCMP;
>>  	}
>> +
>> +      /* Avoid short distance REP MOVSB on processors with FSRM.  */
>> +      if (CPU_FEATURES_CPU_P (cpu_features, FSRM))
>> +	cpu_features->preferred[index_arch_Avoid_Short_Distance_REP_MOVSB]
>> +	  |= bit_arch_Avoid_Short_Distance_REP_MOVSB;
>>      }
>>    /* This spells out "AuthenticAMD" or "HygonGenuine".  */
>>    else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
>> diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
>> index 133aab19f1..d7c93f00c5 100644
>> --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
>> +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
>> @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
>>  BIT (MathVec_Prefer_No_AVX512)
>>  BIT (Prefer_FSRM)
>>  BIT (Prefer_AVX2_STRCMP)
>> +BIT (Avoid_Short_Distance_REP_MOVSB)
>> diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
>> index 51c069bfe1..35cb90d507 100644
>> --- a/sysdeps/x86/sysdep.h
>> +++ b/sysdeps/x86/sysdep.h
>> @@ -57,6 +57,9 @@ enum cf_protection_level
>>  #define STATE_SAVE_MASK \
>>    ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))
>>
>> +/* Avoid short distance REP MOVSB.  */
>> +#define X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB (1 << 0)
>> +
>>  #ifdef __ASSEMBLER__
>>
>>  /* Syntactic details of assembler.  */
>> diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
>> index a783da5de2..9f02624375 100644
>> --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
>> +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
>> @@ -325,12 +325,16 @@ L(movsb):
>>  	/* Avoid slow backward REP MOVSB.  */
>>  	jb	L(more_8x_vec_backward)
>>  # if AVOID_SHORT_DISTANCE_REP_MOVSB
>> +	andl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
>> +	jz	3f
>>  	movq	%rdi, %rcx
>>  	subq	%rsi, %rcx
>>  	jmp	2f
>>  # endif
>>  1:
>>  # if AVOID_SHORT_DISTANCE_REP_MOVSB
>> +	andl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
>> +	jz	3f
>>  	movq	%rsi, %rcx
>>  	subq	%rdi, %rcx
>>  2:
>> @@ -338,6 +342,7 @@ L(movsb):
>>  	   is N*4GB + [1..63] with N >= 0.  */
>>  	cmpl	$63, %ecx
>>  	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
>> +3:
>>  # endif
>>  	mov	%RDX_LP, %RCX_LP
>>  	rep movsb
>> --
>> 2.31.1
>>

--
H.J.