unofficial mirror of libc-alpha@sourceware.org
* [PATCH] x86-64: Add Avoid_Short_Distance_REP_MOVSB
@ 2021-07-26 12:00 H.J. Lu via Libc-alpha
  2021-07-26 17:20 ` Noah Goldstein via Libc-alpha
                   ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: H.J. Lu via Libc-alpha @ 2021-07-26 12:00 UTC (permalink / raw)
  To: libc-alpha

commit 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sat Jan 25 14:19:40 2020 -0800

    x86-64: Avoid rep movsb with short distance [BZ #27130]

introduced some regressions on Intel processors without Fast Short REP
MOV (FSRM).  Add Avoid_Short_Distance_REP_MOVSB to avoid rep movsb with
short distance only on Intel processors with FSRM.  bench-memmove-large
on Skylake server shows that cycles of __memmove_evex_unaligned_erms are
improved for the following data sizes:

                                  before    after    Improvement
length=4127, align1=3, align2=0:  479.38    343.00      28%
length=4223, align1=9, align2=5:  405.62    335.50      17%
length=8223, align1=3, align2=0:  786.12    495.00      37%
length=8319, align1=9, align2=5:  256.69    170.38      33%
length=16415, align1=3, align2=0: 1436.88   839.50      41%
length=16511, align1=9, align2=5: 1375.50   840.62      39%
length=32799, align1=3, align2=0: 2890.00   1850.62     36%
length=32895, align1=9, align2=5: 2891.38   1948.62     32%
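
(The Improvement column is (before - after) / before, e.g. for the first row
(479.38 - 343.00) / 479.38 is about 28%.)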

There are no regressions on Ice Lake server.
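
In rough C terms, the runtime check that this bit enables can be sketched as
follows (an illustrative sketch only, with a made-up helper name; the actual
logic is the assembly change to memmove-vec-unaligned-erms.S below):

extern int __x86_string_control;
#define X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB  (1 << 0)

/* DIST is the distance between destination and source in the direction of
   the copy.  Return 1 if "rep movsb" should be used, 0 if the vector loop
   should be used instead.  */
int
use_rep_movsb (unsigned long int dist)
{
  if ((__x86_string_control
       & X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB)
      /* Only the low 32 bits are compared, so a distance of N*4GB plus
         at most 63 bytes counts as "short".  */
      && (unsigned int) dist <= 63)
    return 0;
  return 1;
}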
---
 sysdeps/x86/cacheinfo.h                                    | 7 +++++++
 sysdeps/x86/cpu-features.c                                 | 5 +++++
 .../x86/include/cpu-features-preferred_feature_index_1.def | 1 +
 sysdeps/x86/sysdep.h                                       | 3 +++
 sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S      | 5 +++++
 5 files changed, 21 insertions(+)

diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
index eba8dbc4a6..174ea38f5b 100644
--- a/sysdeps/x86/cacheinfo.h
+++ b/sysdeps/x86/cacheinfo.h
@@ -49,6 +49,9 @@ long int __x86_rep_stosb_threshold attribute_hidden = 2048;
 /* Threshold to stop using Enhanced REP MOVSB.  */
 long int __x86_rep_movsb_stop_threshold attribute_hidden;
 
+/* String/memory function control.  */
+int __x86_string_control attribute_hidden;
+
 static void
 init_cacheinfo (void)
 {
@@ -71,5 +74,9 @@ init_cacheinfo (void)
   __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
   __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
   __x86_rep_movsb_stop_threshold =  cpu_features->rep_movsb_stop_threshold;
+
+  if (CPU_FEATURES_ARCH_P (cpu_features, Avoid_Short_Distance_REP_MOVSB))
+    __x86_string_control
+      |= X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB;
 }
 #endif
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 706a172ba9..645bba6314 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -555,6 +555,11 @@ init_cpu_features (struct cpu_features *cpu_features)
 	    cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
 	      |= bit_arch_Prefer_AVX2_STRCMP;
 	}
+
+      /* Avoid avoid short distance REP MOVSB on processor with FSRM.  */
+      if (CPU_FEATURES_CPU_P (cpu_features, FSRM))
+	cpu_features->preferred[index_arch_Avoid_Short_Distance_REP_MOVSB]
+	  |= bit_arch_Avoid_Short_Distance_REP_MOVSB;
     }
   /* This spells out "AuthenticAMD" or "HygonGenuine".  */
   else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
index 133aab19f1..d7c93f00c5 100644
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
@@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
 BIT (MathVec_Prefer_No_AVX512)
 BIT (Prefer_FSRM)
 BIT (Prefer_AVX2_STRCMP)
+BIT (Avoid_Short_Distance_REP_MOVSB)
diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
index 51c069bfe1..35cb90d507 100644
--- a/sysdeps/x86/sysdep.h
+++ b/sysdeps/x86/sysdep.h
@@ -57,6 +57,9 @@ enum cf_protection_level
 #define STATE_SAVE_MASK \
   ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))
 
+/* Avoid short distance REP MOVSB.  */
+#define X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB	(1 << 0)
+
 #ifdef	__ASSEMBLER__
 
 /* Syntactic details of assembler.  */
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index a783da5de2..9f02624375 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -325,12 +325,16 @@ L(movsb):
 	/* Avoid slow backward REP MOVSB.  */
 	jb	L(more_8x_vec_backward)
 # if AVOID_SHORT_DISTANCE_REP_MOVSB
+	andl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+	jz	3f
 	movq	%rdi, %rcx
 	subq	%rsi, %rcx
 	jmp	2f
 # endif
 1:
 # if AVOID_SHORT_DISTANCE_REP_MOVSB
+	andl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+	jz	3f
 	movq	%rsi, %rcx
 	subq	%rdi, %rcx
 2:
@@ -338,6 +342,7 @@ L(movsb):
    is N*4GB + [1..63] with N >= 0.  */
 	cmpl	$63, %ecx
 	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
+3:
 # endif
 	mov	%RDX_LP, %RCX_LP
 	rep movsb
-- 
2.31.1



* Re: [PATCH] x86-64: Add Avoid_Short_Distance_REP_MOVSB
  2021-07-26 12:00 [PATCH] x86-64: Add Avoid_Short_Distance_REP_MOVSB H.J. Lu via Libc-alpha
@ 2021-07-26 17:20 ` Noah Goldstein via Libc-alpha
  2021-07-26 18:50   ` H.J. Lu via Libc-alpha
  2021-07-27  2:15 ` Carlos O'Donell via Libc-alpha
  2021-08-28  0:27 ` [PATCH] " Alexey Tourbin via Libc-alpha
  2 siblings, 1 reply; 15+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2021-07-26 17:20 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library

On Mon, Jul 26, 2021 at 8:02 AM H.J. Lu via Libc-alpha <
libc-alpha@sourceware.org> wrote:

> commit 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5
> Author: H.J. Lu <hjl.tools@gmail.com>
> Date:   Sat Jan 25 14:19:40 2020 -0800
>
>     x86-64: Avoid rep movsb with short distance [BZ #27130]
>
> introduced some regressions on Intel processors without Fast Short REP
> MOV (FSRM).  Add Avoid_Short_Distance_REP_MOVSB to avoid rep movsb with
> short distance only on Intel processors with FSRM.  bench-memmove-large
> on Skylake server shows that cycles of __memmove_evex_unaligned_erms are
> improved for the following data size:
>
>                                   before    after    Improvement
> length=4127, align1=3, align2=0:  479.38    343.00      28%
> length=4223, align1=9, align2=5:  405.62    335.50      17%
> length=8223, align1=3, align2=0:  786.12    495.00      37%
> length=8319, align1=9, align2=5:  256.69    170.38      33%
> length=16415, align1=3, align2=0: 1436.88   839.50      41%
> length=16511, align1=9, align2=5: 1375.50   840.62      39%
> length=32799, align1=3, align2=0: 2890.00   1850.62     36%
> length=32895, align1=9, align2=5: 2891.38   1948.62     32%
>
> There are no regression on Ice Lake server.
>

On Tigerlake I see some strange results for the random tests:

"ifuncs": ["__memcpy_avx_unaligned", "__memcpy_avx_unaligned_erms",
"__memcpy_evex_unaligned", "__memcpy_evex_unaligned_erms",
"__memcpy_ssse3_back", "__memcpy_ssse3", "__memcpy_avx512_no_vzeroupper",
"__memcpy_avx512_unaligned", "__memcpy_avx512_unaligned_erms",
"__memcpy_sse2_unaligned", "__memcpy_sse2_unaligned_erms", "__memcpy_erms"],

Without the Patch
"length": 4096,
"timings": [117793, 118814, 95009.2, 140061, 209016, 162007, 112210,
113011, 139953, 106604, 106483, 116845]

With the patch
"length": 4096,
"timings": [136386, 95256.7, 134947, 102466, 182687, 163942, 110546,
127766, 98344.5, 107647, 109190, 118613]


It seems like some of the erms versions are heavily pessimized while the
non-erms versions benefit significantly. I think it has to do with the
change in alignment of L(less_vec), though I am not certain.

Are you seeing the same performance changes on Skylake/Icelake server?


> ---
>  sysdeps/x86/cacheinfo.h                                    | 7 +++++++
>  sysdeps/x86/cpu-features.c                                 | 5 +++++
>  .../x86/include/cpu-features-preferred_feature_index_1.def | 1 +
>  sysdeps/x86/sysdep.h                                       | 3 +++
>  sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S      | 5 +++++
>  5 files changed, 21 insertions(+)
>
> diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
> index eba8dbc4a6..174ea38f5b 100644
> --- a/sysdeps/x86/cacheinfo.h
> +++ b/sysdeps/x86/cacheinfo.h
> @@ -49,6 +49,9 @@ long int __x86_rep_stosb_threshold attribute_hidden =
> 2048;
>  /* Threshold to stop using Enhanced REP MOVSB.  */
>  long int __x86_rep_movsb_stop_threshold attribute_hidden;
>
> +/* String/memory function control.  */
> +int __x86_string_control attribute_hidden;
> +
>  static void
>  init_cacheinfo (void)
>  {
> @@ -71,5 +74,9 @@ init_cacheinfo (void)
>    __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
>    __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
>    __x86_rep_movsb_stop_threshold =
> cpu_features->rep_movsb_stop_threshold;
> +
> +  if (CPU_FEATURES_ARCH_P (cpu_features, Avoid_Short_Distance_REP_MOVSB))
> +    __x86_string_control
> +      |= X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB;
>  }
>  #endif
> diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> index 706a172ba9..645bba6314 100644
> --- a/sysdeps/x86/cpu-features.c
> +++ b/sysdeps/x86/cpu-features.c
> @@ -555,6 +555,11 @@ init_cpu_features (struct cpu_features *cpu_features)
>             cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
>               |= bit_arch_Prefer_AVX2_STRCMP;
>         }
> +
> +      /* Avoid avoid short distance REP MOVSB on processor with FSRM.  */
> +      if (CPU_FEATURES_CPU_P (cpu_features, FSRM))
> +       cpu_features->preferred[index_arch_Avoid_Short_Distance_REP_MOVSB]
> +         |= bit_arch_Avoid_Short_Distance_REP_MOVSB;
>      }
>    /* This spells out "AuthenticAMD" or "HygonGenuine".  */
>    else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
> diff --git
> a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> index 133aab19f1..d7c93f00c5 100644
> --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
>  BIT (MathVec_Prefer_No_AVX512)
>  BIT (Prefer_FSRM)
>  BIT (Prefer_AVX2_STRCMP)
> +BIT (Avoid_Short_Distance_REP_MOVSB)
> diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
> index 51c069bfe1..35cb90d507 100644
> --- a/sysdeps/x86/sysdep.h
> +++ b/sysdeps/x86/sysdep.h
> @@ -57,6 +57,9 @@ enum cf_protection_level
>  #define STATE_SAVE_MASK \
>    ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))
>
> +/* Avoid short distance REP MOVSB.  */
> +#define X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB      (1 << 0)
> +
>  #ifdef __ASSEMBLER__
>
>  /* Syntactic details of assembler.  */
> diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> index a783da5de2..9f02624375 100644
> --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> @@ -325,12 +325,16 @@ L(movsb):
>         /* Avoid slow backward REP MOVSB.  */
>         jb      L(more_8x_vec_backward)
>  # if AVOID_SHORT_DISTANCE_REP_MOVSB
> +       andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB,
> __x86_string_control(%rip)
> +       jz      3f
>         movq    %rdi, %rcx
>         subq    %rsi, %rcx
>         jmp     2f
>  # endif
>  1:
>  # if AVOID_SHORT_DISTANCE_REP_MOVSB
> +       andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB,
> __x86_string_control(%rip)
> +       jz      3f
>         movq    %rsi, %rcx
>         subq    %rdi, %rcx
>  2:
> @@ -338,6 +342,7 @@ L(movsb):
>     is N*4GB + [1..63] with N >= 0.  */
>         cmpl    $63, %ecx
>         jbe     L(more_2x_vec)  /* Avoid "rep movsb" if ECX <= 63.  */
> +3:
>  # endif
>         mov     %RDX_LP, %RCX_LP
>         rep movsb
> --
> 2.31.1
>
>


* Re: [PATCH] x86-64: Add Avoid_Short_Distance_REP_MOVSB
  2021-07-26 17:20 ` Noah Goldstein via Libc-alpha
@ 2021-07-26 18:50   ` H.J. Lu via Libc-alpha
  0 siblings, 0 replies; 15+ messages in thread
From: H.J. Lu via Libc-alpha @ 2021-07-26 18:50 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library

[-- Attachment #1: Type: text/plain, Size: 7004 bytes --]

On Mon, Jul 26, 2021 at 10:20 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
>
>
> On Mon, Jul 26, 2021 at 8:02 AM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote:
>>
>> commit 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5
>> Author: H.J. Lu <hjl.tools@gmail.com>
>> Date:   Sat Jan 25 14:19:40 2020 -0800
>>
>>     x86-64: Avoid rep movsb with short distance [BZ #27130]
>>
>> introduced some regressions on Intel processors without Fast Short REP
>> MOV (FSRM).  Add Avoid_Short_Distance_REP_MOVSB to avoid rep movsb with
>> short distance only on Intel processors with FSRM.  bench-memmove-large
>> on Skylake server shows that cycles of __memmove_evex_unaligned_erms are
>> improved for the following data size:
>>
>>                                   before    after    Improvement
>> length=4127, align1=3, align2=0:  479.38    343.00      28%
>> length=4223, align1=9, align2=5:  405.62    335.50      17%
>> length=8223, align1=3, align2=0:  786.12    495.00      37%
>> length=8319, align1=9, align2=5:  256.69    170.38      33%
>> length=16415, align1=3, align2=0: 1436.88   839.50      41%
>> length=16511, align1=9, align2=5: 1375.50   840.62      39%
>> length=32799, align1=3, align2=0: 2890.00   1850.62     36%
>> length=32895, align1=9, align2=5: 2891.38   1948.62     32%
>>
>> There are no regression on Ice Lake server.
>
>
> On Tigerlake I see some strange results for the random tests:
>
> "ifuncs": ["__memcpy_avx_unaligned", "__memcpy_avx_unaligned_erms", "__memcpy_evex_unaligned", "__memcpy_evex_unaligned_erms", "__memcpy_ssse3_back", "__memcpy_ssse3", "__memcpy_avx512_no_vzeroupper", "__memcpy_avx512_unaligned", "__memcpy_avx512_unaligned_erms", "__memcpy_sse2_unaligned", "__memcpy_sse2_unaligned_erms", "__memcpy_erms"],
>
> Without the Patch
> "length": 4096,
> "timings": [117793, 118814, 95009.2, 140061, 209016, 162007, 112210, 113011, 139953, 106604, 106483, 116845]
>
> With the patch
> "length": 4096,
> "timings": [136386, 95256.7, 134947, 102466, 182687, 163942, 110546, 127766, 98344.5, 107647, 109190, 118613]
>
>
> It seems like some of the erms versions are heavily pessimized while the non-erms versions are significantly
> benefited. I think it has to do with the change in alignment of L(less_vec) though I am not certain.

I also saw it on Tiger Lake.  Please try this patch on top of my patch.

> Are you seeing the same performance changes on Skylake/Icelake server?

I will check it out.

>>
>> ---
>>  sysdeps/x86/cacheinfo.h                                    | 7 +++++++
>>  sysdeps/x86/cpu-features.c                                 | 5 +++++
>>  .../x86/include/cpu-features-preferred_feature_index_1.def | 1 +
>>  sysdeps/x86/sysdep.h                                       | 3 +++
>>  sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S      | 5 +++++
>>  5 files changed, 21 insertions(+)
>>
>> diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
>> index eba8dbc4a6..174ea38f5b 100644
>> --- a/sysdeps/x86/cacheinfo.h
>> +++ b/sysdeps/x86/cacheinfo.h
>> @@ -49,6 +49,9 @@ long int __x86_rep_stosb_threshold attribute_hidden = 2048;
>>  /* Threshold to stop using Enhanced REP MOVSB.  */
>>  long int __x86_rep_movsb_stop_threshold attribute_hidden;
>>
>> +/* String/memory function control.  */
>> +int __x86_string_control attribute_hidden;
>> +
>>  static void
>>  init_cacheinfo (void)
>>  {
>> @@ -71,5 +74,9 @@ init_cacheinfo (void)
>>    __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
>>    __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
>>    __x86_rep_movsb_stop_threshold =  cpu_features->rep_movsb_stop_threshold;
>> +
>> +  if (CPU_FEATURES_ARCH_P (cpu_features, Avoid_Short_Distance_REP_MOVSB))
>> +    __x86_string_control
>> +      |= X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB;
>>  }
>>  #endif
>> diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
>> index 706a172ba9..645bba6314 100644
>> --- a/sysdeps/x86/cpu-features.c
>> +++ b/sysdeps/x86/cpu-features.c
>> @@ -555,6 +555,11 @@ init_cpu_features (struct cpu_features *cpu_features)
>>             cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
>>               |= bit_arch_Prefer_AVX2_STRCMP;
>>         }
>> +
>> +      /* Avoid avoid short distance REP MOVSB on processor with FSRM.  */
>> +      if (CPU_FEATURES_CPU_P (cpu_features, FSRM))
>> +       cpu_features->preferred[index_arch_Avoid_Short_Distance_REP_MOVSB]
>> +         |= bit_arch_Avoid_Short_Distance_REP_MOVSB;
>>      }
>>    /* This spells out "AuthenticAMD" or "HygonGenuine".  */
>>    else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
>> diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
>> index 133aab19f1..d7c93f00c5 100644
>> --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
>> +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
>> @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
>>  BIT (MathVec_Prefer_No_AVX512)
>>  BIT (Prefer_FSRM)
>>  BIT (Prefer_AVX2_STRCMP)
>> +BIT (Avoid_Short_Distance_REP_MOVSB)
>> diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
>> index 51c069bfe1..35cb90d507 100644
>> --- a/sysdeps/x86/sysdep.h
>> +++ b/sysdeps/x86/sysdep.h
>> @@ -57,6 +57,9 @@ enum cf_protection_level
>>  #define STATE_SAVE_MASK \
>>    ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))
>>
>> +/* Avoid short distance REP MOVSB.  */
>> +#define X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB      (1 << 0)
>> +
>>  #ifdef __ASSEMBLER__
>>
>>  /* Syntactic details of assembler.  */
>> diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
>> index a783da5de2..9f02624375 100644
>> --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
>> +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
>> @@ -325,12 +325,16 @@ L(movsb):
>>         /* Avoid slow backward REP MOVSB.  */
>>         jb      L(more_8x_vec_backward)
>>  # if AVOID_SHORT_DISTANCE_REP_MOVSB
>> +       andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
>> +       jz      3f
>>         movq    %rdi, %rcx
>>         subq    %rsi, %rcx
>>         jmp     2f
>>  # endif
>>  1:
>>  # if AVOID_SHORT_DISTANCE_REP_MOVSB
>> +       andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
>> +       jz      3f
>>         movq    %rsi, %rcx
>>         subq    %rdi, %rcx
>>  2:
>> @@ -338,6 +342,7 @@ L(movsb):
>>     is N*4GB + [1..63] with N >= 0.  */
>>         cmpl    $63, %ecx
>>         jbe     L(more_2x_vec)  /* Avoid "rep movsb" if ECX <= 63.  */
>> +3:
>>  # endif
>>         mov     %RDX_LP, %RCX_LP
>>         rep movsb
>> --
>> 2.31.1
>>


-- 
H.J.

[-- Attachment #2: p.diff --]
[-- Type: text/x-patch, Size: 722 bytes --]

diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 9f02624375..5e882b0cde 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -330,6 +330,12 @@ L(movsb):
 	movq	%rdi, %rcx
 	subq	%rsi, %rcx
 	jmp	2f
+	/* data16 cs nopw 0x0(%rax,%rax,1) */
+	.byte 0x66, 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
+	/* data16 cs nopw 0x0(%rax,%rax,1) */
+	.byte 0x66, 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
+	/* nopw 0x0(%rax,%rax,1) */
+	.byte 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
 # endif
 1:
 # if AVOID_SHORT_DISTANCE_REP_MOVSB


* Re: [PATCH] x86-64: Add Avoid_Short_Distance_REP_MOVSB
  2021-07-26 12:00 [PATCH] x86-64: Add Avoid_Short_Distance_REP_MOVSB H.J. Lu via Libc-alpha
  2021-07-26 17:20 ` Noah Goldstein via Libc-alpha
@ 2021-07-27  2:15 ` Carlos O'Donell via Libc-alpha
  2021-07-27  3:11   ` H.J. Lu via Libc-alpha
  2021-08-28  0:27 ` [PATCH] " Alexey Tourbin via Libc-alpha
  2 siblings, 1 reply; 15+ messages in thread
From: Carlos O'Donell via Libc-alpha @ 2021-07-27  2:15 UTC (permalink / raw)
  To: H.J. Lu, libc-alpha

On 7/26/21 8:00 AM, H.J. Lu via Libc-alpha wrote:
> commit 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5
> Author: H.J. Lu <hjl.tools@gmail.com>
> Date:   Sat Jan 25 14:19:40 2020 -0800
> 
>     x86-64: Avoid rep movsb with short distance [BZ #27130]
> introduced some regressions on Intel processors without Fast Short REP
> MOV (FSRM).  Add Avoid_Short_Distance_REP_MOVSB to avoid rep movsb with
> short distance only on Intel processors with FSRM.  bench-memmove-large
> on Skylake server shows that cycles of __memmove_evex_unaligned_erms are
> improved for the following data size:
> 
>                                   before    after    Improvement
> length=4127, align1=3, align2=0:  479.38    343.00      28%
> length=4223, align1=9, align2=5:  405.62    335.50      17%
> length=8223, align1=3, align2=0:  786.12    495.00      37%
> length=8319, align1=9, align2=5:  256.69    170.38      33%
> length=16415, align1=3, align2=0: 1436.88   839.50      41%
> length=16511, align1=9, align2=5: 1375.50   840.62      39%
> length=32799, align1=3, align2=0: 2890.00   1850.62     36%
> length=32895, align1=9, align2=5: 2891.38   1948.62     32%
> 
> There are no regression on Ice Lake server.

At this point we're waiting on Noah to provide feedback on the performance
results given the alignment nop insertion you provided as a follow-up patch
(unless you can confirm this yourself).

Looking forward to a v2 that incorporates the alignment fix (pending Noah's
comments), and my suggestions below.

> ---
>  sysdeps/x86/cacheinfo.h                                    | 7 +++++++
>  sysdeps/x86/cpu-features.c                                 | 5 +++++
>  .../x86/include/cpu-features-preferred_feature_index_1.def | 1 +
>  sysdeps/x86/sysdep.h                                       | 3 +++
>  sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S      | 5 +++++
>  5 files changed, 21 insertions(+)
> 
> diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
> index eba8dbc4a6..174ea38f5b 100644
> --- a/sysdeps/x86/cacheinfo.h
> +++ b/sysdeps/x86/cacheinfo.h
> @@ -49,6 +49,9 @@ long int __x86_rep_stosb_threshold attribute_hidden = 2048;
>  /* Threshold to stop using Enhanced REP MOVSB.  */
>  long int __x86_rep_movsb_stop_threshold attribute_hidden;
>  
> +/* String/memory function control.  */
> +int __x86_string_control attribute_hidden;

Please expand comment.

Suggest:

/* A bit-wise OR of string/memory requirements for optimal performance
   e.g. X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB.  These bits
   are used at runtime to tune implementation behavior.  */
int __x86_string_control attribute_hidden;

> +
>  static void
>  init_cacheinfo (void)
>  {
> @@ -71,5 +74,9 @@ init_cacheinfo (void)
>    __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
>    __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
>    __x86_rep_movsb_stop_threshold =  cpu_features->rep_movsb_stop_threshold;
> +
> +  if (CPU_FEATURES_ARCH_P (cpu_features, Avoid_Short_Distance_REP_MOVSB))
> +    __x86_string_control
> +      |= X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB;

OK.

>  }
>  #endif
> diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> index 706a172ba9..645bba6314 100644
> --- a/sysdeps/x86/cpu-features.c
> +++ b/sysdeps/x86/cpu-features.c
> @@ -555,6 +555,11 @@ init_cpu_features (struct cpu_features *cpu_features)
>  	    cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
>  	      |= bit_arch_Prefer_AVX2_STRCMP;
>  	}
> +
> +      /* Avoid avoid short distance REP MOVSB on processor with FSRM.  */
> +      if (CPU_FEATURES_CPU_P (cpu_features, FSRM))
> +	cpu_features->preferred[index_arch_Avoid_Short_Distance_REP_MOVSB]
> +	  |= bit_arch_Avoid_Short_Distance_REP_MOVSB;

OK.

>      }
>    /* This spells out "AuthenticAMD" or "HygonGenuine".  */
>    else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
> diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> index 133aab19f1..d7c93f00c5 100644
> --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
>  BIT (MathVec_Prefer_No_AVX512)
>  BIT (Prefer_FSRM)
>  BIT (Prefer_AVX2_STRCMP)
> +BIT (Avoid_Short_Distance_REP_MOVSB)

OK.

> diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
> index 51c069bfe1..35cb90d507 100644
> --- a/sysdeps/x86/sysdep.h
> +++ b/sysdeps/x86/sysdep.h
> @@ -57,6 +57,9 @@ enum cf_protection_level
>  #define STATE_SAVE_MASK \
>    ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))
>  

Suggest adding:

/* Constants for bits in __x86_string_control:  */

> +/* Avoid short distance REP MOVSB.  */
> +#define X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB	(1 << 0)

OK.

> +
>  #ifdef	__ASSEMBLER__
>  
>  /* Syntactic details of assembler.  */
> diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> index a783da5de2..9f02624375 100644
> --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> @@ -325,12 +325,16 @@ L(movsb):
>  	/* Avoid slow backward REP MOVSB.  */
>  	jb	L(more_8x_vec_backward)
>  # if AVOID_SHORT_DISTANCE_REP_MOVSB
> +	andl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> +	jz	3f

OK.

>  	movq	%rdi, %rcx
>  	subq	%rsi, %rcx
>  	jmp	2f
>  # endif
>  1:
>  # if AVOID_SHORT_DISTANCE_REP_MOVSB
> +	andl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> +	jz	3f

OK.

>  	movq	%rsi, %rcx
>  	subq	%rdi, %rcx
>  2:
> @@ -338,6 +342,7 @@ L(movsb):
>     is N*4GB + [1..63] with N >= 0.  */
>  	cmpl	$63, %ecx
>  	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
> +3:

OK.

>  # endif
>  	mov	%RDX_LP, %RCX_LP
>  	rep movsb
> 


-- 
Cheers,
Carlos.



* Re: [PATCH] x86-64: Add Avoid_Short_Distance_REP_MOVSB
  2021-07-27  2:15 ` Carlos O'Donell via Libc-alpha
@ 2021-07-27  3:11   ` H.J. Lu via Libc-alpha
  2021-07-27  4:05     ` Noah Goldstein via Libc-alpha
  0 siblings, 1 reply; 15+ messages in thread
From: H.J. Lu via Libc-alpha @ 2021-07-27  3:11 UTC (permalink / raw)
  To: Carlos O'Donell; +Cc: GNU C Library

On Mon, Jul 26, 2021 at 7:15 PM Carlos O'Donell <carlos@redhat.com> wrote:
>
> On 7/26/21 8:00 AM, H.J. Lu via Libc-alpha wrote:
> > commit 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5
> > Author: H.J. Lu <hjl.tools@gmail.com>
> > Date:   Sat Jan 25 14:19:40 2020 -0800
> >
> >     x86-64: Avoid rep movsb with short distance [BZ #27130]
> > introduced some regressions on Intel processors without Fast Short REP
> > MOV (FSRM).  Add Avoid_Short_Distance_REP_MOVSB to avoid rep movsb with
> > short distance only on Intel processors with FSRM.  bench-memmove-large
> > on Skylake server shows that cycles of __memmove_evex_unaligned_erms are
> > improved for the following data size:
> >
> >                                   before    after    Improvement
> > length=4127, align1=3, align2=0:  479.38    343.00      28%
> > length=4223, align1=9, align2=5:  405.62    335.50      17%
> > length=8223, align1=3, align2=0:  786.12    495.00      37%
> > length=8319, align1=9, align2=5:  256.69    170.38      33%
> > length=16415, align1=3, align2=0: 1436.88   839.50      41%
> > length=16511, align1=9, align2=5: 1375.50   840.62      39%
> > length=32799, align1=3, align2=0: 2890.00   1850.62     36%
> > length=32895, align1=9, align2=5: 2891.38   1948.62     32%
> >
> > There are no regression on Ice Lake server.
>
> At this point we're waiting on Noah to provide feedback on the performance
> results given the alignment nop insertion you provided as a follow-up patch

We are testing 25 byte nop padding now:

https://gitlab.com/x86-glibc/glibc/-/commit/de8985640a568786a59576716db54e0749d420e8

> (unless you can confirm this yourself).
>
> Looking forward to a v2 the incorporates the alignment fix (pending Noah's
> comments), and my suggestions below.
>
> > ---
> >  sysdeps/x86/cacheinfo.h                                    | 7 +++++++
> >  sysdeps/x86/cpu-features.c                                 | 5 +++++
> >  .../x86/include/cpu-features-preferred_feature_index_1.def | 1 +
> >  sysdeps/x86/sysdep.h                                       | 3 +++
> >  sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S      | 5 +++++
> >  5 files changed, 21 insertions(+)
> >
> > diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
> > index eba8dbc4a6..174ea38f5b 100644
> > --- a/sysdeps/x86/cacheinfo.h
> > +++ b/sysdeps/x86/cacheinfo.h
> > @@ -49,6 +49,9 @@ long int __x86_rep_stosb_threshold attribute_hidden = 2048;
> >  /* Threshold to stop using Enhanced REP MOVSB.  */
> >  long int __x86_rep_movsb_stop_threshold attribute_hidden;
> >
> > +/* String/memory function control.  */
> > +int __x86_string_control attribute_hidden;
>
> Please expand comment.
>
> Suggest:
>
> /* A bit-wise OR of string/memory requirements for optimal performance
>    e.g. X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB.  These bits
>    are used at runtime to tune implementation behavior.  */
> int __x86_string_control attribute_hidden;

I will fix it in the v2 patch.

Thanks.

> > +
> >  static void
> >  init_cacheinfo (void)
> >  {
> > @@ -71,5 +74,9 @@ init_cacheinfo (void)
> >    __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
> >    __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
> >    __x86_rep_movsb_stop_threshold =  cpu_features->rep_movsb_stop_threshold;
> > +
> > +  if (CPU_FEATURES_ARCH_P (cpu_features, Avoid_Short_Distance_REP_MOVSB))
> > +    __x86_string_control
> > +      |= X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB;
>
> OK.
>
> >  }
> >  #endif
> > diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> > index 706a172ba9..645bba6314 100644
> > --- a/sysdeps/x86/cpu-features.c
> > +++ b/sysdeps/x86/cpu-features.c
> > @@ -555,6 +555,11 @@ init_cpu_features (struct cpu_features *cpu_features)
> >           cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
> >             |= bit_arch_Prefer_AVX2_STRCMP;
> >       }
> > +
> > +      /* Avoid avoid short distance REP MOVSB on processor with FSRM.  */
> > +      if (CPU_FEATURES_CPU_P (cpu_features, FSRM))
> > +     cpu_features->preferred[index_arch_Avoid_Short_Distance_REP_MOVSB]
> > +       |= bit_arch_Avoid_Short_Distance_REP_MOVSB;
>
> OK.
>
> >      }
> >    /* This spells out "AuthenticAMD" or "HygonGenuine".  */
> >    else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
> > diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> > index 133aab19f1..d7c93f00c5 100644
> > --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> > +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> > @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
> >  BIT (MathVec_Prefer_No_AVX512)
> >  BIT (Prefer_FSRM)
> >  BIT (Prefer_AVX2_STRCMP)
> > +BIT (Avoid_Short_Distance_REP_MOVSB)
>
> OK.
>
> > diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
> > index 51c069bfe1..35cb90d507 100644
> > --- a/sysdeps/x86/sysdep.h
> > +++ b/sysdeps/x86/sysdep.h
> > @@ -57,6 +57,9 @@ enum cf_protection_level
> >  #define STATE_SAVE_MASK \
> >    ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))
> >
>
> Suggest adding:
>
> /* Constants for bits in __x86_string_control:  */
>
> > +/* Avoid short distance REP MOVSB.  */
> > +#define X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB    (1 << 0)
>
> OK.
>
> > +
> >  #ifdef       __ASSEMBLER__
> >
> >  /* Syntactic details of assembler.  */
> > diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > index a783da5de2..9f02624375 100644
> > --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > @@ -325,12 +325,16 @@ L(movsb):
> >       /* Avoid slow backward REP MOVSB.  */
> >       jb      L(more_8x_vec_backward)
> >  # if AVOID_SHORT_DISTANCE_REP_MOVSB
> > +     andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> > +     jz      3f
>
> OK.
>
> >       movq    %rdi, %rcx
> >       subq    %rsi, %rcx
> >       jmp     2f
> >  # endif
> >  1:
> >  # if AVOID_SHORT_DISTANCE_REP_MOVSB
> > +     andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> > +     jz      3f
>
> OK.
>
> >       movq    %rsi, %rcx
> >       subq    %rdi, %rcx
> >  2:
> > @@ -338,6 +342,7 @@ L(movsb):
> >     is N*4GB + [1..63] with N >= 0.  */
> >       cmpl    $63, %ecx
> >       jbe     L(more_2x_vec)  /* Avoid "rep movsb" if ECX <= 63.  */
> > +3:
>
> OK.
>
> >  # endif
> >       mov     %RDX_LP, %RCX_LP
> >       rep movsb
> >
>
>
> --
> Cheers,
> Carlos.
>


-- 
H.J.


* Re: [PATCH] x86-64: Add Avoid_Short_Distance_REP_MOVSB
  2021-07-27  3:11   ` H.J. Lu via Libc-alpha
@ 2021-07-27  4:05     ` Noah Goldstein via Libc-alpha
  2021-07-27 16:05       ` [PATCH v2] " H.J. Lu via Libc-alpha
  0 siblings, 1 reply; 15+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2021-07-27  4:05 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library

On Mon, Jul 26, 2021 at 11:11 PM H.J. Lu via Libc-alpha <
libc-alpha@sourceware.org> wrote:

> On Mon, Jul 26, 2021 at 7:15 PM Carlos O'Donell <carlos@redhat.com> wrote:
> >
> > On 7/26/21 8:00 AM, H.J. Lu via Libc-alpha wrote:
> > > commit 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5
> > > Author: H.J. Lu <hjl.tools@gmail.com>
> > > Date:   Sat Jan 25 14:19:40 2020 -0800
> > >
> > >     x86-64: Avoid rep movsb with short distance [BZ #27130]
> > > introduced some regressions on Intel processors without Fast Short REP
> > > MOV (FSRM).  Add Avoid_Short_Distance_REP_MOVSB to avoid rep movsb with
> > > short distance only on Intel processors with FSRM.  bench-memmove-large
> > > on Skylake server shows that cycles of __memmove_evex_unaligned_erms
> are
> > > improved for the following data size:
> > >
> > >                                   before    after    Improvement
> > > length=4127, align1=3, align2=0:  479.38    343.00      28%
> > > length=4223, align1=9, align2=5:  405.62    335.50      17%
> > > length=8223, align1=3, align2=0:  786.12    495.00      37%
> > > length=8319, align1=9, align2=5:  256.69    170.38      33%
> > > length=16415, align1=3, align2=0: 1436.88   839.50      41%
> > > length=16511, align1=9, align2=5: 1375.50   840.62      39%
> > > length=32799, align1=3, align2=0: 2890.00   1850.62     36%
> > > length=32895, align1=9, align2=5: 2891.38   1948.62     32%
> > >
> > > There are no regression on Ice Lake server.
> >
> > At this point we're waiting on Noah to provide feedback on the
> performance
> > results given the alignment nop insertion you provided as a follow-up
> patch
>

The results with the padding look good!


>
> We are testing 25 byte nop padding now:


>
> https://gitlab.com/x86-glibc/glibc/-/commit/de8985640a568786a59576716db54e0749d420e8
>
> How did you come to the exact padding choice used?


> > (unless you can confirm this yourself).
> >
> > Looking forward to a v2 the incorporates the alignment fix (pending
> Noah's
> > comments), and my suggestions below.

>
> > > ---
> > >  sysdeps/x86/cacheinfo.h                                    | 7 +++++++
> > >  sysdeps/x86/cpu-features.c                                 | 5 +++++
> > >  .../x86/include/cpu-features-preferred_feature_index_1.def | 1 +
> > >  sysdeps/x86/sysdep.h                                       | 3 +++
> > >  sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S      | 5 +++++
> > >  5 files changed, 21 insertions(+)
> > >
> > > diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
> > > index eba8dbc4a6..174ea38f5b 100644
> > > --- a/sysdeps/x86/cacheinfo.h
> > > +++ b/sysdeps/x86/cacheinfo.h
> > > @@ -49,6 +49,9 @@ long int __x86_rep_stosb_threshold attribute_hidden
> = 2048;
> > >  /* Threshold to stop using Enhanced REP MOVSB.  */
> > >  long int __x86_rep_movsb_stop_threshold attribute_hidden;
> > >
> > > +/* String/memory function control.  */
> > > +int __x86_string_control attribute_hidden;
> >
> > Please expand comment.
> >
> > Suggest:
> >
> > /* A bit-wise OR of string/memory requirements for optimal performance
> >    e.g. X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB.  These bits
> >    are used at runtime to tune implementation behavior.  */
> > int __x86_string_control attribute_hidden;
>
> I will fix it in the v2 patch.
>
> Thanks.
>
> > > +
> > >  static void
> > >  init_cacheinfo (void)
> > >  {
> > > @@ -71,5 +74,9 @@ init_cacheinfo (void)
> > >    __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
> > >    __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
> > >    __x86_rep_movsb_stop_threshold =
> cpu_features->rep_movsb_stop_threshold;
> > > +
> > > +  if (CPU_FEATURES_ARCH_P (cpu_features,
> Avoid_Short_Distance_REP_MOVSB))
> > > +    __x86_string_control
> > > +      |= X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB;
> >
> > OK.
> >
> > >  }
> > >  #endif
> > > diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> > > index 706a172ba9..645bba6314 100644
> > > --- a/sysdeps/x86/cpu-features.c
> > > +++ b/sysdeps/x86/cpu-features.c
> > > @@ -555,6 +555,11 @@ init_cpu_features (struct cpu_features
> *cpu_features)
> > >           cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
> > >             |= bit_arch_Prefer_AVX2_STRCMP;
> > >       }
> > > +
> > > +      /* Avoid avoid short distance REP MOVSB on processor with
> FSRM.  */
> > > +      if (CPU_FEATURES_CPU_P (cpu_features, FSRM))
> > > +
>  cpu_features->preferred[index_arch_Avoid_Short_Distance_REP_MOVSB]
> > > +       |= bit_arch_Avoid_Short_Distance_REP_MOVSB;
> >
> > OK.
> >
> > >      }
> > >    /* This spells out "AuthenticAMD" or "HygonGenuine".  */
> > >    else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx ==
> 0x69746e65)
> > > diff --git
> a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> > > index 133aab19f1..d7c93f00c5 100644
> > > --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> > > +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> > > @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
> > >  BIT (MathVec_Prefer_No_AVX512)
> > >  BIT (Prefer_FSRM)
> > >  BIT (Prefer_AVX2_STRCMP)
> > > +BIT (Avoid_Short_Distance_REP_MOVSB)
> >
> > OK.
> >
> > > diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
> > > index 51c069bfe1..35cb90d507 100644
> > > --- a/sysdeps/x86/sysdep.h
> > > +++ b/sysdeps/x86/sysdep.h
> > > @@ -57,6 +57,9 @@ enum cf_protection_level
> > >  #define STATE_SAVE_MASK \
> > >    ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))
> > >
> >
> > Suggest adding:
> >
> > /* Constants for bits in __x86_string_control:  */
> >
> > > +/* Avoid short distance REP MOVSB.  */
> > > +#define X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB    (1 << 0)
> >
> > OK.
> >
> > > +
> > >  #ifdef       __ASSEMBLER__
> > >
> > >  /* Syntactic details of assembler.  */
> > > diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > > index a783da5de2..9f02624375 100644
> > > --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > > +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > > @@ -325,12 +325,16 @@ L(movsb):
> > >       /* Avoid slow backward REP MOVSB.  */
> > >       jb      L(more_8x_vec_backward)
> > >  # if AVOID_SHORT_DISTANCE_REP_MOVSB
> > > +     andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB,
> __x86_string_control(%rip)
> > > +     jz      3f
> >
> > OK.
> >
> > >       movq    %rdi, %rcx
> > >       subq    %rsi, %rcx
> > >       jmp     2f
> > >  # endif
> > >  1:
> > >  # if AVOID_SHORT_DISTANCE_REP_MOVSB
> > > +     andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB,
> __x86_string_control(%rip)
> > > +     jz      3f
> >
> > OK.
> >
> > >       movq    %rsi, %rcx
> > >       subq    %rdi, %rcx
> > >  2:
> > > @@ -338,6 +342,7 @@ L(movsb):
> > >     is N*4GB + [1..63] with N >= 0.  */
> > >       cmpl    $63, %ecx
> > >       jbe     L(more_2x_vec)  /* Avoid "rep movsb" if ECX <= 63.  */
> > > +3:
> >
> > OK.
> >
> > >  # endif
> > >       mov     %RDX_LP, %RCX_LP
> > >       rep movsb
> > >
> >
> >
> > --
> > Cheers,
> > Carlos.
> >
>
>
> --
> H.J.
>


* [PATCH v2] x86-64: Add Avoid_Short_Distance_REP_MOVSB
  2021-07-27  4:05     ` Noah Goldstein via Libc-alpha
@ 2021-07-27 16:05       ` H.J. Lu via Libc-alpha
  2021-07-27 19:12         ` Noah Goldstein via Libc-alpha
  0 siblings, 1 reply; 15+ messages in thread
From: H.J. Lu via Libc-alpha @ 2021-07-27 16:05 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library

[-- Attachment #1: Type: text/plain, Size: 8633 bytes --]

On Mon, Jul 26, 2021 at 9:06 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
>
>
> On Mon, Jul 26, 2021 at 11:11 PM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote:
>>
>> On Mon, Jul 26, 2021 at 7:15 PM Carlos O'Donell <carlos@redhat.com> wrote:
>> >
>> > On 7/26/21 8:00 AM, H.J. Lu via Libc-alpha wrote:
>> > > commit 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5
>> > > Author: H.J. Lu <hjl.tools@gmail.com>
>> > > Date:   Sat Jan 25 14:19:40 2020 -0800
>> > >
>> > >     x86-64: Avoid rep movsb with short distance [BZ #27130]
>> > > introduced some regressions on Intel processors without Fast Short REP
>> > > MOV (FSRM).  Add Avoid_Short_Distance_REP_MOVSB to avoid rep movsb with
>> > > short distance only on Intel processors with FSRM.  bench-memmove-large
>> > > on Skylake server shows that cycles of __memmove_evex_unaligned_erms are
>> > > improved for the following data size:
>> > >
>> > >                                   before    after    Improvement
>> > > length=4127, align1=3, align2=0:  479.38    343.00      28%
>> > > length=4223, align1=9, align2=5:  405.62    335.50      17%
>> > > length=8223, align1=3, align2=0:  786.12    495.00      37%
>> > > length=8319, align1=9, align2=5:  256.69    170.38      33%
>> > > length=16415, align1=3, align2=0: 1436.88   839.50      41%
>> > > length=16511, align1=9, align2=5: 1375.50   840.62      39%
>> > > length=32799, align1=3, align2=0: 2890.00   1850.62     36%
>> > > length=32895, align1=9, align2=5: 2891.38   1948.62     32%
>> > >
>> > > There are no regression on Ice Lake server.
>> >
>> > At this point we're waiting on Noah to provide feedback on the performance
>> > results given the alignment nop insertion you provided as a follow-up patch
>
>
> The results with the padding look good!
>
>>
>>
>> We are testing 25 byte nop padding now:
>>
>>
>> https://gitlab.com/x86-glibc/glibc/-/commit/de8985640a568786a59576716db54e0749d420e8
>>
> How did you come to the exact padding choice used?

I first replaced the 9 byte instructions:

        andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB,
__x86_string_control(%rip)
        jz      3f

with a 9-byte NOP and reproduced the regression on Tiger Lake.  It confirmed
that the code layout caused the regression.  I first tried adding
".p2align 4" to the branch targets, which made no difference.  Then I started
adding different sizes of NOPs after

        andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB,
__x86_string_control(%rip)
        jz      3f
        movq    %rdi, %rcx
        subq    %rsi, %rcx
        jmp     2f

with ".nops N".  I started with N == 1 and doubled N in each step.  I
noticed that
improvement started at N == 32.   I started bisecting between 16 and 32:

1. 24 and 32 are good.
2. 24 and 28 are good.
3. 25 is the best overall.
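
In other words, the experiment inserted something like this after the jmp (an
illustrative sketch using the ".nops" directive; the v2 patch attached below
uses explicit multi-byte NOP byte sequences instead):

# if AVOID_SHORT_DISTANCE_REP_MOVSB
	andl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
	jz	3f
	movq	%rdi, %rcx
	subq	%rsi, %rcx
	jmp	2f
	.nops	25	/* Vary the padding size here; 25 was best overall.  */
# endif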

>>
>> > (unless you can confirm this yourself).
>> >
>> > Looking forward to a v2 the incorporates the alignment fix (pending Noah's
>> > comments), and my suggestions below.
>>
>> >
>> > > ---
>> > >  sysdeps/x86/cacheinfo.h                                    | 7 +++++++
>> > >  sysdeps/x86/cpu-features.c                                 | 5 +++++
>> > >  .../x86/include/cpu-features-preferred_feature_index_1.def | 1 +
>> > >  sysdeps/x86/sysdep.h                                       | 3 +++
>> > >  sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S      | 5 +++++
>> > >  5 files changed, 21 insertions(+)
>> > >
>> > > diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
>> > > index eba8dbc4a6..174ea38f5b 100644
>> > > --- a/sysdeps/x86/cacheinfo.h
>> > > +++ b/sysdeps/x86/cacheinfo.h
>> > > @@ -49,6 +49,9 @@ long int __x86_rep_stosb_threshold attribute_hidden = 2048;
>> > >  /* Threshold to stop using Enhanced REP MOVSB.  */
>> > >  long int __x86_rep_movsb_stop_threshold attribute_hidden;
>> > >
>> > > +/* String/memory function control.  */
>> > > +int __x86_string_control attribute_hidden;
>> >
>> > Please expand comment.
>> >
>> > Suggest:
>> >
>> > /* A bit-wise OR of string/memory requirements for optimal performance
>> >    e.g. X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB.  These bits
>> >    are used at runtime to tune implementation behavior.  */
>> > int __x86_string_control attribute_hidden;
>>
>> I will fix it in the v2 patch.
>>
>> Thanks.
>>
>> > > +
>> > >  static void
>> > >  init_cacheinfo (void)
>> > >  {
>> > > @@ -71,5 +74,9 @@ init_cacheinfo (void)
>> > >    __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
>> > >    __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
>> > >    __x86_rep_movsb_stop_threshold =  cpu_features->rep_movsb_stop_threshold;
>> > > +
>> > > +  if (CPU_FEATURES_ARCH_P (cpu_features, Avoid_Short_Distance_REP_MOVSB))
>> > > +    __x86_string_control
>> > > +      |= X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB;
>> >
>> > OK.
>> >
>> > >  }
>> > >  #endif
>> > > diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
>> > > index 706a172ba9..645bba6314 100644
>> > > --- a/sysdeps/x86/cpu-features.c
>> > > +++ b/sysdeps/x86/cpu-features.c
>> > > @@ -555,6 +555,11 @@ init_cpu_features (struct cpu_features *cpu_features)
>> > >           cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
>> > >             |= bit_arch_Prefer_AVX2_STRCMP;
>> > >       }
>> > > +
>> > > +      /* Avoid avoid short distance REP MOVSB on processor with FSRM.  */
>> > > +      if (CPU_FEATURES_CPU_P (cpu_features, FSRM))
>> > > +     cpu_features->preferred[index_arch_Avoid_Short_Distance_REP_MOVSB]
>> > > +       |= bit_arch_Avoid_Short_Distance_REP_MOVSB;
>> >
>> > OK.
>> >
>> > >      }
>> > >    /* This spells out "AuthenticAMD" or "HygonGenuine".  */
>> > >    else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
>> > > diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
>> > > index 133aab19f1..d7c93f00c5 100644
>> > > --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
>> > > +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
>> > > @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
>> > >  BIT (MathVec_Prefer_No_AVX512)
>> > >  BIT (Prefer_FSRM)
>> > >  BIT (Prefer_AVX2_STRCMP)
>> > > +BIT (Avoid_Short_Distance_REP_MOVSB)
>> >
>> > OK.
>> >
>> > > diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
>> > > index 51c069bfe1..35cb90d507 100644
>> > > --- a/sysdeps/x86/sysdep.h
>> > > +++ b/sysdeps/x86/sysdep.h
>> > > @@ -57,6 +57,9 @@ enum cf_protection_level
>> > >  #define STATE_SAVE_MASK \
>> > >    ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))
>> > >
>> >
>> > Suggest adding:
>> >
>> > /* Constants for bits in __x86_string_control:  */
>> >
>> > > +/* Avoid short distance REP MOVSB.  */
>> > > +#define X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB    (1 << 0)
>> >
>> > OK.
>> >
>> > > +
>> > >  #ifdef       __ASSEMBLER__
>> > >
>> > >  /* Syntactic details of assembler.  */
>> > > diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
>> > > index a783da5de2..9f02624375 100644
>> > > --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
>> > > +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
>> > > @@ -325,12 +325,16 @@ L(movsb):
>> > >       /* Avoid slow backward REP MOVSB.  */
>> > >       jb      L(more_8x_vec_backward)
>> > >  # if AVOID_SHORT_DISTANCE_REP_MOVSB
>> > > +     andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
>> > > +     jz      3f
>> >
>> > OK.
>> >
>> > >       movq    %rdi, %rcx
>> > >       subq    %rsi, %rcx
>> > >       jmp     2f
>> > >  # endif
>> > >  1:
>> > >  # if AVOID_SHORT_DISTANCE_REP_MOVSB
>> > > +     andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
>> > > +     jz      3f
>> >
>> > OK.
>> >
>> > >       movq    %rsi, %rcx
>> > >       subq    %rdi, %rcx
>> > >  2:
>> > > @@ -338,6 +342,7 @@ L(movsb):
>> > >     is N*4GB + [1..63] with N >= 0.  */
>> > >       cmpl    $63, %ecx
>> > >       jbe     L(more_2x_vec)  /* Avoid "rep movsb" if ECX <= 63.  */
>> > > +3:
>> >
>> > OK.
>> >
>> > >  # endif
>> > >       mov     %RDX_LP, %RCX_LP
>> > >       rep movsb
>> > >
>> >
>> >
>> > --
>> > Cheers,
>> > Carlos.
>> >
>>
>>
>> --
>> H.J.

Here is the v2 patch:

1.  Add a 25-byte NOP padding after JMP for Avoid_Short_Distance_REP_MOVSB,
which improves bench-memcpy-random performance on Tiger Lake by ~30%.
2. Update comments for __x86_string_control.


-- 
H.J.

[-- Attachment #2: v2-0001-x86-64-Add-Avoid_Short_Distance_REP_MOVSB.patch --]
[-- Type: text/x-patch, Size: 5901 bytes --]

From 7fbe4770ecd7b9d86b733af02f1182d214e52c45 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Thu, 22 Jul 2021 20:26:25 -0700
Subject: [PATCH v2] x86-64: Add Avoid_Short_Distance_REP_MOVSB

commit 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sat Jan 25 14:19:40 2020 -0800

    x86-64: Avoid rep movsb with short distance [BZ #27130]

introduced some regressions on Intel processors without Fast Short REP
MOV (FSRM).  Add Avoid_Short_Distance_REP_MOVSB to avoid rep movsb with
short distance only on Intel processors with FSRM.  bench-memmove-large
on Skylake server shows that cycles of __memmove_evex_unaligned_erms
improve for the following data sizes:

                                  before    after    Improvement
length=4127, align1=3, align2=0:  479.38    343.00      28%
length=4223, align1=9, align2=5:  405.62    335.50      17%
length=8223, align1=3, align2=0:  786.12    495.00      37%
length=8319, align1=9, align2=5:  256.69    170.38      33%
length=16415, align1=3, align2=0: 1436.88   839.50      41%
length=16511, align1=9, align2=5: 1375.50   840.62      39%
length=32799, align1=3, align2=0: 2890.00   1850.62     36%
length=32895, align1=9, align2=5: 2891.38   1948.62     32%

Add a 25-byte NOP padding after JMP for Avoid_Short_Distance_REP_MOVSB,
which improves bench-memcpy-random performance on Tiger Lake by ~30%.
---
 sysdeps/x86/cacheinfo.h                             |  9 +++++++++
 sysdeps/x86/cpu-features.c                          |  5 +++++
 .../cpu-features-preferred_feature_index_1.def      |  1 +
 sysdeps/x86/sysdep.h                                |  5 +++++
 .../x86_64/multiarch/memmove-vec-unaligned-erms.S   | 13 +++++++++++++
 5 files changed, 33 insertions(+)

diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
index eba8dbc4a6..41d2c81369 100644
--- a/sysdeps/x86/cacheinfo.h
+++ b/sysdeps/x86/cacheinfo.h
@@ -49,6 +49,11 @@ long int __x86_rep_stosb_threshold attribute_hidden = 2048;
 /* Threshold to stop using Enhanced REP MOVSB.  */
 long int __x86_rep_movsb_stop_threshold attribute_hidden;
 
+/* A bit-wise OR of string/memory requirements for optimal performance
+   e.g. X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB.  These bits
+   are used at runtime to tune implementation behavior.  */
+int __x86_string_control attribute_hidden;
+
 static void
 init_cacheinfo (void)
 {
@@ -71,5 +76,9 @@ init_cacheinfo (void)
   __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
   __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
   __x86_rep_movsb_stop_threshold =  cpu_features->rep_movsb_stop_threshold;
+
+  if (CPU_FEATURES_ARCH_P (cpu_features, Avoid_Short_Distance_REP_MOVSB))
+    __x86_string_control
+      |= X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB;
 }
 #endif
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 706a172ba9..645bba6314 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -555,6 +555,11 @@ init_cpu_features (struct cpu_features *cpu_features)
 	    cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
 	      |= bit_arch_Prefer_AVX2_STRCMP;
 	}
+
+      /* Avoid avoid short distance REP MOVSB on processor with FSRM.  */
+      if (CPU_FEATURES_CPU_P (cpu_features, FSRM))
+	cpu_features->preferred[index_arch_Avoid_Short_Distance_REP_MOVSB]
+	  |= bit_arch_Avoid_Short_Distance_REP_MOVSB;
     }
   /* This spells out "AuthenticAMD" or "HygonGenuine".  */
   else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
index 133aab19f1..d7c93f00c5 100644
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
@@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
 BIT (MathVec_Prefer_No_AVX512)
 BIT (Prefer_FSRM)
 BIT (Prefer_AVX2_STRCMP)
+BIT (Avoid_Short_Distance_REP_MOVSB)
diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
index 51c069bfe1..cac1d762fb 100644
--- a/sysdeps/x86/sysdep.h
+++ b/sysdeps/x86/sysdep.h
@@ -57,6 +57,11 @@ enum cf_protection_level
 #define STATE_SAVE_MASK \
   ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))
 
+/* Constants for bits in __x86_string_control:  */
+
+/* Avoid short distance REP MOVSB.  */
+#define X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB	(1 << 0)
+
 #ifdef	__ASSEMBLER__
 
 /* Syntactic details of assembler.  */
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index a783da5de2..8d42fe517b 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -325,12 +325,24 @@ L(movsb):
 	/* Avoid slow backward REP MOVSB.  */
 	jb	L(more_8x_vec_backward)
 # if AVOID_SHORT_DISTANCE_REP_MOVSB
+	andl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+	jz	3f
 	movq	%rdi, %rcx
 	subq	%rsi, %rcx
 	jmp	2f
+	/* Add a 25-byte NOP padding here to improve bench-memcpy-random
+	   performance on Skylake and Tiger Lake.  */
+	/* data16 cs nopw 0x0(%rax,%rax,1) */
+	.byte 0x66, 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
+	/* data16 cs nopw 0x0(%rax,%rax,1) */
+	.byte 0x66, 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
+	/* nopl (%rax) */
+	.byte 0x0f, 0x1f, 0x00
 # endif
 1:
 # if AVOID_SHORT_DISTANCE_REP_MOVSB
+	andl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+	jz	3f
 	movq	%rsi, %rcx
 	subq	%rdi, %rcx
 2:
@@ -338,6 +350,7 @@ L(movsb):
    is N*4GB + [1..63] with N >= 0.  */
 	cmpl	$63, %ecx
 	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
+3:
 # endif
 	mov	%RDX_LP, %RCX_LP
 	rep movsb
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH v2] x86-64: Add Avoid_Short_Distance_REP_MOVSB
  2021-07-27 16:05       ` [PATCH v2] " H.J. Lu via Libc-alpha
@ 2021-07-27 19:12         ` Noah Goldstein via Libc-alpha
  2021-07-27 19:22           ` [PATCH v3] " H.J. Lu via Libc-alpha
  0 siblings, 1 reply; 15+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2021-07-27 19:12 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library

On Tue, Jul 27, 2021 at 12:06 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> On Mon, Jul 26, 2021 at 9:06 PM Noah Goldstein <goldstein.w.n@gmail.com>
> wrote:
> >
> >
> >
> > On Mon, Jul 26, 2021 at 11:11 PM H.J. Lu via Libc-alpha <
> libc-alpha@sourceware.org> wrote:
> >>
> >> On Mon, Jul 26, 2021 at 7:15 PM Carlos O'Donell <carlos@redhat.com>
> wrote:
> >> >
> >> > On 7/26/21 8:00 AM, H.J. Lu via Libc-alpha wrote:
> >> > > commit 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5
> >> > > Author: H.J. Lu <hjl.tools@gmail.com>
> >> > > Date:   Sat Jan 25 14:19:40 2020 -0800
> >> > >
> >> > >     x86-64: Avoid rep movsb with short distance [BZ #27130]
> >> > > introduced some regressions on Intel processors without Fast Short
> REP
> >> > > MOV (FSRM).  Add Avoid_Short_Distance_REP_MOVSB to avoid rep movsb
> with
> >> > > short distance only on Intel processors with FSRM.
> bench-memmove-large
> >> > > on Skylake server shows that cycles of
> __memmove_evex_unaligned_erms are
> >> > > improved for the following data size:
> >> > >
> >> > >                                   before    after    Improvement
> >> > > length=4127, align1=3, align2=0:  479.38    343.00      28%
> >> > > length=4223, align1=9, align2=5:  405.62    335.50      17%
> >> > > length=8223, align1=3, align2=0:  786.12    495.00      37%
> >> > > length=8319, align1=9, align2=5:  256.69    170.38      33%
> >> > > length=16415, align1=3, align2=0: 1436.88   839.50      41%
> >> > > length=16511, align1=9, align2=5: 1375.50   840.62      39%
> >> > > length=32799, align1=3, align2=0: 2890.00   1850.62     36%
> >> > > length=32895, align1=9, align2=5: 2891.38   1948.62     32%
> >> > >
> >> > > There are no regression on Ice Lake server.
> >> >
> >> > At this point we're waiting on Noah to provide feedback on the
> performance
> >> > results given the alignment nop insertion you provided as a follow-up
> patch
> >
> >
> > The results with the padding look good!
> >
> >>
> >>
> >> We are testing 25 byte nop padding now:
> >>
> >>
> >>
> https://gitlab.com/x86-glibc/glibc/-/commit/de8985640a568786a59576716db54e0749d420e8
> >>
> > How did you come to the exact padding choice used?
>
> I first replaced the 9 byte instructions:
>
>         andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB,
> __x86_string_control(%rip)
>         jz      3f
>
> with a 9-byte NOP and reproduced the regression on Tiger Lake.  It
> confirmed
> that the code layout caused the regression.    I first tried adding
> ".p2align 4" to
> branch targets and they made no differences.   Then I started adding
> different
> size of nops after
>
>         andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB,
> __x86_string_control(%rip)
>         jz      3f
>         movq    %rdi, %rcx
>         subq    %rsi, %rcx
>         jmp     2f
>
> with ".nops N".  I started with N == 1 and doubled N in each step.  I
> noticed that
> improvement started at N == 32.   I started bisecting between 16 and 32:
>
> 1. 24 and 32 are good.
> 2. 24 and 28 are good.
> 3. 25 is the best overall.
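> To make the layout effect concrete, the sequence under test looked
> roughly like this (a sketch, assuming GNU as with the .nops directive;
> 25 is the padding size that ended up winning):
>
>         andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB,
> __x86_string_control(%rip)
>         jz      3f
>         movq    %rdi, %rcx
>         subq    %rsi, %rcx
>         jmp     2f
>         .nops   25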
>
> >>
> >> > (unless you can confirm this yourself).
> >> >
> >> > Looking forward to a v2 the incorporates the alignment fix (pending
> Noah's
> >> > comments), and my suggestions below.
> >>
> >> >
> >> > > ---
> >> > >  sysdeps/x86/cacheinfo.h                                    | 7
> +++++++
> >> > >  sysdeps/x86/cpu-features.c                                 | 5
> +++++
> >> > >  .../x86/include/cpu-features-preferred_feature_index_1.def | 1 +
> >> > >  sysdeps/x86/sysdep.h                                       | 3 +++
> >> > >  sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S      | 5
> +++++
> >> > >  5 files changed, 21 insertions(+)
> >> > >
> >> > > diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
> >> > > index eba8dbc4a6..174ea38f5b 100644
> >> > > --- a/sysdeps/x86/cacheinfo.h
> >> > > +++ b/sysdeps/x86/cacheinfo.h
> >> > > @@ -49,6 +49,9 @@ long int __x86_rep_stosb_threshold
> attribute_hidden = 2048;
> >> > >  /* Threshold to stop using Enhanced REP MOVSB.  */
> >> > >  long int __x86_rep_movsb_stop_threshold attribute_hidden;
> >> > >
> >> > > +/* String/memory function control.  */
> >> > > +int __x86_string_control attribute_hidden;
> >> >
> >> > Please expand comment.
> >> >
> >> > Suggest:
> >> >
> >> > /* A bit-wise OR of string/memory requirements for optimal performance
> >> >    e.g. X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB.  These bits
> >> >    are used at runtime to tune implementation behavior.  */
> >> > int __x86_string_control attribute_hidden;
> >>
> >> I will fix it in the v2 patch.
> >>
> >> Thanks.
> >>
> >> > > +
> >> > >  static void
> >> > >  init_cacheinfo (void)
> >> > >  {
> >> > > @@ -71,5 +74,9 @@ init_cacheinfo (void)
> >> > >    __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
> >> > >    __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
> >> > >    __x86_rep_movsb_stop_threshold =
> cpu_features->rep_movsb_stop_threshold;
> >> > > +
> >> > > +  if (CPU_FEATURES_ARCH_P (cpu_features,
> Avoid_Short_Distance_REP_MOVSB))
> >> > > +    __x86_string_control
> >> > > +      |= X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB;
> >> >
> >> > OK.
> >> >
> >> > >  }
> >> > >  #endif
> >> > > diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> >> > > index 706a172ba9..645bba6314 100644
> >> > > --- a/sysdeps/x86/cpu-features.c
> >> > > +++ b/sysdeps/x86/cpu-features.c
> >> > > @@ -555,6 +555,11 @@ init_cpu_features (struct cpu_features
> *cpu_features)
> >> > >           cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
> >> > >             |= bit_arch_Prefer_AVX2_STRCMP;
> >> > >       }
> >> > > +
> >> > > +      /* Avoid avoid short distance REP MOVSB on processor with
> FSRM.  */
> >> > > +      if (CPU_FEATURES_CPU_P (cpu_features, FSRM))
> >> > > +
>  cpu_features->preferred[index_arch_Avoid_Short_Distance_REP_MOVSB]
> >> > > +       |= bit_arch_Avoid_Short_Distance_REP_MOVSB;
> >> >
> >> > OK.
> >> >
> >> > >      }
> >> > >    /* This spells out "AuthenticAMD" or "HygonGenuine".  */
> >> > >    else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx ==
> 0x69746e65)
> >> > > diff --git
> a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> >> > > index 133aab19f1..d7c93f00c5 100644
> >> > > --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> >> > > +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> >> > > @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
> >> > >  BIT (MathVec_Prefer_No_AVX512)
> >> > >  BIT (Prefer_FSRM)
> >> > >  BIT (Prefer_AVX2_STRCMP)
> >> > > +BIT (Avoid_Short_Distance_REP_MOVSB)
> >> >
> >> > OK.
> >> >
> >> > > diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
> >> > > index 51c069bfe1..35cb90d507 100644
> >> > > --- a/sysdeps/x86/sysdep.h
> >> > > +++ b/sysdeps/x86/sysdep.h
> >> > > @@ -57,6 +57,9 @@ enum cf_protection_level
> >> > >  #define STATE_SAVE_MASK \
> >> > >    ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))
> >> > >
> >> >
> >> > Suggest adding:
> >> >
> >> > /* Constants for bits in __x86_string_control:  */
> >> >
> >> > > +/* Avoid short distance REP MOVSB.  */
> >> > > +#define X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB    (1 <<
> 0)
> >> >
> >> > OK.
> >> >
> >> > > +
> >> > >  #ifdef       __ASSEMBLER__
> >> > >
> >> > >  /* Syntactic details of assembler.  */
> >> > > diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> >> > > index a783da5de2..9f02624375 100644
> >> > > --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> >> > > +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> >> > > @@ -325,12 +325,16 @@ L(movsb):
> >> > >       /* Avoid slow backward REP MOVSB.  */
> >> > >       jb      L(more_8x_vec_backward)
> >> > >  # if AVOID_SHORT_DISTANCE_REP_MOVSB
> >> > > +     andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB,
> __x86_string_control(%rip)
> >> > > +     jz      3f
> >> >
> >> > OK.
> >> >
> >> > >       movq    %rdi, %rcx
> >> > >       subq    %rsi, %rcx
> >> > >       jmp     2f
> >> > >  # endif
> >> > >  1:
> >> > >  # if AVOID_SHORT_DISTANCE_REP_MOVSB
> >> > > +     andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB,
> __x86_string_control(%rip)
> >> > > +     jz      3f
> >> >
> >> > OK.
> >> >
> >> > >       movq    %rsi, %rcx
> >> > >       subq    %rdi, %rcx
> >> > >  2:
> >> > > @@ -338,6 +342,7 @@ L(movsb):
> >> > >     is N*4GB + [1..63] with N >= 0.  */
> >> > >       cmpl    $63, %ecx
> >> > >       jbe     L(more_2x_vec)  /* Avoid "rep movsb" if ECX <= 63.  */
> >> > > +3:
> >> >
> >> > OK.
> >> >
> >> > >  # endif
> >> > >       mov     %RDX_LP, %RCX_LP
> >> > >       rep movsb
> >> > >
> >> >
> >> >
> >> > --
> >> > Cheers,
> >> > Carlos.
> >> >
> >>
> >>
> >> --
> >> H.J.
>
> Here is the v2 patch:
>
> 1.  Add a 25-byte NOP padding after JMP for Avoid_Short_Distance_REP_MOVSB,
> which improves bench-memcpy-random performance on Tiger Lake by ~30%


I think this improvement may be due to unrelated factors. I reran the
random benchmarks with the function on a fresh page and the entry of the
*_erms version at either +0, 16, 32 or 48 bytes, and 1) don't see a
universal improvement and 2) believe it's likely that the 30% you measured
is due to unrelated alignment changes.

"__memcpy_avx_unaligned", "__memcpy_avx_unaligned_erms",
"__memcpy_evex_unaligned", "__memcpy_evex_unaligned_erms"

+ 0 Entry Alignment
New: 91824.1, 95460.1, 95063.3, 97998.3
Old: 99973.7, 100127, 100370, 100049

+ 16 Entry Alignment
New: 129558, 129916, 122373, 124056
Old: 125361, 96475.4, 97457.8, 124319

+ 32 Entry Alignment
New: 95073.7, 92857.8, 90182.2, 92666.3
Old: 96702.1, 98558.9, 96797.1, 96887.1

+ 48 Entry Alignment
New: 135161, 134010, 123023, 148589
Old: 128150, 139029, 98382.2, 122686

So the 32 byte/64 byte entry alignment versions seem to favor this change,
but when the entry is not 32-byte aligned this change seems to perform worse.

I generally think that until we understand why the byte padding is necessary,
it's probably a mistake to include it unless it's a benefit in actual SPEC2017.

My general intuition is that this is an issue with the benchmarks themselves,
so I support the change without the padding.
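
(For reference, one way to force these entry offsets, as a sketch and not
necessarily the exact harness used:)

	.p2align 12	/* place the function on a fresh page */
	.nops	16	/* entry offset under test: 0, 16, 32 or 48 */
	/* __memcpy_*_erms entry follows */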


> 2. Update comments for __x86_string_control.
>
>
> --
> H.J.
>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH v3] x86-64: Add Avoid_Short_Distance_REP_MOVSB
  2021-07-27 19:12         ` Noah Goldstein via Libc-alpha
@ 2021-07-27 19:22           ` H.J. Lu via Libc-alpha
  2021-07-27 19:50             ` Noah Goldstein via Libc-alpha
  0 siblings, 1 reply; 15+ messages in thread
From: H.J. Lu via Libc-alpha @ 2021-07-27 19:22 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library

[-- Attachment #1: Type: text/plain, Size: 10875 bytes --]

On Tue, Jul 27, 2021 at 12:12 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
>
>
> On Tue, Jul 27, 2021 at 12:06 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>>
>> On Mon, Jul 26, 2021 at 9:06 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>> >
>> >
>> >
>> > On Mon, Jul 26, 2021 at 11:11 PM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote:
>> >>
>> >> On Mon, Jul 26, 2021 at 7:15 PM Carlos O'Donell <carlos@redhat.com> wrote:
>> >> >
>> >> > On 7/26/21 8:00 AM, H.J. Lu via Libc-alpha wrote:
>> >> > > commit 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5
>> >> > > Author: H.J. Lu <hjl.tools@gmail.com>
>> >> > > Date:   Sat Jan 25 14:19:40 2020 -0800
>> >> > >
>> >> > >     x86-64: Avoid rep movsb with short distance [BZ #27130]
>> >> > > introduced some regressions on Intel processors without Fast Short REP
>> >> > > MOV (FSRM).  Add Avoid_Short_Distance_REP_MOVSB to avoid rep movsb with
>> >> > > short distance only on Intel processors with FSRM.  bench-memmove-large
>> >> > > on Skylake server shows that cycles of __memmove_evex_unaligned_erms are
>> >> > > improved for the following data size:
>> >> > >
>> >> > >                                   before    after    Improvement
>> >> > > length=4127, align1=3, align2=0:  479.38    343.00      28%
>> >> > > length=4223, align1=9, align2=5:  405.62    335.50      17%
>> >> > > length=8223, align1=3, align2=0:  786.12    495.00      37%
>> >> > > length=8319, align1=9, align2=5:  256.69    170.38      33%
>> >> > > length=16415, align1=3, align2=0: 1436.88   839.50      41%
>> >> > > length=16511, align1=9, align2=5: 1375.50   840.62      39%
>> >> > > length=32799, align1=3, align2=0: 2890.00   1850.62     36%
>> >> > > length=32895, align1=9, align2=5: 2891.38   1948.62     32%
>> >> > >
>> >> > > There are no regression on Ice Lake server.
>> >> >
>> >> > At this point we're waiting on Noah to provide feedback on the performance
>> >> > results given the alignment nop insertion you provided as a follow-up patch
>> >
>> >
>> > The results with the padding look good!
>> >
>> >>
>> >>
>> >> We are testing 25 byte nop padding now:
>> >>
>> >>
>> >> https://gitlab.com/x86-glibc/glibc/-/commit/de8985640a568786a59576716db54e0749d420e8
>> >>
>> > How did you come to the exact padding choice used?
>>
>> I first replaced the 9 byte instructions:
>>
>>         andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB,
>> __x86_string_control(%rip)
>>         jz      3f
>>
>> with a 9-byte NOP and reproduced the regression on Tiger Lake.  It confirmed
>> that the code layout caused the regression.    I first tried adding
>> ".p2align 4" to
>> branch targets and they made no differences.   Then I started adding different
>> size of nops after
>>
>>         andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB,
>> __x86_string_control(%rip)
>>         jz      3f
>>         movq    %rdi, %rcx
>>         subq    %rsi, %rcx
>>         jmp     2f
>>
>> with ".nops N".  I started with N == 1 and doubled N in each step.  I
>> noticed that
>> improvement started at N == 32.   I started bisecting between 16 and 32:
>>
>> 1. 24 and 32 are good.
>> 2. 24 and 28 are good.
>> 3. 25 is the best overall.
>>
>> >>
>> >> > (unless you can confirm this yourself).
>> >> >
>> >> > Looking forward to a v2 the incorporates the alignment fix (pending Noah's
>> >> > comments), and my suggestions below.
>> >>
>> >> >
>> >> > > ---
>> >> > >  sysdeps/x86/cacheinfo.h                                    | 7 +++++++
>> >> > >  sysdeps/x86/cpu-features.c                                 | 5 +++++
>> >> > >  .../x86/include/cpu-features-preferred_feature_index_1.def | 1 +
>> >> > >  sysdeps/x86/sysdep.h                                       | 3 +++
>> >> > >  sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S      | 5 +++++
>> >> > >  5 files changed, 21 insertions(+)
>> >> > >
>> >> > > diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
>> >> > > index eba8dbc4a6..174ea38f5b 100644
>> >> > > --- a/sysdeps/x86/cacheinfo.h
>> >> > > +++ b/sysdeps/x86/cacheinfo.h
>> >> > > @@ -49,6 +49,9 @@ long int __x86_rep_stosb_threshold attribute_hidden = 2048;
>> >> > >  /* Threshold to stop using Enhanced REP MOVSB.  */
>> >> > >  long int __x86_rep_movsb_stop_threshold attribute_hidden;
>> >> > >
>> >> > > +/* String/memory function control.  */
>> >> > > +int __x86_string_control attribute_hidden;
>> >> >
>> >> > Please expand comment.
>> >> >
>> >> > Suggest:
>> >> >
>> >> > /* A bit-wise OR of string/memory requirements for optimal performance
>> >> >    e.g. X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB.  These bits
>> >> >    are used at runtime to tune implementation behavior.  */
>> >> > int __x86_string_control attribute_hidden;
>> >>
>> >> I will fix it in the v2 patch.
>> >>
>> >> Thanks.
>> >>
>> >> > > +
>> >> > >  static void
>> >> > >  init_cacheinfo (void)
>> >> > >  {
>> >> > > @@ -71,5 +74,9 @@ init_cacheinfo (void)
>> >> > >    __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
>> >> > >    __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
>> >> > >    __x86_rep_movsb_stop_threshold =  cpu_features->rep_movsb_stop_threshold;
>> >> > > +
>> >> > > +  if (CPU_FEATURES_ARCH_P (cpu_features, Avoid_Short_Distance_REP_MOVSB))
>> >> > > +    __x86_string_control
>> >> > > +      |= X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB;
>> >> >
>> >> > OK.
>> >> >
>> >> > >  }
>> >> > >  #endif
>> >> > > diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
>> >> > > index 706a172ba9..645bba6314 100644
>> >> > > --- a/sysdeps/x86/cpu-features.c
>> >> > > +++ b/sysdeps/x86/cpu-features.c
>> >> > > @@ -555,6 +555,11 @@ init_cpu_features (struct cpu_features *cpu_features)
>> >> > >           cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
>> >> > >             |= bit_arch_Prefer_AVX2_STRCMP;
>> >> > >       }
>> >> > > +
>> >> > > +      /* Avoid avoid short distance REP MOVSB on processor with FSRM.  */
>> >> > > +      if (CPU_FEATURES_CPU_P (cpu_features, FSRM))
>> >> > > +     cpu_features->preferred[index_arch_Avoid_Short_Distance_REP_MOVSB]
>> >> > > +       |= bit_arch_Avoid_Short_Distance_REP_MOVSB;
>> >> >
>> >> > OK.
>> >> >
>> >> > >      }
>> >> > >    /* This spells out "AuthenticAMD" or "HygonGenuine".  */
>> >> > >    else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
>> >> > > diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
>> >> > > index 133aab19f1..d7c93f00c5 100644
>> >> > > --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
>> >> > > +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
>> >> > > @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
>> >> > >  BIT (MathVec_Prefer_No_AVX512)
>> >> > >  BIT (Prefer_FSRM)
>> >> > >  BIT (Prefer_AVX2_STRCMP)
>> >> > > +BIT (Avoid_Short_Distance_REP_MOVSB)
>> >> >
>> >> > OK.
>> >> >
>> >> > > diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
>> >> > > index 51c069bfe1..35cb90d507 100644
>> >> > > --- a/sysdeps/x86/sysdep.h
>> >> > > +++ b/sysdeps/x86/sysdep.h
>> >> > > @@ -57,6 +57,9 @@ enum cf_protection_level
>> >> > >  #define STATE_SAVE_MASK \
>> >> > >    ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))
>> >> > >
>> >> >
>> >> > Suggest adding:
>> >> >
>> >> > /* Constants for bits in __x86_string_control:  */
>> >> >
>> >> > > +/* Avoid short distance REP MOVSB.  */
>> >> > > +#define X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB    (1 << 0)
>> >> >
>> >> > OK.
>> >> >
>> >> > > +
>> >> > >  #ifdef       __ASSEMBLER__
>> >> > >
>> >> > >  /* Syntactic details of assembler.  */
>> >> > > diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
>> >> > > index a783da5de2..9f02624375 100644
>> >> > > --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
>> >> > > +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
>> >> > > @@ -325,12 +325,16 @@ L(movsb):
>> >> > >       /* Avoid slow backward REP MOVSB.  */
>> >> > >       jb      L(more_8x_vec_backward)
>> >> > >  # if AVOID_SHORT_DISTANCE_REP_MOVSB
>> >> > > +     andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
>> >> > > +     jz      3f
>> >> >
>> >> > OK.
>> >> >
>> >> > >       movq    %rdi, %rcx
>> >> > >       subq    %rsi, %rcx
>> >> > >       jmp     2f
>> >> > >  # endif
>> >> > >  1:
>> >> > >  # if AVOID_SHORT_DISTANCE_REP_MOVSB
>> >> > > +     andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
>> >> > > +     jz      3f
>> >> >
>> >> > OK.
>> >> >
>> >> > >       movq    %rsi, %rcx
>> >> > >       subq    %rdi, %rcx
>> >> > >  2:
>> >> > > @@ -338,6 +342,7 @@ L(movsb):
>> >> > >     is N*4GB + [1..63] with N >= 0.  */
>> >> > >       cmpl    $63, %ecx
>> >> > >       jbe     L(more_2x_vec)  /* Avoid "rep movsb" if ECX <= 63.  */
>> >> > > +3:
>> >> >
>> >> > OK.
>> >> >
>> >> > >  # endif
>> >> > >       mov     %RDX_LP, %RCX_LP
>> >> > >       rep movsb
>> >> > >
>> >> >
>> >> >
>> >> > --
>> >> > Cheers,
>> >> > Carlos.
>> >> >
>> >>
>> >>
>> >> --
>> >> H.J.
>>
>> Here is the v2 patch:
>>
>> 1.  Add a 25-byte NOP padding after JMP for Avoid_Short_Distance_REP_MOVSB,
>> which improves bench-memcpy-random performance on Tiger Lake by ~30%
>
>
> I think this may not be due to unrelated factors. I reran the random benchmarks with
> the function on a fresh page and entry of the *_erms version at either + 0, 16, 32, 48
> bytes and 1) don't see a universal improvement and 2) believe it's likely that the 30%
> you measured is due to unrelated alignment changes.
>
> "__memcpy_avx_unaligned", "__memcpy_avx_unaligned_erms", "__memcpy_evex_unaligned", "__memcpy_evex_unaligned_erms"
>
> + 0 Entry Alignment
> New: 91824.1, 95460.1, 95063.3, 97998.3
> Old: 99973.7, 100127, 100370, 100049
>
> + 16 Entry Alignment
> New: 129558, 129916, 122373, 124056
> Old: 125361, 96475.4, 97457.8, 124319
>
> + 32 Entry Alignment
> New: 95073.7, 92857.8, 90182.2, 92666.3
> Old: 96702.1, 98558.9, 96797.1, 96887.1
>
> + 48 Entry Alignment
> New: 135161, 134010, 123023, 148589
> Old: 128150, 139029, 98382.2, 122686
>
> So the 32 byte/64 byte entry alignment versions seem to favor this change,
> but when entry alignment % 16 != 0 this change seems to perform worse.
>
> I generally think until we understand why the byte padding is necessary its
> probably a mistake to include it unless its a benefit in actual SPEC2017.
>
> My general intuition is that this is an issue with the benchmarks themselves
> so I support the change w.o the padding.

Here is the v3 patch without the padding.  OK for master?

>
>> 2. Update comments for __x86_string_control.
>>
>>
>> --
>> H.J.



-- 
H.J.

[-- Attachment #2: v3-0001-x86-64-Add-Avoid_Short_Distance_REP_MOVSB.patch --]
[-- Type: text/x-patch, Size: 5384 bytes --]

From 8ae1b914e8c0e0f58249937a2f348c9b297f70f6 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Thu, 22 Jul 2021 20:26:25 -0700
Subject: [PATCH v3] x86-64: Add Avoid_Short_Distance_REP_MOVSB

commit 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sat Jan 25 14:19:40 2020 -0800

    x86-64: Avoid rep movsb with short distance [BZ #27130]

introduced some regressions on Intel processors without Fast Short REP
MOV (FSRM).  Add Avoid_Short_Distance_REP_MOVSB to avoid rep movsb with
short distance only on Intel processors with FSRM.  bench-memmove-large
on Skylake server shows that cycles of __memmove_evex_unaligned_erms
improve for the following data sizes:

                                  before    after    Improvement
length=4127, align1=3, align2=0:  479.38    349.25      27%
length=4223, align1=9, align2=5:  405.62    333.25      18%
length=8223, align1=3, align2=0:  786.12    496.38      37%
length=8319, align1=9, align2=5:  727.50    501.38      31%
length=16415, align1=3, align2=0: 1436.88   840.00      41%
length=16511, align1=9, align2=5: 1375.50   836.38      39%
length=32799, align1=3, align2=0: 2890.00   1860.12     36%
length=32895, align1=9, align2=5: 2891.38   1931.88     33%
---
 sysdeps/x86/cacheinfo.h                                  | 9 +++++++++
 sysdeps/x86/cpu-features.c                               | 5 +++++
 .../include/cpu-features-preferred_feature_index_1.def   | 1 +
 sysdeps/x86/sysdep.h                                     | 5 +++++
 sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S    | 5 +++++
 5 files changed, 25 insertions(+)

diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
index eba8dbc4a6..41d2c81369 100644
--- a/sysdeps/x86/cacheinfo.h
+++ b/sysdeps/x86/cacheinfo.h
@@ -49,6 +49,11 @@ long int __x86_rep_stosb_threshold attribute_hidden = 2048;
 /* Threshold to stop using Enhanced REP MOVSB.  */
 long int __x86_rep_movsb_stop_threshold attribute_hidden;
 
+/* A bit-wise OR of string/memory requirements for optimal performance
+   e.g. X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB.  These bits
+   are used at runtime to tune implementation behavior.  */
+int __x86_string_control attribute_hidden;
+
 static void
 init_cacheinfo (void)
 {
@@ -71,5 +76,9 @@ init_cacheinfo (void)
   __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
   __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
   __x86_rep_movsb_stop_threshold =  cpu_features->rep_movsb_stop_threshold;
+
+  if (CPU_FEATURES_ARCH_P (cpu_features, Avoid_Short_Distance_REP_MOVSB))
+    __x86_string_control
+      |= X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB;
 }
 #endif
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 706a172ba9..645bba6314 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -555,6 +555,11 @@ init_cpu_features (struct cpu_features *cpu_features)
 	    cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
 	      |= bit_arch_Prefer_AVX2_STRCMP;
 	}
+
+      /* Avoid short distance REP MOVSB on processors with FSRM.  */
+      if (CPU_FEATURES_CPU_P (cpu_features, FSRM))
+	cpu_features->preferred[index_arch_Avoid_Short_Distance_REP_MOVSB]
+	  |= bit_arch_Avoid_Short_Distance_REP_MOVSB;
     }
   /* This spells out "AuthenticAMD" or "HygonGenuine".  */
   else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
index 133aab19f1..d7c93f00c5 100644
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
@@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
 BIT (MathVec_Prefer_No_AVX512)
 BIT (Prefer_FSRM)
 BIT (Prefer_AVX2_STRCMP)
+BIT (Avoid_Short_Distance_REP_MOVSB)
diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
index 51c069bfe1..cac1d762fb 100644
--- a/sysdeps/x86/sysdep.h
+++ b/sysdeps/x86/sysdep.h
@@ -57,6 +57,11 @@ enum cf_protection_level
 #define STATE_SAVE_MASK \
   ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))
 
+/* Constants for bits in __x86_string_control:  */
+
+/* Avoid short distance REP MOVSB.  */
+#define X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB	(1 << 0)
+
 #ifdef	__ASSEMBLER__
 
 /* Syntactic details of assembler.  */
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index a783da5de2..9f02624375 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -325,12 +325,16 @@ L(movsb):
 	/* Avoid slow backward REP MOVSB.  */
 	jb	L(more_8x_vec_backward)
 # if AVOID_SHORT_DISTANCE_REP_MOVSB
+	andl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+	jz	3f
 	movq	%rdi, %rcx
 	subq	%rsi, %rcx
 	jmp	2f
 # endif
 1:
 # if AVOID_SHORT_DISTANCE_REP_MOVSB
+	andl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+	jz	3f
 	movq	%rsi, %rcx
 	subq	%rdi, %rcx
 2:
@@ -338,6 +342,7 @@ L(movsb):
    is N*4GB + [1..63] with N >= 0.  */
 	cmpl	$63, %ecx
 	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
+3:
 # endif
 	mov	%RDX_LP, %RCX_LP
 	rep movsb
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH v3] x86-64: Add Avoid_Short_Distance_REP_MOVSB
  2021-07-27 19:22           ` [PATCH v3] " H.J. Lu via Libc-alpha
@ 2021-07-27 19:50             ` Noah Goldstein via Libc-alpha
  2021-07-27 19:55               ` H.J. Lu via Libc-alpha
  2021-07-28 15:08               ` H.J. Lu via Libc-alpha
  0 siblings, 2 replies; 15+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2021-07-27 19:50 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library

On Tue, Jul 27, 2021 at 3:23 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> On Tue, Jul 27, 2021 at 12:12 PM Noah Goldstein <goldstein.w.n@gmail.com>
> wrote:
> >
> >
> >
> > On Tue, Jul 27, 2021 at 12:06 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >>
> >> On Mon, Jul 26, 2021 at 9:06 PM Noah Goldstein <goldstein.w.n@gmail.com>
> wrote:
> >> >
> >> >
> >> >
> >> > On Mon, Jul 26, 2021 at 11:11 PM H.J. Lu via Libc-alpha <
> libc-alpha@sourceware.org> wrote:
> >> >>
> >> >> On Mon, Jul 26, 2021 at 7:15 PM Carlos O'Donell <carlos@redhat.com>
> wrote:
> >> >> >
> >> >> > On 7/26/21 8:00 AM, H.J. Lu via Libc-alpha wrote:
> >> >> > > commit 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5
> >> >> > > Author: H.J. Lu <hjl.tools@gmail.com>
> >> >> > > Date:   Sat Jan 25 14:19:40 2020 -0800
> >> >> > >
> >> >> > >     x86-64: Avoid rep movsb with short distance [BZ #27130]
> >> >> > > introduced some regressions on Intel processors without Fast
> Short REP
> >> >> > > MOV (FSRM).  Add Avoid_Short_Distance_REP_MOVSB to avoid rep
> movsb with
> >> >> > > short distance only on Intel processors with FSRM.
> bench-memmove-large
> >> >> > > on Skylake server shows that cycles of
> __memmove_evex_unaligned_erms are
> >> >> > > improved for the following data size:
> >> >> > >
> >> >> > >                                   before    after    Improvement
> >> >> > > length=4127, align1=3, align2=0:  479.38    343.00      28%
> >> >> > > length=4223, align1=9, align2=5:  405.62    335.50      17%
> >> >> > > length=8223, align1=3, align2=0:  786.12    495.00      37%
> >> >> > > length=8319, align1=9, align2=5:  256.69    170.38      33%
> >> >> > > length=16415, align1=3, align2=0: 1436.88   839.50      41%
> >> >> > > length=16511, align1=9, align2=5: 1375.50   840.62      39%
> >> >> > > length=32799, align1=3, align2=0: 2890.00   1850.62     36%
> >> >> > > length=32895, align1=9, align2=5: 2891.38   1948.62     32%
> >> >> > >
> >> >> > > There are no regression on Ice Lake server.
> >> >> >
> >> >> > At this point we're waiting on Noah to provide feedback on the
> performance
> >> >> > results given the alignment nop insertion you provided as a
> follow-up patch
> >> >
> >> >
> >> > The results with the padding look good!
> >> >
> >> >>
> >> >>
> >> >> We are testing 25 byte nop padding now:
> >> >>
> >> >>
> >> >>
> https://gitlab.com/x86-glibc/glibc/-/commit/de8985640a568786a59576716db54e0749d420e8
> >> >>
> >> > How did you come to the exact padding choice used?
> >>
> >> I first replaced the 9 byte instructions:
> >>
> >>         andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB,
> >> __x86_string_control(%rip)
> >>         jz      3f
> >>
> >> with a 9-byte NOP and reproduced the regression on Tiger Lake.  It
> confirmed
> >> that the code layout caused the regression.    I first tried adding
> >> ".p2align 4" to
> >> branch targets and they made no differences.   Then I started adding
> different
> >> size of nops after
> >>
> >>         andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB,
> >> __x86_string_control(%rip)
> >>         jz      3f
> >>         movq    %rdi, %rcx
> >>         subq    %rsi, %rcx
> >>         jmp     2f
> >>
> >> with ".nops N".  I started with N == 1 and doubled N in each step.  I
> >> noticed that
> >> improvement started at N == 32.   I started bisecting between 16 and 32:
> >>
> >> 1. 24 and 32 are good.
> >> 2. 24 and 28 are good.
> >> 3. 25 is the best overall.
> >>
> >> >>
> >> >> > (unless you can confirm this yourself).
> >> >> >
> >> >> > Looking forward to a v2 the incorporates the alignment fix
> (pending Noah's
> >> >> > comments), and my suggestions below.
> >> >>
> >> >> >
> >> >> > > ---
> >> >> > >  sysdeps/x86/cacheinfo.h                                    | 7
> +++++++
> >> >> > >  sysdeps/x86/cpu-features.c                                 | 5
> +++++
> >> >> > >  .../x86/include/cpu-features-preferred_feature_index_1.def | 1 +
> >> >> > >  sysdeps/x86/sysdep.h                                       | 3
> +++
> >> >> > >  sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S      | 5
> +++++
> >> >> > >  5 files changed, 21 insertions(+)
> >> >> > >
> >> >> > > diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
> >> >> > > index eba8dbc4a6..174ea38f5b 100644
> >> >> > > --- a/sysdeps/x86/cacheinfo.h
> >> >> > > +++ b/sysdeps/x86/cacheinfo.h
> >> >> > > @@ -49,6 +49,9 @@ long int __x86_rep_stosb_threshold
> attribute_hidden = 2048;
> >> >> > >  /* Threshold to stop using Enhanced REP MOVSB.  */
> >> >> > >  long int __x86_rep_movsb_stop_threshold attribute_hidden;
> >> >> > >
> >> >> > > +/* String/memory function control.  */
> >> >> > > +int __x86_string_control attribute_hidden;
> >> >> >
> >> >> > Please expand comment.
> >> >> >
> >> >> > Suggest:
> >> >> >
> >> >> > /* A bit-wise OR of string/memory requirements for optimal
> performance
> >> >> >    e.g. X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB.  These
> bits
> >> >> >    are used at runtime to tune implementation behavior.  */
> >> >> > int __x86_string_control attribute_hidden;
> >> >>
> >> >> I will fix it in the v2 patch.
> >> >>
> >> >> Thanks.
> >> >>
> >> >> > > +
> >> >> > >  static void
> >> >> > >  init_cacheinfo (void)
> >> >> > >  {
> >> >> > > @@ -71,5 +74,9 @@ init_cacheinfo (void)
> >> >> > >    __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
> >> >> > >    __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
> >> >> > >    __x86_rep_movsb_stop_threshold =
> cpu_features->rep_movsb_stop_threshold;
> >> >> > > +
> >> >> > > +  if (CPU_FEATURES_ARCH_P (cpu_features,
> Avoid_Short_Distance_REP_MOVSB))
> >> >> > > +    __x86_string_control
> >> >> > > +      |= X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB;
> >> >> >
> >> >> > OK.
> >> >> >
> >> >> > >  }
> >> >> > >  #endif
> >> >> > > diff --git a/sysdeps/x86/cpu-features.c
> b/sysdeps/x86/cpu-features.c
> >> >> > > index 706a172ba9..645bba6314 100644
> >> >> > > --- a/sysdeps/x86/cpu-features.c
> >> >> > > +++ b/sysdeps/x86/cpu-features.c
> >> >> > > @@ -555,6 +555,11 @@ init_cpu_features (struct cpu_features
> *cpu_features)
> >> >> > >           cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
> >> >> > >             |= bit_arch_Prefer_AVX2_STRCMP;
> >> >> > >       }
> >> >> > > +
> >> >> > > +      /* Avoid avoid short distance REP MOVSB on processor with
> FSRM.  */
> >> >> > > +      if (CPU_FEATURES_CPU_P (cpu_features, FSRM))
> >> >> > > +
>  cpu_features->preferred[index_arch_Avoid_Short_Distance_REP_MOVSB]
> >> >> > > +       |= bit_arch_Avoid_Short_Distance_REP_MOVSB;
> >> >> >
> >> >> > OK.
> >> >> >
> >> >> > >      }
> >> >> > >    /* This spells out "AuthenticAMD" or "HygonGenuine".  */
> >> >> > >    else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx ==
> 0x69746e65)
> >> >> > > diff --git
> a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> >> >> > > index 133aab19f1..d7c93f00c5 100644
> >> >> > > ---
> a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> >> >> > > +++
> b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
> >> >> > > @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
> >> >> > >  BIT (MathVec_Prefer_No_AVX512)
> >> >> > >  BIT (Prefer_FSRM)
> >> >> > >  BIT (Prefer_AVX2_STRCMP)
> >> >> > > +BIT (Avoid_Short_Distance_REP_MOVSB)
> >> >> >
> >> >> > OK.
> >> >> >
> >> >> > > diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
> >> >> > > index 51c069bfe1..35cb90d507 100644
> >> >> > > --- a/sysdeps/x86/sysdep.h
> >> >> > > +++ b/sysdeps/x86/sysdep.h
> >> >> > > @@ -57,6 +57,9 @@ enum cf_protection_level
> >> >> > >  #define STATE_SAVE_MASK \
> >> >> > >    ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 <<
> 7))
> >> >> > >
> >> >> >
> >> >> > Suggest adding:
> >> >> >
> >> >> > /* Constants for bits in __x86_string_control:  */
> >> >> >
> >> >> > > +/* Avoid short distance REP MOVSB.  */
> >> >> > > +#define X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB    (1
> << 0)
> >> >> >
> >> >> > OK.
> >> >> >
> >> >> > > +
> >> >> > >  #ifdef       __ASSEMBLER__
> >> >> > >
> >> >> > >  /* Syntactic details of assembler.  */
> >> >> > > diff --git
> a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> >> >> > > index a783da5de2..9f02624375 100644
> >> >> > > --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> >> >> > > +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> >> >> > > @@ -325,12 +325,16 @@ L(movsb):
> >> >> > >       /* Avoid slow backward REP MOVSB.  */
> >> >> > >       jb      L(more_8x_vec_backward)
> >> >> > >  # if AVOID_SHORT_DISTANCE_REP_MOVSB
> >> >> > > +     andl
> $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB,
> __x86_string_control(%rip)
> >> >> > > +     jz      3f
> >> >> >
> >> >> > OK.
> >> >> >
> >> >> > >       movq    %rdi, %rcx
> >> >> > >       subq    %rsi, %rcx
> >> >> > >       jmp     2f
> >> >> > >  # endif
> >> >> > >  1:
> >> >> > >  # if AVOID_SHORT_DISTANCE_REP_MOVSB
> >> >> > > +     andl
> $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB,
> __x86_string_control(%rip)
> >> >> > > +     jz      3f
> >> >> >
> >> >> > OK.
> >> >> >
> >> >> > >       movq    %rsi, %rcx
> >> >> > >       subq    %rdi, %rcx
> >> >> > >  2:
> >> >> > > @@ -338,6 +342,7 @@ L(movsb):
> >> >> > >     is N*4GB + [1..63] with N >= 0.  */
> >> >> > >       cmpl    $63, %ecx
> >> >> > >       jbe     L(more_2x_vec)  /* Avoid "rep movsb" if ECX <=
> 63.  */
> >> >> > > +3:
> >> >> >
> >> >> > OK.
> >> >> >
> >> >> > >  # endif
> >> >> > >       mov     %RDX_LP, %RCX_LP
> >> >> > >       rep movsb
> >> >> > >
> >> >> >
> >> >> >
> >> >> > --
> >> >> > Cheers,
> >> >> > Carlos.
> >> >> >
> >> >>
> >> >>
> >> >> --
> >> >> H.J.
> >>
> >> Here is the v2 patch:
> >>
> >> 1.  Add a 25-byte NOP padding after JMP for
> Avoid_Short_Distance_REP_MOVSB,
> >> which improves bench-memcpy-random performance on Tiger Lake by ~30%
> >
> >
> > I think this may not be due to unrelated factors. I reran the random
> benchmarks with
> > the function on a fresh page and entry of the *_erms version at either +
> 0, 16, 32, 48
> > bytes and 1) don't see a universal improvement and 2) believe it's
> likely that the 30%
> > you measured is due to unrelated alignment changes.
> >
> > "__memcpy_avx_unaligned", "__memcpy_avx_unaligned_erms",
> "__memcpy_evex_unaligned", "__memcpy_evex_unaligned_erms"
> >
> > + 0 Entry Alignment
> > New: 91824.1, 95460.1, 95063.3, 97998.3
> > Old: 99973.7, 100127, 100370, 100049
> >
> > + 16 Entry Alignment
> > New: 129558, 129916, 122373, 124056
> > Old: 125361, 96475.4, 97457.8, 124319
> >
> > + 32 Entry Alignment
> > New: 95073.7, 92857.8, 90182.2, 92666.3
> > Old: 96702.1, 98558.9, 96797.1, 96887.1
> >
> > + 48 Entry Alignment
> > New: 135161, 134010, 123023, 148589
> > Old: 128150, 139029, 98382.2, 122686
> >
> > So the 32 byte/64 byte entry alignment versions seem to favor this
> change,
> > but when entry alignment % 16 != 0 this change seems to perform worse.
> >
> > I generally think until we understand why the byte padding is necessary
> its
> > probably a mistake to include it unless its a benefit in actual SPEC2017.
> >
> > My general intuition is that this is an issue with the benchmarks
> themselves
> > so I support the change w.o the padding.
>
> Here is the v3 patch without the padding.  OK for master?
>

Ok with this change going out.

But I think we need to find the root cause of this degradation before more
changes are made to the file. At the moment just about any change that adds
25 bytes before L(less_vec) will look like a major win.
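For example (a sketch only, not a proposed change), something as simple as

	.nops	25	/* any 25 bytes of padding */
L(less_vec):

would likely show a similar apparent win in bench-memcpy-random.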

>
> >
> >> 2. Update comments for __x86_string_control.
> >>
> >>
> >> --
> >> H.J.
>
>
>
> --
> H.J.
>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v3] x86-64: Add Avoid_Short_Distance_REP_MOVSB
  2021-07-27 19:50             ` Noah Goldstein via Libc-alpha
@ 2021-07-27 19:55               ` H.J. Lu via Libc-alpha
  2021-07-28 15:08               ` H.J. Lu via Libc-alpha
  1 sibling, 0 replies; 15+ messages in thread
From: H.J. Lu via Libc-alpha @ 2021-07-27 19:55 UTC (permalink / raw)
  To: Noah Goldstein, Dianhong; +Cc: GNU C Library

On Tue, Jul 27, 2021 at 12:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
>
>
> On Tue, Jul 27, 2021 at 3:23 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>>
>> On Tue, Jul 27, 2021 at 12:12 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>> >
>> >
>> >
>> > On Tue, Jul 27, 2021 at 12:06 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>> >>
>> >> On Mon, Jul 26, 2021 at 9:06 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>> >> >
>> >> >
>> >> >
>> >> > On Mon, Jul 26, 2021 at 11:11 PM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote:
>> >> >>
>> >> >> On Mon, Jul 26, 2021 at 7:15 PM Carlos O'Donell <carlos@redhat.com> wrote:
>> >> >> >
>> >> >> > On 7/26/21 8:00 AM, H.J. Lu via Libc-alpha wrote:
>> >> >> > > commit 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5
>> >> >> > > Author: H.J. Lu <hjl.tools@gmail.com>
>> >> >> > > Date:   Sat Jan 25 14:19:40 2020 -0800
>> >> >> > >
>> >> >> > >     x86-64: Avoid rep movsb with short distance [BZ #27130]
>> >> >> > > introduced some regressions on Intel processors without Fast Short REP
>> >> >> > > MOV (FSRM).  Add Avoid_Short_Distance_REP_MOVSB to avoid rep movsb with
>> >> >> > > short distance only on Intel processors with FSRM.  bench-memmove-large
>> >> >> > > on Skylake server shows that cycles of __memmove_evex_unaligned_erms are
>> >> >> > > improved for the following data size:
>> >> >> > >
>> >> >> > >                                   before    after    Improvement
>> >> >> > > length=4127, align1=3, align2=0:  479.38    343.00      28%
>> >> >> > > length=4223, align1=9, align2=5:  405.62    335.50      17%
>> >> >> > > length=8223, align1=3, align2=0:  786.12    495.00      37%
>> >> >> > > length=8319, align1=9, align2=5:  256.69    170.38      33%
>> >> >> > > length=16415, align1=3, align2=0: 1436.88   839.50      41%
>> >> >> > > length=16511, align1=9, align2=5: 1375.50   840.62      39%
>> >> >> > > length=32799, align1=3, align2=0: 2890.00   1850.62     36%
>> >> >> > > length=32895, align1=9, align2=5: 2891.38   1948.62     32%
>> >> >> > >
>> >> >> > > There are no regression on Ice Lake server.
>> >> >> >
>> >> >> > At this point we're waiting on Noah to provide feedback on the performance
>> >> >> > results given the alignment nop insertion you provided as a follow-up patch
>> >> >
>> >> >
>> >> > The results with the padding look good!
>> >> >
>> >> >>
>> >> >>
>> >> >> We are testing 25 byte nop padding now:
>> >> >>
>> >> >>
>> >> >> https://gitlab.com/x86-glibc/glibc/-/commit/de8985640a568786a59576716db54e0749d420e8
>> >> >>
>> >> > How did you come to the exact padding choice used?
>> >>
>> >> I first replaced the 9 byte instructions:
>> >>
>> >>         andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB,
>> >> __x86_string_control(%rip)
>> >>         jz      3f
>> >>
>> >> with a 9-byte NOP and reproduced the regression on Tiger Lake.  It confirmed
>> >> that the code layout caused the regression.    I first tried adding
>> >> ".p2align 4" to
>> >> branch targets and they made no differences.   Then I started adding different
>> >> size of nops after
>> >>
>> >>         andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB,
>> >> __x86_string_control(%rip)
>> >>         jz      3f
>> >>         movq    %rdi, %rcx
>> >>         subq    %rsi, %rcx
>> >>         jmp     2f
>> >>
>> >> with ".nops N".  I started with N == 1 and doubled N in each step.  I
>> >> noticed that
>> >> improvement started at N == 32.   I started bisecting between 16 and 32:
>> >>
>> >> 1. 24 and 32 are good.
>> >> 2. 24 and 28 are good.
>> >> 3. 25 is the best overall.
>> >>
>> >> >>
>> >> >> > (unless you can confirm this yourself).
>> >> >> >
>> >> >> > Looking forward to a v2 the incorporates the alignment fix (pending Noah's
>> >> >> > comments), and my suggestions below.
>> >> >>
>> >> >> >
>> >> >> > > ---
>> >> >> > >  sysdeps/x86/cacheinfo.h                                    | 7 +++++++
>> >> >> > >  sysdeps/x86/cpu-features.c                                 | 5 +++++
>> >> >> > >  .../x86/include/cpu-features-preferred_feature_index_1.def | 1 +
>> >> >> > >  sysdeps/x86/sysdep.h                                       | 3 +++
>> >> >> > >  sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S      | 5 +++++
>> >> >> > >  5 files changed, 21 insertions(+)
>> >> >> > >
>> >> >> > > diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
>> >> >> > > index eba8dbc4a6..174ea38f5b 100644
>> >> >> > > --- a/sysdeps/x86/cacheinfo.h
>> >> >> > > +++ b/sysdeps/x86/cacheinfo.h
>> >> >> > > @@ -49,6 +49,9 @@ long int __x86_rep_stosb_threshold attribute_hidden = 2048;
>> >> >> > >  /* Threshold to stop using Enhanced REP MOVSB.  */
>> >> >> > >  long int __x86_rep_movsb_stop_threshold attribute_hidden;
>> >> >> > >
>> >> >> > > +/* String/memory function control.  */
>> >> >> > > +int __x86_string_control attribute_hidden;
>> >> >> >
>> >> >> > Please expand comment.
>> >> >> >
>> >> >> > Suggest:
>> >> >> >
>> >> >> > /* A bit-wise OR of string/memory requirements for optimal performance
>> >> >> >    e.g. X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB.  These bits
>> >> >> >    are used at runtime to tune implementation behavior.  */
>> >> >> > int __x86_string_control attribute_hidden;
>> >> >>
>> >> >> I will fix it in the v2 patch.
>> >> >>
>> >> >> Thanks.
>> >> >>
>> >> >> > > +
>> >> >> > >  static void
>> >> >> > >  init_cacheinfo (void)
>> >> >> > >  {
>> >> >> > > @@ -71,5 +74,9 @@ init_cacheinfo (void)
>> >> >> > >    __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
>> >> >> > >    __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
>> >> >> > >    __x86_rep_movsb_stop_threshold =  cpu_features->rep_movsb_stop_threshold;
>> >> >> > > +
>> >> >> > > +  if (CPU_FEATURES_ARCH_P (cpu_features, Avoid_Short_Distance_REP_MOVSB))
>> >> >> > > +    __x86_string_control
>> >> >> > > +      |= X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB;
>> >> >> >
>> >> >> > OK.
>> >> >> >
>> >> >> > >  }
>> >> >> > >  #endif
>> >> >> > > diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
>> >> >> > > index 706a172ba9..645bba6314 100644
>> >> >> > > --- a/sysdeps/x86/cpu-features.c
>> >> >> > > +++ b/sysdeps/x86/cpu-features.c
>> >> >> > > @@ -555,6 +555,11 @@ init_cpu_features (struct cpu_features *cpu_features)
>> >> >> > >           cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
>> >> >> > >             |= bit_arch_Prefer_AVX2_STRCMP;
>> >> >> > >       }
>> >> >> > > +
>> >> >> > > +      /* Avoid avoid short distance REP MOVSB on processor with FSRM.  */
>> >> >> > > +      if (CPU_FEATURES_CPU_P (cpu_features, FSRM))
>> >> >> > > +     cpu_features->preferred[index_arch_Avoid_Short_Distance_REP_MOVSB]
>> >> >> > > +       |= bit_arch_Avoid_Short_Distance_REP_MOVSB;
>> >> >> >
>> >> >> > OK.
>> >> >> >
>> >> >> > >      }
>> >> >> > >    /* This spells out "AuthenticAMD" or "HygonGenuine".  */
>> >> >> > >    else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
>> >> >> > > diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
>> >> >> > > index 133aab19f1..d7c93f00c5 100644
>> >> >> > > --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
>> >> >> > > +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
>> >> >> > > @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
>> >> >> > >  BIT (MathVec_Prefer_No_AVX512)
>> >> >> > >  BIT (Prefer_FSRM)
>> >> >> > >  BIT (Prefer_AVX2_STRCMP)
>> >> >> > > +BIT (Avoid_Short_Distance_REP_MOVSB)
>> >> >> >
>> >> >> > OK.
>> >> >> >
>> >> >> > > diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
>> >> >> > > index 51c069bfe1..35cb90d507 100644
>> >> >> > > --- a/sysdeps/x86/sysdep.h
>> >> >> > > +++ b/sysdeps/x86/sysdep.h
>> >> >> > > @@ -57,6 +57,9 @@ enum cf_protection_level
>> >> >> > >  #define STATE_SAVE_MASK \
>> >> >> > >    ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))
>> >> >> > >
>> >> >> >
>> >> >> > Suggest adding:
>> >> >> >
>> >> >> > /* Constants for bits in __x86_string_control:  */
>> >> >> >
>> >> >> > > +/* Avoid short distance REP MOVSB.  */
>> >> >> > > +#define X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB    (1 << 0)
>> >> >> >
>> >> >> > OK.
>> >> >> >
>> >> >> > > +
>> >> >> > >  #ifdef       __ASSEMBLER__
>> >> >> > >
>> >> >> > >  /* Syntactic details of assembler.  */
>> >> >> > > diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
>> >> >> > > index a783da5de2..9f02624375 100644
>> >> >> > > --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
>> >> >> > > +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
>> >> >> > > @@ -325,12 +325,16 @@ L(movsb):
>> >> >> > >       /* Avoid slow backward REP MOVSB.  */
>> >> >> > >       jb      L(more_8x_vec_backward)
>> >> >> > >  # if AVOID_SHORT_DISTANCE_REP_MOVSB
>> >> >> > > +     andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
>> >> >> > > +     jz      3f
>> >> >> >
>> >> >> > OK.
>> >> >> >
>> >> >> > >       movq    %rdi, %rcx
>> >> >> > >       subq    %rsi, %rcx
>> >> >> > >       jmp     2f
>> >> >> > >  # endif
>> >> >> > >  1:
>> >> >> > >  # if AVOID_SHORT_DISTANCE_REP_MOVSB
>> >> >> > > +     andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
>> >> >> > > +     jz      3f
>> >> >> >
>> >> >> > OK.
>> >> >> >
>> >> >> > >       movq    %rsi, %rcx
>> >> >> > >       subq    %rdi, %rcx
>> >> >> > >  2:
>> >> >> > > @@ -338,6 +342,7 @@ L(movsb):
>> >> >> > >     is N*4GB + [1..63] with N >= 0.  */
>> >> >> > >       cmpl    $63, %ecx
>> >> >> > >       jbe     L(more_2x_vec)  /* Avoid "rep movsb" if ECX <= 63.  */
>> >> >> > > +3:
>> >> >> >
>> >> >> > OK.
>> >> >> >
>> >> >> > >  # endif
>> >> >> > >       mov     %RDX_LP, %RCX_LP
>> >> >> > >       rep movsb
>> >> >> > >
>> >> >> >
>> >> >> >
>> >> >> > --
>> >> >> > Cheers,
>> >> >> > Carlos.
>> >> >> >
>> >> >>
>> >> >>
>> >> >> --
>> >> >> H.J.
>> >>
>> >> Here is the v2 patch:
>> >>
>> >> 1.  Add a 25-byte NOP padding after JMP for Avoid_Short_Distance_REP_MOVSB,
>> >> which improves bench-memcpy-random performance on Tiger Lake by ~30%
>> >
>> >
>> > I think this may not be due to unrelated factors. I reran the random benchmarks with
>> > the function on a fresh page and entry of the *_erms version at either + 0, 16, 32, 48
>> > bytes and 1) don't see a universal improvement and 2) believe it's likely that the 30%
>> > you measured is due to unrelated alignment changes.
>> >
>> > "__memcpy_avx_unaligned", "__memcpy_avx_unaligned_erms", "__memcpy_evex_unaligned", "__memcpy_evex_unaligned_erms"
>> >
>> > + 0 Entry Alignment
>> > New: 91824.1, 95460.1, 95063.3, 97998.3
>> > Old: 99973.7, 100127, 100370, 100049
>> >
>> > + 16 Entry Alignment
>> > New: 129558, 129916, 122373, 124056
>> > Old: 125361, 96475.4, 97457.8, 124319
>> >
>> > + 32 Entry Alignment
>> > New: 95073.7, 92857.8, 90182.2, 92666.3
>> > Old: 96702.1, 98558.9, 96797.1, 96887.1
>> >
>> > + 48 Entry Alignment
>> > New: 135161, 134010, 123023, 148589
>> > Old: 128150, 139029, 98382.2, 122686
>> >
>> > So the 32 byte/64 byte entry alignment versions seem to favor this change,
>> > but when entry alignment % 16 != 0 this change seems to perform worse.
>> >
>> > I generally think until we understand why the byte padding is necessary its
>> > probably a mistake to include it unless its a benefit in actual SPEC2017.
>> >
>> > My general intuition is that this is an issue with the benchmarks themselves
>> > so I support the change w.o the padding.
>>
>> Here is the v3 patch without the padding.  OK for master?
>
>
> Ok with this change going out.
>
> But I think we need to find the root cause of this degradation before more
> changes are made to the file. At the moment just about any change that adds
> 25 bytes before L(less_vec) will look like a major win.

Dianhong, please run vtune to investigate why a 25-byte padding makes such
a difference.  Thanks.

>>
>>
>> >
>> >> 2. Update comments for __x86_string_control.
>> >>
>> >>
>> >> --
>> >> H.J.
>>
>>
>>
>> --
>> H.J.



-- 
H.J.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v3] x86-64: Add Avoid_Short_Distance_REP_MOVSB
  2021-07-27 19:50             ` Noah Goldstein via Libc-alpha
  2021-07-27 19:55               ` H.J. Lu via Libc-alpha
@ 2021-07-28 15:08               ` H.J. Lu via Libc-alpha
  1 sibling, 0 replies; 15+ messages in thread
From: H.J. Lu via Libc-alpha @ 2021-07-28 15:08 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library

On Tue, Jul 27, 2021 at 12:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
>
>
> On Tue, Jul 27, 2021 at 3:23 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>>
>> On Tue, Jul 27, 2021 at 12:12 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>> >
>> >
>> >
>> > On Tue, Jul 27, 2021 at 12:06 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>> >>
>> >> On Mon, Jul 26, 2021 at 9:06 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>> >> >
>> >> >
>> >> >
>> >> > On Mon, Jul 26, 2021 at 11:11 PM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote:
>> >> >>
>> >> >> On Mon, Jul 26, 2021 at 7:15 PM Carlos O'Donell <carlos@redhat.com> wrote:
>> >> >> >
>> >> >> > On 7/26/21 8:00 AM, H.J. Lu via Libc-alpha wrote:
>> >> >> > > commit 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5
>> >> >> > > Author: H.J. Lu <hjl.tools@gmail.com>
>> >> >> > > Date:   Sat Jan 25 14:19:40 2020 -0800
>> >> >> > >
>> >> >> > >     x86-64: Avoid rep movsb with short distance [BZ #27130]
>> >> >> > > introduced some regressions on Intel processors without Fast Short REP
>> >> >> > > MOV (FSRM).  Add Avoid_Short_Distance_REP_MOVSB to avoid rep movsb with
>> >> >> > > short distance only on Intel processors with FSRM.  bench-memmove-large
>> >> >> > > on Skylake server shows that cycles of __memmove_evex_unaligned_erms are
>> >> >> > > improved for the following data size:
>> >> >> > >
>> >> >> > >                                   before    after    Improvement
>> >> >> > > length=4127, align1=3, align2=0:  479.38    343.00      28%
>> >> >> > > length=4223, align1=9, align2=5:  405.62    335.50      17%
>> >> >> > > length=8223, align1=3, align2=0:  786.12    495.00      37%
>> >> >> > > length=8319, align1=9, align2=5:  256.69    170.38      33%
>> >> >> > > length=16415, align1=3, align2=0: 1436.88   839.50      41%
>> >> >> > > length=16511, align1=9, align2=5: 1375.50   840.62      39%
>> >> >> > > length=32799, align1=3, align2=0: 2890.00   1850.62     36%
>> >> >> > > length=32895, align1=9, align2=5: 2891.38   1948.62     32%
>> >> >> > >
>> >> >> > > There are no regression on Ice Lake server.
>> >> >> >
>> >> >> > At this point we're waiting on Noah to provide feedback on the performance
>> >> >> > results given the alignment nop insertion you provided as a follow-up patch
>> >> >
>> >> >
>> >> > The results with the padding look good!
>> >> >
>> >> >>
>> >> >>
>> >> >> We are testing 25 byte nop padding now:
>> >> >>
>> >> >>
>> >> >> https://gitlab.com/x86-glibc/glibc/-/commit/de8985640a568786a59576716db54e0749d420e8
>> >> >>
>> >> > How did you come to the exact padding choice used?
>> >>
>> >> I first replaced the 9 byte instructions:
>> >>
>> >>         andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB,
>> >> __x86_string_control(%rip)
>> >>         jz      3f
>> >>
>> >> with a 9-byte NOP and reproduced the regression on Tiger Lake.  It confirmed
>> >> that the code layout caused the regression.    I first tried adding
>> >> ".p2align 4" to
>> >> branch targets and they made no differences.   Then I started adding different
>> >> size of nops after
>> >>
>> >>         andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB,
>> >> __x86_string_control(%rip)
>> >>         jz      3f
>> >>         movq    %rdi, %rcx
>> >>         subq    %rsi, %rcx
>> >>         jmp     2f
>> >>
>> >> with ".nops N".  I started with N == 1 and doubled N in each step.  I
>> >> noticed that
>> >> improvement started at N == 32.   I started bisecting between 16 and 32:
>> >>
>> >> 1. 24 and 32 are good.
>> >> 2. 24 and 28 are good.
>> >> 3. 25 is the best overall.
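
For reference, a minimal sketch of where the experimental padding described
above would sit relative to the quoted sequence in
memmove-vec-unaligned-erms.S.  This only illustrates the experiment; it is
not the committed change (the exact placement is in the gitlab commit linked
earlier):

# if AVOID_SHORT_DISTANCE_REP_MOVSB
	andl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
	jz	3f
	movq	%rdi, %rcx
	subq	%rsi, %rcx
	jmp	2f
	/* Experimental padding found by bisection: 25 bytes of NOPs,
	   emitted by the assembler as multi-byte NOP instructions.  */
	.nops	25
# endif
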
>> >>
>> >> >>
>> >> >> > (unless you can confirm this yourself).
>> >> >> >
>> >> >> > Looking forward to a v2 the incorporates the alignment fix (pending Noah's
>> >> >> > comments), and my suggestions below.
>> >> >>
>> >> >> >
>> >> >> > > ---
>> >> >> > >  sysdeps/x86/cacheinfo.h                                    | 7 +++++++
>> >> >> > >  sysdeps/x86/cpu-features.c                                 | 5 +++++
>> >> >> > >  .../x86/include/cpu-features-preferred_feature_index_1.def | 1 +
>> >> >> > >  sysdeps/x86/sysdep.h                                       | 3 +++
>> >> >> > >  sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S      | 5 +++++
>> >> >> > >  5 files changed, 21 insertions(+)
>> >> >> > >
>> >> >> > > diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
>> >> >> > > index eba8dbc4a6..174ea38f5b 100644
>> >> >> > > --- a/sysdeps/x86/cacheinfo.h
>> >> >> > > +++ b/sysdeps/x86/cacheinfo.h
>> >> >> > > @@ -49,6 +49,9 @@ long int __x86_rep_stosb_threshold attribute_hidden = 2048;
>> >> >> > >  /* Threshold to stop using Enhanced REP MOVSB.  */
>> >> >> > >  long int __x86_rep_movsb_stop_threshold attribute_hidden;
>> >> >> > >
>> >> >> > > +/* String/memory function control.  */
>> >> >> > > +int __x86_string_control attribute_hidden;
>> >> >> >
>> >> >> > Please expand comment.
>> >> >> >
>> >> >> > Suggest:
>> >> >> >
>> >> >> > /* A bit-wise OR of string/memory requirements for optimal performance
>> >> >> >    e.g. X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB.  These bits
>> >> >> >    are used at runtime to tune implementation behavior.  */
>> >> >> > int __x86_string_control attribute_hidden;
>> >> >>
>> >> >> I will fix it in the v2 patch.
>> >> >>
>> >> >> Thanks.
>> >> >>
>> >> >> > > +
>> >> >> > >  static void
>> >> >> > >  init_cacheinfo (void)
>> >> >> > >  {
>> >> >> > > @@ -71,5 +74,9 @@ init_cacheinfo (void)
>> >> >> > >    __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
>> >> >> > >    __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
>> >> >> > >    __x86_rep_movsb_stop_threshold =  cpu_features->rep_movsb_stop_threshold;
>> >> >> > > +
>> >> >> > > +  if (CPU_FEATURES_ARCH_P (cpu_features, Avoid_Short_Distance_REP_MOVSB))
>> >> >> > > +    __x86_string_control
>> >> >> > > +      |= X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB;
>> >> >> >
>> >> >> > OK.
>> >> >> >
>> >> >> > >  }
>> >> >> > >  #endif
>> >> >> > > diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
>> >> >> > > index 706a172ba9..645bba6314 100644
>> >> >> > > --- a/sysdeps/x86/cpu-features.c
>> >> >> > > +++ b/sysdeps/x86/cpu-features.c
>> >> >> > > @@ -555,6 +555,11 @@ init_cpu_features (struct cpu_features *cpu_features)
>> >> >> > >           cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
>> >> >> > >             |= bit_arch_Prefer_AVX2_STRCMP;
>> >> >> > >       }
>> >> >> > > +
>> >> >> > > +      /* Avoid short distance REP MOVSB on processors with FSRM.  */
>> >> >> > > +      if (CPU_FEATURES_CPU_P (cpu_features, FSRM))
>> >> >> > > +     cpu_features->preferred[index_arch_Avoid_Short_Distance_REP_MOVSB]
>> >> >> > > +       |= bit_arch_Avoid_Short_Distance_REP_MOVSB;
>> >> >> >
>> >> >> > OK.
>> >> >> >
>> >> >> > >      }
>> >> >> > >    /* This spells out "AuthenticAMD" or "HygonGenuine".  */
>> >> >> > >    else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
>> >> >> > > diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
>> >> >> > > index 133aab19f1..d7c93f00c5 100644
>> >> >> > > --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
>> >> >> > > +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
>> >> >> > > @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
>> >> >> > >  BIT (MathVec_Prefer_No_AVX512)
>> >> >> > >  BIT (Prefer_FSRM)
>> >> >> > >  BIT (Prefer_AVX2_STRCMP)
>> >> >> > > +BIT (Avoid_Short_Distance_REP_MOVSB)
>> >> >> >
>> >> >> > OK.
>> >> >> >
>> >> >> > > diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
>> >> >> > > index 51c069bfe1..35cb90d507 100644
>> >> >> > > --- a/sysdeps/x86/sysdep.h
>> >> >> > > +++ b/sysdeps/x86/sysdep.h
>> >> >> > > @@ -57,6 +57,9 @@ enum cf_protection_level
>> >> >> > >  #define STATE_SAVE_MASK \
>> >> >> > >    ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))
>> >> >> > >
>> >> >> >
>> >> >> > Suggest adding:
>> >> >> >
>> >> >> > /* Constants for bits in __x86_string_control:  */
>> >> >> >
>> >> >> > > +/* Avoid short distance REP MOVSB.  */
>> >> >> > > +#define X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB    (1 << 0)
>> >> >> >
>> >> >> > OK.
>> >> >> >
>> >> >> > > +
>> >> >> > >  #ifdef       __ASSEMBLER__
>> >> >> > >
>> >> >> > >  /* Syntactic details of assembler.  */
>> >> >> > > diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
>> >> >> > > index a783da5de2..9f02624375 100644
>> >> >> > > --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
>> >> >> > > +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
>> >> >> > > @@ -325,12 +325,16 @@ L(movsb):
>> >> >> > >       /* Avoid slow backward REP MOVSB.  */
>> >> >> > >       jb      L(more_8x_vec_backward)
>> >> >> > >  # if AVOID_SHORT_DISTANCE_REP_MOVSB
>> >> >> > > +     andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
>> >> >> > > +     jz      3f
>> >> >> >
>> >> >> > OK.
>> >> >> >
>> >> >> > >       movq    %rdi, %rcx
>> >> >> > >       subq    %rsi, %rcx
>> >> >> > >       jmp     2f
>> >> >> > >  # endif
>> >> >> > >  1:
>> >> >> > >  # if AVOID_SHORT_DISTANCE_REP_MOVSB
>> >> >> > > +     andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
>> >> >> > > +     jz      3f
>> >> >> >
>> >> >> > OK.
>> >> >> >
>> >> >> > >       movq    %rsi, %rcx
>> >> >> > >       subq    %rdi, %rcx
>> >> >> > >  2:
>> >> >> > > @@ -338,6 +342,7 @@ L(movsb):
>> >> >> > >     is N*4GB + [1..63] with N >= 0.  */
>> >> >> > >       cmpl    $63, %ecx
>> >> >> > >       jbe     L(more_2x_vec)  /* Avoid "rep movsb" if ECX <= 63.  */
>> >> >> > > +3:
>> >> >> >
>> >> >> > OK.
>> >> >> >
>> >> >> > >  # endif
>> >> >> > >       mov     %RDX_LP, %RCX_LP
>> >> >> > >       rep movsb
>> >> >> > >
>> >> >> >
>> >> >> >
>> >> >> > --
>> >> >> > Cheers,
>> >> >> > Carlos.
>> >> >> >
>> >> >>
>> >> >>
>> >> >> --
>> >> >> H.J.
>> >>
>> >> Here is the v2 patch:
>> >>
>> >> 1.  Add a 25-byte NOP padding after JMP for Avoid_Short_Distance_REP_MOVSB,
>> >> which improves bench-memcpy-random performance on Tiger Lake by ~30%.
>> >
>> >
>> > I think this improvement may be due to unrelated factors. I reran the random benchmarks with
>> > the function on a fresh page and entry of the *_erms version at either + 0, 16, 32, 48
>> > bytes and 1) don't see a universal improvement and 2) believe it's likely that the 30%
>> > you measured is due to unrelated alignment changes.
>> >
>> > "__memcpy_avx_unaligned", "__memcpy_avx_unaligned_erms", "__memcpy_evex_unaligned", "__memcpy_evex_unaligned_erms"
>> >
>> > + 0 Entry Alignment
>> > New: 91824.1, 95460.1, 95063.3, 97998.3
>> > Old: 99973.7, 100127, 100370, 100049
>> >
>> > + 16 Entry Alignment
>> > New: 129558, 129916, 122373, 124056
>> > Old: 125361, 96475.4, 97457.8, 124319
>> >
>> > + 32 Entry Alignment
>> > New: 95073.7, 92857.8, 90182.2, 92666.3
>> > Old: 96702.1, 98558.9, 96797.1, 96887.1
>> >
>> > + 48 Entry Alignment
>> > New: 135161, 134010, 123023, 148589
>> > Old: 128150, 139029, 98382.2, 122686
>> >
>> > So the 32 byte/64 byte entry alignment versions seem to favor this change,
>> > but when entry alignment % 32 != 0 this change seems to perform worse.
>> >
>> > I generally think that until we understand why the byte padding is necessary it's
>> > probably a mistake to include it unless it's a benefit in actual SPEC2017.
>> >
>> > My general intuition is that this is an issue with the benchmarks themselves,
>> > so I support the change without the padding.
>>
>> Here is the v3 patch without the padding.  OK for master?
>
>
> Ok with this change going out.

I am going to check this version later today.

Thanks.

> But I think we need to find the root cause of this degradation before more
> changes are made to the file. At the moment just about any change that adds
> 25 bytes before L(less_vec) will look like a major win.
>>
>>
>> >
>> >> 2. Update comments for __x86_string_control.
>> >>
>> >>
>> >> --
>> >> H.J.
>>
>>
>>
>> --
>> H.J.



-- 
H.J.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86-64: Add Avoid_Short_Distance_REP_MOVSB
  2021-07-26 12:00 [PATCH] x86-64: Add Avoid_Short_Distance_REP_MOVSB H.J. Lu via Libc-alpha
  2021-07-26 17:20 ` Noah Goldstein via Libc-alpha
  2021-07-27  2:15 ` Carlos O'Donell via Libc-alpha
@ 2021-08-28  0:27 ` Alexey Tourbin via Libc-alpha
  2021-08-28  2:57   ` Noah Goldstein via Libc-alpha
  2 siblings, 1 reply; 15+ messages in thread
From: Alexey Tourbin via Libc-alpha @ 2021-08-28  0:27 UTC (permalink / raw)
  Cc: libc-alpha

On Mon, Jul 26, 2021 at 3:03 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
> --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> @@ -325,12 +325,16 @@ L(movsb):
>         /* Avoid slow backward REP MOVSB.  */
>         jb      L(more_8x_vec_backward)
>  # if AVOID_SHORT_DISTANCE_REP_MOVSB
> +       andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> +       jz      3f
>         movq    %rdi, %rcx
>         subq    %rsi, %rcx
>         jmp     2f
>  # endif
>  1:
>  # if AVOID_SHORT_DISTANCE_REP_MOVSB
> +       andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
> +       jz      3f
>         movq    %rsi, %rcx
>         subq    %rdi, %rcx
>  2:

Why "andl" rather than "testl"?

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86-64: Add Avoid_Short_Distance_REP_MOVSB
  2021-08-28  0:27 ` [PATCH] " Alexey Tourbin via Libc-alpha
@ 2021-08-28  2:57   ` Noah Goldstein via Libc-alpha
  2022-04-28  0:15     ` Sunil Pandey via Libc-alpha
  0 siblings, 1 reply; 15+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2021-08-28  2:57 UTC (permalink / raw)
  To: Alexey Tourbin; +Cc: GNU C Library

On Fri, Aug 27, 2021 at 8:28 PM Alexey Tourbin via Libc-alpha <
libc-alpha@sourceware.org> wrote:

> On Mon, Jul 26, 2021 at 3:03 PM H.J. Lu via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> > --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > @@ -325,12 +325,16 @@ L(movsb):
> >         /* Avoid slow backward REP MOVSB.  */
> >         jb      L(more_8x_vec_backward)
> >  # if AVOID_SHORT_DISTANCE_REP_MOVSB
> > +       andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB,
> __x86_string_control(%rip)
> > +       jz      3f
> >         movq    %rdi, %rcx
> >         subq    %rsi, %rcx
> >         jmp     2f
> >  # endif
> >  1:
> >  # if AVOID_SHORT_DISTANCE_REP_MOVSB
> > +       andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB,
> __x86_string_control(%rip)
> > +       jz      3f
> >         movq    %rsi, %rcx
> >         subq    %rdi, %rcx
> >  2:
>
> Why "andl" rather than "testl"?
>

+1. I missed that before.

My patch [PATCH 5/5] X86-64: Optimize memmove-vec-unaligned-erms.S

uses `testl` instead.  Although my patch might not be checked in (and it may
be a while) as we are still looking into the causes of the slowdown.
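
For illustration, a sketch of the `testl` form being suggested here; unlike
`andl` with a memory destination, `testl` only sets the flags and does not
write the AND result back to __x86_string_control:

# if AVOID_SHORT_DISTANCE_REP_MOVSB
	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
	jz	3f
	movq	%rdi, %rcx
	subq	%rsi, %rcx
	jmp	2f
# endif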

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86-64: Add Avoid_Short_Distance_REP_MOVSB
  2021-08-28  2:57   ` Noah Goldstein via Libc-alpha
@ 2022-04-28  0:15     ` Sunil Pandey via Libc-alpha
  0 siblings, 0 replies; 15+ messages in thread
From: Sunil Pandey via Libc-alpha @ 2022-04-28  0:15 UTC (permalink / raw)
  To: Noah Goldstein, libc-stable; +Cc: GNU C Library

On Fri, Aug 27, 2021 at 7:58 PM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Fri, Aug 27, 2021 at 8:28 PM Alexey Tourbin via Libc-alpha <
> libc-alpha@sourceware.org> wrote:
>
> > On Mon, Jul 26, 2021 at 3:03 PM H.J. Lu via Libc-alpha
> > <libc-alpha@sourceware.org> wrote:
> > > --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > > +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > > @@ -325,12 +325,16 @@ L(movsb):
> > >         /* Avoid slow backward REP MOVSB.  */
> > >         jb      L(more_8x_vec_backward)
> > >  # if AVOID_SHORT_DISTANCE_REP_MOVSB
> > > +       andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB,
> > __x86_string_control(%rip)
> > > +       jz      3f
> > >         movq    %rdi, %rcx
> > >         subq    %rsi, %rcx
> > >         jmp     2f
> > >  # endif
> > >  1:
> > >  # if AVOID_SHORT_DISTANCE_REP_MOVSB
> > > +       andl    $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB,
> > __x86_string_control(%rip)
> > > +       jz      3f
> > >         movq    %rsi, %rcx
> > >         subq    %rdi, %rcx
> > >  2:
> >
> > Why "andl" rather than "testl"?
> >
>
> +1. I missed that before.
>
> My patch [PATCH 5/5] X86-64: Optimize memmove-vec-unaligned-erms.S
>
> uses `testl` instead.  Although my patch might not be checked in (and it may
> be a while) as we are still looking into the causes of the slowdown.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2022-04-28  0:16 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-26 12:00 [PATCH] x86-64: Add Avoid_Short_Distance_REP_MOVSB H.J. Lu via Libc-alpha
2021-07-26 17:20 ` Noah Goldstein via Libc-alpha
2021-07-26 18:50   ` H.J. Lu via Libc-alpha
2021-07-27  2:15 ` Carlos O'Donell via Libc-alpha
2021-07-27  3:11   ` H.J. Lu via Libc-alpha
2021-07-27  4:05     ` Noah Goldstein via Libc-alpha
2021-07-27 16:05       ` [PATCH v2] " H.J. Lu via Libc-alpha
2021-07-27 19:12         ` Noah Goldstein via Libc-alpha
2021-07-27 19:22           ` [PATCH v3] " H.J. Lu via Libc-alpha
2021-07-27 19:50             ` Noah Goldstein via Libc-alpha
2021-07-27 19:55               ` H.J. Lu via Libc-alpha
2021-07-28 15:08               ` H.J. Lu via Libc-alpha
2021-08-28  0:27 ` [PATCH] " Alexey Tourbin via Libc-alpha
2021-08-28  2:57   ` Noah Goldstein via Libc-alpha
2022-04-28  0:15     ` Sunil Pandey via Libc-alpha

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).