unofficial mirror of libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* [PATCH 2/5] benchtests: Add new random cases to bench-memcpy-random.c
  2021-08-24  8:27 [PATCH 1/5] string: Make tests bidirectional test-memcpy.c Noah Goldstein via Libc-alpha
@ 2021-08-24  8:27 ` Noah Goldstein via Libc-alpha
  2021-08-24 15:18   ` H.J. Lu via Libc-alpha
  0 siblings, 1 reply; 5+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2021-08-24  8:27 UTC (permalink / raw)
  To: libc-alpha

This commit adds three new benchmarks based on the SPEC2017
distribution. One randomizes whether dst > src and the other two set
it to either 1 or 0.

Also add some tests for fixed sizes with randomized alignment and
value of dst > src. This can be useful for testing different alignment
configurations.
---
 benchtests/bench-memcpy-random.c | 107 +++++++++++++++++++++++++++----
 1 file changed, 96 insertions(+), 11 deletions(-)

diff --git a/benchtests/bench-memcpy-random.c b/benchtests/bench-memcpy-random.c
index c490b73ed0..28e0acb05f 100644
--- a/benchtests/bench-memcpy-random.c
+++ b/benchtests/bench-memcpy-random.c
@@ -16,7 +16,8 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#define MIN_PAGE_SIZE (512*1024+getpagesize())
+#define MAX_TEST_SIZE (512*1024)
+#define MIN_PAGE_SIZE (3*MAX_TEST_SIZE+3*getpagesize())
 #define TEST_MAIN
 #define TEST_NAME "memcpy"
 #include "bench-string.h"
@@ -89,9 +90,12 @@ static align_data_t dst_align_freq[] =
 
 typedef struct
 {
-  uint64_t src : 24;
-  uint64_t dst : 24;
-  uint64_t len : 16;
+/* 26 bits for src and dst so we have extra bit for alternating dst >
+   src without a branch.  */
+  uint64_t src : 26;
+  uint64_t dst : 26;
+  /* For size < 4096 12 bits is enough.  */
+  uint64_t len : 12;
 } copy_t;
 
 static copy_t copy[MAX_COPIES];
@@ -142,34 +146,100 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src,
 }
 
 static void
-do_test (json_ctx_t *json_ctx, size_t max_size)
+do_one_fixed_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src,
+               copy_t *copy, size_t n, size_t size)
 {
-  int i;
+  timing_t start, stop, cur;
+  size_t iters = INNER_LOOP_ITERS_SMALL;
 
-  memset (buf1, 1, max_size);
+  for (int j = 0; j < n; j++)
+    CALL (impl, dst + copy[j].dst, src + copy[j].src, size);
 
-  /* Create a random set of copies with the given size and alignment
+  TIMING_NOW (start);
+  for (int i = 0; i < iters; ++i)
+    for (int j = 0; j < n; j++)
+      CALL (impl, dst + copy[j].dst, src + copy[j].src, size);
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  json_element_double (json_ctx, (double) cur / (double) iters);
+}
+
+
+static size_t
+init_copy(size_t max_size, int dst_gt_src)
+{
+  size_t i, dst_offset, src_offset;
+  if (dst_gt_src <= 0)
+    {
+      dst_offset = 0;
+      src_offset = MAX_TEST_SIZE + getpagesize();
+    }
+  else
+    {
+      dst_offset = MAX_TEST_SIZE + getpagesize();
+      src_offset = 0;
+    }
+
+    /* Create a random set of copies with the given size and alignment
      distributions.  */
   for (i = 0; i < MAX_COPIES; i++)
     {
+      dst_offset  = dst_gt_src == -1
+                        ? (rand() & 1) ? MAX_TEST_SIZE + getpagesize() : 0
+                        : dst_offset;
       copy[i].dst = (rand () & (max_size - 1));
       copy[i].dst &= ~dst_align_arr[rand () & ALIGN_MASK];
+      copy[i].dst += dst_offset;
       copy[i].src = (rand () & (max_size - 1));
       copy[i].src &= ~src_align_arr[rand () & ALIGN_MASK];
+      copy[i].src += src_offset;
       copy[i].len = size_arr[rand () & SIZE_MASK];
     }
+  return i;
+}
 
+static void
+do_test (json_ctx_t *json_ctx, size_t max_size, int dst_gt_src)
+{
+  size_t n;
+  memset (buf1, 1, max_size);
+  n = init_copy(max_size, dst_gt_src);
   json_element_object_begin (json_ctx);
-  json_attr_uint (json_ctx, "length", (double) max_size);
+  json_attr_uint (json_ctx, "max-alignment", (double) max_size);
+  json_attr_int (json_ctx, "dst > src", (double) dst_gt_src);
+  json_attr_uint (json_ctx, "with-fixed-size", (double) 0);
   json_array_begin (json_ctx, "timings");
 
   FOR_EACH_IMPL (impl, 0)
-    do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, copy, i);
+    do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, copy, n);
 
   json_array_end (json_ctx);
   json_element_object_end (json_ctx);
 }
 
+static void
+do_test_fixed_size (json_ctx_t *json_ctx, size_t size, size_t max_size, int dst_gt_src)
+{
+  size_t n;
+  memset (buf1, 1, max_size);
+  n = init_copy(max_size, dst_gt_src);
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "max-alignment", (double) max_size);
+  json_attr_int (json_ctx, "dst > src", (double) dst_gt_src);
+  json_attr_uint (json_ctx, "with-fixed-size", (double) 1);
+  json_attr_uint (json_ctx, "size", (double) size);
+  json_array_begin (json_ctx, "timings");
+
+  FOR_EACH_IMPL (impl, 0)
+    do_one_fixed_test (json_ctx, impl, (char *) buf2, (char *) buf1, copy, n, size);
+
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
+}
+
+
 int
 test_main (void)
 {
@@ -194,7 +264,22 @@ test_main (void)
 
   json_array_begin (&json_ctx, "results");
   for (int i = 4; i <= 512; i = i * 2)
-    do_test (&json_ctx, i * 1024);
+    {
+      if (i * 1024 > MAX_TEST_SIZE)
+          continue;
+      do_test (&json_ctx, i * 1024, 0);
+      do_test (&json_ctx, i * 1024, 1);
+      do_test (&json_ctx, i * 1024, -1);
+    }
+
+  for (int i = 4; i <= 64; i = i * 2)
+    {
+      if (i * 1024 > MAX_TEST_SIZE)
+          continue;
+      do_test_fixed_size (&json_ctx, i * 256, i * 1024, 0);
+      do_test_fixed_size (&json_ctx, i * 256, i * 1024, 1);
+      do_test_fixed_size (&json_ctx, i * 256, i * 1024, -1);
+    }
 
   json_array_end (&json_ctx);
   json_attr_object_end (&json_ctx);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH 2/5] benchtests: Add new random cases to bench-memcpy-random.c
  2021-08-24  8:27 ` [PATCH 2/5] benchtests: Add new random cases to bench-memcpy-random.c Noah Goldstein via Libc-alpha
@ 2021-08-24 15:18   ` H.J. Lu via Libc-alpha
  0 siblings, 0 replies; 5+ messages in thread
From: H.J. Lu via Libc-alpha @ 2021-08-24 15:18 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library

On Tue, Aug 24, 2021 at 1:28 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit adds three new benchmarks for the SPEC2017
> distribution. One randomized if dst > src and the other two set it
> either 1/0.
>
> As well add some tests for fixed sizes with randomize alignment and
> value of dst > src. This can be useful for testing different alignment
> configurations.
> ---
>  benchtests/bench-memcpy-random.c | 107 +++++++++++++++++++++++++++----
>  1 file changed, 96 insertions(+), 11 deletions(-)
>
> diff --git a/benchtests/bench-memcpy-random.c b/benchtests/bench-memcpy-random.c
> index c490b73ed0..28e0acb05f 100644
> --- a/benchtests/bench-memcpy-random.c
> +++ b/benchtests/bench-memcpy-random.c
> @@ -16,7 +16,8 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#define MIN_PAGE_SIZE (512*1024+getpagesize())
> +#define MAX_TEST_SIZE (512*1024)
> +#define MIN_PAGE_SIZE (3*MAX_TEST_SIZE+3*getpagesize())
>  #define TEST_MAIN
>  #define TEST_NAME "memcpy"
>  #include "bench-string.h"
> @@ -89,9 +90,12 @@ static align_data_t dst_align_freq[] =
>
>  typedef struct
>  {
> -  uint64_t src : 24;
> -  uint64_t dst : 24;
> -  uint64_t len : 16;
> +/* 26 bits for src and dst so we have extra bit for alternating dst >
> +   src without a branch.  */
> +  uint64_t src : 26;
> +  uint64_t dst : 26;
> +  /* For size < 4096 12 bits is enough.  */
> +  uint64_t len : 12;
>  } copy_t;
>
>  static copy_t copy[MAX_COPIES];
> @@ -142,34 +146,100 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src,
>  }
>
>  static void
> -do_test (json_ctx_t *json_ctx, size_t max_size)
> +do_one_fixed_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src,
> +               copy_t *copy, size_t n, size_t size)
>  {
> -  int i;
> +  timing_t start, stop, cur;
> +  size_t iters = INNER_LOOP_ITERS_SMALL;
>
> -  memset (buf1, 1, max_size);
> +  for (int j = 0; j < n; j++)
> +    CALL (impl, dst + copy[j].dst, src + copy[j].src, size);
>
> -  /* Create a random set of copies with the given size and alignment
> +  TIMING_NOW (start);
> +  for (int i = 0; i < iters; ++i)
> +    for (int j = 0; j < n; j++)
> +      CALL (impl, dst + copy[j].dst, src + copy[j].src, size);
> +  TIMING_NOW (stop);
> +
> +  TIMING_DIFF (cur, start, stop);
> +
> +  json_element_double (json_ctx, (double) cur / (double) iters);
> +}
> +
> +
> +static size_t
> +init_copy(size_t max_size, int dst_gt_src)
> +{
> +  size_t i, dst_offset, src_offset;
> +  if (dst_gt_src <= 0)
> +    {
> +      dst_offset = 0;
> +      src_offset = MAX_TEST_SIZE + getpagesize();
> +    }
> +  else
> +    {
> +      dst_offset = MAX_TEST_SIZE + getpagesize();
> +      src_offset = 0;
> +    }
> +
> +    /* Create a random set of copies with the given size and alignment
>       distributions.  */
>    for (i = 0; i < MAX_COPIES; i++)
>      {
> +      dst_offset  = dst_gt_src == -1
> +                        ? (rand() & 1) ? MAX_TEST_SIZE + getpagesize() : 0
> +                        : dst_offset;
>        copy[i].dst = (rand () & (max_size - 1));
>        copy[i].dst &= ~dst_align_arr[rand () & ALIGN_MASK];
> +      copy[i].dst += dst_offset;
>        copy[i].src = (rand () & (max_size - 1));
>        copy[i].src &= ~src_align_arr[rand () & ALIGN_MASK];
> +      copy[i].src += src_offset;
>        copy[i].len = size_arr[rand () & SIZE_MASK];
>      }
> +  return i;
> +}
>
> +static void
> +do_test (json_ctx_t *json_ctx, size_t max_size, int dst_gt_src)
> +{
> +  size_t n;
> +  memset (buf1, 1, max_size);
> +  n = init_copy(max_size, dst_gt_src);
>    json_element_object_begin (json_ctx);
> -  json_attr_uint (json_ctx, "length", (double) max_size);
> +  json_attr_uint (json_ctx, "max-alignment", (double) max_size);
> +  json_attr_int (json_ctx, "dst > src", (double) dst_gt_src);
> +  json_attr_uint (json_ctx, "with-fixed-size", (double) 0);
>    json_array_begin (json_ctx, "timings");
>
>    FOR_EACH_IMPL (impl, 0)
> -    do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, copy, i);
> +    do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, copy, n);
>
>    json_array_end (json_ctx);
>    json_element_object_end (json_ctx);
>  }
>
> +static void
> +do_test_fixed_size (json_ctx_t *json_ctx, size_t size, size_t max_size, int dst_gt_src)
> +{
> +  size_t n;
> +  memset (buf1, 1, max_size);
> +  n = init_copy(max_size, dst_gt_src);
> +  json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "max-alignment", (double) max_size);
> +  json_attr_int (json_ctx, "dst > src", (double) dst_gt_src);
> +  json_attr_uint (json_ctx, "with-fixed-size", (double) 1);
> +  json_attr_uint (json_ctx, "size", (double) size);
> +  json_array_begin (json_ctx, "timings");
> +
> +  FOR_EACH_IMPL (impl, 0)
> +    do_one_fixed_test (json_ctx, impl, (char *) buf2, (char *) buf1, copy, n, size);
> +
> +  json_array_end (json_ctx);
> +  json_element_object_end (json_ctx);
> +}
> +
> +
>  int
>  test_main (void)
>  {
> @@ -194,7 +264,22 @@ test_main (void)
>
>    json_array_begin (&json_ctx, "results");
>    for (int i = 4; i <= 512; i = i * 2)
> -    do_test (&json_ctx, i * 1024);
> +    {
> +      if (i * 1024 > MAX_TEST_SIZE)
> +          continue;
> +      do_test (&json_ctx, i * 1024, 0);
> +      do_test (&json_ctx, i * 1024, 1);
> +      do_test (&json_ctx, i * 1024, -1);
> +    }
> +
> +  for (int i = 4; i <= 64; i = i * 2)
> +    {
> +      if (i * 1024 > MAX_TEST_SIZE)
> +          continue;
> +      do_test_fixed_size (&json_ctx, i * 256, i * 1024, 0);
> +      do_test_fixed_size (&json_ctx, i * 256, i * 1024, 1);
> +      do_test_fixed_size (&json_ctx, i * 256, i * 1024, -1);
> +    }
>
>    json_array_end (&json_ctx);
>    json_attr_object_end (&json_ctx);
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH 2/5] benchtests: Add new random cases to bench-memcpy-random.c
@ 2021-08-24 17:09 Wilco Dijkstra via Libc-alpha
  2021-08-24 19:32 ` Noah Goldstein via Libc-alpha
  0 siblings, 1 reply; 5+ messages in thread
From: Wilco Dijkstra via Libc-alpha @ 2021-08-24 17:09 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: 'GNU C Library'

Hi Noah,

> This commit adds three new benchmarks for the SPEC2017
> distribution. One randomized if dst > src and the other two set it
> either 1/0.

What is the goal of this? Have you ever seen memcpy do something
radically differently depending on whether src < dst (assuming no overlaps)?
Generally it's the same basic loop that gets used, so I don't see the usefulness
of adding this.

Some comments on the implementation details are below, there are
several issues that need to be resolved.

> As well add some tests for fixed sizes with randomize alignment and
> value of dst > src. This can be useful for testing different alignment
> configurations.

This tests fixed size copies from 1KB to 128KB. Large copies are typically
aligned, so I'm not sure it makes sense to use the random alignment 
distribution for small copies here.
 
-#define MIN_PAGE_SIZE (512*1024+getpagesize())
+#define MAX_TEST_SIZE (512*1024)

This makes sense. Please also use it in the loops. Also it may be
worthwhile to increase this value a bit if larger sizes are what you
are after (and the minimum size of 4K could be increased a bit as
well as L1/L2 caches have increased a lot in recent years).

+#define MIN_PAGE_SIZE (3*MAX_TEST_SIZE+3*getpagesize())

I don't get this at all. The goal is to copy within the MAX_TEST_SIZE
region, not to copy outside it. And why 3 times the size?

 typedef struct
 {
-  uint64_t src : 24;
-  uint64_t dst : 24;
-  uint64_t len : 16;
+/* 26 bits for src and dst so we have extra bit for alternating dst >
+   src without a branch.  */
+  uint64_t src : 26;
+  uint64_t dst : 26;
+  /* For size < 4096 12 bits is enough.  */
+  uint64_t len : 12;
 } copy_t;

This doesn't make any sense, where do we need offsets larger than 2^24?
 
+static size_t
+init_copy(size_t max_size, int dst_gt_src)
+{
+  size_t i, dst_offset, src_offset;
+  if (dst_gt_src <= 0)
+    {
+      dst_offset = 0;
+      src_offset = MAX_TEST_SIZE + getpagesize();
+    }
+  else
+    {
+      dst_offset = MAX_TEST_SIZE + getpagesize();
+      src_offset = 0;
+    }

This doesn't make sense since this is now copying outside of the region,
effectively enlarging the region but also always reading from one part and
writing to another. The data outside the region is uninitialized which means
it is using the zero page which gives unrepresentative results.

+    /* Create a random set of copies with the given size and alignment
      distributions.  */
   for (i = 0; i < MAX_COPIES; i++)
     {
+      dst_offset  = dst_gt_src == -1
+                        ? (rand() & 1) ? MAX_TEST_SIZE + getpagesize() : 0
+                        : dst_offset;

Same here. If you want to force src > dst then just skip cases where
src < dst. Similarly for avoiding accidental overlaps (which are likely
to happen for small values of max_size).

Given the much larger sizes used for some tests, it is also worth
ensuring the copy stays within the region.

       copy[i].dst = (rand () & (max_size - 1));
       copy[i].dst &= ~dst_align_arr[rand () & ALIGN_MASK];
+      copy[i].dst += dst_offset;
       copy[i].src = (rand () & (max_size - 1));
       copy[i].src &= ~src_align_arr[rand () & ALIGN_MASK];
+      copy[i].src += src_offset;
       copy[i].len = size_arr[rand () & SIZE_MASK];
     }
+  return i;
+}
 
+static void
+do_test (json_ctx_t *json_ctx, size_t max_size, int dst_gt_src)
+{
+  size_t n;
+  memset (buf1, 1, max_size);

Note this doesn't initialize the extra data in the buffer which creates
odd performance behaviour due to the zero page.

+  n = init_copy(max_size, dst_gt_src);
   json_element_object_begin (json_ctx);
-  json_attr_uint (json_ctx, "length", (double) max_size);
+  json_attr_uint (json_ctx, "max-alignment", (double) max_size);

max-alignment, where does that come from???
 
   json_array_begin (&json_ctx, "results");
   for (int i = 4; i <= 512; i = i * 2)
-    do_test (&json_ctx, i * 1024);
+    {
+      if (i * 1024 > MAX_TEST_SIZE)
+          continue;

It's much easier to change the for loop to i = 4096 to MAX_TEST_SIZE
and remove the various i * 1024. Then changing MAX_TEST_SIZE just works.

Cheers,
Wilco

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 2/5] benchtests: Add new random cases to bench-memcpy-random.c
  2021-08-24 17:09 [PATCH 2/5] benchtests: Add new random cases to bench-memcpy-random.c Wilco Dijkstra via Libc-alpha
@ 2021-08-24 19:32 ` Noah Goldstein via Libc-alpha
  2021-08-26 17:06   ` Wilco Dijkstra via Libc-alpha
  0 siblings, 1 reply; 5+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2021-08-24 19:32 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: GNU C Library

On Tue, Aug 24, 2021 at 1:09 PM Wilco Dijkstra <Wilco.Dijkstra@arm.com>
wrote:

> Hi Noah,
>
> > This commit adds three new benchmarks for the SPEC2017
> > distribution. One randomized if dst > src and the other two set it
> > either 1/0.
>
> What is the goal of this? Have you ever seen memcpy do something
> radically differently depending on whether src < dst (assuming no
> overlaps)?
> Generally it's the same basic loop that gets used, so I don't see the
> usefulness
> of adding this.
>

Since on x86_64 at least we use memmove to implement memcpy there are a
few points where the logic revolves around src < dst. Particularly in
L(more_8x_vec)
which branch on the condition w.o checking for overlap. I don't think its
particularly
fair for the random tests to essentially ignore that branch and imo a
better reflection
of real-world usage. An implementation could add some really branch heavy
code
in the case ignored by the current implementation that looked good on the
fixed
benchmarks but would perform worse in the randomized tests.

For example in cases where src/dst 4k alias forward/backward looping can
make a big difference.
On fixed tests it may be worth it to add branches that check for that, but
in a randomized
environment where the branches can miss could say otherwise.


> Some comments on the implementation details are below, there are
> several issues that need to be resolved.
>
> > As well add some tests for fixed sizes with randomize alignment and
> > value of dst > src. This can be useful for testing different alignment
> > configurations.
>
> This tests fixed size copies from 1KB to 128KB. Large copies are typically
> aligned, so I'm not sure it makes sense to use the random alignment
> distribution for small copies here.
>

I generally agree. But on the flip side there is essentially no way to
tiebreak different
alignment methods. For example in x86_64 aligning to 1x vs 2x VECs will
improve about
50% of the fixed tests and harm 50%. I think this can be useful for
determining "which method
is expected to yield better results". There are other more branch heavy
possible alignment
configurations, which I don't think fixed tests would be able to accurately
benchmark as they
would be 100% predicted.


>
> -#define MIN_PAGE_SIZE (512*1024+getpagesize())
> +#define MAX_TEST_SIZE (512*1024)
>
> This makes sense. Please also use it in the loops. Also it may be
> worthwhile to increase this value a bit if larger sizes are what you
> are after (and the minimum size of 4K could be increased a bit as
> well as L1/L2 caches have increased a lot in recent years).
>


> +#define MIN_PAGE_SIZE (3*MAX_TEST_SIZE+3*getpagesize())
>
> I don't get this at all. The goal is to copy within the MAX_TEST_SIZE
> region, not to copy outside it. And why 3 times the size?
>
>  typedef struct
>  {
> -  uint64_t src : 24;
> -  uint64_t dst : 24;
> -  uint64_t len : 16;
> +/* 26 bits for src and dst so we have extra bit for alternating dst >
> +   src without a branch.  */
> +  uint64_t src : 26;
> +  uint64_t dst : 26;
> +  /* For size < 4096 12 bits is enough.  */
> +  uint64_t len : 12;
>  } copy_t;
>
> This doesn't make any sense, where do we need offsets larger than 2^24?
>
We need the extra bits so store max_size offsets to alternate direction.


>
> +static size_t
> +init_copy(size_t max_size, int dst_gt_src)
> +{
> +  size_t i, dst_offset, src_offset;
> +  if (dst_gt_src <= 0)
> +    {
> +      dst_offset = 0;
> +      src_offset = MAX_TEST_SIZE + getpagesize();
> +    }
> +  else
> +    {
> +      dst_offset = MAX_TEST_SIZE + getpagesize();
> +      src_offset = 0;
> +    }
>
> This doesn't make sense since this is now copying outside of the region,
> effectively enlarging the region but also always reading from one part and
> writing to another. The data outside the region is uninitialized which
> means
> it is using the zero page which gives unrepresentative results.
>

I see. I didn't quite grasp the purpose of the max_size. (As seen by the
fact that I renamed "length" to "max-alignment", now "region-size").

Changed it to use max_size instead of MAX_TEST_SIZE and initialize
3 * max_size.

I think its better to expand region size to 3x max_size as opposed to 2x
max_size and have noise for store-forwarding (on x86 at least)

>
> +    /* Create a random set of copies with the given size and alignment
>       distributions.  */
>    for (i = 0; i < MAX_COPIES; i++)
>      {
> +      dst_offset  = dst_gt_src == -1
> +                        ? (rand() & 1) ? MAX_TEST_SIZE + getpagesize() : 0
> +                        : dst_offset;
>
> Same here. If you want to force src > dst then just skip cases where
> src < dst. Similarly for avoiding accidental overlaps (which are likely
> to happen for small values of max_size).
>

I don't quite understand what you mean? I change it to be
2 * max_size in v2 and src_offset always has a value of
max_size if src > dst or randomized. This should prevent
overlaps as essentially src has its own max_size buffer
and dst has two possible max_size buffers, one below src
and one above.

The current implementation was extremely buggy. Sorry!


> Given the much larger sizes used for some tests, it is also worth
> ensuring the copy stays within the region.
>

Fixed in v2


>
>        copy[i].dst = (rand () & (max_size - 1));
>        copy[i].dst &= ~dst_align_arr[rand () & ALIGN_MASK];
> +      copy[i].dst += dst_offset;
>        copy[i].src = (rand () & (max_size - 1));
>        copy[i].src &= ~src_align_arr[rand () & ALIGN_MASK];
> +      copy[i].src += src_offset;
>        copy[i].len = size_arr[rand () & SIZE_MASK];
>      }
> +  return i;
> +}
>
> +static void
> +do_test (json_ctx_t *json_ctx, size_t max_size, int dst_gt_src)
> +{
> +  size_t n;
> +  memset (buf1, 1, max_size);
>
> Note this doesn't initialize the extra data in the buffer which creates
> odd performance behaviour due to the zero page.
>

Fixed in v2.

>
> +  n = init_copy(max_size, dst_gt_src);
>    json_element_object_begin (json_ctx);
> -  json_attr_uint (json_ctx, "length", (double) max_size);
> +  json_attr_uint (json_ctx, "max-alignment", (double) max_size);
>
> max-alignment, where does that come from???
>
> I misunderstood the purpose of "length" which is essentially total
region size. Imo "length" in context of memcpy generally refers to
copy size. Renamed "region-size" and set to full region size used
which previously was 2 * max_size, now 3 * max_size.


>    json_array_begin (&json_ctx, "results");
>    for (int i = 4; i <= 512; i = i * 2)
> -    do_test (&json_ctx, i * 1024);
> +    {
> +      if (i * 1024 > MAX_TEST_SIZE)
> +          continue;
>
> It's much easier to change the for loop to i = 4096 to MAX_TEST_SIZE
> and remove the various i * 1024. Then changing MAX_TEST_SIZE just works.
>
Fixed in v2.


>
> Cheers,
> Wilco

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 2/5] benchtests: Add new random cases to bench-memcpy-random.c
  2021-08-24 19:32 ` Noah Goldstein via Libc-alpha
@ 2021-08-26 17:06   ` Wilco Dijkstra via Libc-alpha
  0 siblings, 0 replies; 5+ messages in thread
From: Wilco Dijkstra via Libc-alpha @ 2021-08-26 17:06 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library

Hi Noah,

> Since on x86_64 at least we use memmove to implement memcpy there are a 
> few points where the logic revolves around src < dst. Particularly in L(more_8x_vec)
> which branch on the condition w.o checking for overlap. I don't think its particularly
> fair for the random tests to essentially ignore that branch and imo a better reflection
> of real-world usage. An implementation could add some really branch heavy code
> in the case ignored by the current implementation that looked good on the fixed
> benchmarks but would perform worse in the randomized tests.

Yes such branches are a bad idea in general, particularly for small sizes. However
I don't understand why you think that branch is ignored given the current random
tests will already test it (so you should see it being expensive in profiles, and you
can easily check whether doing an overlap test ends up faster).

So why would there need to be any changes to the benchmark?

> For example in cases where src/dst 4k alias forward/backward looping can make a big difference. 
> On fixed tests it may be worth it to add branches that check for that, but in a randomized 
> environment where the branches can miss could say otherwise.

Again the existing benchmark should already test those branches - if they
execute frequently and are hard to predict, the results will be worse.


> I generally agree. But on the flip side there is essentially no way to tiebreak different
> alignment methods. For example in x86_64 aligning to 1x vs 2x VECs will improve about
> 50% of the fixed tests and harm 50%. I think this can be useful for determining "which method
> is expected to yield better results". There are other more branch heavy possible alignment
> configurations, which I don't think fixed tests would be able to accurately benchmark as they
> would be 100% predicted.
 
Yes the fixed tests are not very good for tuning at all since they completely ignore the
effects of branch prediction. I agree adding fixed sizes might be useful for alignment
tuning but then it's best to keep it to smaller sizes since that's where the alignment
overhead is the largest (say from 128 bytes to 4KB rather than 4KB to 128KB).

+  uint64_t src : 26;
+  uint64_t dst : 26;

>> This doesn't make any sense, where do we need offsets larger than 2^24?
>
> We need the extra bits so store max_size offsets to alternate direction.

You don't need the extra bits, 24 bits is more than enough.

>> This doesn't make sense since this is now copying outside of the region,
>> effectively enlarging the region but also always reading from one part and
>> writing to another. The data outside the region is uninitialized which means
>> it is using the zero page which gives unrepresentative results.
>
> I see. I didn't quite grasp the purpose of the max_size. (As seen by the
> fact that I renamed "length" to "max-alignment", now "region-size"). 
>
> Changed it to use max_size instead of MAX_TEST_SIZE and initialize
> 3 * max_size.

That still doesn't solve the issue. Basically the original test runs within
a block of max_size. You're effectively doubling or tripling it which means
the results are not comparable with the original code, and the results
cannot be compared between the variants in the new version either.

> I think its better to expand region size to 3x max_size as opposed to 2x
> max_size and have noise for store-forwarding (on x86 at least)

I don't see how that helps. The point of the benchmark is to be more
real-world than the fixed benchmarks. This means you will see lots of
mispredictions, cachemisses and other penalties. Yes, it will tell you if you
have a bad memcpy implemention. And that is a good thing rather than
something to be avoided.

+    /* Create a random set of copies with the given size and alignment
      distributions.  */
   for (i = 0; i < MAX_COPIES; i++)
     {
+      dst_offset  = dst_gt_src == -1
+                        ? (rand() & 1) ? MAX_TEST_SIZE + getpagesize() : 0
+                        : dst_offset;

> I don't quite understand what you mean? I change it to be
> 2 * max_size in v2 and src_offset always has a value of
> max_size if src > dst or randomized. This should prevent
> overlaps as essentially src has its own max_size buffer
> and dst has two possible max_size buffers, one below src
> and one above.

My point is that doesn't make sense even if we ignore issues in
the implementation. Basically it doubles the region size for 2 of
the 3 variants and triples it for the other, so they're not comparable.
When you can't even come to any conclusion about that src < dst
branch, what use does all this have?

The other thing is that with splitting the buffer into read-only and
write-only we go a step back towards fixed tests. A memcpy often
reads data that has recently been written to, so doing both reads
and writes from the same buffer is closer to real world scenarios.

> Fixed in v2

Is that this version (which doesn't appear to have fixed it) or are you
planning to post another one?

https://sourceware.org/pipermail/libc-alpha/2021-August/130493.html

 void
+do_test (json_ctx_t *json_ctx, size_t max_size, int dst_gt_src)
+{
+  size_t n;
+  memset (buf1, 1, max_size);

>> Note this doesn't initialize the extra data in the buffer which creates
>> odd performance behaviour due to the zero page.
>
> Fixed in v2.

The memset looks fixed indeed. However there now more bugs in the
fixed size case where it does:

+  n = init_copy(3 * max_size, dst_gt_src);

> I misunderstood the purpose of "length" which is essentially total
> region size. Imo "length" in context of memcpy generally refers to
> copy size. Renamed "region-size" and set to full region size used 
> which previously was 2 * max_size, now 3 * max_size.

"region-size" sounds reasonable, but you might need to update the
XML template to allow all these changes too.

>> It's much easier to change the for loop to i = 4096 to MAX_TEST_SIZE
>> and remove the various i * 1024. Then changing MAX_TEST_SIZE just works.
> Fixed in v2.
 
+  for (int i = 4096; i < MAX_TEST_SIZE; i = i * 2)

That looks much better indeed, but please use i <= MAX_TEST_SIZE.

Cheers,
Wilco

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2021-08-26 17:07 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-24 17:09 [PATCH 2/5] benchtests: Add new random cases to bench-memcpy-random.c Wilco Dijkstra via Libc-alpha
2021-08-24 19:32 ` Noah Goldstein via Libc-alpha
2021-08-26 17:06   ` Wilco Dijkstra via Libc-alpha
  -- strict thread matches above, loose matches on Subject: below --
2021-08-24  8:27 [PATCH 1/5] string: Make tests bidirectional test-memcpy.c Noah Goldstein via Libc-alpha
2021-08-24  8:27 ` [PATCH 2/5] benchtests: Add new random cases to bench-memcpy-random.c Noah Goldstein via Libc-alpha
2021-08-24 15:18   ` H.J. Lu via Libc-alpha

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).