[PATCH] benchtests: Add memset zero fill benchmark tests

unofficial mirror of libc-alpha@sourceware.org
 help / color / mirror / Atom feed

* [PATCH] benchtests: Add memset zero fill benchmark tests
@ 2021-07-13  8:22 Naohiro Tamura via Libc-alpha
  2021-07-13 13:50 ` Lucas A. M. Magalhaes via Libc-alpha
                   ` (8 more replies)
  0 siblings, 9 replies; 83+ messages in thread
From: Naohiro Tamura via Libc-alpha @ 2021-07-13  8:22 UTC (permalink / raw)
  To: libc-alpha

Memset takes 0 as the second parameter in most cases.
More than 95% of memset takes 0 as the second parameter in case of
Linux Kernel source code.
However, we cannot measure the zero fill performance by
bench-memset.c and bench-memset-large.c.
This patch provides bench-memset-zerofill.c and
bench-memset-large-zerofill.c which are suitable to see the
performance of zero fill by fixing the second parameter to 0.
---
 benchtests/Makefile                      |   3 +-
 benchtests/bench-memset-large-zerofill.c | 125 ++++++++++++++++++
 benchtests/bench-memset-zerofill.c       | 156 +++++++++++++++++++++++
 3 files changed, 283 insertions(+), 1 deletion(-)
 create mode 100644 benchtests/bench-memset-large-zerofill.c
 create mode 100644 benchtests/bench-memset-zerofill.c

diff --git a/benchtests/Makefile b/benchtests/Makefile
index 1530939a8ce8..1261f7650fc7 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -53,7 +53,8 @@ string-benchset := memccpy memchr memcmp memcpy memmem memmove \
 		   strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
 		   strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
 		   strcoll memcpy-large memcpy-random memmove-large memset-large \
-		   memcpy-walk memset-walk memmove-walk
+		   memcpy-walk memset-walk memmove-walk memset-zerofill \
+		   memset-large-zerofill
 
 # Build and run locale-dependent benchmarks only if we're building natively.
 ifeq (no,$(cross-compiling))
diff --git a/benchtests/bench-memset-large-zerofill.c b/benchtests/bench-memset-large-zerofill.c
new file mode 100644
index 000000000000..d8eae9d9789f
--- /dev/null
+++ b/benchtests/bench-memset-large-zerofill.c
@@ -0,0 +1,125 @@
+/* Measure memset functions with large data sizes.
+   Copyright (C) 2016-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#define TEST_NAME "memset"
+#define START_SIZE (128 * 1024)
+#define MIN_PAGE_SIZE (getpagesize () + 64 * 1024 * 1024)
+#define TIMEOUT (20 * 60)
+#include "bench-string.h"
+
+#include <assert.h>
+#include "json-lib.h"
+
+void *generic_memset (void *, int, size_t);
+typedef void *(*proto_t) (void *, int, size_t);
+
+IMPL (MEMSET, 1)
+IMPL (generic_memset, 0)
+
+static void
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
+	     int c __attribute ((unused)), size_t n)
+{
+  size_t i, iters = 16;
+  timing_t start, stop, cur;
+
+  TIMING_NOW (start);
+  for (i = 0; i < iters; ++i)
+    {
+      CALL (impl, s, c, n);
+    }
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  json_element_double (json_ctx, (double) cur / (double) iters);
+}
+
+static void
+do_test (json_ctx_t *json_ctx, size_t align, int c, size_t len)
+{
+  align &= 63;
+  if ((align + len) * sizeof (CHAR) > page_size)
+    return;
+
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "length", len);
+  json_attr_uint (json_ctx, "alignment", align);
+  json_attr_int (json_ctx, "char", c);
+  json_array_begin (json_ctx, "timings");
+
+  FOR_EACH_IMPL (impl, 0)
+    {
+      do_one_test (json_ctx, impl, (CHAR *) (buf1) + align, c, len);
+      alloc_bufs ();
+    }
+
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
+}
+
+int
+test_main (void)
+{
+  json_ctx_t json_ctx;
+  size_t i;
+  int c;
+
+  test_init ();
+
+  json_init (&json_ctx, 0, stdout);
+
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_attr_string (&json_ctx, "bench-variant", "large-zerofill");
+
+  json_array_begin (&json_ctx, "ifuncs");
+  FOR_EACH_IMPL (impl, 0)
+    json_element_string (&json_ctx, impl->name);
+  json_array_end (&json_ctx);
+
+  json_array_begin (&json_ctx, "results");
+
+  c = 0;
+  for (i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
+    {
+      do_test (&json_ctx, 0, c, i);
+      do_test (&json_ctx, 3, c, i);
+    }
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
+  return ret;
+}
+
+#include <support/test-driver.c>
+
+#define libc_hidden_builtin_def(X)
+#define libc_hidden_def(X)
+#define libc_hidden_weak(X)
+#define weak_alias(X,Y)
+#undef MEMSET
+#define MEMSET generic_memset
+#include <string/memset.c>
diff --git a/benchtests/bench-memset-zerofill.c b/benchtests/bench-memset-zerofill.c
new file mode 100644
index 000000000000..ac20ae4c6537
--- /dev/null
+++ b/benchtests/bench-memset-zerofill.c
@@ -0,0 +1,156 @@
+/* Measure memset functions.
+   Copyright (C) 2013-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#ifndef WIDE
+# define TEST_NAME "memset"
+#else
+# define TEST_NAME "wmemset"
+# define generic_memset generic_wmemset
+#endif /* WIDE */
+#define MIN_PAGE_SIZE 131072
+#include "bench-string.h"
+
+#include "json-lib.h"
+
+#ifdef WIDE
+CHAR *generic_wmemset (CHAR *, CHAR, size_t);
+#else
+void *generic_memset (void *, int, size_t);
+#endif
+
+typedef void *(*proto_t) (void *, int, size_t);
+
+IMPL (MEMSET, 1)
+IMPL (generic_memset, 0)
+
+static void
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
+	     int c __attribute ((unused)), size_t n)
+{
+  size_t i, iters = INNER_LOOP_ITERS;
+  timing_t start, stop, cur;
+
+  TIMING_NOW (start);
+  for (i = 0; i < iters; ++i)
+    {
+      CALL (impl, s, c, n);
+    }
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  json_element_double (json_ctx, (double) cur / (double) iters);
+}
+
+static void
+do_test (json_ctx_t *json_ctx, size_t align, int c, size_t len)
+{
+  align &= 4095;
+  if ((align + len) * sizeof (CHAR) > page_size)
+    return;
+
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "length", len);
+  json_attr_uint (json_ctx, "alignment", align);
+  json_attr_int (json_ctx, "char", c);
+  json_array_begin (json_ctx, "timings");
+
+  FOR_EACH_IMPL (impl, 0)
+    {
+      do_one_test (json_ctx, impl, (CHAR *) (buf1) + align, c, len);
+      alloc_bufs ();
+    }
+
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
+}
+
+int
+test_main (void)
+{
+  json_ctx_t json_ctx;
+  size_t i;
+  int c = 0;
+
+  test_init ();
+
+  json_init (&json_ctx, 0, stdout);
+
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_attr_string (&json_ctx, "bench-variant", "default-zerofill");
+
+  json_array_begin (&json_ctx, "ifuncs");
+  FOR_EACH_IMPL (impl, 0)
+    json_element_string (&json_ctx, impl->name);
+  json_array_end (&json_ctx);
+
+  json_array_begin (&json_ctx, "results");
+
+  c = 0;
+  for (i = 0; i < 18; ++i)
+    do_test (&json_ctx, 0, c, 1 << i);
+  for (i = 1; i < 64; ++i)
+    {
+      do_test (&json_ctx, i, c, i);
+      do_test (&json_ctx, 4096 - i, c, i);
+      do_test (&json_ctx, 4095, c, i);
+      if (i & (i - 1))
+	do_test (&json_ctx, 0, c, i);
+    }
+  for (i = 32; i < 512; i+=32)
+    {
+      do_test (&json_ctx, 0, c, i);
+      do_test (&json_ctx, i, c, i);
+    }
+  do_test (&json_ctx, 1, c, 14);
+  do_test (&json_ctx, 3, c, 1024);
+  do_test (&json_ctx, 4, c, 64);
+  do_test (&json_ctx, 2, c, 25);
+  for (i = 33; i <= 256; i += 4)
+    {
+      do_test (&json_ctx, 0, c, 32 * i);
+      do_test (&json_ctx, i, c, 32 * i);
+    }
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
+  return ret;
+}
+
+#include <support/test-driver.c>
+
+#define libc_hidden_builtin_def(X)
+#define libc_hidden_def(X)
+#define libc_hidden_weak(X)
+#define weak_alias(X,Y)
+#ifndef WIDE
+# undef MEMSET
+# define MEMSET generic_memset
+# include <string/memset.c>
+#else
+# define WMEMSET generic_wmemset
+# include <wcsmbs/wmemset.c>
+#endif
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 83+ messages in thread

* Re: [PATCH] benchtests: Add memset zero fill benchmark tests
  2021-07-13  8:22 [PATCH] benchtests: Add memset zero fill benchmark tests Naohiro Tamura via Libc-alpha
@ 2021-07-13 13:50 ` Lucas A. M. Magalhaes via Libc-alpha
  2021-07-20  6:31 ` [PATCH v2 0/5] " Naohiro Tamura via Libc-alpha
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 83+ messages in thread
From: Lucas A. M. Magalhaes via Libc-alpha @ 2021-07-13 13:50 UTC (permalink / raw)
  To: Naohiro Tamura, libc-alpha

Hi Naohiro,
Thanks for working on this.

I like the idea of a benchmark specific for 0 on memset. However having two
implementations seems too much. I would rather see just one
bench-memset-zerofill.c. What I guess would be even better is to have this
performance test inside bench-memset.c and bench-memset-large.c.

Quoting Naohiro Tamura via Libc-alpha (2021-07-13 05:22:14)
> Memset takes 0 as the second parameter in most cases.
> More than 95% of memset takes 0 as the second parameter in case of
> Linux Kernel source code.
The Linux kernel does not use glibc; it has its own memset implementation.
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/lib/string.c#n784
Therefore IMO this argument is not suited for this commit.

> However, we cannot measure the zero fill performance by
> bench-memset-zerofill.c and bench-memset-large-zerofill.c.
> This patch provides bench-memset-zerofill.c and
> bench-memset-large-zerofill.c which are suitable to see the
> performance of zero fill by fixing the second parameter to 0.
In this section I guess you wrote bench-memset-zerofill.c and
bench-memset-large-zerofill.c where you meant bench-memset.c and
bench-memset-large.c.

> ---
>  benchtests/Makefile                      |   3 +-
>  benchtests/bench-memset-large-zerofill.c | 125 ++++++++++++++++++
>  benchtests/bench-memset-zerofill.c       | 156 +++++++++++++++++++++++
>  3 files changed, 283 insertions(+), 1 deletion(-)
>  create mode 100644 benchtests/bench-memset-large-zerofill.c
>  create mode 100644 benchtests/bench-memset-zerofill.c
> 
> diff --git a/benchtests/Makefile b/benchtests/Makefile
> index 1530939a8ce8..1261f7650fc7 100644
> --- a/benchtests/Makefile
> +++ b/benchtests/Makefile
> @@ -53,7 +53,8 @@ string-benchset := memccpy memchr memcmp memcpy memmem memmove \
>                    strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
>                    strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
>                    strcoll memcpy-large memcpy-random memmove-large memset-large \
> -                  memcpy-walk memset-walk memmove-walk
> +                  memcpy-walk memset-walk memmove-walk memset-zerofill \
> +                  memset-large-zerofill
>  
>  # Build and run locale-dependent benchmarks only if we're building natively.
>  ifeq (no,$(cross-compiling))
> diff --git a/benchtests/bench-memset-large-zerofill.c b/benchtests/bench-memset-large-zerofill.c
> new file mode 100644
> index 000000000000..d8eae9d9789f
> --- /dev/null
> +++ b/benchtests/bench-memset-large-zerofill.c
> @@ -0,0 +1,125 @@
> +/* Measure memset functions with large data sizes.
Please fix this description.

> +   Copyright (C) 2016-2021 Free Software Foundation, Inc.
Just 2021 here.

> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define TEST_MAIN
> +#define TEST_NAME "memset"
> +#define START_SIZE (128 * 1024)
> +#define MIN_PAGE_SIZE (getpagesize () + 64 * 1024 * 1024)
> +#define TIMEOUT (20 * 60)
> +#include "bench-string.h"
> +
> +#include <assert.h>
> +#include "json-lib.h"
> +
This code doesn't need assert.h.

> +void *generic_memset (void *, int, size_t);
> +typedef void *(*proto_t) (void *, int, size_t);
> +
> +IMPL (MEMSET, 1)
> +IMPL (generic_memset, 0)
> +
> +static void
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
> +            int c __attribute ((unused)), size_t n)
> +{
> +  size_t i, iters = 16;
> +  timing_t start, stop, cur;
> +
> +  TIMING_NOW (start);
> +  for (i = 0; i < iters; ++i)
> +    {
> +      CALL (impl, s, c, n);
> +    }
> +  TIMING_NOW (stop);
> +
> +  TIMING_DIFF (cur, start, stop);
> +
> +  json_element_double (json_ctx, (double) cur / (double) iters);
> +}
> +
> +static void
> +do_test (json_ctx_t *json_ctx, size_t align, int c, size_t len)
> +{
> +  align &= 63;
> +  if ((align + len) * sizeof (CHAR) > page_size)
> +    return;
> +
> +  json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "length", len);
> +  json_attr_uint (json_ctx, "alignment", align);
> +  json_attr_int (json_ctx, "char", c);
> +  json_array_begin (json_ctx, "timings");
> +
> +  FOR_EACH_IMPL (impl, 0)
> +    {
> +      do_one_test (json_ctx, impl, (CHAR *) (buf1) + align, c, len);
> +      alloc_bufs ();
> +    }
> +
> +  json_array_end (json_ctx);
> +  json_element_object_end (json_ctx);
> +}
> +
> +int
> +test_main (void)
> +{
> +  json_ctx_t json_ctx;
> +  size_t i;
> +  int c;
> +
> +  test_init ();
> +
> +  json_init (&json_ctx, 0, stdout);
> +
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, TEST_NAME);
> +  json_attr_string (&json_ctx, "bench-variant", "large-zerofill");
> +
> +  json_array_begin (&json_ctx, "ifuncs");
> +  FOR_EACH_IMPL (impl, 0)
> +    json_element_string (&json_ctx, impl->name);
> +  json_array_end (&json_ctx);
> +
> +  json_array_begin (&json_ctx, "results");
> +
> +  c = 0;
> +  for (i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
> +    {
> +      do_test (&json_ctx, 0, c, i);
> +      do_test (&json_ctx, 3, c, i);
> +    }
> +
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
> +
> +  return ret;
> +}
> +
> +#include <support/test-driver.c>
> +
> +#define libc_hidden_builtin_def(X)
> +#define libc_hidden_def(X)
> +#define libc_hidden_weak(X)
> +#define weak_alias(X,Y)
> +#undef MEMSET
> +#define MEMSET generic_memset
> +#include <string/memset.c>
> diff --git a/benchtests/bench-memset-zerofill.c b/benchtests/bench-memset-zerofill.c
> new file mode 100644
> index 000000000000..ac20ae4c6537
> --- /dev/null
> +++ b/benchtests/bench-memset-zerofill.c
> @@ -0,0 +1,156 @@
> +/* Measure memset functions.
Fix the description.

> +   Copyright (C) 2013-2021 Free Software Foundation, Inc.
Only 2021 here.

> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define TEST_MAIN
> +#ifndef WIDE
> +# define TEST_NAME "memset"
> +#else
> +# define TEST_NAME "wmemset"
> +# define generic_memset generic_wmemset
> +#endif /* WIDE */
> +#define MIN_PAGE_SIZE 131072
> +#include "bench-string.h"
> +
> +#include "json-lib.h"
> +
> +#ifdef WIDE
> +CHAR *generic_wmemset (CHAR *, CHAR, size_t);
> +#else
> +void *generic_memset (void *, int, size_t);
> +#endif
> +
> +typedef void *(*proto_t) (void *, int, size_t);
> +
> +IMPL (MEMSET, 1)
> +IMPL (generic_memset, 0)
> +
> +static void
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
> +            int c __attribute ((unused)), size_t n)
> +{
> +  size_t i, iters = INNER_LOOP_ITERS;
> +  timing_t start, stop, cur;
> +
> +  TIMING_NOW (start);
> +  for (i = 0; i < iters; ++i)
> +    {
> +      CALL (impl, s, c, n);
> +    }
> +  TIMING_NOW (stop);
> +
> +  TIMING_DIFF (cur, start, stop);
> +
> +  json_element_double (json_ctx, (double) cur / (double) iters);
> +}
> +
> +static void
> +do_test (json_ctx_t *json_ctx, size_t align, int c, size_t len)
> +{
> +  align &= 4095;
> +  if ((align + len) * sizeof (CHAR) > page_size)
> +    return;
> +
> +  json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "length", len);
> +  json_attr_uint (json_ctx, "alignment", align);
> +  json_attr_int (json_ctx, "char", c);
> +  json_array_begin (json_ctx, "timings");
> +
> +  FOR_EACH_IMPL (impl, 0)
> +    {
> +      do_one_test (json_ctx, impl, (CHAR *) (buf1) + align, c, len);
> +      alloc_bufs ();
> +    }
> +
> +  json_array_end (json_ctx);
> +  json_element_object_end (json_ctx);
> +}
> +
> +int
> +test_main (void)
> +{
> +  json_ctx_t json_ctx;
> +  size_t i;
> +  int c = 0;
> +
> +  test_init ();
> +
> +  json_init (&json_ctx, 0, stdout);
> +
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, TEST_NAME);
> +  json_attr_string (&json_ctx, "bench-variant", "default-zerofill");
> +
> +  json_array_begin (&json_ctx, "ifuncs");
> +  FOR_EACH_IMPL (impl, 0)
> +    json_element_string (&json_ctx, impl->name);
> +  json_array_end (&json_ctx);
> +
> +  json_array_begin (&json_ctx, "results");
> +
> +  c = 0;
> +  for (i = 0; i < 18; ++i)
> +    do_test (&json_ctx, 0, c, 1 << i);
> +  for (i = 1; i < 64; ++i)
> +    {
> +      do_test (&json_ctx, i, c, i);
> +      do_test (&json_ctx, 4096 - i, c, i);
> +      do_test (&json_ctx, 4095, c, i);
> +      if (i & (i - 1))
> +       do_test (&json_ctx, 0, c, i);
> +    }
> +  for (i = 32; i < 512; i+=32)
> +    {
> +      do_test (&json_ctx, 0, c, i);
> +      do_test (&json_ctx, i, c, i);
> +    }
> +  do_test (&json_ctx, 1, c, 14);
> +  do_test (&json_ctx, 3, c, 1024);
> +  do_test (&json_ctx, 4, c, 64);
> +  do_test (&json_ctx, 2, c, 25);
> +  for (i = 33; i <= 256; i += 4)
> +    {
> +      do_test (&json_ctx, 0, c, 32 * i);
> +      do_test (&json_ctx, i, c, 32 * i);
> +    }
> +
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
> +
> +  return ret;
> +}
> +
> +#include <support/test-driver.c>
> +
> +#define libc_hidden_builtin_def(X)
> +#define libc_hidden_def(X)
> +#define libc_hidden_weak(X)
> +#define weak_alias(X,Y)
> +#ifndef WIDE
> +# undef MEMSET
> +# define MEMSET generic_memset
> +# include <string/memset.c>
> +#else
> +# define WMEMSET generic_wmemset
> +# include <wcsmbs/wmemset.c>
> +#endif
> -- 
> 2.17.1
> 

---
Lucas A. M. Magalhães

^ permalink raw reply	[flat|nested] 83+ messages in thread

* [PATCH v2 0/5] benchtests: Add memset zero fill benchmark tests
  2021-07-13  8:22 [PATCH] benchtests: Add memset zero fill benchmark tests Naohiro Tamura via Libc-alpha
  2021-07-13 13:50 ` Lucas A. M. Magalhaes via Libc-alpha
@ 2021-07-20  6:31 ` Naohiro Tamura via Libc-alpha
  2021-08-05  7:47   ` [PATCH v3 0/5] benchtests: Add memset zero fill benchmark test Naohiro Tamura via Libc-alpha
  2021-07-20  6:34 ` [PATCH v2 1/5] benchtests: Enable scripts/plot_strings.py to read stdin Naohiro Tamura via Libc-alpha
                   ` (6 subsequent siblings)
  8 siblings, 1 reply; 83+ messages in thread
From: Naohiro Tamura via Libc-alpha @ 2021-07-20  6:31 UTC (permalink / raw)
  To: Noah Goldstein, Wilco Dijkstra, Lucas A. M. Magalhaes, libc-alpha

Hi Lucas, Wilco, Noah, and all,

This patch series contains version 2 of the memset zero fill
benchmark test and its related groundwork.
Please take a look.

Thanks.
Naohiro

Naohiro Tamura (5):
  benchtests: Enable scripts/plot_strings.py to read stdin
  benchtests: Add memset zero fill benchtest
  benchtests: Add a script to convert benchout string JSON to CSV
  benchtests: Remove redundant assert.h
  benchtests: Fix validate_benchout.py exceptions

 benchtests/Makefile                       |   2 +-
 benchtests/bench-memset-large.c           |   1 -
 benchtests/bench-memset-walk.c            |   1 -
 benchtests/bench-memset-zerofill.c        | 128 ++++++++++++++++++++++
 benchtests/scripts/benchout_string2csv.sh |  44 ++++++++
 benchtests/scripts/import_bench.py        |   5 +-
 benchtests/scripts/plot_strings.py        |  11 +-
 benchtests/scripts/validate_benchout.py   |   6 +-
 8 files changed, 190 insertions(+), 8 deletions(-)
 create mode 100644 benchtests/bench-memset-zerofill.c
 create mode 100755 benchtests/scripts/benchout_string2csv.sh

-- 
2.17.1


^ permalink raw reply	[flat|nested] 83+ messages in thread

* [PATCH v2 1/5] benchtests: Enable scripts/plot_strings.py to read stdin
  2021-07-13  8:22 [PATCH] benchtests: Add memset zero fill benchmark tests Naohiro Tamura via Libc-alpha
  2021-07-13 13:50 ` Lucas A. M. Magalhaes via Libc-alpha
  2021-07-20  6:31 ` [PATCH v2 0/5] " Naohiro Tamura via Libc-alpha
@ 2021-07-20  6:34 ` Naohiro Tamura via Libc-alpha
  2021-07-20  6:35 ` [PATCH v2 2/5] benchtests: Add memset zero fill benchtest Naohiro Tamura via Libc-alpha
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 83+ messages in thread
From: Naohiro Tamura via Libc-alpha @ 2021-07-20  6:34 UTC (permalink / raw)
  To: Noah Goldstein, Wilco Dijkstra, Lucas A. M. Magalhaes, libc-alpha

This patch enables scripts/plot_strings.py to read a benchmark result
file from stdin.
To keep backward compatibility, that is, to keep accepting multiple
benchmark result files as arguments, an empty argument list does not
mean stdin, but '-' does.
Therefore the nargs parameter of ArgumentParser.add_argument() is
not changed to '?', but kept as '+'.

ex:
  $ jq '.' bench-memset.out | plot_strings.py -
  $ jq '.' bench-memset.out | plot_strings.py - bench-memset-large.out
  $ plot_strings.py bench-memset.out bench-memset-large.out

error ex:
  $ jq '.' bench-memset.out | plot_strings.py
---
 benchtests/scripts/plot_strings.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/benchtests/scripts/plot_strings.py b/benchtests/scripts/plot_strings.py
index c71f0804e4de..ec634692d9ad 100755
--- a/benchtests/scripts/plot_strings.py
+++ b/benchtests/scripts/plot_strings.py
@@ -31,6 +31,7 @@ import json
 import matplotlib as mpl
 import numpy as np
 import os
+import sys
 
 try:
     import jsonschema as validator
@@ -331,8 +332,11 @@ def main(args):
     for filename in args.bench:
         bench = None
 
-        with open(filename, "r") as f:
-            bench = json.load(f)
+        if filename == '-':
+            bench = json.load(sys.stdin)
+        else:
+            with open(filename, "r") as f:
+                bench = json.load(f)
 
         validator.validate(bench, schema)
 
@@ -354,7 +358,8 @@ if __name__ == "__main__":
 
     # Required parameter
     parser.add_argument("bench", nargs="+",
-                        help="benchmark results file(s) in json format")
+                        help="benchmark results file(s) in json format, " \
+                        "and/or '-' as a benchmark result file from stdin")
 
     # Optional parameters
     parser.add_argument("-b", "--baseline", type=str,
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH v2 2/5] benchtests: Add memset zero fill benchtest
  2021-07-13  8:22 [PATCH] benchtests: Add memset zero fill benchmark tests Naohiro Tamura via Libc-alpha
                   ` (2 preceding siblings ...)
  2021-07-20  6:34 ` [PATCH v2 1/5] benchtests: Enable scripts/plot_strings.py to read stdin Naohiro Tamura via Libc-alpha
@ 2021-07-20  6:35 ` Naohiro Tamura via Libc-alpha
  2021-07-20 16:48   ` Noah Goldstein via Libc-alpha
  2021-07-20  6:35 ` [PATCH v2 3/5] benchtests: Add a script to convert benchout string JSON to CSV Naohiro Tamura via Libc-alpha
                   ` (4 subsequent siblings)
  8 siblings, 1 reply; 83+ messages in thread
From: Naohiro Tamura via Libc-alpha @ 2021-07-20  6:35 UTC (permalink / raw)
  To: Noah Goldstein, Wilco Dijkstra, Lucas A. M. Magalhaes, libc-alpha

Memset takes 0 as the second parameter in most cases.
However, we cannot measure the zero fill performance by bench-memset.c
and bench-memset-large.c precisely.
X86_64 micro-architecture has some zero-over-zero optimization, and
AArch64 micro-architecture also has some optimization for DC ZVA
instruction.
This patch provides bench-memset-zerofill.c which is suitable to
analyze the zero fill performance by zero-over-zero and zero-over-one
test cases from 16KB(L1), through L2 and L3, to 64MB(RAM).
---
 benchtests/Makefile                |   2 +-
 benchtests/bench-memset-zerofill.c | 128 +++++++++++++++++++++++++++++
 2 files changed, 129 insertions(+), 1 deletion(-)
 create mode 100644 benchtests/bench-memset-zerofill.c

diff --git a/benchtests/Makefile b/benchtests/Makefile
index 1530939a8ce8..21b95c736190 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -53,7 +53,7 @@ string-benchset := memccpy memchr memcmp memcpy memmem memmove \
 		   strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
 		   strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
 		   strcoll memcpy-large memcpy-random memmove-large memset-large \
-		   memcpy-walk memset-walk memmove-walk
+		   memcpy-walk memset-walk memmove-walk memset-zerofill
 
 # Build and run locale-dependent benchmarks only if we're building natively.
 ifeq (no,$(cross-compiling))
diff --git a/benchtests/bench-memset-zerofill.c b/benchtests/bench-memset-zerofill.c
new file mode 100644
index 000000000000..2579b6edd09e
--- /dev/null
+++ b/benchtests/bench-memset-zerofill.c
@@ -0,0 +1,128 @@
+/* Measure memset functions with zero fill data.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#define TEST_NAME "memset"
+#define START_SIZE (16 * 1024)
+#define MIN_PAGE_SIZE (getpagesize () + 64 * 1024 * 1024)
+#define TIMEOUT (20 * 60)
+#include "bench-string.h"
+
+#include "json-lib.h"
+
+void *generic_memset (void *, int, size_t);
+typedef void *(*proto_t) (void *, int, size_t);
+
+IMPL (MEMSET, 1)
+IMPL (generic_memset, 0)
+
+static void
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
+	     int c1 __attribute ((unused)), int c2 __attribute ((unused)),
+	     size_t n)
+{
+  size_t i, iters = 16;
+  timing_t start, stop, cur;
+
+  TIMING_NOW (start);
+  for (i = 0; i < iters; i += 2)
+    {
+      CALL (impl, s, c1, n);
+      CALL (impl, s, c2, n);
+    }
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  json_element_double (json_ctx, (double) cur / (double) iters);
+}
+
+static void
+do_test (json_ctx_t *json_ctx, size_t align, int c1, int c2, size_t len)
+{
+  align &= 63;
+  if ((align + len) * sizeof (CHAR) > page_size)
+    return;
+
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "length", len);
+  json_attr_uint (json_ctx, "alignment", align);
+  json_attr_int (json_ctx, "char1", c1);
+  json_attr_int (json_ctx, "char2", c2);
+  json_array_begin (json_ctx, "timings");
+
+  FOR_EACH_IMPL (impl, 0)
+    {
+      do_one_test (json_ctx, impl, (CHAR *) (buf1) + align, c1, c2, len);
+      alloc_bufs ();
+    }
+
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
+}
+
+int
+test_main (void)
+{
+  json_ctx_t json_ctx;
+  size_t i;
+  int c1, c2;
+
+  test_init ();
+
+  json_init (&json_ctx, 0, stdout);
+
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_attr_string (&json_ctx, "bench-variant", "zerofill");
+
+  json_array_begin (&json_ctx, "ifuncs");
+  FOR_EACH_IMPL (impl, 0)
+    json_element_string (&json_ctx, impl->name);
+  json_array_end (&json_ctx);
+
+  json_array_begin (&json_ctx, "results");
+
+  c2 = 0;
+  for (c1 = 0; c1 < 2; c1++)
+    for (i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
+      {
+	do_test (&json_ctx, 0, c1, c2, i);
+	do_test (&json_ctx, 3, c1, c2, i);
+      }
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
+  return ret;
+}
+
+#include <support/test-driver.c>
+
+#define libc_hidden_builtin_def(X)
+#define libc_hidden_def(X)
+#define libc_hidden_weak(X)
+#define weak_alias(X,Y)
+#undef MEMSET
+#define MEMSET generic_memset
+#include <string/memset.c>
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH v2 3/5] benchtests: Add a script to convert benchout string JSON to CSV
  2021-07-13  8:22 [PATCH] benchtests: Add memset zero fill benchmark tests Naohiro Tamura via Libc-alpha
                   ` (3 preceding siblings ...)
  2021-07-20  6:35 ` [PATCH v2 2/5] benchtests: Add memset zero fill benchtest Naohiro Tamura via Libc-alpha
@ 2021-07-20  6:35 ` Naohiro Tamura via Libc-alpha
  2021-07-21  2:41   ` naohirot--- via Libc-alpha
  2021-07-27 20:17   ` Joseph Myers
  2021-07-20  6:36 ` [PATCH v2 4/5] benchtests: Remove redundant assert.h Naohiro Tamura via Libc-alpha
                   ` (3 subsequent siblings)
  8 siblings, 2 replies; 83+ messages in thread
From: Naohiro Tamura via Libc-alpha @ 2021-07-20  6:35 UTC (permalink / raw)
  To: Noah Goldstein, Wilco Dijkstra, Lucas A. M. Magalhaes, libc-alpha

This patch adds the "benchout_string2csv.sh" script to convert benchout
string JSON to CSV so that we can visualize performance data with any
spreadsheet such as MS Excel or Google Sheets.

Usage: benchout_string2csv.sh
  read benchout string JSON from standard input
  write CSV to standard output
ex:
  $ cat bench-memset.out | benchout_string2csv.sh > bench-memset.csv
---
 benchtests/scripts/benchout_string2csv.sh | 44 +++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100755 benchtests/scripts/benchout_string2csv.sh

diff --git a/benchtests/scripts/benchout_string2csv.sh b/benchtests/scripts/benchout_string2csv.sh
new file mode 100755
index 000000000000..045870fed162
--- /dev/null
+++ b/benchtests/scripts/benchout_string2csv.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# Copyright (C) 2021 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+# Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
+
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+
+#
+# Convert benchout string JSON to CSV
+#
+if [[ $1 == "-h" ]] || [[ $# != 0 ]]; then
+  echo "Usage: ${0##*/}"
+  echo "  read benchout string JSON from standard input"
+  echo "  write CSV to standard output"
+  echo "ex:"
+  echo "  $ cat bench-memset.out | ${0##*/} > bench-memset.csv"
+exit 1
+fi
+
+jq -r '
+  . as $root |
+  . as {$functions} |
+  $functions | to_entries | .[0].value as $func_value |
+  $func_value as {$_, $ifuncs, $results} |
+  (["timing_type", $root.timing_type] | @csv),
+  (["functions", ($functions | keys | .[0]),
+    "bench-variant", $func_value."bench-variant"] | @csv),
+  ($results[0] | to_entries | map([.key]) | flatten | @csv),
+  ($results[0] | reduce range(1; . | length) as $_ ([]; . + [""])
+    + $ifuncs | @csv),
+  ($results[] | to_entries | map([.value]) | flatten | @csv)
+'
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH v2 4/5] benchtests: Remove redundant assert.h
  2021-07-13  8:22 [PATCH] benchtests: Add memset zero fill benchmark tests Naohiro Tamura via Libc-alpha
                   ` (4 preceding siblings ...)
  2021-07-20  6:35 ` [PATCH v2 3/5] benchtests: Add a script to convert benchout string JSON to CSV Naohiro Tamura via Libc-alpha
@ 2021-07-20  6:36 ` Naohiro Tamura via Libc-alpha
  2021-07-20  6:37 ` [PATCH v2 5/5] benchtests: Fix validate_benchout.py exceptions Naohiro Tamura via Libc-alpha
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 83+ messages in thread
From: Naohiro Tamura via Libc-alpha @ 2021-07-20  6:36 UTC (permalink / raw)
  To: Noah Goldstein, Wilco Dijkstra, Lucas A. M. Magalhaes, libc-alpha

This patch removes the redundant "#include <assert.h>" from
bench-memset-large.c and bench-memset-walk.c.
---
 benchtests/bench-memset-large.c | 1 -
 benchtests/bench-memset-walk.c  | 1 -
 2 files changed, 2 deletions(-)

diff --git a/benchtests/bench-memset-large.c b/benchtests/bench-memset-large.c
index 97ed78d0d6a1..3fd20b79e53d 100644
--- a/benchtests/bench-memset-large.c
+++ b/benchtests/bench-memset-large.c
@@ -23,7 +23,6 @@
 #define TIMEOUT (20 * 60)
 #include "bench-string.h"
 
-#include <assert.h>
 #include "json-lib.h"
 
 void *generic_memset (void *, int, size_t);
diff --git a/benchtests/bench-memset-walk.c b/benchtests/bench-memset-walk.c
index 0dcad09c484f..5fb315384992 100644
--- a/benchtests/bench-memset-walk.c
+++ b/benchtests/bench-memset-walk.c
@@ -23,7 +23,6 @@
 #define TIMEOUT (20 * 60)
 #include "bench-string.h"
 
-#include <assert.h>
 #include "json-lib.h"
 
 void *generic_memset (void *, int, size_t);
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH v2 5/5] benchtests: Fix validate_benchout.py exceptions
  2021-07-13  8:22 [PATCH] benchtests: Add memset zero fill benchmark tests Naohiro Tamura via Libc-alpha
                   ` (5 preceding siblings ...)
  2021-07-20  6:36 ` [PATCH v2 4/5] benchtests: Remove redundant assert.h Naohiro Tamura via Libc-alpha
@ 2021-07-20  6:37 ` Naohiro Tamura via Libc-alpha
  2021-07-26  8:34 ` [PATCH] config: Remove HAVE_BUILTIN_MEMSET macro Naohiro Tamura via Libc-alpha
  2021-07-26  8:35 ` [PATCH] benchtests: Add a script to merge two benchout string files Naohiro Tamura via Libc-alpha
  8 siblings, 0 replies; 83+ messages in thread
From: Naohiro Tamura via Libc-alpha @ 2021-07-20  6:37 UTC (permalink / raw)
  To: Noah Goldstein, Wilco Dijkstra, Lucas A. M. Magalhaes, libc-alpha

This patch fixes two exceptions in validate_benchout.py: AttributeError
if benchout_strings.schema.json is specified, and
json.decoder.JSONDecodeError if the benchout file is not JSON.
---
 benchtests/scripts/import_bench.py      | 5 ++++-
 benchtests/scripts/validate_benchout.py | 6 +++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/benchtests/scripts/import_bench.py b/benchtests/scripts/import_bench.py
index a799b4e1b7dc..e3337ca5d638 100644
--- a/benchtests/scripts/import_bench.py
+++ b/benchtests/scripts/import_bench.py
@@ -104,7 +104,10 @@ def do_for_all_timings(bench, callback):
     """
     for func in bench['functions'].keys():
         for k in bench['functions'][func].keys():
-            if 'timings' not in bench['functions'][func][k].keys():
+            try:
+                if 'timings' not in bench['functions'][func][k].keys():
+                    continue
+            except AttributeError:
                 continue
 
             callback(bench, func, k)
diff --git a/benchtests/scripts/validate_benchout.py b/benchtests/scripts/validate_benchout.py
index 47df33ed0252..00d5fa0ee5eb 100755
--- a/benchtests/scripts/validate_benchout.py
+++ b/benchtests/scripts/validate_benchout.py
@@ -73,11 +73,15 @@ def main(args):
 
     except bench.validator.ValidationError as e:
         return print_and_exit("Invalid benchmark output: %s" % e.message,
-            os.EX_DATAERR)
+                os.EX_DATAERR)
 
     except bench.validator.SchemaError as e:
         return print_and_exit("Invalid schema: %s" % e.message, os.EX_DATAERR)
 
+    except json.decoder.JSONDecodeError as e:
+        return print_and_exit("Benchmark output in %s is not JSON." % args[0],
+                os.EX_DATAERR)
+
     print("Benchmark output in %s is valid." % args[0])
     return os.EX_OK
 
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 83+ messages in thread

* Re: [PATCH v2 2/5] benchtests: Add memset zero fill benchtest
  2021-07-20  6:35 ` [PATCH v2 2/5] benchtests: Add memset zero fill benchtest Naohiro Tamura via Libc-alpha
@ 2021-07-20 16:48   ` Noah Goldstein via Libc-alpha
  2021-07-21 12:56     ` naohirot--- via Libc-alpha
  2021-07-26  8:39     ` naohirot--- via Libc-alpha
  0 siblings, 2 replies; 83+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2021-07-20 16:48 UTC (permalink / raw)
  To: Naohiro Tamura; +Cc: GNU C Library, Wilco Dijkstra

On Tue, Jul 20, 2021 at 2:35 AM Naohiro Tamura <naohirot@fujitsu.com> wrote:

> Memset takes 0 as the second parameter in most cases.
> However, we cannot measure the zero fill performance by bench-memset.c
> and bench-memset-large.c precisely.
> X86_64 micro-architecture has some zero-over-zero optimization, and
> AArch64 micro-architecture also has some optimization for DC ZVA
> instruction.
> This patch provides bench-memset-zerofill.c which is suitable to
> analyze the zero fill performance by zero-over-zero and zero-over-one
> test cases from 16KB(L1), through L2 and L3, to 64MB(RAM).
> ---
>  benchtests/Makefile                |   2 +-
>  benchtests/bench-memset-zerofill.c | 128 +++++++++++++++++++++++++++++
>  2 files changed, 129 insertions(+), 1 deletion(-)
>  create mode 100644 benchtests/bench-memset-zerofill.c
>
> diff --git a/benchtests/Makefile b/benchtests/Makefile
> index 1530939a8ce8..21b95c736190 100644
> --- a/benchtests/Makefile
> +++ b/benchtests/Makefile
> @@ -53,7 +53,7 @@ string-benchset := memccpy memchr memcmp memcpy memmem
> memmove \
>                    strncasecmp strncat strncmp strncpy strnlen strpbrk
> strrchr \
>                    strspn strstr strcpy_chk stpcpy_chk memrchr strsep
> strtok \
>                    strcoll memcpy-large memcpy-random memmove-large
> memset-large \
> -                  memcpy-walk memset-walk memmove-walk
> +                  memcpy-walk memset-walk memmove-walk memset-zerofill
>
>  # Build and run locale-dependent benchmarks only if we're building
> natively.
>  ifeq (no,$(cross-compiling))
> diff --git a/benchtests/bench-memset-zerofill.c
> b/benchtests/bench-memset-zerofill.c
> new file mode 100644
> index 000000000000..2579b6edd09e
> --- /dev/null
> +++ b/benchtests/bench-memset-zerofill.c
> @@ -0,0 +1,128 @@
> +/* Measure memset functions with zero fill data.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define TEST_MAIN
> +#define TEST_NAME "memset"
> +#define START_SIZE (16 * 1024)
> +#define MIN_PAGE_SIZE (getpagesize () + 64 * 1024 * 1024)
> +#define TIMEOUT (20 * 60)
> +#include "bench-string.h"
> +
> +#include "json-lib.h"
> +
> +void *generic_memset (void *, int, size_t);
> +typedef void *(*proto_t) (void *, int, size_t);
> +
> +IMPL (MEMSET, 1)
> +IMPL (generic_memset, 0)
> +
> +static void
>
Do we want __attribute__((noinline, noclone))?

> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
> +            int c1 __attribute ((unused)), int c2 __attribute ((unused)),
> +            size_t n)
> +{
> +  size_t i, iters = 16;
>

I think 16 is probably too few iterations for reliable benchmarking.
Maybe `INNER_LOOP_ITERS` which is 8192

+  timing_t start, stop, cur;
> +
> +  TIMING_NOW (start);
> +  for (i = 0; i < iters; i += 2)
> +    {
> +      CALL (impl, s, c1, n);
>
I am a bit worried that the overhead from the first call with `c1` will
distort the results.
Is it possible to implement it with a nested loop where you fill `s` with
`c1` for
`n * inner_loop_iterations` in the outer loop and in the inner loop fill
`c2` on `s + n * i`?
In that case maybe 16 for inner loop iterations and 512 for outer loop
iterations.

> +      CALL (impl, s, c2, n);
> +    }
> +  TIMING_NOW (stop);
> +
> +  TIMING_DIFF (cur, start, stop);
> +
> +  json_element_double (json_ctx, (double) cur / (double) iters);
> +}
> +
> +static void
> +do_test (json_ctx_t *json_ctx, size_t align, int c1, int c2, size_t len)
> +{
> +  align &= 63;
>
Can you make this `align &= getpagesize () - 1;`?

> +  if ((align + len) * sizeof (CHAR) > page_size)
> +    return;
> +
> +  json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "length", len);
> +  json_attr_uint (json_ctx, "alignment", align);
> +  json_attr_int (json_ctx, "char1", c1);
> +  json_attr_int (json_ctx, "char2", c2);
> +  json_array_begin (json_ctx, "timings");
> +
> +  FOR_EACH_IMPL (impl, 0)
> +    {
> +      do_one_test (json_ctx, impl, (CHAR *) (buf1) + align, c1, c2, len);
> +      alloc_bufs ();
> +    }
> +
> +  json_array_end (json_ctx);
> +  json_element_object_end (json_ctx);
> +}
> +
> +int
> +test_main (void)
> +{
> +  json_ctx_t json_ctx;
> +  size_t i;
> +  int c1, c2;
> +
> +  test_init ();
> +
> +  json_init (&json_ctx, 0, stdout);
> +
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, TEST_NAME);
> +  json_attr_string (&json_ctx, "bench-variant", "zerofill");
> +
> +  json_array_begin (&json_ctx, "ifuncs");
> +  FOR_EACH_IMPL (impl, 0)
> +    json_element_string (&json_ctx, impl->name);
> +  json_array_end (&json_ctx);
> +
> +  json_array_begin (&json_ctx, "results");
> +
> +  c2 = 0;
> +  for (c1 = 0; c1 < 2; c1++)
> +    for (i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
> +      {
> +       do_test (&json_ctx, 0, c1, c2, i);
> +       do_test (&json_ctx, 3, c1, c2, i);
> +      }
> +
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
> +
> +  return ret;
> +}
> +
> +#include <support/test-driver.c>
> +
> +#define libc_hidden_builtin_def(X)
> +#define libc_hidden_def(X)
> +#define libc_hidden_weak(X)
> +#define weak_alias(X,Y)
> +#undef MEMSET
> +#define MEMSET generic_memset
> +#include <string/memset.c>
> --
> 2.17.1
>
>

^ permalink raw reply	[flat|nested] 83+ messages in thread

* RE: [PATCH v2 3/5] benchtests: Add a script to convert benchout string JSON to CSV
  2021-07-20  6:35 ` [PATCH v2 3/5] benchtests: Add a script to convert benchout string JSON to CSV Naohiro Tamura via Libc-alpha
@ 2021-07-21  2:41   ` naohirot--- via Libc-alpha
  2021-07-27 20:17   ` Joseph Myers
  1 sibling, 0 replies; 83+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-07-21  2:41 UTC (permalink / raw)
  To: naohirot@fujitsu.com, Noah Goldstein, Wilco Dijkstra,
	Lucas A. M. Magalhaes, libc-alpha@sourceware.org

This is self-review.

> From: Naohiro Tamura <naohirot@fujitsu.com>
> Sent: Tuesday, July 20, 2021 3:36 PM
 
> This patch adds "benchout_string2csv.sh" script to convert benchout
> string JSON to CSV so that we can visualize performance data by any
> spreadsheet such as MS Excel and Google Sheet.
> 
> Usage: benchout_string2csv.sh
>   read benchout string JSON from standard input
>   write CSV to standard output
> ex:
>   $ cat bench-memset.out | benchout_string2csv.sh > bench-memset.csv
> ---
>  benchtests/scripts/benchout_string2csv.sh | 44 +++++++++++++++++++++++
>  1 file changed, 44 insertions(+)
>  create mode 100755 benchtests/scripts/benchout_string2csv.sh
> 
> diff --git a/benchtests/scripts/benchout_string2csv.sh b/benchtests/scripts/benchout_string2csv.sh
> new file mode 100755
> index 000000000000..045870fed162
> --- /dev/null
> +++ b/benchtests/scripts/benchout_string2csv.sh
> @@ -0,0 +1,44 @@
> +#!/bin/bash
> +# Copyright (C) 2021 Free Software Foundation, Inc.
> +# This file is part of the GNU C Library.
> +# Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
oops! I'll remove the above line.

> +
> +# The GNU C Library is free software; you can redistribute it and/or
> +# modify it under the terms of the GNU Lesser General Public
> +# License as published by the Free Software Foundation; either
> +# version 2.1 of the License, or (at your option) any later version.
> +
> +# The GNU C Library is distributed in the hope that it will be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +# Lesser General Public License for more details.
> +
> +# You should have received a copy of the GNU Lesser General Public
> +# License along with the GNU C Library; if not, see
> +# <https://www.gnu.org/licenses/>.
> +
> +#
> +# Convert benchout string JSON to CSV
> +#
> +if [[ $1 == "-h" ]] || [[ $# != 0 ]]; then
> +  echo "Usage: ${0##*/}"
> +  echo "  read benchout string JSON from standard input"
> +  echo "  write CSV to standard output"
> +  echo "ex:"
> +  echo "  $ cat bench-memset.out | ${0##*/} > bench-memset.csv"
> +exit 1
> +fi
> +
> +jq -r '
> +  . as $root |
> +  . as {$functions} |
> +  $functions | to_entries | .[0].value as $func_value |
> +  $func_value as {$_, $ifuncs, $results} |
> +  (["timing_type", $root.timing_type] | @csv),
> +  (["functions", ($functions | keys | .[0]),
> +    "bench-variant", $func_value."bench-variant"] | @csv),
> +  ($results[0] | to_entries | map([.key]) | flatten | @csv),
> +  ($results[0] | reduce range(1; . | length) as $_ ([]; . + [""])
> +    + $ifuncs | @csv),
> +  ($results[] | to_entries | map([.value]) | flatten | @csv)
> +'
> --
> 2.17.1


^ permalink raw reply	[flat|nested] 83+ messages in thread

* RE: [PATCH v2 2/5] benchtests: Add memset zero fill benchtest
  2021-07-20 16:48   ` Noah Goldstein via Libc-alpha
@ 2021-07-21 12:56     ` naohirot--- via Libc-alpha
  2021-07-21 13:07       ` naohirot--- via Libc-alpha
  2021-07-26  8:39     ` naohirot--- via Libc-alpha
  1 sibling, 1 reply; 83+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-07-21 12:56 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Wilco Dijkstra

Hi Noah,

Thank you for the review.

> > +#define TEST_MAIN
> > +#define TEST_NAME "memset"
> > +#define START_SIZE (16 * 1024)
> > +#define MIN_PAGE_SIZE (getpagesize () + 64 * 1024 * 1024)
> > +#define TIMEOUT (20 * 60)
> > +#include "bench-string.h"
> > +
> > +#include "json-lib.h"
> > +
> > +void *generic_memset (void *, int, size_t);
> > +typedef void *(*proto_t) (void *, int, size_t);
> > +
> > +IMPL (MEMSET, 1)
> > +IMPL (generic_memset, 0)
> > +
> > +static void
> Do we want __attribute__((noinline, noclone))? 

Yes, I'll add it.

> > +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
> > +            int c1 __attribute ((unused)), int c2 __attribute ((unused)),
> > +            size_t n)
> > +{
> > +  size_t i, iters = 16;
> 
> I think 16 is probably too few iterations for reliable benchmarking. 
> Maybe `INNER_LOOP_ITERS` which is 8192

I tried it. If it is changed to 8192, it hit the TIMEOUT (20 * 60) on a64fx.
Please check the code below.

> 
> > +  timing_t start, stop, cur;
> > +
> > +  TIMING_NOW (start);
> > +  for (i = 0; i < iters; i += 2)
> > +    {
> > +      CALL (impl, s, c1, n);
> I am a bit worried that the overhead from the first call with `c1` will distort the results.
> Is it possible to implement it with a nested loop where you fill `s` with `c1` for 
> `n * inner_loop_iterations` in the outer loop and in the inner loop fill `c2` on `s + n * i`? 
> In that case maybe 16 for inner loop iterations and 512 for outer loop iterations. 

It seems that we have to use a smaller number, assuming this implementation
is correct, because it would take an estimated 99.4 minutes, extrapolating
from the case where "iters = 32" took 23.3 seconds.
(8192/32*23.3/60=99.4)


#define START_SIZE (16 * 1024)
...
static void
__attribute__((noinline, noclone))
do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
             int c1 __attribute ((unused)), int c2 __attribute ((unused)),
             size_t n)
{
  size_t i, j, iters = INNER_LOOP_ITERS; // 32;
  timing_t start, stop, cur, latency = 0;

  for (i = 0; i < 512; i++) // for (i = 0; i < 2; i++)
    {
      CALL (impl, s, c1, n * 16);
      TIMING_NOW (start);
      for (j = 0; j < 16; j++)
        CALL (impl, s + n * j, c2, n);
      TIMING_NOW (stop);
      TIMING_DIFF (cur, start, stop);
      TIMING_ACCUM (latency, cur);
    }

  json_element_double (json_ctx, (double) latency / (double) iters);
}

> > +      CALL (impl, s, c2, n);
> > +    }
> > +  TIMING_NOW (stop);
> > +
> > +  TIMING_DIFF (cur, start, stop);
> > +
> > +  json_element_double (json_ctx, (double) cur / (double) iters);
> > +}
> > +
> > +static void
> > +do_test (json_ctx_t *json_ctx, size_t align, int c1, int c2, size_t len)
> > +{
> > +  align &= 63;
> Can you make this `align &= getpagesize () - 1;`?  

I'll change it.

Thanks.
Naohiro

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v2 2/5] benchtests: Add memset zero fill benchtest
  2021-07-21 12:56     ` naohirot--- via Libc-alpha
@ 2021-07-21 13:07       ` naohirot--- via Libc-alpha
  2021-07-21 18:14         ` Noah Goldstein via Libc-alpha
  0 siblings, 1 reply; 83+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-07-21 13:07 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Wilco Dijkstra

Hi Noah,

One typo in the updated code.
Wrong:
  #define START_SIZE (16 * 1024)
Right:
  #define BUF1PAGES 16

Thanks
Naohiro
________________________________________
From: Tamura, Naohiro/田村 直広 <naohirot@fujitsu.com>
Sent: Wednesday, 21 July 2021 21:56
To: Noah Goldstein
Cc: Wilco Dijkstra; Lucas A. M. Magalhaes; GNU C Library
Subject: RE: [PATCH v2 2/5] benchtests: Add memset zero fill benchtest

Hi Noah,

Thank you for the review.

> > +#define TEST_MAIN
> > +#define TEST_NAME "memset"
> > +#define START_SIZE (16 * 1024)
> > +#define MIN_PAGE_SIZE (getpagesize () + 64 * 1024 * 1024)
> > +#define TIMEOUT (20 * 60)
> > +#include "bench-string.h"
> > +
> > +#include "json-lib.h"
> > +
> > +void *generic_memset (void *, int, size_t);
> > +typedef void *(*proto_t) (void *, int, size_t);
> > +
> > +IMPL (MEMSET, 1)
> > +IMPL (generic_memset, 0)
> > +
> > +static void
> Do we want __attribute__((noinline, noclone))?

Yes, I'll add it.

> > +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
> > +            int c1 __attribute ((unused)), int c2 __attribute ((unused)),
> > +            size_t n)
> > +{
> > +  size_t i, iters = 16;
>
> I think 16 is probably too few iterations for reliable benchmarking.
> Maybe `INNER_LOOP_ITERS` which is 8192

I tried it. If it is changed to 8192, it hit the TIMEOUT (20 * 60) on a64fx.
Please check the code below.

>
> > +  timing_t start, stop, cur;
> > +
> > +  TIMING_NOW (start);
> > +  for (i = 0; i < iters; i += 2)
> > +    {
> > +      CALL (impl, s, c1, n);
> I am a bit worried that the overhead from the first call with `c1` will distort the results.
> Is it possible to implement it with a nested loop where you fill `s` with `c1` for
> `n * inner_loop_iterations` in the outer loop and in the inner loop fill `c2` on `s + n * i`?
> In that case maybe 16 for inner loop iterations and 512 for outer loop iterations.

It seems that we have to set smaller number if this implementation is not wrong.
Because it will take 99.4 minutes estimating from the case that "iters = 32"
took 23.3 seconds.
(8192/32*23.3/60=99.4)


#define START_SIZE (16 * 1024)
...
static void
__attribute__((noinline, noclone))
do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
             int c1 __attribute ((unused)), int c2 __attribute ((unused)),
             size_t n)
{
  size_t i, j, iters = INNER_LOOP_ITERS; // 32;
  timing_t start, stop, cur, latency = 0;

  for (i = 0; i < 512; i++) // for (i = 0; i < 2; i++)
    {
      CALL (impl, s, c1, n * 16);
      TIMING_NOW (start);
      for (j = 0; j < 16; j++)
        CALL (impl, s + n * j, c2, n);
      TIMING_NOW (stop);
      TIMING_DIFF (cur, start, stop);
      TIMING_ACCUM (latency, cur);
    }

  json_element_double (json_ctx, (double) latency / (double) iters);
}

> > +      CALL (impl, s, c2, n);
> > +    }
> > +  TIMING_NOW (stop);
> > +
> > +  TIMING_DIFF (cur, start, stop);
> > +
> > +  json_element_double (json_ctx, (double) cur / (double) iters);
> > +}
> > +
> > +static void
> > +do_test (json_ctx_t *json_ctx, size_t align, int c1, int c2, size_t len)
> > +{
> > +  align &= 63;
> Can you make this `align &= getpagesize () - 1;`?

I'll change it.

Thanks.
Naohiro

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v2 2/5] benchtests: Add memset zero fill benchtest
  2021-07-21 13:07       ` naohirot--- via Libc-alpha
@ 2021-07-21 18:14         ` Noah Goldstein via Libc-alpha
  2021-07-21 19:17           ` Wilco Dijkstra via Libc-alpha
  0 siblings, 1 reply; 83+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2021-07-21 18:14 UTC (permalink / raw)
  To: naohirot@fujitsu.com; +Cc: GNU C Library, Wilco Dijkstra

On Wed, Jul 21, 2021 at 9:07 AM naohirot@fujitsu.com <naohirot@fujitsu.com>
wrote:

> Hi Noah,
>
> One typo in the updated code.
> Wrong:
>   #define START_SIZE (16 * 1024)
> Right:
>   #define BUF1PAGES 16
>
> Thanks
> Naohiro
> ________________________________________
> From: Tamura, Naohiro/田村 直広 <naohirot@fujitsu.com>
> Sent: Wednesday, 21 July 2021 21:56
> To: Noah Goldstein
> Cc: Wilco Dijkstra; Lucas A. M. Magalhaes; GNU C Library
> Subject: RE: [PATCH v2 2/5] benchtests: Add memset zero fill benchtest
>
> Hi Noah,
>
> Thank you for the review.
>
> > > +#define TEST_MAIN
> > > +#define TEST_NAME "memset"
> > > +#define START_SIZE (16 * 1024)
> > > +#define MIN_PAGE_SIZE (getpagesize () + 64 * 1024 * 1024)
> > > +#define TIMEOUT (20 * 60)
> > > +#include "bench-string.h"
> > > +
> > > +#include "json-lib.h"
> > > +
> > > +void *generic_memset (void *, int, size_t);
> > > +typedef void *(*proto_t) (void *, int, size_t);
> > > +
> > > +IMPL (MEMSET, 1)
> > > +IMPL (generic_memset, 0)
> > > +
> > > +static void
> > Do we want __attribute__((noinline, noclone))?
>
> Yes, I'll add it.
>
> > > +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
> > > +            int c1 __attribute ((unused)), int c2 __attribute
> ((unused)),
> > > +            size_t n)
> > > +{
> > > +  size_t i, iters = 16;
> >
> > I think 16 is probably too few iterations for reliable benchmarking.
> > Maybe `INNER_LOOP_ITERS` which is 8192
>
> I tried it. If it is changed to 8192, it hit the TIMEOUT (20 * 60) on
> a64fx.
> Please check the code below.
>
> >
> > > +  timing_t start, stop, cur;
> > > +
> > > +  TIMING_NOW (start);
> > > +  for (i = 0; i < iters; i += 2)
> > > +    {
> > > +      CALL (impl, s, c1, n);
> > I am a bit worried that the overhead from the first call with `c1` will
> distort the results.
> > Is it possible to implement it with a nested loop where you fill `s`
> with `c1` for
> > `n * inner_loop_iterations` in the outer loop and in the inner loop fill
> `c2` on `s + n * i`?
> > In that case maybe 16 for inner loop iterations and 512 for outer loop
> iterations.
>
> It seems that we have to set smaller number if this implementation is not
> wrong.
> Because it will take 99.4 minutes estimating from the case that "iters =
> 32"
> took 23.3 seconds.
> (8192/32*23.3/60=99.4)

I see. I think 16 for the inner loop makes sense. From the x86_64
perspective this
will keep the loop from running out of the LSD which is necessary for
accurate
benchmarking. I guess then somewhere between [2, 8] is reasonable for the
outer
loop?


> #define START_SIZE (16 * 1024)
> ...
> static void
> __attribute__((noinline, noclone))
> do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
>              int c1 __attribute ((unused)), int c2 __attribute ((unused)),
>              size_t n)
> {
>   size_t i, j, iters = INNER_LOOP_ITERS; // 32;
>   timing_t start, stop, cur, latency = 0;
>
>   for (i = 0; i < 512; i++) // for (i = 0; i < 2; i++)
>     {

      CALL (impl, s, c1, n * 16);
>       TIMING_NOW (start);
>       for (j = 0; j < 16; j++)
>         CALL (impl, s + n * j, c2, n);
>       TIMING_NOW (stop);
>       TIMING_DIFF (cur, start, stop);
>       TIMING_ACCUM (latency, cur);
>     }
>
This looks good. But as you said, a much smaller value for outer loop.

>
>   json_element_double (json_ctx, (double) latency / (double) iters);
> }
>
> > > +      CALL (impl, s, c2, n);
> > > +    }
> > > +  TIMING_NOW (stop);
> > > +
> > > +  TIMING_DIFF (cur, start, stop);
> > > +
> > > +  json_element_double (json_ctx, (double) cur / (double) iters);
> > > +}
> > > +
> > > +static void
> > > +do_test (json_ctx_t *json_ctx, size_t align, int c1, int c2, size_t
> len)
> > > +{
> > > +  align &= 63;
> > Can you make this `align &= getpagesize () - 1;`?
>
> I'll change it.
>
> Thanks.
> Naohiro
>

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v2 2/5] benchtests: Add memset zero fill benchtest
  2021-07-21 18:14         ` Noah Goldstein via Libc-alpha
@ 2021-07-21 19:17           ` Wilco Dijkstra via Libc-alpha
  2021-07-26  8:42             ` naohirot--- via Libc-alpha
  0 siblings, 1 reply; 83+ messages in thread
From: Wilco Dijkstra via Libc-alpha @ 2021-07-21 19:17 UTC (permalink / raw)
  To: Noah Goldstein, naohirot@fujitsu.com; +Cc: GNU C Library

Hi,

      TIMING_NOW (start);
      for (j = 0; j < 16; j++)
        CALL (impl, s + n * j, c2, n);
      TIMING_NOW (stop);

This loop is basically equivalent to CALL (impl, s, c2, n * 16), so you
might as well change the outer loop to use a larger 'n'. The accuracy
will be bad unless 'n' is really large, and there is no way to improve it.

If you want to test zero/non-zero combinations accurately, store the
memset values in an array rather than using a single constant.

Cheers,
Wilco

^ permalink raw reply	[flat|nested] 83+ messages in thread

* [PATCH] config: Remove HAVE_BUILTIN_MEMSET macro
  2021-07-13  8:22 [PATCH] benchtests: Add memset zero fill benchmark tests Naohiro Tamura via Libc-alpha
                   ` (6 preceding siblings ...)
  2021-07-20  6:37 ` [PATCH v2 5/5] benchtests: Fix validate_benchout.py exceptions Naohiro Tamura via Libc-alpha
@ 2021-07-26  8:34 ` Naohiro Tamura via Libc-alpha
  2021-07-26  8:48   ` naohirot--- via Libc-alpha
  2021-07-26  8:49   ` Andreas Schwab
  2021-07-26  8:35 ` [PATCH] benchtests: Add a script to merge two benchout string files Naohiro Tamura via Libc-alpha
  8 siblings, 2 replies; 83+ messages in thread
From: Naohiro Tamura via Libc-alpha @ 2021-07-26  8:34 UTC (permalink / raw)
  To: libc-alpha

This patch removes the HAVE_BUILTIN_MEMSET macro because GCC 6.2, which is
the minimum requirement to compile glibc, already supports
__builtin_memset()[1].

Interestingly, the removed code had a critical bug: the
HAVE_BUILTIN_MEMSET macro was never defined, because the yes/no assignment
to libc_cv_gcc_builtin_memset was reversed in configure.ac as below:

1519 if AC_TRY_COMMAND([${CC-cc} -O3 -S conftest.c -o - | grep -F "memset" > /dev/null]);
1520 then
1521   libc_cv_gcc_builtin_memset=no   # should be yes
1522 else
1523   libc_cv_gcc_builtin_memset=yes  # should be no
1524 fi
1525 rm -f conftest* ])
1526 if test "$libc_cv_gcc_builtin_memset" = yes ; then
1527   AC_DEFINE(HAVE_BUILTIN_MEMSET)
1528 fi

Therefore, __builtin_memset() in elf/rtld.c was never compiled.

 534 # ifdef HAVE_BUILTIN_MEMSET
 535   __builtin_memset (bootstrap_map.l_info, '\0', sizeof (bootstrap_map.l_info));
 536 # else

[1] https://gcc.gnu.org/onlinedocs/gcc-6.2.0/gcc/Other-Builtins.html
---
 config.h.in  |  3 ---
 configure    | 31 -------------------------------
 configure.ac | 19 -------------------
 elf/rtld.c   | 15 ++++-----------
 4 files changed, 4 insertions(+), 64 deletions(-)

diff --git a/config.h.in b/config.h.in
index 8b45a3a61d77..4647632f2632 100644
--- a/config.h.in
+++ b/config.h.in
@@ -40,9 +40,6 @@
    shared between GNU libc and GNU gettext projects.  */
 #define HAVE_BUILTIN_EXPECT 1
 
-/* Define if the compiler supports __builtin_memset.  */
-#undef	HAVE_BUILTIN_MEMSET
-
 /* Define if compiler accepts -ftree-loop-distribute-patterns.  */
 #undef  HAVE_CC_INHIBIT_LOOP_TO_LIBCALL
 
diff --git a/configure b/configure
index 9619c10991d0..6f85d28ea085 100755
--- a/configure
+++ b/configure
@@ -6261,37 +6261,6 @@ if test $libc_cv_have_section_quotes = yes; then
 
 fi
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_memset" >&5
-$as_echo_n "checking for __builtin_memset... " >&6; }
-if ${libc_cv_gcc_builtin_memset+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  cat > conftest.c <<\EOF
-void zero (void *x)
-{
-  __builtin_memset (x, 0, 1000);
-}
-EOF
-if { ac_try='${CC-cc} -O3 -S conftest.c -o - | grep -F "memset" > /dev/null'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; };
-then
-  libc_cv_gcc_builtin_memset=no
-else
-  libc_cv_gcc_builtin_memset=yes
-fi
-rm -f conftest*
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_gcc_builtin_memset" >&5
-$as_echo "$libc_cv_gcc_builtin_memset" >&6; }
-if test "$libc_cv_gcc_builtin_memset" = yes ; then
-  $as_echo "#define HAVE_BUILTIN_MEMSET 1" >>confdefs.h
-
-fi
-
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for redirection of built-in functions" >&5
 $as_echo_n "checking for redirection of built-in functions... " >&6; }
 if ${libc_cv_gcc_builtin_redirection+:} false; then :
diff --git a/configure.ac b/configure.ac
index 34ecbba54054..0c5ee6623c4c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1508,25 +1508,6 @@ if test $libc_cv_have_section_quotes = yes; then
   AC_DEFINE(HAVE_SECTION_QUOTES)
 fi
 
-AC_CACHE_CHECK(for __builtin_memset, libc_cv_gcc_builtin_memset, [dnl
-cat > conftest.c <<\EOF
-void zero (void *x)
-{
-  __builtin_memset (x, 0, 1000);
-}
-EOF
-dnl
-if AC_TRY_COMMAND([${CC-cc} -O3 -S conftest.c -o - | grep -F "memset" > /dev/null]);
-then
-  libc_cv_gcc_builtin_memset=no
-else
-  libc_cv_gcc_builtin_memset=yes
-fi
-rm -f conftest* ])
-if test "$libc_cv_gcc_builtin_memset" = yes ; then
-  AC_DEFINE(HAVE_BUILTIN_MEMSET)
-fi
-
 AC_CACHE_CHECK(for redirection of built-in functions, libc_cv_gcc_builtin_redirection, [dnl
 cat > conftest.c <<\EOF
 extern char *strstr (const char *, const char *) __asm ("my_strstr");
diff --git a/elf/rtld.c b/elf/rtld.c
index d733359eaf80..d0da99bd6d78 100644
--- a/elf/rtld.c
+++ b/elf/rtld.c
@@ -526,19 +526,12 @@ _dl_start (void *arg)
 
   /* Partly clean the `bootstrap_map' structure up.  Don't use
      `memset' since it might not be built in or inlined and we cannot
-     make function calls at this point.  Use '__builtin_memset' if we
-     know it is available.  We do not have to clear the memory if we
-     do not have to use the temporary bootstrap_map.  Global variables
-     are initialized to zero by default.  */
+     make function calls at this point.  Use '__builtin_memset' instead.
+     We do not have to clear the memory if we do not have to use the
+     temporary bootstrap_map.  Global variables are initialized to zero
+     by default.  */
 #ifndef DONT_USE_BOOTSTRAP_MAP
-# ifdef HAVE_BUILTIN_MEMSET
   __builtin_memset (bootstrap_map.l_info, '\0', sizeof (bootstrap_map.l_info));
-# else
-  for (size_t cnt = 0;
-       cnt < sizeof (bootstrap_map.l_info) / sizeof (bootstrap_map.l_info[0]);
-       ++cnt)
-    bootstrap_map.l_info[cnt] = 0;
-# endif
 #endif
 
   /* Figure out the run-time load address of the dynamic linker itself.  */
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH] benchtests: Add a script to merge two benchout string files
  2021-07-13  8:22 [PATCH] benchtests: Add memset zero fill benchmark tests Naohiro Tamura via Libc-alpha
                   ` (7 preceding siblings ...)
  2021-07-26  8:34 ` [PATCH] config: Remove HAVE_BUILTIN_MEMSET macro Naohiro Tamura via Libc-alpha
@ 2021-07-26  8:35 ` Naohiro Tamura via Libc-alpha
  2021-07-27 20:51   ` Joseph Myers
  8 siblings, 1 reply; 83+ messages in thread
From: Naohiro Tamura via Libc-alpha @ 2021-07-26  8:35 UTC (permalink / raw)
  To: libc-alpha

This patch adds a script to merge two benchout string files into one
in terms of an ifunc in order to create a comparison graph.

Usage: merge_strings4graph.sh ifunc_name graph_tag1 graph_tag2
  read two benchout string files from standard input
  write merged benchout string file to standard output

ex:

$ cat master/bench-memset.out patch/bench-memset.out | \
> merge_strings4graph.sh __memset_generic master patch | \
> plot_strings.py -l -p thru -v -
---
 benchtests/scripts/merge_strings4graph.sh | 57 +++++++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100755 benchtests/scripts/merge_strings4graph.sh

diff --git a/benchtests/scripts/merge_strings4graph.sh b/benchtests/scripts/merge_strings4graph.sh
new file mode 100755
index 000000000000..ac38b6327f01
--- /dev/null
+++ b/benchtests/scripts/merge_strings4graph.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+# Copyright (C) 2021 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+
+#
+# Merge two benchout string files into one in terms of an ifunc
+# in order to create a comparison graph
+#
+if [[ $1 == "-h" ]] || [[ $# != 3 ]]; then
+  echo "Usage: ${0##*/} ifunc_name graph_tag1 graph_tag2"
+  echo "  read two benchout string files from standard input"
+  echo "  write merged benchout string file to standard output"
+  echo "ex:"
+  echo "  $ cat bench-memset-first.out bench-memset-second.out | \\
+  > ${0##*/} __memset_generic graph_tag1 graph_tag2 | \\
+  > plot_strings.py -l -p thru -v -"
+exit 1
+fi
+
+jq -rs --arg ifunc_name $1 --arg graph_tag1 $2 --arg graph_tag2 $3 '
+. as $root |
+.[0] as $first |
+$first.functions.memset.ifuncs |
+  length as $ifuncs_len |
+  index($ifunc_name) as $ifunc_index |
+$root |
+  del(.[].functions.memset.results[].timings[$ifunc_index+1:$ifuncs_len]) |
+  del(.[].functions.memset.results[].timings[0:$ifunc_index]) | 
+  [.[].functions.memset.results] | transpose as $pair |
+$pair |
+  reduce range(0; $pair|length) as $i (
+    []; . + [$pair[$i][0].timings+$pair[$i][1].timings]
+  ) | . as $newtimings |
+  reduce range(0; $pair|length) as $j (
+    []; . + [{"length":$first.functions.memset.results[$j].length,
+              "timings":$newtimings[$j]}]
+  ) | . as $newresults |
+$first |
+  .functions.memset."bench-variant"+="-"+$graph_tag1+"-"+$graph_tag2 |
+  .functions.memset.ifuncs=[$ifunc_name+"-"+$graph_tag1,$ifunc_name+"-"+$graph_tag2] |
+  .functions.memset.results=$newresults
+'
+
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 83+ messages in thread

* Re: [PATCH v2 2/5] benchtests: Add memset zero fill benchtest
  2021-07-20 16:48   ` Noah Goldstein via Libc-alpha
  2021-07-21 12:56     ` naohirot--- via Libc-alpha
@ 2021-07-26  8:39     ` naohirot--- via Libc-alpha
  2021-07-26 17:22       ` Noah Goldstein via Libc-alpha
  1 sibling, 1 reply; 83+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-07-26  8:39 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Wilco Dijkstra

Hi Noah,

> I see. I think 16 for the inner loop makes sense. From the x86_64
> perspective this
> will keep the loop from running out of the LSD which is necessary for
> accurate
> benchmarking. I guess then somewhere between [2, 8] is reasonable for the
> outer
> loop?
> 
> 
> > #define START_SIZE (16 * 1024)
> > ...
> > static void
> > __attribute__((noinline, noclone))
> > do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
> >              int c1 __attribute ((unused)), int c2 __attribute ((unused)),
> >              size_t n)
> > {
> >   size_t i, j, iters = INNER_LOOP_ITERS; // 32;
> >   timing_t start, stop, cur, latency = 0;
> >
> >   for (i = 0; i < 512; i++) // for (i = 0; i < 2; i++)
> >     {
> >
> >       CALL (impl, s, c1, n * 16);
> >       TIMING_NOW (start);
> >       for (j = 0; j < 16; j++)
> >         CALL (impl, s + n * j, c2, n);
> >       TIMING_NOW (stop);
> >       TIMING_DIFF (cur, start, stop);
> >       TIMING_ACCUM (latency, cur);
> >     }
> >
> This looks good. But as you said, a much smaller value for outer loop.

I made one improvement, replacing
  CALL (impl, s, c1, n * 16);
with
  __builtin_memset (s, c1, n * 16);
and tentatively chose two iterations for the outer loop, as follows:

-----
static void
__attribute__((noinline, noclone))
do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
             int c1 __attribute ((unused)), int c2 __attribute ((unused)),
             size_t n)
{
  size_t i, j, iters = 32;
  timing_t start, stop, cur, latency = 0;

  for (i = 0; i < 2; i++)
    {
      __builtin_memset (s, c1, n * 16);
      TIMING_NOW (start);
      for (j = 0; j < 16; j++)
        CALL (impl, s + n * j, c2, n);
      TIMING_NOW (stop);
      TIMING_DIFF (cur, start, stop);
      TIMING_ACCUM (latency, cur);
    }

  json_element_double (json_ctx, (double) latency / (double) iters);
}
-----

In case of __memset_generic on a64fx, execution of outer loop 8times
and 2times took as follows:

8times
real    0m26.236s
user    0m18.806s
sys     0m6.562s

2times
real    0m12.956s
user    0m5.081s
sys     0m6.594s

The performance difference is shown in a comparison graph [1],
there is a difference at 16KB.
This difference would not be critical if we use the performance data
mainly to compare "before" with "after" such as master version of
memset with patched version of memset.


This graph[1] can be drawn as the following:

$ cat 2times/bench-memset-zerofill.out 8times/bench-memset-zerofill.out | \
> merge_strings4graph.sh __memset_generic 2times 8times | \
> plot_strings.py -l -p thru -v -


In order to use __builtin_memset() and create the comparison graph [1],
I submitted two ground work patches [2][3].

[1] https://drive.google.com/file/d/1vD1VE3pdHLoYdaAMWXtImvDlGFDHYkyx/view?usp=sharing
[2] https://sourceware.org/pipermail/libc-alpha/2021-July/129459.html
[3] https://sourceware.org/pipermail/libc-alpha/2021-July/129460.html

Thanks.
Naohiro

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v2 2/5] benchtests: Add memset zero fill benchtest
  2021-07-21 19:17           ` Wilco Dijkstra via Libc-alpha
@ 2021-07-26  8:42             ` naohirot--- via Libc-alpha
  2021-07-26 11:15               ` Wilco Dijkstra via Libc-alpha
  0 siblings, 1 reply; 83+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-07-26  8:42 UTC (permalink / raw)
  To: Wilco Dijkstra, Noah Goldstein; +Cc: GNU C Library

Hi Wilco,

> 
>       TIMING_NOW (start);
>       for (j = 0; j < 16; j++)
>         CALL (impl, s + n * j, c2, n);
>       TIMING_NOW (stop);
>  
> This loop is basically equivalent to CALL (impl, s, c2, n * 16), so you

Yes, but the number of function calls is different: 1 call versus
16 calls.

> might as well change the outer loop to use a larger 'n'. The accuracy
> will be bad unless 'n' is really large, and there is no way to improve it.

Umm I couldn't understand the logic of this part.
How do we change the outer loop to use a larger 'n'?

> If you want to test zero/non-zero combinations accurately, store the
> memset values in an array rather than using a single constant.

I changed one line as shown in the mail [1]
from
  CALL (impl, s, c1, n * 16);
to
  __builtin_memset (s, c1, n * 16);

Is this the array you mentioned?

[1] https://sourceware.org/pipermail/libc-alpha/2021-July/129461.html

Thanks.
Naohiro

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH] config: Remove HAVE_BUILTIN_MEMSET macro
  2021-07-26  8:34 ` [PATCH] config: Remove HAVE_BUILTIN_MEMSET macro Naohiro Tamura via Libc-alpha
@ 2021-07-26  8:48   ` naohirot--- via Libc-alpha
  2021-07-26  8:49   ` Andreas Schwab
  1 sibling, 0 replies; 83+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-07-26  8:48 UTC (permalink / raw)
  To: libc-alpha@sourceware.org

I'll fix the typo

> s patch removed HAVE_BUILTIN_MEMSET macro because GCC 6.2 that is

s/s/This/

> minimum requirement to compile glibc already support
>__builtin_memset()[1].

Thanks.
Naohiro

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH] config: Remove HAVE_BUILTIN_MEMSET macro
  2021-07-26  8:34 ` [PATCH] config: Remove HAVE_BUILTIN_MEMSET macro Naohiro Tamura via Libc-alpha
  2021-07-26  8:48   ` naohirot--- via Libc-alpha
@ 2021-07-26  8:49   ` Andreas Schwab
  2021-07-26  9:42     ` naohirot--- via Libc-alpha
  1 sibling, 1 reply; 83+ messages in thread
From: Andreas Schwab @ 2021-07-26  8:49 UTC (permalink / raw)
  To: Naohiro Tamura via Libc-alpha

On Jul 26 2021, Naohiro Tamura via Libc-alpha wrote:

> Interestingly, removed code had a critical bug that is
> HAVE_BUILTIN_MEMSET macro never be defined, because yes/no assignment
> to libc_cv_gcc_builtin_memset was reversed in configure.ac as below:

No, the point of the check is that __builtin_memset does *not* expand to
a memset libcall, as explained in the comment you removed.

Andreas.

-- 
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510  2552 DF73 E780 A9DA AEC1
"And now for something completely different."

^ permalink raw reply	[flat|nested] 83+ messages in thread

* RE: [PATCH] config: Remove HAVE_BUILTIN_MEMSET macro
  2021-07-26  8:49   ` Andreas Schwab
@ 2021-07-26  9:42     ` naohirot--- via Libc-alpha
  2021-07-26  9:51       ` Andreas Schwab
  0 siblings, 1 reply; 83+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-07-26  9:42 UTC (permalink / raw)
  To: 'Andreas Schwab', Naohiro Tamura via Libc-alpha

Hi Andreas,

Thank you for the comment.

> From: Andreas Schwab <schwab@linux-m68k.org>

> On Jul 26 2021, Naohiro Tamura via Libc-alpha wrote:
> 
> > Interestingly, removed code had a critical bug that is
> > HAVE_BUILTIN_MEMSET macro never be defined, because yes/no assignment
> > to libc_cv_gcc_builtin_memset was reversed in configure.ac as below:
> 
> No, the point of the check is that __buildin_memset does *not* expand to
> a memset libcall, as explained in the comment you removed.
> 

Is the comment you mentioned below?

   /* Partly clean the `bootstrap_map' structure up.  Don't use
      `memset' since it might not be built in or inlined and we cannot
-     make function calls at this point.  Use '__builtin_memset' if we
-     know it is available.  We do not have to clear the memory if we
-     do not have to use the temporary bootstrap_map.  Global variables
-     are initialized to zero by default.  */
+     make function calls at this point.  Use '__builtin_memset' instead.
+     We do not have to clear the memory if we do not have to use the
+     temporary bootstrap_map.  Global variables are initialized to zero
+     by default.  */

Do you mean that yes/no assignment to libc_cv_gcc_builtin_memset was NOT reversed?
If it was not reversed, config.h never had " #define HAVE_BUILTIN_MEMSET 1" 
even if gcc 8.3 is used.

Thanks.
Naohiro


^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH] config: Remove HAVE_BUILTIN_MEMSET macro
  2021-07-26  9:42     ` naohirot--- via Libc-alpha
@ 2021-07-26  9:51       ` Andreas Schwab
  2021-07-26 13:16         ` naohirot--- via Libc-alpha
  0 siblings, 1 reply; 83+ messages in thread
From: Andreas Schwab @ 2021-07-26  9:51 UTC (permalink / raw)
  To: naohirot; +Cc: Naohiro Tamura via Libc-alpha

On Jul 26 2021, naohirot@fujitsu.com wrote:

> If it was not reversed, config.h never had " #define HAVE_BUILTIN_MEMSET 1" 
> even if gcc 8.3 is used.

This is correct if the builtin just expands to a memset libcall.

Andreas.

-- 
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510  2552 DF73 E780 A9DA AEC1
"And now for something completely different."

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v2 2/5] benchtests: Add memset zero fill benchtest
  2021-07-26  8:42             ` naohirot--- via Libc-alpha
@ 2021-07-26 11:15               ` Wilco Dijkstra via Libc-alpha
  2021-07-27  2:24                 ` naohirot--- via Libc-alpha
  0 siblings, 1 reply; 83+ messages in thread
From: Wilco Dijkstra via Libc-alpha @ 2021-07-26 11:15 UTC (permalink / raw)
  To: naohirot@fujitsu.com, Noah Goldstein; +Cc: GNU C Library

Hi Naohiro,

>> This loop is basically equivalent to CALL (impl, s, c2, n * 16), so you
>
> Yes, but the number of function call is different between 1 time and
> 16 times.

The call overhead is not an issue unless 'n' is really small. The point is that
the loop writes 16 * n bytes, so you're really testing 16n rather than n.

>> might as well change the outer loop to use a larger 'n'. The accuracy
>> will be bad unless 'n' is really large, and there is no way to improve it.
>
> Umm I couldn't understand the logic of this part.
> How do we change the the outer loop to use a larger 'n'?

By removing the * 16 from the inner loop and adding it to the outer loop.
That avoids the confusion that we are testing size 'n' when we are really
testing n*16.

>  CALL (impl, s, c1, n * 16);
> to
>   __builtin_memset (s, c1, n * 16);
>
> Is this the array you mentioned?

That doesn't make any sense since that will just call memset and use the
default ifunc for memset.

What I mean is something trivial like: CALL (impl, s, memset_array[i & 15], n);
This way you can test any kind of pattern (like all zero, all one, and combinations
with varying number of zero->non-zero and non-zero->zero transitions).

Cheers,
Wilco

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH] config: Remove HAVE_BUILTIN_MEMSET macro
  2021-07-26  9:51       ` Andreas Schwab
@ 2021-07-26 13:16         ` naohirot--- via Libc-alpha
  0 siblings, 0 replies; 83+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-07-26 13:16 UTC (permalink / raw)
  To: Andreas Schwab; +Cc: Naohiro Tamura via Libc-alpha

Hi Andreas,

Thanks for the explanation.

> From: Andreas Schwab <schwab@linux-m68k.org>

> > If it was not reversed, config.h never had " #define HAVE_BUILTIN_MEMSET 1"
> > even if gcc 8.3 is used.
>
> This is correct if the builtin just expands to a memset libcall.

Now I understood that by looking at the grep argument again in the line below:

1519 if AC_TRY_COMMAND([${CC-cc} -O3 -S conftest.c -o - | grep -F "memset" > /dev/null]);

So the macro name "HAVE_BUILTIN_MEMSET" is very confusing,
since GCC 6.2 manual mentions that __builtin_memset is supported.
Can we agree to change it to "HAVE_NON_LIB_EXPAND_BUILTIN_MEMSET"  or "HAVE_NON_LIBCALL_BUILTIN_MEMSET"?

Thanks.
Naohiro

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v2 2/5] benchtests: Add memset zero fill benchtest
  2021-07-26  8:39     ` naohirot--- via Libc-alpha
@ 2021-07-26 17:22       ` Noah Goldstein via Libc-alpha
  0 siblings, 0 replies; 83+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2021-07-26 17:22 UTC (permalink / raw)
  To: naohirot@fujitsu.com; +Cc: GNU C Library, Wilco Dijkstra

On Mon, Jul 26, 2021 at 4:39 AM naohirot@fujitsu.com <naohirot@fujitsu.com>
wrote:

> Hi Noah,
>
> > I see. I think 16 for the inner loop makes sense. From the x86_64
> > perspective this
> > will keep the loop from running out of the LSD which is necessary for
> > accurate
> > benchmarking. I guess then somewhere between [2, 8] is reasonable for the
> > outer
> > loop?
> >
> >
> > > #define START_SIZE (16 * 1024)
> > > ...
> > > static void
> > > __attribute__((noinline, noclone))
> > > do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
> > >              int c1 __attribute ((unused)), int c2 __attribute
> ((unused)),
> > >              size_t n)
> > > {
> > >   size_t i, j, iters = INNER_LOOP_ITERS; // 32;
> > >   timing_t start, stop, cur, latency = 0;
> > >
> > >   for (i = 0; i < 512; i++) // for (i = 0; i < 2; i++)
> > >     {
> > >
> > >       CALL (impl, s, c1, n * 16);
> > >       TIMING_NOW (start);
> > >       for (j = 0; j < 16; j++)
> > >         CALL (impl, s + n * j, c2, n);
> > >       TIMING_NOW (stop);
> > >       TIMING_DIFF (cur, start, stop);
> > >       TIMING_ACCUM (latency, cur);
> > >     }
> > >
> > This looks good. But as you said, a much smaller value for outer loop.
>
> I made one improvement that replaced
>   CALL (impl, s, c1, n * 16);
> to
>   __builtin_memset (s, c1, n * 16);
> and tentatively chose outer loop two times such as the followings:
>
> -----
> static void
> __attribute__((noinline, noclone))
> do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
>              int c1 __attribute ((unused)), int c2 __attribute ((unused)),
>              size_t n)
> {
>   size_t i, j, iters = 32;
>   timing_t start, stop, cur, latency = 0;
>
>   for (i = 0; i < 2; i++)
>     {
>       __builtin_memset (s, c1, n * 16);
>       TIMING_NOW (start);
>       for (j = 0; j < 16; j++)
>         CALL (impl, s + n * j, c2, n);
>       TIMING_NOW (stop);
>       TIMING_DIFF (cur, start, stop);
>       TIMING_ACCUM (latency, cur);
>     }
>
>   json_element_double (json_ctx, (double) latency / (double) iters);
> }
>

Looks good!

> -----
>
In case of __memset_generic on a64fx, execution of outer loop 8times
> and 2times took as follows:
>
> 8times
> real    0m26.236s
> user    0m18.806s
> sys     0m6.562s
>
> 2times
> real    0m12.956s
> user    0m5.081s
> sys     0m6.594s
>
> The performance difference is shown in a comparison graph [1],
> there is a difference at 16KB.
> This difference would not be critical if we use the performance data
> mainly to compare "before" with "after" such as master version of
> memset with patched version of memset.
>
>
> This graph[1] can be drawn as the following:
>
> $ cat 2times/bench-memset-zerofill.out 8times/bench-memset-zerofill.out | \
> > merge_strings4graph.sh __memset_generic 2times 8times | \
> > plot_strings.py -l -p thru -v -
>
>
> In order to use __builtin_memset() and create the comparison graph [1],
> I submitted two ground work patches [2][3].
>
> [1]
> https://drive.google.com/file/d/1vD1VE3pdHLoYdaAMWXtImvDlGFDHYkyx/view?usp=sharing
> [2] https://sourceware.org/pipermail/libc-alpha/2021-July/129459.html
> [3] https://sourceware.org/pipermail/libc-alpha/2021-July/129460.html
>
> Thanks.
> Naohiro
>

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v2 2/5] benchtests: Add memset zero fill benchtest
  2021-07-26 11:15               ` Wilco Dijkstra via Libc-alpha
@ 2021-07-27  2:24                 ` naohirot--- via Libc-alpha
  2021-07-27 17:26                   ` Wilco Dijkstra via Libc-alpha
  0 siblings, 1 reply; 83+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-07-27  2:24 UTC (permalink / raw)
  To: Wilco Dijkstra, Noah Goldstein; +Cc: GNU C Library

Hi Wilco,

Thank you for the explanation.

> >> might as well change the outer loop to use a larger 'n'. The accuracy
> >> will be bad unless 'n' is really large, and there is no way to improve it.
> >
> > Umm I couldn't understand the logic of this part.
> > How do we change the the outer loop to use a larger 'n'?
> 
> By removing the * 16 from the inner loop and adding it to the outer loop.
> That avoids the confusion that we are testing size 'n' when we are really
> testing n*16.

There may be a miscommunication.
The * 16 is already in the outer loop (1).
Let me copy the code from the mail [1] that I referred to in the previous mail [2].

-----
static void
__attribute__((noinline, noclone))
do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
             int c1 __attribute ((unused)), int c2 __attribute ((unused)),
             size_t n)
{
  size_t i, j, iters = 32;
  timing_t start, stop, cur, latency = 0;

  for (i = 0; i < 2; i++)
    {
      __builtin_memset (s, c1, n * 16);  // (1)
      TIMING_NOW (start);
      for (j = 0; j < 16; j++)
        CALL (impl, s + n * j, c2, n);
      TIMING_NOW (stop);
      TIMING_DIFF (cur, start, stop);
      TIMING_ACCUM (latency, cur);
    }

  json_element_double (json_ctx, (double) latency / (double) iters);
}
-----

[1] https://sourceware.org/pipermail/libc-alpha/2021-July/129461.html
[2] https://sourceware.org/pipermail/libc-alpha/2021-July/129462.html

BTW, are you thinking the code like this?
But it must be not, because there is no inner and outer loops.

  for (i = 0; i < 16; i++)
    {
      __builtin_memset (s, c1, n);

      TIMING_NOW (start);
      CALL (impl, s, c2, n);
      TIMING_NOW (stop);
      TIMING_DIFF (cur, start, stop);
      TIMING_ACCUM (latency, cur);
    }

> 
> >  CALL (impl, s, c1, n * 16);
> > to
> >   __builtin_memset (s, c1, n * 16);
> >
> > Is this the array you mentioned?
> 
> That doesn't make any sense since that will just call memset and use the
> default ifunc for memset.

This is my intention, because "CALL (impl, s, c1, n * 16);" is not
measured; it is outside of "TIMING_NOW (start);" and "TIMING_NOW (stop);".
It doesn't matter what kind of memset is called, but the function name
in the code matters so that we can understand it is not measured.

> 
> What I mean is something trivial like: CALL (impl, s, memset_array[i & 15], n);
> This way you can test any kind of pattern (like all zero, all one, and combinations
> with varying number of zero->non-zero and non-zero->zero transitions).

I understood, thanks.
Why don't we separate it into another patch if it really matters?

From AArch64 point of view, the purpose of this bench is to measure
"DC ZVA" performance. So non-zero value can be any value except zero.
Do we have any specific reason to vary the non-zero value?

Thanks.
Naohiro

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v2 2/5] benchtests: Add memset zero fill benchtest
  2021-07-27  2:24                 ` naohirot--- via Libc-alpha
@ 2021-07-27 17:26                   ` Wilco Dijkstra via Libc-alpha
  2021-07-28  7:27                     ` naohirot--- via Libc-alpha
  0 siblings, 1 reply; 83+ messages in thread
From: Wilco Dijkstra via Libc-alpha @ 2021-07-27 17:26 UTC (permalink / raw)
  To: naohirot@fujitsu.com, Noah Goldstein; +Cc: GNU C Library

Hi Naohiro,

> There may be miscomminuation.
> The * 16 is already in the outer loop (1).

The outer loop is in test_main, and it determines 'n' in do_one_test:

  for (i = ...)
    {
      do_test (&json_ctx, 0, c, i);
    }

> Let me copy the code from the mail [1] I put in the previouse mail [2].

The key issue is that this loop:

      for (j = 0; j < 16; j++)
        CALL (impl, s + n * j, c2, n);

is equivalent to:

CALL (impl, s, c2, n * 16);

The loop we really want is something like bench-memset-large:

  CALL (impl, s, c, n);
  TIMING_NOW (start);
  for (i = 0; i < iters; ++i)
    {
      CALL (impl, s, c, n);
    }
  TIMING_NOW (stop);

This repeats CALL on data of size 'n' after an initial warmup of the caches.

> It doesn't matter what kind of memset is called, but matters the
> function name in the code so that we can understand it is not mesured.

Then using the standard name 'memset' would be best.

>> What I mean is something trivial like: CALL (impl, s, memset_array[i & 15], n);
>> This way you can test any kind of pattern (like all zero, all one, and combinations
>> with varying number of zero->non-zero and non-zero->zero transitions).
>
> I understood, thanks.
> Why don't we separate it to another patch if it is really matter?

I don't think it matters, however I thought that is what your loops try to
measure? If not, then why not use the loop from bench-memset-large?

> From AArch64 point of view, the purpose of this bench is to measure
> "DC ZVA" performance. So non-zero value can be any value except zero.
> Do we have any specific reason to vary the non-zero value?

Well if that is the goal then bench-memset-large can measure zero performance
with minor changes. If you don't need to do anything completely different then
the existing code is good enough.

Cheers,
Wilco

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v2 3/5] benchtests: Add a script to convert benchout string JSON to CSV
  2021-07-20  6:35 ` [PATCH v2 3/5] benchtests: Add a script to convert benchout string JSON to CSV Naohiro Tamura via Libc-alpha
  2021-07-21  2:41   ` naohirot--- via Libc-alpha
@ 2021-07-27 20:17   ` Joseph Myers
  2021-07-29  1:56     ` naohirot--- via Libc-alpha
  1 sibling, 1 reply; 83+ messages in thread
From: Joseph Myers @ 2021-07-27 20:17 UTC (permalink / raw)
  To: Naohiro Tamura; +Cc: libc-alpha, Wilco Dijkstra

On Tue, 20 Jul 2021, Naohiro Tamura via Libc-alpha wrote:

> +jq -r '

I don't think introducing a use of a new tool like that (not mentioned in 
install.texi) is a particularly good idea.  I'd suggest implementing this 
conversion in Python, given that the Python standard library supports both 
JSON and CSV and is already used for various purposes in glibc scripts.

-- 
Joseph S. Myers
joseph@codesourcery.com

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH] benchtests: Add a script to merge two benchout string files
  2021-07-26  8:35 ` [PATCH] benchtests: Add a script to merge two benchout string files Naohiro Tamura via Libc-alpha
@ 2021-07-27 20:51   ` Joseph Myers
  2021-07-30  7:04     ` naohirot--- via Libc-alpha
  0 siblings, 1 reply; 83+ messages in thread
From: Joseph Myers @ 2021-07-27 20:51 UTC (permalink / raw)
  To: Naohiro Tamura; +Cc: libc-alpha

On Mon, 26 Jul 2021, Naohiro Tamura via Libc-alpha wrote:

> +jq -rs --arg ifunc_name $1 --arg graph_tag1 $2 --arg graph_tag2 $3 '

My comments about avoiding introducing use of a new tool apply here as 
well.  New dependencies for glibc scripts ought to be mentioned in 
install.texi (with some indication of when they are needed), but it seems 
better in this case just to use Python (which is what we're tending to 
consolidate on for miscellaneous glibc scripts, unless very simple in some 
other language).

-- 
Joseph S. Myers
joseph@codesourcery.com

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v2 2/5] benchtests: Add memset zero fill benchtest
  2021-07-27 17:26                   ` Wilco Dijkstra via Libc-alpha
@ 2021-07-28  7:27                     ` naohirot--- via Libc-alpha
  2021-08-04  9:11                       ` naohirot--- via Libc-alpha
  0 siblings, 1 reply; 83+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-07-28  7:27 UTC (permalink / raw)
  To: Wilco Dijkstra, Noah Goldstein; +Cc: GNU C Library

Hi Wilco, Noah,

> > There may be miscomminuation.
> > The * 16 is already in the outer loop (1).
> 
> The outer loop is in test_main, and it determines 'n' in do_one_test:
> 
>   for (i = ...)
>     {
>       do_test (&json_ctx, 0, c, i);
>     }
> 
> > Let me copy the code from the mail [1] I put in the previouse mail [2].
> 
> The key issue is that this loop:
> 
>       for (j = 0; j < 16; j++)
>         CALL (impl, s + n * j, c2, n);
> 
> is equivalent to:
> 
> CALL (impl, s, c2, n * 16);
> 
> The loop we really want is something like bench-memset-large:
> 
>   CALL (impl, s, c, n);
>   TIMING_NOW (start);
>   for (i = 0; i < iters; ++i)
>     {
>       CALL (impl, s, c, n);
>     }
>   TIMING_NOW (stop);
> 
> This repeats CALL on data of size 'n' after an initial warmup of the caches.
> 
> > It doesn't matter what kind of memset is called, but matters the
> > function name in the code so that we can understand it is not mesured.
> 
> Then using the standard name 'memset' would be best.
> 

OK, I understood, thanks.

Taking Noah's comment [1] into account, the final code should be like
the below. Can we agree with this code?

Two results, two loop version in the mail [1] and one loop version
below, are almost same in case of __memset_generic on a64fx as
shown in the graph [2].

-----
static void
__attribute__((noinline, noclone))
do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
             int c1 __attribute ((unused)), int c2 __attribute ((unused)),
             size_t n)
{
  size_t i, iters = 32;
  timing_t start, stop, cur, latency = 0;

  CALL (impl, s, c2, n); // warm up

  for (i = 0; i < iters; i++)
    {
      memset (s, c1, n); // alternation

      TIMING_NOW (start);

      CALL (impl, s, c2, n);

      TIMING_NOW (stop);
      TIMING_DIFF (cur, start, stop);
      TIMING_ACCUM (latency, cur);
    }

  json_element_double (json_ctx, (double) latency / (double) iters);
}
-----

[1] https://sourceware.org/pipermail/libc-alpha/2021-July/129486.html
[2] https://drive.google.com/file/d/1bptHqg5vvFAGoYgoR3w_pvclXFSP8Sr0/view?usp=sharing

Thanks.
Naohiro

^ permalink raw reply	[flat|nested] 83+ messages in thread

* RE: [PATCH v2 3/5] benchtests: Add a script to convert benchout string JSON to CSV
  2021-07-27 20:17   ` Joseph Myers
@ 2021-07-29  1:56     ` naohirot--- via Libc-alpha
  2021-07-29  4:42       ` Siddhesh Poyarekar
  0 siblings, 1 reply; 83+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-07-29  1:56 UTC (permalink / raw)
  To: 'Joseph Myers'; +Cc: libc-alpha@sourceware.org, Wilco Dijkstra

Hi Joseph,

Thanks for the comment.

> > +jq -r '
> 
> I don't think introducing a use of a new tool like that (not mentioned in
> install.texi) is a particularly good idea.  I'd suggest implementing this
> conversion in Python, given that the Python standard library supports both
> JSON and CSV and is already used for various purposes in glibc scripts.

I'm having a hard time analyzing string benchmark results.
'jq' was chosen just to get my job done quickly, because it's more
natural for me to process JSON with jq than with Python.

I believe that most people who have tried to improve string ifuncs may
have developed similar tools locally and not shared them.
And those people must be in the same situation, which doesn't allow them
to spend time porting the tools to another language or tidying them up
to be useful for other people, because it's not their primary job but a
side job.

It would be nice if we could stop each developer from developing
similar tools again and again.

So is there any possibility that these trivial tools could be accepted
and shared if install.texi is updated?

Thanks
Naohiro

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v2 3/5] benchtests: Add a script to convert benchout string JSON to CSV
  2021-07-29  1:56     ` naohirot--- via Libc-alpha
@ 2021-07-29  4:42       ` Siddhesh Poyarekar
  2021-07-30  7:05         ` naohirot--- via Libc-alpha
  0 siblings, 1 reply; 83+ messages in thread
From: Siddhesh Poyarekar @ 2021-07-29  4:42 UTC (permalink / raw)
  To: naohirot@fujitsu.com, 'Joseph Myers'
  Cc: libc-alpha@sourceware.org, Wilco Dijkstra

On 7/29/21 7:26 AM, naohirot--- via Libc-alpha wrote:
> I'm having a hard time to analyze string benchmark results.
> 'jq' was chosen to just get my job done quickly, because it's natural
> for me to process JSON than python.
> 
> I believe that most of people who tried to improve string ifunc may
> have developed similar tools in their local, and not shared.
> And those people must be in the same situation which doesn't allow to
> spend time to port it to another language or sort out to be useful for
> other people because it's not their primary job, but side way job.
> 
> It would be nice if we can stop that each developer develops similar
> tools again and again.

Most people in the community who work in string function improvements  
tend to use (and improve wherever it is lacking)  
benchtests/scripts/compare_strings.py for their result analysis.  Adding  
a flag to dump csv to that script ought to be trivial if that's what you  
need.

The script is under-documented though, so perhaps a wiki page describing  
what the script does and various example uses would go a very long way.

> So is there any possibility to be accepted to be able to share these
> trivial tools if install.texi is updated?

The reason for emitting json is precisely to allow developers to  
implement their own analysis tools around them when their use cases are  
niche.  Your specific use case is not niche and could be added as a flag  
to compare_strings.py if needed.  You only need a new flag --csv (or -o  
csv, tab, etc.) to print in csv instead of the current output, which is  
meant for reading on the terminal.

Siddhesh

^ permalink raw reply	[flat|nested] 83+ messages in thread

* RE: [PATCH] benchtests: Add a script to merge two benchout string files
  2021-07-27 20:51   ` Joseph Myers
@ 2021-07-30  7:04     ` naohirot--- via Libc-alpha
  0 siblings, 0 replies; 83+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-07-30  7:04 UTC (permalink / raw)
  To: 'Joseph Myers'; +Cc: libc-alpha@sourceware.org

Hi Joseph,

Thank you for the comment.

> > +jq -rs --arg ifunc_name $1 --arg graph_tag1 $2 --arg graph_tag2 $3 '
> 
> My comments about avoiding introducing use of a new tool apply here as
> well.  New dependencies for glibc scripts ought to be mentioned in
> install.texi (with some indication of when they are needed), but it seems
> better in this case just to use Python (which is what we're tending to
> consolidate on for miscellaneous glibc scripts, unless very simple in some
> other language).

OK, I understood.
Proliferation of trivial tools in different languages must be a problem for the community.

Best regards,
Naohiro


^ permalink raw reply	[flat|nested] 83+ messages in thread

* RE: [PATCH v2 3/5] benchtests: Add a script to convert benchout string JSON to CSV
  2021-07-29  4:42       ` Siddhesh Poyarekar
@ 2021-07-30  7:05         ` naohirot--- via Libc-alpha
  2021-07-31 10:47           ` Siddhesh Poyarekar
  0 siblings, 1 reply; 83+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-07-30  7:05 UTC (permalink / raw)
  To: 'Siddhesh Poyarekar', 'Joseph Myers'
  Cc: libc-alpha@sourceware.org, Wilco Dijkstra

Hi Siddhesh,

Thank you for the advice!

> Most people in the community who work in string function improvements
> tend to use (and improve wherever it is lacking)
> benchtests/scripts/compare_strings.py for their result analysis.  Adding
> a flag to dump csv to that script ought to be trivial if that's what you
> need.

I see. I didn't use compare_strings.py daily, but plot_strings.py.

> The script is under-documented though, so perhaps a wiki page describing
> what the script does and various example uses would go a very long way.

I found the wiki page.
https://sourceware.org/glibc/wiki/benchmarking/benchmarks

> The reason for emitting json is precisely to allow developers to
> implement their own analysis tools around them when their use cases are
> niche.  Your specific use case is not niche and could be added as a flag
> to compare_strings.py if needed.  You only need a new flag --csv (or -o
> csv, tab, etc.) to print in csv instead of the current output, which is
> meant for reading on the terminal.

Yes, converting to CSV in Python will be easy.
But comparing two string benchout results directly between "before" and "after"
was not so easy AFAIK.
And creating graphs in a spreadsheet manually is tolerable a few times, but not
when done frequently.
That's the reason I created another 'jq' script, "merge_strings4graph.sh".
Do most people compare the two results indirectly through a common base ifunc
using the "--base" option of compare_strings.py or the "--baseline" option of plot_strings.py?

Thanks.
Naohiro

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v2 3/5] benchtests: Add a script to convert benchout string JSON to CSV
  2021-07-30  7:05         ` naohirot--- via Libc-alpha
@ 2021-07-31 10:47           ` Siddhesh Poyarekar
  0 siblings, 0 replies; 83+ messages in thread
From: Siddhesh Poyarekar @ 2021-07-31 10:47 UTC (permalink / raw)
  To: naohirot@fujitsu.com, 'Joseph Myers'
  Cc: libc-alpha@sourceware.org, Wilco Dijkstra

On 7/30/21 12:35 PM, naohirot@fujitsu.com wrote:
> Hi Siddhesh,
> 
> Thank you for the advice!
> 
>> Most people in the community who work in string function improvements
>> tend to use (and improve wherever it is lacking)
>> benchtests/scripts/compare_strings.py for their result analysis.  Adding
>> a flag to dump csv to that script ought to be trivial if that's what you
>> need.
> 
> I see. I didn't use compare_strings.py daily, but plot_strings.py.
> 
>> The script is under-documented though, so perhaps a wiki page describing
>> what the script does and various example uses would go a very long way.
> 
> I found the wiki page.
> https://sourceware.org/glibc/wiki/benchmarking/benchmarks

Yeah that needs to improve :/

>> The reason for emitting json is precisely to allow developers to
>> implement their own analysis tools around them when their use cases are
>> niche.  Your specific use case is not niche and could be added as a flag
>> to compare_strings.py if needed.  You only need a new flag --csv (or -o
>> csv, tab, etc.) to print in csv instead of the current output, which is
>> meant for reading on the terminal.
> 
> Yes, converting to CSV in Python will be easy.
> But comparing two string benchout results directly between "before" and "after"
> was not so easy AFAIK.
> And creating graphs in spreadsheet manually is tolerable in a few times, but not
> in frequent times.
> That's the reason I created another 'jq' script, "merge_strings4graph.sh".
> Do most people compare the two results indirectly through a common base ifunc
> using "--base" option of compare_strings.py or "--baseline" option of plot_strings.py?

If you need to compare two results then just set one as the base using  
the --base/--baseline and see how the other compares.  That's usually  
sufficient to justify addition of new variants to glibc.

Siddhesh

^ permalink raw reply	[flat|nested] 83+ messages in thread

* RE: [PATCH v2 2/5] benchtests: Add memset zero fill benchtest
  2021-07-28  7:27                     ` naohirot--- via Libc-alpha
@ 2021-08-04  9:11                       ` naohirot--- via Libc-alpha
  0 siblings, 0 replies; 83+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-08-04  9:11 UTC (permalink / raw)
  To: Wilco Dijkstra, Noah Goldstein; +Cc: GNU C Library

Hi Wilco, Noah,

> From: Tamura, Naohiro/田村 直広 <naohirot@fujitsu.com>
> Sent: Wednesday, July 28, 2021 4:28 PM
> 
> Taking Noah's comment [1] into account, the final code should be like
> the below. Can we agree with this code?
> 
> Two results, two loop version in the mail [1] and one loop version
> below, are almost same in case of __memset_generic on a64fx as
> shown in the graph [2].
> 
> -----
> static void
> __attribute__((noinline, noclone))
> do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
>              int c1 __attribute ((unused)), int c2 __attribute ((unused)),
>              size_t n)
> {
>   size_t i, iters = 32;
>   timing_t start, stop, cur, latency = 0;
> 
>   CALL (impl, s, c2, n); // warm up
> 
>   for (i = 0; i < iters; i++)
>     {
>       memset (s, c1, n); // alternation
> 
>       TIMING_NOW (start);
> 
>       CALL (impl, s, c2, n);
> 
>       TIMING_NOW (stop);
>       TIMING_DIFF (cur, start, stop);
>       TIMING_ACCUM (latency, cur);
>     }
> 
>   json_element_double (json_ctx, (double) latency / (double) iters);
> }
> -----

I'd like to share an interesting insight which was found when
START_SIZE was changed from 16KB to the smaller size of 256B.
Currently DC ZVA is called if the size is more than 256B and the value
is zero in __memset_generic (sysdeps/aarch64/memset.S).
However, DC ZVA is slower than store instructions if the size is less
than 16KB on A64FX [3].
So this would indicate that the appropriate DC ZVA start size might
be different on each CPU.
It would be interesting to see how other CPUs behave.

The code is below, which measures 4 patterns, zero-over-zero,
zero-over-one, one-over-zero and one-over-one from 256B to 64MB.
In the graph [3], 4 patterns are abbreviated 0o0, 0o1, 1o0 and 1o1.


#define START_SIZE 256
#define MIN_PAGE_SIZE (getpagesize () + 64 * 1024 * 1024)

  for (c1 = 0; c1 < 2; c1++)
    for (c2 = 0; c2 < 2; c2++)
      for (i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
        {
          do_test (&json_ctx, 0, c1, c2, i);
          do_test (&json_ctx, 3, c1, c2, i);
        }

I'd like to submit a v3 patch incorporating the above change too.

[3] https://drive.google.com/file/d/1fonjDDlF4LPLfZY9-z22DGn-yaSpGN4g/view?usp=sharing

Thanks.
Naohiro

> [1] https://sourceware.org/pipermail/libc-alpha/2021-July/129486.html
> [2] https://drive.google.com/file/d/1bptHqg5vvFAGoYgoR3w_pvclXFSP8Sr0/view?usp=sharing
> 
> Thanks.
> Naohiro

^ permalink raw reply	[flat|nested] 83+ messages in thread

* [PATCH v3 0/5] benchtests: Add memset zero fill benchmark test
  2021-07-20  6:31 ` [PATCH v2 0/5] " Naohiro Tamura via Libc-alpha
@ 2021-08-05  7:47   ` Naohiro Tamura via Libc-alpha
  2021-08-05  7:49     ` [PATCH v3 1/5] benchtests: Enable scripts/plot_strings.py to read stdin Naohiro Tamura via Libc-alpha
                       ` (4 more replies)
  0 siblings, 5 replies; 83+ messages in thread
From: Naohiro Tamura via Libc-alpha @ 2021-08-05  7:47 UTC (permalink / raw)
  To: Lucas A. M. Magalhaes, Wilco Dijkstra, Noah Goldstein,
	Joseph Myers, Siddhesh Poyarekar, Andreas Schwab, libc-alpha

Hi Lucas, Wilco, Noah, Joseph, Siddhesh, Andreas, and all,

This series of patches contains the version 3 of memset zero fill
benchmark test and its related ground work.

All comments on patch v2 have been addressed; please review them.

Thanks.
Naohiro

Naohiro Tamura (5):
  benchtests: Enable scripts/plot_strings.py to read stdin
  benchtests: Add memset zero fill benchtest
  benchtests: Remove redundant assert.h
  benchtests: Fix validate_benchout.py exceptions
  config: Rename HAVE_BUILTIN_MEMSET macro

 benchtests/Makefile                     |   2 +-
 benchtests/bench-memset-large.c         |   1 -
 benchtests/bench-memset-walk.c          |   1 -
 benchtests/bench-memset-zerofill.c      | 134 ++++++++++++++++++++++++
 benchtests/scripts/import_bench.py      |   5 +-
 benchtests/scripts/plot_strings.py      |  11 +-
 benchtests/scripts/validate_benchout.py |   6 +-
 config.h.in                             |   4 +-
 configure                               |  14 +--
 configure.ac                            |  10 +-
 elf/rtld.c                              |   9 +-
 11 files changed, 171 insertions(+), 26 deletions(-)
 create mode 100644 benchtests/bench-memset-zerofill.c

-- 
2.17.1


^ permalink raw reply	[flat|nested] 83+ messages in thread

* [PATCH v3 1/5] benchtests: Enable scripts/plot_strings.py to read stdin
  2021-08-05  7:47   ` [PATCH v3 0/5] benchtests: Add memset zero fill benchmark test Naohiro Tamura via Libc-alpha
@ 2021-08-05  7:49     ` Naohiro Tamura via Libc-alpha
  2021-08-05  7:56       ` Siddhesh Poyarekar
  2021-08-05  7:50     ` [PATCH v3 2/5] benchtests: Add memset zero fill benchtest Naohiro Tamura via Libc-alpha
                       ` (3 subsequent siblings)
  4 siblings, 1 reply; 83+ messages in thread
From: Naohiro Tamura via Libc-alpha @ 2021-08-05  7:49 UTC (permalink / raw)
  To: libc-alpha

This patch enables scripts/plot_strings.py to read a benchmark result
file from stdin.
To keep backward compatibility, that is, to keep accepting multiple
benchmark result files as arguments, an empty argument list doesn't
mean stdin, but '-' does.
Therefore the nargs parameter of the ArgumentParser.add_argument()
method is not changed to '?', but kept as '+'.

ex:
  $ jq '.' bench-memset.out | plot_strings.py -
  $ jq '.' bench-memset.out | plot_strings.py - bench-memset-large.out
  $ plot_strings.py bench-memset.out bench-memset-large.out

error ex:
  $ jq '.' bench-memset.out | plot_strings.py
---
 benchtests/scripts/plot_strings.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/benchtests/scripts/plot_strings.py b/benchtests/scripts/plot_strings.py
index c71f0804e4de..ec634692d9ad 100755
--- a/benchtests/scripts/plot_strings.py
+++ b/benchtests/scripts/plot_strings.py
@@ -31,6 +31,7 @@ import json
 import matplotlib as mpl
 import numpy as np
 import os
+import sys
 
 try:
     import jsonschema as validator
@@ -331,8 +332,11 @@ def main(args):
     for filename in args.bench:
         bench = None
 
-        with open(filename, "r") as f:
-            bench = json.load(f)
+        if filename == '-':
+            bench = json.load(sys.stdin)
+        else:
+            with open(filename, "r") as f:
+                bench = json.load(f)
 
         validator.validate(bench, schema)
 
@@ -354,7 +358,8 @@ if __name__ == "__main__":
 
     # Required parameter
     parser.add_argument("bench", nargs="+",
-                        help="benchmark results file(s) in json format")
+                        help="benchmark results file(s) in json format, " \
+                        "and/or '-' as a benchmark result file from stdin")
 
     # Optional parameters
     parser.add_argument("-b", "--baseline", type=str,
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH v3 2/5] benchtests: Add memset zero fill benchtest
  2021-08-05  7:47   ` [PATCH v3 0/5] benchtests: Add memset zero fill benchmark test Naohiro Tamura via Libc-alpha
  2021-08-05  7:49     ` [PATCH v3 1/5] benchtests: Enable scripts/plot_strings.py to read stdin Naohiro Tamura via Libc-alpha
@ 2021-08-05  7:50     ` Naohiro Tamura via Libc-alpha
  2021-09-08  2:03       ` naohirot--- via Libc-alpha
  2021-09-10 20:40       ` Lucas A. M. Magalhaes via Libc-alpha
  2021-08-05  7:51     ` [PATCH v3 3/5] benchtests: Remove redundant assert.h Naohiro Tamura via Libc-alpha
                       ` (2 subsequent siblings)
  4 siblings, 2 replies; 83+ messages in thread
From: Naohiro Tamura via Libc-alpha @ 2021-08-05  7:50 UTC (permalink / raw)
  To: Lucas A. M. Magalhaes, Wilco Dijkstra, Noah Goldstein, libc-alpha

Memset takes 0 as the second parameter in most cases.
However, we cannot measure the zero fill performance by
bench-memset.c, bench-memset-large.c and bench-memset-walk.c
precisely.
X86_64 micro-architecture has some zero-over-zero optimization, and
AArch64 micro-architecture also has some optimization for DC ZVA
instruction.
This patch provides bench-memset-zerofill.c which is suitable to
analyze the zero fill performance by comparing among 4 patterns,
zero-over-zero, zero-over-one, one-over-zero and one-over-one, from
256B to 64MB(RAM) through L1, L2 and L3 caches.

The following commands are examples to analyze a JSON output,
bench-memset-zerofill.out, by 'jq' and 'plot_strings.py'.

1) compare zero-over-zero performance

$ cat bench-memset-zerofill.out | \
  jq -r '
    .functions.memset."bench-variant"="zerofill-0o0" |
    del(.functions.memset.results[] | select(.char1 != 0 or .char2 != 0))
  ' | \
  plot_strings.py -l -p thru -v -

2) compare zero performance

$ cat bench-memset-zerofill.out | \
  jq -r '
    .functions.memset."bench-variant"="zerofill-zero" |
    del(.functions.memset.results[] | select(.char2 != 0))
  ' | \
  plot_strings.py -l -p thru -v -

3) compare nonzero performance

$ cat bench-memset-zerofill.out | \
  jq -r '
    .functions.memset."bench-variant"="zerofill-nonzero" |
    del(.functions.memset.results[] | select(.char2 == 0))
  ' | \
  plot_strings.py -l -p thru -v -
---
 benchtests/Makefile                |   2 +-
 benchtests/bench-memset-zerofill.c | 134 +++++++++++++++++++++++++++++
 2 files changed, 135 insertions(+), 1 deletion(-)
 create mode 100644 benchtests/bench-memset-zerofill.c

diff --git a/benchtests/Makefile b/benchtests/Makefile
index 1530939a8ce8..21b95c736190 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -53,7 +53,7 @@ string-benchset := memccpy memchr memcmp memcpy memmem memmove \
 		   strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
 		   strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
 		   strcoll memcpy-large memcpy-random memmove-large memset-large \
-		   memcpy-walk memset-walk memmove-walk
+		   memcpy-walk memset-walk memmove-walk memset-zerofill
 
 # Build and run locale-dependent benchmarks only if we're building natively.
 ifeq (no,$(cross-compiling))
diff --git a/benchtests/bench-memset-zerofill.c b/benchtests/bench-memset-zerofill.c
new file mode 100644
index 000000000000..7aa7fe048574
--- /dev/null
+++ b/benchtests/bench-memset-zerofill.c
@@ -0,0 +1,134 @@
+/* Measure memset functions with zero fill data.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#define TEST_NAME "memset"
+#define START_SIZE 256
+#define MIN_PAGE_SIZE (getpagesize () + 64 * 1024 * 1024)
+#define TIMEOUT (20 * 60)
+#include "bench-string.h"
+
+#include "json-lib.h"
+
+void *generic_memset (void *, int, size_t);
+typedef void *(*proto_t) (void *, int, size_t);
+
+IMPL (MEMSET, 1)
+IMPL (generic_memset, 0)
+
+static void
+__attribute__((noinline, noclone))
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
+	     int c1 __attribute ((unused)), int c2 __attribute ((unused)),
+	     size_t n)
+{
+  size_t i, iters = 32;
+  timing_t start, stop, cur, latency = 0;
+
+  CALL (impl, s, c2, n); // warm up
+
+  for (i = 0; i < iters; i++)
+    {
+      memset (s, c1, n); // alternation
+
+      TIMING_NOW (start);
+
+      CALL (impl, s, c2, n);
+
+      TIMING_NOW (stop);
+      TIMING_DIFF (cur, start, stop);
+      TIMING_ACCUM (latency, cur);
+    }
+
+  json_element_double (json_ctx, (double) latency / (double) iters);
+}
+
+static void
+do_test (json_ctx_t *json_ctx, size_t align, int c1, int c2, size_t len)
+{
+  align &= getpagesize () - 1;
+  if ((align + len) * sizeof (CHAR) > page_size)
+    return;
+
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "length", len);
+  json_attr_uint (json_ctx, "alignment", align);
+  json_attr_int (json_ctx, "char1", c1);
+  json_attr_int (json_ctx, "char2", c2);
+  json_array_begin (json_ctx, "timings");
+
+  FOR_EACH_IMPL (impl, 0)
+    {
+      do_one_test (json_ctx, impl, (CHAR *) (buf1) + align, c1, c2, len);
+      alloc_bufs ();
+    }
+
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
+}
+
+int
+test_main (void)
+{
+  json_ctx_t json_ctx;
+  size_t i;
+  int c1, c2;
+
+  test_init ();
+
+  json_init (&json_ctx, 0, stdout);
+
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_attr_string (&json_ctx, "bench-variant", "zerofill");
+
+  json_array_begin (&json_ctx, "ifuncs");
+  FOR_EACH_IMPL (impl, 0)
+    json_element_string (&json_ctx, impl->name);
+  json_array_end (&json_ctx);
+
+  json_array_begin (&json_ctx, "results");
+
+  for (c1 = 0; c1 < 2; c1++)
+    for (c2 = 0; c2 < 2; c2++)
+      for (i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
+	{
+	  do_test (&json_ctx, 0, c1, c2, i);
+	  do_test (&json_ctx, 3, c1, c2, i);
+	}
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
+  return ret;
+}
+
+#include <support/test-driver.c>
+
+#define libc_hidden_builtin_def(X)
+#define libc_hidden_def(X)
+#define libc_hidden_weak(X)
+#define weak_alias(X,Y)
+#undef MEMSET
+#define MEMSET generic_memset
+#include <string/memset.c>
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH v3 3/5] benchtests: Remove redundant assert.h
  2021-08-05  7:47   ` [PATCH v3 0/5] benchtests: Add memset zero fill benchmark test Naohiro Tamura via Libc-alpha
  2021-08-05  7:49     ` [PATCH v3 1/5] benchtests: Enable scripts/plot_strings.py to read stdin Naohiro Tamura via Libc-alpha
  2021-08-05  7:50     ` [PATCH v3 2/5] benchtests: Add memset zero fill benchtest Naohiro Tamura via Libc-alpha
@ 2021-08-05  7:51     ` Naohiro Tamura via Libc-alpha
  2021-09-08  1:59       ` naohirot--- via Libc-alpha
  2021-09-13  3:36       ` Siddhesh Poyarekar
  2021-08-05  7:51     ` [PATCH v3 4/5] benchtests: Fix validate_benchout.py exceptions Naohiro Tamura via Libc-alpha
  2021-08-05  7:52     ` [PATCH v3 5/5] config: Rename HAVE_BUILTIN_MEMSET macro Naohiro Tamura via Libc-alpha
  4 siblings, 2 replies; 83+ messages in thread
From: Naohiro Tamura via Libc-alpha @ 2021-08-05  7:51 UTC (permalink / raw)
  To: Lucas A. M. Magalhaes, libc-alpha

This patch removed redundant "#include <assert.h>" from
bench-memset-large.c and bench-memset-walk.c.
---
 benchtests/bench-memset-large.c | 1 -
 benchtests/bench-memset-walk.c  | 1 -
 2 files changed, 2 deletions(-)

diff --git a/benchtests/bench-memset-large.c b/benchtests/bench-memset-large.c
index 97ed78d0d6a1..3fd20b79e53d 100644
--- a/benchtests/bench-memset-large.c
+++ b/benchtests/bench-memset-large.c
@@ -23,7 +23,6 @@
 #define TIMEOUT (20 * 60)
 #include "bench-string.h"
 
-#include <assert.h>
 #include "json-lib.h"
 
 void *generic_memset (void *, int, size_t);
diff --git a/benchtests/bench-memset-walk.c b/benchtests/bench-memset-walk.c
index 0dcad09c484f..5fb315384992 100644
--- a/benchtests/bench-memset-walk.c
+++ b/benchtests/bench-memset-walk.c
@@ -23,7 +23,6 @@
 #define TIMEOUT (20 * 60)
 #include "bench-string.h"
 
-#include <assert.h>
 #include "json-lib.h"
 
 void *generic_memset (void *, int, size_t);
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH v3 4/5] benchtests: Fix validate_benchout.py exceptions
  2021-08-05  7:47   ` [PATCH v3 0/5] benchtests: Add memset zero fill benchmark test Naohiro Tamura via Libc-alpha
                       ` (2 preceding siblings ...)
  2021-08-05  7:51     ` [PATCH v3 3/5] benchtests: Remove redundant assert.h Naohiro Tamura via Libc-alpha
@ 2021-08-05  7:51     ` Naohiro Tamura via Libc-alpha
  2021-09-08  1:55       ` naohirot--- via Libc-alpha
  2021-09-13  3:42       ` Siddhesh Poyarekar
  2021-08-05  7:52     ` [PATCH v3 5/5] config: Rename HAVE_BUILTIN_MEMSET macro Naohiro Tamura via Libc-alpha
  4 siblings, 2 replies; 83+ messages in thread
From: Naohiro Tamura via Libc-alpha @ 2021-08-05  7:51 UTC (permalink / raw)
  To: libc-alpha

This patch fixes two exceptions in validate_benchout.py: AttributeError
if benchout_strings.schema.json is specified, and
json.decoder.JSONDecodeError if benchout is not JSON.
---
 benchtests/scripts/import_bench.py      | 5 ++++-
 benchtests/scripts/validate_benchout.py | 6 +++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/benchtests/scripts/import_bench.py b/benchtests/scripts/import_bench.py
index a799b4e1b7dc..e3337ca5d638 100644
--- a/benchtests/scripts/import_bench.py
+++ b/benchtests/scripts/import_bench.py
@@ -104,7 +104,10 @@ def do_for_all_timings(bench, callback):
     """
     for func in bench['functions'].keys():
         for k in bench['functions'][func].keys():
-            if 'timings' not in bench['functions'][func][k].keys():
+            try:
+                if 'timings' not in bench['functions'][func][k].keys():
+                    continue
+            except AttributeError:
                 continue
 
             callback(bench, func, k)
diff --git a/benchtests/scripts/validate_benchout.py b/benchtests/scripts/validate_benchout.py
index 47df33ed0252..00d5fa0ee5eb 100755
--- a/benchtests/scripts/validate_benchout.py
+++ b/benchtests/scripts/validate_benchout.py
@@ -73,11 +73,15 @@ def main(args):
 
     except bench.validator.ValidationError as e:
         return print_and_exit("Invalid benchmark output: %s" % e.message,
-            os.EX_DATAERR)
+                os.EX_DATAERR)
 
     except bench.validator.SchemaError as e:
         return print_and_exit("Invalid schema: %s" % e.message, os.EX_DATAERR)
 
+    except json.decoder.JSONDecodeError as e:
+        return print_and_exit("Benchmark output in %s is not JSON." % args[0],
+                os.EX_DATAERR)
+
     print("Benchmark output in %s is valid." % args[0])
     return os.EX_OK
 
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH v3 5/5] config: Rename HAVE_BUILTIN_MEMSET macro
  2021-08-05  7:47   ` [PATCH v3 0/5] benchtests: Add memset zero fill benchmark test Naohiro Tamura via Libc-alpha
                       ` (3 preceding siblings ...)
  2021-08-05  7:51     ` [PATCH v3 4/5] benchtests: Fix validate_benchout.py exceptions Naohiro Tamura via Libc-alpha
@ 2021-08-05  7:52     ` Naohiro Tamura via Libc-alpha
  2021-08-11 20:34       ` Adhemerval Zanella via Libc-alpha
  4 siblings, 1 reply; 83+ messages in thread
From: Naohiro Tamura via Libc-alpha @ 2021-08-05  7:52 UTC (permalink / raw)
  To: Andreas Schwab, libc-alpha

This patch renames HAVE_BUILTIN_MEMSET macro to
HAVE_NON_LIB_EXPAND_BUILTIN_MEMSET.

The name "HAVE_BUILTIN_MEMSET" is very confusing.
This macro cannot be removed even though GCC 6.2, which is the minimum
requirement to compile glibc, already supports __builtin_memset[1].
It doesn't indicate whether GCC supports __builtin_memset or not.

Instead, it indicates whether or not GCC supports a __builtin_memset
that doesn't expand to a memset libcall.

Therefore HAVE_NON_LIB_EXPAND_BUILTIN_MEMSET is more appropriate to
increase code readability.

[1] https://gcc.gnu.org/onlinedocs/gcc-6.2.0/gcc/Other-Builtins.html
---
 config.h.in  |  4 ++--
 configure    | 14 +++++++-------
 configure.ac | 10 +++++-----
 elf/rtld.c   |  9 +++++----
 4 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/config.h.in b/config.h.in
index 8b45a3a61d77..4700dc9eba9b 100644
--- a/config.h.in
+++ b/config.h.in
@@ -40,8 +40,8 @@
    shared between GNU libc and GNU gettext projects.  */
 #define HAVE_BUILTIN_EXPECT 1
 
-/* Define if the compiler supports __builtin_memset.  */
-#undef	HAVE_BUILTIN_MEMSET
+/* Define if the compiler supports non lib expand __builtin_memset.  */
+#undef	HAVE_NON_LIB_EXPAND_BUILTIN_MEMSET
 
 /* Define if compiler accepts -ftree-loop-distribute-patterns.  */
 #undef  HAVE_CC_INHIBIT_LOOP_TO_LIBCALL
diff --git a/configure b/configure
index 9619c10991d0..224c754cf466 100755
--- a/configure
+++ b/configure
@@ -6263,7 +6263,7 @@ fi
 
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_memset" >&5
 $as_echo_n "checking for __builtin_memset... " >&6; }
-if ${libc_cv_gcc_builtin_memset+:} false; then :
+if ${libc_cv_gcc_non_lib_expand_builtin_memset+:} false; then :
   $as_echo_n "(cached) " >&6
 else
   cat > conftest.c <<\EOF
@@ -6279,16 +6279,16 @@ if { ac_try='${CC-cc} -O3 -S conftest.c -o - | grep -F "memset" > /dev/null'
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; };
 then
-  libc_cv_gcc_builtin_memset=no
+  libc_cv_gcc_non_lib_expand_builtin_memset=no
 else
-  libc_cv_gcc_builtin_memset=yes
+  libc_cv_gcc_non_lib_expand_builtin_memset=yes
 fi
 rm -f conftest*
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_gcc_builtin_memset" >&5
-$as_echo "$libc_cv_gcc_builtin_memset" >&6; }
-if test "$libc_cv_gcc_builtin_memset" = yes ; then
-  $as_echo "#define HAVE_BUILTIN_MEMSET 1" >>confdefs.h
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_gcc_non_lib_expand_builtin_memset" >&5
+$as_echo "$libc_cv_gcc_non_lib_expand_builtin_memset" >&6; }
+if test "$libc_cv_gcc_non_lib_expand_builtin_memset" = yes ; then
+  $as_echo "#define HAVE_NON_LIB_EXPAND_BUILTIN_MEMSET 1" >>confdefs.h
 
 fi
 
diff --git a/configure.ac b/configure.ac
index 34ecbba54054..451cea7683e6 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1508,7 +1508,7 @@ if test $libc_cv_have_section_quotes = yes; then
   AC_DEFINE(HAVE_SECTION_QUOTES)
 fi
 
-AC_CACHE_CHECK(for __builtin_memset, libc_cv_gcc_builtin_memset, [dnl
+AC_CACHE_CHECK(for __builtin_memset, libc_cv_gcc_non_lib_expand_builtin_memset, [dnl
 cat > conftest.c <<\EOF
 void zero (void *x)
 {
@@ -1518,13 +1518,13 @@ EOF
 dnl
 if AC_TRY_COMMAND([${CC-cc} -O3 -S conftest.c -o - | grep -F "memset" > /dev/null]);
 then
-  libc_cv_gcc_builtin_memset=no
+  libc_cv_gcc_non_lib_expand_builtin_memset=no
 else
-  libc_cv_gcc_builtin_memset=yes
+  libc_cv_gcc_non_lib_expand_builtin_memset=yes
 fi
 rm -f conftest* ])
-if test "$libc_cv_gcc_builtin_memset" = yes ; then
-  AC_DEFINE(HAVE_BUILTIN_MEMSET)
+if test "$libc_cv_gcc_non_lib_expand_builtin_memset" = yes ; then
+  AC_DEFINE(HAVE_NON_LIB_EXPAND_BUILTIN_MEMSET)
 fi
 
 AC_CACHE_CHECK(for redirection of built-in functions, libc_cv_gcc_builtin_redirection, [dnl
diff --git a/elf/rtld.c b/elf/rtld.c
index d733359eaf80..a18494fcd38e 100644
--- a/elf/rtld.c
+++ b/elf/rtld.c
@@ -527,11 +527,12 @@ _dl_start (void *arg)
   /* Partly clean the `bootstrap_map' structure up.  Don't use
      `memset' since it might not be built in or inlined and we cannot
      make function calls at this point.  Use '__builtin_memset' if we
-     know it is available.  We do not have to clear the memory if we
-     do not have to use the temporary bootstrap_map.  Global variables
-     are initialized to zero by default.  */
+     know it is available and does not expand to a memset libcall.
+     We do not have to clear the memory if we do not have to use the
+     temporary bootstrap_map.  Global variables are initialized to
+     zero by default.  */
 #ifndef DONT_USE_BOOTSTRAP_MAP
-# ifdef HAVE_BUILTIN_MEMSET
+# ifdef HAVE_NON_LIB_EXPAND_BUILTIN_MEMSET
   __builtin_memset (bootstrap_map.l_info, '\0', sizeof (bootstrap_map.l_info));
 # else
   for (size_t cnt = 0;
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 83+ messages in thread

* Re: [PATCH v3 1/5] benchtests: Enable scripts/plot_strings.py to read stdin
  2021-08-05  7:49     ` [PATCH v3 1/5] benchtests: Enable scripts/plot_strings.py to read stdin Naohiro Tamura via Libc-alpha
@ 2021-08-05  7:56       ` Siddhesh Poyarekar
  2021-09-08  1:46         ` naohirot--- via Libc-alpha
  0 siblings, 1 reply; 83+ messages in thread
From: Siddhesh Poyarekar @ 2021-08-05  7:56 UTC (permalink / raw)
  To: Naohiro Tamura, libc-alpha

On 8/5/21 1:19 PM, Naohiro Tamura via Libc-alpha wrote:
> This patch enables scripts/plot_strings.py to read a benchmark result
> file from stdin.
> To keep backward compatibility, that is to keep accepting multiple of
> benchmark result files in argument, blank argument doesn't mean stdin,
> but '-' does.
> Therefore nargs parameter of ArgumentParser.add_argument() method is
> not changed to '?', but keep '+'.
> 
> ex:
>    $ jq '.' bench-memset.out | plot_strings.py -
>    $ jq '.' bench-memset.out | plot_strings.py - bench-memset-large.out
>    $ plot_strings.py bench-memset.out bench-memset-large.out
> 
> error ex:
>    $ jq '.' bench-memset.out | plot_strings.py
> ---
>   benchtests/scripts/plot_strings.py | 11 ++++++++---
>   1 file changed, 8 insertions(+), 3 deletions(-)

Very nice!  LGTM.

Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>

> 
> diff --git a/benchtests/scripts/plot_strings.py b/benchtests/scripts/plot_strings.py
> index c71f0804e4de..ec634692d9ad 100755
> --- a/benchtests/scripts/plot_strings.py
> +++ b/benchtests/scripts/plot_strings.py
> @@ -31,6 +31,7 @@ import json
>   import matplotlib as mpl
>   import numpy as np
>   import os
> +import sys
>   
>   try:
>       import jsonschema as validator
> @@ -331,8 +332,11 @@ def main(args):
>       for filename in args.bench:
>           bench = None
>   
> -        with open(filename, "r") as f:
> -            bench = json.load(f)
> +        if filename == '-':
> +            bench = json.load(sys.stdin)
> +        else:
> +            with open(filename, "r") as f:
> +                bench = json.load(f)
>   
>           validator.validate(bench, schema)
>   
> @@ -354,7 +358,8 @@ if __name__ == "__main__":
>   
>       # Required parameter
>       parser.add_argument("bench", nargs="+",
> -                        help="benchmark results file(s) in json format")
> +                        help="benchmark results file(s) in json format, " \
> +                        "and/or '-' as a benchmark result file from stdin")
>   
>       # Optional parameters
>       parser.add_argument("-b", "--baseline", type=str,
> 


^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v3 5/5] config: Rename HAVE_BUILTIN_MEMSET macro
  2021-08-05  7:52     ` [PATCH v3 5/5] config: Rename HAVE_BUILTIN_MEMSET macro Naohiro Tamura via Libc-alpha
@ 2021-08-11 20:34       ` Adhemerval Zanella via Libc-alpha
  0 siblings, 0 replies; 83+ messages in thread
From: Adhemerval Zanella via Libc-alpha @ 2021-08-11 20:34 UTC (permalink / raw)
  To: Naohiro Tamura, Andreas Schwab, libc-alpha



On 05/08/2021 04:52, Naohiro Tamura via Libc-alpha wrote:
> This patch renames HAVE_BUILTIN_MEMSET macro to
> HAVE_NON_LIB_EXPAND_BUILTIN_MEMSET.
> 
> The name "HAVE_BUILTIN_MEMSET" is very confusing.
> This macro cannot be removed even though GCC 6.2, that is minimum
> requirement to compile glibc, already supports __builtin_memset[1].
> It doesn't indicate whether GCC supports __builtin_memset or not.
> 
> But it indicates whether GCC supports __builtin_memset which doesn't
> expand to a memset libcall or not.
> 
> Therefore HAVE_NON_LIB_EXPAND_BUILTIN_MEMSET is more appropriate to
> increase code readability.
> 
> [1] https://gcc.gnu.org/onlinedocs/gcc-6.2.0/gcc/Other-Builtins.html

Sorry, but I don't see much gain in renaming the internal variable.  I agree
with the comment improvement.

> ---
>  config.h.in  |  4 ++--
>  configure    | 14 +++++++-------
>  configure.ac | 10 +++++-----
>  elf/rtld.c   |  9 +++++----
>  4 files changed, 19 insertions(+), 18 deletions(-)
> 
> diff --git a/config.h.in b/config.h.in
> index 8b45a3a61d77..4700dc9eba9b 100644
> --- a/config.h.in
> +++ b/config.h.in
> @@ -40,8 +40,8 @@
>     shared between GNU libc and GNU gettext projects.  */
>  #define HAVE_BUILTIN_EXPECT 1
>  
> -/* Define if the compiler supports __builtin_memset.  */
> -#undef	HAVE_BUILTIN_MEMSET
> +/* Define if the compiler supports non lib expand __builtin_memset.  */
> +#undef	HAVE_NON_LIB_EXPAND_BUILTIN_MEMSET
>  
>  /* Define if compiler accepts -ftree-loop-distribute-patterns.  */
>  #undef  HAVE_CC_INHIBIT_LOOP_TO_LIBCALL

Ok.

> diff --git a/configure b/configure
> index 9619c10991d0..224c754cf466 100755
> --- a/configure
> +++ b/configure
> @@ -6263,7 +6263,7 @@ fi
>  
>  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_memset" >&5
>  $as_echo_n "checking for __builtin_memset... " >&6; }
> -if ${libc_cv_gcc_builtin_memset+:} false; then :
> +if ${libc_cv_gcc_non_lib_expand_builtin_memset+:} false; then :
>    $as_echo_n "(cached) " >&6
>  else
>    cat > conftest.c <<\EOF
> @@ -6279,16 +6279,16 @@ if { ac_try='${CC-cc} -O3 -S conftest.c -o - | grep -F "memset" > /dev/null'
>    $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
>    test $ac_status = 0; }; };
>  then
> -  libc_cv_gcc_builtin_memset=no
> +  libc_cv_gcc_non_lib_expand_builtin_memset=no
>  else
> -  libc_cv_gcc_builtin_memset=yes
> +  libc_cv_gcc_non_lib_expand_builtin_memset=yes
>  fi
>  rm -f conftest*
>  fi
> -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_gcc_builtin_memset" >&5
> -$as_echo "$libc_cv_gcc_builtin_memset" >&6; }
> -if test "$libc_cv_gcc_builtin_memset" = yes ; then
> -  $as_echo "#define HAVE_BUILTIN_MEMSET 1" >>confdefs.h
> +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_gcc_non_lib_expand_builtin_memset" >&5
> +$as_echo "$libc_cv_gcc_non_lib_expand_builtin_memset" >&6; }
> +if test "$libc_cv_gcc_non_lib_expand_builtin_memset" = yes ; then
> +  $as_echo "#define HAVE_NON_LIB_EXPAND_BUILTIN_MEMSET 1" >>confdefs.h
>  
>  fi
>  
> diff --git a/configure.ac b/configure.ac
> index 34ecbba54054..451cea7683e6 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -1508,7 +1508,7 @@ if test $libc_cv_have_section_quotes = yes; then
>    AC_DEFINE(HAVE_SECTION_QUOTES)
>  fi
>  
> -AC_CACHE_CHECK(for __builtin_memset, libc_cv_gcc_builtin_memset, [dnl
> +AC_CACHE_CHECK(for __builtin_memset, libc_cv_gcc_non_lib_expand_builtin_memset, [dnl
>  cat > conftest.c <<\EOF
>  void zero (void *x)
>  {
> @@ -1518,13 +1518,13 @@ EOF
>  dnl
>  if AC_TRY_COMMAND([${CC-cc} -O3 -S conftest.c -o - | grep -F "memset" > /dev/null]);
>  then
> -  libc_cv_gcc_builtin_memset=no
> +  libc_cv_gcc_non_lib_expand_builtin_memset=no
>  else
> -  libc_cv_gcc_builtin_memset=yes
> +  libc_cv_gcc_non_lib_expand_builtin_memset=yes
>  fi
>  rm -f conftest* ])
> -if test "$libc_cv_gcc_builtin_memset" = yes ; then
> -  AC_DEFINE(HAVE_BUILTIN_MEMSET)
> +if test "$libc_cv_gcc_non_lib_expand_builtin_memset" = yes ; then
> +  AC_DEFINE(HAVE_NON_LIB_EXPAND_BUILTIN_MEMSET)
>  fi
>  
>  AC_CACHE_CHECK(for redirection of built-in functions, libc_cv_gcc_builtin_redirection, [dnl

I see no point in renaming it; a better comment, as you are adding, suffices.

> diff --git a/elf/rtld.c b/elf/rtld.c
> index d733359eaf80..a18494fcd38e 100644
> --- a/elf/rtld.c
> +++ b/elf/rtld.c
> @@ -527,11 +527,12 @@ _dl_start (void *arg)
>    /* Partly clean the `bootstrap_map' structure up.  Don't use
>       `memset' since it might not be built in or inlined and we cannot
>       make function calls at this point.  Use '__builtin_memset' if we
> -     know it is available.  We do not have to clear the memory if we
> -     do not have to use the temporary bootstrap_map.  Global variables
> -     are initialized to zero by default.  */
> +     know it is available and does not expand to a memset libcall.
> +     We do not have to clear the memory if we do not have to use the
> +     temporary bootstrap_map.  Global variables are initialized to
> +     zero by default.  */
>  #ifndef DONT_USE_BOOTSTRAP_MAP
> -# ifdef HAVE_BUILTIN_MEMSET
> +# ifdef HAVE_NON_LIB_EXPAND_BUILTIN_MEMSET
>    __builtin_memset (bootstrap_map.l_info, '\0', sizeof (bootstrap_map.l_info));
>  # else
>    for (size_t cnt = 0;
> 

Ok.

^ permalink raw reply	[flat|nested] 83+ messages in thread

* RE: [PATCH v3 1/5] benchtests: Enable scripts/plot_strings.py to read stdin
  2021-08-05  7:56       ` Siddhesh Poyarekar
@ 2021-09-08  1:46         ` naohirot--- via Libc-alpha
  2021-09-08 12:56           ` Siddhesh Poyarekar
  0 siblings, 1 reply; 83+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-09-08  1:46 UTC (permalink / raw)
  To: 'Siddhesh Poyarekar', libc-alpha@sourceware.org

Hi Siddhesh, Thank you for the review comment.

Hi all, is there any other comment?
https://sourceware.org/pipermail/libc-alpha/2021-August/129838.html

Thanks.
Naohiro
> -----Original Message-----
> From: Siddhesh Poyarekar <siddhesh@gotplt.org>
> Sent: Thursday, August 5, 2021 4:57 PM
> To: Tamura, Naohiro/田村 直広 <naohirot@fujitsu.com>; libc-alpha@sourceware.org
> Subject: Re: [PATCH v3 1/5] benchtests: Enable scripts/plot_strings.py to read stdin
> 
> On 8/5/21 1:19 PM, Naohiro Tamura via Libc-alpha wrote:
> > This patch enables scripts/plot_strings.py to read a benchmark result
> > file from stdin.
> > To keep backward compatibility, that is to keep accepting multiple of
> > benchmark result files in argument, blank argument doesn't mean stdin,
> > but '-' does.
> > Therefore nargs parameter of ArgumentParser.add_argument() method is
> > not changed to '?', but keep '+'.
> >
> > ex:
> >    $ jq '.' bench-memset.out | plot_strings.py -
> >    $ jq '.' bench-memset.out | plot_strings.py - bench-memset-large.out
> >    $ plot_strings.py bench-memset.out bench-memset-large.out
> >
> > error ex:
> >    $ jq '.' bench-memset.out | plot_strings.py
> > ---
> >   benchtests/scripts/plot_strings.py | 11 ++++++++---
> >   1 file changed, 8 insertions(+), 3 deletions(-)
> 
> Very nice!  LGTM.
> 
> Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
> 
> >
> > diff --git a/benchtests/scripts/plot_strings.py b/benchtests/scripts/plot_strings.py
> > index c71f0804e4de..ec634692d9ad 100755
> > --- a/benchtests/scripts/plot_strings.py
> > +++ b/benchtests/scripts/plot_strings.py
> > @@ -31,6 +31,7 @@ import json
> >   import matplotlib as mpl
> >   import numpy as np
> >   import os
> > +import sys
> >
> >   try:
> >       import jsonschema as validator
> > @@ -331,8 +332,11 @@ def main(args):
> >       for filename in args.bench:
> >           bench = None
> >
> > -        with open(filename, "r") as f:
> > -            bench = json.load(f)
> > +        if filename == '-':
> > +            bench = json.load(sys.stdin)
> > +        else:
> > +            with open(filename, "r") as f:
> > +                bench = json.load(f)
> >
> >           validator.validate(bench, schema)
> >
> > @@ -354,7 +358,8 @@ if __name__ == "__main__":
> >
> >       # Required parameter
> >       parser.add_argument("bench", nargs="+",
> > -                        help="benchmark results file(s) in json format")
> > +                        help="benchmark results file(s) in json format, " \
> > +                        "and/or '-' as a benchmark result file from stdin")
> >
> >       # Optional parameters
> >       parser.add_argument("-b", "--baseline", type=str,
> >


^ permalink raw reply	[flat|nested] 83+ messages in thread

* RE: [PATCH v3 4/5] benchtests: Fix validate_benchout.py exceptions
  2021-08-05  7:51     ` [PATCH v3 4/5] benchtests: Fix validate_benchout.py exceptions Naohiro Tamura via Libc-alpha
@ 2021-09-08  1:55       ` naohirot--- via Libc-alpha
  2021-09-13  3:42       ` Siddhesh Poyarekar
  1 sibling, 0 replies; 83+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-09-08  1:55 UTC (permalink / raw)
  To: libc-alpha@sourceware.org

Hi all, is there any comment?
https://sourceware.org/pipermail/libc-alpha/2021-August/129841.html
Thanks.
Naohiro
> -----Original Message-----
> From: Naohiro Tamura <naohirot@fujitsu.com>
> Sent: Thursday, August 5, 2021 4:52 PM
> To: libc-alpha@sourceware.org
> Cc: Tamura, Naohiro/田村 直広 <naohirot@fujitsu.com>
> Subject: [PATCH v3 4/5] benchtests: Fix validate_benchout.py exceptions
> 
> This patch fixes two exceptions in validate_benchout.py: AttributeError
> if benchout_strings.schema.json is specified, and
> json.decoder.JSONDecodeError if benchout is not JSON.
> ---
>  benchtests/scripts/import_bench.py      | 5 ++++-
>  benchtests/scripts/validate_benchout.py | 6 +++++-
>  2 files changed, 9 insertions(+), 2 deletions(-)
> 
> diff --git a/benchtests/scripts/import_bench.py b/benchtests/scripts/import_bench.py
> index a799b4e1b7dc..e3337ca5d638 100644
> --- a/benchtests/scripts/import_bench.py
> +++ b/benchtests/scripts/import_bench.py
> @@ -104,7 +104,10 @@ def do_for_all_timings(bench, callback):
>      """
>      for func in bench['functions'].keys():
>          for k in bench['functions'][func].keys():
> -            if 'timings' not in bench['functions'][func][k].keys():
> +            try:
> +                if 'timings' not in bench['functions'][func][k].keys():
> +                    continue
> +            except AttributeError:
>                  continue
> 
>              callback(bench, func, k)
> diff --git a/benchtests/scripts/validate_benchout.py b/benchtests/scripts/validate_benchout.py
> index 47df33ed0252..00d5fa0ee5eb 100755
> --- a/benchtests/scripts/validate_benchout.py
> +++ b/benchtests/scripts/validate_benchout.py
> @@ -73,11 +73,15 @@ def main(args):
> 
>      except bench.validator.ValidationError as e:
>          return print_and_exit("Invalid benchmark output: %s" % e.message,
> -            os.EX_DATAERR)
> +                os.EX_DATAERR)
> 
>      except bench.validator.SchemaError as e:
>          return print_and_exit("Invalid schema: %s" % e.message, os.EX_DATAERR)
> 
> +    except json.decoder.JSONDecodeError as e:
> +        return print_and_exit("Benchmark output in %s is not JSON." % args[0],
> +                os.EX_DATAERR)
> +
>      print("Benchmark output in %s is valid." % args[0])
>      return os.EX_OK
> 
> --
> 2.17.1


^ permalink raw reply	[flat|nested] 83+ messages in thread

* RE: [PATCH v3 3/5] benchtests: Remove redundant assert.h
  2021-08-05  7:51     ` [PATCH v3 3/5] benchtests: Remove redundant assert.h Naohiro Tamura via Libc-alpha
@ 2021-09-08  1:59       ` naohirot--- via Libc-alpha
  2021-09-13  3:36       ` Siddhesh Poyarekar
  1 sibling, 0 replies; 83+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-09-08  1:59 UTC (permalink / raw)
  To: Lucas A. M. Magalhaes, libc-alpha@sourceware.org

Hi all, is there any comment?
https://sourceware.org/pipermail/libc-alpha/2021-August/129840.html
Thanks.
Naohiro

> -----Original Message-----
> From: Naohiro Tamura <naohirot@fujitsu.com>
> Sent: Thursday, August 5, 2021 4:51 PM
> To: Lucas A. M. Magalhaes <lamm@linux.ibm.com>; libc-alpha@sourceware.org
> Cc: Tamura, Naohiro/田村 直広 <naohirot@fujitsu.com>
> Subject: [PATCH v3 3/5] benchtests: Remove redundant assert.h
> 
> This patch removed redundant "#include <assert.h>" from
> bench-memset-large.c and bench-memset-walk.c.
> ---
>  benchtests/bench-memset-large.c | 1 -
>  benchtests/bench-memset-walk.c  | 1 -
>  2 files changed, 2 deletions(-)
> 
> diff --git a/benchtests/bench-memset-large.c b/benchtests/bench-memset-large.c
> index 97ed78d0d6a1..3fd20b79e53d 100644
> --- a/benchtests/bench-memset-large.c
> +++ b/benchtests/bench-memset-large.c
> @@ -23,7 +23,6 @@
>  #define TIMEOUT (20 * 60)
>  #include "bench-string.h"
> 
> -#include <assert.h>
>  #include "json-lib.h"
> 
>  void *generic_memset (void *, int, size_t);
> diff --git a/benchtests/bench-memset-walk.c b/benchtests/bench-memset-walk.c
> index 0dcad09c484f..5fb315384992 100644
> --- a/benchtests/bench-memset-walk.c
> +++ b/benchtests/bench-memset-walk.c
> @@ -23,7 +23,6 @@
>  #define TIMEOUT (20 * 60)
>  #include "bench-string.h"
> 
> -#include <assert.h>
>  #include "json-lib.h"
> 
>  void *generic_memset (void *, int, size_t);
> --
> 2.17.1


^ permalink raw reply	[flat|nested] 83+ messages in thread

* RE: [PATCH v3 2/5] benchtests: Add memset zero fill benchtest
  2021-08-05  7:50     ` [PATCH v3 2/5] benchtests: Add memset zero fill benchtest Naohiro Tamura via Libc-alpha
@ 2021-09-08  2:03       ` naohirot--- via Libc-alpha
  2021-09-10 20:40       ` Lucas A. M. Magalhaes via Libc-alpha
  1 sibling, 0 replies; 83+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-09-08  2:03 UTC (permalink / raw)
  To: Lucas A. M. Magalhaes, Wilco Dijkstra, Noah Goldstein,
	libc-alpha@sourceware.org

Hi Lucas, Wilco, Noah and all,
Is there any comment?
https://sourceware.org/pipermail/libc-alpha/2021-August/129839.html
Thanks.
Naohiro

> -----Original Message-----
> From: Naohiro Tamura <naohirot@fujitsu.com>
> Sent: Thursday, August 5, 2021 4:51 PM
> To: Lucas A. M. Magalhaes <lamm@linux.ibm.com>; Wilco Dijkstra <Wilco.Dijkstra@arm.com>; Noah Goldstein
> <goldstein.w.n@gmail.com>; libc-alpha@sourceware.org
> Cc: Tamura, Naohiro/田村 直広 <naohirot@fujitsu.com>
> Subject: [PATCH v3 2/5] benchtests: Add memset zero fill benchtest
> 
> Memset takes 0 as the second parameter in most cases.
> However, we cannot measure the zero fill performance by
> bench-memset.c, bench-memset-large.c and bench-memset-walk.c
> precisely.
> X86_64 micro-architecture has some zero-over-zero optimization, and
> AArch64 micro-architecture also has some optimization for DC ZVA
> instruction.
> This patch provides bench-memset-zerofill.c which is suitable to
> analyze the zero fill performance by comparing among 4 patterns,
> zero-over-zero, zero-over-one, one-over-zero and one-over-one, from
> 256B to 64MB(RAM) through L1, L2 and L3 caches.
> 
> The following commands are examples to analyze a JSON output,
> bench-memset-zerofill.out, by 'jq' and 'plot_strings.py'.
> 
> 1) compare zero-over-zero performance
> 
> $ cat bench-memset-zerofill.out | \
>   jq -r '
>     .functions.memset."bench-variant"="zerofill-0o0" |
>     del(.functions.memset.results[] | select(.char1 != 0 or .char2 != 0))
>   ' | \
>   plot_strings.py -l -p thru -v -
> 
> 2) compare zero performance
> 
> $ cat bench-memset-zerofill.out | \
>   jq -r '
>     .functions.memset."bench-variant"="zerofill-zero" |
>     del(.functions.memset.results[] | select(.char2 != 0))
>   ' | \
>   plot_strings.py -l -p thru -v -
> 
> 3) compare nonzero performance
> 
> $ cat bench-memset-zerofill.out | \
>   jq -r '
>     .functions.memset."bench-variant"="zerofill-nonzero" |
>     del(.functions.memset.results[] | select(.char2 == 0))
>   ' | \
>   plot_strings.py -l -p thru -v -
> ---
>  benchtests/Makefile                |   2 +-
>  benchtests/bench-memset-zerofill.c | 134 +++++++++++++++++++++++++++++
>  2 files changed, 135 insertions(+), 1 deletion(-)
>  create mode 100644 benchtests/bench-memset-zerofill.c
> 
> diff --git a/benchtests/Makefile b/benchtests/Makefile
> index 1530939a8ce8..21b95c736190 100644
> --- a/benchtests/Makefile
> +++ b/benchtests/Makefile
> @@ -53,7 +53,7 @@ string-benchset := memccpy memchr memcmp memcpy memmem memmove \
>  		   strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
>  		   strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
>  		   strcoll memcpy-large memcpy-random memmove-large memset-large \
> -		   memcpy-walk memset-walk memmove-walk
> +		   memcpy-walk memset-walk memmove-walk memset-zerofill
> 
>  # Build and run locale-dependent benchmarks only if we're building natively.
>  ifeq (no,$(cross-compiling))
> diff --git a/benchtests/bench-memset-zerofill.c b/benchtests/bench-memset-zerofill.c
> new file mode 100644
> index 000000000000..7aa7fe048574
> --- /dev/null
> +++ b/benchtests/bench-memset-zerofill.c
> @@ -0,0 +1,134 @@
> +/* Measure memset functions with zero fill data.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define TEST_MAIN
> +#define TEST_NAME "memset"
> +#define START_SIZE 256
> +#define MIN_PAGE_SIZE (getpagesize () + 64 * 1024 * 1024)
> +#define TIMEOUT (20 * 60)
> +#include "bench-string.h"
> +
> +#include "json-lib.h"
> +
> +void *generic_memset (void *, int, size_t);
> +typedef void *(*proto_t) (void *, int, size_t);
> +
> +IMPL (MEMSET, 1)
> +IMPL (generic_memset, 0)
> +
> +static void
> +__attribute__((noinline, noclone))
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
> +	     int c1 __attribute ((unused)), int c2 __attribute ((unused)),
> +	     size_t n)
> +{
> +  size_t i, iters = 32;
> +  timing_t start, stop, cur, latency = 0;
> +
> +  CALL (impl, s, c2, n); // warm up
> +
> +  for (i = 0; i < iters; i++)
> +    {
> +      memset (s, c1, n); // alternation
> +
> +      TIMING_NOW (start);
> +
> +      CALL (impl, s, c2, n);
> +
> +      TIMING_NOW (stop);
> +      TIMING_DIFF (cur, start, stop);
> +      TIMING_ACCUM (latency, cur);
> +    }
> +
> +  json_element_double (json_ctx, (double) latency / (double) iters);
> +}
> +
> +static void
> +do_test (json_ctx_t *json_ctx, size_t align, int c1, int c2, size_t len)
> +{
> +  align &= getpagesize () - 1;
> +  if ((align + len) * sizeof (CHAR) > page_size)
> +    return;
> +
> +  json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "length", len);
> +  json_attr_uint (json_ctx, "alignment", align);
> +  json_attr_int (json_ctx, "char1", c1);
> +  json_attr_int (json_ctx, "char2", c2);
> +  json_array_begin (json_ctx, "timings");
> +
> +  FOR_EACH_IMPL (impl, 0)
> +    {
> +      do_one_test (json_ctx, impl, (CHAR *) (buf1) + align, c1, c2, len);
> +      alloc_bufs ();
> +    }
> +
> +  json_array_end (json_ctx);
> +  json_element_object_end (json_ctx);
> +}
> +
> +int
> +test_main (void)
> +{
> +  json_ctx_t json_ctx;
> +  size_t i;
> +  int c1, c2;
> +
> +  test_init ();
> +
> +  json_init (&json_ctx, 0, stdout);
> +
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, TEST_NAME);
> +  json_attr_string (&json_ctx, "bench-variant", "zerofill");
> +
> +  json_array_begin (&json_ctx, "ifuncs");
> +  FOR_EACH_IMPL (impl, 0)
> +    json_element_string (&json_ctx, impl->name);
> +  json_array_end (&json_ctx);
> +
> +  json_array_begin (&json_ctx, "results");
> +
> +  for (c1 = 0; c1 < 2; c1++)
> +    for (c2 = 0; c2 < 2; c2++)
> +      for (i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
> +	{
> +	  do_test (&json_ctx, 0, c1, c2, i);
> +	  do_test (&json_ctx, 3, c1, c2, i);
> +	}
> +
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
> +
> +  return ret;
> +}
> +
> +#include <support/test-driver.c>
> +
> +#define libc_hidden_builtin_def(X)
> +#define libc_hidden_def(X)
> +#define libc_hidden_weak(X)
> +#define weak_alias(X,Y)
> +#undef MEMSET
> +#define MEMSET generic_memset
> +#include <string/memset.c>
> --
> 2.17.1


^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v3 1/5] benchtests: Enable scripts/plot_strings.py to read stdin
  2021-09-08  1:46         ` naohirot--- via Libc-alpha
@ 2021-09-08 12:56           ` Siddhesh Poyarekar
  2021-09-09  0:22             ` naohirot--- via Libc-alpha
  0 siblings, 1 reply; 83+ messages in thread
From: Siddhesh Poyarekar @ 2021-09-08 12:56 UTC (permalink / raw)
  To: naohirot@fujitsu.com, libc-alpha@sourceware.org

On 9/8/21 7:16 AM, naohirot@fujitsu.com wrote:
> Hi Siddhesh, Thank you for the review comment.
> 
> Hi all, is there any other comment?
> https://sourceware.org/pipermail/libc-alpha/2021-August/129838.html
> 

I approved the patch with this...

>>
>> Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
>>

You may push this patch (you may still need review for other patches)  
right away if you want.

Siddhesh

^ permalink raw reply	[flat|nested] 83+ messages in thread

* RE: [PATCH v3 1/5] benchtests: Enable scripts/plot_strings.py to read stdin
  2021-09-08 12:56           ` Siddhesh Poyarekar
@ 2021-09-09  0:22             ` naohirot--- via Libc-alpha
  2021-09-13  3:45               ` Siddhesh Poyarekar
  0 siblings, 1 reply; 83+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-09-09  0:22 UTC (permalink / raw)
  To: 'Siddhesh Poyarekar', libc-alpha@sourceware.org

Hi Siddhesh,

> From: Siddhesh Poyarekar <siddhesh@gotplt.org>
> Sent: Wednesday, September 8, 2021 9:56 PM
> 
> On 9/8/21 7:16 AM, naohirot@fujitsu.com wrote:
> > Hi Siddhesh, Thank you for the review comment.
> >
> > Hi all, is there any other comment?
> > https://sourceware.org/pipermail/libc-alpha/2021-August/129838.html
> >
> 
> I approved the patch with this...
> 
> >>
> >> Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
> >>

I see. Thanks!

> You may push this patch (you may still need review for other patches)
> right away if you want.

OK, could you please merge it for me? I don't have the access rights.
And could you review the other patches of this series too?
https://sourceware.org/pipermail/libc-alpha/2021-August/129841.html
https://sourceware.org/pipermail/libc-alpha/2021-August/129840.html
https://sourceware.org/pipermail/libc-alpha/2021-August/129839.html

Thanks.
Naohiro


^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v3 2/5] benchtests: Add memset zero fill benchtest
  2021-08-05  7:50     ` [PATCH v3 2/5] benchtests: Add memset zero fill benchtest Naohiro Tamura via Libc-alpha
  2021-09-08  2:03       ` naohirot--- via Libc-alpha
@ 2021-09-10 20:40       ` Lucas A. M. Magalhaes via Libc-alpha
  2021-09-13  0:53         ` naohirot--- via Libc-alpha
  1 sibling, 1 reply; 83+ messages in thread
From: Lucas A. M. Magalhaes via Libc-alpha @ 2021-09-10 20:40 UTC (permalink / raw)
  To: Naohiro Tamura, Noah Goldstein, Wilco Dijkstra, libc-alpha

Hi Naohiro,

Thanks for working on this. Please, correct me if I'm wrong but I guess you sent
an old version by mistake. This patch is lacking the bench-variant
implementations mentioned on the commit message.

---
Lucas A. M. Magalhães

Quoting Naohiro Tamura (2021-08-05 04:50:53)
> Memset takes 0 as the second parameter in most cases.
> However, we cannot measure the zero fill performance by
> bench-memset.c, bench-memset-large.c and bench-memset-walk.c
> precisely.
> X86_64 micro-architecture has some zero-over-zero optimization, and
> AArch64 micro-architecture also has some optimization for DC ZVA
> instruction.
> This patch provides bench-memset-zerofill.c which is suitable to
> analyze the zero fill performance by comparing among 4 patterns,
> zero-over-zero, zero-over-one, one-over-zero and one-over-one, from
> 256B to 64MB(RAM) through L1, L2 and L3 caches.
> 
> The following commands are examples to analyze a JSON output,
> bench-memset-zerofill.out, by 'jq' and 'plot_strings.py'.
> 
> 1) compare zero-over-zero performance
> 
> $ cat bench-memset-zerofill.out | \
>   jq -r '
>     .functions.memset."bench-variant"="zerofill-0o0" |
>     del(.functions.memset.results[] | select(.char1 != 0 or .char2 != 0))
>   ' | \
>   plot_strings.py -l -p thru -v -
> 
> 2) compare zero performance
> 
> $ cat bench-memset-zerofill.out | \
>   jq -r '
>     .functions.memset."bench-variant"="zerofill-zero" |
>     del(.functions.memset.results[] | select(.char2 != 0))
>   ' | \
>   plot_strings.py -l -p thru -v -
> 
> 3) compare nonzero performance
> 
> $ cat bench-memset-zerofill.out | \
>   jq -r '
>     .functions.memset."bench-variant"="zerofill-nonzero" |
>     del(.functions.memset.results[] | select(.char2 == 0))
>   ' | \
>   plot_strings.py -l -p thru -v -
> ---
>  benchtests/Makefile                |   2 +-
>  benchtests/bench-memset-zerofill.c | 134 +++++++++++++++++++++++++++++
>  2 files changed, 135 insertions(+), 1 deletion(-)
>  create mode 100644 benchtests/bench-memset-zerofill.c
> 
> diff --git a/benchtests/Makefile b/benchtests/Makefile
> index 1530939a8ce8..21b95c736190 100644
> --- a/benchtests/Makefile
> +++ b/benchtests/Makefile
> @@ -53,7 +53,7 @@ string-benchset := memccpy memchr memcmp memcpy memmem memmove \
>                    strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
>                    strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
>                    strcoll memcpy-large memcpy-random memmove-large memset-large \
> -                  memcpy-walk memset-walk memmove-walk
> +                  memcpy-walk memset-walk memmove-walk memset-zerofill
>  
>  # Build and run locale-dependent benchmarks only if we're building natively.
>  ifeq (no,$(cross-compiling))
> diff --git a/benchtests/bench-memset-zerofill.c b/benchtests/bench-memset-zerofill.c
> new file mode 100644
> index 000000000000..7aa7fe048574
> --- /dev/null
> +++ b/benchtests/bench-memset-zerofill.c
> @@ -0,0 +1,134 @@
> +/* Measure memset functions with zero fill data.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define TEST_MAIN
> +#define TEST_NAME "memset"
> +#define START_SIZE 256
> +#define MIN_PAGE_SIZE (getpagesize () + 64 * 1024 * 1024)
> +#define TIMEOUT (20 * 60)
> +#include "bench-string.h"
> +
> +#include "json-lib.h"
> +
> +void *generic_memset (void *, int, size_t);
> +typedef void *(*proto_t) (void *, int, size_t);
> +
> +IMPL (MEMSET, 1)
> +IMPL (generic_memset, 0)
> +
> +static void
> +__attribute__((noinline, noclone))
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
> +            int c1 __attribute ((unused)), int c2 __attribute ((unused)),
> +            size_t n)
> +{
> +  size_t i, iters = 32;
> +  timing_t start, stop, cur, latency = 0;
> +
> +  CALL (impl, s, c2, n); // warm up
> +
> +  for (i = 0; i < iters; i++)
> +    {
> +      memset (s, c1, n); // alternation
> +
> +      TIMING_NOW (start);
> +
> +      CALL (impl, s, c2, n);
> +
> +      TIMING_NOW (stop);
> +      TIMING_DIFF (cur, start, stop);
> +      TIMING_ACCUM (latency, cur);
> +    }
> +
> +  json_element_double (json_ctx, (double) latency / (double) iters);
> +}
> +
> +static void
> +do_test (json_ctx_t *json_ctx, size_t align, int c1, int c2, size_t len)
> +{
> +  align &= getpagesize () - 1;
> +  if ((align + len) * sizeof (CHAR) > page_size)
> +    return;
> +
> +  json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "length", len);
> +  json_attr_uint (json_ctx, "alignment", align);
> +  json_attr_int (json_ctx, "char1", c1);
> +  json_attr_int (json_ctx, "char2", c2);
> +  json_array_begin (json_ctx, "timings");
> +
> +  FOR_EACH_IMPL (impl, 0)
> +    {
> +      do_one_test (json_ctx, impl, (CHAR *) (buf1) + align, c1, c2, len);
> +      alloc_bufs ();
> +    }
> +
> +  json_array_end (json_ctx);
> +  json_element_object_end (json_ctx);
> +}
> +
> +int
> +test_main (void)
> +{
> +  json_ctx_t json_ctx;
> +  size_t i;
> +  int c1, c2;
> +
> +  test_init ();
> +
> +  json_init (&json_ctx, 0, stdout);
> +
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, TEST_NAME);
> +  json_attr_string (&json_ctx, "bench-variant", "zerofill");
> +
> +  json_array_begin (&json_ctx, "ifuncs");
> +  FOR_EACH_IMPL (impl, 0)
> +    json_element_string (&json_ctx, impl->name);
> +  json_array_end (&json_ctx);
> +
> +  json_array_begin (&json_ctx, "results");
> +
> +  for (c1 = 0; c1 < 2; c1++)
> +    for (c2 = 0; c2 < 2; c2++)
> +      for (i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
> +       {
> +         do_test (&json_ctx, 0, c1, c2, i);
> +         do_test (&json_ctx, 3, c1, c2, i);
> +       }
> +
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
> +
> +  return ret;
> +}
> +
> +#include <support/test-driver.c>
> +
> +#define libc_hidden_builtin_def(X)
> +#define libc_hidden_def(X)
> +#define libc_hidden_weak(X)
> +#define weak_alias(X,Y)
> +#undef MEMSET
> +#define MEMSET generic_memset
> +#include <string/memset.c>
> -- 
> 2.17.1
>

^ permalink raw reply	[flat|nested] 83+ messages in thread

* RE: [PATCH v3 2/5] benchtests: Add memset zero fill benchtest
  2021-09-10 20:40       ` Lucas A. M. Magalhaes via Libc-alpha
@ 2021-09-13  0:53         ` naohirot--- via Libc-alpha
  2021-09-13 14:05           ` Lucas A. M. Magalhaes via Libc-alpha
  0 siblings, 1 reply; 83+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-09-13  0:53 UTC (permalink / raw)
  To: 'Lucas A. M. Magalhaes', Noah Goldstein, Wilco Dijkstra,
	libc-alpha@sourceware.org

Hi Lucas,

> From: Lucas A. M. Magalhaes <lamm@linux.ibm.com>
> Sent: Saturday, September 11, 2021 5:40 AM
> 
> Thanks for working on this. Please, correct me if I'm wrong but I guess you sent
> an old version by mistake. This patch is lacking the bench-variant
> implementations mentioned on the commit message.

Thank you for the comment!
I double checked the source code and confirmed it is the one I intended.
The 4 patterns are the combinations of the JSON attributes "char1" and "char2",
each of which takes the values 0 and 1.

zero-over-zero: char1=0, char2=0
zero-over-one: char1=0, char2=1
one-over-zero: char1=1, char2=0
one-over-one: char1=1, char2=1

I made a comment inline too.

BTW, could you review the patch "benchtests: Remove redundant assert.h" [1],
which applies your comment [2] to the other bench tests, if you have time?

[1] https://sourceware.org/pipermail/libc-alpha/2021-August/129840.html
[2] https://sourceware.org/pipermail/libc-alpha/2021-July/128989.html

> 
> Quoting Naohiro Tamura (2021-08-05 04:50:53)
> > Memset takes 0 as the second parameter in most cases.
> > However, we cannot measure the zero fill performance by
> > bench-memset.c, bench-memset-large.c and bench-memset-walk.c
> > precisely.
> > X86_64 micro-architecture has some zero-over-zero optimization, and
> > AArch64 micro-architecture also has some optimization for DC ZVA
> > instruction.
> > This patch provides bench-memset-zerofill.c which is suitable to
> > analyze the zero fill performance by comparing among 4 patterns,
> > zero-over-zero, zero-over-one, one-over-zero and one-over-one, from
> > 256B to 64MB(RAM) through L1, L2 and L3 caches.
> >
> > The following commands are examples to analyze a JSON output,
> > bench-memset-zerofill.out, by 'jq' and 'plot_strings.py'.
> >
> > 1) compare zero-over-zero performance
> >
> > $ cat bench-memset-zerofill.out | \
> >   jq -r '
> >     .functions.memset."bench-variant"="zerofill-0o0" |
> >     del(.functions.memset.results[] | select(.char1 != 0 or .char2 != 0))
> >   ' | \
> >   plot_strings.py -l -p thru -v -
> >
> > 2) compare zero performance
> >
> > $ cat bench-memset-zerofill.out | \
> >   jq -r '
> >     .functions.memset."bench-variant"="zerofill-zero" |
> >     del(.functions.memset.results[] | select(.char2 != 0))
> >   ' | \
> >   plot_strings.py -l -p thru -v -
> >
> > 3) compare nonzero performance
> >
> > $ cat bench-memset-zerofill.out | \
> >   jq -r '
> >     .functions.memset."bench-variant"="zerofill-nonzero" |
> >     del(.functions.memset.results[] | select(.char2 == 0))
> >   ' | \
> >   plot_strings.py -l -p thru -v -
> > ---
> >  benchtests/Makefile                |   2 +-
> >  benchtests/bench-memset-zerofill.c | 134 +++++++++++++++++++++++++++++
> >  2 files changed, 135 insertions(+), 1 deletion(-)
> >  create mode 100644 benchtests/bench-memset-zerofill.c
> >
> > diff --git a/benchtests/Makefile b/benchtests/Makefile
> > index 1530939a8ce8..21b95c736190 100644
> > --- a/benchtests/Makefile
> > +++ b/benchtests/Makefile
> > @@ -53,7 +53,7 @@ string-benchset := memccpy memchr memcmp memcpy memmem memmove \
> >                    strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
> >                    strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
> >                    strcoll memcpy-large memcpy-random memmove-large memset-large \
> > -                  memcpy-walk memset-walk memmove-walk
> > +                  memcpy-walk memset-walk memmove-walk memset-zerofill
> >
> >  # Build and run locale-dependent benchmarks only if we're building natively.
> >  ifeq (no,$(cross-compiling))
> > diff --git a/benchtests/bench-memset-zerofill.c b/benchtests/bench-memset-zerofill.c
> > new file mode 100644
> > index 000000000000..7aa7fe048574
> > --- /dev/null
> > +++ b/benchtests/bench-memset-zerofill.c
> > @@ -0,0 +1,134 @@
> > +/* Measure memset functions with zero fill data.
> > +   Copyright (C) 2021 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#define TEST_MAIN
> > +#define TEST_NAME "memset"
> > +#define START_SIZE 256
> > +#define MIN_PAGE_SIZE (getpagesize () + 64 * 1024 * 1024)
> > +#define TIMEOUT (20 * 60)
> > +#include "bench-string.h"
> > +
> > +#include "json-lib.h"
> > +
> > +void *generic_memset (void *, int, size_t);
> > +typedef void *(*proto_t) (void *, int, size_t);
> > +
> > +IMPL (MEMSET, 1)
> > +IMPL (generic_memset, 0)
> > +
> > +static void
> > +__attribute__((noinline, noclone))
> > +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
> > +            int c1 __attribute ((unused)), int c2 __attribute ((unused)),
> > +            size_t n)
> > +{
> > +  size_t i, iters = 32;
> > +  timing_t start, stop, cur, latency = 0;
> > +
> > +  CALL (impl, s, c2, n); // warm up
> > +
> > +  for (i = 0; i < iters; i++)
> > +    {
> > +      memset (s, c1, n); // alternation
> > +
> > +      TIMING_NOW (start);
> > +
> > +      CALL (impl, s, c2, n);
> > +
> > +      TIMING_NOW (stop);
> > +      TIMING_DIFF (cur, start, stop);
> > +      TIMING_ACCUM (latency, cur);
> > +    }
> > +
> > +  json_element_double (json_ctx, (double) latency / (double) iters);
> > +}
> > +
> > +static void
> > +do_test (json_ctx_t *json_ctx, size_t align, int c1, int c2, size_t len)
> > +{
> > +  align &= getpagesize () - 1;
> > +  if ((align + len) * sizeof (CHAR) > page_size)
> > +    return;
> > +
> > +  json_element_object_begin (json_ctx);
> > +  json_attr_uint (json_ctx, "length", len);
> > +  json_attr_uint (json_ctx, "alignment", align);
> > +  json_attr_int (json_ctx, "char1", c1);
> > +  json_attr_int (json_ctx, "char2", c2);
> > +  json_array_begin (json_ctx, "timings");
> > +
> > +  FOR_EACH_IMPL (impl, 0)
> > +    {
> > +      do_one_test (json_ctx, impl, (CHAR *) (buf1) + align, c1, c2, len);
> > +      alloc_bufs ();
> > +    }
> > +
> > +  json_array_end (json_ctx);
> > +  json_element_object_end (json_ctx);
> > +}
> > +
> > +int
> > +test_main (void)
> > +{
> > +  json_ctx_t json_ctx;
> > +  size_t i;
> > +  int c1, c2;
> > +
> > +  test_init ();
> > +
> > +  json_init (&json_ctx, 0, stdout);
> > +
> > +  json_document_begin (&json_ctx);
> > +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> > +
> > +  json_attr_object_begin (&json_ctx, "functions");
> > +  json_attr_object_begin (&json_ctx, TEST_NAME);
> > +  json_attr_string (&json_ctx, "bench-variant", "zerofill");
> > +
> > +  json_array_begin (&json_ctx, "ifuncs");
> > +  FOR_EACH_IMPL (impl, 0)
> > +    json_element_string (&json_ctx, impl->name);
> > +  json_array_end (&json_ctx);
> > +
> > +  json_array_begin (&json_ctx, "results");
> > +
> > +  for (c1 = 0; c1 < 2; c1++)
> > +    for (c2 = 0; c2 < 2; c2++)
> > +      for (i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
> > +       {

Creating 4 patterns here.

Thanks.
Naohiro

> > +         do_test (&json_ctx, 0, c1, c2, i);
> > +         do_test (&json_ctx, 3, c1, c2, i);
> > +       }
> > +
> > +  json_array_end (&json_ctx);
> > +  json_attr_object_end (&json_ctx);
> > +  json_attr_object_end (&json_ctx);
> > +  json_document_end (&json_ctx);
> > +
> > +  return ret;
> > +}
> > +
> > +#include <support/test-driver.c>
> > +
> > +#define libc_hidden_builtin_def(X)
> > +#define libc_hidden_def(X)
> > +#define libc_hidden_weak(X)
> > +#define weak_alias(X,Y)
> > +#undef MEMSET
> > +#define MEMSET generic_memset
> > +#include <string/memset.c>
> > --
> > 2.17.1
> >

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v3 3/5] benchtests: Remove redundant assert.h
  2021-08-05  7:51     ` [PATCH v3 3/5] benchtests: Remove redundant assert.h Naohiro Tamura via Libc-alpha
  2021-09-08  1:59       ` naohirot--- via Libc-alpha
@ 2021-09-13  3:36       ` Siddhesh Poyarekar
  1 sibling, 0 replies; 83+ messages in thread
From: Siddhesh Poyarekar @ 2021-09-13  3:36 UTC (permalink / raw)
  To: Naohiro Tamura, Lucas A. M. Magalhaes, libc-alpha

On 8/5/21 1:21 PM, Naohiro Tamura via Libc-alpha wrote:
> This patch removed redundant "#include <assert.h>" from
> bench-memset-large.c and bench-memset-walk.c.
> ---
>   benchtests/bench-memset-large.c | 1 -
>   benchtests/bench-memset-walk.c  | 1 -
>   2 files changed, 2 deletions(-)

LGTM.

Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v3 4/5] benchtests: Fix validate_benchout.py exceptions
  2021-08-05  7:51     ` [PATCH v3 4/5] benchtests: Fix validate_benchout.py exceptions Naohiro Tamura via Libc-alpha
  2021-09-08  1:55       ` naohirot--- via Libc-alpha
@ 2021-09-13  3:42       ` Siddhesh Poyarekar
  2021-09-13  3:50         ` Siddhesh Poyarekar
  1 sibling, 1 reply; 83+ messages in thread
From: Siddhesh Poyarekar @ 2021-09-13  3:42 UTC (permalink / raw)
  To: Naohiro Tamura, libc-alpha

On 8/5/21 1:21 PM, Naohiro Tamura via Libc-alpha wrote:
> This patch fixed validate_benchout.py two exceptions, AttributeError
> if benchout_strings.schema.json is specified and
> json.decoder.JSONDecodeError if benchout is not JSON.
> ---
>   benchtests/scripts/import_bench.py      | 5 ++++-
>   benchtests/scripts/validate_benchout.py | 6 +++++-
>   2 files changed, 9 insertions(+), 2 deletions(-)
> 
> diff --git a/benchtests/scripts/import_bench.py b/benchtests/scripts/import_bench.py
> index a799b4e1b7dc..e3337ca5d638 100644
> --- a/benchtests/scripts/import_bench.py
> +++ b/benchtests/scripts/import_bench.py
> @@ -104,7 +104,10 @@ def do_for_all_timings(bench, callback):
>       """
>       for func in bench['functions'].keys():
>           for k in bench['functions'][func].keys():
> -            if 'timings' not in bench['functions'][func][k].keys():
> +            try:
> +                if 'timings' not in bench['functions'][func][k].keys():
> +                    continue
> +            except AttributeError:
>                   continue

When do you get an AttributeError here?

>   
>               callback(bench, func, k)
> diff --git a/benchtests/scripts/validate_benchout.py b/benchtests/scripts/validate_benchout.py
> index 47df33ed0252..00d5fa0ee5eb 100755
> --- a/benchtests/scripts/validate_benchout.py
> +++ b/benchtests/scripts/validate_benchout.py
> @@ -73,11 +73,15 @@ def main(args):
>   
>       except bench.validator.ValidationError as e:
>           return print_and_exit("Invalid benchmark output: %s" % e.message,
> -            os.EX_DATAERR)
> +                os.EX_DATAERR)
>   
>       except bench.validator.SchemaError as e:
>           return print_and_exit("Invalid schema: %s" % e.message, os.EX_DATAERR)
>   
> +    except json.decoder.JSONDecodeError as e:
> +        return print_and_exit("Benchmark output in %s is not JSON." % args[0],
> +                os.EX_DATAERR)
> +
>       print("Benchmark output in %s is valid." % args[0])
>       return os.EX_OK
>   
> 


^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v3 1/5] benchtests: Enable scripts/plot_strings.py to read stdin
  2021-09-09  0:22             ` naohirot--- via Libc-alpha
@ 2021-09-13  3:45               ` Siddhesh Poyarekar
  0 siblings, 0 replies; 83+ messages in thread
From: Siddhesh Poyarekar @ 2021-09-13  3:45 UTC (permalink / raw)
  To: naohirot@fujitsu.com, libc-alpha@sourceware.org

On 9/9/21 5:52 AM, naohirot@fujitsu.com wrote:
> OK please merge it for me? I don't have the access right.

I've merged this and 3/5.

Siddhesh

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v3 4/5] benchtests: Fix validate_benchout.py exceptions
  2021-09-13  3:42       ` Siddhesh Poyarekar
@ 2021-09-13  3:50         ` Siddhesh Poyarekar
  2021-09-13 13:44           ` [PATCH v4] " Naohiro Tamura via Libc-alpha
  2021-09-13 13:46           ` [PATCH v3 4/5] " naohirot--- via Libc-alpha
  0 siblings, 2 replies; 83+ messages in thread
From: Siddhesh Poyarekar @ 2021-09-13  3:50 UTC (permalink / raw)
  To: Naohiro Tamura, libc-alpha

On 9/13/21 9:12 AM, Siddhesh Poyarekar wrote:
>> --- a/benchtests/scripts/import_bench.py
>> +++ b/benchtests/scripts/import_bench.py
>> @@ -104,7 +104,10 @@ def do_for_all_timings(bench, callback):
>>       """
>>       for func in bench['functions'].keys():
>>           for k in bench['functions'][func].keys():
>> -            if 'timings' not in bench['functions'][func][k].keys():
>> +            try:
>> +                if 'timings' not in bench['functions'][func][k].keys():
>> +                    continue
>> +            except AttributeError:
>>                   continue
> 
> When do you get an AttributeError here?
> 

OK the one possibility I can think of is when 
bench['functions'][func][k] is None.  This implies the existence of a 
benchmark output that has a function variant without any inputs and 
hence, without any benchmark data.  That should be invalid, in which 
case the benchmark should be fixed, not the validator.

Thanks,
Siddhesh

^ permalink raw reply	[flat|nested] 83+ messages in thread

* [PATCH v4] benchtests: Fix validate_benchout.py exceptions
  2021-09-13  3:50         ` Siddhesh Poyarekar
@ 2021-09-13 13:44           ` Naohiro Tamura via Libc-alpha
  2021-09-15  3:23             ` Siddhesh Poyarekar
  2021-09-13 13:46           ` [PATCH v3 4/5] " naohirot--- via Libc-alpha
  1 sibling, 1 reply; 83+ messages in thread
From: Naohiro Tamura via Libc-alpha @ 2021-09-13 13:44 UTC (permalink / raw)
  To: Siddhesh Poyarekar, libc-alpha

This patch fixes two exceptions in validate_benchout.py: AttributeError
if benchout_strings.schema.json is specified, and
json.decoder.JSONDecodeError if benchout is not JSON.

AttributeError unconditionally occurs with a correct JSON benchout
file such as below because, in the code
"bench['functions'][func][k].keys()", k is either "bench-variant",
"ifuncs", or "results", and none of the corresponding values has keys().

$ ~/glibc/benchtests/scripts/validate_benchout.py bench-memcpy.out \
  ~/glibc/benchtests/scripts/benchout_strings.schema.json
Traceback (most recent call last):
  File "/home/naohirot/work/github/glibc/benchtests/scripts/validate_benchout.py", line 86, in <module>
    sys.exit(main(sys.argv[1:]))
  File "/home/naohirot/work/github/glibc/benchtests/scripts/validate_benchout.py", line 69, in main
    bench.parse_bench(args[0], args[1])
  File "/home/naohirot/work/github/glibc/benchtests/scripts/import_bench.py", line 139, in parse_bench
    do_for_all_timings(bench, lambda b, f, v:
  File "/home/naohirot/work/github/glibc/benchtests/scripts/import_bench.py", line 107, in do_for_all_timings
    if 'timings' not in bench['functions'][func][k].keys():
AttributeError: 'str' object has no attribute 'keys'

$ cat bench-memcpy.out
  1 {
  2  "timing_type": "hp_timing",
  3  "functions": {
  4   "memcpy": {
  5    "bench-variant": "default",
  6    "ifuncs": ["generic_memcpy", "__memcpy_thunderx", "__memcpy_thunderx2", "__memcpy_falkor", "__memcpy_simd", "__memcpy_a64fx", "__memcpy_generic"],
  7    "results": [
  8     {
  9      "length": 1,
 10      "align1": 0,
 11      "align2": 0,
 12      "dst > src": 0,
 13      "timings": [10.9326, 11.0449, 11.5515, 13.5693, 11.5198, 6.77368, 11.5259]
 14     },
 ...
---
 benchtests/scripts/import_bench.py      | 17 +++++++++++------
 benchtests/scripts/validate_benchout.py |  6 +++++-
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/benchtests/scripts/import_bench.py b/benchtests/scripts/import_bench.py
index a799b4e1b7dc..f5e67570d4c5 100644
--- a/benchtests/scripts/import_bench.py
+++ b/benchtests/scripts/import_bench.py
@@ -101,13 +101,18 @@ def do_for_all_timings(bench, callback):
     Args:
         bench: The benchmark object
         callback: The callback function
+    Raises:
+        validator.exceptions.ValidationError: if 'timings' key not found
     """
     for func in bench['functions'].keys():
         for k in bench['functions'][func].keys():
-            if 'timings' not in bench['functions'][func][k].keys():
-                continue
-
-            callback(bench, func, k)
+            if k == 'results':
+                for r in range(len(bench['functions'][func][k])):
+                    if 'timings' not in bench['functions'][func][k][r].keys():
+                        raise validator.exceptions.ValidationError(
+                            "'timings' key not found")
+                    else:
+                        callback(bench, func, k, r)
 
 
 def compress_timings(points):
@@ -136,6 +141,6 @@ def parse_bench(filename, schema_filename):
         with open(filename, 'r') as benchfile:
             bench = json.load(benchfile)
             validator.validate(bench, schema)
-            do_for_all_timings(bench, lambda b, f, v:
-                    b['functions'][f][v]['timings'].sort())
+            do_for_all_timings(bench, lambda b, f, v, r:
+                    b['functions'][f][v][r]['timings'].sort())
             return bench
diff --git a/benchtests/scripts/validate_benchout.py b/benchtests/scripts/validate_benchout.py
index 47df33ed0252..00d5fa0ee5eb 100755
--- a/benchtests/scripts/validate_benchout.py
+++ b/benchtests/scripts/validate_benchout.py
@@ -73,11 +73,15 @@ def main(args):
 
     except bench.validator.ValidationError as e:
         return print_and_exit("Invalid benchmark output: %s" % e.message,
-            os.EX_DATAERR)
+                os.EX_DATAERR)
 
     except bench.validator.SchemaError as e:
         return print_and_exit("Invalid schema: %s" % e.message, os.EX_DATAERR)
 
+    except json.decoder.JSONDecodeError as e:
+        return print_and_exit("Benchmark output in %s is not JSON." % args[0],
+                os.EX_DATAERR)
+
     print("Benchmark output in %s is valid." % args[0])
     return os.EX_OK
 
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 83+ messages in thread

* Re: [PATCH v3 4/5] benchtests: Fix validate_benchout.py exceptions
  2021-09-13  3:50         ` Siddhesh Poyarekar
  2021-09-13 13:44           ` [PATCH v4] " Naohiro Tamura via Libc-alpha
@ 2021-09-13 13:46           ` naohirot--- via Libc-alpha
  1 sibling, 0 replies; 83+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-09-13 13:46 UTC (permalink / raw)
  To: Siddhesh Poyarekar, libc-alpha@sourceware.org

Hi Siddhesh,

Thank you for the two merges and this review.

> From: Siddhesh Poyarekar <siddhesh@gotplt.org>
> Sent: Monday, 13 September 2021 12:50
> On 9/13/21 9:12 AM, Siddhesh Poyarekar wrote:
> >> --- a/benchtests/scripts/import_bench.py
> >> +++ b/benchtests/scripts/import_bench.py
> >> @@ -104,7 +104,10 @@ def do_for_all_timings(bench, callback):
> >>       """
> >>       for func in bench['functions'].keys():
> >>           for k in bench['functions'][func].keys():
> >> -            if 'timings' not in bench['functions'][func][k].keys():
> >> +            try:
> >> +                if 'timings' not in bench['functions'][func][k].keys():
> >> +                    continue
> >> +            except AttributeError:
> >>                   continue
> >
> > When do you get an AttributeError here?
> >
> 
> OK the one possibility I can think of is when
> bench['functions'][func][k] is None.  This implies the existence of a
> benchmark output that has a function variant without any inputs and
> hence, without any benchmark data.  That should be invalid, in which
> case the benchmark should be fixed, not the validator.

AttributeError unconditionally occurs with a correct JSON benchout
file such as below because, in the code
"bench['functions'][func][k].keys()", k is either "bench-variant",
"ifuncs", or "results", and none of the corresponding values has keys().

$ ~/glibc/benchtests/scripts/validate_benchout.py bench-memcpy.out \
  ~/glibc/benchtests/scripts/benchout_strings.schema.json
Traceback (most recent call last):
  File "/home/naohirot/work/github/glibc/benchtests/scripts/validate_benchout.py", line 86, in <module>
    sys.exit(main(sys.argv[1:]))
  File "/home/naohirot/work/github/glibc/benchtests/scripts/validate_benchout.py", line 69, in main
    bench.parse_bench(args[0], args[1])
  File "/home/naohirot/work/github/glibc/benchtests/scripts/import_bench.py", line 139, in parse_bench
    do_for_all_timings(bench, lambda b, f, v:
  File "/home/naohirot/work/github/glibc/benchtests/scripts/import_bench.py", line 107, in do_for_all_timings
    if 'timings' not in bench['functions'][func][k].keys():
AttributeError: 'str' object has no attribute 'keys'

$ cat bench-memcpy.out
  1 {
  2  "timing_type": "hp_timing",
  3  "functions": {
  4   "memcpy": {
  5    "bench-variant": "default",
  6    "ifuncs": ["generic_memcpy", "__memcpy_thunderx", "__memcpy_thunderx2", "__memcpy_falkor", "__memcpy_simd", "__memcpy_a64fx", "__memcpy_generic"],
  7    "results": [
  8     {
  9      "length": 1,
 10      "align1": 0,
 11      "align2": 0,
 12      "dst > src": 0,
 13      "timings": [10.9326, 11.0449, 11.5515, 13.5693, 11.5198, 6.77368, 11.5259]
 14     },
 ...
 
I found out that the implementation is not right, and sent V4 patch [1].
Please find it.

[1] https://sourceware.org/pipermail/libc-alpha/2021-September/130915.html

Thanks.
Naohiro


^ permalink raw reply	[flat|nested] 83+ messages in thread

* RE: [PATCH v3 2/5] benchtests: Add memset zero fill benchtest
  2021-09-13  0:53         ` naohirot--- via Libc-alpha
@ 2021-09-13 14:05           ` Lucas A. M. Magalhaes via Libc-alpha
  2021-09-14  0:38             ` [PATCH v4] " Naohiro Tamura via Libc-alpha
  2021-09-14  0:44             ` [PATCH v3 2/5] " naohirot--- via Libc-alpha
  0 siblings, 2 replies; 83+ messages in thread
From: Lucas A. M. Magalhaes via Libc-alpha @ 2021-09-13 14:05 UTC (permalink / raw)
  To: Noah Goldstein, Wilco Dijkstra, libc-alpha, naohirot@fujitsu.com

Quoting naohirot@fujitsu.com (2021-09-12 21:53:22)
> Hi Lucas,
> 
> > From: Lucas A. M. Magalhaes <lamm@linux.ibm.com>
> > Sent: Saturday, September 11, 2021 5:40 AM
> > 
> > Thanks for working on this. Please, correct me if I'm wrong but I guess you sent
> > an old version by mistake. This patch is lacking the bench-variant
> > implementations mentioned on the commit message.
> 
> Thank you for the comment!
> I double checked the source code and confirmed it is the one I intended.
> 4 patterns are combination of json attribute "char1" and "char2".
> "char1" and "char2" varies 0 and 1 respectively.
> 
> zero-over-zero: char1=0, char2=0
> zero-over-one: char1=0, char2=1
> one-over-zero: char1=1, char2=0
> one-over-one: char1=1, char2=1
> 
> I made a comment inline too.
> 

Thanks for clarifying, now I get it. Could you please add a comment in the
code explaining these patterns and the reason behind them?

With that said this patch LGTM.

> BTW, could you review the patch "benchtests: Remove redundant assert.h" [1]
> that is reflected your comment [2] to other bench tests if you had time?
> 
> [1] https://sourceware.org/pipermail/libc-alpha/2021-August/129840.html
> [2] https://sourceware.org/pipermail/libc-alpha/2021-July/128989.html
> 
> > 
> > Quoting Naohiro Tamura (2021-08-05 04:50:53)
> > > Memset takes 0 as the second parameter in most cases.
> > > However, we cannot measure the zero fill performance by
> > > bench-memset.c, bench-memset-large.c and bench-memset-walk.c
> > > precisely.
> > > X86_64 micro-architecture has some zero-over-zero optimization, and
> > > AArch64 micro-architecture also has some optimization for DC ZVA
> > > instruction.
> > > This patch provides bench-memset-zerofill.c which is suitable to
> > > analyze the zero fill performance by comparing among 4 patterns,
> > > zero-over-zero, zero-over-one, one-over-zero and one-over-one, from
> > > 256B to 64MB(RAM) through L1, L2 and L3 caches.
> > >
> > > The following commands are examples to analyze a JSON output,
> > > bench-memset-zerofill.out, by 'jq' and 'plot_strings.py'.
> > >
> > > 1) compare zero-over-zero performance
> > >
> > > $ cat bench-memset-zerofill.out | \
> > >   jq -r '
> > >     .functions.memset."bench-variant"="zerofill-0o0" |
> > >     del(.functions.memset.results[] | select(.char1 != 0 or .char2 != 0))
> > >   ' | \
> > >   plot_strings.py -l -p thru -v -
> > >
> > > > 2) compare zero performance
> > >
> > > $ cat bench-memset-zerofill.out | \
> > >   jq -r '
> > >     .functions.memset."bench-variant"="zerofill-zero" |
> > >     del(.functions.memset.results[] | select(.char2 != 0))
> > >   ' | \
> > >   plot_strings.py -l -p thru -v -
> > >
> > > > 3) compare nonzero performance
> > >
> > > $ cat bench-memset-zerofill.out | \
> > >   jq -r '
> > >     .functions.memset."bench-variant"="zerofill-nonzero" |
> > >     del(.functions.memset.results[] | select(.char2 == 0))
> > >   ' | \
> > >   plot_strings.py -l -p thru -v -
> > > ---
> > >  benchtests/Makefile                |   2 +-
> > >  benchtests/bench-memset-zerofill.c | 134 +++++++++++++++++++++++++++++
> > >  2 files changed, 135 insertions(+), 1 deletion(-)
> > >  create mode 100644 benchtests/bench-memset-zerofill.c
> > >
> > > diff --git a/benchtests/Makefile b/benchtests/Makefile
> > > index 1530939a8ce8..21b95c736190 100644
> > > --- a/benchtests/Makefile
> > > +++ b/benchtests/Makefile
> > > @@ -53,7 +53,7 @@ string-benchset := memccpy memchr memcmp memcpy memmem memmove \
> > >                    strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
> > >                    strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
> > >                    strcoll memcpy-large memcpy-random memmove-large memset-large \
> > > -                  memcpy-walk memset-walk memmove-walk
> > > +                  memcpy-walk memset-walk memmove-walk memset-zerofill
> > >
> > >  # Build and run locale-dependent benchmarks only if we're building natively.
> > >  ifeq (no,$(cross-compiling))
> > > diff --git a/benchtests/bench-memset-zerofill.c b/benchtests/bench-memset-zerofill.c
> > > new file mode 100644
> > > index 000000000000..7aa7fe048574
> > > --- /dev/null
> > > +++ b/benchtests/bench-memset-zerofill.c
> > > @@ -0,0 +1,134 @@
> > > +/* Measure memset functions with zero fill data.
> > > +   Copyright (C) 2021 Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <https://www.gnu.org/licenses/>.  */
> > > +
> > > +#define TEST_MAIN
> > > +#define TEST_NAME "memset"
> > > +#define START_SIZE 256
> > > +#define MIN_PAGE_SIZE (getpagesize () + 64 * 1024 * 1024)
> > > +#define TIMEOUT (20 * 60)
> > > +#include "bench-string.h"
> > > +
> > > +#include "json-lib.h"
> > > +
> > > +void *generic_memset (void *, int, size_t);
> > > +typedef void *(*proto_t) (void *, int, size_t);
> > > +
> > > +IMPL (MEMSET, 1)
> > > +IMPL (generic_memset, 0)
> > > +
> > > +static void
> > > +__attribute__((noinline, noclone))
> > > +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
> > > +            int c1 __attribute ((unused)), int c2 __attribute ((unused)),
> > > +            size_t n)
> > > +{
> > > +  size_t i, iters = 32;
> > > +  timing_t start, stop, cur, latency = 0;
> > > +
> > > +  CALL (impl, s, c2, n); // warm up
> > > +
> > > +  for (i = 0; i < iters; i++)
> > > +    {
> > > +      memset (s, c1, n); // alternation
> > > +
> > > +      TIMING_NOW (start);
> > > +
> > > +      CALL (impl, s, c2, n);
> > > +
> > > +      TIMING_NOW (stop);
> > > +      TIMING_DIFF (cur, start, stop);
> > > +      TIMING_ACCUM (latency, cur);
> > > +    }
> > > +
> > > +  json_element_double (json_ctx, (double) latency / (double) iters);
> > > +}
> > > +
Ok.

> > > +static void
> > > +do_test (json_ctx_t *json_ctx, size_t align, int c1, int c2, size_t len)
> > > +{
> > > +  align &= getpagesize () - 1;
> > > +  if ((align + len) * sizeof (CHAR) > page_size)
> > > +    return;
> > > +
> > > +  json_element_object_begin (json_ctx);
> > > +  json_attr_uint (json_ctx, "length", len);
> > > +  json_attr_uint (json_ctx, "alignment", align);
> > > +  json_attr_int (json_ctx, "char1", c1);
> > > +  json_attr_int (json_ctx, "char2", c2);
> > > +  json_array_begin (json_ctx, "timings");
> > > +
> > > +  FOR_EACH_IMPL (impl, 0)
> > > +    {
> > > +      do_one_test (json_ctx, impl, (CHAR *) (buf1) + align, c1, c2, len);
> > > +      alloc_bufs ();
> > > +    }
> > > +
> > > +  json_array_end (json_ctx);
> > > +  json_element_object_end (json_ctx);
> > > +}
Ok.

> > > +
> > > +int
> > > +test_main (void)
> > > +{
> > > +  json_ctx_t json_ctx;
> > > +  size_t i;
> > > +  int c1, c2;
> > > +
> > > +  test_init ();
> > > +
> > > +  json_init (&json_ctx, 0, stdout);
> > > +
> > > +  json_document_begin (&json_ctx);
> > > +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> > > +
> > > +  json_attr_object_begin (&json_ctx, "functions");
> > > +  json_attr_object_begin (&json_ctx, TEST_NAME);
> > > +  json_attr_string (&json_ctx, "bench-variant", "zerofill");
> > > +
> > > +  json_array_begin (&json_ctx, "ifuncs");
> > > +  FOR_EACH_IMPL (impl, 0)
> > > +    json_element_string (&json_ctx, impl->name);
> > > +  json_array_end (&json_ctx);
> > > +
> > > +  json_array_begin (&json_ctx, "results");
> > > +
> > > +  for (c1 = 0; c1 < 2; c1++)
> > > +    for (c2 = 0; c2 < 2; c2++)
> > > +      for (i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
> > > +       {
> > > +         do_test (&json_ctx, 0, c1, c2, i);
> > > +         do_test (&json_ctx, 3, c1, c2, i);
> > > +       }
> > > +
> > > +  json_array_end (&json_ctx);
> > > +  json_attr_object_end (&json_ctx);
> > > +  json_attr_object_end (&json_ctx);
> > > +  json_document_end (&json_ctx);
> > > +
> > > +  return ret;
> > > +}
Ok.

> > > +
> > > +#include <support/test-driver.c>
> > > +
> > > +#define libc_hidden_builtin_def(X)
> > > +#define libc_hidden_def(X)
> > > +#define libc_hidden_weak(X)
> > > +#define weak_alias(X,Y)
> > > +#undef MEMSET
> > > +#define MEMSET generic_memset
> > > +#include <string/memset.c>
> > > --
> > > 2.17.1
> > >

^ permalink raw reply	[flat|nested] 83+ messages in thread

* [PATCH v4] benchtests: Add memset zero fill benchtest
  2021-09-13 14:05           ` Lucas A. M. Magalhaes via Libc-alpha
@ 2021-09-14  0:38             ` Naohiro Tamura via Libc-alpha
  2021-09-14  0:44             ` [PATCH v3 2/5] " naohirot--- via Libc-alpha
  1 sibling, 0 replies; 83+ messages in thread
From: Naohiro Tamura via Libc-alpha @ 2021-09-14  0:38 UTC (permalink / raw)
  To: Lucas A. M. Magalhaes, Wilco Dijkstra, Noah Goldstein, libc-alpha

Memset takes 0 as the second parameter in most cases.
However, we cannot measure the zero fill performance by
bench-memset.c, bench-memset-large.c and bench-memset-walk.c
precisely.
X86_64 micro-architecture has some zero-over-zero optimization, and
AArch64 micro-architecture also has some optimization for DC ZVA
instruction.
This patch provides bench-memset-zerofill.c which is suitable to
analyze the zero fill performance by comparing among 4 patterns,
zero-over-zero, zero-over-one, one-over-zero and one-over-one, from
256B to 64MB(RAM) through L1, L2 and L3 caches.

The following commands are examples to analyze a JSON output,
bench-memset-zerofill.out, by 'jq' and 'plot_strings.py'.

1) compare zero-over-zero performance

$ cat bench-memset-zerofill.out | \
  jq -r '
    .functions.memset."bench-variant"="zerofill-0o0" |
    del(.functions.memset.results[] | select(.char1 != 0 or .char2 != 0))
  ' | \
  plot_strings.py -l -p thru -v -

2) compare zero performance

$ cat bench-memset-zerofill.out | \
  jq -r '
    .functions.memset."bench-variant"="zerofill-zero" |
    del(.functions.memset.results[] | select(.char2 != 0))
  ' | \
  plot_strings.py -l -p thru -v -

3) compare nonzero performance

$ cat bench-memset-zerofill.out | \
  jq -r '
    .functions.memset."bench-variant"="zerofill-nonzero" |
    del(.functions.memset.results[] | select(.char2 == 0))
  ' | \
  plot_strings.py -l -p thru -v -

Reviewed-by: Lucas A. M. Magalhaes <lamm@linux.ibm.com>
Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
---
 benchtests/Makefile                |   2 +-
 benchtests/bench-memset-zerofill.c | 140 +++++++++++++++++++++++++++++
 2 files changed, 141 insertions(+), 1 deletion(-)
 create mode 100644 benchtests/bench-memset-zerofill.c

diff --git a/benchtests/Makefile b/benchtests/Makefile
index 1530939a8ce8..21b95c736190 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -53,7 +53,7 @@ string-benchset := memccpy memchr memcmp memcpy memmem memmove \
 		   strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
 		   strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
 		   strcoll memcpy-large memcpy-random memmove-large memset-large \
-		   memcpy-walk memset-walk memmove-walk
+		   memcpy-walk memset-walk memmove-walk memset-zerofill
 
 # Build and run locale-dependent benchmarks only if we're building natively.
 ifeq (no,$(cross-compiling))
diff --git a/benchtests/bench-memset-zerofill.c b/benchtests/bench-memset-zerofill.c
new file mode 100644
index 000000000000..0e6958ab59dd
--- /dev/null
+++ b/benchtests/bench-memset-zerofill.c
@@ -0,0 +1,140 @@
+/* Measure memset functions with zero fill data.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#define TEST_NAME "memset"
+#define START_SIZE 256
+#define MIN_PAGE_SIZE (getpagesize () + 64 * 1024 * 1024)
+#define TIMEOUT (20 * 60)
+#include "bench-string.h"
+
+#include "json-lib.h"
+
+void *generic_memset (void *, int, size_t);
+typedef void *(*proto_t) (void *, int, size_t);
+
+IMPL (MEMSET, 1)
+IMPL (generic_memset, 0)
+
+static void
+__attribute__((noinline, noclone))
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
+	     int c1 __attribute ((unused)), int c2 __attribute ((unused)),
+	     size_t n)
+{
+  size_t i, iters = 32;
+  timing_t start, stop, cur, latency = 0;
+
+  CALL (impl, s, c2, n); // warm up
+
+  for (i = 0; i < iters; i++)
+    {
+      memset (s, c1, n); // alternation
+
+      TIMING_NOW (start);
+
+      CALL (impl, s, c2, n);
+
+      TIMING_NOW (stop);
+      TIMING_DIFF (cur, start, stop);
+      TIMING_ACCUM (latency, cur);
+    }
+
+  json_element_double (json_ctx, (double) latency / (double) iters);
+}
+
+static void
+do_test (json_ctx_t *json_ctx, size_t align, int c1, int c2, size_t len)
+{
+  align &= getpagesize () - 1;
+  if ((align + len) * sizeof (CHAR) > page_size)
+    return;
+
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "length", len);
+  json_attr_uint (json_ctx, "alignment", align);
+  json_attr_int (json_ctx, "char1", c1);
+  json_attr_int (json_ctx, "char2", c2);
+  json_array_begin (json_ctx, "timings");
+
+  FOR_EACH_IMPL (impl, 0)
+    {
+      do_one_test (json_ctx, impl, (CHAR *) (buf1) + align, c1, c2, len);
+      alloc_bufs ();
+    }
+
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
+}
+
+int
+test_main (void)
+{
+  json_ctx_t json_ctx;
+  size_t i;
+  int c1, c2;
+
+  test_init ();
+
+  json_init (&json_ctx, 0, stdout);
+
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_attr_string (&json_ctx, "bench-variant", "zerofill");
+
+  json_array_begin (&json_ctx, "ifuncs");
+  FOR_EACH_IMPL (impl, 0)
+    json_element_string (&json_ctx, impl->name);
+  json_array_end (&json_ctx);
+
+  json_array_begin (&json_ctx, "results");
+
+  // To analyze zero fill performance by comparing among the following 4
+  // patterns from 256B to 64MB(RAM) through L1, L2 and L3 caches.
+  // - zero-over-zero: c1=0, c2=0
+  // - zero-over-one:  c1=0, c2=1
+  // - one-over-zero:  c1=1, c2=0
+  // - one-over-one:   c1=1, c2=1
+  for (c1 = 0; c1 < 2; c1++)
+    for (c2 = 0; c2 < 2; c2++)
+      for (i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
+	{
+	  do_test (&json_ctx, 0, c1, c2, i);
+	  do_test (&json_ctx, 3, c1, c2, i);
+	}
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
+  return ret;
+}
+
+#include <support/test-driver.c>
+
+#define libc_hidden_builtin_def(X)
+#define libc_hidden_def(X)
+#define libc_hidden_weak(X)
+#define weak_alias(X,Y)
+#undef MEMSET
+#define MEMSET generic_memset
+#include <string/memset.c>
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 83+ messages in thread

* RE: [PATCH v3 2/5] benchtests: Add memset zero fill benchtest
  2021-09-13 14:05           ` Lucas A. M. Magalhaes via Libc-alpha
  2021-09-14  0:38             ` [PATCH v4] " Naohiro Tamura via Libc-alpha
@ 2021-09-14  0:44             ` naohirot--- via Libc-alpha
  2021-09-14 14:02               ` Wilco Dijkstra via Libc-alpha
  1 sibling, 1 reply; 83+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-09-14  0:44 UTC (permalink / raw)
  To: 'Lucas A. M. Magalhaes', Noah Goldstein, Wilco Dijkstra,
	libc-alpha@sourceware.org

Hi Lucas,

> From: Lucas A. M. Magalhaes <lamm@linux.ibm.com>
> Sent: Monday, September 13, 2021 11:05 PM
>
> Thanks for clarifying, now I got it. Please can you add a comment on the
> code explaining this patterns and the reason behind them?
> 
> With that said this patch LGTM.

Thank you for the review!
I just submitted V4 patch by adding the comment.
Please find it [1] and merge if it's OK.

Changes from V3:

> Reviewed-by: Lucas A. M. Magalhaes <lamm@linux.ibm.com>
> Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
> Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>

> +  // To analyze zero fill performance by comparing among the following 4
> +  // patterns from 256B to 64MB(RAM) through L1, L2 and L3 caches.
> +  // - zero-over-zero: c1=0, c2=0
> +  // - zero-over-one:  c1=0, c2=1
> +  // - one-over-zero:  c1=1, c2=0
> +  // - one-over-one:   c1=1, c2=1

[1] https://sourceware.org/pipermail/libc-alpha/2021-September/130946.html

Thanks.
Naohiro


^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v3 2/5] benchtests: Add memset zero fill benchtest
  2021-09-14  0:44             ` [PATCH v3 2/5] " naohirot--- via Libc-alpha
@ 2021-09-14 14:02               ` Wilco Dijkstra via Libc-alpha
  2021-09-15  8:24                 ` naohirot--- via Libc-alpha
  0 siblings, 1 reply; 83+ messages in thread
From: Wilco Dijkstra via Libc-alpha @ 2021-09-14 14:02 UTC (permalink / raw)
  To: naohirot@fujitsu.com, 'Lucas A. M. Magalhaes',
	Noah Goldstein, libc-alpha@sourceware.org

Hi Naohiro,

I had a quick go at running the new benchmark. The main problem is that it doesn't
give repeatable results - there are huge variations from run to run of about 50% for
the smaller sizes. This is a fundamental problem due to the timing loop, and the only
way to reduce it is to increase the time taken by memset, ie. start at a much larger
size (say at 16KB).

It also takes a long time to run - generally it's best to ensure a benchmark takes less
than 10 seconds on a typical modern system (remember there will be many that are
slower!). It should be feasible to reduce the iteration count for large sizes, but you
could go up to 16MB rather than 64MB.

Cheers,
Wilco

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v4] benchtests: Fix validate_benchout.py exceptions
  2021-09-13 13:44           ` [PATCH v4] " Naohiro Tamura via Libc-alpha
@ 2021-09-15  3:23             ` Siddhesh Poyarekar
  2021-09-16  1:12               ` naohirot--- via Libc-alpha
  0 siblings, 1 reply; 83+ messages in thread
From: Siddhesh Poyarekar @ 2021-09-15  3:23 UTC (permalink / raw)
  To: Naohiro Tamura, libc-alpha

On 9/13/21 7:14 PM, Naohiro Tamura wrote:
> This patch fixed validate_benchout.py two exceptions, AttributeError
> if benchout_strings.schema.json is specified and
> json.decoder.JSONDecodeError if benchout is not JSON.
> 
> AttributeError unconditionally occurs with a correct JSON benchout
> file such as below because the code
> "bench['functions'][func][k].keys()" is either  "bench-variant",
> "ifunc", or "results" that doesn't have keys()."
> 
> $ ~/glibc/benchtests/scripts/validate_benchout.py bench-memcpy.out \
>    ~/glibc/benchtests/scripts/benchout_strings.schema.json
> Traceback (most recent call last):
>    File "/home/naohirot/work/github/glibc/benchtests/scripts/validate_benchout.py", line 86, in <module>
>      sys.exit(main(sys.argv[1:]))
>    File "/home/naohirot/work/github/glibc/benchtests/scripts/validate_benchout.py", line 69, in main
>      bench.parse_bench(args[0], args[1])
>    File "/home/naohirot/work/github/glibc/benchtests/scripts/import_bench.py", line 139, in parse_bench
>      do_for_all_timings(bench, lambda b, f, v:
>    File "/home/naohirot/work/github/glibc/benchtests/scripts/import_bench.py", line 107, in do_for_all_timings
>      if 'timings' not in bench['functions'][func][k].keys():
> AttributeError: 'str' object has no attribute 'keys'
> 
> $ cat bench-memcpy.out
>    1 {
>    2  "timing_type": "hp_timing",
>    3  "functions": {
>    4   "memcpy": {
>    5    "bench-variant": "default",
>    6    "ifuncs": ["generic_memcpy", "__memcpy_thunderx", "__memcpy_thunderx2", "__memcpy_falkor", "__memcpy_simd", "__memcpy_a64fx", "__memcpy_generic"],
>    7    "results": [
>    8     {
>    9      "length": 1,
>   10      "align1": 0,
>   11      "align2": 0,
>   12      "dst > src": 0,
>   13      "timings": [10.9326, 11.0449, 11.5515, 13.5693, 11.5198, 6.77368, 11.5259]
>   14     },
>   ...
> ---
>   benchtests/scripts/import_bench.py      | 17 +++++++++++------
>   benchtests/scripts/validate_benchout.py |  6 +++++-
>   2 files changed, 16 insertions(+), 7 deletions(-)
> 
> diff --git a/benchtests/scripts/import_bench.py b/benchtests/scripts/import_bench.py
> index a799b4e1b7dc..f5e67570d4c5 100644
> --- a/benchtests/scripts/import_bench.py
> +++ b/benchtests/scripts/import_bench.py
> @@ -101,13 +101,18 @@ def do_for_all_timings(bench, callback):
>       Args:
>           bench: The benchmark object
>           callback: The callback function
> +    Raises:
> +        validator.exceptions.ValidationError: if 'timings' key not found
>       """
>       for func in bench['functions'].keys():
>           for k in bench['functions'][func].keys():
> -            if 'timings' not in bench['functions'][func][k].keys():
> -                continue
> -
> -            callback(bench, func, k)
> +            if k == 'results':
> +                for r in range(len(bench['functions'][func][k])):
> +                    if 'timings' not in bench['functions'][func][k][r].keys():
> +                        raise validator.exceptions.ValidationError(
> +                            "'timings' key not found")
> +                    else:
> +                        callback(bench, func, k, r)
>   
>   
>   def compress_timings(points):
> @@ -136,6 +141,6 @@ def parse_bench(filename, schema_filename):
>           with open(filename, 'r') as benchfile:
>               bench = json.load(benchfile)
>               validator.validate(bench, schema)
> -            do_for_all_timings(bench, lambda b, f, v:
> -                    b['functions'][f][v]['timings'].sort())
> +            do_for_all_timings(bench, lambda b, f, v, r:
> +                    b['functions'][f][v][r]['timings'].sort())
>               return bench

This will break the original use case, i.e. bench.out, since it doesn't 
have the 'results' nesting timings.  The sorting doesn't seem necessary 
for validation, it's only necessary for compare_bench.py.  You could 
move the do_for_all_timings call into compare_bench.py, which is 
specific to bench.out so that you don't have to modify do_for_all_timings.

Siddhesh

^ permalink raw reply	[flat|nested] 83+ messages in thread

* RE: [PATCH v3 2/5] benchtests: Add memset zero fill benchtest
  2021-09-14 14:02               ` Wilco Dijkstra via Libc-alpha
@ 2021-09-15  8:24                 ` naohirot--- via Libc-alpha
  2021-09-21  1:27                   ` naohirot--- via Libc-alpha
  0 siblings, 1 reply; 83+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-09-15  8:24 UTC (permalink / raw)
  To: Wilco Dijkstra, 'Lucas A. M. Magalhaes', Noah Goldstein,
	libc-alpha@sourceware.org

Hi Wilco,

Thank you for the comment.
I understood your concerns about the start size and the end size.

> From: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
> Sent: Tuesday, September 14, 2021 11:03 PM
>
> I had a quick go at running the new benchmark. The main problem is that it doesn't
> give repeatable results - there are huge variations from run to run of about 50% for
> the smaller sizes. This is a fundamental problem due to the timing loop, and the only
> way to reduce it is to increase the time taken by memset, ie. start at a much larger
> size (say at 16KB).

In terms of the start size, 256B is chosen because __memset_generic
(sysdeps/aarch64/memset.S) calls DC ZVA for zero fill from 256B, which
code you committed [1].
And I reported an interesting insight in the mail [2] that DC ZVA is
slower than store instruction from 256B to 16KB on A64FX [3].
So it seems valuable to measure the range from 256B to 16KB to see
each CPU's behavior.
What do you think?

[1] https://sourceware.org/git/?p=glibc.git&h=a8c5a2a9521e105da6e96eaf4029b8e4d595e4f5
[2] https://sourceware.org/pipermail/libc-alpha/2021-August/129805.html
[3] https://drive.google.com/file/d/1fonjDDlF4LPLfZY9-z22DGn-yaSpGN4g/view

> It also takes a long time to run - generally it's best to ensure a benchmark takes less
> than 10 seconds on a typical modern system (remember there will be many that are
> slower!). It should be feasible to reduce the iteration count for large sizes, but you
> could go up to 16MB rather than 64MB.

OK, I'll change the end size to 16MB.

Thanks.
Naohiro

^ permalink raw reply	[flat|nested] 83+ messages in thread

* RE: [PATCH v4] benchtests: Fix validate_benchout.py exceptions
  2021-09-15  3:23             ` Siddhesh Poyarekar
@ 2021-09-16  1:12               ` naohirot--- via Libc-alpha
  2021-09-16  1:41                 ` Siddhesh Poyarekar
  0 siblings, 1 reply; 83+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-09-16  1:12 UTC (permalink / raw)
  To: 'Siddhesh Poyarekar', libc-alpha@sourceware.org

Hi Siddhesh,

Thank you for the comment. 

> From: Siddhesh Poyarekar <siddhesh@gotplt.org>
> Sent: Wednesday, September 15, 2021 12:23 PM
> 
> This will break the original use case, i.e. bench.out, since it doesn't
> have the 'results' nesting timings.  The sorting doesn't seem necessary
> for validation, it's only necessary for compare_bench.py.  You could
> move the do_for_all_timings call into compare_bench.py, which is
> specific to bench.out so that you don't have to modify do_for_all_timings.

If do_for_all_timings is moved to compare_bench.py, parse_bench also has
to be moved to compare_bench.py and compare_strings.py.

So I propose simply catching AttributeError and returning, as shown below,
because that way compare_strings.py is not affected.
Can we agree?

--- a/benchtests/scripts/import_bench.py
+++ b/benchtests/scripts/import_bench.py
@@ -104,10 +104,15 @@ def do_for_all_timings(bench, callback):
     """
     for func in bench['functions'].keys():
         for k in bench['functions'][func].keys():
-            if 'timings' not in bench['functions'][func][k].keys():
-                continue
-
-            callback(bench, func, k)
+            try:
+                if 'timings' not in bench['functions'][func][k].keys():
+                    continue
+
+                callback(bench, func, k)
+            # in case that <bench.out file> conforms <bench.out schema>
+            # benchout_strings.schema.json
+            except AttributeError:
+                return

Thanks.
Naohiro

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v4] benchtests: Fix validate_benchout.py exceptions
  2021-09-16  1:12               ` naohirot--- via Libc-alpha
@ 2021-09-16  1:41                 ` Siddhesh Poyarekar
  2021-09-16  2:23                   ` [PATCH v5] " Naohiro Tamura via Libc-alpha
  2021-09-16  2:26                   ` [PATCH v4] " naohirot--- via Libc-alpha
  0 siblings, 2 replies; 83+ messages in thread
From: Siddhesh Poyarekar @ 2021-09-16  1:41 UTC (permalink / raw)
  To: naohirot@fujitsu.com, libc-alpha@sourceware.org

On 9/16/21 6:42 AM, naohirot@fujitsu.com wrote:
> Hi Siddhesh,
> 
> Thank you for the comment.
> 
>> From: Siddhesh Poyarekar <siddhesh@gotplt.org>
>> Sent: Wednesday, September 15, 2021 12:23 PM
>>
>> This will break the original use case, i.e. bench.out, since it doesn't
>> have the 'results' nesting timings.  The sorting doesn't seem necessary
>> for validation, it's only necessary for compare_bench.py.  You could
>> move the do_for_all_timings call into compare_bench.py, which is
>> specific to bench.out so that you don't have to modify do_for_all_timings.
> 
> If do_for_all_timings is moved to compare_bench.py, parse_bench also has
> to be moved to compare_bench.py and compare_strings.py.
> 

Ahh no, I didn't suggest moving the implementation, only the call, like  
below.  This way import_bench.parse_bench does not have to know about  
the specific json format.

diff --git a/benchtests/scripts/compare_bench.py  
b/benchtests/scripts/compare_bench.py
index 6fcbd08038..fa7481c76e 100755
--- a/benchtests/scripts/compare_bench.py
+++ b/benchtests/scripts/compare_bench.py
@@ -163,7 +163,11 @@ def plot_graphs(bench1, bench2):

  def main(bench1, bench2, schema, threshold, stats):
      bench1 = bench.parse_bench(bench1, schema)
+    bench.do_for_all_timings(bench1, lambda b, f, v:
+            b['functions'][f][v]['timings'].sort())
      bench2 = bench.parse_bench(bench2, schema)
+    bench.do_for_all_timings(bench2, lambda b, f, v:
+            b['functions'][f][v]['timings'].sort())

      plot_graphs(bench1, bench2)

diff --git a/benchtests/scripts/import_bench.py  
b/benchtests/scripts/import_bench.py
index a799b4e1b7..362c990fe1 100644
--- a/benchtests/scripts/import_bench.py
+++ b/benchtests/scripts/import_bench.py
@@ -136,6 +136,4 @@ def parse_bench(filename, schema_filename):
          with open(filename, 'r') as benchfile:
              bench = json.load(benchfile)
              validator.validate(bench, schema)
-            do_for_all_timings(bench, lambda b, f, v:
-                    b['functions'][f][v]['timings'].sort())
              return bench

^ permalink raw reply related	[flat|nested] 83+ messages in thread

* [PATCH v5] benchtests: Fix validate_benchout.py exceptions
  2021-09-16  1:41                 ` Siddhesh Poyarekar
@ 2021-09-16  2:23                   ` Naohiro Tamura via Libc-alpha
  2021-09-16  3:48                     ` Siddhesh Poyarekar
  2021-09-16  2:26                   ` [PATCH v4] " naohirot--- via Libc-alpha
  1 sibling, 1 reply; 83+ messages in thread
From: Naohiro Tamura via Libc-alpha @ 2021-09-16  2:23 UTC (permalink / raw)
  To: Siddhesh Poyarekar, libc-alpha

This patch fixes two exceptions in validate_benchout.py:
1) AttributeError
   if benchout_strings.schema.json is specified, and
2) json.decoder.JSONDecodeError
   if benchout file is not JSON.

$ ~/glibc/benchtests/scripts/validate_benchout.py bench-memset.out \
~/glibc/benchtests/scripts/benchout_strings.schema.json
Traceback (most recent call last):
  File "/home/naohirot/glibc/benchtests/scripts/validate_benchout.py", line 86, in <module>
    sys.exit(main(sys.argv[1:]))
  File "/home/naohirot/glibc/benchtests/scripts/validate_benchout.py", line 69, in main
    bench.parse_bench(args[0], args[1])
  File "/home/naohirot/glibc/benchtests/scripts/import_bench.py", line 139, in parse_bench
    do_for_all_timings(bench, lambda b, f, v:
  File "/home/naohirot/glibc/benchtests/scripts/import_bench.py", line 107, in do_for_all_timings
    if 'timings' not in bench['functions'][func][k].keys():
AttributeError: 'str' object has no attribute 'keys'

$ ~/glibc/benchtests/scripts/validate_benchout.py bench-math-inlines.out \
~/glibc/benchtests/scripts/benchout_strings.schema.json
Traceback (most recent call last):
  File "/home/naohirot/glibc/benchtests/scripts/validate_benchout.py", line 86, in <module>
    sys.exit(main(sys.argv[1:]))
  File "/home/naohirot/glibc/benchtests/scripts/validate_benchout.py", line 69, in main
    bench.parse_bench(args[0], args[1])
  File "/home/naohirot/glibc/benchtests/scripts/import_bench.py", line 137, in parse_bench
    bench = json.load(benchfile)
  File "/usr/lib/python3.6/json/__init__.py", line 299, in load
    parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw)
  File "/usr/lib/python3.6/json/__init__.py", line 354, in loads
    return _default_decoder.decode(s)
  File "/usr/lib/python3.6/json/decoder.py", line 342, in decode
    raise JSONDecodeError("Extra data", s, end)
json.decoder.JSONDecodeError: Extra data: line 1 column 17 (char 16)

Reviewed-by: Siddhesh Poyarekar <siddhesh@gotplt.org>
---
 benchtests/scripts/compare_bench.py     | 4 ++++
 benchtests/scripts/import_bench.py      | 2 --
 benchtests/scripts/validate_benchout.py | 6 +++++-
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/benchtests/scripts/compare_bench.py b/benchtests/scripts/compare_bench.py
index 6fcbd0803808..1a497f8bb61b 100755
--- a/benchtests/scripts/compare_bench.py
+++ b/benchtests/scripts/compare_bench.py
@@ -163,7 +163,11 @@ def plot_graphs(bench1, bench2):
 
 def main(bench1, bench2, schema, threshold, stats):
     bench1 = bench.parse_bench(bench1, schema)
+    bench.do_for_all_timings(bench1, lambda b, f, v:
+        b['functions'][f][v]['timings'].sort())
     bench2 = bench.parse_bench(bench2, schema)
+    bench.do_for_all_timings(bench2, lambda b, f, v:
+        b['functions'][f][v]['timings'].sort())
 
     plot_graphs(bench1, bench2)
 
diff --git a/benchtests/scripts/import_bench.py b/benchtests/scripts/import_bench.py
index a799b4e1b7dc..362c990fe153 100644
--- a/benchtests/scripts/import_bench.py
+++ b/benchtests/scripts/import_bench.py
@@ -136,6 +136,4 @@ def parse_bench(filename, schema_filename):
         with open(filename, 'r') as benchfile:
             bench = json.load(benchfile)
             validator.validate(bench, schema)
-            do_for_all_timings(bench, lambda b, f, v:
-                    b['functions'][f][v]['timings'].sort())
             return bench
diff --git a/benchtests/scripts/validate_benchout.py b/benchtests/scripts/validate_benchout.py
index 47df33ed0252..00d5fa0ee5eb 100755
--- a/benchtests/scripts/validate_benchout.py
+++ b/benchtests/scripts/validate_benchout.py
@@ -73,11 +73,15 @@ def main(args):
 
     except bench.validator.ValidationError as e:
         return print_and_exit("Invalid benchmark output: %s" % e.message,
-            os.EX_DATAERR)
+                os.EX_DATAERR)
 
     except bench.validator.SchemaError as e:
         return print_and_exit("Invalid schema: %s" % e.message, os.EX_DATAERR)
 
+    except json.decoder.JSONDecodeError as e:
+        return print_and_exit("Benchmark output in %s is not JSON." % args[0],
+                os.EX_DATAERR)
+
     print("Benchmark output in %s is valid." % args[0])
     return os.EX_OK
 
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 83+ messages in thread

* RE: [PATCH v4] benchtests: Fix validate_benchout.py exceptions
  2021-09-16  1:41                 ` Siddhesh Poyarekar
  2021-09-16  2:23                   ` [PATCH v5] " Naohiro Tamura via Libc-alpha
@ 2021-09-16  2:26                   ` naohirot--- via Libc-alpha
  1 sibling, 0 replies; 83+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-09-16  2:26 UTC (permalink / raw)
  To: 'Siddhesh Poyarekar', libc-alpha@sourceware.org

Hi Siddhesh,

> Ahh no, I didn't suggest moving the implementation, only the call, like
> below.  This way import_bench.parse_bench does not have to know about
> the specific json format.

Sorry about that 😊, now I got it.
Please find V5 [1] and check it again.

[1] https://sourceware.org/pipermail/libc-alpha/2021-September/131058.html

Thanks.
Naohiro


^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v5] benchtests: Fix validate_benchout.py exceptions
  2021-09-16  2:23                   ` [PATCH v5] " Naohiro Tamura via Libc-alpha
@ 2021-09-16  3:48                     ` Siddhesh Poyarekar
  2021-09-16  5:23                       ` naohirot--- via Libc-alpha
  0 siblings, 1 reply; 83+ messages in thread
From: Siddhesh Poyarekar @ 2021-09-16  3:48 UTC (permalink / raw)
  To: Naohiro Tamura, libc-alpha

On 9/16/21 7:53 AM, Naohiro Tamura wrote:
> This patch fixed validate_benchout.py two exceptions,
> 1) AttributeError
>     if benchout_strings.schema.json is specified, and
> 2) json.decoder.JSONDecodeError
>     if benchout file is not JSON.
> 
> $ ~/glibc/benchtests/scripts/validate_benchout.py bench-memset.out \
> ~/glibc/benchtests/scripts/benchout_strings.schema.json
> Traceback (most recent call last):
>    File "/home/naohirot/glibc/benchtests/scripts/validate_benchout.py", line 86, in <module>
>      sys.exit(main(sys.argv[1:]))
>    File "/home/naohirot/glibc/benchtests/scripts/validate_benchout.py", line 69, in main
>      bench.parse_bench(args[0], args[1])
>    File "/home/naohirot/glibc/benchtests/scripts/import_bench.py", line 139, in parse_bench
>      do_for_all_timings(bench, lambda b, f, v:
>    File "/home/naohirot/glibc/benchtests/scripts/import_bench.py", line 107, in do_for_all_timings
>      if 'timings' not in bench['functions'][func][k].keys():
> AttributeError: 'str' object has no attribute 'keys'
> 
> $ ~/glibc/benchtests/scripts/validate_benchout.py bench-math-inlines.out \
> ~/glibc/benchtests/scripts/benchout_strings.schema.json
> Traceback (most recent call last):
>    File "/home/naohirot/glibc/benchtests/scripts/validate_benchout.py", line 86, in <module>
>      sys.exit(main(sys.argv[1:]))
>    File "/home/naohirot/glibc/benchtests/scripts/validate_benchout.py", line 69, in main
>      bench.parse_bench(args[0], args[1])
>    File "/home/naohirot/glibc/benchtests/scripts/import_bench.py", line 137, in parse_bench
>      bench = json.load(benchfile)
>    File "/usr/lib/python3.6/json/__init__.py", line 299, in load
>      parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw)
>    File "/usr/lib/python3.6/json/__init__.py", line 354, in loads
>      return _default_decoder.decode(s)
>    File "/usr/lib/python3.6/json/decoder.py", line 342, in decode
>      raise JSONDecodeError("Extra data", s, end)
> json.decoder.JSONDecodeError: Extra data: line 1 column 17 (char 16)
> 
> Reviewed-by: Siddhesh Poyarekar <siddhesh@gotplt.org>

Please don't add Reviewed-by until the reviewer specifically adds it 
themselves.  It is an indicator on patchwork that the patch is ready for 
inclusion.

That said, LGTM, I'll push this :)

Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>

> ---
>   benchtests/scripts/compare_bench.py     | 4 ++++
>   benchtests/scripts/import_bench.py      | 2 --
>   benchtests/scripts/validate_benchout.py | 6 +++++-
>   3 files changed, 9 insertions(+), 3 deletions(-)
> 
> diff --git a/benchtests/scripts/compare_bench.py b/benchtests/scripts/compare_bench.py
> index 6fcbd0803808..1a497f8bb61b 100755
> --- a/benchtests/scripts/compare_bench.py
> +++ b/benchtests/scripts/compare_bench.py
> @@ -163,7 +163,11 @@ def plot_graphs(bench1, bench2):
>   
>   def main(bench1, bench2, schema, threshold, stats):
>       bench1 = bench.parse_bench(bench1, schema)
> +    bench.do_for_all_timings(bench1, lambda b, f, v:
> +        b['functions'][f][v]['timings'].sort())
>       bench2 = bench.parse_bench(bench2, schema)
> +    bench.do_for_all_timings(bench2, lambda b, f, v:
> +        b['functions'][f][v]['timings'].sort())
>   
>       plot_graphs(bench1, bench2)
>   
> diff --git a/benchtests/scripts/import_bench.py b/benchtests/scripts/import_bench.py
> index a799b4e1b7dc..362c990fe153 100644
> --- a/benchtests/scripts/import_bench.py
> +++ b/benchtests/scripts/import_bench.py
> @@ -136,6 +136,4 @@ def parse_bench(filename, schema_filename):
>           with open(filename, 'r') as benchfile:
>               bench = json.load(benchfile)
>               validator.validate(bench, schema)
> -            do_for_all_timings(bench, lambda b, f, v:
> -                    b['functions'][f][v]['timings'].sort())
>               return bench
> diff --git a/benchtests/scripts/validate_benchout.py b/benchtests/scripts/validate_benchout.py
> index 47df33ed0252..00d5fa0ee5eb 100755
> --- a/benchtests/scripts/validate_benchout.py
> +++ b/benchtests/scripts/validate_benchout.py
> @@ -73,11 +73,15 @@ def main(args):
>   
>       except bench.validator.ValidationError as e:
>           return print_and_exit("Invalid benchmark output: %s" % e.message,
> -            os.EX_DATAERR)
> +                os.EX_DATAERR)
>   
>       except bench.validator.SchemaError as e:
>           return print_and_exit("Invalid schema: %s" % e.message, os.EX_DATAERR)
>   
> +    except json.decoder.JSONDecodeError as e:
> +        return print_and_exit("Benchmark output in %s is not JSON." % args[0],
> +                os.EX_DATAERR)
> +
>       print("Benchmark output in %s is valid." % args[0])
>       return os.EX_OK
>   
> 


^ permalink raw reply	[flat|nested] 83+ messages in thread

* RE: [PATCH v5] benchtests: Fix validate_benchout.py exceptions
  2021-09-16  3:48                     ` Siddhesh Poyarekar
@ 2021-09-16  5:23                       ` naohirot--- via Libc-alpha
  0 siblings, 0 replies; 83+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-09-16  5:23 UTC (permalink / raw)
  To: 'Siddhesh Poyarekar', libc-alpha@sourceware.org

Hi Siddhesh,

> From: Siddhesh Poyarekar <siddhesh@gotplt.org>
> Sent: Thursday, September 16, 2021 12:48 PM
>
> Please don't add Reviewed-by until the reviewer specifically adds it
> themselves.  It is an indicator on patchwork that the patch is ready for
> inclusion.

I see, OK.
 
> That said, LGTM, I'll push this :)

Thanks!
Naohiro


^ permalink raw reply	[flat|nested] 83+ messages in thread

* RE: [PATCH v3 2/5] benchtests: Add memset zero fill benchtest
  2021-09-15  8:24                 ` naohirot--- via Libc-alpha
@ 2021-09-21  1:27                   ` naohirot--- via Libc-alpha
  2021-09-21 11:09                     ` Wilco Dijkstra via Libc-alpha
  0 siblings, 1 reply; 83+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-09-21  1:27 UTC (permalink / raw)
  To: Wilco Dijkstra, 'Lucas A. M. Magalhaes', Noah Goldstein,
	libc-alpha@sourceware.org

Hi Wilco,

Let me ping you regarding the start size.

> -----Original Message-----
> From: Tamura, Naohiro/田村 直広 <naohirot@fujitsu.com>
> Sent: Wednesday, September 15, 2021 5:25 PM
> To: Wilco Dijkstra <Wilco.Dijkstra@arm.com>; 'Lucas A. M. Magalhaes' <lamm@linux.ibm.com>; Noah Goldstein
> <goldstein.w.n@gmail.com>; libc-alpha@sourceware.org
> Subject: RE: [PATCH v3 2/5] benchtests: Add memset zero fill benchtest
> 
> Hi Wilco,
> 
> Thank you for the comment.
> I understood your concerns about the start size and the end size.
> 
> > From: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
> > Sent: Tuesday, September 14, 2021 11:03 PM
> >
> > I had a quick go at running the new benchmark. The main problem is that it doesn't
> > give repeatable results - there are huge variations from run to run of about 50% for
> > the smaller sizes. This is a fundamental problem due to the timing loop, and the only
> > way to reduce it is to increase the time taken by memset, ie. start at a much larger
> > size (say at 16KB).
> 
> In terms of the start size, 256B is chosen because __memset_generic
> (sysdeps/aarch64/memset.S) calls DC ZVA for zero fill from 256B, which
> code you committed [1].
> And I reported an interesting insight in the mail [2] that DC ZVA is
> slower than store instruction from 256B to 16KB on A64FX [3].
> So it seems valuable to measure the range from 256B to 16KB to see
> each CPU's behavior.
> What do you think?
> 
> [1] https://sourceware.org/git/?p=glibc.git&h=a8c5a2a9521e105da6e96eaf4029b8e4d595e4f5
> [2] https://sourceware.org/pipermail/libc-alpha/2021-August/129805.html
> [3] https://drive.google.com/file/d/1fonjDDlF4LPLfZY9-z22DGn-yaSpGN4g/view
> 
> > It also takes a long time to run - generally it's best to ensure a benchmark takes less
> > than 10 seconds on a typical modern system (remember there will be many that are
> > slower!). It should be feasible to reduce the iteration count for large sizes, but you
> > could go up to 16MB rather than 64MB.
> 
> OK, I'll change the end size to 16MB.
> 
> Thanks.
> Naohiro

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v3 2/5] benchtests: Add memset zero fill benchtest
  2021-09-21  1:27                   ` naohirot--- via Libc-alpha
@ 2021-09-21 11:09                     ` Wilco Dijkstra via Libc-alpha
  2021-09-22  1:05                       ` [PATCH v5] " Naohiro Tamura via Libc-alpha
  2021-09-22  1:07                       ` [PATCH v3 2/5] " naohirot--- via Libc-alpha
  0 siblings, 2 replies; 83+ messages in thread
From: Wilco Dijkstra via Libc-alpha @ 2021-09-21 11:09 UTC (permalink / raw)
  To: naohirot@fujitsu.com, 'Lucas A. M. Magalhaes',
	Noah Goldstein, libc-alpha@sourceware.org

Hi Naohiro,

> In terms of the start size, 256B is chosen because __memset_generic
> (sysdeps/aarch64/memset.S) calls DC ZVA for zero fill from 256B, which
> code you committed [1].
> And I reported an interesting insight in the mail [2] that DC ZVA is
> slower than store instruction from 256B to 16KB on A64FX [3].
> So it seems valuable to measure the range from 256B to 16KB to see
> each CPU's behavior.
> What do you think?

As I've mentioned, this will never work using the current benchmark loop.
At size 256 your loop has only 1 timer tick... The only way to get any data
out is to increase the time taken per call. At 16K there are about 20 ticks so
it is still very inaccurate. By repeating the test thousands of times you can
get some signal out (eg. 20% is 20 ticks, 80% is 21 gives ~20.8 ticks on average),
but that's impossible for smaller sizes.

So if you want to measure small sizes, you need to use a more accurate timing
loop.

Cheers,
Wilco

^ permalink raw reply	[flat|nested] 83+ messages in thread

* [PATCH v5] benchtests: Add memset zero fill benchtest
  2021-09-21 11:09                     ` Wilco Dijkstra via Libc-alpha
@ 2021-09-22  1:05                       ` Naohiro Tamura via Libc-alpha
  2023-02-09 17:23                         ` Carlos O'Donell via Libc-alpha
  2021-09-22  1:07                       ` [PATCH v3 2/5] " naohirot--- via Libc-alpha
  1 sibling, 1 reply; 83+ messages in thread
From: Naohiro Tamura via Libc-alpha @ 2021-09-22  1:05 UTC (permalink / raw)
  To: Wilco Dijkstra, 'Lucas A . M . Magalhaes', Noah Goldstein,
	libc-alpha

Memset takes 0 as the second parameter in most cases.
However, we cannot measure the zero fill performance by
bench-memset.c, bench-memset-large.c and bench-memset-walk.c
precisely.
X86_64 micro-architecture has some zero-over-zero optimization, and
AArch64 micro-architecture also has some optimization for DC ZVA
instruction.
This patch provides bench-memset-zerofill.c which is suitable to
analyze the zero fill performance by comparing among 4 patterns,
zero-over-zero, zero-over-one, one-over-zero and one-over-one through
L1, L2 and L3 caches.

The following commands are examples to analyze a JSON output,
bench-memset-zerofill.out, by 'jq' and 'plot_strings.py'.

1) compare zero-over-zero performance

$ cat bench-memset-zerofill.out | \
  jq -r '
    .functions.memset."bench-variant"="zerofill-0o0" |
    del(.functions.memset.results[] | select(.char1 != 0 or .char2 != 0))
  ' | \
  plot_strings.py -l -p thru -v -

2) compare zero performance

$ cat bench-memset-zerofill.out | \
  jq -r '
    .functions.memset."bench-variant"="zerofill-zero" |
    del(.functions.memset.results[] | select(.char2 != 0))
  ' | \
  plot_strings.py -l -p thru -v -

3) compare nonzero performance

$ cat bench-memset-zerofill.out | \
  jq -r '
    .functions.memset."bench-variant"="zerofill-nonzero" |
    del(.functions.memset.results[] | select(.char2 == 0))
  ' | \
  plot_strings.py -l -p thru -v -

Reviewed-by: Lucas A. M. Magalhaes <lamm@linux.ibm.com>
Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
---
 benchtests/Makefile                |   2 +-
 benchtests/bench-memset-zerofill.c | 140 +++++++++++++++++++++++++++++
 2 files changed, 141 insertions(+), 1 deletion(-)
 create mode 100644 benchtests/bench-memset-zerofill.c

diff --git a/benchtests/Makefile b/benchtests/Makefile
index 1530939a8ce8..21b95c736190 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -53,7 +53,7 @@ string-benchset := memccpy memchr memcmp memcpy memmem memmove \
 		   strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
 		   strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
 		   strcoll memcpy-large memcpy-random memmove-large memset-large \
-		   memcpy-walk memset-walk memmove-walk
+		   memcpy-walk memset-walk memmove-walk memset-zerofill
 
 # Build and run locale-dependent benchmarks only if we're building natively.
 ifeq (no,$(cross-compiling))
diff --git a/benchtests/bench-memset-zerofill.c b/benchtests/bench-memset-zerofill.c
new file mode 100644
index 000000000000..2e146e157e25
--- /dev/null
+++ b/benchtests/bench-memset-zerofill.c
@@ -0,0 +1,140 @@
+/* Measure memset functions with zero fill data.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#define TEST_NAME "memset"
+#define START_SIZE (16 * 1024)
+#define MIN_PAGE_SIZE (getpagesize () + 16 * 1024 * 1024)
+#define TIMEOUT (20 * 60)
+#include "bench-string.h"
+
+#include "json-lib.h"
+
+void *generic_memset (void *, int, size_t);
+typedef void *(*proto_t) (void *, int, size_t);
+
+IMPL (MEMSET, 1)
+IMPL (generic_memset, 0)
+
+static void
+__attribute__((noinline, noclone))
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
+	     int c1 __attribute ((unused)), int c2 __attribute ((unused)),
+	     size_t n)
+{
+  size_t i, iters = 32;
+  timing_t start, stop, cur, latency = 0;
+
+  CALL (impl, s, c2, n); // warm up
+
+  for (i = 0; i < iters; i++)
+    {
+      memset (s, c1, n); // alternation
+
+      TIMING_NOW (start);
+
+      CALL (impl, s, c2, n);
+
+      TIMING_NOW (stop);
+      TIMING_DIFF (cur, start, stop);
+      TIMING_ACCUM (latency, cur);
+    }
+
+  json_element_double (json_ctx, (double) latency / (double) iters);
+}
+
+static void
+do_test (json_ctx_t *json_ctx, size_t align, int c1, int c2, size_t len)
+{
+  align &= getpagesize () - 1;
+  if ((align + len) * sizeof (CHAR) > page_size)
+    return;
+
+  json_element_object_begin (json_ctx);
+  json_attr_uint (json_ctx, "length", len);
+  json_attr_uint (json_ctx, "alignment", align);
+  json_attr_int (json_ctx, "char1", c1);
+  json_attr_int (json_ctx, "char2", c2);
+  json_array_begin (json_ctx, "timings");
+
+  FOR_EACH_IMPL (impl, 0)
+    {
+      do_one_test (json_ctx, impl, (CHAR *) (buf1) + align, c1, c2, len);
+      alloc_bufs ();
+    }
+
+  json_array_end (json_ctx);
+  json_element_object_end (json_ctx);
+}
+
+int
+test_main (void)
+{
+  json_ctx_t json_ctx;
+  size_t i;
+  int c1, c2;
+
+  test_init ();
+
+  json_init (&json_ctx, 0, stdout);
+
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, TEST_NAME);
+  json_attr_string (&json_ctx, "bench-variant", "zerofill");
+
+  json_array_begin (&json_ctx, "ifuncs");
+  FOR_EACH_IMPL (impl, 0)
+    json_element_string (&json_ctx, impl->name);
+  json_array_end (&json_ctx);
+
+  json_array_begin (&json_ctx, "results");
+
+  // To analyze zero fill performance by comparing among the following 4
+  // patterns through L1, L2 and L3 caches.
+  // - zero-over-zero: c1=0, c2=0
+  // - zero-over-one:  c1=0, c2=1
+  // - one-over-zero:  c1=1, c2=0
+  // - one-over-one:   c1=1, c2=1
+  for (c1 = 0; c1 < 2; c1++)
+    for (c2 = 0; c2 < 2; c2++)
+      for (i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
+	{
+	  do_test (&json_ctx, 0, c1, c2, i);
+	  do_test (&json_ctx, 3, c1, c2, i);
+	}
+
+  json_array_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+
+  return ret;
+}
+
+#include <support/test-driver.c>
+
+#define libc_hidden_builtin_def(X)
+#define libc_hidden_def(X)
+#define libc_hidden_weak(X)
+#define weak_alias(X,Y)
+#undef MEMSET
+#define MEMSET generic_memset
+#include <string/memset.c>
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 83+ messages in thread

* RE: [PATCH v3 2/5] benchtests: Add memset zero fill benchtest
  2021-09-21 11:09                     ` Wilco Dijkstra via Libc-alpha
  2021-09-22  1:05                       ` [PATCH v5] " Naohiro Tamura via Libc-alpha
@ 2021-09-22  1:07                       ` naohirot--- via Libc-alpha
  2021-09-28  1:40                         ` naohirot--- via Libc-alpha
  1 sibling, 1 reply; 83+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-09-22  1:07 UTC (permalink / raw)
  To: 'Wilco Dijkstra', 'Lucas A. M. Magalhaes',
	Noah Goldstein, libc-alpha@sourceware.org

Hi Wilco,

> From: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
> Sent: Tuesday, September 21, 2021 8:09 PM
>
> > What do you think?
> 
> As I've mentioned, this will never work using the current benchmark loop.
> At size 256 your loop has only 1 timer tick... The only way to get any data
> out is to increase the time taken per call. At 16K there are about 20 ticks so
> it is still very inaccurate. By repeating the test thousands of times you can
> some signal out (eg. 20% is 20 ticks, 80% is 21 gives ~20.8 ticks on average),
> but that's impossible for smaller sizes.
> 
> So if you want to measure small sizes, you need to use a more accurate timing
> loop.

Thank you for the comment.
OK, I understood. So I updated the start size to 16KB too to commit first.
Please find V5 [1] and merge it if it's OK.
Changes from V4:
- Start size to 16KB from 256B
- End size to 16MB from 64MB

[1] https://sourceware.org/pipermail/libc-alpha/2021-September/131245.html

Thanks.
Naohiro


^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v3 2/5] benchtests: Add memset zero fill benchtest
  2021-09-22  1:07                       ` [PATCH v3 2/5] " naohirot--- via Libc-alpha
@ 2021-09-28  1:40                         ` naohirot--- via Libc-alpha
  2021-09-30  0:55                           ` Tamura, Naohiro/田村 直広 via Libc-alpha
  2021-10-18 12:57                           ` Lucas A. M. Magalhaes via Libc-alpha
  0 siblings, 2 replies; 83+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-09-28  1:40 UTC (permalink / raw)
  To: 'Wilco Dijkstra', 'Lucas A. M. Magalhaes',
	Noah Goldstein, libc-alpha@sourceware.org, naohirot@fujitsu.com

Hi Wilco,

Let me ping you again if V5 [1] is OK or not.
[1]https://sourceware.org/pipermail/libc-alpha/2021-September/131245.html
________________________________________
From: Tamura, Naohiro/田村 直広 <naohirot@fujitsu.com>
Sent: Wednesday, 22 September 2021 10:07
To: 'Wilco Dijkstra'; 'Lucas A. M. Magalhaes'; Noah Goldstein; libc-alpha@sourceware.org
Subject: RE: [PATCH v3 2/5] benchtests: Add memset zero fill benchtest

Hi Wilco,

> From: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
> Sent: Tuesday, September 21, 2021 8:09 PM
>
> > What do you think?
>
> As I've mentioned, this will never work using the current benchmark loop.
> At size 256 your loop has only 1 timer tick... The only way to get any data
> out is to increase the time taken per call. At 16K there are about 20 ticks so
> it is still very inaccurate. By repeating the test thousands of times you can
> some signal out (eg. 20% is 20 ticks, 80% is 21 gives ~20.8 ticks on average),
> but that's impossible for smaller sizes.
>
> So if you want to measure small sizes, you need to use a more accurate timing
> loop.

Thank you for the comment.
OK, I understood. So I updated the start size to 16KB too to commit first.
Please find V5 [1] and merge it if it's OK.
Changes from V4:
- Start size to 16KB from 256B
- End size to 16MB from 64MB

[1] https://sourceware.org/pipermail/libc-alpha/2021-September/131245.html

Thanks.
Naohiro


^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v3 2/5] benchtests: Add memset zero fill benchtest
  2021-09-28  1:40                         ` naohirot--- via Libc-alpha
@ 2021-09-30  0:55                           ` Tamura, Naohiro/田村 直広 via Libc-alpha
  2021-10-18 12:57                           ` Lucas A. M. Magalhaes via Libc-alpha
  1 sibling, 0 replies; 83+ messages in thread
From: Tamura, Naohiro/田村 直広 via Libc-alpha @ 2021-09-30  0:55 UTC (permalink / raw)
  To: 'Lucas A. M. Magalhaes'
  Cc: libc-alpha@sourceware.org, 'Wilco Dijkstra'

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="gb2312", Size: 2184 bytes --]

Hi Lucas,
cc: Wilco, Noah,

Could you merge the V5 patch [1] for me, if we can consider that we have reached "consensus" given that there is no sustained opposition from Wilco?
The V5 patch is basically the same as V4, to which you gave "LGTM".

Changes from V4:
- Start size to 16KB from 256B
- End size to 16MB from 64MB

And I'm open to hear any opinion from Wilco even if V5 is merged.

[1]https://sourceware.org/pipermail/libc-alpha/2021-September/131245.html

Thanks.
Naohiro

________________________________________
From: Tamura, Naohiro/田村 直広 <naohirot@fujitsu.com>
Sent: Tuesday, 28 September 2021 10:40
To: 'Wilco Dijkstra'; 'Lucas A. M. Magalhaes'; Noah Goldstein; libc-alpha@sourceware.org; Tamura, Naohiro/田村 直広
Subject: Re: [PATCH v3 2/5] benchtests: Add memset zero fill benchtest

Hi Wilco,

Let me ping you again if V5 [1] is OK or not.
[1]https://sourceware.org/pipermail/libc-alpha/2021-September/131245.html
________________________________________
From: Tamura, Naohiro/田村 直広 <naohirot@fujitsu.com>
Sent: Wednesday, 22 September 2021 10:07
To: 'Wilco Dijkstra'; 'Lucas A. M. Magalhaes'; Noah Goldstein; libc-alpha@sourceware.org
Subject: RE: [PATCH v3 2/5] benchtests: Add memset zero fill benchtest

Hi Wilco,

> From: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
> Sent: Tuesday, September 21, 2021 8:09 PM
>
> > What do you think?
>
> As I've mentioned, this will never work using the current benchmark loop.
> At size 256 your loop has only 1 timer tick... The only way to get any data
> out is to increase the time taken per call. At 16K there are about 20 ticks so
> it is still very inaccurate. By repeating the test thousands of times you can
> some signal out (eg. 20% is 20 ticks, 80% is 21 gives ~20.8 ticks on average),
> but that's impossible for smaller sizes.
>
> So if you want to measure small sizes, you need to use a more accurate timing
> loop.

Thank you for the comment.
OK, I understood. So I updated the start size to 16KB too to commit first.
Please find V5 [1] and merge it if it's OK.
Changes from V4:
- Start size to 16KB from 256B
- End size to 16MB from 64MB

[1] https://sourceware.org/pipermail/libc-alpha/2021-September/131245.html

Thanks.
Naohiro

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v3 2/5] benchtests: Add memset zero fill benchtest
  2021-09-28  1:40                         ` naohirot--- via Libc-alpha
  2021-09-30  0:55                           ` Tamura, Naohiro/田村 直広 via Libc-alpha
@ 2021-10-18 12:57                           ` Lucas A. M. Magalhaes via Libc-alpha
  2021-10-20 13:44                             ` Wilco Dijkstra via Libc-alpha
  1 sibling, 1 reply; 83+ messages in thread
From: Lucas A. M. Magalhaes via Libc-alpha @ 2021-10-18 12:57 UTC (permalink / raw)
  To: libc-alpha, naohirot; +Cc: 'Wilco Dijkstra'

> > > What do you think?
> >
> > As I've mentioned, this will never work using the current benchmark loop.
> > At size 256 your loop has only 1 timer tick... The only way to get any data
> > out is to increase the time taken per call. At 16K there are about 20 ticks so
> > it is still very inaccurate. By repeating the test thousands of times you can
> > some signal out (eg. 20% is 20 ticks, 80% is 21 gives ~20.8 ticks on average),
> > but that's impossible for smaller sizes.
> >
> > So if you want to measure small sizes, you need to use a more accurate timing
> > loop.
> 
> Thank you for the comment.
> OK, I understood. So I updated the start size to 16KB too to commit first.
> Please find V5 [1] and merge it if it's OK.
> Changes from V4:
> - Start size to 16KB from 256B
> - End size to 16MB from 64MB
 
> [1] https://sourceware.org/pipermail/libc-alpha/2021-September/131245.html
 
Hi Tamura,

I agree with you that it is important to measure calls with smaller
lengths.  IMHO the issue here is not whether the benchmark should measure
these lengths, but how it could measure them.

+static void
+__attribute__((noinline, noclone))
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
+	     int c1 __attribute ((unused)), int c2 __attribute ((unused)),
+	     size_t n)
+{
+  size_t i, iters = 32;
+  timing_t start, stop, cur, latency = 0;
+
+  CALL (impl, s, c2, n); // warm up
+
+  for (i = 0; i < iters; i++)
+    {
+      memset (s, c1, n); // alternation
+
+      TIMING_NOW (start);
+
+      CALL (impl, s, c2, n);
+
+      TIMING_NOW (stop);
+      TIMING_DIFF (cur, start, stop);
+      TIMING_ACCUM (latency, cur);
+    }
+
+  json_element_double (json_ctx, (double) latency / (double) iters);
+}

By doing this you are measuring just the call itself and accumulating
the results. This is indeed not measurable for really small lengths.
You could try moving the memset and the timing out of the loop and
measure the time spent in multiple runs. To fix the memset you could
memset a bigger buffer and move the s pointer on each loop. I guess this
will reduce the variations Wilco mentioned.
Maybe we need to keep this loop for bigger lengths, as the implementation
that I suggested would need a buffer that is too big.

Another point here is that GNU Code Style asks for /**/ comments
instead of //. As seen in
http://www.gnu.org/prep/standards/standards.html#Comments

Finally, Sorry that I took so long to reply here.
Thanks for working on this.
---
Lucas A. M. Magalhães

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v3 2/5] benchtests: Add memset zero fill benchtest
  2021-10-18 12:57                           ` Lucas A. M. Magalhaes via Libc-alpha
@ 2021-10-20 13:44                             ` Wilco Dijkstra via Libc-alpha
  2021-10-20 15:35                               ` Lucas A. M. Magalhaes via Libc-alpha
  0 siblings, 1 reply; 83+ messages in thread
From: Wilco Dijkstra via Libc-alpha @ 2021-10-20 13:44 UTC (permalink / raw)
  To: Lucas A. M. Magalhaes, libc-alpha@sourceware.org,
	naohirot@fujitsu.com

Hi Lucas,

> By doing this you are measuring just the call it self and accumulating
> the results. This is indeed not measurable for really small lengths.
> You could try moving the memset and the timing out of the loop and
> measure the time spent in multiple runs. To fix the memset you could
> memset a bigger buffer and move the s pointer on each loop. I guess this
> will reduce the variations Wilco mentioned.

That would basically end up the same as bench-memset-walk.c given that
you need a huge buffer to get reasonable accuracy (bench-memset does
8192 iterations by default, and that is still inaccurate for small sizes).
In that case it would be easier to improve bench-memset-walk.c rather than
adding yet another benchmark that is too inaccurate to be useful.

Alternatively we could use the timing loop I suggested which allows any
pattern of zero/non-zero to be tested accurately:

      TIMING_NOW (start);
      for (j = 0; j < iters; j++)
        CALL (impl, s, memset_value[j & MASK], n);
      TIMING_NOW (stop);

Cheers,
Wilco

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v3 2/5] benchtests: Add memset zero fill benchtest
  2021-10-20 13:44                             ` Wilco Dijkstra via Libc-alpha
@ 2021-10-20 15:35                               ` Lucas A. M. Magalhaes via Libc-alpha
  2021-10-20 17:47                                 ` Wilco Dijkstra via Libc-alpha
  0 siblings, 1 reply; 83+ messages in thread
From: Lucas A. M. Magalhaes via Libc-alpha @ 2021-10-20 15:35 UTC (permalink / raw)
  To: Wilco Dijkstra, libc-alpha, naohirot

Hi Wilco,
> > By doing this you are measuring just the call it self and accumulating
> > the results. This is indeed not measurable for really small lengths.
> > You could try moving the memset and the timing out of the loop and
> > measure the time spent in multiple runs. To fix the memset you could
> > memset a bigger buffer and move the s pointer on each loop. I guess this
> > will reduce the variations Wilco mentioned.
> 
> That would basically end up the same as bench-memset-walk.c given that
> you need a huge buffer to get reasonable accuracy (bench-memset does
> 8192 iterations by default, and that is still inaccurate for small sizes).
> In that case it would be easier to improve bench-memset-walk.c rather than
> adding yet another benchmark that is too inaccurate to be useful.
Yeah, I agree with you.
> 
> Alternatively we could use the timing loop I suggested which allows any
> pattern of zero/non-zero to be tested accurately:
> 
>       TIMING_NOW (start);
>       for (j = 0; j < iters; j++)
>         CALL (impl, s, memset_value[j & MASK], n);
>       TIMING_NOW (stop);
> 
Sorry, but I suppose I didn't understand your suggestion completely.  The
memset_value array will hold patterns like [0,0], [0,1] or [1,1],
right?  If so, this will not work to measure the zero-to-one pattern, for
example, as it will be mixing zero-to-one with one-to-zero calls. In
order to measure just one specific pattern, the buffer must be loaded
before the timing loop.

---
Lucas A. M. Magalhães

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v3 2/5] benchtests: Add memset zero fill benchtest
  2021-10-20 15:35                               ` Lucas A. M. Magalhaes via Libc-alpha
@ 2021-10-20 17:47                                 ` Wilco Dijkstra via Libc-alpha
  2021-10-22 13:08                                   ` Lucas A. M. Magalhaes via Libc-alpha
  0 siblings, 1 reply; 83+ messages in thread
From: Wilco Dijkstra via Libc-alpha @ 2021-10-20 17:47 UTC (permalink / raw)
  To: Lucas A. M. Magalhaes, libc-alpha@sourceware.org,
	naohirot@fujitsu.com

Hi Lucas,

> Sorry but I suppose don't understood your suggestion completely.  The
> memset_value array will hold patterns like [0,0], [0,1] or [1,1],
> right?  If so, this will not work to measure the zero-to-one pattern for
> example, as it will be mixing zero-to-one with one-to-zero calls. In
> order to measure just an specific patter the buffer must be loaded
> previously of the timing loop.

The original idea was to add more tests for memset of zero and check
whether writing zero is optimized and/or writing zero over zero. There is
an equal number of 0->1 and 1->0 transitions in a pattern, so you can't
easily differentiate between them, but you can tell whether they are the
same or faster than 1->1 transitions.

For 0->0 you can run different patterns with a varying number of transitions
but the same number of zeroes and ones: eg. 0000000011111111 (7 times 0->0)
vs 0011001100110011 (4 times 0->0) vs 0101010101010101 (no 0->0).

Cheers,
Wilco

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v3 2/5] benchtests: Add memset zero fill benchtest
  2021-10-20 17:47                                 ` Wilco Dijkstra via Libc-alpha
@ 2021-10-22 13:08                                   ` Lucas A. M. Magalhaes via Libc-alpha
  0 siblings, 0 replies; 83+ messages in thread
From: Lucas A. M. Magalhaes via Libc-alpha @ 2021-10-22 13:08 UTC (permalink / raw)
  To: Wilco Dijkstra, libc-alpha, naohirot

Hi Wilco, Thanks for clarifying.

> > Sorry but I suppose don't understood your suggestion completely.  The
> > memset_value array will hold patterns like [0,0], [0,1] or [1,1],
> > right?  If so, this will not work to measure the zero-to-one pattern for
> > example, as it will be mixing zero-to-one with one-to-zero calls. In
> > order to measure just an specific patter the buffer must be loaded
> > previously of the timing loop.
> 
> The original idea was to add more tests for memset of zero and check
> whether writing zero is optimized and/or writing zero over zero. There is
> an equal number of 0->1 and 1->0 transitions in a pattern, so you can't
> easily differentiate between them, but you can tell whether they are the
> same or faster than 1->1 transitions.
> 
> For 0->0 you can run different patterns with a varying number of transitions
> but the same number of zeroes and ones: eg. 0000000011111111 (7 times 0->0)
> vs 0011001100110011 (4 times 0->0) vs 0101010101010101 (no 0->0).

That's an interesting strategy, indeed. I guess that's a little more
complex than most of the other benchmarks. I agree that this could solve
the issues with variations for small lengths.

Thanks.
---
Lucas A. M. Magalhães

^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v5] benchtests: Add memset zero fill benchtest
  2021-09-22  1:05                       ` [PATCH v5] " Naohiro Tamura via Libc-alpha
@ 2023-02-09 17:23                         ` Carlos O'Donell via Libc-alpha
  2023-02-10  1:26                           ` Siddhesh Poyarekar via Libc-alpha
  0 siblings, 1 reply; 83+ messages in thread
From: Carlos O'Donell via Libc-alpha @ 2023-02-09 17:23 UTC (permalink / raw)
  To: Naohiro Tamura, Wilco Dijkstra, 'Lucas A . M . Magalhaes',
	Noah Goldstein, libc-alpha, Siddhesh Poyarekar

On 9/21/21 21:05, Naohiro Tamura via Libc-alpha wrote:
> Memset takes 0 as the second parameter in most cases.
> However, we cannot measure the zero fill performance by
> bench-memset.c, bench-memset-large.c and bench-memset-walk.c
> precisely.
> X86_64 micro-architecture has some zero-over-zero optimization, and
> AArch64 micro-architecture also has some optimization for DC ZVA
> instruction.
> This patch provides bench-memset-zerofill.c which is suitable to
> analyze the zero fill performance by comparing among 4 patterns,
> zero-over-zero, zero-over-one, one-over-zero and one-over-one through
> L1, L2 and L3 caches.

As I aim to drive the SLI to 90-days for patch review... this is next on my list.

We have 3 reviewed-by aggregations here, but we have not yet committed this patch.

Where did we land here? Is this ready to be included in the microbenchmark?

> The following commands are examples to analyze a JSON output,
> bench-memset-zerofill.out, by 'jq' and 'plot_strings.py'.
> 
> 1) compare zero-over-zero performance
> 
> $ cat bench-memset-zerofill.out | \
>   jq -r '
>     .functions.memset."bench-variant"="zerofill-0o0" |
>     del(.functions.memset.results[] | select(.char1 != 0 or .char2 != 0))
>   ' | \
>   plot_strings.py -l -p thru -v -
> 
> 2) compare zero performance
> 
> $ cat bench-memset-zerofill.out | \
>   jq -r '
>     .functions.memset."bench-variant"="zerofill-zero" |
>     del(.functions.memset.results[] | select(.char2 != 0))
>   ' | \
>   plot_strings.py -l -p thru -v -
> 
> 3) compare nonzero performance
> 
> $ cat bench-memset-zerofill.out | \
>   jq -r '
>     .functions.memset."bench-variant"="zerofill-nonzero" |
>     del(.functions.memset.results[] | select(.char2 == 0))
>   ' | \
>   plot_strings.py -l -p thru -v -
> 
> Reviewed-by: Lucas A. M. Magalhaes <lamm@linux.ibm.com>
> Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
> Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
> ---
>  benchtests/Makefile                |   2 +-
>  benchtests/bench-memset-zerofill.c | 140 +++++++++++++++++++++++++++++
>  2 files changed, 141 insertions(+), 1 deletion(-)
>  create mode 100644 benchtests/bench-memset-zerofill.c
> 
> diff --git a/benchtests/Makefile b/benchtests/Makefile
> index 1530939a8ce8..21b95c736190 100644
> --- a/benchtests/Makefile
> +++ b/benchtests/Makefile
> @@ -53,7 +53,7 @@ string-benchset := memccpy memchr memcmp memcpy memmem memmove \
>  		   strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
>  		   strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
>  		   strcoll memcpy-large memcpy-random memmove-large memset-large \
> -		   memcpy-walk memset-walk memmove-walk
> +		   memcpy-walk memset-walk memmove-walk memset-zerofill
>  
>  # Build and run locale-dependent benchmarks only if we're building natively.
>  ifeq (no,$(cross-compiling))
> diff --git a/benchtests/bench-memset-zerofill.c b/benchtests/bench-memset-zerofill.c
> new file mode 100644
> index 000000000000..2e146e157e25
> --- /dev/null
> +++ b/benchtests/bench-memset-zerofill.c
> @@ -0,0 +1,140 @@
> +/* Measure memset functions with zero fill data.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define TEST_MAIN
> +#define TEST_NAME "memset"
> +#define START_SIZE (16 * 1024)
> +#define MIN_PAGE_SIZE (getpagesize () + 16 * 1024 * 1024)
> +#define TIMEOUT (20 * 60)
> +#include "bench-string.h"
> +
> +#include "json-lib.h"
> +
> +void *generic_memset (void *, int, size_t);
> +typedef void *(*proto_t) (void *, int, size_t);
> +
> +IMPL (MEMSET, 1)
> +IMPL (generic_memset, 0)
> +
> +static void
> +__attribute__((noinline, noclone))
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
> +	     int c1 __attribute ((unused)), int c2 __attribute ((unused)),
> +	     size_t n)
> +{
> +  size_t i, iters = 32;
> +  timing_t start, stop, cur, latency = 0;
> +
> +  CALL (impl, s, c2, n); // warm up
> +
> +  for (i = 0; i < iters; i++)
> +    {
> +      memset (s, c1, n); // alternation
> +
> +      TIMING_NOW (start);
> +
> +      CALL (impl, s, c2, n);
> +
> +      TIMING_NOW (stop);
> +      TIMING_DIFF (cur, start, stop);
> +      TIMING_ACCUM (latency, cur);
> +    }
> +
> +  json_element_double (json_ctx, (double) latency / (double) iters);
> +}
> +
> +static void
> +do_test (json_ctx_t *json_ctx, size_t align, int c1, int c2, size_t len)
> +{
> +  align &= getpagesize () - 1;
> +  if ((align + len) * sizeof (CHAR) > page_size)
> +    return;
> +
> +  json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "length", len);
> +  json_attr_uint (json_ctx, "alignment", align);
> +  json_attr_int (json_ctx, "char1", c1);
> +  json_attr_int (json_ctx, "char2", c2);
> +  json_array_begin (json_ctx, "timings");
> +
> +  FOR_EACH_IMPL (impl, 0)
> +    {
> +      do_one_test (json_ctx, impl, (CHAR *) (buf1) + align, c1, c2, len);
> +      alloc_bufs ();
> +    }
> +
> +  json_array_end (json_ctx);
> +  json_element_object_end (json_ctx);
> +}
> +
> +int
> +test_main (void)
> +{
> +  json_ctx_t json_ctx;
> +  size_t i;
> +  int c1, c2;
> +
> +  test_init ();
> +
> +  json_init (&json_ctx, 0, stdout);
> +
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, TEST_NAME);
> +  json_attr_string (&json_ctx, "bench-variant", "zerofill");
> +
> +  json_array_begin (&json_ctx, "ifuncs");
> +  FOR_EACH_IMPL (impl, 0)
> +    json_element_string (&json_ctx, impl->name);
> +  json_array_end (&json_ctx);
> +
> +  json_array_begin (&json_ctx, "results");
> +
> +  // To analyze zero fill performance by comparing among the following 4
> +  // patterns through L1, L2 and L3 caches.
> +  // - zero-over-zero: c1=0, c2=0
> +  // - zero-over-one:  c1=0, c2=1
> +  // - one-over-zero:  c1=1, c2=0
> +  // - one-over-one:   c1=1, c2=1
> +  for (c1 = 0; c1 < 2; c1++)
> +    for (c2 = 0; c2 < 2; c2++)
> +      for (i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
> +	{
> +	  do_test (&json_ctx, 0, c1, c2, i);
> +	  do_test (&json_ctx, 3, c1, c2, i);
> +	}
> +
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
> +
> +  return ret;
> +}
> +
> +#include <support/test-driver.c>
> +
> +#define libc_hidden_builtin_def(X)
> +#define libc_hidden_def(X)
> +#define libc_hidden_weak(X)
> +#define weak_alias(X,Y)
> +#undef MEMSET
> +#define MEMSET generic_memset
> +#include <string/memset.c>

-- 
Cheers,
Carlos.


^ permalink raw reply	[flat|nested] 83+ messages in thread

* Re: [PATCH v5] benchtests: Add memset zero fill benchtest
  2023-02-09 17:23                         ` Carlos O'Donell via Libc-alpha
@ 2023-02-10  1:26                           ` Siddhesh Poyarekar via Libc-alpha
  0 siblings, 0 replies; 83+ messages in thread
From: Siddhesh Poyarekar via Libc-alpha @ 2023-02-10  1:26 UTC (permalink / raw)
  To: Carlos O'Donell
  Cc: Naohiro Tamura, Wilco Dijkstra, Lucas A . M . Magalhaes,
	Noah Goldstein, libc-alpha

On Thu, Feb 9, 2023 at 12:23 PM Carlos O'Donell <carlos@redhat.com> wrote:
>
> On 9/21/21 21:05, Naohiro Tamura via Libc-alpha wrote:
> > Memset takes 0 as the second parameter in most cases.
> > However, we cannot measure the zero fill performance by
> > bench-memset.c, bench-memset-large.c and bench-memset-walk.c
> > precisely.
> > X86_64 micro-architecture has some zero-over-zero optimization, and
> > AArch64 micro-architecture also has some optimization for DC ZVA
> > instruction.
> > This patch provides bench-memset-zerofill.c which is suitable to
> > analyze the zero fill performance by comparing among 4 patterns,
> > zero-over-zero, zero-over-one, one-over-zero and one-over-one through
> > L1, L2 and L3 caches.
>
> As I aim to drive the SLI to 90-days for patch review... this is next on my list.
>
> We have 3 reviewed-by aggregations here, but we have not yet committed this patch.
>
> Where did we land here? Is this ready to be included in the microbenchmark?

I dug around a bit and found that the conversation for this v5
continued in the v3 thread[1] and AFAICT, the conclusion was to drop
this patch.

Sid

[1] https://patchwork.sourceware.org/project/glibc/patch/20210805075053.433538-1-naohirot@fujitsu.com/


^ permalink raw reply	[flat|nested] 83+ messages in thread

end of thread, other threads:[~2023-02-10  1:27 UTC | newest]

Thread overview: 83+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-13  8:22 [PATCH] benchtests: Add memset zero fill benchmark tests Naohiro Tamura via Libc-alpha
2021-07-13 13:50 ` Lucas A. M. Magalhaes via Libc-alpha
2021-07-20  6:31 ` [PATCH v2 0/5] " Naohiro Tamura via Libc-alpha
2021-08-05  7:47   ` [PATCH v3 0/5] benchtests: Add memset zero fill benchmark test Naohiro Tamura via Libc-alpha
2021-08-05  7:49     ` [PATCH v3 1/5] benchtests: Enable scripts/plot_strings.py to read stdin Naohiro Tamura via Libc-alpha
2021-08-05  7:56       ` Siddhesh Poyarekar
2021-09-08  1:46         ` naohirot--- via Libc-alpha
2021-09-08 12:56           ` Siddhesh Poyarekar
2021-09-09  0:22             ` naohirot--- via Libc-alpha
2021-09-13  3:45               ` Siddhesh Poyarekar
2021-08-05  7:50     ` [PATCH v3 2/5] benchtests: Add memset zero fill benchtest Naohiro Tamura via Libc-alpha
2021-09-08  2:03       ` naohirot--- via Libc-alpha
2021-09-10 20:40       ` Lucas A. M. Magalhaes via Libc-alpha
2021-09-13  0:53         ` naohirot--- via Libc-alpha
2021-09-13 14:05           ` Lucas A. M. Magalhaes via Libc-alpha
2021-09-14  0:38             ` [PATCH v4] " Naohiro Tamura via Libc-alpha
2021-09-14  0:44             ` [PATCH v3 2/5] " naohirot--- via Libc-alpha
2021-09-14 14:02               ` Wilco Dijkstra via Libc-alpha
2021-09-15  8:24                 ` naohirot--- via Libc-alpha
2021-09-21  1:27                   ` naohirot--- via Libc-alpha
2021-09-21 11:09                     ` Wilco Dijkstra via Libc-alpha
2021-09-22  1:05                       ` [PATCH v5] " Naohiro Tamura via Libc-alpha
2023-02-09 17:23                         ` Carlos O'Donell via Libc-alpha
2023-02-10  1:26                           ` Siddhesh Poyarekar via Libc-alpha
2021-09-22  1:07                       ` [PATCH v3 2/5] " naohirot--- via Libc-alpha
2021-09-28  1:40                         ` naohirot--- via Libc-alpha
2021-09-30  0:55                           ` Tamura, Naohiro/田村 直� via Libc-alpha
2021-10-18 12:57                           ` Lucas A. M. Magalhaes via Libc-alpha
2021-10-20 13:44                             ` Wilco Dijkstra via Libc-alpha
2021-10-20 15:35                               ` Lucas A. M. Magalhaes via Libc-alpha
2021-10-20 17:47                                 ` Wilco Dijkstra via Libc-alpha
2021-10-22 13:08                                   ` Lucas A. M. Magalhaes via Libc-alpha
2021-08-05  7:51     ` [PATCH v3 3/5] benchtests: Remove redundant assert.h Naohiro Tamura via Libc-alpha
2021-09-08  1:59       ` naohirot--- via Libc-alpha
2021-09-13  3:36       ` Siddhesh Poyarekar
2021-08-05  7:51     ` [PATCH v3 4/5] benchtests: Fix validate_benchout.py exceptions Naohiro Tamura via Libc-alpha
2021-09-08  1:55       ` naohirot--- via Libc-alpha
2021-09-13  3:42       ` Siddhesh Poyarekar
2021-09-13  3:50         ` Siddhesh Poyarekar
2021-09-13 13:44           ` [PATCH v4] " Naohiro Tamura via Libc-alpha
2021-09-15  3:23             ` Siddhesh Poyarekar
2021-09-16  1:12               ` naohirot--- via Libc-alpha
2021-09-16  1:41                 ` Siddhesh Poyarekar
2021-09-16  2:23                   ` [PATCH v5] " Naohiro Tamura via Libc-alpha
2021-09-16  3:48                     ` Siddhesh Poyarekar
2021-09-16  5:23                       ` naohirot--- via Libc-alpha
2021-09-16  2:26                   ` [PATCH v4] " naohirot--- via Libc-alpha
2021-09-13 13:46           ` [PATCH v3 4/5] " naohirot--- via Libc-alpha
2021-08-05  7:52     ` [PATCH v3 5/5] config: Rename HAVE_BUILTIN_MEMSET macro Naohiro Tamura via Libc-alpha
2021-08-11 20:34       ` Adhemerval Zanella via Libc-alpha
2021-07-20  6:34 ` [PATCH v2 1/5] benchtests: Enable scripts/plot_strings.py to read stdin Naohiro Tamura via Libc-alpha
2021-07-20  6:35 ` [PATCH v2 2/5] benchtests: Add memset zero fill benchtest Naohiro Tamura via Libc-alpha
2021-07-20 16:48   ` Noah Goldstein via Libc-alpha
2021-07-21 12:56     ` naohirot--- via Libc-alpha
2021-07-21 13:07       ` naohirot--- via Libc-alpha
2021-07-21 18:14         ` Noah Goldstein via Libc-alpha
2021-07-21 19:17           ` Wilco Dijkstra via Libc-alpha
2021-07-26  8:42             ` naohirot--- via Libc-alpha
2021-07-26 11:15               ` Wilco Dijkstra via Libc-alpha
2021-07-27  2:24                 ` naohirot--- via Libc-alpha
2021-07-27 17:26                   ` Wilco Dijkstra via Libc-alpha
2021-07-28  7:27                     ` naohirot--- via Libc-alpha
2021-08-04  9:11                       ` naohirot--- via Libc-alpha
2021-07-26  8:39     ` naohirot--- via Libc-alpha
2021-07-26 17:22       ` Noah Goldstein via Libc-alpha
2021-07-20  6:35 ` [PATCH v2 3/5] benchtests: Add a script to convert benchout string JSON to CSV Naohiro Tamura via Libc-alpha
2021-07-21  2:41   ` naohirot--- via Libc-alpha
2021-07-27 20:17   ` Joseph Myers
2021-07-29  1:56     ` naohirot--- via Libc-alpha
2021-07-29  4:42       ` Siddhesh Poyarekar
2021-07-30  7:05         ` naohirot--- via Libc-alpha
2021-07-31 10:47           ` Siddhesh Poyarekar
2021-07-20  6:36 ` [PATCH v2 4/5] benchtests: Remove redundant assert.h Naohiro Tamura via Libc-alpha
2021-07-20  6:37 ` [PATCH v2 5/5] benchtests: Fix validate_benchout.py exceptions Naohiro Tamura via Libc-alpha
2021-07-26  8:34 ` [PATCH] config: Remove HAVE_BUILTIN_MEMSET macro Naohiro Tamura via Libc-alpha
2021-07-26  8:48   ` naohirot--- via Libc-alpha
2021-07-26  8:49   ` Andreas Schwab
2021-07-26  9:42     ` naohirot--- via Libc-alpha
2021-07-26  9:51       ` Andreas Schwab
2021-07-26 13:16         ` naohirot--- via Libc-alpha
2021-07-26  8:35 ` [PATCH] benchtests: Add a script to merge two benchout string files Naohiro Tamura via Libc-alpha
2021-07-27 20:51   ` Joseph Myers
2021-07-30  7:04     ` naohirot--- via Libc-alpha

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).