* [PATCH] benchtests: Add memset zero fill benchmark tests
@ 2021-07-13 8:22 Naohiro Tamura via Libc-alpha
2021-07-13 13:50 ` Lucas A. M. Magalhaes via Libc-alpha
0 siblings, 1 reply; 7+ messages in thread
From: Naohiro Tamura via Libc-alpha @ 2021-07-13 8:22 UTC (permalink / raw)
To: libc-alpha
Memset takes 0 as the second parameter in most cases.
More than 95% of the memset calls in the Linux kernel source code
pass 0 as the second parameter.
However, we cannot measure the zero fill performance by
bench-memset.c and bench-memset-large.c.
This patch provides bench-memset-zerofill.c and
bench-memset-large-zerofill.c which are suitable to see the
performance of zero fill by fixing the second parameter to 0.
---
benchtests/Makefile | 3 +-
benchtests/bench-memset-large-zerofill.c | 125 ++++++++++++++++++
benchtests/bench-memset-zerofill.c | 156 +++++++++++++++++++++++
3 files changed, 283 insertions(+), 1 deletion(-)
create mode 100644 benchtests/bench-memset-large-zerofill.c
create mode 100644 benchtests/bench-memset-zerofill.c
diff --git a/benchtests/Makefile b/benchtests/Makefile
index 1530939a8ce8..1261f7650fc7 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -53,7 +53,8 @@ string-benchset := memccpy memchr memcmp memcpy memmem memmove \
strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
strcoll memcpy-large memcpy-random memmove-large memset-large \
- memcpy-walk memset-walk memmove-walk
+ memcpy-walk memset-walk memmove-walk memset-zerofill \
+ memset-large-zerofill
# Build and run locale-dependent benchmarks only if we're building natively.
ifeq (no,$(cross-compiling))
diff --git a/benchtests/bench-memset-large-zerofill.c b/benchtests/bench-memset-large-zerofill.c
new file mode 100644
index 000000000000..d8eae9d9789f
--- /dev/null
+++ b/benchtests/bench-memset-large-zerofill.c
@@ -0,0 +1,125 @@
+/* Measure memset functions with large data sizes.
+ Copyright (C) 2016-2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define TEST_MAIN
+#define TEST_NAME "memset"
+#define START_SIZE (128 * 1024)
+#define MIN_PAGE_SIZE (getpagesize () + 64 * 1024 * 1024)
+#define TIMEOUT (20 * 60)
+#include "bench-string.h"
+
+#include <assert.h>
+#include "json-lib.h"
+
+void *generic_memset (void *, int, size_t);
+typedef void *(*proto_t) (void *, int, size_t);
+
+IMPL (MEMSET, 1)
+IMPL (generic_memset, 0)
+
+static void
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
+ int c __attribute ((unused)), size_t n)
+{
+ size_t i, iters = 16;
+ timing_t start, stop, cur;
+
+ TIMING_NOW (start);
+ for (i = 0; i < iters; ++i)
+ {
+ CALL (impl, s, c, n);
+ }
+ TIMING_NOW (stop);
+
+ TIMING_DIFF (cur, start, stop);
+
+ json_element_double (json_ctx, (double) cur / (double) iters);
+}
+
+static void
+do_test (json_ctx_t *json_ctx, size_t align, int c, size_t len)
+{
+ align &= 63;
+ if ((align + len) * sizeof (CHAR) > page_size)
+ return;
+
+ json_element_object_begin (json_ctx);
+ json_attr_uint (json_ctx, "length", len);
+ json_attr_uint (json_ctx, "alignment", align);
+ json_attr_int (json_ctx, "char", c);
+ json_array_begin (json_ctx, "timings");
+
+ FOR_EACH_IMPL (impl, 0)
+ {
+ do_one_test (json_ctx, impl, (CHAR *) (buf1) + align, c, len);
+ alloc_bufs ();
+ }
+
+ json_array_end (json_ctx);
+ json_element_object_end (json_ctx);
+}
+
+int
+test_main (void)
+{
+ json_ctx_t json_ctx;
+ size_t i;
+ int c;
+
+ test_init ();
+
+ json_init (&json_ctx, 0, stdout);
+
+ json_document_begin (&json_ctx);
+ json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+ json_attr_object_begin (&json_ctx, "functions");
+ json_attr_object_begin (&json_ctx, TEST_NAME);
+ json_attr_string (&json_ctx, "bench-variant", "large-zerofill");
+
+ json_array_begin (&json_ctx, "ifuncs");
+ FOR_EACH_IMPL (impl, 0)
+ json_element_string (&json_ctx, impl->name);
+ json_array_end (&json_ctx);
+
+ json_array_begin (&json_ctx, "results");
+
+ c = 0;
+ for (i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
+ {
+ do_test (&json_ctx, 0, c, i);
+ do_test (&json_ctx, 3, c, i);
+ }
+
+ json_array_end (&json_ctx);
+ json_attr_object_end (&json_ctx);
+ json_attr_object_end (&json_ctx);
+ json_document_end (&json_ctx);
+
+ return ret;
+}
+
+#include <support/test-driver.c>
+
+#define libc_hidden_builtin_def(X)
+#define libc_hidden_def(X)
+#define libc_hidden_weak(X)
+#define weak_alias(X,Y)
+#undef MEMSET
+#define MEMSET generic_memset
+#include <string/memset.c>
diff --git a/benchtests/bench-memset-zerofill.c b/benchtests/bench-memset-zerofill.c
new file mode 100644
index 000000000000..ac20ae4c6537
--- /dev/null
+++ b/benchtests/bench-memset-zerofill.c
@@ -0,0 +1,156 @@
+/* Measure memset functions.
+ Copyright (C) 2013-2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define TEST_MAIN
+#ifndef WIDE
+# define TEST_NAME "memset"
+#else
+# define TEST_NAME "wmemset"
+# define generic_memset generic_wmemset
+#endif /* WIDE */
+#define MIN_PAGE_SIZE 131072
+#include "bench-string.h"
+
+#include "json-lib.h"
+
+#ifdef WIDE
+CHAR *generic_wmemset (CHAR *, CHAR, size_t);
+#else
+void *generic_memset (void *, int, size_t);
+#endif
+
+typedef void *(*proto_t) (void *, int, size_t);
+
+IMPL (MEMSET, 1)
+IMPL (generic_memset, 0)
+
+static void
+do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
+ int c __attribute ((unused)), size_t n)
+{
+ size_t i, iters = INNER_LOOP_ITERS;
+ timing_t start, stop, cur;
+
+ TIMING_NOW (start);
+ for (i = 0; i < iters; ++i)
+ {
+ CALL (impl, s, c, n);
+ }
+ TIMING_NOW (stop);
+
+ TIMING_DIFF (cur, start, stop);
+
+ json_element_double (json_ctx, (double) cur / (double) iters);
+}
+
+static void
+do_test (json_ctx_t *json_ctx, size_t align, int c, size_t len)
+{
+ align &= 4095;
+ if ((align + len) * sizeof (CHAR) > page_size)
+ return;
+
+ json_element_object_begin (json_ctx);
+ json_attr_uint (json_ctx, "length", len);
+ json_attr_uint (json_ctx, "alignment", align);
+ json_attr_int (json_ctx, "char", c);
+ json_array_begin (json_ctx, "timings");
+
+ FOR_EACH_IMPL (impl, 0)
+ {
+ do_one_test (json_ctx, impl, (CHAR *) (buf1) + align, c, len);
+ alloc_bufs ();
+ }
+
+ json_array_end (json_ctx);
+ json_element_object_end (json_ctx);
+}
+
+int
+test_main (void)
+{
+ json_ctx_t json_ctx;
+ size_t i;
+ int c = 0;
+
+ test_init ();
+
+ json_init (&json_ctx, 0, stdout);
+
+ json_document_begin (&json_ctx);
+ json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+ json_attr_object_begin (&json_ctx, "functions");
+ json_attr_object_begin (&json_ctx, TEST_NAME);
+ json_attr_string (&json_ctx, "bench-variant", "default-zerofill");
+
+ json_array_begin (&json_ctx, "ifuncs");
+ FOR_EACH_IMPL (impl, 0)
+ json_element_string (&json_ctx, impl->name);
+ json_array_end (&json_ctx);
+
+ json_array_begin (&json_ctx, "results");
+
+ c = 0;
+ for (i = 0; i < 18; ++i)
+ do_test (&json_ctx, 0, c, 1 << i);
+ for (i = 1; i < 64; ++i)
+ {
+ do_test (&json_ctx, i, c, i);
+ do_test (&json_ctx, 4096 - i, c, i);
+ do_test (&json_ctx, 4095, c, i);
+ if (i & (i - 1))
+ do_test (&json_ctx, 0, c, i);
+ }
+ for (i = 32; i < 512; i+=32)
+ {
+ do_test (&json_ctx, 0, c, i);
+ do_test (&json_ctx, i, c, i);
+ }
+ do_test (&json_ctx, 1, c, 14);
+ do_test (&json_ctx, 3, c, 1024);
+ do_test (&json_ctx, 4, c, 64);
+ do_test (&json_ctx, 2, c, 25);
+ for (i = 33; i <= 256; i += 4)
+ {
+ do_test (&json_ctx, 0, c, 32 * i);
+ do_test (&json_ctx, i, c, 32 * i);
+ }
+
+ json_array_end (&json_ctx);
+ json_attr_object_end (&json_ctx);
+ json_attr_object_end (&json_ctx);
+ json_document_end (&json_ctx);
+
+ return ret;
+}
+
+#include <support/test-driver.c>
+
+#define libc_hidden_builtin_def(X)
+#define libc_hidden_def(X)
+#define libc_hidden_weak(X)
+#define weak_alias(X,Y)
+#ifndef WIDE
+# undef MEMSET
+# define MEMSET generic_memset
+# include <string/memset.c>
+#else
+# define WMEMSET generic_wmemset
+# include <wcsmbs/wmemset.c>
+#endif
--
2.17.1
^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [PATCH] benchtests: Add memset zero fill benchmark tests
2021-07-13 8:22 Naohiro Tamura via Libc-alpha
@ 2021-07-13 13:50 ` Lucas A. M. Magalhaes via Libc-alpha
0 siblings, 0 replies; 7+ messages in thread
From: Lucas A. M. Magalhaes via Libc-alpha @ 2021-07-13 13:50 UTC (permalink / raw)
To: Naohiro Tamura, libc-alpha
Hi Naohiro,
Thanks for working on this.
I like the idea of a benchmark specific for 0 on memset. However having two
implementations seems too much. I would rather see just one
bench-memset-zerofill.c. What I guess would be even better is to have this
performance test inside bench-memset.c and bench-memset-large.c.
Quoting Naohiro Tamura via Libc-alpha (2021-07-13 05:22:14)
> Memset takes 0 as the second parameter in most cases.
> More than 95% of memset takes 0 as the second parameter in case of
> Linux Kernel source code.
The Linux kernel does not use glibc; it has its own memset implementation.
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/lib/string.c#n784
Therefore IMO this argument is not suited for this commit.
> However, we cannot measure the zero fill performance by
> bench-memset-zerofill.c and bench-memset-large-zerofill.c.
> This patch provides bench-memset-zerofill.c and
> bench-memset-large-zerofill.c which are suitable to see the
> performance of zero fill by fixing the second parameter to 0.
In this section I guess you mistake bench-memset.c and bench-memset-large.c for
bench-memset-zerofill.c and bench-memset-large-zerofill.c.
> ---
> benchtests/Makefile | 3 +-
> benchtests/bench-memset-large-zerofill.c | 125 ++++++++++++++++++
> benchtests/bench-memset-zerofill.c | 156 +++++++++++++++++++++++
> 3 files changed, 283 insertions(+), 1 deletion(-)
> create mode 100644 benchtests/bench-memset-large-zerofill.c
> create mode 100644 benchtests/bench-memset-zerofill.c
>
> diff --git a/benchtests/Makefile b/benchtests/Makefile
> index 1530939a8ce8..1261f7650fc7 100644
> --- a/benchtests/Makefile
> +++ b/benchtests/Makefile
> @@ -53,7 +53,8 @@ string-benchset := memccpy memchr memcmp memcpy memmem memmove \
> strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
> strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
> strcoll memcpy-large memcpy-random memmove-large memset-large \
> - memcpy-walk memset-walk memmove-walk
> + memcpy-walk memset-walk memmove-walk memset-zerofill \
> + memset-large-zerofill
>
> # Build and run locale-dependent benchmarks only if we're building natively.
> ifeq (no,$(cross-compiling))
> diff --git a/benchtests/bench-memset-large-zerofill.c b/benchtests/bench-memset-large-zerofill.c
> new file mode 100644
> index 000000000000..d8eae9d9789f
> --- /dev/null
> +++ b/benchtests/bench-memset-large-zerofill.c
> @@ -0,0 +1,125 @@
> +/* Measure memset functions with large data sizes.
Please fix this description.
> + Copyright (C) 2016-2021 Free Software Foundation, Inc.
Just 2021 here.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define TEST_MAIN
> +#define TEST_NAME "memset"
> +#define START_SIZE (128 * 1024)
> +#define MIN_PAGE_SIZE (getpagesize () + 64 * 1024 * 1024)
> +#define TIMEOUT (20 * 60)
> +#include "bench-string.h"
> +
> +#include <assert.h>
> +#include "json-lib.h"
> +
This code doesn't need assert.h.
> +void *generic_memset (void *, int, size_t);
> +typedef void *(*proto_t) (void *, int, size_t);
> +
> +IMPL (MEMSET, 1)
> +IMPL (generic_memset, 0)
> +
> +static void
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
> + int c __attribute ((unused)), size_t n)
> +{
> + size_t i, iters = 16;
> + timing_t start, stop, cur;
> +
> + TIMING_NOW (start);
> + for (i = 0; i < iters; ++i)
> + {
> + CALL (impl, s, c, n);
> + }
> + TIMING_NOW (stop);
> +
> + TIMING_DIFF (cur, start, stop);
> +
> + json_element_double (json_ctx, (double) cur / (double) iters);
> +}
> +
> +static void
> +do_test (json_ctx_t *json_ctx, size_t align, int c, size_t len)
> +{
> + align &= 63;
> + if ((align + len) * sizeof (CHAR) > page_size)
> + return;
> +
> + json_element_object_begin (json_ctx);
> + json_attr_uint (json_ctx, "length", len);
> + json_attr_uint (json_ctx, "alignment", align);
> + json_attr_int (json_ctx, "char", c);
> + json_array_begin (json_ctx, "timings");
> +
> + FOR_EACH_IMPL (impl, 0)
> + {
> + do_one_test (json_ctx, impl, (CHAR *) (buf1) + align, c, len);
> + alloc_bufs ();
> + }
> +
> + json_array_end (json_ctx);
> + json_element_object_end (json_ctx);
> +}
> +
> +int
> +test_main (void)
> +{
> + json_ctx_t json_ctx;
> + size_t i;
> + int c;
> +
> + test_init ();
> +
> + json_init (&json_ctx, 0, stdout);
> +
> + json_document_begin (&json_ctx);
> + json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> + json_attr_object_begin (&json_ctx, "functions");
> + json_attr_object_begin (&json_ctx, TEST_NAME);
> + json_attr_string (&json_ctx, "bench-variant", "large-zerofill");
> +
> + json_array_begin (&json_ctx, "ifuncs");
> + FOR_EACH_IMPL (impl, 0)
> + json_element_string (&json_ctx, impl->name);
> + json_array_end (&json_ctx);
> +
> + json_array_begin (&json_ctx, "results");
> +
> + c = 0;
> + for (i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
> + {
> + do_test (&json_ctx, 0, c, i);
> + do_test (&json_ctx, 3, c, i);
> + }
> +
> + json_array_end (&json_ctx);
> + json_attr_object_end (&json_ctx);
> + json_attr_object_end (&json_ctx);
> + json_document_end (&json_ctx);
> +
> + return ret;
> +}
> +
> +#include <support/test-driver.c>
> +
> +#define libc_hidden_builtin_def(X)
> +#define libc_hidden_def(X)
> +#define libc_hidden_weak(X)
> +#define weak_alias(X,Y)
> +#undef MEMSET
> +#define MEMSET generic_memset
> +#include <string/memset.c>
> diff --git a/benchtests/bench-memset-zerofill.c b/benchtests/bench-memset-zerofill.c
> new file mode 100644
> index 000000000000..ac20ae4c6537
> --- /dev/null
> +++ b/benchtests/bench-memset-zerofill.c
> @@ -0,0 +1,156 @@
> +/* Measure memset functions.
Fix the description.
> + Copyright (C) 2013-2021 Free Software Foundation, Inc.
Only 2021 here.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define TEST_MAIN
> +#ifndef WIDE
> +# define TEST_NAME "memset"
> +#else
> +# define TEST_NAME "wmemset"
> +# define generic_memset generic_wmemset
> +#endif /* WIDE */
> +#define MIN_PAGE_SIZE 131072
> +#include "bench-string.h"
> +
> +#include "json-lib.h"
> +
> +#ifdef WIDE
> +CHAR *generic_wmemset (CHAR *, CHAR, size_t);
> +#else
> +void *generic_memset (void *, int, size_t);
> +#endif
> +
> +typedef void *(*proto_t) (void *, int, size_t);
> +
> +IMPL (MEMSET, 1)
> +IMPL (generic_memset, 0)
> +
> +static void
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, CHAR *s,
> + int c __attribute ((unused)), size_t n)
> +{
> + size_t i, iters = INNER_LOOP_ITERS;
> + timing_t start, stop, cur;
> +
> + TIMING_NOW (start);
> + for (i = 0; i < iters; ++i)
> + {
> + CALL (impl, s, c, n);
> + }
> + TIMING_NOW (stop);
> +
> + TIMING_DIFF (cur, start, stop);
> +
> + json_element_double (json_ctx, (double) cur / (double) iters);
> +}
> +
> +static void
> +do_test (json_ctx_t *json_ctx, size_t align, int c, size_t len)
> +{
> + align &= 4095;
> + if ((align + len) * sizeof (CHAR) > page_size)
> + return;
> +
> + json_element_object_begin (json_ctx);
> + json_attr_uint (json_ctx, "length", len);
> + json_attr_uint (json_ctx, "alignment", align);
> + json_attr_int (json_ctx, "char", c);
> + json_array_begin (json_ctx, "timings");
> +
> + FOR_EACH_IMPL (impl, 0)
> + {
> + do_one_test (json_ctx, impl, (CHAR *) (buf1) + align, c, len);
> + alloc_bufs ();
> + }
> +
> + json_array_end (json_ctx);
> + json_element_object_end (json_ctx);
> +}
> +
> +int
> +test_main (void)
> +{
> + json_ctx_t json_ctx;
> + size_t i;
> + int c = 0;
> +
> + test_init ();
> +
> + json_init (&json_ctx, 0, stdout);
> +
> + json_document_begin (&json_ctx);
> + json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> + json_attr_object_begin (&json_ctx, "functions");
> + json_attr_object_begin (&json_ctx, TEST_NAME);
> + json_attr_string (&json_ctx, "bench-variant", "default-zerofill");
> +
> + json_array_begin (&json_ctx, "ifuncs");
> + FOR_EACH_IMPL (impl, 0)
> + json_element_string (&json_ctx, impl->name);
> + json_array_end (&json_ctx);
> +
> + json_array_begin (&json_ctx, "results");
> +
> + c = 0;
> + for (i = 0; i < 18; ++i)
> + do_test (&json_ctx, 0, c, 1 << i);
> + for (i = 1; i < 64; ++i)
> + {
> + do_test (&json_ctx, i, c, i);
> + do_test (&json_ctx, 4096 - i, c, i);
> + do_test (&json_ctx, 4095, c, i);
> + if (i & (i - 1))
> + do_test (&json_ctx, 0, c, i);
> + }
> + for (i = 32; i < 512; i+=32)
> + {
> + do_test (&json_ctx, 0, c, i);
> + do_test (&json_ctx, i, c, i);
> + }
> + do_test (&json_ctx, 1, c, 14);
> + do_test (&json_ctx, 3, c, 1024);
> + do_test (&json_ctx, 4, c, 64);
> + do_test (&json_ctx, 2, c, 25);
> + for (i = 33; i <= 256; i += 4)
> + {
> + do_test (&json_ctx, 0, c, 32 * i);
> + do_test (&json_ctx, i, c, 32 * i);
> + }
> +
> + json_array_end (&json_ctx);
> + json_attr_object_end (&json_ctx);
> + json_attr_object_end (&json_ctx);
> + json_document_end (&json_ctx);
> +
> + return ret;
> +}
> +
> +#include <support/test-driver.c>
> +
> +#define libc_hidden_builtin_def(X)
> +#define libc_hidden_def(X)
> +#define libc_hidden_weak(X)
> +#define weak_alias(X,Y)
> +#ifndef WIDE
> +# undef MEMSET
> +# define MEMSET generic_memset
> +# include <string/memset.c>
> +#else
> +# define WMEMSET generic_wmemset
> +# include <wcsmbs/wmemset.c>
> +#endif
> --
> 2.17.1
>
---
Lucas A. M. Magalhães
^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH] benchtests: Add memset zero fill benchmark tests
@ 2021-07-13 15:57 Wilco Dijkstra via Libc-alpha
2021-07-13 18:47 ` Noah Goldstein via Libc-alpha
0 siblings, 1 reply; 7+ messages in thread
From: Wilco Dijkstra via Libc-alpha @ 2021-07-13 15:57 UTC (permalink / raw)
To: naohirot@fujitsu.com, lamm@linux.ibm.com; +Cc: 'GNU C Library'
Hi,
> I like the idea of a benchmark specific for 0 on memset. However having two
> implementations seems too much. I would rather see just one
> bench-memset-zerofill.c. What I guess would be even better is to have this
> performance test inside bench-memset.c and bench-memset-large.c.
I agree just copying the files is not a good idea. Currently bench-memset and
bench-memset-walk already test zero memsets. Bench-memset-large could just
test zero since that is the most common, especially for large sizes. Reducing the
number of non-zero tests in bench-memset would make it more representative -
you could do the main set of tests with zero only and then have a small selection
where it alternates between zero and non-zero.
> Quoting Naohiro Tamura via Libc-alpha (2021-07-13 05:22:14)
>> Memset takes 0 as the second parameter in most cases.
>> More than 95% of memset takes 0 as the second parameter in case of
>> Linux Kernel source code.
> The Linux Kernel does not use glibc, it has his own memset implementation.
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/lib/string.c#n784
> Therefore IMO this argument is not suited for this commit.
The argument is true in general - you could simply state that almost all memset
calls are zeroing without mentioning the Linux kernel. In some old stats from
SPEC I saw about 1.8% non-zero memsets.
Cheers,
Wilco
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] benchtests: Add memset zero fill benchmark tests
2021-07-13 15:57 [PATCH] benchtests: Add memset zero fill benchmark tests Wilco Dijkstra via Libc-alpha
@ 2021-07-13 18:47 ` Noah Goldstein via Libc-alpha
2021-07-15 8:15 ` naohirot--- via Libc-alpha
0 siblings, 1 reply; 7+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2021-07-13 18:47 UTC (permalink / raw)
To: Wilco Dijkstra; +Cc: GNU C Library
On Tue, Jul 13, 2021 at 12:15 PM Wilco Dijkstra via Libc-alpha <
libc-alpha@sourceware.org> wrote:
> Hi,
>
> > I like the idea of a benchmark specific for 0 on memset. However having
> two
> > implementations seems too much. I would rather see just one
> > bench-memset-zerofill.c. What I guess would be even better is to have
> this
> > performance test inside bench-memset.c and bench-memset-large.c.
>
> I agree just copying the files is not a good idea. Currently bench-memset
> and
> bench-memset-walk already test zero memsets. Bench-memset-large could just
> test zero since that is the most common, especially for large sizes.
> Reducing the
> number of non-zero tests in bench-memset would make it more representative
> -
> you could do the main set of tests with zero only and then have a small
> selection
> where it alternates between zero and non-zero.
>
I'm in favor of a separate file. On some x86_64 systems writing zeros to a
cacheline
that has not been modified can leave the cacheline in an unmodified
state[1] which
affects memory bandwidth on the writeback to DRAM for larger regions. I can
imagine
we might want to test memset zero on unmodified vs modified region which
will require
unique setup that I think justifies a separate file (at least for
memset-large-zero).
[1] https://travisdowns.github.io/blog/2020/05/13/intel-zero-opt.html
>
> > Quoting Naohiro Tamura via Libc-alpha (2021-07-13 05:22:14)
> >> Memset takes 0 as the second parameter in most cases.
> >> More than 95% of memset takes 0 as the second parameter in case of
> >> Linux Kernel source code.
> > The Linux Kernel does not use glibc, it has his own memset
> implementation.
> >
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/lib/string.c#n784
> > Therefore IMO this argument is not suited for this commit.
>
> The argument is true in general - you could simply state that almost all
> memset
> calls are zeroing without mentioning the Linux kernel. In some old stats
> from
> SPEC I saw about 1.8% non-zero memsets.
>
> Cheers,
> Wilco
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] benchtests: Add memset zero fill benchmark tests
2021-07-13 18:47 ` Noah Goldstein via Libc-alpha
@ 2021-07-15 8:15 ` naohirot--- via Libc-alpha
2021-07-20 8:51 ` naohirot--- via Libc-alpha
0 siblings, 1 reply; 7+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-07-15 8:15 UTC (permalink / raw)
To: Noah Goldstein, Wilco Dijkstra, Lucas A. M. Magalhaes; +Cc: GNU C Library
Hi Lucas, Wilco, Noah,
Thanks for the comments.
> From: Noah Goldstein <goldstein.w.n@gmail.com>
> Sent: Wednesday, 14 July 2021 03:47
>
> On Tue, Jul 13, 2021 at 12:15 PM Wilco Dijkstra via Libc-alpha <libc-alpha@sourceware.org<mailto:libc-alpha@sourceware.org>> wrote:
> > Hi,
> >
> > > I like the idea of a benchmark specific for 0 on memset. However having two
> > > implementations seems too much. I would rather see just one
> > > bench-memset-zerofill.c. What I guess would be even better is to have this
> > > performance test inside bench-memset.c and bench-memset-large.c.
Yes, that's one way. In this case, benchtests/scripts/plot_strings.py
cannot be used as is to visualize zero-fill performance.
We need to extract the zero fill data from the JSON bench results.
> > I agree just copying the files is not a good idea. Currently bench-memset and
> > bench-memset-walk already test zero memsets. Bench-memset-large could just
> > test zero since that is the most common, especially for large sizes. Reducing the
> > number of non-zero tests in bench-memset would make it more representative -
> > you could do the main set of tests with zero only and then have a small selection
> > where it alternates between zero and non-zero.
Yes, this is one way too.
However, we also need to extract the zero fill data from JSON bench result.
>
> I'm in favor of a seperate file. On some x86_64 systems writing zeros to a cacheline
> that has not been modified can leave the cacheline in an unmodified state[1] which
> affects memory bandwidth on the writeback to DRAM for larger regions. I can imagine
> we might want to test memset zero on unmodified vs modified region which will require
> unique setup that I think justifies a separate file (at least for memset-large-zero).
>
> [1] https://travisdowns.github.io/blog/2020/05/13/intel-zero-opt.html
I thoroughly read the blog; it's very interesting.
It seems that the Skylake-S microarchitecture has some zero-over-zero
optimization in the size range of L3 and RAM.
So why don't we proceed like this?
- create a new benchtest file, ex. memset-zerofill.c.
it measures zero-over-zero and zero-over-one from 32KB(L1), through
L2 and L3, to 64MB(RAM)
- create a script to extract some part of data such as zero-over-zero
or zero-over-one from JSON bench result to CSV in the first place in
order to create graph in spreadsheet
- update benchtests/scripts/plot_strings.py later so that it can draw
zero-over-zero and zero-over-one separately
>
> >
> > > Quoting Naohiro Tamura via Libc-alpha (2021-07-13 05:22:14)
> > >> Memset takes 0 as the second parameter in most cases.
> > >> More than 95% of memset takes 0 as the second parameter in case of
> > >> Linux Kernel source code.
> > > The Linux Kernel does not use glibc, it has his own memset implementation.
> > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/lib/string.c#n784
> > > Therefore IMO this argument is not suited for this commit.
> >
> > The argument is true in general - you could simply state that almost all memset
> > calls are zeroing without mentioning the Linux kernel. In some old stats from
> > SPEC I saw about 1.8% non-zero memsets.
I'll fix that.
Thanks.
Naohiro
^ permalink raw reply [flat|nested] 7+ messages in thread
* RE: [PATCH] benchtests: Add memset zero fill benchmark tests
2021-07-15 8:15 ` naohirot--- via Libc-alpha
@ 2021-07-20 8:51 ` naohirot--- via Libc-alpha
2021-07-20 10:29 ` Wilco Dijkstra via Libc-alpha
0 siblings, 1 reply; 7+ messages in thread
From: naohirot--- via Libc-alpha @ 2021-07-20 8:51 UTC (permalink / raw)
To: Noah Goldstein, Wilco Dijkstra, Lucas A. M. Magalhaes; +Cc: GNU C Library
Hi Lucas, Wilco, Noah,
> From: Tamura, Naohiro/田村 直広 <naohirot@fujitsu.com>
> Sent: Thursday, July 15, 2021 5:16 PM
> > [1] https://travisdowns.github.io/blog/2020/05/13/intel-zero-opt.html
>
> I throughly read the blog, it's very interesting.
> It seems that Skylake-S microarchitcture has some zero-over-zero
> optimization in the size range of L3 and RAM.
>
> So why don't we proceed like this?
> - create a new benchtest file, ex. memset-zerofill.c.
> it measures zero-over-zero and zero-over-one from 32KB(L1), through
> L2 and L3, to 64MB(RAM)
> - create a script to extract some part of data such as zero-over-zero
> or zero-over-one from JSON bench result to CSV in the first place in
> order to create graph in spreadsheet
> - update benchtests/scripts/plot_strings.py later so that it can draw
> zero-over-zero and zero-over-one separately
Here is a result of the memset zero fill patch v2 [1] using the source code of the A64FX memset patch v2 [2].
The first graph [3] is the zero-over-zero result by typing the command:
$ cat bench-memset-zerofill.out | jq -r '.functions.memset."bench-variant"="zerofill-0o0" | del(.functions.memset.results[] | select(.char1 != 0))' | plot_strings.py -l -p thru -v -
The second graph [4] is the zero-over-one result by typing the command:
$ cat bench-memset-zerofill.out | jq -r '.functions.memset."bench-variant"="zerofill-0o1" | del(.functions.memset.results[] | select(.char1 != 1))' | plot_strings.py -l -p thru -v -
In case of zero-over-zero, it's very interesting that memset_generic is faster than memset_a64fx.
On the other hand, however, in case of zero-over-one, memset_a64fx is faster than memset_generic.
It may be due to SVE code.
Comparing the memset_generic performance, that is, the non-SVE code, between zero-over-zero and zero-over-one,
it seems that the a64fx micro-architecture has some zero-over-zero optimization?
[1] https://sourceware.org/pipermail/libc-alpha/2021-July/129290.html
[2] https://sourceware.org/pipermail/libc-alpha/2021-July/128857.html
[3] https://drive.google.com/file/d/1iNlp-srAq-p9fr7PuRh62TRj-V2eJi0U/view?usp=sharing
[4] https://drive.google.com/file/d/1UipGRHUWU8WJ1-EbV6YzoTcOtxtOhAy0/view?usp=sharing
Thanks.
Naohiro
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] benchtests: Add memset zero fill benchmark tests
2021-07-20 8:51 ` naohirot--- via Libc-alpha
@ 2021-07-20 10:29 ` Wilco Dijkstra via Libc-alpha
0 siblings, 0 replies; 7+ messages in thread
From: Wilco Dijkstra via Libc-alpha @ 2021-07-20 10:29 UTC (permalink / raw)
To: naohirot@fujitsu.com, Noah Goldstein, Lucas A. M. Magalhaes; +Cc: GNU C Library
Hi Naohiro,
> In case of zero-over-zero, it's very interesting that memset_generic is faster than memset_a64fx.
That's because it uses DC ZVA for the full range rather than only for sizes > 8MB like in memset v2.
> On the other hand, however, in case of zero-over-one, memset_a64fx is faster is faster than memset_generic.
> It may be due to SVE code.
Yes, you get more bandwidth with 512-bit SVE stores than 128-bit Neon stores.
> Comparing the memset_generic performance, that is non SVE code, between zero-over-zero and
> zero-over-one, it seems that a64fx micro-architecture has some zero-over-zero optimization?.
No, that does not prove that the previous value matters, just that storing zeroes is faster.
Cheers,
Wilco
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2021-07-20 10:30 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-13 15:57 [PATCH] benchtests: Add memset zero fill benchmark tests Wilco Dijkstra via Libc-alpha
2021-07-13 18:47 ` Noah Goldstein via Libc-alpha
2021-07-15 8:15 ` naohirot--- via Libc-alpha
2021-07-20 8:51 ` naohirot--- via Libc-alpha
2021-07-20 10:29 ` Wilco Dijkstra via Libc-alpha
-- strict thread matches above, loose matches on Subject: below --
2021-07-13 8:22 Naohiro Tamura via Libc-alpha
2021-07-13 13:50 ` Lucas A. M. Magalhaes via Libc-alpha
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).