From: Szabolcs Nagy via Libc-alpha <libc-alpha@sourceware.org>
To: Naohiro Tamura <naohirot@fujitsu.com>
Cc: Naohiro Tamura <naohirot@jp.fujitsu.com>, libc-alpha@sourceware.org
Subject: Re: [PATCH v2 3/6] aarch64: Added optimized memcpy and memmove for A64FX
Date: Wed, 26 May 2021 11:19:08 +0100 [thread overview]
Message-ID: <20210526101908.GZ9028@arm.com> (raw)
In-Reply-To: <20210512092809.901182-1-naohirot@fujitsu.com>
The 05/12/2021 09:28, Naohiro Tamura wrote:
> From: Naohiro Tamura <naohirot@jp.fujitsu.com>
>
> This patch optimizes the performance of memcpy/memmove for A64FX [1]
> which implements ARMv8-A SVE and has L1 64KB cache per core and L2 8MB
> cache per NUMA node.
>
> The performance optimization makes use of Scalable Vector Register
> with several techniques such as loop unrolling, memory access
> alignment, cache zero fill, and software pipelining.
>
> SVE assembler code for memcpy/memmove is implemented as Vector Length
> Agnostic code so theoretically it can be run on any SOC which supports
> ARMv8-A SVE standard.
>
> We confirmed that all testcases have been passed by running 'make
> check' and 'make xcheck' not only on A64FX but also on ThunderX2.
>
> And also we confirmed that the SVE 512 bit vector register performance
> is roughly 4 times better than Advanced SIMD 128 bit register and 8
> times better than scalar 64 bit register by running 'make bench'.
>
> [1] https://github.com/fujitsu/A64FX
thanks. this looks ok, except for whitespace usage.
can you please send a version with fixed whitespaces?
> --- a/sysdeps/aarch64/multiarch/memcpy.c
> +++ b/sysdeps/aarch64/multiarch/memcpy.c
> @@ -33,6 +33,9 @@ extern __typeof (__redirect_memcpy) __memcpy_simd attribute_hidden;
> extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
> extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
> extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
> +#if HAVE_AARCH64_SVE_ASM
> +extern __typeof (__redirect_memcpy) __memcpy_a64fx attribute_hidden;
> +#endif
>
> libc_ifunc (__libc_memcpy,
> (IS_THUNDERX (midr)
> @@ -44,8 +47,13 @@ libc_ifunc (__libc_memcpy,
> : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
> || IS_NEOVERSE_V1 (midr)
> ? __memcpy_simd
> - : __memcpy_generic)))));
> -
> +#if HAVE_AARCH64_SVE_ASM
> + : (IS_A64FX (midr)
> + ? __memcpy_a64fx
> + : __memcpy_generic))))));
> +#else
> + : __memcpy_generic)))));
> +#endif
glibc uses a mix of tabs and spaces, you used space only.
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memcpy_a64fx.S
> @@ -0,0 +1,405 @@
> +/* Optimized memcpy for Fujitsu A64FX processor.
> + Copyright (C) 2012-2021 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +
> +#if HAVE_AARCH64_SVE_ASM
> +#if IS_IN (libc)
> +# define MEMCPY __memcpy_a64fx
> +# define MEMMOVE __memmove_a64fx
> +
> +/* Assumptions:
> + *
> + * ARMv8.2-a, AArch64, unaligned accesses, sve
> + *
> + */
> +
> +#define L2_SIZE (8*1024*1024)/2 // L2 8MB/2
> +#define CACHE_LINE_SIZE 256
> +#define ZF_DIST (CACHE_LINE_SIZE * 21) // Zerofill distance
> +#define dest x0
> +#define src x1
> +#define n x2 // size
> +#define tmp1 x3
> +#define tmp2 x4
> +#define tmp3 x5
> +#define rest x6
> +#define dest_ptr x7
> +#define src_ptr x8
> +#define vector_length x9
> +#define cl_remainder x10 // CACHE_LINE_SIZE remainder
> +
> + .arch armv8.2-a+sve
> +
> + .macro dc_zva times
> + dc zva, tmp1
> + add tmp1, tmp1, CACHE_LINE_SIZE
> + .if \times-1
> + dc_zva "(\times-1)"
> + .endif
> + .endm
> +
> + .macro ld1b_unroll8
> + ld1b z0.b, p0/z, [src_ptr, #0, mul vl]
> + ld1b z1.b, p0/z, [src_ptr, #1, mul vl]
> + ld1b z2.b, p0/z, [src_ptr, #2, mul vl]
> + ld1b z3.b, p0/z, [src_ptr, #3, mul vl]
> + ld1b z4.b, p0/z, [src_ptr, #4, mul vl]
> + ld1b z5.b, p0/z, [src_ptr, #5, mul vl]
> + ld1b z6.b, p0/z, [src_ptr, #6, mul vl]
> + ld1b z7.b, p0/z, [src_ptr, #7, mul vl]
> + .endm
...
please indent all asm code with one tab, see other asm files.
> --- a/sysdeps/aarch64/multiarch/memmove.c
> +++ b/sysdeps/aarch64/multiarch/memmove.c
> @@ -33,6 +33,9 @@ extern __typeof (__redirect_memmove) __memmove_simd attribute_hidden;
> extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
> extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden;
> extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
> +#if HAVE_AARCH64_SVE_ASM
> +extern __typeof (__redirect_memmove) __memmove_a64fx attribute_hidden;
> +#endif
>
> libc_ifunc (__libc_memmove,
> (IS_THUNDERX (midr)
> @@ -44,8 +47,13 @@ libc_ifunc (__libc_memmove,
> : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
> || IS_NEOVERSE_V1 (midr)
> ? __memmove_simd
> - : __memmove_generic)))));
> -
> +#if HAVE_AARCH64_SVE_ASM
> + : (IS_A64FX (midr)
> + ? __memmove_a64fx
> + : __memmove_generic))))));
> +#else
> + : __memmove_generic)))));
> +#endif
same as above.
next prev parent reply other threads:[~2021-05-26 10:19 UTC|newest]
Thread overview: 36+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-03-17 2:28 [PATCH 0/5] Added optimized memcpy/memmove/memset for A64FX Naohiro Tamura
2021-03-17 2:33 ` [PATCH 1/5] config: Added HAVE_SVE_ASM_SUPPORT for aarch64 Naohiro Tamura
2021-03-29 12:11 ` Szabolcs Nagy via Libc-alpha
2021-03-30 6:19 ` naohirot
2021-03-17 2:34 ` [PATCH 2/5] aarch64: Added optimized memcpy and memmove for A64FX Naohiro Tamura
2021-03-29 12:44 ` Szabolcs Nagy via Libc-alpha
2021-03-30 7:17 ` naohirot
2021-03-17 2:34 ` [PATCH 3/5] aarch64: Added optimized memset " Naohiro Tamura
2021-03-17 2:35 ` [PATCH 4/5] scripts: Added Vector Length Set test helper script Naohiro Tamura
2021-03-29 13:20 ` Szabolcs Nagy via Libc-alpha
2021-03-30 7:25 ` naohirot
2021-03-17 2:35 ` [PATCH 5/5] benchtests: Added generic_memcpy and generic_memmove to large benchtests Naohiro Tamura
2021-03-29 12:03 ` [PATCH 0/5] Added optimized memcpy/memmove/memset for A64FX Szabolcs Nagy via Libc-alpha
2021-05-10 1:45 ` naohirot
2021-05-14 13:35 ` Szabolcs Nagy via Libc-alpha
2021-05-19 0:11 ` naohirot
2021-05-12 9:23 ` [PATCH v2 0/6] aarch64: " Naohiro Tamura
2021-05-12 9:26 ` [PATCH v2 1/6] config: Added HAVE_AARCH64_SVE_ASM for aarch64 Naohiro Tamura
2021-05-26 10:05 ` Szabolcs Nagy via Libc-alpha
2021-05-12 9:27 ` [PATCH v2 2/6] aarch64: define BTI_C and BTI_J macros as NOP unless HAVE_AARCH64_BTI Naohiro Tamura
2021-05-26 10:06 ` Szabolcs Nagy via Libc-alpha
2021-05-12 9:28 ` [PATCH v2 3/6] aarch64: Added optimized memcpy and memmove for A64FX Naohiro Tamura
2021-05-26 10:19 ` Szabolcs Nagy via Libc-alpha [this message]
2021-05-12 9:28 ` [PATCH v2 4/6] aarch64: Added optimized memset " Naohiro Tamura
2021-05-26 10:22 ` Szabolcs Nagy via Libc-alpha
2021-05-12 9:29 ` [PATCH v2 5/6] scripts: Added Vector Length Set test helper script Naohiro Tamura
2021-05-12 16:58 ` Joseph Myers
2021-05-13 9:53 ` naohirot
2021-05-20 7:34 ` Naohiro Tamura
2021-05-26 10:24 ` Szabolcs Nagy via Libc-alpha
2021-05-12 9:29 ` [PATCH v2 6/6] benchtests: Fixed bench-memcpy-random: buf1: mprotect failed Naohiro Tamura
2021-05-26 10:25 ` Szabolcs Nagy via Libc-alpha
2021-05-27 0:22 ` [PATCH v2 0/6] aarch64: Added optimized memcpy/memmove/memset for A64FX naohirot
2021-05-27 23:50 ` naohirot
2021-05-27 7:42 ` [PATCH v3 1/2] aarch64: Added optimized memcpy and memmove " Naohiro Tamura
2021-05-27 7:44 ` [PATCH v3 2/2] aarch64: Added optimized memset " Naohiro Tamura
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://www.gnu.org/software/libc/involved.html
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210526101908.GZ9028@arm.com \
--to=libc-alpha@sourceware.org \
--cc=naohirot@fujitsu.com \
--cc=naohirot@jp.fujitsu.com \
--cc=szabolcs.nagy@arm.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).