unofficial mirror of libc-alpha@sourceware.org
 help / color / mirror / Atom feed
From: Szabolcs Nagy via Libc-alpha <libc-alpha@sourceware.org>
To: Naohiro Tamura <naohirot@fujitsu.com>
Cc: Naohiro Tamura <naohirot@jp.fujitsu.com>, libc-alpha@sourceware.org
Subject: Re: [PATCH v2 3/6] aarch64: Added optimized memcpy and memmove for A64FX
Date: Wed, 26 May 2021 11:19:08 +0100	[thread overview]
Message-ID: <20210526101908.GZ9028@arm.com> (raw)
In-Reply-To: <20210512092809.901182-1-naohirot@fujitsu.com>

The 05/12/2021 09:28, Naohiro Tamura wrote:
> From: Naohiro Tamura <naohirot@jp.fujitsu.com>
> 
> This patch optimizes the performance of memcpy/memmove for A64FX [1]
> which implements ARMv8-A SVE and has L1 64KB cache per core and L2 8MB
> cache per NUMA node.
> 
> The performance optimization makes use of Scalable Vector Register
> with several techniques such as loop unrolling, memory access
> alignment, cache zero fill, and software pipelining.
> 
> SVE assembler code for memcpy/memmove is implemented as Vector Length
> Agnostic code so theoretically it can be run on any SOC which supports
> ARMv8-A SVE standard.
> 
> We confirmed that all testcases have been passed by running 'make
> check' and 'make xcheck' not only on A64FX but also on ThunderX2.
> 
> And also we confirmed that the SVE 512 bit vector register performance
> is roughly 4 times better than Advanced SIMD 128 bit register and 8
> times better than scalar 64 bit register by running 'make bench'.
> 
> [1] https://github.com/fujitsu/A64FX

thanks. this looks ok, except for whitespace usage.

can you please send a version with fixed whitespaces?

> --- a/sysdeps/aarch64/multiarch/memcpy.c
> +++ b/sysdeps/aarch64/multiarch/memcpy.c
> @@ -33,6 +33,9 @@ extern __typeof (__redirect_memcpy) __memcpy_simd attribute_hidden;
>  extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
>  extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
>  extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
> +#if HAVE_AARCH64_SVE_ASM
> +extern __typeof (__redirect_memcpy) __memcpy_a64fx attribute_hidden;
> +#endif
>  
>  libc_ifunc (__libc_memcpy,
>              (IS_THUNDERX (midr)
> @@ -44,8 +47,13 @@ libc_ifunc (__libc_memcpy,
>  		  : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
>  		     || IS_NEOVERSE_V1 (midr)
>  		     ? __memcpy_simd
> -		     : __memcpy_generic)))));
> -
> +#if HAVE_AARCH64_SVE_ASM
> +                     : (IS_A64FX (midr)
> +                        ? __memcpy_a64fx
> +                        : __memcpy_generic))))));
> +#else
> +                     : __memcpy_generic)))));
> +#endif

glibc uses a mix of tabs and spaces, you used space only.

> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memcpy_a64fx.S
> @@ -0,0 +1,405 @@
> +/* Optimized memcpy for Fujitsu A64FX processor.
> +   Copyright (C) 2012-2021 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +#if HAVE_AARCH64_SVE_ASM
> +#if IS_IN (libc)
> +# define MEMCPY __memcpy_a64fx
> +# define MEMMOVE __memmove_a64fx
> +
> +/* Assumptions:
> + *
> + * ARMv8.2-a, AArch64, unaligned accesses, sve
> + *
> + */
> +
> +#define L2_SIZE         (8*1024*1024)/2 // L2 8MB/2
> +#define CACHE_LINE_SIZE 256
> +#define ZF_DIST         (CACHE_LINE_SIZE * 21)  // Zerofill distance
> +#define dest            x0
> +#define src             x1
> +#define n               x2      // size
> +#define tmp1            x3
> +#define tmp2            x4
> +#define tmp3            x5
> +#define rest            x6
> +#define dest_ptr        x7
> +#define src_ptr         x8
> +#define vector_length   x9
> +#define cl_remainder    x10     // CACHE_LINE_SIZE remainder
> +
> +    .arch armv8.2-a+sve
> +
> +    .macro dc_zva times
> +    dc          zva, tmp1
> +    add         tmp1, tmp1, CACHE_LINE_SIZE
> +    .if \times-1
> +    dc_zva "(\times-1)"
> +    .endif
> +    .endm
> +
> +    .macro ld1b_unroll8
> +    ld1b        z0.b, p0/z, [src_ptr, #0, mul vl]
> +    ld1b        z1.b, p0/z, [src_ptr, #1, mul vl]
> +    ld1b        z2.b, p0/z, [src_ptr, #2, mul vl]
> +    ld1b        z3.b, p0/z, [src_ptr, #3, mul vl]
> +    ld1b        z4.b, p0/z, [src_ptr, #4, mul vl]
> +    ld1b        z5.b, p0/z, [src_ptr, #5, mul vl]
> +    ld1b        z6.b, p0/z, [src_ptr, #6, mul vl]
> +    ld1b        z7.b, p0/z, [src_ptr, #7, mul vl]
> +    .endm
...

please indent all asm code with one tab, see other asm files.

> --- a/sysdeps/aarch64/multiarch/memmove.c
> +++ b/sysdeps/aarch64/multiarch/memmove.c
> @@ -33,6 +33,9 @@ extern __typeof (__redirect_memmove) __memmove_simd attribute_hidden;
>  extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
>  extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden;
>  extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
> +#if HAVE_AARCH64_SVE_ASM
> +extern __typeof (__redirect_memmove) __memmove_a64fx attribute_hidden;
> +#endif
>  
>  libc_ifunc (__libc_memmove,
>              (IS_THUNDERX (midr)
> @@ -44,8 +47,13 @@ libc_ifunc (__libc_memmove,
>  		  : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
>  		     || IS_NEOVERSE_V1 (midr)
>  		     ? __memmove_simd
> -		     : __memmove_generic)))));
> -
> +#if HAVE_AARCH64_SVE_ASM
> +                     : (IS_A64FX (midr)
> +                        ? __memmove_a64fx
> +                        : __memmove_generic))))));
> +#else
> +                        : __memmove_generic)))));
> +#endif

same as above.

  reply	other threads:[~2021-05-26 10:19 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-03-17  2:28 [PATCH 0/5] Added optimized memcpy/memmove/memset for A64FX Naohiro Tamura
2021-03-17  2:33 ` [PATCH 1/5] config: Added HAVE_SVE_ASM_SUPPORT for aarch64 Naohiro Tamura
2021-03-29 12:11   ` Szabolcs Nagy via Libc-alpha
2021-03-30  6:19     ` naohirot
2021-03-17  2:34 ` [PATCH 2/5] aarch64: Added optimized memcpy and memmove for A64FX Naohiro Tamura
2021-03-29 12:44   ` Szabolcs Nagy via Libc-alpha
2021-03-30  7:17     ` naohirot
2021-03-17  2:34 ` [PATCH 3/5] aarch64: Added optimized memset " Naohiro Tamura
2021-03-17  2:35 ` [PATCH 4/5] scripts: Added Vector Length Set test helper script Naohiro Tamura
2021-03-29 13:20   ` Szabolcs Nagy via Libc-alpha
2021-03-30  7:25     ` naohirot
2021-03-17  2:35 ` [PATCH 5/5] benchtests: Added generic_memcpy and generic_memmove to large benchtests Naohiro Tamura
2021-03-29 12:03 ` [PATCH 0/5] Added optimized memcpy/memmove/memset for A64FX Szabolcs Nagy via Libc-alpha
2021-05-10  1:45 ` naohirot
2021-05-14 13:35   ` Szabolcs Nagy via Libc-alpha
2021-05-19  0:11     ` naohirot
2021-05-12  9:23 ` [PATCH v2 0/6] aarch64: " Naohiro Tamura
2021-05-12  9:26   ` [PATCH v2 1/6] config: Added HAVE_AARCH64_SVE_ASM for aarch64 Naohiro Tamura
2021-05-26 10:05     ` Szabolcs Nagy via Libc-alpha
2021-05-12  9:27   ` [PATCH v2 2/6] aarch64: define BTI_C and BTI_J macros as NOP unless HAVE_AARCH64_BTI Naohiro Tamura
2021-05-26 10:06     ` Szabolcs Nagy via Libc-alpha
2021-05-12  9:28   ` [PATCH v2 3/6] aarch64: Added optimized memcpy and memmove for A64FX Naohiro Tamura
2021-05-26 10:19     ` Szabolcs Nagy via Libc-alpha [this message]
2021-05-12  9:28   ` [PATCH v2 4/6] aarch64: Added optimized memset " Naohiro Tamura
2021-05-26 10:22     ` Szabolcs Nagy via Libc-alpha
2021-05-12  9:29   ` [PATCH v2 5/6] scripts: Added Vector Length Set test helper script Naohiro Tamura
2021-05-12 16:58     ` Joseph Myers
2021-05-13  9:53       ` naohirot
2021-05-20  7:34     ` Naohiro Tamura
2021-05-26 10:24       ` Szabolcs Nagy via Libc-alpha
2021-05-12  9:29   ` [PATCH v2 6/6] benchtests: Fixed bench-memcpy-random: buf1: mprotect failed Naohiro Tamura
2021-05-26 10:25     ` Szabolcs Nagy via Libc-alpha
2021-05-27  0:22   ` [PATCH v2 0/6] aarch64: Added optimized memcpy/memmove/memset for A64FX naohirot
2021-05-27 23:50     ` naohirot
2021-05-27  7:42   ` [PATCH v3 1/2] aarch64: Added optimized memcpy and memmove " Naohiro Tamura
2021-05-27  7:44   ` [PATCH v3 2/2] aarch64: Added optimized memset " Naohiro Tamura

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://www.gnu.org/software/libc/involved.html

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210526101908.GZ9028@arm.com \
    --to=libc-alpha@sourceware.org \
    --cc=naohirot@fujitsu.com \
    --cc=naohirot@jp.fujitsu.com \
    --cc=szabolcs.nagy@arm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).