unofficial mirror of libc-alpha@sourceware.org
 help / color / mirror / Atom feed
From: naohirot--- via Libc-alpha <libc-alpha@sourceware.org>
To: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
Cc: 'GNU C Library' <libc-alpha@sourceware.org>
Subject: Re: [PATCH v3 1/5] AArch64: Improve A64FX memset
Date: Wed, 28 Jul 2021 08:10:55 +0000	[thread overview]
Message-ID: <TYAPR01MB6025BF9842A0B7D55D42D540DFEA9@TYAPR01MB6025.jpnprd01.prod.outlook.com> (raw)
In-Reply-To: <VE1PR08MB5599A80A0480330F1CB5CC0083E49@VE1PR08MB5599.eurprd08.prod.outlook.com>

Hi Wilco,

Thanks for the patch.

I confirmed that the performance is improved over master, as shown
in the graphs [1].
I have two comments; please find them below.

Reviewed-by: Naohiro Tamura <naohirot@fujitsu.com>
Tested-by: Naohiro Tamura <naohirot@fujitsu.com>

[1] https://drive.google.com/file/d/1DfYPMd6RRS0Z_2y3VH3Q4b-r8N6TyW1c/view?usp=sharing

> [PATCH v3 1/5] AArch64: Improve A64FX memset
>

Would you update the commit titles so that they are not the same across
all 5 patches?
We will need to ask distros to backport these patches, and if all the
commit titles are identical, there is more room for confusion and
mistakes.

How about "AArch64: Improve A64FX memset for less than 512B" ?

> Improve performance of small copies by reducing instruction counts and improving
> alignment. Bench-memset shows 35-45% performance gain for small sizes.
> 
> ---
> 
> diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
> index ce54e5418b08c8bc0ecc7affff68a59272ba6397..f7fcc7b323e1553f50a2e005b8ccef344a08127d 100644
> --- a/sysdeps/aarch64/multiarch/memset_a64fx.S
> +++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
> @@ -30,7 +30,6 @@
>  #define L2_SIZE         (8*1024*1024)   // L2 8MB - 1MB
>  #define CACHE_LINE_SIZE 256
>  #define PF_DIST_L1      (CACHE_LINE_SIZE * 16)  // Prefetch distance L1
> -#define ZF_DIST                (CACHE_LINE_SIZE * 21)  // Zerofill distance

This caused a compile error.

>  #define rest            x8
>  #define vector_length   x9
>  #define vl_remainder    x10     // vector_length remainder
> @@ -51,78 +50,54 @@
>          .endm
>  
>          .macro st1b_unroll first=0, last=7
> -       st1b    z0.b, p0, [dst, #\first, mul vl]
> +       st1b    z0.b, p0, [dst, \first, mul vl]
>          .if \last-\first
>          st1b_unroll "(\first+1)", \last
>          .endif
>          .endm
>  
> -       .macro shortcut_for_small_size exit
> -       // if rest <= vector_length * 2
> -       whilelo p0.b, xzr, count
> -       whilelo p1.b, vector_length, count
> -       b.last  1f
> -       st1b    z0.b, p0, [dstin, #0, mul vl]
> -       st1b    z0.b, p1, [dstin, #1, mul vl]
> -       ret
> -1:     // if rest > vector_length * 8
> -       cmp     count, vector_length, lsl 3     // vector_length * 8
> -       b.hi    \exit
> -       // if rest <= vector_length * 4
> -       lsl     tmp1, vector_length, 1  // vector_length * 2
> -       whilelo p2.b, tmp1, count
> -       incb    tmp1
> -       whilelo p3.b, tmp1, count
> -       b.last  1f
> -       st1b    z0.b, p0, [dstin, #0, mul vl]
> -       st1b    z0.b, p1, [dstin, #1, mul vl]
> -       st1b    z0.b, p2, [dstin, #2, mul vl]
> -       st1b    z0.b, p3, [dstin, #3, mul vl]
> -       ret
> -1:     // if rest <= vector_length * 8
> -       lsl     tmp1, vector_length, 2  // vector_length * 4
> -       whilelo p4.b, tmp1, count
> -       incb    tmp1
> -       whilelo p5.b, tmp1, count
> -       b.last  1f
> -       st1b    z0.b, p0, [dstin, #0, mul vl]
> -       st1b    z0.b, p1, [dstin, #1, mul vl]
> -       st1b    z0.b, p2, [dstin, #2, mul vl]
> -       st1b    z0.b, p3, [dstin, #3, mul vl]
> -       st1b    z0.b, p4, [dstin, #4, mul vl]
> -       st1b    z0.b, p5, [dstin, #5, mul vl]
> -       ret
> -1:     lsl     tmp1, vector_length, 2  // vector_length * 4
> -       incb    tmp1                    // vector_length * 5
> -       incb    tmp1                    // vector_length * 6
> -       whilelo p6.b, tmp1, count
> -       incb    tmp1
> -       whilelo p7.b, tmp1, count
> -       st1b    z0.b, p0, [dstin, #0, mul vl]
> -       st1b    z0.b, p1, [dstin, #1, mul vl]
> -       st1b    z0.b, p2, [dstin, #2, mul vl]
> -       st1b    z0.b, p3, [dstin, #3, mul vl]
> -       st1b    z0.b, p4, [dstin, #4, mul vl]
> -       st1b    z0.b, p5, [dstin, #5, mul vl]
> -       st1b    z0.b, p6, [dstin, #6, mul vl]
> -       st1b    z0.b, p7, [dstin, #7, mul vl]
> -       ret
> -       .endm
>  
> -ENTRY (MEMSET)
> +#undef BTI_C
> +#define BTI_C
>  
> +ENTRY (MEMSET)
>          PTR_ARG (0)
>          SIZE_ARG (2)
>  
> -       cbnz    count, 1f
> -       ret
> -1:     dup     z0.b, valw
>          cntb    vector_length
> -       // shortcut for less than vector_length * 8
> -       // gives a free ptrue to p0.b for n >= vector_length
> -       shortcut_for_small_size L(vl_agnostic)
> -       // end of shortcut
> +       dup     z0.b, valw
> +       whilelo p0.b, vector_length, count
> +       b.last  1f
> +       whilelo p1.b, xzr, count
> +       st1b    z0.b, p1, [dstin, 0, mul vl]
> +       st1b    z0.b, p0, [dstin, 1, mul vl]
> +       ret
> +
> +       // count >= vector_length * 2
> +1:     cmp     count, vector_length, lsl 2
> +       add     dstend, dstin, count
> +       b.hi    1f
> +       st1b    z0.b, p0, [dstin, 0, mul vl]
> +       st1b    z0.b, p0, [dstin, 1, mul vl]
> +       st1b    z0.b, p0, [dstend, -2, mul vl]
> +       st1b    z0.b, p0, [dstend, -1, mul vl]
> +       ret
> +
> +       // count > vector_length * 4
> +1:     lsl     tmp1, vector_length, 3
> +       cmp     count, tmp1
> +       b.hi    L(vl_agnostic)
> +       st1b    z0.b, p0, [dstin, 0, mul vl]
> +       st1b    z0.b, p0, [dstin, 1, mul vl]
> +       st1b    z0.b, p0, [dstin, 2, mul vl]
> +       st1b    z0.b, p0, [dstin, 3, mul vl]
> +       st1b    z0.b, p0, [dstend, -4, mul vl]
> +       st1b    z0.b, p0, [dstend, -3, mul vl]
> +       st1b    z0.b, p0, [dstend, -2, mul vl]
> +       st1b    z0.b, p0, [dstend, -1, mul vl]
> +       ret
>  
> +       .p2align 4
>  L(vl_agnostic): // VL Agnostic
>          mov     rest, count
>          mov     dst, dstin
>

  reply	other threads:[~2021-07-28  8:33 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-07-22 15:59 [PATCH v3 1/5] AArch64: Improve A64FX memset Wilco Dijkstra via Libc-alpha
2021-07-28  8:10 ` naohirot--- via Libc-alpha [this message]
2021-08-02 13:53   ` naohirot--- via Libc-alpha
2021-08-02 14:38     ` Wilco Dijkstra via Libc-alpha
2021-08-02 14:50       ` Szabolcs Nagy via Libc-alpha
2021-08-03  2:57         ` naohirot--- via Libc-alpha
2021-08-03  8:01           ` Szabolcs Nagy via Libc-alpha
2021-09-24  7:56             ` naohirot--- via Libc-alpha
2021-08-03  2:56       ` naohirot--- via Libc-alpha

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://www.gnu.org/software/libc/involved.html

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=TYAPR01MB6025BF9842A0B7D55D42D540DFEA9@TYAPR01MB6025.jpnprd01.prod.outlook.com \
    --to=libc-alpha@sourceware.org \
    --cc=Wilco.Dijkstra@arm.com \
    --cc=naohirot@fujitsu.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).