[PATCH v3 2/5] AArch64: Improve A64FX memset - Wilco Dijkstra via Libc-alpha

unofficial mirror of libc-alpha@sourceware.org
 help / color / mirror / Atom feed

From: Wilco Dijkstra via Libc-alpha <libc-alpha@sourceware.org>
To: "naohirot@fujitsu.com" <naohirot@fujitsu.com>
Cc: 'GNU C Library' <libc-alpha@sourceware.org>
Subject: [PATCH v3 2/5] AArch64: Improve A64FX memset
Date: Thu, 22 Jul 2021 16:00:44 +0000	[thread overview]
Message-ID: <VE1PR08MB559938483A0630B16E8C64B383E49@VE1PR08MB5599.eurprd08.prod.outlook.com> (raw)

Improve the performance of large memsets and simplify the alignment code. For zero memsets, use DC ZVA,
which almost doubles performance. For non-zero memsets, use the unroll8 loop, which is about 10% faster.

---

diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
index f7fcc7b323e1553f50a2e005b8ccef344a08127d..608e0e2e2ff5259178e2fdadf1eea8816194d879 100644
--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
+++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
@@ -30,10 +30,8 @@
 #define L2_SIZE         (8*1024*1024)	// L2 8MB - 1MB
 #define CACHE_LINE_SIZE	256
 #define PF_DIST_L1	(CACHE_LINE_SIZE * 16)	// Prefetch distance L1
-#define rest		x8
+#define rest		x2
 #define vector_length	x9
-#define vl_remainder	x10	// vector_length remainder
-#define cl_remainder	x11	// CACHE_LINE_SIZE remainder
 
 #if HAVE_AARCH64_SVE_ASM
 # if IS_IN (libc)
@@ -41,14 +39,6 @@
 
 	.arch armv8.2-a+sve
 
-	.macro dc_zva times
-	dc	zva, tmp1
-	add	tmp1, tmp1, CACHE_LINE_SIZE
-	.if \times-1
-	dc_zva "(\times-1)"
-	.endif
-	.endm
-
 	.macro st1b_unroll first=0, last=7
 	st1b	z0.b, p0, [dst, \first, mul vl]
 	.if \last-\first
@@ -187,54 +177,29 @@ L(L1_prefetch): // if rest >= L1_SIZE
 	cbnz	rest, L(unroll32)
 	ret
 
+	// count >= L2_SIZE
 L(L2):
-	// align dst address at vector_length byte boundary
-	sub	tmp1, vector_length, 1
-	ands	tmp2, dst, tmp1
-	// if vl_remainder == 0
-	b.eq	1f
-	sub	vl_remainder, vector_length, tmp2
-	// process remainder until the first vector_length boundary
-	whilelt	p2.b, xzr, vl_remainder
-	st1b	z0.b, p2, [dst]
-	add	dst, dst, vl_remainder
-	sub	rest, rest, vl_remainder
-	// align dstin address at CACHE_LINE_SIZE byte boundary
-1:	mov	tmp1, CACHE_LINE_SIZE
-	ands	tmp2, dst, CACHE_LINE_SIZE - 1
-	// if cl_remainder == 0
-	b.eq	L(L2_dc_zva)
-	sub	cl_remainder, tmp1, tmp2
-	// process remainder until the first CACHE_LINE_SIZE boundary
-	mov	tmp1, xzr       // index
-2:	whilelt	p2.b, tmp1, cl_remainder
-	st1b	z0.b, p2, [dst, tmp1]
-	incb	tmp1
-	cmp	tmp1, cl_remainder
-	b.lo	2b
-	add	dst, dst, cl_remainder
-	sub	rest, rest, cl_remainder
-
-L(L2_dc_zva):
-	// zero fill
-	mov	tmp1, dst
-	dc_zva	(ZF_DIST / CACHE_LINE_SIZE) - 1
-	mov	zva_len, ZF_DIST
-	add	tmp1, zva_len, CACHE_LINE_SIZE * 2
-	// unroll
-	.p2align 3
-1:	st1b_unroll 0, 3
-	add	tmp2, dst, zva_len
-	dc	 zva, tmp2
-	st1b_unroll 4, 7
-	add	tmp2, tmp2, CACHE_LINE_SIZE
-	dc	zva, tmp2
-	add	dst, dst, CACHE_LINE_SIZE * 2
-	sub	rest, rest, CACHE_LINE_SIZE * 2
-	cmp	rest, tmp1	// ZF_DIST + CACHE_LINE_SIZE * 2
-	b.ge	1b
-	cbnz	rest, L(unroll8)
-	ret
+	tst	valw, 255
+	b.ne	L(unroll8)
+	// align dst to CACHE_LINE_SIZE byte boundary
+	and	tmp2, dst, CACHE_LINE_SIZE - 1
+	sub	tmp2, tmp2, CACHE_LINE_SIZE
+	st1b	z0.b, p0, [dst, 0, mul vl]
+	st1b	z0.b, p0, [dst, 1, mul vl]
+	st1b	z0.b, p0, [dst, 2, mul vl]
+	st1b	z0.b, p0, [dst, 3, mul vl]
+	sub	dst, dst, tmp2
+	add	count, count, tmp2
+
+	// clear cachelines using DC ZVA
+	sub	count, count, CACHE_LINE_SIZE
+	.p2align 4
+1:	dc	zva, dst
+	add	dst, dst, CACHE_LINE_SIZE
+	subs	count, count, CACHE_LINE_SIZE
+	b.hi	1b
+	add	count, count, CACHE_LINE_SIZE
+	b	L(last)
 
 END (MEMSET)
 libc_hidden_builtin_def (MEMSET)

next             reply	other threads:[~2021-07-22 16:05 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-07-22 16:00 Wilco Dijkstra via Libc-alpha [this message]
2021-08-02 13:29 ` [PATCH v3 2/5] AArch64: Improve A64FX memset naohirot--- via Libc-alpha
2021-08-03  3:08   ` naohirot--- via Libc-alpha
2021-08-03  5:03   ` naohirot--- via Libc-alpha
2021-08-09 16:16     ` Wilco Dijkstra via Libc-alpha

find likely ancestor, descendant, or conflicting patches for this message:
dfblob:f7fcc7b323e1553f50a2e005b8ccef344a08127
dfblob:608e0e2e2ff5259178e2fdadf1eea8816194d87
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://www.gnu.org/software/libc/involved.html

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=VE1PR08MB559938483A0630B16E8C64B383E49@VE1PR08MB5599.eurprd08.prod.outlook.com \
    --to=libc-alpha@sourceware.org \
    --cc=Wilco.Dijkstra@arm.com \
    --cc=naohirot@fujitsu.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).