From: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
To: 'GNU C Library' <libc-alpha@sourceware.org>
Subject: [PATCH][AArch64] Cleanup memset
Date: Wed, 26 Feb 2020 16:11:53 +0000
Message-ID: <AM5PR0801MB2035EB8FDD9C75E89E30405283EA0@AM5PR0801MB2035.eurprd08.prod.outlook.com>

Clean up memset.  Remove unnecessary code for unused ZVA sizes.
For zero memsets it is faster to use DC ZVA from 160 bytes onwards
rather than from 256 bytes.  Add a define which allows skipping the
ZVA size test when the ZVA size is known to be 64 bytes - reading
dczid_el0 may be expensive.  This simplifies the Falkor memset
implementation.
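
For reference, here is a minimal C sketch (not part of the patch; the
zva_size helper name is invented for illustration) of how the ZVA block
size is derived from dczid_el0.  Bit 4 (DZP) set means DC ZVA is
prohibited, and bits 3:0 give log2 of the block size in 4-byte words,
so a field value of 4 corresponds to the 64-byte blocks this patch
special-cases:

#include <stddef.h>

/* Illustration only: query the DC ZVA block size at run time.  */
static inline size_t
zva_size (void)
{
  unsigned long dczid;
  asm volatile ("mrs %0, dczid_el0" : "=r" (dczid));
  if (dczid & 16)		/* DZP (bit 4) set: DC ZVA prohibited.  */
    return 0;
  return 4UL << (dczid & 15);	/* Bits 3:0 are log2 (4-byte words).  */
}

The "and zva_val, zva_val, 31; cmp zva_val, 4" sequence in the patch
tests the same two conditions (DZP clear and a 64-byte block) with a
single compare.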

--
diff --git a/sysdeps/aarch64/memset-reg.h b/sysdeps/aarch64/memset-reg.h
index 872a6b511de2bb97939e644c78676a6212c65d6d..22da7381e68bdfaa6086e39cc980c77150242d21 100644
--- a/sysdeps/aarch64/memset-reg.h
+++ b/sysdeps/aarch64/memset-reg.h
@@ -22,9 +22,4 @@
 #define count	x2
 #define dst	x3
 #define dstend	x4
-#define tmp1	x5
-#define tmp1w	w5
-#define tmp2	x6
-#define tmp2w	w6
-#define zva_len x7
-#define zva_lenw w7
+#define zva_val x5
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index ac577f1660e2706e2024e42d0efc09120c041af2..4654732e1bdd4a7e8f673b79ed66be408bed726b 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -78,114 +78,49 @@ L(set96):
 	stp	q0, q0, [dstend, -32]
 	ret
 
-	.p2align 3
-	nop
+	.p2align 4
 L(set_long):
 	and	valw, valw, 255
 	bic	dst, dstin, 15
 	str	q0, [dstin]
-	cmp	count, 256
-	ccmp	valw, 0, 0, cs
-	b.eq	L(try_zva)
-L(no_zva):
-	sub	count, dstend, dst	/* Count is 16 too large.  */
-	sub	dst, dst, 16		/* Dst is biased by -32.  */
-	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
-1:	stp	q0, q0, [dst, 32]
-	stp	q0, q0, [dst, 64]!
-L(tail64):
-	subs	count, count, 64
-	b.hi	1b
-2:	stp	q0, q0, [dstend, -64]
-	stp	q0, q0, [dstend, -32]
-	ret
-
-L(try_zva):
-#ifdef ZVA_MACRO
-	zva_macro
-#else
-	.p2align 3
-	mrs	tmp1, dczid_el0
-	tbnz	tmp1w, 4, L(no_zva)
-	and	tmp1w, tmp1w, 15
-	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
-	b.ne	 L(zva_128)
-
-	/* Write the first and last 64 byte aligned block using stp rather
-	   than using DC ZVA.  This is faster on some cores.
-	 */
-L(zva_64):
+	cmp	count, 160
+	ccmp	valw, 0, 0, hs
+	b.ne	L(no_zva)
+
+#ifndef SKIP_ZVA_CHECK
+	mrs	zva_val, dczid_el0
+	and	zva_val, zva_val, 31
+	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
+	b.ne	L(no_zva)
+#endif
 	str	q0, [dst, 16]
 	stp	q0, q0, [dst, 32]
 	bic	dst, dst, 63
-	stp	q0, q0, [dst, 64]
-	stp	q0, q0, [dst, 96]
-	sub	count, dstend, dst	/* Count is now 128 too large.	*/
-	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
-	add	dst, dst, 128
-	nop
-1:	dc	zva, dst
+	sub	count, dstend, dst	/* Count is now 64 too large.  */
+	sub	count, count, 128	/* Adjust count and bias for loop.  */
+
+	.p2align 4
+L(zva_loop):
 	add	dst, dst, 64
+	dc	zva, dst
 	subs	count, count, 64
-	b.hi	1b
-	stp	q0, q0, [dst, 0]
-	stp	q0, q0, [dst, 32]
+	b.hi	L(zva_loop)
 	stp	q0, q0, [dstend, -64]
 	stp	q0, q0, [dstend, -32]
 	ret
 
-	.p2align 3
-L(zva_128):
-	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
-	b.ne	L(zva_other)
-
-	str	q0, [dst, 16]
+L(no_zva):
+	sub	count, dstend, dst	/* Count is 16 too large.  */
+	sub	dst, dst, 16		/* Dst is biased by -32.  */
+	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
+L(no_zva_loop):
 	stp	q0, q0, [dst, 32]
-	stp	q0, q0, [dst, 64]
-	stp	q0, q0, [dst, 96]
-	bic	dst, dst, 127
-	sub	count, dstend, dst	/* Count is now 128 too large.	*/
-	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
-	add	dst, dst, 128
-1:	dc	zva, dst
-	add	dst, dst, 128
-	subs	count, count, 128
-	b.hi	1b
-	stp	q0, q0, [dstend, -128]
-	stp	q0, q0, [dstend, -96]
+	stp	q0, q0, [dst, 64]!
+	subs	count, count, 64
+	b.hi	L(no_zva_loop)
 	stp	q0, q0, [dstend, -64]
 	stp	q0, q0, [dstend, -32]
 	ret
 
-L(zva_other):
-	mov	tmp2w, 4
-	lsl	zva_lenw, tmp2w, tmp1w
-	add	tmp1, zva_len, 64	/* Max alignment bytes written.	 */
-	cmp	count, tmp1
-	blo	L(no_zva)
-
-	sub	tmp2, zva_len, 1
-	add	tmp1, dst, zva_len
-	add	dst, dst, 16
-	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
-	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
-	beq	2f
-1:	stp	q0, q0, [dst], 64
-	stp	q0, q0, [dst, -32]
-	subs	count, count, 64
-	b.hi	1b
-2:	mov	dst, tmp1
-	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
-	subs	count, count, zva_len
-	b.lo	4f
-3:	dc	zva, dst
-	add	dst, dst, zva_len
-	subs	count, count, zva_len
-	b.hs	3b
-4:	add	count, count, zva_len
-	sub	dst, dst, 32		/* Bias dst for tail loop.  */
-	b	L(tail64)
-#endif
-
 END (MEMSET)
 libc_hidden_builtin_def (MEMSET)
diff --git a/sysdeps/aarch64/multiarch/memset_falkor.S b/sysdeps/aarch64/multiarch/memset_falkor.S
index 54fd5abffb1b6638ef8a5fc29e58b2f67765b28a..bee4aed52aab69f5aaded367d40e8b64e406c545 100644
--- a/sysdeps/aarch64/multiarch/memset_falkor.S
+++ b/sysdeps/aarch64/multiarch/memset_falkor.S
@@ -24,30 +24,8 @@
    use this function only when ZVA is enabled.  */
 
 #if IS_IN (libc)
-.macro zva_macro
-	.p2align 4
-	/* Write the first and last 64 byte aligned block using stp rather
-	   than using DC ZVA.  This is faster on some cores.  */
-	str	q0, [dst, 16]
-	stp	q0, q0, [dst, 32]
-	bic	dst, dst, 63
-	stp	q0, q0, [dst, 64]
-	stp	q0, q0, [dst, 96]
-	sub	count, dstend, dst	/* Count is now 128 too large.	*/
-	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
-	add	dst, dst, 128
-1:	dc	zva, dst
-	add	dst, dst, 64
-	subs	count, count, 64
-	b.hi	1b
-	stp	q0, q0, [dst, 0]
-	stp	q0, q0, [dst, 32]
-	stp	q0, q0, [dstend, -64]
-	stp	q0, q0, [dstend, -32]
-	ret
-.endm
-
-# define ZVA_MACRO zva_macro
+
+# define SKIP_ZVA_CHECK
 # define MEMSET __memset_falkor
 # include <sysdeps/aarch64/memset.S>
 #endif
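
For exposition only (assuming a 64-byte ZVA block; zero_long is a
made-up name, and memset stands in for the str/stp stores), the new
zero path above behaves roughly like this C model:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Model of L(set_long) for val == 0 and count >= 160.  */
static void
zero_long (char *dstin, size_t count)
{
  char *dstend = dstin + count;
  /* Head: plain stores cover everything up to the first DC ZVA
     block, mirroring the str/stp sequence in the assembly.  */
  char *dst = (char *) ((uintptr_t) dstin & ~(uintptr_t) 15);
  memset (dstin, 0, 16);	/* str q0, [dstin] */
  memset (dst + 16, 0, 48);	/* str q0, [dst, 16]; stp q0, q0, [dst, 32] */
  dst = (char *) ((uintptr_t) dst & ~(uintptr_t) 63);

  /* Zero aligned 64-byte blocks with DC ZVA, stopping so that the
     final stores can handle whatever remains.  */
  for (dst += 64; dstend - dst > 64; dst += 64)
    asm volatile ("dc zva, %0" : : "r" (dst) : "memory");

  /* Tail: last 64 bytes, matching the two stp stores at dstend.  */
  memset (dstend - 64, 0, 64);
}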


