unofficial mirror of libc-alpha@sourceware.org
* [PATCH] AArch64: Improve A64FX memset
@ 2021-06-30 15:49 Wilco Dijkstra via Libc-alpha
  2021-07-09 12:23 ` [PATCH v2] " Wilco Dijkstra via Libc-alpha
From: Wilco Dijkstra via Libc-alpha @ 2021-06-30 15:49 UTC (permalink / raw)
  To: naohirot@fujitsu.com; +Cc: 'GNU C Library'

Hi Naohiro,

And here is the memset version. The code is smaller, easier to follow, and a bit
faster. One thing I noticed is that it does not optimize for the common case of
zeroing memsets (the generic memset is significantly faster for large sizes). It
would be possible to use DC ZVA alone for zeroing memsets and avoid vector stores
altogether.
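
For illustration, a minimal C sketch of that idea (not part of the patch):
zero-fill a large, suitably aligned region with DC ZVA alone. It assumes the
ZVA block size has already been read from DCZID_EL0 and that DC ZVA is
permitted; the unaligned head and tail would still need ordinary stores.

#include <stddef.h>

static inline void
dc_zva (void *p)
{
  /* DC ZVA zeroes one block of the size advertised in DCZID_EL0.BS.  */
  __asm__ volatile ("dc zva, %0" : : "r" (p) : "memory");
}

static void
zero_with_dc_zva (char *dst, size_t count, size_t zva_size)
{
  /* Assumes dst is zva_size-aligned and count is a multiple of zva_size;
     the head and tail would be handled with ordinary stores.  */
  for (char *end = dst + count; dst < end; dst += zva_size)
    dc_zva (dst);
}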


Reduce the code size of the A64FX memset by simplifying the small memset code,
improving the handling of alignment and of the last 8 vectors, and removing
redundant instructions and branches. The memset code size goes down from 1032 to
604 bytes. Performance is noticeably better for small memsets.
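
To illustrate the "last 8 vectors" handling, here is a rough sketch in C using
the SVE ACLE intrinsics (not the patch itself; the function name is made up).
For counts between 4 and 8 vectors, four unconditional stores from the start
and four from the end cover the whole buffer, and the overlap replaces explicit
tail predication:

#include <arm_sve.h>
#include <stddef.h>
#include <stdint.h>

static void
set_4_to_8_vectors (uint8_t *dst, size_t count, uint8_t c)
{
  /* Assumes 4 * vl < count <= 8 * vl.  */
  svuint8_t v = svdup_n_u8 (c);
  svbool_t all = svptrue_b8 ();
  size_t vl = svcntb ();
  uint8_t *end = dst + count;

  svst1_u8 (all, dst + 0 * vl, v);
  svst1_u8 (all, dst + 1 * vl, v);
  svst1_u8 (all, dst + 2 * vl, v);
  svst1_u8 (all, dst + 3 * vl, v);
  svst1_u8 (all, end - 4 * vl, v);
  svst1_u8 (all, end - 3 * vl, v);
  svst1_u8 (all, end - 2 * vl, v);
  svst1_u8 (all, end - 1 * vl, v);
}

The patch does the equivalent directly in assembly, reusing the two predicated
stores already issued for the first two vectors.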

Passes GLIBC regress, OK for commit?

---

diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
index ce54e5418b08c8bc0ecc7affff68a59272ba6397..da8930c2b0e5ab552943331e9a1aa355e917e775 100644
--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
+++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
@@ -57,149 +57,78 @@
 	.endif
 	.endm
 
-	.macro shortcut_for_small_size exit
-	// if rest <= vector_length * 2
+
+#undef BTI_C
+#define BTI_C
+
+ENTRY (MEMSET)
+
+	PTR_ARG (0)
+	SIZE_ARG (2)
+
+	dup	z0.b, valw
 	whilelo	p0.b, xzr, count
+	cntb	vector_length
 	whilelo	p1.b, vector_length, count
-	b.last	1f
 	st1b	z0.b, p0, [dstin, #0, mul vl]
 	st1b	z0.b, p1, [dstin, #1, mul vl]
-	ret
-1:	// if rest > vector_length * 8
-	cmp	count, vector_length, lsl 3	// vector_length * 8
-	b.hi	\exit
-	// if rest <= vector_length * 4
-	lsl	tmp1, vector_length, 1	// vector_length * 2
-	whilelo	p2.b, tmp1, count
-	incb	tmp1
-	whilelo	p3.b, tmp1, count
 	b.last	1f
-	st1b	z0.b, p0, [dstin, #0, mul vl]
-	st1b	z0.b, p1, [dstin, #1, mul vl]
-	st1b	z0.b, p2, [dstin, #2, mul vl]
-	st1b	z0.b, p3, [dstin, #3, mul vl]
 	ret
-1:	// if rest <= vector_length * 8
-	lsl	tmp1, vector_length, 2	// vector_length * 4
-	whilelo	p4.b, tmp1, count
-	incb	tmp1
-	whilelo	p5.b, tmp1, count
-	b.last	1f
-	st1b	z0.b, p0, [dstin, #0, mul vl]
-	st1b	z0.b, p1, [dstin, #1, mul vl]
-	st1b	z0.b, p2, [dstin, #2, mul vl]
-	st1b	z0.b, p3, [dstin, #3, mul vl]
-	st1b	z0.b, p4, [dstin, #4, mul vl]
-	st1b	z0.b, p5, [dstin, #5, mul vl]
-	ret
-1:	lsl	tmp1, vector_length, 2	// vector_length * 4
-	incb	tmp1			// vector_length * 5
-	incb	tmp1			// vector_length * 6
-	whilelo	p6.b, tmp1, count
-	incb	tmp1
-	whilelo	p7.b, tmp1, count
-	st1b	z0.b, p0, [dstin, #0, mul vl]
-	st1b	z0.b, p1, [dstin, #1, mul vl]
-	st1b	z0.b, p2, [dstin, #2, mul vl]
-	st1b	z0.b, p3, [dstin, #3, mul vl]
-	st1b	z0.b, p4, [dstin, #4, mul vl]
-	st1b	z0.b, p5, [dstin, #5, mul vl]
-	st1b	z0.b, p6, [dstin, #6, mul vl]
-	st1b	z0.b, p7, [dstin, #7, mul vl]
-	ret
-	.endm
 
-ENTRY (MEMSET)
-
-	PTR_ARG (0)
-	SIZE_ARG (2)
-
-	cbnz	count, 1f
+	.p2align 4
+1:
+	add	dst, dstin, count
+	cmp	count, vector_length, lsl 2
+	b.hi	1f
+	st1b	z0.b, p0, [dst, #-2, mul vl]
+	st1b	z0.b, p0, [dst, #-1, mul vl]
+	ret
+1:
+	cmp	count, vector_length, lsl 3     // vector_length * 8
+	b.hi	L(vl_agnostic)
+
+	st1b	z0.b, p0, [dstin, #2, mul vl]
+	st1b	z0.b, p0, [dstin, #3, mul vl]
+	st1b	z0.b, p0, [dst, #-4, mul vl]
+	st1b	z0.b, p0, [dst, #-3, mul vl]
+	st1b	z0.b, p0, [dst, #-2, mul vl]
+	st1b	z0.b, p0, [dst, #-1, mul vl]
 	ret
-1:	dup	z0.b, valw
-	cntb	vector_length
-	// shortcut for less than vector_length * 8
-	// gives a free ptrue to p0.b for n >= vector_length
-	shortcut_for_small_size L(vl_agnostic)
-	// end of shortcut
 
 L(vl_agnostic): // VL Agnostic
 	mov	rest, count
 	mov	dst, dstin
-	add	dstend, dstin, count
-	// if rest >= L2_SIZE && vector_length == 64 then L(L2)
 	mov	tmp1, 64
-	cmp	rest, L2_SIZE
-	ccmp	vector_length, tmp1, 0, cs
-	b.eq	L(L2)
 	// if rest >= L1_SIZE && vector_length == 64 then L(L1_prefetch)
 	cmp	rest, L1_SIZE
 	ccmp	vector_length, tmp1, 0, cs
 	b.eq	L(L1_prefetch)
 
-L(unroll32):
-	lsl	tmp1, vector_length, 3	// vector_length * 8
-	lsl	tmp2, vector_length, 5	// vector_length * 32
-	.p2align 3
-1:	cmp	rest, tmp2
-	b.cc	L(unroll8)
-	st1b_unroll
-	add	dst, dst, tmp1
-	st1b_unroll
-	add	dst, dst, tmp1
-	st1b_unroll
-	add	dst, dst, tmp1
-	st1b_unroll
-	add	dst, dst, tmp1
-	sub	rest, rest, tmp2
-	b	1b
-
 L(unroll8):
 	lsl	tmp1, vector_length, 3
-	.p2align 3
+	.p2align 4
 1:	cmp	rest, tmp1
-	b.cc	L(last)
+	b.ls	L(last)
 	st1b_unroll
 	add	dst, dst, tmp1
 	sub	rest, rest, tmp1
 	b	1b
 
-L(last):
-	whilelo	p0.b, xzr, rest
-	whilelo	p1.b, vector_length, rest
-	b.last	1f
-	st1b	z0.b, p0, [dst, #0, mul vl]
-	st1b	z0.b, p1, [dst, #1, mul vl]
-	ret
-1:	lsl	tmp1, vector_length, 1	// vector_length * 2
-	whilelo	p2.b, tmp1, rest
-	incb	tmp1
-	whilelo	p3.b, tmp1, rest
-	b.last	1f
-	st1b	z0.b, p0, [dst, #0, mul vl]
-	st1b	z0.b, p1, [dst, #1, mul vl]
-	st1b	z0.b, p2, [dst, #2, mul vl]
-	st1b	z0.b, p3, [dst, #3, mul vl]
-	ret
-1:	lsl	tmp1, vector_length, 2	// vector_length * 4
-	whilelo	p4.b, tmp1, rest
-	incb	tmp1
-	whilelo	p5.b, tmp1, rest
-	incb	tmp1
-	whilelo	p6.b, tmp1, rest
-	incb	tmp1
-	whilelo	p7.b, tmp1, rest
-	st1b	z0.b, p0, [dst, #0, mul vl]
-	st1b	z0.b, p1, [dst, #1, mul vl]
-	st1b	z0.b, p2, [dst, #2, mul vl]
-	st1b	z0.b, p3, [dst, #3, mul vl]
-	st1b	z0.b, p4, [dst, #4, mul vl]
-	st1b	z0.b, p5, [dst, #5, mul vl]
-	st1b	z0.b, p6, [dst, #6, mul vl]
-	st1b	z0.b, p7, [dst, #7, mul vl]
+L(last): // store 8 vectors from the end
+	add	dst, dst, rest
+	st1b	z0.b, p0, [dst, #-8, mul vl]
+	st1b	z0.b, p0, [dst, #-7, mul vl]
+	st1b	z0.b, p0, [dst, #-6, mul vl]
+	st1b	z0.b, p0, [dst, #-5, mul vl]
+	st1b	z0.b, p0, [dst, #-4, mul vl]
+	st1b	z0.b, p0, [dst, #-3, mul vl]
+	st1b	z0.b, p0, [dst, #-2, mul vl]
+	st1b	z0.b, p0, [dst, #-1, mul vl]
 	ret
 
 L(L1_prefetch): // if rest >= L1_SIZE
+	cmp	rest, L2_SIZE
+	b.hs	L(L2)
 	.p2align 3
 1:	st1b_unroll 0, 3
 	prfm	pstl1keep, [dst, PF_DIST_L1]
@@ -208,37 +137,19 @@ L(L1_prefetch): // if rest >= L1_SIZE
 	add	dst, dst, CACHE_LINE_SIZE * 2
 	sub	rest, rest, CACHE_LINE_SIZE * 2
 	cmp	rest, L1_SIZE
-	b.ge	1b
-	cbnz	rest, L(unroll32)
-	ret
+	b.hs	1b
+	b	L(unroll8)
 
 L(L2):
-	// align dst address at vector_length byte boundary
-	sub	tmp1, vector_length, 1
-	ands	tmp2, dst, tmp1
-	// if vl_remainder == 0
-	b.eq	1f
-	sub	vl_remainder, vector_length, tmp2
-	// process remainder until the first vector_length boundary
-	whilelt	p2.b, xzr, vl_remainder
-	st1b	z0.b, p2, [dst]
-	add	dst, dst, vl_remainder
-	sub	rest, rest, vl_remainder
 	// align dstin address at CACHE_LINE_SIZE byte boundary
-1:	mov	tmp1, CACHE_LINE_SIZE
-	ands	tmp2, dst, CACHE_LINE_SIZE - 1
-	// if cl_remainder == 0
-	b.eq	L(L2_dc_zva)
-	sub	cl_remainder, tmp1, tmp2
-	// process remainder until the first CACHE_LINE_SIZE boundary
-	mov	tmp1, xzr       // index
-2:	whilelt	p2.b, tmp1, cl_remainder
-	st1b	z0.b, p2, [dst, tmp1]
-	incb	tmp1
-	cmp	tmp1, cl_remainder
-	b.lo	2b
-	add	dst, dst, cl_remainder
-	sub	rest, rest, cl_remainder
+	and	tmp1, dst, CACHE_LINE_SIZE - 1
+	sub	tmp1, tmp1, CACHE_LINE_SIZE
+	st1b	z0.b, p0, [dst, #0, mul vl]
+	st1b	z0.b, p0, [dst, #1, mul vl]
+	st1b	z0.b, p0, [dst, #2, mul vl]
+	st1b	z0.b, p0, [dst, #3, mul vl]
+	sub	dst, dst, tmp1
+	add	rest, rest, tmp1
 
 L(L2_dc_zva):
 	// zero fill
@@ -250,16 +161,15 @@ L(L2_dc_zva):
 	.p2align 3
 1:	st1b_unroll 0, 3
 	add	tmp2, dst, zva_len
-	dc	 zva, tmp2
+	dc	zva, tmp2
 	st1b_unroll 4, 7
 	add	tmp2, tmp2, CACHE_LINE_SIZE
 	dc	zva, tmp2
 	add	dst, dst, CACHE_LINE_SIZE * 2
 	sub	rest, rest, CACHE_LINE_SIZE * 2
 	cmp	rest, tmp1	// ZF_DIST + CACHE_LINE_SIZE * 2
-	b.ge	1b
-	cbnz	rest, L(unroll8)
-	ret
+	b.hs	1b
+	b	L(unroll8)
 
 END (MEMSET)
 libc_hidden_builtin_def (MEMSET)



* [PATCH v2] AArch64: Improve A64FX memset
  2021-06-30 15:49 [PATCH] AArch64: Improve A64FX memset Wilco Dijkstra via Libc-alpha
@ 2021-07-09 12:23 ` Wilco Dijkstra via Libc-alpha
From: Wilco Dijkstra via Libc-alpha @ 2021-07-09 12:23 UTC (permalink / raw)
  To: naohirot@fujitsu.com; +Cc: 'GNU C Library'

Hi Naohiro,

Here is version 2, which should improve things a lot:

v2: Improve handling of the last 512 bytes, which speeds up medium-sized memsets.
    Further reduce code size by removing the unnecessary unrolling of dc zva
    (a rough C sketch of the simplified loop follows below).
    Speed up huge memsets of both zero and non-zero values.
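
A rough C sketch (not the patch) of that simplified zero-fill loop: clear one
cache line per iteration with DC ZVA and leave the last few cache lines, plus
any misaligned tail, to the ordinary vector-store code. The 256-byte CACHE_LINE
value and the tail_memset_vectors helper are assumptions for illustration only.

#include <stddef.h>

enum { CACHE_LINE = 256 };	/* assumed to match the DC ZVA block size */

/* Hypothetical tail routine using predicated/overlapping vector stores.  */
extern void tail_memset_vectors (char *dst, int c, size_t count);

static void
zero_bulk_then_tail (char *dst, size_t count)
{
  /* Assumes dst is CACHE_LINE-aligned on entry.  */
  while (count > 4 * CACHE_LINE)
    {
      __asm__ volatile ("dc zva, %0" : : "r" (dst) : "memory");
      dst += CACHE_LINE;
      count -= CACHE_LINE;
    }
  tail_memset_vectors (dst, 0, count);
}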

Reduce the code size of the A64FX memset by simplifying the small memset code,
improving the handling of alignment and of the last 8 vectors, and removing
redundant instructions and branches. The memset code size goes down from 1032 to
376 bytes. Large zeroing memsets now use DC ZVA, which almost doubles performance;
large non-zero memsets use the unroll8 loop, which is about 10% faster.
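
As an illustration of that large-size behaviour, here is a rough C sketch (not
the patch; the helpers unroll8_memset and dc_zva_zero are made-up names). Only
zeroing memsets go to DC ZVA; the destination is first aligned to a cache line
by storing one full cache line of vectors and rounding dst up, mirroring the
L(L2) entry in the patch. It assumes an SVE vector length of 64 bytes (A64FX),
so four vectors cover one 256-byte line:

#include <arm_sve.h>
#include <stddef.h>
#include <stdint.h>

enum { CACHE_LINE = 256 };

extern void unroll8_memset (uint8_t *dst, uint8_t c, size_t count); /* hypothetical */
extern void dc_zva_zero (uint8_t *dst, size_t count);		     /* hypothetical */

static void
huge_memset (uint8_t *dst, uint8_t c, size_t count)
{
  if (c != 0)
    {
      /* Non-zero fill: plain unrolled vector-store loop.  */
      unroll8_memset (dst, c, count);
      return;
    }

  /* Store one full cache line (4 x 64-byte vectors), then round dst up to
     the next cache-line boundary; the overlap makes the skipped bytes safe
     without any predication.  */
  svuint8_t v = svdup_n_u8 (0);
  svbool_t all = svptrue_b8 ();
  size_t vl = svcntb ();
  svst1_u8 (all, dst + 0 * vl, v);
  svst1_u8 (all, dst + 1 * vl, v);
  svst1_u8 (all, dst + 2 * vl, v);
  svst1_u8 (all, dst + 3 * vl, v);

  size_t adjust = CACHE_LINE - ((uintptr_t) dst & (CACHE_LINE - 1));
  dst += adjust;
  count -= adjust;

  /* Zero whole cache lines with DC ZVA; any remainder is left to the
     vector-store tail inside dc_zva_zero.  */
  dc_zva_zero (dst, count);
}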

Passes GLIBC regress, OK for commit?

---


diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
index ce54e5418b08c8bc0ecc7affff68a59272ba6397..2737f0cba3e1a9ac887cd8072f6122f4852a9f94 100644
--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
+++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
@@ -30,11 +30,7 @@
 #define L2_SIZE         (8*1024*1024)	// L2 8MB - 1MB
 #define CACHE_LINE_SIZE	256
 #define PF_DIST_L1	(CACHE_LINE_SIZE * 16)	// Prefetch distance L1
-#define ZF_DIST		(CACHE_LINE_SIZE * 21)	// Zerofill distance
-#define rest		x8
 #define vector_length	x9
-#define vl_remainder	x10	// vector_length remainder
-#define cl_remainder	x11	// CACHE_LINE_SIZE remainder
 
 #if HAVE_AARCH64_SVE_ASM
 # if IS_IN (libc)
@@ -42,224 +38,126 @@
 
 	.arch armv8.2-a+sve
 
-	.macro dc_zva times
-	dc	zva, tmp1
-	add	tmp1, tmp1, CACHE_LINE_SIZE
-	.if \times-1
-	dc_zva "(\times-1)"
-	.endif
-	.endm
-
 	.macro st1b_unroll first=0, last=7
-	st1b	z0.b, p0, [dst, #\first, mul vl]
+	st1b	z0.b, p0, [dst, \first, mul vl]
 	.if \last-\first
 	st1b_unroll "(\first+1)", \last
 	.endif
 	.endm
 
-	.macro shortcut_for_small_size exit
-	// if rest <= vector_length * 2
+
+#undef BTI_C
+#define BTI_C
+
+ENTRY (MEMSET)
+	PTR_ARG (0)
+	SIZE_ARG (2)
+
+	dup	z0.b, valw
 	whilelo	p0.b, xzr, count
+	cntb	vector_length
 	whilelo	p1.b, vector_length, count
+	st1b	z0.b, p0, [dstin, 0, mul vl]
+	st1b	z0.b, p1, [dstin, 1, mul vl]
 	b.last	1f
-	st1b	z0.b, p0, [dstin, #0, mul vl]
-	st1b	z0.b, p1, [dstin, #1, mul vl]
 	ret
-1:	// if rest > vector_length * 8
-	cmp	count, vector_length, lsl 3	// vector_length * 8
-	b.hi	\exit
-	// if rest <= vector_length * 4
-	lsl	tmp1, vector_length, 1	// vector_length * 2
-	whilelo	p2.b, tmp1, count
-	incb	tmp1
-	whilelo	p3.b, tmp1, count
-	b.last	1f
-	st1b	z0.b, p0, [dstin, #0, mul vl]
-	st1b	z0.b, p1, [dstin, #1, mul vl]
-	st1b	z0.b, p2, [dstin, #2, mul vl]
-	st1b	z0.b, p3, [dstin, #3, mul vl]
-	ret
-1:	// if rest <= vector_length * 8
-	lsl	tmp1, vector_length, 2	// vector_length * 4
-	whilelo	p4.b, tmp1, count
-	incb	tmp1
-	whilelo	p5.b, tmp1, count
-	b.last	1f
-	st1b	z0.b, p0, [dstin, #0, mul vl]
-	st1b	z0.b, p1, [dstin, #1, mul vl]
-	st1b	z0.b, p2, [dstin, #2, mul vl]
-	st1b	z0.b, p3, [dstin, #3, mul vl]
-	st1b	z0.b, p4, [dstin, #4, mul vl]
-	st1b	z0.b, p5, [dstin, #5, mul vl]
-	ret
-1:	lsl	tmp1, vector_length, 2	// vector_length * 4
-	incb	tmp1			// vector_length * 5
-	incb	tmp1			// vector_length * 6
-	whilelo	p6.b, tmp1, count
-	incb	tmp1
-	whilelo	p7.b, tmp1, count
-	st1b	z0.b, p0, [dstin, #0, mul vl]
-	st1b	z0.b, p1, [dstin, #1, mul vl]
-	st1b	z0.b, p2, [dstin, #2, mul vl]
-	st1b	z0.b, p3, [dstin, #3, mul vl]
-	st1b	z0.b, p4, [dstin, #4, mul vl]
-	st1b	z0.b, p5, [dstin, #5, mul vl]
-	st1b	z0.b, p6, [dstin, #6, mul vl]
-	st1b	z0.b, p7, [dstin, #7, mul vl]
-	ret
-	.endm
 
-ENTRY (MEMSET)
-
-	PTR_ARG (0)
-	SIZE_ARG (2)
+	// count >= vector_length * 2
+	.p2align 4
+1:	add	dst, dstin, count
+	cmp	count, vector_length, lsl 2
+	b.hi	1f
+	st1b	z0.b, p0, [dst, -2, mul vl]
+	st1b	z0.b, p0, [dst, -1, mul vl]
+	ret
 
-	cbnz	count, 1f
+	// count > vector_length * 4
+1:	cmp	count, vector_length, lsl 3
+	b.hi	L(vl_agnostic)
+	st1b	z0.b, p0, [dstin, 2, mul vl]
+	st1b	z0.b, p0, [dstin, 3, mul vl]
+	st1b	z0.b, p0, [dst, -4, mul vl]
+	st1b	z0.b, p0, [dst, -3, mul vl]
+	st1b	z0.b, p0, [dst, -2, mul vl]
+	st1b	z0.b, p0, [dst, -1, mul vl]
 	ret
-1:	dup	z0.b, valw
-	cntb	vector_length
-	// shortcut for less than vector_length * 8
-	// gives a free ptrue to p0.b for n >= vector_length
-	shortcut_for_small_size L(vl_agnostic)
-	// end of shortcut
 
-L(vl_agnostic): // VL Agnostic
-	mov	rest, count
+	// count >= vector_length * 8
+	.p2align 4
+L(vl_agnostic):
 	mov	dst, dstin
-	add	dstend, dstin, count
-	// if rest >= L2_SIZE && vector_length == 64 then L(L2)
 	mov	tmp1, 64
-	cmp	rest, L2_SIZE
-	ccmp	vector_length, tmp1, 0, cs
-	b.eq	L(L2)
-	// if rest >= L1_SIZE && vector_length == 64 then L(L1_prefetch)
-	cmp	rest, L1_SIZE
+	// if count >= L1_SIZE && vector_length == 64 then L(L1_prefetch)
+	cmp	count, L1_SIZE
 	ccmp	vector_length, tmp1, 0, cs
 	b.eq	L(L1_prefetch)
 
-L(unroll32):
-	lsl	tmp1, vector_length, 3	// vector_length * 8
-	lsl	tmp2, vector_length, 5	// vector_length * 32
-	.p2align 3
-1:	cmp	rest, tmp2
-	b.cc	L(unroll8)
-	st1b_unroll
-	add	dst, dst, tmp1
-	st1b_unroll
-	add	dst, dst, tmp1
-	st1b_unroll
-	add	dst, dst, tmp1
-	st1b_unroll
-	add	dst, dst, tmp1
-	sub	rest, rest, tmp2
-	b	1b
-
+	// count >= 8 * vector_length
 L(unroll8):
 	lsl	tmp1, vector_length, 3
-	.p2align 3
-1:	cmp	rest, tmp1
-	b.cc	L(last)
-	st1b_unroll
+	sub	count, count, tmp1
+	lsl	tmp2, vector_length, 1
+	.p2align 4
+1:	subs	count, count, tmp1
+	st1b_unroll 0, 7
 	add	dst, dst, tmp1
-	sub	rest, rest, tmp1
-	b	1b
-
-L(last):
-	whilelo	p0.b, xzr, rest
-	whilelo	p1.b, vector_length, rest
-	b.last	1f
-	st1b	z0.b, p0, [dst, #0, mul vl]
-	st1b	z0.b, p1, [dst, #1, mul vl]
-	ret
-1:	lsl	tmp1, vector_length, 1	// vector_length * 2
-	whilelo	p2.b, tmp1, rest
-	incb	tmp1
-	whilelo	p3.b, tmp1, rest
-	b.last	1f
-	st1b	z0.b, p0, [dst, #0, mul vl]
-	st1b	z0.b, p1, [dst, #1, mul vl]
-	st1b	z0.b, p2, [dst, #2, mul vl]
-	st1b	z0.b, p3, [dst, #3, mul vl]
-	ret
-1:	lsl	tmp1, vector_length, 2	// vector_length * 4
-	whilelo	p4.b, tmp1, rest
-	incb	tmp1
-	whilelo	p5.b, tmp1, rest
-	incb	tmp1
-	whilelo	p6.b, tmp1, rest
-	incb	tmp1
-	whilelo	p7.b, tmp1, rest
-	st1b	z0.b, p0, [dst, #0, mul vl]
-	st1b	z0.b, p1, [dst, #1, mul vl]
-	st1b	z0.b, p2, [dst, #2, mul vl]
-	st1b	z0.b, p3, [dst, #3, mul vl]
-	st1b	z0.b, p4, [dst, #4, mul vl]
-	st1b	z0.b, p5, [dst, #5, mul vl]
-	st1b	z0.b, p6, [dst, #6, mul vl]
-	st1b	z0.b, p7, [dst, #7, mul vl]
+	b.hi	1b
+
+	add	dst, dst, count
+	add	count, count, tmp1
+	cmp	count, tmp2
+	b.ls	2f
+	add	tmp2, vector_length, vector_length, lsl 2
+	cmp	count, tmp2
+	b.ls	5f
+	st1b	z0.b, p0, [dst, 0, mul vl]
+	st1b	z0.b, p0, [dst, 1, mul vl]
+	st1b	z0.b, p0, [dst, 2, mul vl]
+5:	st1b	z0.b, p0, [dst, 3, mul vl]
+	st1b	z0.b, p0, [dst, 4, mul vl]
+	st1b	z0.b, p0, [dst, 5, mul vl]
+2:	st1b	z0.b, p0, [dst, 6, mul vl]
+	st1b	z0.b, p0, [dst, 7, mul vl]
 	ret
 
-L(L1_prefetch): // if rest >= L1_SIZE
+	// count >= L1_SIZE
 	.p2align 3
+L(L1_prefetch):
+	cmp	count, L2_SIZE
+	b.hs	L(L2)
 1:	st1b_unroll 0, 3
 	prfm	pstl1keep, [dst, PF_DIST_L1]
 	st1b_unroll 4, 7
 	prfm	pstl1keep, [dst, PF_DIST_L1 + CACHE_LINE_SIZE]
 	add	dst, dst, CACHE_LINE_SIZE * 2
-	sub	rest, rest, CACHE_LINE_SIZE * 2
-	cmp	rest, L1_SIZE
-	b.ge	1b
-	cbnz	rest, L(unroll32)
-	ret
+	sub	count, count, CACHE_LINE_SIZE * 2
+	cmp	count, PF_DIST_L1
+	b.hs	1b
+	b	L(unroll8)
 
+	// count >= L2_SIZE
 L(L2):
-	// align dst address at vector_length byte boundary
-	sub	tmp1, vector_length, 1
-	ands	tmp2, dst, tmp1
-	// if vl_remainder == 0
-	b.eq	1f
-	sub	vl_remainder, vector_length, tmp2
-	// process remainder until the first vector_length boundary
-	whilelt	p2.b, xzr, vl_remainder
-	st1b	z0.b, p2, [dst]
-	add	dst, dst, vl_remainder
-	sub	rest, rest, vl_remainder
-	// align dstin address at CACHE_LINE_SIZE byte boundary
-1:	mov	tmp1, CACHE_LINE_SIZE
-	ands	tmp2, dst, CACHE_LINE_SIZE - 1
-	// if cl_remainder == 0
-	b.eq	L(L2_dc_zva)
-	sub	cl_remainder, tmp1, tmp2
-	// process remainder until the first CACHE_LINE_SIZE boundary
-	mov	tmp1, xzr       // index
-2:	whilelt	p2.b, tmp1, cl_remainder
-	st1b	z0.b, p2, [dst, tmp1]
-	incb	tmp1
-	cmp	tmp1, cl_remainder
-	b.lo	2b
-	add	dst, dst, cl_remainder
-	sub	rest, rest, cl_remainder
-
-L(L2_dc_zva):
-	// zero fill
-	mov	tmp1, dst
-	dc_zva	(ZF_DIST / CACHE_LINE_SIZE) - 1
-	mov	zva_len, ZF_DIST
-	add	tmp1, zva_len, CACHE_LINE_SIZE * 2
-	// unroll
-	.p2align 3
-1:	st1b_unroll 0, 3
-	add	tmp2, dst, zva_len
-	dc	 zva, tmp2
-	st1b_unroll 4, 7
-	add	tmp2, tmp2, CACHE_LINE_SIZE
-	dc	zva, tmp2
-	add	dst, dst, CACHE_LINE_SIZE * 2
-	sub	rest, rest, CACHE_LINE_SIZE * 2
-	cmp	rest, tmp1	// ZF_DIST + CACHE_LINE_SIZE * 2
-	b.ge	1b
-	cbnz	rest, L(unroll8)
-	ret
+	tst	valw, 255
+	b.ne	L(unroll8)
+	// align dst to CACHE_LINE_SIZE byte boundary
+	and	tmp1, dst, CACHE_LINE_SIZE - 1
+	sub	tmp1, tmp1, CACHE_LINE_SIZE
+	st1b	z0.b, p0, [dst, 0, mul vl]
+	st1b	z0.b, p0, [dst, 1, mul vl]
+	st1b	z0.b, p0, [dst, 2, mul vl]
+	st1b	z0.b, p0, [dst, 3, mul vl]
+	sub	dst, dst, tmp1
+	add	count, count, tmp1
+
+	// clear cachelines using DC ZVA
+	sub	count, count, CACHE_LINE_SIZE * 4
+	.p2align 4
+1:	dc	zva, dst
+	add	dst, dst, CACHE_LINE_SIZE
+	subs	count, count, CACHE_LINE_SIZE
+	b.hs	1b
+	add	count, count, CACHE_LINE_SIZE * 4
+	b	L(unroll8)
 
 END (MEMSET)
 libc_hidden_builtin_def (MEMSET)


