unofficial mirror of libc-alpha@sourceware.org
* [PATCH v3 1/2] powerpc: Add optimized strncpy for POWER9
@ 2020-09-29 15:21 Raphael Moreira Zinsly via Libc-alpha
  2020-09-29 15:21 ` [PATCH v3 2/2] powerpc: Add optimized stpncpy " Raphael Moreira Zinsly via Libc-alpha
                   ` (3 more replies)
  0 siblings, 4 replies; 10+ messages in thread
From: Raphael Moreira Zinsly via Libc-alpha @ 2020-09-29 15:21 UTC (permalink / raw)
  To: libc-alpha; +Cc: murphyp, Raphael Moreira Zinsly, pc

Changes since v2:
	- Check for VSX support.
	- Call memset for large counts when padding with zeros (see the sketch below).
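As a rough illustration of that change, the zero-padding tail now behaves
like the following C sketch (the 16-byte store loop and the experimentally
chosen 255-byte threshold are taken from the assembly below; the helper
itself is hypothetical and only illustrative):

#include <string.h>

/* Hypothetical sketch of the padding tail: store 16 zero bytes per
   iteration (standing in for the stxv vector stores) and finish with one
   partial store, but hand large remainders (>= 255 bytes) to memset.  */
static void
zero_pad (char *dest, size_t remaining)
{
  if (remaining >= 255)
    {
      memset (dest, '\0', remaining);   /* large pad: call memset */
      return;
    }
  while (remaining > 16)
    {
      memset (dest, '\0', 16);          /* one 16-byte vector store */
      dest += 16;
      remaining -= 16;
    }
  memset (dest, '\0', remaining);       /* final partial store (stxvl) */
}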

---8<---

Similar to the strcpy P9 optimization, this version uses VSX to improve
performance.
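
For orientation, the copy phase implemented by the assembly has roughly the
following shape in C (alignment handling and the page-safe partial loads are
omitted; null_pos16 is a hypothetical stand-in for the vcmpequb./vctzlsbb
vector checks, and the padding corresponds to the zero_pad sketch above):

#include <stddef.h>
#include <string.h>

/* Hypothetical helper: index of the first zero byte in a 16-byte block,
   or 16 if there is none.  */
static size_t
null_pos16 (const char *p)
{
  for (size_t i = 0; i < 16; i++)
    if (p[i] == '\0')
      return i;
  return 16;
}

/* Rough C shape of the strategy: scan and copy src one 16-byte block at a
   time until a null byte or n is reached, then zero-pad the remainder.  */
char *
strncpy_sketch (char *dest, const char *src, size_t n)
{
  size_t i = 0;
  while (n - i >= 16)
    {
      size_t pos = null_pos16 (src + i);
      if (pos < 16)                             /* block holds the null */
        {
          memcpy (dest + i, src + i, pos + 1);  /* copy including the null */
          memset (dest + i + pos + 1, '\0', n - i - pos - 1);
          return dest;
        }
      memcpy (dest + i, src + i, 16);           /* full 16-byte block */
      i += 16;
    }
  /* Fewer than 16 bytes of n remain: byte-wise tail, then pad.  */
  for (; i < n && (dest[i] = src[i]) != '\0'; i++)
    ;
  memset (dest + i, '\0', n - i);
  return dest;
}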
---
 sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 343 ++++++++++++++++++
 sysdeps/powerpc/powerpc64/multiarch/Makefile  |   2 +-
 .../powerpc64/multiarch/ifunc-impl-list.c     |   6 +
 .../powerpc64/multiarch/strncpy-power9.S      |  32 ++
 sysdeps/powerpc/powerpc64/multiarch/strncpy.c |   8 +
 5 files changed, 390 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strncpy.S
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S

diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
new file mode 100644
index 0000000000..67cb648c65
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
@@ -0,0 +1,343 @@
+/* Optimized strncpy implementation for POWER9 LE.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+# ifndef STRNCPY
+#  define FUNC_NAME strncpy
+# else
+#  define FUNC_NAME STRNCPY
+# endif
+
+#ifndef MEMSET
+/* For builds without IFUNC support, local calls should be made to internal
+   GLIBC symbol (created by libc_hidden_builtin_def).  */
+# ifdef SHARED
+#  define MEMSET_is_local
+#  define MEMSET   __GI_memset
+# else
+#  define MEMSET   memset
+# endif
+#endif
+
+#define FRAMESIZE (FRAME_MIN_SIZE+48)
+
+/* Implements the function
+
+   char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+   The implementation can load bytes past a null terminator, but only
+   up to the next 16-byte aligned address, so it never crosses a page.  */
+
+.machine power9
+#ifdef MEMSET_is_local
+ENTRY_TOCLESS (FUNC_NAME, 4)
+#else
+ENTRY (FUNC_NAME, 4)
+#endif
+	CALL_MCOUNT 2
+
+	/* NULL string optimizations  */
+	cmpdi   r5, 0
+	beqlr
+
+	lbz	r0,0(r4)
+	stb	r0,0(r3)
+	addi	r11,r3,1
+	addi	r5,r5,-1
+	vspltisb v18,0		/* Zeroes in v18  */
+	cmpdi	r0,0
+	beq	L(zero_padding)
+
+	/* Empty/1-byte string optimization  */
+	cmpdi	r5,0
+	beqlr
+
+	addi	r4,r4,1
+	neg	r7,r4
+	rldicl	r9,r7,0,60	/* How many bytes to get source 16B aligned?  */
+
+	/* Get source 16B aligned  */
+	lvx	v0,0,r4
+	lvsr	v1,0,r4
+	vperm	v0,v18,v0,v1
+
+	vcmpequb v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
+	vctzlsbb r7,v6		/* Number of trailing zeroes  */
+	addi	r8,r7,1		/* Add null terminator  */
+
+	/* r8 = bytes including null
+	   r9 = bytes to get source 16B aligned
+	   if r8 > r9
+	      no null, copy r9 bytes
+	   else
+	      there is a null, copy r8 bytes and return.  */
+	cmpld	r8,r9
+	bgt	L(no_null)
+
+	cmpld	cr6,r8,r5	/* r8 <= n?  */
+	ble	cr6,L(null)
+
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+
+	blr
+
+L(null):
+	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+
+	add	r11,r11,r8
+	sub	r5,r5,r8
+	b L(zero_padding)
+
+L(no_null):
+	cmpld	r9,r5		/* Check if length was reached.  */
+	bge	L(n_tail1)
+
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+
+	add	r4,r4,r9
+	add	r11,r11,r9
+	sub	r5,r5,r9
+
+L(loop):
+	cmpldi	cr6,r5,64	/* Check if length was reached.  */
+	ble	cr6,L(final_loop)
+
+	lxv	32+v0,0(r4)
+	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
+	bne	cr6,L(prep_tail1)
+
+	lxv	32+v1,16(r4)
+	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
+	bne	cr6,L(prep_tail2)
+
+	lxv	32+v2,32(r4)
+	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
+	bne	cr6,L(prep_tail3)
+
+	lxv	32+v3,48(r4)
+	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
+	bne	cr6,L(prep_tail4)
+
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	stxv	32+v3,48(r11)
+
+	addi	r4,r4,64
+	addi	r11,r11,64
+	addi	r5,r5,-64
+
+	b	L(loop)
+
+L(final_loop):
+	cmpldi	cr5,r5,16
+	lxv	32+v0,0(r4)
+	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
+	ble	cr5,L(prep_n_tail1)
+	bne	cr6,L(count_tail1)
+	addi	r5,r5,-16
+
+	cmpldi	cr5,r5,16
+	lxv	32+v1,16(r4)
+	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
+	ble	cr5,L(prep_n_tail2)
+	bne	cr6,L(count_tail2)
+	addi	r5,r5,-16
+
+	cmpldi	cr5,r5,16
+	lxv	32+v2,32(r4)
+	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
+	ble	cr5,L(prep_n_tail3)
+	bne	cr6,L(count_tail3)
+	addi	r5,r5,-16
+
+	lxv	32+v3,48(r4)
+	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
+	beq	cr6,L(n_tail4)
+
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+	cmpld	r8,r5		/* r8 < n?  */
+	blt	L(tail4)
+
+L(n_tail4):
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,48	/* Offset */
+	stxvl	32+v3,r11,r10	/* Partial store  */
+	blr
+
+L(prep_n_tail1):
+	beq	cr6,L(n_tail1)	/* Any zero bytes?  */
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+	cmpld	r8,r5		/* r8 < n?  */
+	blt	L(tail1)
+
+L(n_tail1):
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+	blr
+
+L(prep_n_tail2):
+	beq	cr6,L(n_tail2)	/* Any zero bytes?  */
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+	cmpld	r8,r5		/* r8 < n?  */
+	blt	L(tail2)
+
+L(n_tail2):
+	stxv	32+v0,0(r11)
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,16	/* offset */
+	stxvl	32+v1,r11,r10	/* Partial store  */
+	blr
+
+L(prep_n_tail3):
+	beq	cr6,L(n_tail3)	/* Any zero bytes?  */
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+	cmpld	r8,r5		/* r8 < n?  */
+	blt	L(tail3)
+
+L(n_tail3):
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,32	/* Offset */
+	stxvl	32+v2,r11,r10	/* Partial store  */
+	blr
+
+L(prep_tail1):
+L(count_tail1):
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+L(tail1):
+	addi	r9,r8,1		/* Add null terminator  */
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+	add	r11,r11,r9
+	sub	r5,r5,r9
+	b L(zero_padding)
+
+L(prep_tail2):
+	addi	r5,r5,-16
+L(count_tail2):
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+L(tail2):
+	addi	r9,r8,1		/* Add null terminator  */
+	stxv	32+v0,0(r11)
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,16	/* offset */
+	stxvl	32+v1,r11,r10	/* Partial store  */
+	add	r11,r11,r9
+	sub	r5,r5,r9
+	b L(zero_padding)
+
+L(prep_tail3):
+	addi	r5,r5,-32
+L(count_tail3):
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+L(tail3):
+	addi	r9,r8,1		/* Add null terminator  */
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,32	/* offset */
+	stxvl	32+v2,r11,r10	/* Partial store  */
+	add	r11,r11,r9
+	sub	r5,r5,r9
+	b L(zero_padding)
+
+L(prep_tail4):
+	addi	r5,r5,-48
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+L(tail4):
+	addi	r9,r8,1		/* Add null terminator  */
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,48	/* offset */
+	stxvl	32+v3,r11,r10	/* Partial store  */
+	add	r11,r11,r9
+	sub	r5,r5,r9
+
+/* This code pads the remainder of dest with NULL bytes.  For large numbers
+   memset gives a better performance, 255 was chosen through experimentation.
+   */
+L(zero_padding):
+	cmpldi	r5,255
+	bge	L(zero_padding_memset)
+
+L(zero_padding_loop):
+	cmpldi	cr6,r5,16	/* Check if length was reached.  */
+	ble	cr6,L(zero_padding_end)
+
+	stxv	v18,0(r11)
+	addi	r11,r11,16
+	addi	r5,r5,-16
+
+	b	L(zero_padding_loop)
+
+L(zero_padding_end):
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	stxvl	v18,r11,r10	/* Partial store  */
+	blr
+
+	.align	4
+L(zero_padding_memset):
+	std	r30,-8(r1)   /* Save r30 on the stack.  */
+	mr	r30,r3       /* Save the return value of strncpy.  */
+	/* Prepare the call to memset.  */
+	mr	r3,r11       /* Pointer to the area to be zero-filled.  */
+	li	r4,0         /* Byte to be written (zero).  */
+
+	/* We delayed the creation of the stack frame, as well as the saving of
+	   the link register, because only at this point, we are sure that
+	   doing so is actually needed.  */
+
+	/* Save the link register.  */
+	mflr	r0
+	std	r0,16(r1)
+
+	/* Create the stack frame.  */
+	stdu	r1,-FRAMESIZE(r1)
+	cfi_adjust_cfa_offset(FRAMESIZE)
+	cfi_offset(lr, 16)
+
+	bl	MEMSET
+#ifndef MEMSET_is_local
+	nop
+#endif
+
+	ld	r0,FRAMESIZE+16(r1)
+
+	mr	r3,r30       /* Restore the return value of strncpy, i.e.:
+				dest.  */
+	ld	r30,FRAMESIZE-8(r1) /* Restore r30.  */
+	/* Restore the stack frame.  */
+	addi	r1,r1,FRAMESIZE
+	cfi_adjust_cfa_offset(-FRAMESIZE)
+	/* Restore the link register.  */
+	mtlr	r0
+	cfi_restore(lr)
+	blr
+
+END (FUNC_NAME)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 19acb6c64a..cd2b47b403 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
 
 ifneq (,$(filter %le,$(config-machine)))
 sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
-		   rawmemchr-power9 strlen-power9
+		   rawmemchr-power9 strlen-power9 strncpy-power9
 endif
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index ea10b00417..fb55b07e53 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -301,6 +301,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c.  */
   IFUNC_IMPL (i, name, strncpy,
+#ifdef __LITTLE_ENDIAN__
+	      IFUNC_IMPL_ADD (array, i, strncpy,
+			      (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+			      && (hwcap & PPC_FEATURE_HAS_VSX),
+			      __strncpy_power9)
+#endif
 	      IFUNC_IMPL_ADD (array, i, strncpy,
 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __strncpy_power8)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
new file mode 100644
index 0000000000..68e1e8d925
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
@@ -0,0 +1,32 @@
+/* Optimized strncpy implementation for POWER9 LE.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
+#define STRNCPY __strncpy_power9
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+/* memset is used to pad the end of the string.  */
+#define MEMSET __memset_power8
+#ifdef SHARED
+#define MEMSET_is_local
+#endif
+
+#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
index 7bacf28aca..3f2108ddae 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
@@ -28,11 +28,19 @@
 extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
 extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
 extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (strncpy) __strncpy_power9 attribute_hidden;
+# endif
 # undef strncpy
 
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
  ifunc symbol properly. */
 libc_ifunc_redirected (__redirect_strncpy, strncpy,
+# ifdef __LITTLE_ENDIAN__
+		     (hwcap2 & PPC_FEATURE2_ARCH_3_00) &&
+		     (hwcap & PPC_FEATURE_HAS_VSX)
+		     ? __strncpy_power9 :
+# endif
 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 		       ? __strncpy_power8
 		       : (hwcap & PPC_FEATURE_HAS_VSX)
-- 
2.26.2



* [PATCH v3 2/2] powerpc: Add optimized stpncpy for POWER9
  2020-09-29 15:21 [PATCH v3 1/2] powerpc: Add optimized strncpy for POWER9 Raphael Moreira Zinsly via Libc-alpha
@ 2020-09-29 15:21 ` Raphael Moreira Zinsly via Libc-alpha
  2020-09-29 15:23   ` Raphael M Zinsly via Libc-alpha
                     ` (2 more replies)
  2020-09-29 15:22 ` [PATCH v3 1/2] powerpc: Add optimized strncpy " Raphael M Zinsly via Libc-alpha
                   ` (2 subsequent siblings)
  3 siblings, 3 replies; 10+ messages in thread
From: Raphael Moreira Zinsly via Libc-alpha @ 2020-09-29 15:21 UTC (permalink / raw)
  To: libc-alpha; +Cc: murphyp, Raphael Moreira Zinsly, pc

Add stpncpy support into the POWER9 strncpy.
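
The behavioural difference is only in the return value; roughly, in C terms
(a hedged sketch of stpncpy semantics, not of the assembly):

#include <string.h>

/* stpncpy copies like strncpy but returns a pointer to the end of the
   copied string: the terminating null written into dest, or dest + n
   when the copy was truncated.  Illustrative only.  */
char *
stpncpy_sketch (char *dest, const char *src, size_t n)
{
  size_t len = strnlen (src, n);        /* bytes before the null, <= n */
  memcpy (dest, src, len);
  memset (dest + len, '\0', n - len);   /* zero padding (may be empty) */
  return dest + len;
}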
---
 sysdeps/powerpc/powerpc64/le/power9/stpncpy.S | 24 +++++++
 sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 68 ++++++++++++++++++-
 sysdeps/powerpc/powerpc64/multiarch/Makefile  |  2 +-
 .../powerpc64/multiarch/ifunc-impl-list.c     |  6 ++
 .../powerpc64/multiarch/stpncpy-power9.S      | 29 ++++++++
 sysdeps/powerpc/powerpc64/multiarch/stpncpy.c |  8 +++
 6 files changed, 135 insertions(+), 2 deletions(-)
 create mode 100644 sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S

diff --git a/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
new file mode 100644
index 0000000000..81d9673d8b
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
@@ -0,0 +1,24 @@
+/* Optimized stpncpy implementation for POWER9 LE.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define USE_AS_STPNCPY
+#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
+
+weak_alias (__stpncpy, stpncpy)
+libc_hidden_def (__stpncpy)
+libc_hidden_builtin_def (stpncpy)
diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
index 67cb648c65..b7d308c984 100644
--- a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
+++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
@@ -18,11 +18,19 @@
 
 #include <sysdep.h>
 
+#ifdef USE_AS_STPNCPY
+# ifndef STPNCPY
+#   define FUNC_NAME __stpncpy
+# else
+#   define FUNC_NAME STPNCPY
+# endif
+#else
 # ifndef STRNCPY
 #  define FUNC_NAME strncpy
 # else
 #  define FUNC_NAME STRNCPY
 # endif
+#endif  /* !USE_AS_STPNCPY  */
 
 #ifndef MEMSET
 /* For builds without IFUNC support, local calls should be made to internal
@@ -41,6 +49,12 @@
 
    char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
 
+   or
+
+   char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+   if USE_AS_STPNCPY is defined.
+
    The implementation can load bytes past a null terminator, but only
    up to the next 16-byte aligned address, so it never crosses a page.  */
 
@@ -66,7 +80,15 @@ ENTRY (FUNC_NAME, 4)
 
 	/* Empty/1-byte string optimization  */
 	cmpdi	r5,0
+#ifdef USE_AS_STPNCPY
+	bgt	L(cont)
+	/* Compute pointer to last byte copied into dest.  */
+	addi	r3,r3,1
+	blr
+L(cont):
+#else
 	beqlr
+#endif
 
 	addi	r4,r4,1
 	neg	r7,r4
@@ -96,12 +118,20 @@ ENTRY (FUNC_NAME, 4)
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
 	stxvl	32+v0,r11,r10	/* Partial store  */
 
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r5
+#endif
 	blr
 
 L(null):
 	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
 	stxvl	32+v0,r11,r10	/* Partial store  */
 
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r7
+#endif
 	add	r11,r11,r8
 	sub	r5,r5,r8
 	b L(zero_padding)
@@ -185,6 +215,10 @@ L(n_tail4):
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,48	/* Offset */
 	stxvl	32+v3,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r5
+#endif
 	blr
 
 L(prep_n_tail1):
@@ -196,6 +230,10 @@ L(prep_n_tail1):
 L(n_tail1):
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
 	stxvl	32+v0,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r5
+#endif
 	blr
 
 L(prep_n_tail2):
@@ -209,6 +247,10 @@ L(n_tail2):
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,16	/* offset */
 	stxvl	32+v1,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r5
+#endif
 	blr
 
 L(prep_n_tail3):
@@ -223,6 +265,10 @@ L(n_tail3):
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,32	/* Offset */
 	stxvl	32+v2,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r5
+#endif
 	blr
 
 L(prep_tail1):
@@ -232,6 +278,10 @@ L(tail1):
 	addi	r9,r8,1		/* Add null terminator  */
 	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
 	stxvl	32+v0,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r8
+#endif
 	add	r11,r11,r9
 	sub	r5,r5,r9
 	b L(zero_padding)
@@ -246,6 +296,10 @@ L(tail2):
 	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,16	/* offset */
 	stxvl	32+v1,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r8
+#endif
 	add	r11,r11,r9
 	sub	r5,r5,r9
 	b L(zero_padding)
@@ -261,6 +315,10 @@ L(tail3):
 	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,32	/* offset */
 	stxvl	32+v2,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r8
+#endif
 	add	r11,r11,r9
 	sub	r5,r5,r9
 	b L(zero_padding)
@@ -276,6 +334,10 @@ L(tail4):
 	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,48	/* offset */
 	stxvl	32+v3,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r8
+#endif
 	add	r11,r11,r9
 	sub	r5,r5,r9
 
@@ -330,7 +392,8 @@ L(zero_padding_memset):
 	ld	r0,FRAMESIZE+16(r1)
 
 	mr	r3,r30       /* Restore the return value of strncpy, i.e.:
-				dest.  */
+				dest.  For stpncpy, the return value is the
+				same as return value of memset.  */
 	ld	r30,FRAMESIZE-8(r1) /* Restore r30.  */
 	/* Restore the stack frame.  */
 	addi	r1,r1,FRAMESIZE
@@ -341,3 +404,6 @@ L(zero_padding_memset):
 	blr
 
 END (FUNC_NAME)
+#ifndef USE_AS_STPNCPY
+libc_hidden_builtin_def (strncpy)
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index cd2b47b403..f46bf50732 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
 
 ifneq (,$(filter %le,$(config-machine)))
 sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
-		   rawmemchr-power9 strlen-power9 strncpy-power9
+		   rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9
 endif
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index fb55b07e53..d0f20cc97f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -318,6 +318,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c.  */
   IFUNC_IMPL (i, name, stpncpy,
+#ifdef __LITTLE_ENDIAN__
+	      IFUNC_IMPL_ADD (array, i, stpncpy,
+			      (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+			      && (hwcap & PPC_FEATURE_HAS_VSX),
+			      __stpncpy_power9)
+#endif
 	      IFUNC_IMPL_ADD (array, i, stpncpy,
 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __stpncpy_power8)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
new file mode 100644
index 0000000000..55daa3455f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
@@ -0,0 +1,29 @@
+/* Optimized stpncpy implementation for POWER9 LE.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define STPNCPY __stpncpy_power9
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#define MEMSET __memset_power8
+#ifdef SHARED
+#define MEMSET_is_local
+#endif
+
+#include <sysdeps/powerpc/powerpc64/le/power9/stpncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
index 17df886431..3758f29ad1 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
@@ -26,10 +26,18 @@
 extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
 extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
 extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (__stpncpy) __stpncpy_power9 attribute_hidden;
+# endif
 # undef stpncpy
 # undef __stpncpy
 
 libc_ifunc_redirected (__redirect___stpncpy, __stpncpy,
+# ifdef __LITTLE_ENDIAN__
+		     (hwcap2 & PPC_FEATURE2_ARCH_3_00) &&
+		     (hwcap & PPC_FEATURE_HAS_VSX)
+		     ? __stpncpy_power9 :
+# endif
 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 		       ? __stpncpy_power8
 		       : (hwcap & PPC_FEATURE_HAS_VSX)
-- 
2.26.2



* Re: [PATCH v3 1/2] powerpc: Add optimized strncpy for POWER9
  2020-09-29 15:21 [PATCH v3 1/2] powerpc: Add optimized strncpy for POWER9 Raphael Moreira Zinsly via Libc-alpha
  2020-09-29 15:21 ` [PATCH v3 2/2] powerpc: Add optimized stpncpy " Raphael Moreira Zinsly via Libc-alpha
@ 2020-09-29 15:22 ` Raphael M Zinsly via Libc-alpha
  2020-10-15 15:20 ` Lucas A. M. Magalhaes via Libc-alpha
  2020-11-12 17:09 ` Tulio Magno Quites Machado Filho via Libc-alpha
  3 siblings, 0 replies; 10+ messages in thread
From: Raphael M Zinsly via Libc-alpha @ 2020-09-29 15:22 UTC (permalink / raw)
  To: libc-alpha

Benchtest output:
                             	generic_strncpy	__strncpy_power9	__strncpy_power8	__strncpy_power7	__strncpy_ppc
Length   16, n   16, alignment  1/ 1:	7.12492	2.55148	2.55079	5.71316	9.05306
Length   16, n   16, alignment  1/ 1:	6.71919	2.51696	2.56187	5.92056	9.43297
Length   16, n   16, alignment  1/ 2:	6.65909	2.53904	2.55074	5.6362	9.30194
Length   16, n   16, alignment  2/ 1:	6.50693	2.51671	2.82125	5.9298	9.18833
Length    2, n    4, alignment  7/ 2:	7.68477	2.27528	5.07192	4.8353	8.4619
Length    4, n    2, alignment  2/ 7:	6.03804	1.6644	2.32355	2.8178	6.27398
Length    2, n    4, alignment  7/ 2:	7.68944	2.31009	4.25078	4.83038	8.59554
Length    4, n    2, alignment  2/ 7:	6.04246	1.66875	2.31775	2.73826	6.8358
Length   16, n   16, alignment  2/ 2:	6.50729	2.51669	2.83075	5.91498	9.60274
Length   16, n   16, alignment  2/ 2:	6.3441	2.51684	2.82829	5.64233	9.29031
Length   16, n   16, alignment  2/ 4:	6.33989	2.51866	2.82089	5.59129	9.50426
Length   16, n   16, alignment  4/ 2:	7.88012	2.51145	3.44369	5.91774	9.50185
Length    4, n    8, alignment  6/ 4:	7.88965	2.27023	5.2189	4.67992	9.06714
Length    8, n    4, alignment  4/ 6:	5.8805	1.69238	2.67589	2.95865	7.70634
Length    4, n    8, alignment  6/ 4:	7.69107	2.29724	5.2196	4.68409	9.07751
Length    8, n    4, alignment  4/ 6:	6.33989	1.69501	2.67179	2.95862	7.75311
Length   16, n   16, alignment  3/ 3:	6.58968	2.51681	3.14295	5.92364	8.86981
Length   16, n   16, alignment  3/ 3:	6.76256	2.51385	3.14379	5.91558	9.02347
Length   16, n   16, alignment  3/ 6:	6.76734	2.53841	3.08182	5.90924	8.98558
Length   16, n   16, alignment  6/ 3:	6.67014	2.51618	4.16905	5.94761	7.81751
Length    8, n   16, alignment  5/ 6:	7.70082	2.30026	4.59182	5.41689	10.5428
Length   16, n    8, alignment  6/ 5:	5.63868	1.87873	2.32929	4.5053	5.78866
Length    8, n   16, alignment  5/ 6:	7.40013	2.2999	4.23768	5.41724	10.1649
Length   16, n    8, alignment  6/ 5:	5.63858	1.87872	2.32768	4.58045	6.02812
Length   16, n   16, alignment  4/ 4:	7.37003	2.5167	3.50594	5.91125	8.93866
Length   16, n   16, alignment  4/ 4:	7.51015	2.51684	3.58684	5.91127	8.60509
Length   16, n   16, alignment  4/ 0:	7.42056	2.51149	3.38179	5.92321	8.86607
Length   16, n   16, alignment  0/ 4:	6.6704	1.87853	2.44519	5.91475	7.68788
Length   16, n   32, alignment  4/ 0:	11.0276	3.0727	6.01877	6.9094	11.4447
Length   32, n   16, alignment  0/ 4:	6.90919	1.87852	2.45708	5.91217	6.7671
Length   16, n   32, alignment  4/ 0:	9.76588	3.07257	5.92168	6.81253	11.8936
Length   32, n   16, alignment  0/ 4:	6.90342	1.88296	2.44527	5.91673	7.68469
Length   16, n   16, alignment  5/ 5:	6.90186	2.51712	3.91963	5.91852	9.46308
Length   16, n   16, alignment  5/ 5:	6.58716	2.51626	3.94884	5.91303	9.59648
Length   16, n   16, alignment  5/ 2:	6.92421	2.52057	3.80827	5.91558	9.3486
Length   16, n   16, alignment  2/ 5:	6.50526	2.53369	2.82035	5.91729	9.065
Length   32, n   64, alignment  3/ 2:	14.0395	3.79978	6.41657	11.19	13.9713
Length   64, n   32, alignment  2/ 3:	9.85699	2.75331	3.21559	8.23056	11.4077
Length   32, n   64, alignment  3/ 2:	14.0923	3.8037	6.38851	11.4514	15.9838
Length   64, n   32, alignment  2/ 3:	9.4437	2.75344	3.21249	8.21276	13.9496
Length   16, n   16, alignment  6/ 6:	6.33989	2.51408	4.38486	5.91681	7.37203
Length   16, n   16, alignment  6/ 6:	6.76503	2.51645	4.26454	5.9103	7.87574
Length   16, n   16, alignment  6/ 4:	6.51654	2.51654	4.24635	5.91578	7.17827
Length   16, n   16, alignment  4/ 6:	7.28735	2.53335	3.54029	5.92337	8.63075
Length   64, n  128, alignment  2/ 4:	15.4973	4.98808	7.34157	11.5113	16.7688
Length  128, n   64, alignment  4/ 2:	11.6235	3.54914	4.80814	10.3103	11.6194
Length   64, n  128, alignment  2/ 4:	15.4979	5.02559	7.28236	11.5045	22.1309
Length  128, n   64, alignment  4/ 2:	11.6138	3.53841	4.80527	10.3293	19.5239
Length   16, n   16, alignment  7/ 7:	6.84212	2.51109	5.0585	5.7457	7.2307
Length   16, n   16, alignment  7/ 7:	6.86215	2.50957	5.06541	5.91726	8.55044
Length   16, n   16, alignment  7/ 6:	6.97428	2.51876	5.05053	5.92637	7.07715
Length   16, n   16, alignment  6/ 7:	7.01347	2.53448	4.38004	5.93278	7.86288
Length  128, n  256, alignment  1/ 6:	17.9407	7.92071	9.38384	16.9419	20.6065
Length  256, n  128, alignment  6/ 1:	13.3609	4.7983	7.967	12.5699	14.9996
Length  128, n  256, alignment  1/ 6:	17.9371	7.69161	9.36672	16.739	38.9048
Length  256, n  128, alignment  6/ 1:	13.3632	4.87671	7.80194	12.7028	33.9017
Length    8, n   16, alignment  0/ 0:	7.4529	2.29963	3.62737	4.22665	7.50268
Length   32, n   16, alignment  0/ 0:	6.86674	1.87853	2.45092	2.41528	7.30161
Length    8, n   16, alignment  7/ 2:	7.40103	2.29399	3.75703	5.43637	8.45285
Length   32, n   16, alignment  7/ 2:	7.72683	2.35278	5.04996	5.93629	7.18881
Length   16, n   32, alignment  0/ 0:	9.87066	3.17511	4.89448	4.41405	10.3408
Length   64, n   32, alignment  0/ 0:	8.06217	2.32926	2.94508	2.71275	8.11769
Length   16, n   32, alignment  6/ 4:	9.50052	3.07627	6.37858	9.46793	10.1393
Length   64, n   32, alignment  6/ 4:	9.7197	2.75154	4.47331	7.73667	9.26558
Length   32, n   64, alignment  0/ 0:	10.9157	3.79013	4.83041	4.97713	11.5486
Length  128, n   64, alignment  0/ 0:	9.28057	3.15788	3.5178	4.23091	11.0874
Length   32, n   64, alignment  5/ 6:	14.0472	3.8515	7.26431	10.1343	12.8115
Length  128, n   64, alignment  5/ 6:	11.5493	3.5659	5.05553	9.1005	13.4053
Length   64, n  128, alignment  0/ 0:	12.0056	4.94615	6.45436	7.06235	14.4743
Length  256, n  128, alignment  0/ 0:	7.87506	4.49546	6.4492	5.38877	12.1437
Length   64, n  128, alignment  4/ 0:	12.4174	4.99773	7.73749	11.1452	16.1494
Length  256, n  128, alignment  4/ 0:	12.2601	4.88446	6.95948	13.3726	16.7583
Length  128, n  256, alignment  0/ 0:	13.9215	7.51155	7.87942	8.79876	20.4226
Length  512, n  256, alignment  0/ 0:	10.5798	6.77319	8.79757	9.03297	20.0197
Length  128, n  256, alignment  3/ 2:	18.0213	7.57884	9.89436	18.7839	20.5445
Length  512, n  256, alignment  3/ 2:	16.9909	7.07957	9.9271	23.2621	25.2442
Length  256, n  512, alignment  0/ 0:	17.6825	12.3074	13.3245	13.9381	28.7687
Length 1024, n  512, alignment  0/ 0:	16.3837	10.8306	16.6999	16.6797	38.0562
Length  256, n  512, alignment  2/ 4:	23.1953	13.0445	14.324	26.8918	30.2049
Length 1024, n  512, alignment  2/ 4:	25.4059	12.0938	17.2483	41.4883	47.2025
Length  512, n 1024, alignment  0/ 0:	21.029	17.1782	19.4815	21.0035	43.2361
Length 2048, n 1024, alignment  0/ 0:	28.5154	19.3221	36.9624	35.482	68.4792
Length  512, n 1024, alignment  1/ 6:	32.4103	17.9272	21.5421	46.6099	55.059
Length 2048, n 1024, alignment  1/ 6:	43.0516	21.6315	37.8787	77.7889	83.4195


* Re: [PATCH v3 2/2] powerpc: Add optimized stpncpy for POWER9
  2020-09-29 15:21 ` [PATCH v3 2/2] powerpc: Add optimized stpncpy " Raphael Moreira Zinsly via Libc-alpha
@ 2020-09-29 15:23   ` Raphael M Zinsly via Libc-alpha
  2020-09-30 13:42   ` Adhemerval Zanella via Libc-alpha
  2020-11-12 17:12   ` Tulio Magno Quites Machado Filho via Libc-alpha
  2 siblings, 0 replies; 10+ messages in thread
From: Raphael M Zinsly via Libc-alpha @ 2020-09-29 15:23 UTC (permalink / raw)
  To: libc-alpha

                             	generic_stpncpy	__stpncpy_power9	__stpncpy_power8	__stpncpy_power7	__stpncpy_ppc
Length   16, n   16, alignment  1/ 1:	7.04141	2.66905	2.71071	5.33257	9.45193
Length   16, n   16, alignment  1/ 1:	7.01728	2.54349	2.70763	5.35555	9.40601
Length   16, n   16, alignment  1/ 2:	6.76331	2.56894	2.70649	5.28715	9.19534
Length   16, n   16, alignment  2/ 1:	6.41285	2.52953	2.86392	5.25868	9.24343
Length    2, n    4, alignment  7/ 2:	7.76627	2.36037	4.34749	4.05757	8.45648
Length    4, n    2, alignment  2/ 7:	6.15257	1.734	2.66932	2.81884	6.61486
Length    2, n    4, alignment  7/ 2:	7.69004	2.34779	3.90224	4.08693	8.51617
Length    4, n    2, alignment  2/ 7:	6.14888	1.73738	2.66929	2.81777	6.39066
Length   16, n   16, alignment  2/ 2:	7.25765	2.5434	2.8759	4.7084	9.43171
Length   16, n   16, alignment  2/ 2:	6.41274	2.52681	2.87939	5.2894	9.2505
Length   16, n   16, alignment  2/ 4:	6.74797	2.6683	2.82869	5.27608	9.43391
Length   16, n   16, alignment  4/ 2:	7.6281	2.54368	3.52982	5.26862	8.7369
Length    4, n    8, alignment  6/ 4:	7.79233	2.33099	5.64785	4.21131	9.03
Length    8, n    4, alignment  4/ 6:	6.01824	1.73782	2.81779	2.81777	7.90004
Length    4, n    8, alignment  6/ 4:	7.94851	2.33098	4.90456	3.75698	8.89379
Length    8, n    4, alignment  4/ 6:	6.0183	1.73715	2.81777	2.41521	7.83867
Length   16, n   16, alignment  3/ 3:	6.93178	2.66854	3.22004	5.31673	9.09542
Length   16, n   16, alignment  3/ 3:	6.99998	2.67084	3.22862	5.48294	9.2366
Length   16, n   16, alignment  3/ 6:	7.14689	2.6615	3.21888	5.25964	9.1277
Length   16, n   16, alignment  6/ 3:	6.46654	2.65885	4.57873	5.25391	7.75507
Length    8, n   16, alignment  5/ 6:	7.37286	2.33316	3.92971	4.50331	10.1496
Length   16, n    8, alignment  6/ 5:	5.73663	1.87991	2.633	4.09291	5.91732
Length    8, n   16, alignment  5/ 6:	7.77512	2.33361	3.67636	4.50091	10.147
Length   16, n    8, alignment  6/ 5:	5.73662	1.88001	2.57119	4.10496	6.15016
Length   16, n   16, alignment  4/ 4:	7.55115	2.65827	3.5838	5.25628	8.81586
Length   16, n   16, alignment  4/ 4:	7.61232	2.66851	3.62508	5.32044	8.73914
Length   16, n   16, alignment  4/ 0:	7.54588	2.54345	3.48987	5.27812	8.77989
Length   16, n   16, alignment  0/ 4:	6.82387	1.88425	2.41569	5.27746	7.19847
Length   16, n   32, alignment  4/ 0:	10.1135	3.10868	6.01894	6.66693	11.7681
Length   32, n   16, alignment  0/ 4:	6.93527	1.8793	2.4162	5.29155	6.50752
Length   16, n   32, alignment  4/ 0:	10.1565	3.16134	5.78062	6.81425	11.2226
Length   32, n   16, alignment  0/ 4:	6.76758	1.87928	2.41649	5.30161	7.22291
Length   16, n   16, alignment  5/ 5:	7.22753	2.56593	4.22659	5.30415	9.86703
Length   16, n   16, alignment  5/ 5:	6.76256	2.54348	4.23108	5.43866	9.53557
Length   16, n   16, alignment  5/ 2:	7.23702	2.52833	4.23011	5.26711	9.52126
Length   16, n   16, alignment  2/ 5:	6.68084	2.66311	2.84314	5.2709	9.24495
Length   32, n   64, alignment  3/ 2:	12.4989	3.84198	6.40671	10.4545	14.317
Length   64, n   32, alignment  2/ 3:	10.1464	2.78457	3.17933	7.67569	12.4356
Length   32, n   64, alignment  3/ 2:	12.4991	3.83968	7.08471	10.451	15.8984
Length   64, n   32, alignment  2/ 3:	9.61285	2.78401	3.18834	7.66606	13.9602
Length   16, n   16, alignment  6/ 6:	7.24557	2.66839	4.55951	5.25563	7.7369
Length   16, n   16, alignment  6/ 6:	6.76327	2.65836	4.5127	5.63264	7.80333
Length   16, n   16, alignment  6/ 4:	7.15127	2.54397	4.57355	5.32957	7.51005
Length   16, n   16, alignment  4/ 6:	7.51733	2.5615	3.67299	5.31244	8.73893
Length   64, n  128, alignment  2/ 4:	14.0745	4.98021	7.33878	11.4384	17.1572
Length  128, n   64, alignment  4/ 2:	11.7179	3.59088	4.89414	10.2021	11.6637
Length   64, n  128, alignment  2/ 4:	14.0841	4.99105	7.28507	11.4365	21.7537
Length  128, n   64, alignment  4/ 2:	11.7142	3.59211	4.83864	9.87632	19.4664
Length   16, n   16, alignment  7/ 7:	7.12738	2.53533	5.62213	5.30017	7.90888
Length   16, n   16, alignment  7/ 7:	6.82635	2.53529	5.60694	5.27111	8.88482
Length   16, n   16, alignment  7/ 6:	6.9193	2.54376	5.48117	5.24785	8.04263
Length   16, n   16, alignment  6/ 7:	6.89261	2.55078	4.51003	5.32471	7.81768
Length  128, n  256, alignment  1/ 6:	16.2686	7.68983	9.35727	16.2843	19.8458
Length  256, n  128, alignment  6/ 1:	13.4356	4.94899	7.94404	15.0122	15.0231
Length  128, n  256, alignment  1/ 6:	16.2511	7.69025	9.35528	16.2859	37.8453
Length  256, n  128, alignment  6/ 1:	13.4332	4.94446	8.02757	12.2879	34.1949
Length    8, n   16, alignment  0/ 0:	7.26102	2.33285	3.75702	3.85762	7.72869
Length   32, n   16, alignment  0/ 0:	7.049	1.88689	2.42187	2.41537	6.58192
Length    8, n   16, alignment  7/ 2:	8.09344	2.31269	3.67403	4.31612	8.21018
Length   32, n   16, alignment  7/ 2:	6.822	2.45733	5.59593	5.33252	6.53496
Length   16, n   32, alignment  0/ 0:	9.99648	3.36432	4.70547	4.55746	10.148
Length   64, n   32, alignment  0/ 0:	7.89408	2.4309	2.58854	2.70519	8.89171
Length   16, n   32, alignment  6/ 4:	9.31969	3.15547	7.24937	9.47362	10.0091
Length   64, n   32, alignment  6/ 4:	9.91687	2.78234	4.64259	7.00062	10.5972
Length   32, n   64, alignment  0/ 0:	11.0651	3.81484	4.4379	4.91663	11.8363
Length  128, n   64, alignment  0/ 0:	9.25821	3.20129	3.55296	4.22664	9.63556
Length   32, n   64, alignment  5/ 6:	12.5097	3.83422	7.29892	9.09849	13.2517
Length  128, n   64, alignment  5/ 6:	11.6165	3.60246	5.35542	8.90704	13.3207
Length   64, n  128, alignment  0/ 0:	12.372	4.91681	5.41951	6.91629	15.0813
Length  256, n  128, alignment  0/ 0:	7.93075	4.5247	6.29502	5.58357	12.5963
Length   64, n  128, alignment  4/ 0:	12.569	5.00092	7.25225	10.4764	15.9366
Length  256, n  128, alignment  4/ 0:	12.2963	4.90654	7.57109	12.0953	16.7672
Length  128, n  256, alignment  0/ 0:	13.9015	7.34814	7.88738	9.15353	19.4141
Length  512, n  256, alignment  0/ 0:	10.6865	6.52749	9.15011	9.71701	20.9021
Length  128, n  256, alignment  3/ 2:	16.3681	7.53318	9.89911	18.5309	20.8335
Length  512, n  256, alignment  3/ 2:	17.0249	7.10063	10.1568	22.6063	25.1262
Length  256, n  512, alignment  0/ 0:	16.5169	12.3406	13.6056	14.5875	29.2826
Length 1024, n  512, alignment  0/ 0:	16.3619	10.8422	16.7061	17.1025	37.7908
Length  256, n  512, alignment  2/ 4:	21.162	12.9621	14.3306	26.0856	30.0397
Length 1024, n  512, alignment  2/ 4:	25.5543	11.9978	17.7424	42.4293	47.7581
Length  512, n 1024, alignment  0/ 0:	20.5504	17.3132	19.5751	21.3633	42.7215
Length 2048, n 1024, alignment  0/ 0:	28.5197	19.3708	37.1801	35.3122	67.9792
Length  512, n 1024, alignment  1/ 6:	29.9875	17.7823	22.3228	47.3516	51.3697
Length 2048, n 1024, alignment  1/ 6:	42.9443	21.6004	38.7767	78.1732	83.9784


* Re: [PATCH v3 2/2] powerpc: Add optimized stpncpy for POWER9
  2020-09-29 15:21 ` [PATCH v3 2/2] powerpc: Add optimized stpncpy " Raphael Moreira Zinsly via Libc-alpha
  2020-09-29 15:23   ` Raphael M Zinsly via Libc-alpha
@ 2020-09-30 13:42   ` Adhemerval Zanella via Libc-alpha
  2020-09-30 14:21     ` Raphael M Zinsly via Libc-alpha
  2020-11-12 17:12   ` Tulio Magno Quites Machado Filho via Libc-alpha
  2 siblings, 1 reply; 10+ messages in thread
From: Adhemerval Zanella via Libc-alpha @ 2020-09-30 13:42 UTC (permalink / raw)
  To: libc-alpha, Raphael M Zinsly



On 29/09/2020 12:21, Raphael Moreira Zinsly via Libc-alpha wrote:
> Add stpncpy support into the POWER9 strncpy.

The benchmark numbers you provided [1] seem to show it is slightly worse than
the generic_strncpy, which uses the same strategy as string/strncpy.c
(and would use VSX instructions through memset/memcpy).  Did you compare this
optimization against an implementation that just calls the power8/9 memset/memcpy
instead?

It should result in a smaller implementation, which reduces i-cache footprint, and
the code is much simpler and more maintainable.  The same applies to stpncpy.

I tried to convince Intel developers that such micro-optimizations are not
really a gain and that we should instead optimize only a handful of string
operations (memcpy/memset/etc.) and use composable implementations
(as the generic strncpy does).  It still resulted in 1a153e47fcc, but I think we
might do better for powerpc.

[1] https://sourceware.org/pipermail/libc-alpha/2020-September/118049.html
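
For reference, the composable approach suggested here, which is essentially
what the generic string/strncpy.c strategy amounts to, is roughly the
following (a hedged C sketch, not the actual glibc source):

#include <string.h>

/* Compose strncpy out of already-optimized primitives.  Note the input
   is traversed twice, once by strnlen and once by memcpy, which is the
   trade-off discussed later in the thread.  */
char *
strncpy_composed (char *dest, const char *src, size_t n)
{
  size_t len = strnlen (src, n);        /* bytes to copy, at most n */
  memcpy (dest, src, len);
  memset (dest + len, '\0', n - len);   /* zero padding (may be empty) */
  return dest;
}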


* Re: [PATCH v3 2/2] powerpc: Add optimized stpncpy for POWER9
  2020-09-30 13:42   ` Adhemerval Zanella via Libc-alpha
@ 2020-09-30 14:21     ` Raphael M Zinsly via Libc-alpha
  2020-09-30 14:46       ` Adhemerval Zanella via Libc-alpha
  0 siblings, 1 reply; 10+ messages in thread
From: Raphael M Zinsly via Libc-alpha @ 2020-09-30 14:21 UTC (permalink / raw)
  To: Adhemerval Zanella, libc-alpha

Hi Adhemerval,

On 30/09/2020 10:42, Adhemerval Zanella wrote:
> 
> 
> On 29/09/2020 12:21, Raphael Moreira Zinsly via Libc-alpha wrote:
>> Add stpncpy support into the POWER9 strncpy.
> 
> The benchmark numbers you provided [1] seem to show it is slightly worse than
> the generic_strncpy, which uses the same strategy as string/strncpy.c
> (and would use VSX instructions through memset/memcpy).

My implementation is always better than the generic_strncpy, almost
three times better on average. And it calls memset as well.

Are you talking about __strncpy_ppc? For some reason it is using 
strnlen_ppc instead of the strnlen_power8, but I didn't touch it.

> Did you compare this
> optimization against an implementation that just calls the power8/9 memset/memcpy
> instead?
> 

Not sure if I understand, isn't that generic_strncpy and strncpy_ppc?


> It should result in a smaller implementation, which reduces i-cache footprint, and
> the code is much simpler and more maintainable.  The same applies to stpncpy.
>
> I tried to convince Intel developers that such micro-optimizations are not
> really a gain and that we should instead optimize only a handful of string
> operations (memcpy/memset/etc.) and use composable implementations
> (as the generic strncpy does).  It still resulted in 1a153e47fcc, but I think we
> might do better for powerpc.
> 
> [1] https://sourceware.org/pipermail/libc-alpha/2020-September/118049.html
> 

Best Regards,
-- 
Raphael Moreira Zinsly
IBM
Linux on Power Toolchain


* Re: [PATCH v3 2/2] powerpc: Add optimized stpncpy for POWER9
  2020-09-30 14:21     ` Raphael M Zinsly via Libc-alpha
@ 2020-09-30 14:46       ` Adhemerval Zanella via Libc-alpha
  0 siblings, 0 replies; 10+ messages in thread
From: Adhemerval Zanella via Libc-alpha @ 2020-09-30 14:46 UTC (permalink / raw)
  To: Raphael M Zinsly, libc-alpha



On 30/09/2020 11:21, Raphael M Zinsly wrote:
> Hi Adhemerval,
> 
> On 30/09/2020 10:42, Adhemerval Zanella wrote:
>>
>>
>> On 29/09/2020 12:21, Raphael Moreira Zinsly via Libc-alpha wrote:
>>> Add stpncpy support into the POWER9 strncpy.
>>
>> The benchmark numbers you provided [1] seem to show it is slightly worse than
>> the generic_strncpy, which uses the same strategy as string/strncpy.c
>> (and would use VSX instructions through memset/memcpy).
> 
> My implementation is always better than the generic_strncpy, almost three times better on average. And it calls memset as well.
> 
> Are you talking about __strncpy_ppc? For some reason it is using strnlen_ppc instead of the strnlen_power8, but I didn't touch it.
> 
>> Did you compare this
>> optimization against an implementation that just calls the power8/9 memset/memcpy
>> instead?
>>
> 
> Not sure if I understand, isn't that generic_strncpy and strncpy_ppc?


Right, I misread the benchmark.  I also tested my own suggestion on a POWER9
from the GCC compile farm, and it seems that although it is slightly faster than
the power7 variant, it does not really beat power8 (as expected, since it calls
strnlen and then memcpy/memset and accesses the input twice).

I do not really oppose it, and it is up to the arch maintainer, but I still think
these micro-optimizations tend to add extra maintenance burden and i-cache
pressure that the microbenchmark does not really capture.

> 
> 
>> It should result in a smaller implementation, which reduces i-cache footprint, and
>> the code is much simpler and more maintainable.  The same applies to stpncpy.
>>
>> I tried to convince Intel developers that such micro-optimizations are not
>> really a gain and that we should instead optimize only a handful of string
>> operations (memcpy/memset/etc.) and use composable implementations
>> (as the generic strncpy does).  It still resulted in 1a153e47fcc, but I think we
>> might do better for powerpc.
>>
>> [1] https://sourceware.org/pipermail/libc-alpha/2020-September/118049.html
>>
> 
> Best Regards,


* Re: [PATCH v3 1/2] powerpc: Add optimized strncpy for POWER9
  2020-09-29 15:21 [PATCH v3 1/2] powerpc: Add optimized strncpy for POWER9 Raphael Moreira Zinsly via Libc-alpha
  2020-09-29 15:21 ` [PATCH v3 2/2] powerpc: Add optimized stpncpy " Raphael Moreira Zinsly via Libc-alpha
  2020-09-29 15:22 ` [PATCH v3 1/2] powerpc: Add optimized strncpy " Raphael M Zinsly via Libc-alpha
@ 2020-10-15 15:20 ` Lucas A. M. Magalhaes via Libc-alpha
  2020-11-12 17:09 ` Tulio Magno Quites Machado Filho via Libc-alpha
  3 siblings, 0 replies; 10+ messages in thread
From: Lucas A. M. Magalhaes via Libc-alpha @ 2020-10-15 15:20 UTC (permalink / raw)
  To: Raphael Moreira Zinsly, libc-alpha; +Cc: murphyp, pc, Raphael Moreira Zinsly

Hi Raphael,

Thanks for the patch. All tests passed on a P9.

LGTM.

---
Lucas A. M. Magalhães


* Re: [PATCH v3 1/2] powerpc: Add optimized strncpy for POWER9
  2020-09-29 15:21 [PATCH v3 1/2] powerpc: Add optimized strncpy for POWER9 Raphael Moreira Zinsly via Libc-alpha
                   ` (2 preceding siblings ...)
  2020-10-15 15:20 ` Lucas A. M. Magalhaes via Libc-alpha
@ 2020-11-12 17:09 ` Tulio Magno Quites Machado Filho via Libc-alpha
  3 siblings, 0 replies; 10+ messages in thread
From: Tulio Magno Quites Machado Filho via Libc-alpha @ 2020-11-12 17:09 UTC (permalink / raw)
  To: Raphael Moreira Zinsly, libc-alpha; +Cc: murphyp, pc

Raphael Moreira Zinsly <rzinsly@linux.ibm.com> writes:

> Similar to the strcpy P9 optimization, this version uses VSX to improve
> performance.

Remember to add the Reviewed-by: lines you collected in previous versions. ;-)

> +#define FRAMESIZE (FRAME_MIN_SIZE+48)

I think you actually meant to use FRAME_MIN_SIZE+8 here.
Fixed.

> +L(zero_padding_end):
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	v18,r11,r10	/* Partial store  */
> +	blr
> +
> +	.align	4
> +L(zero_padding_memset):
> +	std	r30,-8(r1)   /* Save r30 on the stack.  */

This requires adding CFI:

	cfi_offset(r30, -8)

Done.

> +#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
> +#define STRNCPY __strncpy_power9
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +
> +/* memset is used to pad the end of the string.  */
> +#define MEMSET __memset_power8
> +#ifdef SHARED
> +#define MEMSET_is_local

Wrong indentation in the previous lines.  Fixed.

I wonder if we can improve this and stop depending on the list of memset
implementations in this file.
Anyway, this isn't new and is future work.

> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> index 7bacf28aca..3f2108ddae 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> @@ -28,11 +28,19 @@
>  extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
>  extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
>  extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
> +# ifdef __LITTLE_ENDIAN__
> +extern __typeof (strncpy) __strncpy_power9 attribute_hidden;
> +# endif
>  # undef strncpy
>  
>  /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
>   ifunc symbol properly. */
>  libc_ifunc_redirected (__redirect_strncpy, strncpy,
> +# ifdef __LITTLE_ENDIAN__
> +		     (hwcap2 & PPC_FEATURE2_ARCH_3_00) &&
> +		     (hwcap & PPC_FEATURE_HAS_VSX)
> +		     ? __strncpy_power9 :

Wrong indentation here.  Fixed.
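
For readers following the review, the selection order in that hunk is just a
chain of hwcap checks.  Schematically, as standalone C (the real selector is
generated by the libc_ifunc_redirected macro; the power7/ppc fallbacks follow
the pre-existing code in this file, and the hwcap bit macros are assumed to
come from the PowerPC <sys/auxv.h> headers):

#include <stddef.h>
#include <sys/auxv.h>   /* getauxval, AT_HWCAP/AT_HWCAP2, PPC_FEATURE_* bits */

typedef char *(*strncpy_fn) (char *, const char *, size_t);

/* glibc-internal implementations, declared here only for illustration.  */
extern char *__strncpy_power9 (char *, const char *, size_t);
extern char *__strncpy_power8 (char *, const char *, size_t);
extern char *__strncpy_power7 (char *, const char *, size_t);
extern char *__strncpy_ppc (char *, const char *, size_t);

/* Schematic of the IFUNC priority order reviewed above.  */
static strncpy_fn
select_strncpy (void)
{
  unsigned long hwcap  = getauxval (AT_HWCAP);
  unsigned long hwcap2 = getauxval (AT_HWCAP2);

  if ((hwcap2 & PPC_FEATURE2_ARCH_3_00) && (hwcap & PPC_FEATURE_HAS_VSX))
    return __strncpy_power9;          /* POWER9: ISA 3.0 plus VSX */
  if (hwcap2 & PPC_FEATURE2_ARCH_2_07)
    return __strncpy_power8;
  if (hwcap & PPC_FEATURE_HAS_VSX)
    return __strncpy_power7;
  return __strncpy_ppc;
}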

Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>

Pushed as b9d83bf3eb57.

Thanks!

-- 
Tulio Magno


* Re: [PATCH v3 2/2] powerpc: Add optimized stpncpy for POWER9
  2020-09-29 15:21 ` [PATCH v3 2/2] powerpc: Add optimized stpncpy " Raphael Moreira Zinsly via Libc-alpha
  2020-09-29 15:23   ` Raphael M Zinsly via Libc-alpha
  2020-09-30 13:42   ` Adhemerval Zanella via Libc-alpha
@ 2020-11-12 17:12   ` Tulio Magno Quites Machado Filho via Libc-alpha
  2 siblings, 0 replies; 10+ messages in thread
From: Tulio Magno Quites Machado Filho via Libc-alpha @ 2020-11-12 17:12 UTC (permalink / raw)
  To: Raphael Moreira Zinsly, libc-alpha; +Cc: murphyp, Raphael Moreira Zinsly, pc

Raphael Moreira Zinsly <rzinsly@linux.ibm.com> writes:

> Add stpncpy support into the POWER9 strncpy.

Same reminder for Reviewed-by.

> +#define MEMSET __memset_power8
> +#ifdef SHARED
> +#define MEMSET_is_local

Wrong indentation here.  Fixed.

Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>

Pushed as 7beee7b39ade.

Thanks!

-- 
Tulio Magno


end of thread

Thread overview: 10+ messages
2020-09-29 15:21 [PATCH v3 1/2] powerpc: Add optimized strncpy for POWER9 Raphael Moreira Zinsly via Libc-alpha
2020-09-29 15:21 ` [PATCH v3 2/2] powerpc: Add optimized stpncpy " Raphael Moreira Zinsly via Libc-alpha
2020-09-29 15:23   ` Raphael M Zinsly via Libc-alpha
2020-09-30 13:42   ` Adhemerval Zanella via Libc-alpha
2020-09-30 14:21     ` Raphael M Zinsly via Libc-alpha
2020-09-30 14:46       ` Adhemerval Zanella via Libc-alpha
2020-11-12 17:12   ` Tulio Magno Quites Machado Filho via Libc-alpha
2020-09-29 15:22 ` [PATCH v3 1/2] powerpc: Add optimized strncpy " Raphael M Zinsly via Libc-alpha
2020-10-15 15:20 ` Lucas A. M. Magalhaes via Libc-alpha
2020-11-12 17:09 ` Tulio Magno Quites Machado Filho via Libc-alpha
