* [PATCH v3 1/2] powerpc: Add optimized strncpy for POWER9
@ 2020-09-29 15:21 Raphael Moreira Zinsly via Libc-alpha
2020-09-29 15:21 ` [PATCH v3 2/2] powerpc: Add optimized stpncpy " Raphael Moreira Zinsly via Libc-alpha
` (3 more replies)
0 siblings, 4 replies; 10+ messages in thread
From: Raphael Moreira Zinsly via Libc-alpha @ 2020-09-29 15:21 UTC (permalink / raw)
To: libc-alpha; +Cc: murphyp, Raphael Moreira Zinsly, pc
Changes since v2:
- Check for VSX support.
- Calls memset for large numbers when padding with zeros.
---8<---
Similar to the strcpy P9 optimization, this version uses VSX to improve
performance.
---
sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 343 ++++++++++++++++++
sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +-
.../powerpc64/multiarch/ifunc-impl-list.c | 6 +
.../powerpc64/multiarch/strncpy-power9.S | 32 ++
sysdeps/powerpc/powerpc64/multiarch/strncpy.c | 8 +
5 files changed, 390 insertions(+), 1 deletion(-)
create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strncpy.S
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
new file mode 100644
index 0000000000..67cb648c65
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
@@ -0,0 +1,343 @@
+/* Optimized strncpy implementation for POWER9 LE.
+ Copyright (C) 2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+# ifndef STRNCPY
+# define FUNC_NAME strncpy
+# else
+# define FUNC_NAME STRNCPY
+# endif
+
+#ifndef MEMSET
+/* For builds without IFUNC support, local calls should be made to internal
+ GLIBC symbol (created by libc_hidden_builtin_def). */
+# ifdef SHARED
+# define MEMSET_is_local
+# define MEMSET __GI_memset
+# else
+# define MEMSET memset
+# endif
+#endif
+
+#define FRAMESIZE (FRAME_MIN_SIZE+48)
+
+/* Implements the function
+
+ char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+ The implementation can load bytes past a null terminator, but only
+ up to the next 16-byte aligned address, so it never crosses a page. */
+
+.machine power9
+#ifdef MEMSET_is_local
+ENTRY_TOCLESS (FUNC_NAME, 4)
+#else
+ENTRY (FUNC_NAME, 4)
+#endif
+ CALL_MCOUNT 2
+
+ /* NULL string optimizations */
+ cmpdi r5, 0
+ beqlr
+
+ lbz r0,0(r4)
+ stb r0,0(r3)
+ addi r11,r3,1
+ addi r5,r5,-1
+ vspltisb v18,0 /* Zeroes in v18 */
+ cmpdi r0,0
+ beq L(zero_padding)
+
+ /* Empty/1-byte string optimization */
+ cmpdi r5,0
+ beqlr
+
+ addi r4,r4,1
+ neg r7,r4
+ rldicl r9,r7,0,60 /* How many bytes to get source 16B aligned? */
+
+ /* Get source 16B aligned */
+ lvx v0,0,r4
+ lvsr v1,0,r4
+ vperm v0,v18,v0,v1
+
+ vcmpequb v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
+ vctzlsbb r7,v6 /* Number of trailing zeroes */
+ addi r8,r7,1 /* Add null terminator */
+
+ /* r8 = bytes including null
+ r9 = bytes to get source 16B aligned
+ if r8 > r9
+ no null, copy r9 bytes
+ else
+ there is a null, copy r8 bytes and return. */
+ cmpld r8,r9
+ bgt L(no_null)
+
+ cmpld cr6,r8,r5 /* r8 <= n? */
+ ble cr6,L(null)
+
+ sldi r10,r5,56 /* stxvl wants size in top 8 bits */
+ stxvl 32+v0,r11,r10 /* Partial store */
+
+ blr
+
+L(null):
+ sldi r10,r8,56 /* stxvl wants size in top 8 bits */
+ stxvl 32+v0,r11,r10 /* Partial store */
+
+ add r11,r11,r8
+ sub r5,r5,r8
+ b L(zero_padding)
+
+L(no_null):
+ cmpld r9,r5 /* Check if length was reached. */
+ bge L(n_tail1)
+
+ sldi r10,r9,56 /* stxvl wants size in top 8 bits */
+ stxvl 32+v0,r11,r10 /* Partial store */
+
+ add r4,r4,r9
+ add r11,r11,r9
+ sub r5,r5,r9
+
+L(loop):
+ cmpldi cr6,r5,64 /* Check if length was reached. */
+ ble cr6,L(final_loop)
+
+ lxv 32+v0,0(r4)
+ vcmpequb. v6,v0,v18 /* Any zero bytes? */
+ bne cr6,L(prep_tail1)
+
+ lxv 32+v1,16(r4)
+ vcmpequb. v6,v1,v18 /* Any zero bytes? */
+ bne cr6,L(prep_tail2)
+
+ lxv 32+v2,32(r4)
+ vcmpequb. v6,v2,v18 /* Any zero bytes? */
+ bne cr6,L(prep_tail3)
+
+ lxv 32+v3,48(r4)
+ vcmpequb. v6,v3,v18 /* Any zero bytes? */
+ bne cr6,L(prep_tail4)
+
+ stxv 32+v0,0(r11)
+ stxv 32+v1,16(r11)
+ stxv 32+v2,32(r11)
+ stxv 32+v3,48(r11)
+
+ addi r4,r4,64
+ addi r11,r11,64
+ addi r5,r5,-64
+
+ b L(loop)
+
+L(final_loop):
+ cmpldi cr5,r5,16
+ lxv 32+v0,0(r4)
+ vcmpequb. v6,v0,v18 /* Any zero bytes? */
+ ble cr5,L(prep_n_tail1)
+ bne cr6,L(count_tail1)
+ addi r5,r5,-16
+
+ cmpldi cr5,r5,16
+ lxv 32+v1,16(r4)
+ vcmpequb. v6,v1,v18 /* Any zero bytes? */
+ ble cr5,L(prep_n_tail2)
+ bne cr6,L(count_tail2)
+ addi r5,r5,-16
+
+ cmpldi cr5,r5,16
+ lxv 32+v2,32(r4)
+ vcmpequb. v6,v2,v18 /* Any zero bytes? */
+ ble cr5,L(prep_n_tail3)
+ bne cr6,L(count_tail3)
+ addi r5,r5,-16
+
+ lxv 32+v3,48(r4)
+ vcmpequb. v6,v3,v18 /* Any zero bytes? */
+ beq cr6,L(n_tail4)
+
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+ cmpld r8,r5 /* r8 < n? */
+ blt L(tail4)
+
+L(n_tail4):
+ stxv 32+v0,0(r11)
+ stxv 32+v1,16(r11)
+ stxv 32+v2,32(r11)
+ sldi r10,r5,56 /* stxvl wants size in top 8 bits */
+ addi r11,r11,48 /* Offset */
+ stxvl 32+v3,r11,r10 /* Partial store */
+ blr
+
+L(prep_n_tail1):
+ beq cr6,L(n_tail1) /* Any zero bytes? */
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+ cmpld r8,r5 /* r8 < n? */
+ blt L(tail1)
+
+L(n_tail1):
+ sldi r10,r5,56 /* stxvl wants size in top 8 bits */
+ stxvl 32+v0,r11,r10 /* Partial store */
+ blr
+
+L(prep_n_tail2):
+ beq cr6,L(n_tail2) /* Any zero bytes? */
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+ cmpld r8,r5 /* r8 < n? */
+ blt L(tail2)
+
+L(n_tail2):
+ stxv 32+v0,0(r11)
+ sldi r10,r5,56 /* stxvl wants size in top 8 bits */
+ addi r11,r11,16 /* offset */
+ stxvl 32+v1,r11,r10 /* Partial store */
+ blr
+
+L(prep_n_tail3):
+ beq cr6,L(n_tail3) /* Any zero bytes? */
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+ cmpld r8,r5 /* r8 < n? */
+ blt L(tail3)
+
+L(n_tail3):
+ stxv 32+v0,0(r11)
+ stxv 32+v1,16(r11)
+ sldi r10,r5,56 /* stxvl wants size in top 8 bits */
+ addi r11,r11,32 /* Offset */
+ stxvl 32+v2,r11,r10 /* Partial store */
+ blr
+
+L(prep_tail1):
+L(count_tail1):
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+L(tail1):
+ addi r9,r8,1 /* Add null terminator */
+ sldi r10,r9,56 /* stxvl wants size in top 8 bits */
+ stxvl 32+v0,r11,r10 /* Partial store */
+ add r11,r11,r9
+ sub r5,r5,r9
+ b L(zero_padding)
+
+L(prep_tail2):
+ addi r5,r5,-16
+L(count_tail2):
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+L(tail2):
+ addi r9,r8,1 /* Add null terminator */
+ stxv 32+v0,0(r11)
+ sldi r10,r9,56 /* stxvl wants size in top 8 bits */
+ addi r11,r11,16 /* offset */
+ stxvl 32+v1,r11,r10 /* Partial store */
+ add r11,r11,r9
+ sub r5,r5,r9
+ b L(zero_padding)
+
+L(prep_tail3):
+ addi r5,r5,-32
+L(count_tail3):
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+L(tail3):
+ addi r9,r8,1 /* Add null terminator */
+ stxv 32+v0,0(r11)
+ stxv 32+v1,16(r11)
+ sldi r10,r9,56 /* stxvl wants size in top 8 bits */
+ addi r11,r11,32 /* offset */
+ stxvl 32+v2,r11,r10 /* Partial store */
+ add r11,r11,r9
+ sub r5,r5,r9
+ b L(zero_padding)
+
+L(prep_tail4):
+ addi r5,r5,-48
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+L(tail4):
+ addi r9,r8,1 /* Add null terminator */
+ stxv 32+v0,0(r11)
+ stxv 32+v1,16(r11)
+ stxv 32+v2,32(r11)
+ sldi r10,r9,56 /* stxvl wants size in top 8 bits */
+ addi r11,r11,48 /* offset */
+ stxvl 32+v3,r11,r10 /* Partial store */
+ add r11,r11,r9
+ sub r5,r5,r9
+
+/* Pad the remainder of dest with null bytes: r11 points at the first byte
+ to clear and r5 holds the number of bytes left. For large counts memset
+ performs better; the 255 threshold was chosen through experimentation. */
+L(zero_padding):
+ cmpldi r5,255
+ bge L(zero_padding_memset)
+
+L(zero_padding_loop):
+ cmpldi cr6,r5,16 /* Check if length was reached. */
+ ble cr6,L(zero_padding_end)
+
+ stxv 32+v18,0(r11) /* v18 (all zeroes) is VSR 32+18 */
+ addi r11,r11,16
+ addi r5,r5,-16
+
+ b L(zero_padding_loop)
+
+L(zero_padding_end):
+ sldi r10,r5,56 /* stxvl wants size in top 8 bits */
+ stxvl 32+v18,r11,r10 /* Partial store of the zero vector */
+ blr
+
+ .align 4
+L(zero_padding_memset):
+ std r30,-8(r1) /* Save r30 on the stack. */
+ mr r30,r3 /* Save the return value of strncpy. */
+ /* Prepare the call to memset. */
+ mr r3,r11 /* Pointer to the area to be zero-filled. */
+ li r4,0 /* Byte to be written (zero). */
+
+ /* We delayed the creation of the stack frame, as well as the saving of
+ the link register, because only at this point, we are sure that
+ doing so is actually needed. */
+
+ /* Save the link register. */
+ mflr r0
+ std r0,16(r1)
+
+ /* Create the stack frame. */
+ stdu r1,-FRAMESIZE(r1)
+ cfi_adjust_cfa_offset(FRAMESIZE)
+ cfi_offset(lr, 16)
+
+ bl MEMSET
+#ifndef MEMSET_is_local
+ nop
+#endif
+
+ ld r0,FRAMESIZE+16(r1)
+
+ mr r3,r30 /* Restore the return value of strncpy, i.e.:
+ dest. */
+ ld r30,FRAMESIZE-8(r1) /* Restore r30. */
+ /* Restore the stack frame. */
+ addi r1,r1,FRAMESIZE
+ cfi_adjust_cfa_offset(-FRAMESIZE)
+ /* Restore the link register. */
+ mtlr r0
+ cfi_restore(lr)
+ blr
+
+END (FUNC_NAME)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 19acb6c64a..cd2b47b403 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
ifneq (,$(filter %le,$(config-machine)))
sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
- rawmemchr-power9 strlen-power9
+ rawmemchr-power9 strlen-power9 strncpy-power9
endif
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index ea10b00417..fb55b07e53 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -301,6 +301,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c. */
IFUNC_IMPL (i, name, strncpy,
+#ifdef __LITTLE_ENDIAN__
+ IFUNC_IMPL_ADD (array, i, strncpy,
+ (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+ && (hwcap & PPC_FEATURE_HAS_VSX),
+ __strncpy_power9)
+#endif
IFUNC_IMPL_ADD (array, i, strncpy,
hwcap2 & PPC_FEATURE2_ARCH_2_07,
__strncpy_power8)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
new file mode 100644
index 0000000000..68e1e8d925
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
@@ -0,0 +1,32 @@
+/* Optimized strncpy implementation for POWER9 LE.
+ Copyright (C) 2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
+#define STRNCPY __strncpy_power9
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+/* memset is used to pad the end of the string. */
+#define MEMSET __memset_power8
+#ifdef SHARED
+#define MEMSET_is_local
+#endif
+
+#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
index 7bacf28aca..3f2108ddae 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
@@ -28,11 +28,19 @@
extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (strncpy) __strncpy_power9 attribute_hidden;
+# endif
# undef strncpy
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
libc_ifunc_redirected (__redirect_strncpy, strncpy,
+# ifdef __LITTLE_ENDIAN__
+ (hwcap2 & PPC_FEATURE2_ARCH_3_00) &&
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __strncpy_power9 :
+# endif
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __strncpy_power8
: (hwcap & PPC_FEATURE_HAS_VSX)
--
2.26.2
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH v3 2/2] powerpc: Add optimized stpncpy for POWER9
2020-09-29 15:21 [PATCH v3 1/2] powerpc: Add optimized strncpy for POWER9 Raphael Moreira Zinsly via Libc-alpha
@ 2020-09-29 15:21 ` Raphael Moreira Zinsly via Libc-alpha
2020-09-29 15:23 ` Raphael M Zinsly via Libc-alpha
` (2 more replies)
2020-09-29 15:22 ` [PATCH v3 1/2] powerpc: Add optimized strncpy " Raphael M Zinsly via Libc-alpha
` (2 subsequent siblings)
3 siblings, 3 replies; 10+ messages in thread
From: Raphael Moreira Zinsly via Libc-alpha @ 2020-09-29 15:21 UTC (permalink / raw)
To: libc-alpha; +Cc: murphyp, Raphael Moreira Zinsly, pc
Add stpncpy support into the POWER9 strncpy.
---
sysdeps/powerpc/powerpc64/le/power9/stpncpy.S | 24 +++++++
sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 68 ++++++++++++++++++-
sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +-
.../powerpc64/multiarch/ifunc-impl-list.c | 6 ++
.../powerpc64/multiarch/stpncpy-power9.S | 29 ++++++++
sysdeps/powerpc/powerpc64/multiarch/stpncpy.c | 8 +++
6 files changed, 135 insertions(+), 2 deletions(-)
create mode 100644 sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
diff --git a/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
new file mode 100644
index 0000000000..81d9673d8b
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
@@ -0,0 +1,24 @@
+/* Optimized stpncpy implementation for POWER9 LE.
+ Copyright (C) 2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define USE_AS_STPNCPY
+#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
+
+weak_alias (__stpncpy, stpncpy)
+libc_hidden_def (__stpncpy)
+libc_hidden_builtin_def (stpncpy)
diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
index 67cb648c65..b7d308c984 100644
--- a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
+++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
@@ -18,11 +18,19 @@
#include <sysdep.h>
+#ifdef USE_AS_STPNCPY
+# ifndef STPNCPY
+# define FUNC_NAME __stpncpy
+# else
+# define FUNC_NAME STPNCPY
+# endif
+#else
# ifndef STRNCPY
# define FUNC_NAME strncpy
# else
# define FUNC_NAME STRNCPY
# endif
+#endif /* !USE_AS_STPNCPY */
#ifndef MEMSET
/* For builds without IFUNC support, local calls should be made to internal
@@ -41,6 +49,12 @@
char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+ or
+
+ char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+ if USE_AS_STPNCPY is defined.
+
The implementation can load bytes past a null terminator, but only
up to the next 16-byte aligned address, so it never crosses a page. */
@@ -66,7 +80,15 @@ ENTRY (FUNC_NAME, 4)
/* Empty/1-byte string optimization */
cmpdi r5,0
+#ifdef USE_AS_STPNCPY
+ bgt L(cont)
+ /* Compute pointer to last byte copied into dest. */
+ addi r3,r3,1
+ blr
+L(cont):
+#else
beqlr
+#endif
addi r4,r4,1
neg r7,r4
@@ -96,12 +118,20 @@ ENTRY (FUNC_NAME, 4)
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
stxvl 32+v0,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r5
+#endif
blr
L(null):
sldi r10,r8,56 /* stxvl wants size in top 8 bits */
stxvl 32+v0,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r7
+#endif
add r11,r11,r8
sub r5,r5,r8
b L(zero_padding)
@@ -185,6 +215,10 @@ L(n_tail4):
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
addi r11,r11,48 /* Offset */
stxvl 32+v3,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r5
+#endif
blr
L(prep_n_tail1):
@@ -196,6 +230,10 @@ L(prep_n_tail1):
L(n_tail1):
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
stxvl 32+v0,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r5
+#endif
blr
L(prep_n_tail2):
@@ -209,6 +247,10 @@ L(n_tail2):
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
addi r11,r11,16 /* offset */
stxvl 32+v1,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r5
+#endif
blr
L(prep_n_tail3):
@@ -223,6 +265,10 @@ L(n_tail3):
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
addi r11,r11,32 /* Offset */
stxvl 32+v2,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r5
+#endif
blr
L(prep_tail1):
@@ -232,6 +278,10 @@ L(tail1):
addi r9,r8,1 /* Add null terminator */
sldi r10,r9,56 /* stxvl wants size in top 8 bits */
stxvl 32+v0,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r8
+#endif
add r11,r11,r9
sub r5,r5,r9
b L(zero_padding)
@@ -246,6 +296,10 @@ L(tail2):
sldi r10,r9,56 /* stxvl wants size in top 8 bits */
addi r11,r11,16 /* offset */
stxvl 32+v1,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r8
+#endif
add r11,r11,r9
sub r5,r5,r9
b L(zero_padding)
@@ -261,6 +315,10 @@ L(tail3):
sldi r10,r9,56 /* stxvl wants size in top 8 bits */
addi r11,r11,32 /* offset */
stxvl 32+v2,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r8
+#endif
add r11,r11,r9
sub r5,r5,r9
b L(zero_padding)
@@ -276,6 +334,10 @@ L(tail4):
sldi r10,r9,56 /* stxvl wants size in top 8 bits */
addi r11,r11,48 /* offset */
stxvl 32+v3,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r8
+#endif
add r11,r11,r9
sub r5,r5,r9
@@ -330,7 +392,8 @@ L(zero_padding_memset):
ld r0,FRAMESIZE+16(r1)
mr r3,r30 /* Restore the return value of strncpy, i.e.:
- dest. */
+ dest. For stpncpy, r30 holds the pointer to the
+ terminating null byte, saved before the call. */
ld r30,FRAMESIZE-8(r1) /* Restore r30. */
/* Restore the stack frame. */
addi r1,r1,FRAMESIZE
@@ -341,3 +404,6 @@ L(zero_padding_memset):
blr
END (FUNC_NAME)
+#ifndef USE_AS_STPNCPY
+libc_hidden_builtin_def (strncpy)
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index cd2b47b403..f46bf50732 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
ifneq (,$(filter %le,$(config-machine)))
sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
- rawmemchr-power9 strlen-power9 strncpy-power9
+ rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9
endif
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index fb55b07e53..d0f20cc97f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -318,6 +318,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c. */
IFUNC_IMPL (i, name, stpncpy,
+#ifdef __LITTLE_ENDIAN__
+ IFUNC_IMPL_ADD (array, i, stpncpy,
+ (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+ && (hwcap & PPC_FEATURE_HAS_VSX),
+ __stpncpy_power9)
+#endif
IFUNC_IMPL_ADD (array, i, stpncpy,
hwcap2 & PPC_FEATURE2_ARCH_2_07,
__stpncpy_power8)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
new file mode 100644
index 0000000000..55daa3455f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
@@ -0,0 +1,29 @@
+/* Optimized stpncpy implementation for POWER9 LE.
+ Copyright (C) 2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define STPNCPY __stpncpy_power9
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#define MEMSET __memset_power8
+#ifdef SHARED
+#define MEMSET_is_local
+#endif
+
+#include <sysdeps/powerpc/powerpc64/le/power9/stpncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
index 17df886431..3758f29ad1 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
@@ -26,10 +26,18 @@
extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (__stpncpy) __stpncpy_power9 attribute_hidden;
+# endif
# undef stpncpy
# undef __stpncpy
libc_ifunc_redirected (__redirect___stpncpy, __stpncpy,
+# ifdef __LITTLE_ENDIAN__
+ (hwcap2 & PPC_FEATURE2_ARCH_3_00) &&
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __stpncpy_power9 :
+# endif
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __stpncpy_power8
: (hwcap & PPC_FEATURE_HAS_VSX)
--
2.26.2
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [PATCH v3 2/2] powerpc: Add optimized stpncpy for POWER9
2020-09-29 15:21 ` [PATCH v3 2/2] powerpc: Add optimized stpncpy " Raphael Moreira Zinsly via Libc-alpha
@ 2020-09-29 15:23 ` Raphael M Zinsly via Libc-alpha
2020-09-30 13:42 ` Adhemerval Zanella via Libc-alpha
2020-11-12 17:12 ` Tulio Magno Quites Machado Filho via Libc-alpha
2 siblings, 0 replies; 10+ messages in thread
From: Raphael M Zinsly via Libc-alpha @ 2020-09-29 15:23 UTC (permalink / raw)
To: libc-alpha
generic_stpncpy __stpncpy_power9
__stpncpy_power8 __stpncpy_power7 __stpncpy_ppc
Length 16, n 16, alignment 1/ 1: 7.04141 2.66905 2.71071 5.33257
9.45193
Length 16, n 16, alignment 1/ 1: 7.01728 2.54349 2.70763 5.35555
9.40601
Length 16, n 16, alignment 1/ 2: 6.76331 2.56894 2.70649 5.28715
9.19534
Length 16, n 16, alignment 2/ 1: 6.41285 2.52953 2.86392 5.25868
9.24343
Length 2, n 4, alignment 7/ 2: 7.76627 2.36037 4.34749 4.05757
8.45648
Length 4, n 2, alignment 2/ 7: 6.15257 1.734 2.66932 2.81884 6.61486
Length 2, n 4, alignment 7/ 2: 7.69004 2.34779 3.90224 4.08693
8.51617
Length 4, n 2, alignment 2/ 7: 6.14888 1.73738 2.66929 2.81777
6.39066
Length 16, n 16, alignment 2/ 2: 7.25765 2.5434 2.8759 4.7084 9.43171
Length 16, n 16, alignment 2/ 2: 6.41274 2.52681 2.87939 5.2894 9.2505
Length 16, n 16, alignment 2/ 4: 6.74797 2.6683 2.82869 5.27608 9.43391
Length 16, n 16, alignment 4/ 2: 7.6281 2.54368 3.52982 5.26862 8.7369
Length 4, n 8, alignment 6/ 4: 7.79233 2.33099 5.64785 4.21131 9.03
Length 8, n 4, alignment 4/ 6: 6.01824 1.73782 2.81779 2.81777
7.90004
Length 4, n 8, alignment 6/ 4: 7.94851 2.33098 4.90456 3.75698
8.89379
Length 8, n 4, alignment 4/ 6: 6.0183 1.73715 2.81777 2.41521 7.83867
Length 16, n 16, alignment 3/ 3: 6.93178 2.66854 3.22004 5.31673
9.09542
Length 16, n 16, alignment 3/ 3: 6.99998 2.67084 3.22862 5.48294 9.2366
Length 16, n 16, alignment 3/ 6: 7.14689 2.6615 3.21888 5.25964 9.1277
Length 16, n 16, alignment 6/ 3: 6.46654 2.65885 4.57873 5.25391
7.75507
Length 8, n 16, alignment 5/ 6: 7.37286 2.33316 3.92971 4.50331
10.1496
Length 16, n 8, alignment 6/ 5: 5.73663 1.87991 2.633 4.09291 5.91732
Length 8, n 16, alignment 5/ 6: 7.77512 2.33361 3.67636 4.50091 10.147
Length 16, n 8, alignment 6/ 5: 5.73662 1.88001 2.57119 4.10496
6.15016
Length 16, n 16, alignment 4/ 4: 7.55115 2.65827 3.5838 5.25628 8.81586
Length 16, n 16, alignment 4/ 4: 7.61232 2.66851 3.62508 5.32044
8.73914
Length 16, n 16, alignment 4/ 0: 7.54588 2.54345 3.48987 5.27812
8.77989
Length 16, n 16, alignment 0/ 4: 6.82387 1.88425 2.41569 5.27746
7.19847
Length 16, n 32, alignment 4/ 0: 10.1135 3.10868 6.01894 6.66693
11.7681
Length 32, n 16, alignment 0/ 4: 6.93527 1.8793 2.4162 5.29155 6.50752
Length 16, n 32, alignment 4/ 0: 10.1565 3.16134 5.78062 6.81425
11.2226
Length 32, n 16, alignment 0/ 4: 6.76758 1.87928 2.41649 5.30161
7.22291
Length 16, n 16, alignment 5/ 5: 7.22753 2.56593 4.22659 5.30415
9.86703
Length 16, n 16, alignment 5/ 5: 6.76256 2.54348 4.23108 5.43866
9.53557
Length 16, n 16, alignment 5/ 2: 7.23702 2.52833 4.23011 5.26711
9.52126
Length 16, n 16, alignment 2/ 5: 6.68084 2.66311 2.84314 5.2709 9.24495
Length 32, n 64, alignment 3/ 2: 12.4989 3.84198 6.40671 10.4545 14.317
Length 64, n 32, alignment 2/ 3: 10.1464 2.78457 3.17933 7.67569
12.4356
Length 32, n 64, alignment 3/ 2: 12.4991 3.83968 7.08471 10.451 15.8984
Length 64, n 32, alignment 2/ 3: 9.61285 2.78401 3.18834 7.66606
13.9602
Length 16, n 16, alignment 6/ 6: 7.24557 2.66839 4.55951 5.25563 7.7369
Length 16, n 16, alignment 6/ 6: 6.76327 2.65836 4.5127 5.63264 7.80333
Length 16, n 16, alignment 6/ 4: 7.15127 2.54397 4.57355 5.32957
7.51005
Length 16, n 16, alignment 4/ 6: 7.51733 2.5615 3.67299 5.31244 8.73893
Length 64, n 128, alignment 2/ 4: 14.0745 4.98021 7.33878 11.4384
17.1572
Length 128, n 64, alignment 4/ 2: 11.7179 3.59088 4.89414 10.2021
11.6637
Length 64, n 128, alignment 2/ 4: 14.0841 4.99105 7.28507 11.4365
21.7537
Length 128, n 64, alignment 4/ 2: 11.7142 3.59211 4.83864 9.87632
19.4664
Length 16, n 16, alignment 7/ 7: 7.12738 2.53533 5.62213 5.30017
7.90888
Length 16, n 16, alignment 7/ 7: 6.82635 2.53529 5.60694 5.27111
8.88482
Length 16, n 16, alignment 7/ 6: 6.9193 2.54376 5.48117 5.24785 8.04263
Length 16, n 16, alignment 6/ 7: 6.89261 2.55078 4.51003 5.32471
7.81768
Length 128, n 256, alignment 1/ 6: 16.2686 7.68983 9.35727 16.2843
19.8458
Length 256, n 128, alignment 6/ 1: 13.4356 4.94899 7.94404 15.0122
15.0231
Length 128, n 256, alignment 1/ 6: 16.2511 7.69025 9.35528 16.2859
37.8453
Length 256, n 128, alignment 6/ 1: 13.4332 4.94446 8.02757 12.2879
34.1949
Length 8, n 16, alignment 0/ 0: 7.26102 2.33285 3.75702 3.85762
7.72869
Length 32, n 16, alignment 0/ 0: 7.049 1.88689 2.42187 2.41537 6.58192
Length 8, n 16, alignment 7/ 2: 8.09344 2.31269 3.67403 4.31612
8.21018
Length 32, n 16, alignment 7/ 2: 6.822 2.45733 5.59593 5.33252 6.53496
Length 16, n 32, alignment 0/ 0: 9.99648 3.36432 4.70547 4.55746 10.148
Length 64, n 32, alignment 0/ 0: 7.89408 2.4309 2.58854 2.70519 8.89171
Length 16, n 32, alignment 6/ 4: 9.31969 3.15547 7.24937 9.47362
10.0091
Length 64, n 32, alignment 6/ 4: 9.91687 2.78234 4.64259 7.00062
10.5972
Length 32, n 64, alignment 0/ 0: 11.0651 3.81484 4.4379 4.91663 11.8363
Length 128, n 64, alignment 0/ 0: 9.25821 3.20129 3.55296 4.22664
9.63556
Length 32, n 64, alignment 5/ 6: 12.5097 3.83422 7.29892 9.09849
13.2517
Length 128, n 64, alignment 5/ 6: 11.6165 3.60246 5.35542 8.90704
13.3207
Length 64, n 128, alignment 0/ 0: 12.372 4.91681 5.41951 6.91629 15.0813
Length 256, n 128, alignment 0/ 0: 7.93075 4.5247 6.29502 5.58357 12.5963
Length 64, n 128, alignment 4/ 0: 12.569 5.00092 7.25225 10.4764 15.9366
Length 256, n 128, alignment 4/ 0: 12.2963 4.90654 7.57109 12.0953
16.7672
Length 128, n 256, alignment 0/ 0: 13.9015 7.34814 7.88738 9.15353
19.4141
Length 512, n 256, alignment 0/ 0: 10.6865 6.52749 9.15011 9.71701
20.9021
Length 128, n 256, alignment 3/ 2: 16.3681 7.53318 9.89911 18.5309
20.8335
Length 512, n 256, alignment 3/ 2: 17.0249 7.10063 10.1568 22.6063
25.1262
Length 256, n 512, alignment 0/ 0: 16.5169 12.3406 13.6056 14.5875
29.2826
Length 1024, n 512, alignment 0/ 0: 16.3619 10.8422 16.7061 17.1025
37.7908
Length 256, n 512, alignment 2/ 4: 21.162 12.9621 14.3306 26.0856 30.0397
Length 1024, n 512, alignment 2/ 4: 25.5543 11.9978 17.7424 42.4293
47.7581
Length 512, n 1024, alignment 0/ 0: 20.5504 17.3132 19.5751 21.3633
42.7215
Length 2048, n 1024, alignment 0/ 0: 28.5197 19.3708 37.1801 35.3122
67.9792
Length 512, n 1024, alignment 1/ 6: 29.9875 17.7823 22.3228 47.3516
51.3697
Length 2048, n 1024, alignment 1/ 6: 42.9443 21.6004 38.7767 78.1732
83.9784
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v3 2/2] powerpc: Add optimized stpncpy for POWER9
2020-09-29 15:21 ` [PATCH v3 2/2] powerpc: Add optimized stpncpy " Raphael Moreira Zinsly via Libc-alpha
2020-09-29 15:23 ` Raphael M Zinsly via Libc-alpha
@ 2020-09-30 13:42 ` Adhemerval Zanella via Libc-alpha
2020-09-30 14:21 ` Raphael M Zinsly via Libc-alpha
2020-11-12 17:12 ` Tulio Magno Quites Machado Filho via Libc-alpha
2 siblings, 1 reply; 10+ messages in thread
From: Adhemerval Zanella via Libc-alpha @ 2020-09-30 13:42 UTC (permalink / raw)
To: libc-alpha, Raphael M Zinsly
On 29/09/2020 12:21, Raphael Moreira Zinsly via Libc-alpha wrote:
> Add stpncpy support into the POWER9 strncpy.
The benchmark numbers you provided [1] seem to show it is slightly worse than
the generic_strncpy, which uses the same strategy as string/strncpy.c
(which would use VSX instructions through memset/memcpy). Did you compare this
optimization against an implementation that just calls power8/9 memset/memcpy
instead?
It should result in a smaller implementation, which reduces i-cache usage, and
the code would be much simpler and more maintainable. The same applies to stpncpy.
I tried to convince Intel developers that such micro-optimizations are not
really a gain and that we should instead optimize only a handful of string
operations (memcpy/memset/etc.) and use composable implementations
(as generic strncpy does). It still resulted in 1a153e47fcc, but I think we
might do better for powerpc.
[1] https://sourceware.org/pipermail/libc-alpha/2020-September/118049.html
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v3 2/2] powerpc: Add optimized stpncpy for POWER9
2020-09-30 13:42 ` Adhemerval Zanella via Libc-alpha
@ 2020-09-30 14:21 ` Raphael M Zinsly via Libc-alpha
2020-09-30 14:46 ` Adhemerval Zanella via Libc-alpha
0 siblings, 1 reply; 10+ messages in thread
From: Raphael M Zinsly via Libc-alpha @ 2020-09-30 14:21 UTC (permalink / raw)
To: Adhemerval Zanella, libc-alpha
Hi Adhemerval,
On 30/09/2020 10:42, Adhemerval Zanella wrote:
>
>
> On 29/09/2020 12:21, Raphael Moreira Zinsly via Libc-alpha wrote:
>> Add stpncpy support into the POWER9 strncpy.
>
> The benchmark numbers you provided [1] seems to show it is slight worse than
> the generic_strncpy which uses the same strategy as string/strncpy.c
> (which would use VSX instruction through memset/memcpy).
My implementation is always better than generic_strncpy, almost
three times better on average. And it calls memset as well.
Are you talking about __strncpy_ppc? For some reason it is using
strnlen_ppc instead of the strnlen_power8, but I didn't touch it.
> Did you compare this
> optimization against an implementation that just call power8/9 memset/memcpy
> instead?
>
Not sure if I understand, isn't that generic_strncpy and strncpy_ppc?
> It should resulting a smaller implementation which reduces i-cache size and
> the code is much more simpler and maintainable. The same applies for stpncpy.
>
> I tried to dissuade Intel developers that such micro-optimization are not
> really a real gain and instead we should optimize only a handful of string
> operations (memcpy/memset/etc.) and use composable implementation instead
> (as generic strncpy). It still resulted on 1a153e47fcc, but I think we
> might do better for powerpc.
>
> [1] https://sourceware.org/pipermail/libc-alpha/2020-September/118049.html
>
Best Regards,
--
Raphael Moreira Zinsly
IBM
Linux on Power Toolchain
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v3 2/2] powerpc: Add optimized stpncpy for POWER9
2020-09-30 14:21 ` Raphael M Zinsly via Libc-alpha
@ 2020-09-30 14:46 ` Adhemerval Zanella via Libc-alpha
0 siblings, 0 replies; 10+ messages in thread
From: Adhemerval Zanella via Libc-alpha @ 2020-09-30 14:46 UTC (permalink / raw)
To: Raphael M Zinsly, libc-alpha
On 30/09/2020 11:21, Raphael M Zinsly wrote:
> Hi Adhemerval,
>
> On 30/09/2020 10:42, Adhemerval Zanella wrote:
>>
>>
>> On 29/09/2020 12:21, Raphael Moreira Zinsly via Libc-alpha wrote:
>>> Add stpncpy support into the POWER9 strncpy.
>>
>> The benchmark numbers you provided [1] seems to show it is slight worse than
>> the generic_strncpy which uses the same strategy as string/strncpy.c
>> (which would use VSX instruction through memset/memcpy).
>
> My implementation is always better than the generic_strncpy, almost three times better in average. And it calls memset as well.
>
> Are you talking about __strncpy_ppc? For some reason it is using strnlen_ppc instead of the strnlen_power8, but I didn't touch it.
>
>> Did you compare this
>> optimization against an implementation that just call power8/9 memset/memcpy
>> instead?
>>
>
> Not sure if I understand, isn't that generic_strncpy and strncpy_ppc?
Right, I misread the benchmark. I tested my own suggestion on the power9
machine from the gcc farm, and it seems that although it is slightly faster than
the power7 variant, it does not really beat power8 (as expected, since it calls
strnlen and then memcpy/memset, and so accesses the input twice).
I do not really oppose it and it is up to the arch maintainer, but I still think
these micro-optimizations tend to just add extra maintenance burden and icache
pressure that the microbenchmark does not really catch.
>
>
>> It should resulting a smaller implementation which reduces i-cache size and
>> the code is much more simpler and maintainable. The same applies for stpncpy.
>>
>> I tried to dissuade Intel developers that such micro-optimization are not
>> really a real gain and instead we should optimize only a handful of string
>> operations (memcpy/memset/etc.) and use composable implementation instead
>> (as generic strncpy). It still resulted on 1a153e47fcc, but I think we
>> might do better for powerpc.
>>
>> [1] https://sourceware.org/pipermail/libc-alpha/2020-September/118049.html
>>
>
> Best Regards,
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v3 2/2] powerpc: Add optimized stpncpy for POWER9
2020-09-29 15:21 ` [PATCH v3 2/2] powerpc: Add optimized stpncpy " Raphael Moreira Zinsly via Libc-alpha
2020-09-29 15:23 ` Raphael M Zinsly via Libc-alpha
2020-09-30 13:42 ` Adhemerval Zanella via Libc-alpha
@ 2020-11-12 17:12 ` Tulio Magno Quites Machado Filho via Libc-alpha
2 siblings, 0 replies; 10+ messages in thread
From: Tulio Magno Quites Machado Filho via Libc-alpha @ 2020-11-12 17:12 UTC (permalink / raw)
To: Raphael Moreira Zinsly, libc-alpha; +Cc: murphyp, Raphael Moreira Zinsly, pc
Raphael Moreira Zinsly <rzinsly@linux.ibm.com> writes:
> Add stpncpy support into the POWER9 strncpy.
Same reminder for Reviewed-by.
> +#define MEMSET __memset_power8
> +#ifdef SHARED
> +#define MEMSET_is_local
Wrong indentation here. Fixed.
Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
Pushed as 7beee7b39ade.
Thanks!
--
Tulio Magno
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v3 1/2] powerpc: Add optimized strncpy for POWER9
2020-09-29 15:21 [PATCH v3 1/2] powerpc: Add optimized strncpy for POWER9 Raphael Moreira Zinsly via Libc-alpha
2020-09-29 15:21 ` [PATCH v3 2/2] powerpc: Add optimized stpncpy " Raphael Moreira Zinsly via Libc-alpha
@ 2020-09-29 15:22 ` Raphael M Zinsly via Libc-alpha
2020-10-15 15:20 ` Lucas A. M. Magalhaes via Libc-alpha
2020-11-12 17:09 ` Tulio Magno Quites Machado Filho via Libc-alpha
3 siblings, 0 replies; 10+ messages in thread
From: Raphael M Zinsly via Libc-alpha @ 2020-09-29 15:22 UTC (permalink / raw)
To: libc-alpha
Benchtest output:
generic_strncpy __strncpy_power9 __strncpy_power8 __strncpy_power7 __strncpy_ppc
Length 16, n 16, alignment 1/ 1: 7.12492 2.55148 2.55079 5.71316
9.05306
Length 16, n 16, alignment 1/ 1: 6.71919 2.51696 2.56187 5.92056
9.43297
Length 16, n 16, alignment 1/ 2: 6.65909 2.53904 2.55074 5.6362 9.30194
Length 16, n 16, alignment 2/ 1: 6.50693 2.51671 2.82125 5.9298 9.18833
Length 2, n 4, alignment 7/ 2: 7.68477 2.27528 5.07192 4.8353 8.4619
Length 4, n 2, alignment 2/ 7: 6.03804 1.6644 2.32355 2.8178 6.27398
Length 2, n 4, alignment 7/ 2: 7.68944 2.31009 4.25078 4.83038
8.59554
Length 4, n 2, alignment 2/ 7: 6.04246 1.66875 2.31775 2.73826 6.8358
Length 16, n 16, alignment 2/ 2: 6.50729 2.51669 2.83075 5.91498
9.60274
Length 16, n 16, alignment 2/ 2: 6.3441 2.51684 2.82829 5.64233 9.29031
Length 16, n 16, alignment 2/ 4: 6.33989 2.51866 2.82089 5.59129
9.50426
Length 16, n 16, alignment 4/ 2: 7.88012 2.51145 3.44369 5.91774
9.50185
Length 4, n 8, alignment 6/ 4: 7.88965 2.27023 5.2189 4.67992 9.06714
Length 8, n 4, alignment 4/ 6: 5.8805 1.69238 2.67589 2.95865 7.70634
Length 4, n 8, alignment 6/ 4: 7.69107 2.29724 5.2196 4.68409 9.07751
Length 8, n 4, alignment 4/ 6: 6.33989 1.69501 2.67179 2.95862
7.75311
Length 16, n 16, alignment 3/ 3: 6.58968 2.51681 3.14295 5.92364
8.86981
Length 16, n 16, alignment 3/ 3: 6.76256 2.51385 3.14379 5.91558
9.02347
Length 16, n 16, alignment 3/ 6: 6.76734 2.53841 3.08182 5.90924
8.98558
Length 16, n 16, alignment 6/ 3: 6.67014 2.51618 4.16905 5.94761
7.81751
Length 8, n 16, alignment 5/ 6: 7.70082 2.30026 4.59182 5.41689
10.5428
Length 16, n 8, alignment 6/ 5: 5.63868 1.87873 2.32929 4.5053 5.78866
Length 8, n 16, alignment 5/ 6: 7.40013 2.2999 4.23768 5.41724 10.1649
Length 16, n 8, alignment 6/ 5: 5.63858 1.87872 2.32768 4.58045
6.02812
Length 16, n 16, alignment 4/ 4: 7.37003 2.5167 3.50594 5.91125 8.93866
Length 16, n 16, alignment 4/ 4: 7.51015 2.51684 3.58684 5.91127
8.60509
Length 16, n 16, alignment 4/ 0: 7.42056 2.51149 3.38179 5.92321
8.86607
Length 16, n 16, alignment 0/ 4: 6.6704 1.87853 2.44519 5.91475 7.68788
Length 16, n 32, alignment 4/ 0: 11.0276 3.0727 6.01877 6.9094 11.4447
Length 32, n 16, alignment 0/ 4: 6.90919 1.87852 2.45708 5.91217 6.7671
Length 16, n 32, alignment 4/ 0: 9.76588 3.07257 5.92168 6.81253
11.8936
Length 32, n 16, alignment 0/ 4: 6.90342 1.88296 2.44527 5.91673
7.68469
Length 16, n 16, alignment 5/ 5: 6.90186 2.51712 3.91963 5.91852
9.46308
Length 16, n 16, alignment 5/ 5: 6.58716 2.51626 3.94884 5.91303
9.59648
Length 16, n 16, alignment 5/ 2: 6.92421 2.52057 3.80827 5.91558 9.3486
Length 16, n 16, alignment 2/ 5: 6.50526 2.53369 2.82035 5.91729 9.065
Length 32, n 64, alignment 3/ 2: 14.0395 3.79978 6.41657 11.19 13.9713
Length 64, n 32, alignment 2/ 3: 9.85699 2.75331 3.21559 8.23056
11.4077
Length 32, n 64, alignment 3/ 2: 14.0923 3.8037 6.38851 11.4514 15.9838
Length 64, n 32, alignment 2/ 3: 9.4437 2.75344 3.21249 8.21276 13.9496
Length 16, n 16, alignment 6/ 6: 6.33989 2.51408 4.38486 5.91681
7.37203
Length 16, n 16, alignment 6/ 6: 6.76503 2.51645 4.26454 5.9103 7.87574
Length 16, n 16, alignment 6/ 4: 6.51654 2.51654 4.24635 5.91578
7.17827
Length 16, n 16, alignment 4/ 6: 7.28735 2.53335 3.54029 5.92337
8.63075
Length 64, n 128, alignment 2/ 4: 15.4973 4.98808 7.34157 11.5113
16.7688
Length 128, n 64, alignment 4/ 2: 11.6235 3.54914 4.80814 10.3103
11.6194
Length 64, n 128, alignment 2/ 4: 15.4979 5.02559 7.28236 11.5045
22.1309
Length 128, n 64, alignment 4/ 2: 11.6138 3.53841 4.80527 10.3293
19.5239
Length 16, n 16, alignment 7/ 7: 6.84212 2.51109 5.0585 5.7457 7.2307
Length 16, n 16, alignment 7/ 7: 6.86215 2.50957 5.06541 5.91726
8.55044
Length 16, n 16, alignment 7/ 6: 6.97428 2.51876 5.05053 5.92637
7.07715
Length 16, n 16, alignment 6/ 7: 7.01347 2.53448 4.38004 5.93278
7.86288
Length 128, n 256, alignment 1/ 6: 17.9407 7.92071 9.38384 16.9419
20.6065
Length 256, n 128, alignment 6/ 1: 13.3609 4.7983 7.967 12.5699 14.9996
Length 128, n 256, alignment 1/ 6: 17.9371 7.69161 9.36672 16.739 38.9048
Length 256, n 128, alignment 6/ 1: 13.3632 4.87671 7.80194 12.7028
33.9017
Length 8, n 16, alignment 0/ 0: 7.4529 2.29963 3.62737 4.22665 7.50268
Length 32, n 16, alignment 0/ 0: 6.86674 1.87853 2.45092 2.41528
7.30161
Length 8, n 16, alignment 7/ 2: 7.40103 2.29399 3.75703 5.43637
8.45285
Length 32, n 16, alignment 7/ 2: 7.72683 2.35278 5.04996 5.93629
7.18881
Length 16, n 32, alignment 0/ 0: 9.87066 3.17511 4.89448 4.41405
10.3408
Length 64, n 32, alignment 0/ 0: 8.06217 2.32926 2.94508 2.71275
8.11769
Length 16, n 32, alignment 6/ 4: 9.50052 3.07627 6.37858 9.46793
10.1393
Length 64, n 32, alignment 6/ 4: 9.7197 2.75154 4.47331 7.73667 9.26558
Length 32, n 64, alignment 0/ 0: 10.9157 3.79013 4.83041 4.97713
11.5486
Length 128, n 64, alignment 0/ 0: 9.28057 3.15788 3.5178 4.23091 11.0874
Length 32, n 64, alignment 5/ 6: 14.0472 3.8515 7.26431 10.1343 12.8115
Length 128, n 64, alignment 5/ 6: 11.5493 3.5659 5.05553 9.1005 13.4053
Length 64, n 128, alignment 0/ 0: 12.0056 4.94615 6.45436 7.06235
14.4743
Length 256, n 128, alignment 0/ 0: 7.87506 4.49546 6.4492 5.38877 12.1437
Length 64, n 128, alignment 4/ 0: 12.4174 4.99773 7.73749 11.1452
16.1494
Length 256, n 128, alignment 4/ 0: 12.2601 4.88446 6.95948 13.3726
16.7583
Length 128, n 256, alignment 0/ 0: 13.9215 7.51155 7.87942 8.79876
20.4226
Length 512, n 256, alignment 0/ 0: 10.5798 6.77319 8.79757 9.03297
20.0197
Length 128, n 256, alignment 3/ 2: 18.0213 7.57884 9.89436 18.7839
20.5445
Length 512, n 256, alignment 3/ 2: 16.9909 7.07957 9.9271 23.2621 25.2442
Length 256, n 512, alignment 0/ 0: 17.6825 12.3074 13.3245 13.9381
28.7687
Length 1024, n 512, alignment 0/ 0: 16.3837 10.8306 16.6999 16.6797
38.0562
Length 256, n 512, alignment 2/ 4: 23.1953 13.0445 14.324 26.8918 30.2049
Length 1024, n 512, alignment 2/ 4: 25.4059 12.0938 17.2483 41.4883
47.2025
Length 512, n 1024, alignment 0/ 0: 21.029 17.1782 19.4815 21.0035 43.2361
Length 2048, n 1024, alignment 0/ 0: 28.5154 19.3221 36.9624 35.482 68.4792
Length 512, n 1024, alignment 1/ 6: 32.4103 17.9272 21.5421 46.6099 55.059
Length 2048, n 1024, alignment 1/ 6: 43.0516 21.6315 37.8787 77.7889
83.4195
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v3 1/2] powerpc: Add optimized strncpy for POWER9
2020-09-29 15:21 [PATCH v3 1/2] powerpc: Add optimized strncpy for POWER9 Raphael Moreira Zinsly via Libc-alpha
2020-09-29 15:21 ` [PATCH v3 2/2] powerpc: Add optimized stpncpy " Raphael Moreira Zinsly via Libc-alpha
2020-09-29 15:22 ` [PATCH v3 1/2] powerpc: Add optimized strncpy " Raphael M Zinsly via Libc-alpha
@ 2020-10-15 15:20 ` Lucas A. M. Magalhaes via Libc-alpha
2020-11-12 17:09 ` Tulio Magno Quites Machado Filho via Libc-alpha
3 siblings, 0 replies; 10+ messages in thread
From: Lucas A. M. Magalhaes via Libc-alpha @ 2020-10-15 15:20 UTC (permalink / raw)
To: Raphael Moreira Zinsly, libc-alpha; +Cc: murphyp, pc, Raphael Moreira Zinsly
Hi Raphael,
Thanks for the patch. All tests passed on a P9.
LGTM.
---
Lucas A. M. Magalhães
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v3 1/2] powerpc: Add optimized strncpy for POWER9
2020-09-29 15:21 [PATCH v3 1/2] powerpc: Add optimized strncpy for POWER9 Raphael Moreira Zinsly via Libc-alpha
` (2 preceding siblings ...)
2020-10-15 15:20 ` Lucas A. M. Magalhaes via Libc-alpha
@ 2020-11-12 17:09 ` Tulio Magno Quites Machado Filho via Libc-alpha
3 siblings, 0 replies; 10+ messages in thread
From: Tulio Magno Quites Machado Filho via Libc-alpha @ 2020-11-12 17:09 UTC (permalink / raw)
To: Raphael Moreira Zinsly, libc-alpha; +Cc: murphyp, pc
Raphael Moreira Zinsly <rzinsly@linux.ibm.com> writes:
> Similar to the strcpy P9 optimization, this version uses VSX to improve
> performance.
Remember to add the Reviewed-by: lines you collected in previous versions. ;-)
> +#define FRAMESIZE (FRAME_MIN_SIZE+48)
I think you actually meant to use FRAME_MIN_SIZE+8 here.
Fixed.
> +L(zero_padding_end):
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + stxvl v18,r11,r10 /* Partial store */
> + blr
> +
> + .align 4
> +L(zero_padding_memset):
> + std r30,-8(r1) /* Save r30 on the stack. */
This requires adding CFI:
cfi_offset(r30, -8)
Done.
> +#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
> +#define STRNCPY __strncpy_power9
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +
> +/* memset is used to pad the end of the string. */
> +#define MEMSET __memset_power8
> +#ifdef SHARED
> +#define MEMSET_is_local
Wrong indentation in the previous lines. Fixed.
I wonder if we can improve this and stop depending on the list of memset
implementations in this file.
Anyway, this isn't new and is future work.
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> index 7bacf28aca..3f2108ddae 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> @@ -28,11 +28,19 @@
> extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
> extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
> extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
> +# ifdef __LITTLE_ENDIAN__
> +extern __typeof (strncpy) __strncpy_power9 attribute_hidden;
> +# endif
> # undef strncpy
>
> /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
> ifunc symbol properly. */
> libc_ifunc_redirected (__redirect_strncpy, strncpy,
> +# ifdef __LITTLE_ENDIAN__
> + (hwcap2 & PPC_FEATURE2_ARCH_3_00) &&
> + (hwcap & PPC_FEATURE_HAS_VSX)
> + ? __strncpy_power9 :
Wrong indentation here. Fixed.
Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
Pushed as b9d83bf3eb57.
Thanks!
--
Tulio Magno
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2020-11-12 17:12 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-09-29 15:21 [PATCH v3 1/2] powerpc: Add optimized strncpy for POWER9 Raphael Moreira Zinsly via Libc-alpha
2020-09-29 15:21 ` [PATCH v3 2/2] powerpc: Add optimized stpncpy " Raphael Moreira Zinsly via Libc-alpha
2020-09-29 15:23 ` Raphael M Zinsly via Libc-alpha
2020-09-30 13:42 ` Adhemerval Zanella via Libc-alpha
2020-09-30 14:21 ` Raphael M Zinsly via Libc-alpha
2020-09-30 14:46 ` Adhemerval Zanella via Libc-alpha
2020-11-12 17:12 ` Tulio Magno Quites Machado Filho via Libc-alpha
2020-09-29 15:22 ` [PATCH v3 1/2] powerpc: Add optimized strncpy " Raphael M Zinsly via Libc-alpha
2020-10-15 15:20 ` Lucas A. M. Magalhaes via Libc-alpha
2020-11-12 17:09 ` Tulio Magno Quites Machado Filho via Libc-alpha
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).