unofficial mirror of libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* [PATCH 1/2] PPC64: Add libmvec SIMD single-precision natural exponent function.
@ 2019-05-12  3:28 Shawn Landden
  2019-05-12  3:28 ` [PATCH 2/2] PPC64: Add libmvec SIMD double-precision " Shawn Landden
                   ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: Shawn Landden @ 2019-05-12  3:28 UTC (permalink / raw)
  To: libc-alpha; +Cc: Shawn Landden

[BZ #24209]

Passes all tests.

Unlike other libmvec functions, this sets the onderflow and overflow bits.
The caller can check these flags, and possibly re-run the calculations with
scalar expf to figure out what is causing the overflow or underflow.

Suprisingly the special-case path performs as well as the normal path.
(both of which are vectorized)
Running 20 times over 32MiB
vector: mean 432.263032 MiB/s (sd 0.486733)
scalar: mean 178.646197 MiB/s (sd 0.050013)

2019-05-11 Shawn Landden <shawn@git.icu>
        * NEWS: Noted the addition of PPC64 vector expf function.
        * sysdeps/powerpc/bits/math-vector.h: Added entry for vector expf.
        * sysdeps/powerpc/powerpc64/fpu/Versions: Added vector expf entry.
        * sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile:
        (libmvec-sysdep_routines, CFLAGS-vec_s_expf4_vsx.c):
        (CFLAGS-vec_s_exp2f_data.c): Added build of VSX SIMD expf
        function and tests. Added vec_math_errf.c to build.
        * sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_flt.h: Modified for expf.
        * sysdeps/powerpc/powerpc64/fpu/multiarch/test-float-vlen4-wrappers.c:
        Added entry for vector expf.
        * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_expf4_vsx.c: New file.
        * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_exp2f_data.c: Likewise.
        * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c: Likewise.
        * sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist: SIMD expf
        added.
---
 NEWS                                          |   1 +
 sysdeps/powerpc/bits/math-vector.h            |   2 +
 sysdeps/powerpc/powerpc64/fpu/Versions        |   1 +
 .../powerpc/powerpc64/fpu/multiarch/Makefile  |   6 +-
 .../powerpc64/fpu/multiarch/math_config_flt.h |  64 ++++++--
 .../fpu/multiarch/test-float-vlen4-wrappers.c |   1 +
 .../powerpc64/fpu/multiarch/vec_math_errf.c   |  39 +++++
 .../fpu/multiarch/vec_s_exp2f_data.c          |  58 +++++++
 .../powerpc64/fpu/multiarch/vec_s_expf4_vsx.c | 141 ++++++++++++++++++
 .../linux/powerpc/powerpc64/libmvec.abilist   |   1 +
 10 files changed, 303 insertions(+), 11 deletions(-)
 create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c
 create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_exp2f_data.c
 create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_expf4_vsx.c

diff --git a/NEWS b/NEWS
index 6f9af0e24e..26f5c67f75 100644
--- a/NEWS
+++ b/NEWS
@@ -18,10 +18,11 @@ Major new features:
   - single-precision logarithm: logf
   - double-precision sine: sin
   - single-precision sine: sinf
   - double-precision sincos: sincos
   - single-precision sincos: sincosf
+  - single-precision exp: expf
 
   GCC support for auto-vectorization of functions on PPC64 is not yet
   available. Until that is done, the new vector math functions are
   inaccessible to applications.
 
diff --git a/sysdeps/powerpc/bits/math-vector.h b/sysdeps/powerpc/bits/math-vector.h
index 6dc62480fd..bbc9db0dd9 100644
--- a/sysdeps/powerpc/bits/math-vector.h
+++ b/sysdeps/powerpc/bits/math-vector.h
@@ -42,8 +42,10 @@
 #  define __DECL_SIMD_logf __DECL_SIMD_PPC64
 #  undef __DECL_SIMD_sincos
 #  define __DECL_SIMD_sincos __DECL_SIMD_PPC64
 #  undef __DECL_SIMD_sincosf
 #  define __DECL_SIMD_sincosf __DECL_SIMD_PPC64
+#  undef __DECL_SIMD_expf
+#  define __DECL_SIMD_expf __DECL_SIMD_PPC64
 
 # endif
 #endif
diff --git a/sysdeps/powerpc/powerpc64/fpu/Versions b/sysdeps/powerpc/powerpc64/fpu/Versions
index c083d9f938..7c45e76074 100644
--- a/sysdeps/powerpc/powerpc64/fpu/Versions
+++ b/sysdeps/powerpc/powerpc64/fpu/Versions
@@ -1,6 +1,7 @@
 libmvec {
   GLIBC_2.30 {
     _ZGVbN2v_cos; _ZGVbN4v_cosf; _ZGVbN2v_sin; _ZGVbN4v_sinf;
     _ZGVbN2v_log; _ZGVbN4v_logf; _ZGVbN2vvv_sincos; _ZGVbN4vvv_sincosf;
+    _ZGVbN4v_expf;
   }
 }
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
index a4f0d1694e..ad3c29b1ab 100644
--- a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
@@ -46,10 +46,12 @@ endif
 ifeq ($(subdir),mathvec)
 libmvec-sysdep_routines += vec_d_cos2_vsx vec_s_cosf4_vsx \
 			   vec_d_sin2_vsx vec_s_sinf4_vsx \
 			   vec_d_log2_vsx vec_d_log_data \
 			   vec_s_logf4_vsx vec_s_logf_data \
+			   vec_s_expf4_vsx vec_s_exp2f_data \
+			   vec_math_errf \
 			   vec_d_sincos2_vsx vec_s_sincosf4_vsx
 CFLAGS-vec_d_cos2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
 CFLAGS-vec_d_log2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
 CFLAGS-vec_d_log_data.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
 CFLAGS-vec_s_cosf4_vsx.c += -mabi=altivec -maltivec -mvsx
@@ -57,19 +59,21 @@ CFLAGS-vec_s_logf4_vsx.c += -mabi=altivec -maltivec -mvsx
 CFLAGS-vec_s_logf_data.c += -mabi=altivec -maltivec -mvsx
 CFLAGS-vec_d_sin2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
 CFLAGS-vec_s_sinf4_vsx.c += -mabi=altivec -maltivec -mvsx
 CFLAGS-vec_d_sincos2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
 CFLAGS-vec_s_sincosf4_vsx.c += -mabi=altivec -maltivec -mvsx
+CFLAGS-vec_s_expf4_vsx.c += -mabi=altivec -maltivec -mvsx
+CFLAGS-vec_s_exp2f_data.c += -mabi=altivec -maltivec -mvsx
 endif
 
 # Variables for libmvec tests.
 ifeq ($(subdir),math)
 ifeq ($(build-mathvec),yes)
 libmvec-tests += double-vlen2 float-vlen4
 
 double-vlen2-funcs = cos sin sincos log
-float-vlen4-funcs = cos sin sincos log
+float-vlen4-funcs = cos sin sincos log exp
 
 double-vlen2-arch-ext-cflags = -mabi=altivec -maltivec -mvsx -DREQUIRE_VSX
 float-vlen4-arch-ext-cflags = -mabi=altivec -maltivec -mvsx -DREQUIRE_VSX
 
 CFLAGS-test-double-vlen2-wrappers.c += -mabi=altivec -maltivec -mvsx -DREQUIRE_VSX
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_flt.h b/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_flt.h
index b5fbe02d91..18f34197e2 100644
--- a/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_flt.h
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_flt.h
@@ -46,20 +46,42 @@
 
 #if TOINT_INTRINSICS
 /* Round x to nearest int in all rounding modes, ties have to be rounded
    consistently with converttoint so the results match.  If the result
    would be outside of [-2^31, 2^31-1] then the semantics is unspecified.  */
-static inline double_t
-roundtoint (double_t x);
+static inline vector double
+roundtoint (vector double x);
 
 /* Convert x to nearest int in all rounding modes, ties have to be rounded
    consistently with roundtoint.  If the result is not representible in an
    int32_t then the semantics is unspecified.  */
-static inline int32_t
-converttoint (double_t x);
+static inline vector unsigned long long
+converttoint (vector double x);
 #endif
 
+static inline vector unsigned
+vasuint (vector float f)
+{
+  union
+  {
+    vector float f;
+    vector unsigned i;
+  } u = {f};
+  return u.i;
+}
+
+static inline vector float
+vasfloat (vector unsigned i)
+{
+  union
+  {
+    vector unsigned i;
+    vector float f;
+  } u = {i};
+  return u.f;
+}
+
 static inline uint32_t
 asuint (float f)
 {
   union
   {
@@ -78,10 +100,32 @@ asfloat (uint32_t i)
     float f;
   } u = {i};
   return u.f;
 }
 
+static inline vector unsigned long long
+vasuint64 (vector double f)
+{
+  union
+  {
+    vector double f;
+    vector unsigned long long i;
+  } u = {f};
+  return u.i;
+}
+
+static inline vector double
+vasdouble (vector unsigned long long i)
+{
+  union
+  {
+    vector unsigned long long i;
+    vector double f;
+  } u = {i};
+  return u.f;
+}
+
 static inline uint64_t
 asuint64 (double f)
 {
   union
   {
@@ -122,16 +166,16 @@ attribute_hidden float __math_invalidf (float);
 /* Shared between expf, exp2f and powf.  */
 #define EXP2F_TABLE_BITS 5
 #define EXP2F_POLY_ORDER 3
 extern const struct exp2f_data
 {
-  uint64_t tab[1 << EXP2F_TABLE_BITS];
-  double shift_scaled;
-  double poly[EXP2F_POLY_ORDER];
-  double shift;
-  double invln2_scaled;
-  double poly_scaled[EXP2F_POLY_ORDER];
+  long long unsigned tab[1 << EXP2F_TABLE_BITS];
+  vector double shift_scaled;
+  vector double poly[EXP2F_POLY_ORDER];
+  vector double shift;
+  vector double invln2_scaled;
+  vector double poly_scaled[EXP2F_POLY_ORDER];
 } __exp2f_data attribute_hidden;
 
 #define LOGF_TABLE_BITS 4
 #define LOGF_POLY_ORDER 4
 extern const struct logf_data
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/test-float-vlen4-wrappers.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/test-float-vlen4-wrappers.c
index b4721b1dac..37886d1a45 100644
--- a/sysdeps/powerpc/powerpc64/fpu/multiarch/test-float-vlen4-wrappers.c
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/test-float-vlen4-wrappers.c
@@ -22,6 +22,7 @@
 #define VEC_TYPE vector float
 
 VECTOR_WRAPPER (WRAPPER_NAME (cosf), _ZGVbN4v_cosf)
 VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVbN4v_sinf)
 VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVbN4v_logf)
+VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVbN4v_expf)
 VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincosf), _ZGVbN4vvv_sincosf)
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c
new file mode 100644
index 0000000000..cf3466c400
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c
@@ -0,0 +1,39 @@
+/* Single-precision math error handling.
+   Copyright (C) 2017-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "math_config_flt.h"
+/* NOINLINE prevents fenv semantics breaking optimizations.  */
+NOINLINE static float
+xflowf (uint32_t sign, float y)
+{
+  y = (sign ? -y : y) * y;
+  return y;
+}
+
+attribute_hidden float
+__math_uflowf (uint32_t sign)
+{
+  return xflowf (sign, 0x1p-95f);
+}
+
+attribute_hidden float
+__math_oflowf (uint32_t sign)
+{
+  return xflowf (sign, 0x1p97f);
+}
+
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_exp2f_data.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_exp2f_data.c
new file mode 100644
index 0000000000..b2232713a9
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_exp2f_data.c
@@ -0,0 +1,58 @@
+/* Data for exp, rearranged for vector operations
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "math_config_flt.h"
+
+#define N (1 << EXP2F_TABLE_BITS)
+
+const struct exp2f_data __exp2f_data = {
+  /* tab[i] = uint(2^(i/N)) - (i << 52-BITS)
+     used for computing 2^(k/N) for an int |k| < 150 N as
+     double(tab[k%N] + (k << 52-BITS)) */
+  .tab = {
+0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51,
+0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1,
+0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d,
+0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585,
+0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13,
+0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d,
+0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069,
+0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,
+  },
+  .shift_scaled = {
+    0x1.8p+52 / N,
+    0x1.8p+52 / N },
+  .poly = {
+  {0x1.c6af84b912394p-5, 0x1.c6af84b912394p-5},
+  {0x1.ebfce50fac4f3p-3, 0x1.ebfce50fac4f3p-3},
+  {0x1.62e42ff0c52d6p-1, 0x1.62e42ff0c52d6p-1},
+  },
+  .shift = {
+    0x1.8p+52,
+    0x1.8p+52,
+  },
+  .invln2_scaled = {
+    0x1.71547652b82fep+0 * N,
+    0x1.71547652b82fep+0 * N,
+  },
+  .poly_scaled = {
+  {0x1.c6af84b912394p-5/N/N/N, 0x1.c6af84b912394p-5/N/N/N},
+  {0x1.ebfce50fac4f3p-3/N/N, 0x1.ebfce50fac4f3p-3/N/N},
+  {0x1.62e42ff0c52d6p-1/N, 0x1.62e42ff0c52d6p-1/N},
+  },
+};
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_expf4_vsx.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_expf4_vsx.c
new file mode 100644
index 0000000000..cd04f5d8b3
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_expf4_vsx.c
@@ -0,0 +1,141 @@
+/* Single-precision vector expf(x) function.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+#include <altivec.h>
+#include <math.h>
+
+#include "math_config_flt.h"
+
+typedef vector long long unsigned v64u;
+typedef union {
+	vector unsigned u;
+	vector float f;
+	v64u l;
+	vector double d;
+} u;
+typedef union {
+	double d;
+	int64_t l;
+	unsigned u;
+	float f;
+} us;
+
+#define N (1 << EXP2F_TABLE_BITS)
+#define InvLn2N __exp2f_data.invln2_scaled
+#define T __exp2f_data.tab
+#define C __exp2f_data.poly_scaled
+
+vector float
+_ZGVbN4v_expf (vector float x)
+{
+	u res;
+	u xu;
+	xu.f = x;
+	us c88;
+	c88.f = 88.0f;
+	us inf;
+	inf.f = INFINITY;
+	us ninf;
+	ninf.f = -INFINITY;
+	vector unsigned constants = {(c88.u & 0xfff00000) << 1, ninf.u, inf.u, 0};
+	vector float constants2 = {0x1.62e42ep6f, -0x1.9fe368p6f, 0, 0};
+	vector unsigned zero = {0, 0, 0, 0};
+	vector unsigned v88 = vec_splat (constants, 0);
+	vector unsigned is_special_case = (vector unsigned) vec_cmpge (xu.u << 1, v88);
+	vector unsigned is_special_not_covered = is_special_case;
+	if (__glibc_unlikely (!vec_all_eq (is_special_case, zero))) {
+		vector unsigned inf = vec_splat (constants, 2);
+		vector unsigned is_inf_or_ninf_or_nan = (vector unsigned) vec_cmpge (xu.u, inf);
+		u xpx;
+		xpx.f = x + x;
+		res.u = xpx.u;
+		is_special_not_covered &= ~is_inf_or_ninf_or_nan;
+		vector unsigned ninf = vec_splat (constants, 1);
+		vector unsigned is_ninf = (vector unsigned) vec_cmpeq (xu.u, ninf);
+		res.u = vec_sel (res.u, zero, is_ninf);
+
+		vector float overflow_v = vec_splat (constants2, 0);
+		vector unsigned is_overflow = (vector unsigned) vec_cmpgt (xu.f, overflow_v);
+		if (__glibc_unlikely (!vec_all_eq (is_overflow, zero)))
+		{
+			// This branch is because we are generating an overflow fp flag
+			us ofu;
+			ofu.f = __math_oflowf(0);
+			vector unsigned of = {ofu.u, ofu.u, ofu.u, ofu.u};
+			res.u = vec_sel (res.u, of, is_overflow);
+			is_special_not_covered &= ~is_overflow;
+		}
+		vector float underflow_v = vec_splat (constants2, 1);
+		vector unsigned is_underflow = (vector unsigned) vec_cmplt (xu.f, underflow_v);
+		if (__glibc_unlikely (!vec_all_eq (is_underflow, zero)))
+		{
+                        // This branch is because we are generating an underflow fp flag
+                        us ufu;
+                        ufu.f = __math_uflowf(0);
+                        vector unsigned uf = {ufu.u, ufu.u, ufu.u, ufu.u};
+                        res.u = vec_sel (res.u, uf, is_underflow);
+                        is_special_not_covered &= ~is_underflow;
+		}
+	}
+
+	vector double xl = vec_unpackh (x);
+	vector double xr = vec_unpackl (x);
+	vector double zl = InvLn2N * xl;
+	vector double zr = InvLn2N * xr;
+#if TOINT_INTRINSICS
+	vector double kdl = roundtoint (zl);
+	vector double kdl = roundtoint (zl);
+	v64u kil = converttoint (zl);
+	v64u kir = converttoint (zr);
+#else
+#define SHIFT __exp2f_data.shift
+	vector double kdl = zl + SHIFT;
+	vector double kdr = zr + SHIFT;
+	u kilu;
+	kilu.d = kdl;
+	u kiru;
+	kiru.d = kdr;
+	v64u kil = kilu.l;
+	v64u kir = kiru.l;
+	kdl -= SHIFT;
+	kdr -= SHIFT;
+#endif
+	vector double rl = zl - kdl;
+	vector double rr = zr - kdr;
+
+	v64u tl = {T[kil[0] % N], T[kil[1] % N]};
+	v64u tr = {T[kir[0] % N], T[kir[1] % N]};
+	tl += (kil << (52 - EXP2F_TABLE_BITS));
+	tr += (kir << (52 - EXP2F_TABLE_BITS));
+	u slu;
+	slu.l = tl;
+	u sru;
+	sru.l = tr;
+	// This cast is obnoxious, but there is no vec_ld for double
+	zl = C[0] * rl + C[1];
+	zr = C[0] * rr + C[1];
+	vector double r2l = rl * rl;
+	vector double r2r = rr * rr;
+	vector double yl = C[2] * rl + 1;
+	vector double yr = C[2] * rr + 1;
+	yl = zl * r2l + yl;
+	yr = zr * r2r + yr;
+	yl = yl * slu.d;
+	yr = yr * sru.d;
+	vector float restmp = {(float) yl[0], (float) yl[1], (float) yr[0], (float) yr[1]};
+	return vec_sel (restmp, res.f, is_special_case & ~is_special_not_covered);
+}
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
index bb9f0beea4..63770c8da3 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
@@ -4,5 +4,6 @@ GLIBC_2.30 _ZGVbN2v_sin F
 GLIBC_2.30 _ZGVbN2vvv_sincos F
 GLIBC_2.30 _ZGVbN4v_cosf F
 GLIBC_2.30 _ZGVbN4v_logf F
 GLIBC_2.30 _ZGVbN4v_sinf F
 GLIBC_2.30 _ZGVbN4vvv_sincosf F
+GLIBC_2.30 _ZGVbN4v_expf F
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 2/2] PPC64: Add libmvec SIMD double-precision natural exponent function.
  2019-05-12  3:28 [PATCH 1/2] PPC64: Add libmvec SIMD single-precision natural exponent function Shawn Landden
@ 2019-05-12  3:28 ` Shawn Landden
  2019-05-24 21:41   ` Tulio Magno Quites Machado Filho
  2019-05-24 20:04 ` [PATCH 1/2] PPC64: Add libmvec SIMD single-precision " Tulio Magno Quites Machado Filho
  2019-05-27 17:47 ` [v2 " Shawn Landden
  2 siblings, 1 reply; 15+ messages in thread
From: Shawn Landden @ 2019-05-12  3:28 UTC (permalink / raw)
  To: libc-alpha; +Cc: Shawn Landden

[BZ #24209]

Passes all tests.

Unlike other libmvec functions, this sets the onderflow and overflow bits.
The caller can check these flags, and possibly re-run the calculations with
scalar expf to figure out what is causing the overflow or underflow.

The special-case path is not vectorized, and performs much woorse than
the scalar code.
Normalized data: 1 to 2^32 converted to double
Running 20 times over 32MiB
vector: mean 563.807107 MiB/s (sd 0.390922)
scalar: mean 226.527824 MiB/s (sd 0.077406)

Random data:
vector: mean 80.175986 MiB/s (sd 1.110948)
scalar: mean 244.738130 MiB/s (sd 0.029561)

2019-05-11 Shawn Landden <shawn@git.icu>
        * NEWS: Noted the addition of PPC64 vector exp function.
        * sysdeps/powerpc/bits/math-vector.h: Added entry for vector exp.
        * sysdeps/powerpc/powerpc64/fpu/Versions: Added vector exp entry.
        * sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile:
        (libmvec-sysdep_routines, CFLAGS-vec_d_exp2_vsx.c):
        (CFLAGS-vec_d_exp_data.c): Added build of VSX SIMD expf
        function and tests. Added vec_math_err.c to build.
        * sysdeps/powerpc/powerpc64/fpu/multiarch/(math_config.h)->(math_config_dbl.h):
	Renamed, and modified for exp.
        * sysdeps/powerpc/powerpc64/fpu/multiarch/test-double-vlen2-wrappers.c:
        Added entry for vector exp.
        * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c: New file.
        * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c: Likewise.
        * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_err.c: Likewise.
        * sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist: SIMD expf
        added.
	* sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log_data.c: Rename of math_config.h
        * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log2_vsx.c: Likewise
---
 NEWS                                          |   1 +
 sysdeps/powerpc/bits/math-vector.h            |   2 +
 sysdeps/powerpc/powerpc64/fpu/Versions        |   2 +-
 .../powerpc/powerpc64/fpu/multiarch/Makefile  |   7 +-
 .../{math_config.h => math_config_dbl.h}      |  19 +-
 .../multiarch/test-double-vlen2-wrappers.c    |   1 +
 .../powerpc64/fpu/multiarch/vec_d_exp2_vsx.c  | 201 ++++++++++++++++++
 .../powerpc64/fpu/multiarch/vec_d_exp_data.c  | 200 +++++++++++++++++
 .../powerpc64/fpu/multiarch/vec_d_log2_vsx.c  |   2 +-
 .../powerpc64/fpu/multiarch/vec_d_log_data.c  |   2 +-
 .../powerpc64/fpu/multiarch/vec_math_err.c    |  41 ++++
 .../linux/powerpc/powerpc64/libmvec.abilist   |   1 +
 12 files changed, 467 insertions(+), 12 deletions(-)
 rename sysdeps/powerpc/powerpc64/fpu/multiarch/{math_config.h => math_config_dbl.h} (94%)
 create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c
 create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c
 create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_err.c

diff --git a/NEWS b/NEWS
index 26f5c67f75..a33af8c316 100644
--- a/NEWS
+++ b/NEWS
@@ -18,10 +18,11 @@ Major new features:
   - single-precision logarithm: logf
   - double-precision sine: sin
   - single-precision sine: sinf
   - double-precision sincos: sincos
   - single-precision sincos: sincosf
+  - double-precision exp: exp
   - single-precision exp: expf
 
   GCC support for auto-vectorization of functions on PPC64 is not yet
   available. Until that is done, the new vector math functions are
   inaccessible to applications.
diff --git a/sysdeps/powerpc/bits/math-vector.h b/sysdeps/powerpc/bits/math-vector.h
index bbc9db0dd9..e7846b4bfa 100644
--- a/sysdeps/powerpc/bits/math-vector.h
+++ b/sysdeps/powerpc/bits/math-vector.h
@@ -44,8 +44,10 @@
 #  define __DECL_SIMD_sincos __DECL_SIMD_PPC64
 #  undef __DECL_SIMD_sincosf
 #  define __DECL_SIMD_sincosf __DECL_SIMD_PPC64
 #  undef __DECL_SIMD_expf
 #  define __DECL_SIMD_expf __DECL_SIMD_PPC64
+#  undef __DECL_SIMD_exp
+#  define __DECL_SIMD_exp __DECL_SIMD_PPC64
 
 # endif
 #endif
diff --git a/sysdeps/powerpc/powerpc64/fpu/Versions b/sysdeps/powerpc/powerpc64/fpu/Versions
index 7c45e76074..361edee9e7 100644
--- a/sysdeps/powerpc/powerpc64/fpu/Versions
+++ b/sysdeps/powerpc/powerpc64/fpu/Versions
@@ -1,7 +1,7 @@
 libmvec {
   GLIBC_2.30 {
     _ZGVbN2v_cos; _ZGVbN4v_cosf; _ZGVbN2v_sin; _ZGVbN4v_sinf;
     _ZGVbN2v_log; _ZGVbN4v_logf; _ZGVbN2vvv_sincos; _ZGVbN4vvv_sincosf;
-    _ZGVbN4v_expf;
+    _ZGVbN4v_expf; _ZGVbN2v_exp;
   }
 }
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
index ad3c29b1ab..aceaedee0a 100644
--- a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
@@ -47,11 +47,12 @@ ifeq ($(subdir),mathvec)
 libmvec-sysdep_routines += vec_d_cos2_vsx vec_s_cosf4_vsx \
 			   vec_d_sin2_vsx vec_s_sinf4_vsx \
 			   vec_d_log2_vsx vec_d_log_data \
 			   vec_s_logf4_vsx vec_s_logf_data \
 			   vec_s_expf4_vsx vec_s_exp2f_data \
-			   vec_math_errf \
+			   vec_math_errf vec_math_err \
+			   vec_d_exp2_vsx vec_d_exp_data \
 			   vec_d_sincos2_vsx vec_s_sincosf4_vsx
 CFLAGS-vec_d_cos2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
 CFLAGS-vec_d_log2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
 CFLAGS-vec_d_log_data.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
 CFLAGS-vec_s_cosf4_vsx.c += -mabi=altivec -maltivec -mvsx
@@ -61,18 +62,20 @@ CFLAGS-vec_d_sin2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
 CFLAGS-vec_s_sinf4_vsx.c += -mabi=altivec -maltivec -mvsx
 CFLAGS-vec_d_sincos2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
 CFLAGS-vec_s_sincosf4_vsx.c += -mabi=altivec -maltivec -mvsx
 CFLAGS-vec_s_expf4_vsx.c += -mabi=altivec -maltivec -mvsx
 CFLAGS-vec_s_exp2f_data.c += -mabi=altivec -maltivec -mvsx
+CFLAGS-vec_d_exp2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
+CFLAGS-vec_d_exp_data.c += -mabi=altivec -maltivec -mvsx
 endif
 
 # Variables for libmvec tests.
 ifeq ($(subdir),math)
 ifeq ($(build-mathvec),yes)
 libmvec-tests += double-vlen2 float-vlen4
 
-double-vlen2-funcs = cos sin sincos log
+double-vlen2-funcs = cos sin sincos log exp
 float-vlen4-funcs = cos sin sincos log exp
 
 double-vlen2-arch-ext-cflags = -mabi=altivec -maltivec -mvsx -DREQUIRE_VSX
 float-vlen4-arch-ext-cflags = -mabi=altivec -maltivec -mvsx -DREQUIRE_VSX
 
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config.h b/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_dbl.h
similarity index 94%
rename from sysdeps/powerpc/powerpc64/fpu/multiarch/math_config.h
rename to sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_dbl.h
index bd37906ecb..039ff6adeb 100644
--- a/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config.h
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_dbl.h
@@ -17,15 +17,20 @@
    <http://www.gnu.org/licenses/>.  */
 
 #ifndef _MATH_CONFIG_H
 #define _MATH_CONFIG_H
 
+#ifndef EXP_USE_TOINT_NARROW
+#define EXP_USE_TOINT_NARROW 0
+#endif
+
 #include <math.h>
 #include <math_private.h>
 #include <nan-high-order-bit.h>
 #include <stdint.h>
 #include <altivec.h>
+#include <math-narrow-eval.h>
 
 #ifndef WANT_ROUNDING
 /* Correct special case results in non-nearest rounding modes.  */
 # define WANT_ROUNDING 1
 #endif
@@ -132,17 +137,17 @@ check_uflow (double x)
 #define EXP_TABLE_BITS 7
 #define EXP_POLY_ORDER 5
 #define EXP2_POLY_ORDER 5
 extern const struct exp_data
 {
-  double invln2N;
-  double shift;
-  double negln2hiN;
-  double negln2loN;
-  double poly[4]; /* Last four coefficients.  */
-  double exp2_shift;
-  double exp2_poly[EXP2_POLY_ORDER];
+  vector double invln2N;
+  vector double shift;
+  vector double negln2hiN;
+  vector double negln2loN;
+  vector double poly[4]; /* Last four coefficients.  */
+  vector double exp2_shift;
+  vector double exp2_poly[EXP2_POLY_ORDER];
   uint64_t tab[2*(1 << EXP_TABLE_BITS)];
 } __exp_data attribute_hidden;
 
 #define LOG_TABLE_BITS 7
 #define LOG_POLY_ORDER 6
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/test-double-vlen2-wrappers.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/test-double-vlen2-wrappers.c
index c9e2dbfdd0..f368e47947 100644
--- a/sysdeps/powerpc/powerpc64/fpu/multiarch/test-double-vlen2-wrappers.c
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/test-double-vlen2-wrappers.c
@@ -22,6 +22,7 @@
 #define VEC_TYPE vector double
 
 VECTOR_WRAPPER (WRAPPER_NAME (cos), _ZGVbN2v_cos)
 VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVbN2v_sin)
 VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVbN2v_log)
+VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVbN2v_exp)
 VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincos), _ZGVbN2vvv_sincos)
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c
new file mode 100644
index 0000000000..6a9dfb4492
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c
@@ -0,0 +1,201 @@
+/* Double-precision vector exp(x) function.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+#include <altivec.h>
+#include <math.h>
+#include <fpu/math-barriers.h>
+#include "math_config_dbl.h"
+
+typedef vector long long unsigned v64u;
+typedef union {
+  vector unsigned u;
+  vector float f;
+  v64u l;
+  vector double d;
+} u;
+typedef union {
+  double d;
+  uint64_t l;
+  unsigned u;
+  float f;
+} us;
+
+#define N (1 << EXP_TABLE_BITS)
+#define InvLn2N __exp_data.invln2N
+#define NegLn2hiN __exp_data.negln2hiN
+#define NegLn2loN __exp_data.negln2loN
+#define Shift __exp_data.shift
+#define T __exp_data.tab
+#define C2 __exp_data.poly[5 - EXP_POLY_ORDER]
+#define C3 __exp_data.poly[6 - EXP_POLY_ORDER]
+#define C4 __exp_data.poly[7 - EXP_POLY_ORDER]
+#define C5 __exp_data.poly[8 - EXP_POLY_ORDER]
+
+/* Handle cases that may overflow or underflow when computing the result that
+   is scale*(1+TMP) without intermediate rounding.  The bit representation of
+   scale is in SBITS, however it has a computed exponent that may have
+   overflown into the sign bit so that needs to be adjusted before using it as
+   a double.  (int32_t)KI is the k used in the argument reduction and exponent
+   adjustment of scale, positive k here means the result may overflow and
+   negative k means the result may underflow.  */
+static inline double
+specialcase (double_t tmp, uint64_t sbits, uint64_t ki)
+{
+  double_t scale, y;
+
+  if ((ki & 0x80000000) == 0)
+    {
+      /* k > 0, the exponent of scale might have overflowed by <= 460.  */
+      sbits -= 1009ull << 52;
+      scale = asdouble (sbits);
+      y = 0x1p1009 * (scale + scale * tmp);
+      return y;
+    }
+  /* k < 0, need special care in the subnormal range.  */
+  sbits += 1022ull << 52;
+  scale = asdouble (sbits);
+  y = scale + scale * tmp;
+  if (y < 1.0)
+    {
+      /* Round y to the right precision before scaling it into the subnormal
+	 range to avoid double rounding that can cause 0.5+E/2 ulp error where
+	 E is the worst-case ulp error outside the subnormal range.  So this
+	 is only useful if the goal is better than 1 ulp worst-case error.  */
+      double_t hi, lo;
+      lo = scale - y + scale * tmp;
+      hi = 1.0 + y;
+      lo = 1.0 - hi + y + lo;
+      y = math_narrow_eval (hi + lo) - 1.0;
+      /* Avoid -0.0 with downward rounding.  */
+      if (WANT_ROUNDING && y == 0.0)
+	y = 0.0;
+      /* The underflow exception needs to be signaled explicitly.  */
+      math_force_eval (math_opt_barrier (0x1p-1022) * 0x1p-1022);
+    }
+  y = 0x1p-1022 * y;
+  return y;
+}
+
+/* Top 12 bits of a double (sign and exponent bits).  */
+static inline uint32_t top12(double x)
+{
+	return asuint64(x) >> 52;
+}
+
+vector double
+_ZGVbN2v_exp (vector double x)
+{
+	v64u abstop, is_special_case, is_special_case2, ki, idx, top, sbits;
+	vector double kd, z, r, r2, scale, tail, tmp;
+	u t, t2;
+	u res;
+	u xu;
+	xu.d = x;
+	v64u zero = {0, 0};
+	u c512;
+	vector double load = {512.0, 512.0};
+	c512.d = load;
+	u normalize;
+	vector double load2 = {0x1p-54, 0x1p-54};
+	normalize.d = load2;
+	c512.l &= 0x7ff0000000000000;
+	normalize.l &= 0x7ff0000000000000;
+	abstop = xu.l & 0x7ff0000000000000;
+	is_special_case = (v64u) vec_cmpge (xu.l - normalize.l, c512.l - normalize.l);
+	if (__glibc_unlikely (!vec_all_eq (is_special_case, zero)))
+	{
+		for (int m=0;m<2;++m)
+		{
+			double v;
+			uint32_t abstops = abstop[m] >> 52;
+			if (!is_special_case[m])
+				continue;
+			v = x[m];
+			if (abstops - top12(0x1p-54) >= 0x80000000)
+				/* Avoid spurious underflow for tiny x.  */
+				/* Note: 0 is common input.  */
+				res.d[m] = WANT_ROUNDING ? 1.0 + v : 1.0;
+			if (abstops >= top12(1024.0)) {
+				if (asuint64(v) == asuint64(-INFINITY))
+					res.d[m] = 0.0;
+				if (abstops >= top12(INFINITY))
+					res.d[m] = 1.0 + v;
+				if (asuint64(v) >> 63)
+					res.d[m] = __math_uflow(0);
+				else
+					res.d[m] = __math_oflow(0);
+			} else {
+	 			/* Large x is special cased below.  */
+ 				abstop[m] = 0;
+				is_special_case[m] = 0;
+			}
+		}
+	}
+	/* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)].  */
+	/* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N].  */
+	z = InvLn2N * x;
+#if TOINT_INTRINSICS
+	kd = roundtoint(z);
+	ki = converttoint(z);
+#elif EXP_USE_TOINT_NARROW
+	/* z - kd is in [-0.5-2^-16, 0.5] in all rounding modes.  */
+	kd = z + Shift;
+	u kdu;
+	kdu.d = kd;
+	kdu.l >>= 16;
+	kd = (vector double)(vector int)kdu.l;
+#else
+	/* z - kd is in [-1, 1] in non-nearest rounding modes.  */
+	kd = z + Shift;
+	u kdu;
+	kdu.d = kd;
+	kd -= Shift;
+#endif
+	r = x + kd * NegLn2hiN + kd * NegLn2loN;
+	/* 2^(k/N) ~= scale * (1 + tail).  */
+	idx = 2 * (kdu.l % N);
+	top = ki << (52 - EXP_TABLE_BITS);
+	v64u load3 = {T[idx[0]], T[idx[1]]};
+	t.l = load3;
+	v64u load4 = {T[idx[0] + 1], T[idx[1] + 1]};
+	t2.l = load4;
+	tail = t.d;
+	/* This is only a valid scale when -1023*N < k < 1024*N.  */
+	sbits = t2.l + top;
+	/* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (tail + exp(r) - 1).  */
+	/* Evaluation is optimized assuming superscalar pipelined execution.  */
+	r2 = r * r;
+	/* Without fma the worst case error is 0.25/N ulp larger.  */
+	/* Worst case error is less than 0.5+1.11/N+(abs poly error * 2^53) ulp.  */
+	tmp = tail + r + r2 * (C2 + r * C3) + r2 * r2 * (C4 + r * C5);
+	is_special_case2 = (v64u) vec_cmpeq (abstop, zero);
+	if (__glibc_unlikely (!vec_all_eq (is_special_case2, zero)))
+	{
+		u sc2;
+		for (int m=0;m<2;++m)
+		{
+			sc2.d[m] = specialcase(tmp[m], sbits[m], ki[m]);
+		}
+		res.l = vec_sel (res.l, sc2.l, is_special_case2);
+	}
+	u sbitsu;
+	sbitsu.l = sbits;
+	scale = sbitsu.d;
+	u res2;
+	res2.d = scale + scale * tmp;
+	/* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
+	   is no spurious underflow here even without fma.  */
+	res.l = vec_sel (res.l, res2.l, ~is_special_case & ~is_special_case2);
+	return res.d;
+}
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c
new file mode 100644
index 0000000000..650fdee75f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c
@@ -0,0 +1,200 @@
+/*
+ * Shared data between exp, exp2 and pow.
+ *
+ * Copyright (c) 2018, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "math_config_dbl.h"
+
+#define N (1 << EXP_TABLE_BITS)
+
+const struct exp_data __exp_data = {
+// N/ln2
+.invln2N = {
+  0x1.71547652b82fep0 * N,
+  0x1.71547652b82fep0 * N,
+},
+// -ln2/N
+.negln2hiN = {
+  -0x1.62e42fefa0000p-8,
+  -0x1.62e42fefa0000p-8,
+},
+.negln2loN = {
+  -0x1.cf79abc9e3b3ap-47,
+  -0x1.cf79abc9e3b3ap-47,
+},
+// Used for rounding when !TOINT_INTRINSICS
+#if EXP_USE_TOINT_NARROW
+.shift = {
+  0x1800000000.8p0,
+  0x1800000000.8p0,
+},
+#else
+.shift = {
+  0x1.8p52,
+  0x1.8p52,
+},
+#endif
+// exp polynomial coefficients.
+.poly = {
+// abs error: 1.555*2^-66
+// ulp error: 0.509 (0.511 without fma)
+// if |x| < ln2/256+eps
+// abs error if |x| < ln2/256+0x1p-15: 1.09*2^-65
+// abs error if |x| < ln2/128: 1.7145*2^-56
+{0x1.ffffffffffdbdp-2, 0x1.ffffffffffdbdp-2},
+{0x1.555555555543cp-3, 0x1.555555555543cp-3},
+{0x1.55555cf172b91p-5, 0x1.55555cf172b91p-5},
+{0x1.1111167a4d017p-7, 0x1.1111167a4d017p-7},
+},
+.exp2_shift = {
+  0x1.8p52 / N,
+  0x1.8p52 / N,
+},
+// exp2 polynomial coefficients.
+.exp2_poly = {
+// abs error: 1.2195*2^-65
+// ulp error: 0.507 (0.511 without fma)
+// if |x| < 1/256
+// abs error if |x| < 1/128: 1.9941*2^-56
+{0x1.62e42fefa39efp-1, 0x1.62e42fefa39efp-1},
+{0x1.ebfbdff82c424p-3, 0x1.ebfbdff82c424p-3},
+{0x1.c6b08d70cf4b5p-5, 0x1.c6b08d70cf4b5p-5},
+{0x1.3b2abd24650ccp-7, 0x1.3b2abd24650ccp-7},
+{0x1.5d7e09b4e3a84p-10, 0x1.5d7e09b4e3a84p-10},
+},
+// 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N)
+// tab[2*k] = asuint64(T[k])
+// tab[2*k+1] = asuint64(H[k]) - (k << 52)/N
+.tab = {
+0x0, 0x3ff0000000000000,
+0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335,
+0xbc7160139cd8dc5d, 0x3fefec9a3e778061,
+0xbc905e7a108766d1, 0x3fefe315e86e7f85,
+0x3c8cd2523567f613, 0x3fefd9b0d3158574,
+0xbc8bce8023f98efa, 0x3fefd06b29ddf6de,
+0x3c60f74e61e6c861, 0x3fefc74518759bc8,
+0x3c90a3e45b33d399, 0x3fefbe3ecac6f383,
+0x3c979aa65d837b6d, 0x3fefb5586cf9890f,
+0x3c8eb51a92fdeffc, 0x3fefac922b7247f7,
+0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2,
+0xbc6a033489906e0b, 0x3fef9b66affed31b,
+0xbc9556522a2fbd0e, 0x3fef9301d0125b51,
+0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc,
+0xbc91c923b9d5f416, 0x3fef829aaea92de0,
+0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51,
+0xbc801b15eaa59348, 0x3fef72b83c7d517b,
+0xbc8f1ff055de323d, 0x3fef6af9388c8dea,
+0x3c8b898c3f1353bf, 0x3fef635beb6fcb75,
+0xbc96d99c7611eb26, 0x3fef5be084045cd4,
+0x3c9aecf73e3a2f60, 0x3fef54873168b9aa,
+0xbc8fe782cb86389d, 0x3fef4d5022fcd91d,
+0x3c8a6f4144a6c38d, 0x3fef463b88628cd6,
+0x3c807a05b0e4047d, 0x3fef3f49917ddc96,
+0x3c968efde3a8a894, 0x3fef387a6e756238,
+0x3c875e18f274487d, 0x3fef31ce4fb2a63f,
+0x3c80472b981fe7f2, 0x3fef2b4565e27cdd,
+0xbc96b87b3f71085e, 0x3fef24dfe1f56381,
+0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1,
+0xbc3d219b1a6fbffa, 0x3fef187fd0dad990,
+0x3c8b3782720c0ab4, 0x3fef1285a6e4030b,
+0x3c6e149289cecb8f, 0x3fef0cafa93e2f56,
+0x3c834d754db0abb6, 0x3fef06fe0a31b715,
+0x3c864201e2ac744c, 0x3fef0170fc4cd831,
+0x3c8fdd395dd3f84a, 0x3feefc08b26416ff,
+0xbc86a3803b8e5b04, 0x3feef6c55f929ff1,
+0xbc924aedcc4b5068, 0x3feef1a7373aa9cb,
+0xbc9907f81b512d8e, 0x3feeecae6d05d866,
+0xbc71d1e83e9436d2, 0x3feee7db34e59ff7,
+0xbc991919b3ce1b15, 0x3feee32dc313a8e5,
+0x3c859f48a72a4c6d, 0x3feedea64c123422,
+0xbc9312607a28698a, 0x3feeda4504ac801c,
+0xbc58a78f4817895b, 0x3feed60a21f72e2a,
+0xbc7c2c9b67499a1b, 0x3feed1f5d950a897,
+0x3c4363ed60c2ac11, 0x3feece086061892d,
+0x3c9666093b0664ef, 0x3feeca41ed1d0057,
+0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0,
+0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de,
+0x3c7690cebb7aafb0, 0x3feebfdad5362a27,
+0x3c931dbdeb54e077, 0x3feebcb299fddd0d,
+0xbc8f94340071a38e, 0x3feeb9b2769d2ca7,
+0xbc87deccdc93a349, 0x3feeb6daa2cf6642,
+0xbc78dec6bd0f385f, 0x3feeb42b569d4f82,
+0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f,
+0x3c93350518fdd78e, 0x3feeaf4736b527da,
+0x3c7b98b72f8a9b05, 0x3feead12d497c7fd,
+0x3c9063e1e21c5409, 0x3feeab07dd485429,
+0x3c34c7855019c6ea, 0x3feea9268a5946b7,
+0x3c9432e62b64c035, 0x3feea76f15ad2148,
+0xbc8ce44a6199769f, 0x3feea5e1b976dc09,
+0xbc8c33c53bef4da8, 0x3feea47eb03a5585,
+0xbc845378892be9ae, 0x3feea34634ccc320,
+0xbc93cedd78565858, 0x3feea23882552225,
+0x3c5710aa807e1964, 0x3feea155d44ca973,
+0xbc93b3efbf5e2228, 0x3feea09e667f3bcd,
+0xbc6a12ad8734b982, 0x3feea012750bdabf,
+0xbc6367efb86da9ee, 0x3fee9fb23c651a2f,
+0xbc80dc3d54e08851, 0x3fee9f7df9519484,
+0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74,
+0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174,
+0xbc8619321e55e68a, 0x3fee9feb564267c9,
+0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f,
+0xbc7b32dcb94da51d, 0x3feea11473eb0187,
+0x3c94ecfd5467c06b, 0x3feea1ed0130c132,
+0x3c65ebe1abd66c55, 0x3feea2f336cf4e62,
+0xbc88a1c52fb3cf42, 0x3feea427543e1a12,
+0xbc9369b6f13b3734, 0x3feea589994cce13,
+0xbc805e843a19ff1e, 0x3feea71a4623c7ad,
+0xbc94d450d872576e, 0x3feea8d99b4492ed,
+0x3c90ad675b0e8a00, 0x3feeaac7d98a6699,
+0x3c8db72fc1f0eab4, 0x3feeace5422aa0db,
+0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c,
+0x3c7bf68359f35f44, 0x3feeb1ae99157736,
+0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6,
+0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5,
+0xbc6c23f97c90b959, 0x3feeba44cbc8520f,
+0xbc92434322f4f9aa, 0x3feebd829fde4e50,
+0xbc85ca6cd7668e4b, 0x3feec0f170ca07ba,
+0x3c71affc2b91ce27, 0x3feec49182a3f090,
+0x3c6dd235e10a73bb, 0x3feec86319e32323,
+0xbc87c50422622263, 0x3feecc667b5de565,
+0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33,
+0xbc91bbd1d3bcbb15, 0x3feed503b23e255d,
+0x3c90cc319cee31d2, 0x3feed99e1330b358,
+0x3c8469846e735ab3, 0x3feede6b5579fdbf,
+0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a,
+0x3c8c1a7792cb3387, 0x3feee89f995ad3ad,
+0xbc907b8f4ad1d9fa, 0x3feeee07298db666,
+0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb,
+0xbc90a40e3da6f640, 0x3feef9728de5593a,
+0xbc68d6f438ad9334, 0x3feeff76f2fb5e47,
+0xbc91eee26b588a35, 0x3fef05b030a1064a,
+0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2,
+0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09,
+0x3c736eae30af0cb3, 0x3fef199bdd85529c,
+0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a,
+0x3c84e08fd10959ac, 0x3fef27f12e57d14b,
+0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5,
+0x3c676b2c6c921968, 0x3fef3720dcef9069,
+0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa,
+0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c,
+0xbc900dae3875a949, 0x3fef4f87080d89f2,
+0x3c74a385a63d07a7, 0x3fef5818dcfba487,
+0xbc82919e2040220f, 0x3fef60e316c98398,
+0x3c8e5a50d5c192ac, 0x3fef69e603db3285,
+0x3c843a59ac016b4b, 0x3fef7321f301b460,
+0xbc82d52107b43e1f, 0x3fef7c97337b9b5f,
+0xbc892ab93b470dc9, 0x3fef864614f5a129,
+0x3c74b604603a88d3, 0x3fef902ee78b3ff6,
+0x3c83c5ec519d7271, 0x3fef9a51fbc74c83,
+0xbc8ff7128fd391f0, 0x3fefa4afa2a490da,
+0xbc8dae98e223747d, 0x3fefaf482d8e67f1,
+0x3c8ec3bc41aa2008, 0x3fefba1bee615a27,
+0x3c842b94c3a9eb32, 0x3fefc52b376bba97,
+0x3c8a64a931d185ee, 0x3fefd0765b6e4540,
+0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14,
+0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8,
+0x3c5305c14160cc89, 0x3feff3c22b8f71f1,
+},
+};
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log2_vsx.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log2_vsx.c
index d7588a7929..3a9fb05aee 100644
--- a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log2_vsx.c
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log2_vsx.c
@@ -19,11 +19,11 @@
 #include <math.h>
 #include <stdint.h>
 #include <math/math-svid-compat.h>
 #include <shlib-compat.h>
 #include <libm-alias-double.h>
-#include "math_config.h"
+#include "math_config_dbl.h"
 #include <altivec.h>
 
 #define T __log_data.tab
 #define T2 __log_data.tab2
 #define B __log_data.poly1
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log_data.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log_data.c
index 2fe569576c..f1299404c4 100644
--- a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log_data.c
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log_data.c
@@ -14,11 +14,11 @@
 
    You should have received a copy of the GNU Lesser General Public
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include "math_config.h"
+#include "math_config_dbl.h"
 
 #define N (1 << LOG_TABLE_BITS)
 
 const struct log_data __log_data = {
 .ln2hi = {0x1.62e42fefa3800p-1, 0x1.62e42fefa3800p-1},
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_err.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_err.c
new file mode 100644
index 0000000000..2f88b81ab2
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_err.c
@@ -0,0 +1,41 @@
+/* Double-precision math error handling.
+   Copyright (C) 2017-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fpu/math-barriers.h>
+#include "math_config_dbl.h"
+
+/* NOINLINE reduces code size.  */
+NOINLINE static double
+xflow (uint32_t sign, double y)
+{
+  y = math_opt_barrier (sign ? -y : y) * y;
+  return y;
+}
+
+attribute_hidden double
+__math_uflow (uint32_t sign)
+{
+  return xflow (sign, 0x1p-767);
+}
+
+attribute_hidden double
+__math_oflow (uint32_t sign)
+{
+  return xflow (sign, 0x1p769);
+}
+
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
index 63770c8da3..5e662ffa70 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
@@ -5,5 +5,6 @@ GLIBC_2.30 _ZGVbN2vvv_sincos F
 GLIBC_2.30 _ZGVbN4v_cosf F
 GLIBC_2.30 _ZGVbN4v_logf F
 GLIBC_2.30 _ZGVbN4v_sinf F
 GLIBC_2.30 _ZGVbN4vvv_sincosf F
 GLIBC_2.30 _ZGVbN4v_expf F
+GLIBC_2.30 _ZGVbN2v_exp F
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] PPC64: Add libmvec SIMD single-precision natural exponent function.
  2019-05-12  3:28 [PATCH 1/2] PPC64: Add libmvec SIMD single-precision natural exponent function Shawn Landden
  2019-05-12  3:28 ` [PATCH 2/2] PPC64: Add libmvec SIMD double-precision " Shawn Landden
@ 2019-05-24 20:04 ` Tulio Magno Quites Machado Filho
  2019-05-27 17:47 ` [v2 " Shawn Landden
  2 siblings, 0 replies; 15+ messages in thread
From: Tulio Magno Quites Machado Filho @ 2019-05-24 20:04 UTC (permalink / raw)
  To: Shawn Landden, libc-alpha

Hi Shawn,

You patch looks very good to me, but there are a couple of things we need to
fix before merging it.

Shawn Landden <shawn@git.icu> writes:

> [BZ #24209]
>
> Passes all tests.
>
> Unlike other libmvec functions, this sets the onderflow and overflow bits.

s/onderflow/underflow/

> The caller can check these flags, and possibly re-run the calculations with
> scalar expf to figure out what is causing the overflow or underflow.
>
> Suprisingly the special-case path performs as well as the normal path.
> (both of which are vectorized)
> Running 20 times over 32MiB
> vector: mean 432.263032 MiB/s (sd 0.486733)
> scalar: mean 178.646197 MiB/s (sd 0.050013)
>
> 2019-05-11 Shawn Landden <shawn@git.icu>

2 spaces ---^-------------^

>         * NEWS: Noted the addition of PPC64 vector expf function.

Indent this line (and the following) with a tab.

More information about the GNU ChangeLog format:
https://sourceware.org/glibc/wiki/Contribution%20checklist#Properly_Formatted_GNU_ChangeLog

> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c
> new file mode 100644
> index 0000000000..cf3466c400
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c
> @@ -0,0 +1,39 @@
> +/* Single-precision math error handling.

s/math/vector math/

> +   Copyright (C) 2017-2019 Free Software Foundation, Inc.

Copyright 2019.

> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_expf4_vsx.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_expf4_vsx.c
> new file mode 100644
> index 0000000000..cd04f5d8b3
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_expf4_vsx.c
> @@ -0,0 +1,141 @@
> +/* Single-precision vector expf(x) function.
> +   Copyright (C) 2019 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +#include <altivec.h>
> +#include <math.h>
> +
> +#include "math_config_flt.h"
> +
> +typedef vector long long unsigned v64u;
> +typedef union {
> +	vector unsigned u;

This file does not have the correct indentation, e.g. this line should have
been indented with 2 spaces.
If you ever get 8 spaces together, they should be replaced with a tab.

It also has many lines that are too long (80 characters or more).

> +vector float
> +_ZGVbN4v_expf (vector float x)
> +{
> +	u res;
> +	u xu;
> +	xu.f = x;
> +	us c88;
> +	c88.f = 88.0f;
> +	us inf;
> +	inf.f = INFINITY;
> +	us ninf;
> +	ninf.f = -INFINITY;
> +	vector unsigned constants = {(c88.u & 0xfff00000) << 1, ninf.u, inf.u, 0};
> +	vector float constants2 = {0x1.62e42ep6f, -0x1.9fe368p6f, 0, 0};
> +	vector unsigned zero = {0, 0, 0, 0};
> +	vector unsigned v88 = vec_splat (constants, 0);
> +	vector unsigned is_special_case = (vector unsigned) vec_cmpge (xu.u << 1, v88);

This line is too long.

> +	vector double xl = vec_unpackh (x);
> +	vector double xr = vec_unpackl (x);

These built-ins (returning vector double) are only available on GCC >= 8.
Meanwhile, glibc supports GCC >= 6.2.

Thanks!

-- 
Tulio Magno

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/2] PPC64: Add libmvec SIMD double-precision natural exponent function.
  2019-05-12  3:28 ` [PATCH 2/2] PPC64: Add libmvec SIMD double-precision " Shawn Landden
@ 2019-05-24 21:41   ` Tulio Magno Quites Machado Filho
       [not found]     ` <CA+49okpEUryebPwnwJha-wJ49=w2qo5r_+JUWvCD-e495UDH5g@mail.gmail.com>
  0 siblings, 1 reply; 15+ messages in thread
From: Tulio Magno Quites Machado Filho @ 2019-05-24 21:41 UTC (permalink / raw)
  To: Shawn Landden, libc-alpha

Hi Shawn,

We also have a couple of small issues in this patch as well as 2 testcases
failing.

I'm also raising a concern here based on the copyright of one of the files.
I don't know how to proceed and we'll need help from the project stewards.


Shawn Landden <shawn@git.icu> writes:

> [BZ #24209]
>
> Passes all tests.
>
> Unlike other libmvec functions, this sets the onderflow and overflow bits.

s/onderflow/underflow/

> The caller can check these flags, and possibly re-run the calculations with
> scalar expf to figure out what is causing the overflow or underflow.
>
> The special-case path is not vectorized, and performs much woorse than
> the scalar code.
> Normalized data: 1 to 2^32 converted to double
> Running 20 times over 32MiB
> vector: mean 563.807107 MiB/s (sd 0.390922)
> scalar: mean 226.527824 MiB/s (sd 0.077406)
>
> Random data:
> vector: mean 80.175986 MiB/s (sd 1.110948)
> scalar: mean 244.738130 MiB/s (sd 0.029561)

After applying this patch, I started seeing many errors from
math/test-double-vlen2-exp, e.g.:

testing double (vector length 2)
Failure: Test: exp_vlen2 (qNaN)
Result:
 is:          inf   inf
 should be:  qNaN
Failure: Test: exp_vlen2 (-qNaN)
Result:
 is:          0.0000000000000000e+00   0x0.0000000000000p+0
 should be:  qNaN
Failure: Test: exp_vlen2 (sNaN)
Result:
 is:          inf   inf
 should be:  qNaN
Failure: Test: exp_vlen2 (-sNaN)
Result:
 is:          0.0000000000000000e+00   0x0.0000000000000p+0
 should be:  qNaN
Failure: Test: exp_vlen2 (-0)
Result:
 is:          1.1000000000000001e+00   0x1.199999999999ap+0
 should be:   1.0000000000000000e+00   0x1.0000000000000p+0
 difference:  1.0000000000000009e-01   0x1.99999999999a0p-4
 ulp       :  450359962737050.0000
 max.ulp   :  0.0000
...

I've seen it both with gcc 8.3 and 9.1.

> 2019-05-11 Shawn Landden <shawn@git.icu>
>         * NEWS: Noted the addition of PPC64 vector exp function.

Likewise, this ChangeLog entry needs rework.
Previous comments apply here too.

>         * sysdeps/powerpc/powerpc64/fpu/multiarch/(math_config.h)->(math_config_dbl.h):
> 	Renamed, and modified for exp.

Please use this instead:

	* sysdeps/powerpc/powerpc64/fpu/multiarch/math_config.h: Renamed as...
	* sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_dbl.h: ... and
	modified for exp.

> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config.h b/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_dbl.h
> similarity index 94%
> rename from sysdeps/powerpc/powerpc64/fpu/multiarch/math_config.h
> rename to sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_dbl.h
> index bd37906ecb..039ff6adeb 100644
> --- a/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config.h
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_dbl.h
> @@ -17,15 +17,20 @@
>     <http://www.gnu.org/licenses/>.  */
>  
>  #ifndef _MATH_CONFIG_H
>  #define _MATH_CONFIG_H
>  
> +#ifndef EXP_USE_TOINT_NARROW
> +#define EXP_USE_TOINT_NARROW 0
> +#endif

When is EXP_USE_TOINT_NARROW != 0?
We need source code comments explaining what this macro is doing

> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c
> new file mode 100644
> index 0000000000..6a9dfb4492
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c
>...
> +
> +/* Top 12 bits of a double (sign and exponent bits).  */
> +static inline uint32_t top12(double x)

Wrong coding style:

static inline uint32_t
top12 (double x)

> +{
> +	return asuint64(x) >> 52;

The indentation style changed from this line forward.

> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c
> new file mode 100644
> index 0000000000..650fdee75f
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c
> @@ -0,0 +1,200 @@
> +/*
> + * Shared data between exp, exp2 and pow.
> + *
> + * Copyright (c) 2018, Arm Limited.
> + * SPDX-License-Identifier: MIT
> + */

Do you have the approval from the copyright holder to contribute this file?

I don't know if glibc can accept this file as-is.
We need to clarify this.

> diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
> index 63770c8da3..5e662ffa70 100644
> --- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
> +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
> @@ -5,5 +5,6 @@ GLIBC_2.30 _ZGVbN2vvv_sincos F
>  GLIBC_2.30 _ZGVbN4v_cosf F
>  GLIBC_2.30 _ZGVbN4v_logf F
>  GLIBC_2.30 _ZGVbN4v_sinf F
>  GLIBC_2.30 _ZGVbN4vvv_sincosf F
>  GLIBC_2.30 _ZGVbN4v_expf F
> +GLIBC_2.30 _ZGVbN2v_exp F

They have to be properly sorted, unless we get an error from
mathvec/check-abi-libmvec:

@@ -1,0 +2 @@ GLIBC_2.30 _ZGVbN2v_cos F
+GLIBC_2.30 _ZGVbN2v_exp F
@@ -5,0 +7 @@ GLIBC_2.30 _ZGVbN4v_cosf F
+GLIBC_2.30 _ZGVbN4v_expf F
@@ -9,2 +10,0 @@ GLIBC_2.30 _ZGVbN4vvv_sincosf F
-GLIBC_2.30 _ZGVbN4v_expf F
-GLIBC_2.30 _ZGVbN2v_exp F


-- 
Tulio Magno

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/2] PPC64: Add libmvec SIMD double-precision natural exponent function.
       [not found]     ` <CA+49okpEUryebPwnwJha-wJ49=w2qo5r_+JUWvCD-e495UDH5g@mail.gmail.com>
@ 2019-05-24 22:25       ` Tulio Magno Quites Machado Filho
  0 siblings, 0 replies; 15+ messages in thread
From: Tulio Magno Quites Machado Filho @ 2019-05-24 22:25 UTC (permalink / raw)
  To: Shawn Landden; +Cc: libc-alpha

Shawn Landden <shawn@git.icu> writes:

>> > diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c
>> b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c
>> > new file mode 100644
>> > index 0000000000..650fdee75f
>> > --- /dev/null
>> > +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c
>> > @@ -0,0 +1,200 @@
>> > +/*
>> > + * Shared data between exp, exp2 and pow.
>> > + *
>> > + * Copyright (c) 2018, Arm Limited.
>> > + * SPDX-License-Identifier: MIT
>> > + */
>>
>> Do you have the approval from the copyright holder to contribute this file?
>>
> This is already part of glibc, I just put the wrong header on it.

Awesome!
Could you mention in your commit message that your patch is based on another
implementation, please?
It's very important to mention which implementation is that in order to
recognize the original authors and to help computational archaeologists.

Likewise for patch #1, if it's also based on another code.

-- 
Tulio Magno

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [v2 1/2] PPC64: Add libmvec SIMD single-precision natural exponent function.
  2019-05-12  3:28 [PATCH 1/2] PPC64: Add libmvec SIMD single-precision natural exponent function Shawn Landden
  2019-05-12  3:28 ` [PATCH 2/2] PPC64: Add libmvec SIMD double-precision " Shawn Landden
  2019-05-24 20:04 ` [PATCH 1/2] PPC64: Add libmvec SIMD single-precision " Tulio Magno Quites Machado Filho
@ 2019-05-27 17:47 ` Shawn Landden
  2019-05-27 17:47   ` [v2 2/2] PPC64: Add libmvec SIMD double-precision " Shawn Landden
                     ` (2 more replies)
  2 siblings, 3 replies; 15+ messages in thread
From: Shawn Landden @ 2019-05-27 17:47 UTC (permalink / raw)
  To: libc-alpha; +Cc: Tulio Magno Quites Machado Filho, Shawn Landden

Passes all tests.

Based off the ./sysdeps/ieee754/dbl-64/e_exp.c implementation,
and thus provides identical results.

Unlike other libmvec functions, this sets the underflow and overflow bits.
The caller can check these flags, and possibly re-run the calculations with
scalar expf to figure out what is causing the overflow or underflow.

Suprisingly the special-case path performs as well as the normal path.
(both of which are vectorized)
Running 20 times over 32MiB
vector: mean 432.263032 MiB/s (sd 0.486733)
scalar: mean 178.646197 MiB/s (sd 0.050013)

v2: NFC, fixed some style issues, note that this is based on another impl.

2019-05-11  Shawn Landden  <shawn@git.icu>

	[BZ #24209]
        * NEWS: Noted the addition of PPC64 vector expf function.
        * sysdeps/powerpc/bits/math-vector.h: Added entry for vector expf.
        * sysdeps/powerpc/powerpc64/fpu/Versions: Added vector expf entry.
        * sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile:
        (libmvec-sysdep_routines, CFLAGS-vec_s_expf4_vsx.c):
        (CFLAGS-vec_s_exp2f_data.c): Added build of VSX SIMD expf
        function and tests. Added vec_math_errf.c to build.
        * sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_flt.h: Modified for expf.
        * sysdeps/powerpc/powerpc64/fpu/multiarch/test-float-vlen4-wrappers.c:
        Added entry for vector expf.
        * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_expf4_vsx.c: New file.
        * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_exp2f_data.c: Likewise.
        * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c: Likewise.
        * sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist: SIMD expf
        added.
---
 NEWS                                          |   1 +
 sysdeps/powerpc/bits/math-vector.h            |   2 +
 sysdeps/powerpc/powerpc64/fpu/Versions        |   1 +
 .../powerpc/powerpc64/fpu/multiarch/Makefile  |   6 +-
 .../powerpc64/fpu/multiarch/math_config_flt.h |  64 +++++--
 .../fpu/multiarch/test-float-vlen4-wrappers.c |   1 +
 .../powerpc64/fpu/multiarch/vec_math_errf.c   |  39 +++++
 .../fpu/multiarch/vec_s_exp2f_data.c          |  58 +++++++
 .../powerpc64/fpu/multiarch/vec_s_expf4_vsx.c | 159 ++++++++++++++++++
 .../linux/powerpc/powerpc64/libmvec.abilist   |   1 +
 10 files changed, 321 insertions(+), 11 deletions(-)
 create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c
 create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_exp2f_data.c
 create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_expf4_vsx.c

diff --git a/NEWS b/NEWS
index 6f9af0e24e..26f5c67f75 100644
--- a/NEWS
+++ b/NEWS
@@ -18,10 +18,11 @@ Major new features:
   - single-precision logarithm: logf
   - double-precision sine: sin
   - single-precision sine: sinf
   - double-precision sincos: sincos
   - single-precision sincos: sincosf
+  - single-precision exp: expf
 
   GCC support for auto-vectorization of functions on PPC64 is not yet
   available. Until that is done, the new vector math functions are
   inaccessible to applications.
 
diff --git a/sysdeps/powerpc/bits/math-vector.h b/sysdeps/powerpc/bits/math-vector.h
index 6dc62480fd..bbc9db0dd9 100644
--- a/sysdeps/powerpc/bits/math-vector.h
+++ b/sysdeps/powerpc/bits/math-vector.h
@@ -42,8 +42,10 @@
 #  define __DECL_SIMD_logf __DECL_SIMD_PPC64
 #  undef __DECL_SIMD_sincos
 #  define __DECL_SIMD_sincos __DECL_SIMD_PPC64
 #  undef __DECL_SIMD_sincosf
 #  define __DECL_SIMD_sincosf __DECL_SIMD_PPC64
+#  undef __DECL_SIMD_expf
+#  define __DECL_SIMD_expf __DECL_SIMD_PPC64
 
 # endif
 #endif
diff --git a/sysdeps/powerpc/powerpc64/fpu/Versions b/sysdeps/powerpc/powerpc64/fpu/Versions
index c083d9f938..7c45e76074 100644
--- a/sysdeps/powerpc/powerpc64/fpu/Versions
+++ b/sysdeps/powerpc/powerpc64/fpu/Versions
@@ -1,6 +1,7 @@
 libmvec {
   GLIBC_2.30 {
     _ZGVbN2v_cos; _ZGVbN4v_cosf; _ZGVbN2v_sin; _ZGVbN4v_sinf;
     _ZGVbN2v_log; _ZGVbN4v_logf; _ZGVbN2vvv_sincos; _ZGVbN4vvv_sincosf;
+    _ZGVbN4v_expf;
   }
 }
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
index a4f0d1694e..ad3c29b1ab 100644
--- a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
@@ -46,10 +46,12 @@ endif
 ifeq ($(subdir),mathvec)
 libmvec-sysdep_routines += vec_d_cos2_vsx vec_s_cosf4_vsx \
 			   vec_d_sin2_vsx vec_s_sinf4_vsx \
 			   vec_d_log2_vsx vec_d_log_data \
 			   vec_s_logf4_vsx vec_s_logf_data \
+			   vec_s_expf4_vsx vec_s_exp2f_data \
+			   vec_math_errf \
 			   vec_d_sincos2_vsx vec_s_sincosf4_vsx
 CFLAGS-vec_d_cos2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
 CFLAGS-vec_d_log2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
 CFLAGS-vec_d_log_data.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
 CFLAGS-vec_s_cosf4_vsx.c += -mabi=altivec -maltivec -mvsx
@@ -57,19 +59,21 @@ CFLAGS-vec_s_logf4_vsx.c += -mabi=altivec -maltivec -mvsx
 CFLAGS-vec_s_logf_data.c += -mabi=altivec -maltivec -mvsx
 CFLAGS-vec_d_sin2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
 CFLAGS-vec_s_sinf4_vsx.c += -mabi=altivec -maltivec -mvsx
 CFLAGS-vec_d_sincos2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
 CFLAGS-vec_s_sincosf4_vsx.c += -mabi=altivec -maltivec -mvsx
+CFLAGS-vec_s_expf4_vsx.c += -mabi=altivec -maltivec -mvsx
+CFLAGS-vec_s_exp2f_data.c += -mabi=altivec -maltivec -mvsx
 endif
 
 # Variables for libmvec tests.
 ifeq ($(subdir),math)
 ifeq ($(build-mathvec),yes)
 libmvec-tests += double-vlen2 float-vlen4
 
 double-vlen2-funcs = cos sin sincos log
-float-vlen4-funcs = cos sin sincos log
+float-vlen4-funcs = cos sin sincos log exp
 
 double-vlen2-arch-ext-cflags = -mabi=altivec -maltivec -mvsx -DREQUIRE_VSX
 float-vlen4-arch-ext-cflags = -mabi=altivec -maltivec -mvsx -DREQUIRE_VSX
 
 CFLAGS-test-double-vlen2-wrappers.c += -mabi=altivec -maltivec -mvsx -DREQUIRE_VSX
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_flt.h b/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_flt.h
index b5fbe02d91..18f34197e2 100644
--- a/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_flt.h
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_flt.h
@@ -46,20 +46,42 @@
 
 #if TOINT_INTRINSICS
 /* Round x to nearest int in all rounding modes, ties have to be rounded
    consistently with converttoint so the results match.  If the result
    would be outside of [-2^31, 2^31-1] then the semantics is unspecified.  */
-static inline double_t
-roundtoint (double_t x);
+static inline vector double
+roundtoint (vector double x);
 
 /* Convert x to nearest int in all rounding modes, ties have to be rounded
    consistently with roundtoint.  If the result is not representible in an
    int32_t then the semantics is unspecified.  */
-static inline int32_t
-converttoint (double_t x);
+static inline vector unsigned long long
+converttoint (vector double x);
 #endif
 
+static inline vector unsigned
+vasuint (vector float f)
+{
+  union
+  {
+    vector float f;
+    vector unsigned i;
+  } u = {f};
+  return u.i;
+}
+
+static inline vector float
+vasfloat (vector unsigned i)
+{
+  union
+  {
+    vector unsigned i;
+    vector float f;
+  } u = {i};
+  return u.f;
+}
+
 static inline uint32_t
 asuint (float f)
 {
   union
   {
@@ -78,10 +100,32 @@ asfloat (uint32_t i)
     float f;
   } u = {i};
   return u.f;
 }
 
+static inline vector unsigned long long
+vasuint64 (vector double f)
+{
+  union
+  {
+    vector double f;
+    vector unsigned long long i;
+  } u = {f};
+  return u.i;
+}
+
+static inline vector double
+vasdouble (vector unsigned long long i)
+{
+  union
+  {
+    vector unsigned long long i;
+    vector double f;
+  } u = {i};
+  return u.f;
+}
+
 static inline uint64_t
 asuint64 (double f)
 {
   union
   {
@@ -122,16 +166,16 @@ attribute_hidden float __math_invalidf (float);
 /* Shared between expf, exp2f and powf.  */
 #define EXP2F_TABLE_BITS 5
 #define EXP2F_POLY_ORDER 3
 extern const struct exp2f_data
 {
-  uint64_t tab[1 << EXP2F_TABLE_BITS];
-  double shift_scaled;
-  double poly[EXP2F_POLY_ORDER];
-  double shift;
-  double invln2_scaled;
-  double poly_scaled[EXP2F_POLY_ORDER];
+  long long unsigned tab[1 << EXP2F_TABLE_BITS];
+  vector double shift_scaled;
+  vector double poly[EXP2F_POLY_ORDER];
+  vector double shift;
+  vector double invln2_scaled;
+  vector double poly_scaled[EXP2F_POLY_ORDER];
 } __exp2f_data attribute_hidden;
 
 #define LOGF_TABLE_BITS 4
 #define LOGF_POLY_ORDER 4
 extern const struct logf_data
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/test-float-vlen4-wrappers.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/test-float-vlen4-wrappers.c
index b4721b1dac..37886d1a45 100644
--- a/sysdeps/powerpc/powerpc64/fpu/multiarch/test-float-vlen4-wrappers.c
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/test-float-vlen4-wrappers.c
@@ -22,6 +22,7 @@
 #define VEC_TYPE vector float
 
 VECTOR_WRAPPER (WRAPPER_NAME (cosf), _ZGVbN4v_cosf)
 VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVbN4v_sinf)
 VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVbN4v_logf)
+VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVbN4v_expf)
 VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincosf), _ZGVbN4vvv_sincosf)
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c
new file mode 100644
index 0000000000..22b3a62fcf
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c
@@ -0,0 +1,39 @@
+/* Single-precision vector math error handling.
+   Copyright 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "math_config_flt.h"
+/* NOINLINE prevents fenv semantics breaking optimizations.  */
+NOINLINE static float
+xflowf (uint32_t sign, float y)
+{
+  y = (sign ? -y : y) * y;
+  return y;
+}
+
+attribute_hidden float
+__math_uflowf (uint32_t sign)
+{
+  return xflowf (sign, 0x1p-95f);
+}
+
+attribute_hidden float
+__math_oflowf (uint32_t sign)
+{
+  return xflowf (sign, 0x1p97f);
+}
+
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_exp2f_data.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_exp2f_data.c
new file mode 100644
index 0000000000..b2232713a9
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_exp2f_data.c
@@ -0,0 +1,58 @@
+/* Data for exp, rearranged for vector operations
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "math_config_flt.h"
+
+#define N (1 << EXP2F_TABLE_BITS)
+
+const struct exp2f_data __exp2f_data = {
+  /* tab[i] = uint(2^(i/N)) - (i << 52-BITS)
+     used for computing 2^(k/N) for an int |k| < 150 N as
+     double(tab[k%N] + (k << 52-BITS)) */
+  .tab = {
+0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51,
+0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1,
+0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d,
+0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585,
+0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13,
+0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d,
+0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069,
+0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,
+  },
+  .shift_scaled = {
+    0x1.8p+52 / N,
+    0x1.8p+52 / N },
+  .poly = {
+  {0x1.c6af84b912394p-5, 0x1.c6af84b912394p-5},
+  {0x1.ebfce50fac4f3p-3, 0x1.ebfce50fac4f3p-3},
+  {0x1.62e42ff0c52d6p-1, 0x1.62e42ff0c52d6p-1},
+  },
+  .shift = {
+    0x1.8p+52,
+    0x1.8p+52,
+  },
+  .invln2_scaled = {
+    0x1.71547652b82fep+0 * N,
+    0x1.71547652b82fep+0 * N,
+  },
+  .poly_scaled = {
+  {0x1.c6af84b912394p-5/N/N/N, 0x1.c6af84b912394p-5/N/N/N},
+  {0x1.ebfce50fac4f3p-3/N/N, 0x1.ebfce50fac4f3p-3/N/N},
+  {0x1.62e42ff0c52d6p-1/N, 0x1.62e42ff0c52d6p-1/N},
+  },
+};
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_expf4_vsx.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_expf4_vsx.c
new file mode 100644
index 0000000000..d118428a66
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_expf4_vsx.c
@@ -0,0 +1,159 @@
+/* Single-precision vector expf function.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+/* Based off ./sysdeps/ieee754/flt-32/e_expf.c
+   which was contributed by Szabolcs Nagy of ARM Ltd.  */
+#include <altivec.h>
+#include <math.h>
+
+#include "math_config_flt.h"
+
+typedef vector long long unsigned v64u;
+typedef union {
+  vector unsigned u;
+  vector float f;
+  v64u l;
+  vector double d;
+} u;
+typedef union {
+  double d;
+  int64_t l;
+  unsigned u;
+  float f;
+} us;
+
+#define N (1 << EXP2F_TABLE_BITS)
+#define InvLn2N __exp2f_data.invln2_scaled
+#define T __exp2f_data.tab
+#define C __exp2f_data.poly_scaled
+
+vector float
+_ZGVbN4v_expf (vector float x)
+{
+  u res;
+  u xu;
+  xu.f = x;
+  us c88;
+  c88.f = 88.0f;
+  us inf;
+  inf.f = INFINITY;
+  us ninf;
+  ninf.f = -INFINITY;
+  vector unsigned constants = {(c88.u & 0xfff00000) << 1,
+				ninf.u, inf.u, 0};
+  vector float constants2 = {0x1.62e42ep6f, -0x1.9fe368p6f, 0, 0};
+  vector unsigned zero = {0, 0, 0, 0};
+  vector unsigned v88 = vec_splat (constants, 0);
+  vector unsigned is_special_case
+	= (vector unsigned) vec_cmpge (xu.u << 1, v88);
+  vector unsigned is_special_not_covered = is_special_case;
+  if (__glibc_unlikely (!vec_all_eq (is_special_case, zero)))
+    {
+      vector unsigned inf = vec_splat (constants, 2);
+      vector unsigned is_inf_or_ninf_or_nan
+	= (vector unsigned) vec_cmpge (xu.u, inf);
+      u xpx;
+      xpx.f = x + x;
+      res.u = xpx.u;
+      is_special_not_covered &= ~is_inf_or_ninf_or_nan;
+      vector unsigned ninf = vec_splat (constants, 1);
+      vector unsigned is_ninf
+	= (vector unsigned) vec_cmpeq (xu.u, ninf);
+      res.u = vec_sel (res.u, zero, is_ninf);
+
+      vector float overflow_v = vec_splat (constants2, 0);
+      vector unsigned is_overflow
+	= (vector unsigned) vec_cmpgt (xu.f, overflow_v);
+      if (__glibc_unlikely (!vec_all_eq (is_overflow, zero)))
+	  {
+	  // This branch is because we are generating an
+	  // overflow fp flag
+	  us ofu;
+	  ofu.f = __math_oflowf (0);
+	  vector unsigned of = {ofu.u, ofu.u, ofu.u, ofu.u};
+	  res.u = vec_sel (res.u, of, is_overflow);
+	  is_special_not_covered &= ~is_overflow;
+	}
+      vector float underflow_v = vec_splat (constants2, 1);
+      vector unsigned is_underflow
+	= (vector unsigned) vec_cmplt (xu.f, underflow_v);
+      if (__glibc_unlikely (!vec_all_eq (is_underflow, zero)))
+	{
+	  // This branch is because we are generating an
+	  // underflow fp flag
+	  us ufu;
+	  ufu.f = __math_uflowf (0);
+	  vector unsigned uf = {ufu.u, ufu.u, ufu.u, ufu.u};
+	  res.u = vec_sel (res.u, uf, is_underflow);
+	  is_special_not_covered &= ~is_underflow;
+	}
+     }
+
+#if __GNUC__ >= 8
+  vector double xl = vec_unpackh (x);
+  vector double xr = vec_unpackl (x);
+#else
+  vector double xl = {x[0], x[1]};
+  vector double xr = {x[2], x[3]};
+#endif
+  vector double zl = InvLn2N * xl;
+  vector double zr = InvLn2N * xr;
+#if TOINT_INTRINSICS
+  vector double kdl = roundtoint (zl);
+  vector double kdl = roundtoint (zl);
+  v64u kil = converttoint (zl);
+  v64u kir = converttoint (zr);
+#else
+#define SHIFT __exp2f_data.shift
+  vector double kdl = zl + SHIFT;
+  vector double kdr = zr + SHIFT;
+  u kilu;
+  kilu.d = kdl;
+  u kiru;
+  kiru.d = kdr;
+  v64u kil = kilu.l;
+  v64u kir = kiru.l;
+  kdl -= SHIFT;
+  kdr -= SHIFT;
+#endif
+  vector double rl = zl - kdl;
+  vector double rr = zr - kdr;
+
+  v64u tl = {T[kil[0] % N], T[kil[1] % N]};
+  v64u tr = {T[kir[0] % N], T[kir[1] % N]};
+  tl += (kil << (52 - EXP2F_TABLE_BITS));
+  tr += (kir << (52 - EXP2F_TABLE_BITS));
+  u slu;
+  slu.l = tl;
+  u sru;
+  sru.l = tr;
+  // This cast is obnoxious, but there is no vec_ld for double
+  zl = C[0] * rl + C[1];
+  zr = C[0] * rr + C[1];
+  vector double r2l = rl * rl;
+  vector double r2r = rr * rr;
+  vector double yl = C[2] * rl + 1;
+  vector double yr = C[2] * rr + 1;
+  yl = zl * r2l + yl;
+  yr = zr * r2r + yr;
+  yl = yl * slu.d;
+  yr = yr * sru.d;
+  vector float restmp = {(float) yl[0], (float) yl[1],
+	(float) yr[0], (float) yr[1]};
+  return vec_sel (restmp, res.f,
+	is_special_case & ~is_special_not_covered);
+}
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
index bb9f0beea4..63770c8da3 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
@@ -4,5 +4,6 @@ GLIBC_2.30 _ZGVbN2v_sin F
 GLIBC_2.30 _ZGVbN2vvv_sincos F
 GLIBC_2.30 _ZGVbN4v_cosf F
 GLIBC_2.30 _ZGVbN4v_logf F
 GLIBC_2.30 _ZGVbN4v_sinf F
 GLIBC_2.30 _ZGVbN4vvv_sincosf F
+GLIBC_2.30 _ZGVbN4v_expf F
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [v2 2/2] PPC64: Add libmvec SIMD double-precision natural exponent function.
  2019-05-27 17:47 ` [v2 " Shawn Landden
@ 2019-05-27 17:47   ` Shawn Landden
  2019-05-29 13:18     ` Tulio Magno Quites Machado Filho
  2019-05-27 18:00   ` [v2 1/2] PPC64: Add libmvec SIMD single-precision " Shawn Landden
  2019-05-28 18:20   ` Tulio Magno Quites Machado Filho
  2 siblings, 1 reply; 15+ messages in thread
From: Shawn Landden @ 2019-05-27 17:47 UTC (permalink / raw)
  To: libc-alpha; +Cc: Tulio Magno Quites Machado Filho, Shawn Landden

Passes all tests.

Unlike other libmvec functions, this sets the onderflow and overflow bits.
The caller can check these flags, and possibly re-run the calculations with
scalar expf to figure out what is causing the overflow or underflow.

The special-case path is not vectorized, and performs much woorse than
the scalar code.
Normalized data: 1 to 2^32 converted to double
Running 20 times over 32MiB
vector: mean 563.807107 MiB/s (sd 0.390922)
scalar: mean 226.527824 MiB/s (sd 0.077406)

Random data:
vector: mean 80.175986 MiB/s (sd 1.110948)
scalar: mean 244.738130 MiB/s (sd 0.029561)

v2: fix some serious problems where the algorithm wasn't correct.
    style issues

2019-05-11  Shawn Landden  <shawn@git.icu>

	[BZ #24209]
	* NEWS: Noted the addition of PPC64 vector exp function.
	* sysdeps/powerpc/bits/math-vector.h: Added entry for vector exp.
	* sysdeps/powerpc/powerpc64/fpu/Versions: Added vector exp entry.
	* sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile:
	(libmvec-sysdep_routines, CFLAGS-vec_d_exp2_vsx.c):
	(CFLAGS-vec_d_exp_data.c): Added build of VSX SIMD expf
	function and tests. Added vec_math_err.c to build.
	* sysdeps/powerpc/powerpc64/fpu/multiarch/math_config.h: Renamed to ...
        * sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_dbl.h: ... and
        modified for exp.
        * sysdeps/powerpc/powerpc64/fpu/multiarch/test-double-vlen2-wrappers.c:
        Added entry for vector exp.
        * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c: New file.
        * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c: Likewise.
        * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_err.c: Likewise.
        * sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist: SIMD expf
        added.
	* sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log_data.c: Change to
        reflect rename of math_config.h
        * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log2_vsx.c: Likewise
---
 NEWS                                          |   1 +
 sysdeps/powerpc/bits/math-vector.h            |   2 +
 sysdeps/powerpc/powerpc64/fpu/Versions        |   2 +-
 .../powerpc/powerpc64/fpu/multiarch/Makefile  |   7 +-
 .../{math_config.h => math_config_dbl.h}      |  15 +-
 .../multiarch/test-double-vlen2-wrappers.c    |   1 +
 .../powerpc64/fpu/multiarch/vec_d_exp2_vsx.c  | 204 +++++++++++++++++
 .../powerpc64/fpu/multiarch/vec_d_exp_data.c  | 211 ++++++++++++++++++
 .../powerpc64/fpu/multiarch/vec_d_log2_vsx.c  |   2 +-
 .../powerpc64/fpu/multiarch/vec_d_log_data.c  |   2 +-
 .../powerpc64/fpu/multiarch/vec_math_err.c    |  41 ++++
 .../linux/powerpc/powerpc64/libmvec.abilist   |   1 +
 12 files changed, 477 insertions(+), 12 deletions(-)
 rename sysdeps/powerpc/powerpc64/fpu/multiarch/{math_config.h => math_config_dbl.h} (95%)
 create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c
 create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c
 create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_err.c

diff --git a/NEWS b/NEWS
index 26f5c67f75..a33af8c316 100644
--- a/NEWS
+++ b/NEWS
@@ -18,10 +18,11 @@ Major new features:
   - single-precision logarithm: logf
   - double-precision sine: sin
   - single-precision sine: sinf
   - double-precision sincos: sincos
   - single-precision sincos: sincosf
+  - double-precision exp: exp
   - single-precision exp: expf
 
   GCC support for auto-vectorization of functions on PPC64 is not yet
   available. Until that is done, the new vector math functions are
   inaccessible to applications.
diff --git a/sysdeps/powerpc/bits/math-vector.h b/sysdeps/powerpc/bits/math-vector.h
index bbc9db0dd9..e7846b4bfa 100644
--- a/sysdeps/powerpc/bits/math-vector.h
+++ b/sysdeps/powerpc/bits/math-vector.h
@@ -44,8 +44,10 @@
 #  define __DECL_SIMD_sincos __DECL_SIMD_PPC64
 #  undef __DECL_SIMD_sincosf
 #  define __DECL_SIMD_sincosf __DECL_SIMD_PPC64
 #  undef __DECL_SIMD_expf
 #  define __DECL_SIMD_expf __DECL_SIMD_PPC64
+#  undef __DECL_SIMD_exp
+#  define __DECL_SIMD_exp __DECL_SIMD_PPC64
 
 # endif
 #endif
diff --git a/sysdeps/powerpc/powerpc64/fpu/Versions b/sysdeps/powerpc/powerpc64/fpu/Versions
index 7c45e76074..361edee9e7 100644
--- a/sysdeps/powerpc/powerpc64/fpu/Versions
+++ b/sysdeps/powerpc/powerpc64/fpu/Versions
@@ -1,7 +1,7 @@
 libmvec {
   GLIBC_2.30 {
     _ZGVbN2v_cos; _ZGVbN4v_cosf; _ZGVbN2v_sin; _ZGVbN4v_sinf;
     _ZGVbN2v_log; _ZGVbN4v_logf; _ZGVbN2vvv_sincos; _ZGVbN4vvv_sincosf;
-    _ZGVbN4v_expf;
+    _ZGVbN4v_expf; _ZGVbN2v_exp;
   }
 }
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
index ad3c29b1ab..aceaedee0a 100644
--- a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
@@ -47,11 +47,12 @@ ifeq ($(subdir),mathvec)
 libmvec-sysdep_routines += vec_d_cos2_vsx vec_s_cosf4_vsx \
 			   vec_d_sin2_vsx vec_s_sinf4_vsx \
 			   vec_d_log2_vsx vec_d_log_data \
 			   vec_s_logf4_vsx vec_s_logf_data \
 			   vec_s_expf4_vsx vec_s_exp2f_data \
-			   vec_math_errf \
+			   vec_math_errf vec_math_err \
+			   vec_d_exp2_vsx vec_d_exp_data \
 			   vec_d_sincos2_vsx vec_s_sincosf4_vsx
 CFLAGS-vec_d_cos2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
 CFLAGS-vec_d_log2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
 CFLAGS-vec_d_log_data.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
 CFLAGS-vec_s_cosf4_vsx.c += -mabi=altivec -maltivec -mvsx
@@ -61,18 +62,20 @@ CFLAGS-vec_d_sin2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
 CFLAGS-vec_s_sinf4_vsx.c += -mabi=altivec -maltivec -mvsx
 CFLAGS-vec_d_sincos2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
 CFLAGS-vec_s_sincosf4_vsx.c += -mabi=altivec -maltivec -mvsx
 CFLAGS-vec_s_expf4_vsx.c += -mabi=altivec -maltivec -mvsx
 CFLAGS-vec_s_exp2f_data.c += -mabi=altivec -maltivec -mvsx
+CFLAGS-vec_d_exp2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
+CFLAGS-vec_d_exp_data.c += -mabi=altivec -maltivec -mvsx
 endif
 
 # Variables for libmvec tests.
 ifeq ($(subdir),math)
 ifeq ($(build-mathvec),yes)
 libmvec-tests += double-vlen2 float-vlen4
 
-double-vlen2-funcs = cos sin sincos log
+double-vlen2-funcs = cos sin sincos log exp
 float-vlen4-funcs = cos sin sincos log exp
 
 double-vlen2-arch-ext-cflags = -mabi=altivec -maltivec -mvsx -DREQUIRE_VSX
 float-vlen4-arch-ext-cflags = -mabi=altivec -maltivec -mvsx -DREQUIRE_VSX
 
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config.h b/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_dbl.h
similarity index 95%
rename from sysdeps/powerpc/powerpc64/fpu/multiarch/math_config.h
rename to sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_dbl.h
index bd37906ecb..7c94f5b9b2 100644
--- a/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config.h
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_dbl.h
@@ -22,10 +22,11 @@
 #include <math.h>
 #include <math_private.h>
 #include <nan-high-order-bit.h>
 #include <stdint.h>
 #include <altivec.h>
+#include <math-narrow-eval.h>
 
 #ifndef WANT_ROUNDING
 /* Correct special case results in non-nearest rounding modes.  */
 # define WANT_ROUNDING 1
 #endif
@@ -132,17 +133,17 @@ check_uflow (double x)
 #define EXP_TABLE_BITS 7
 #define EXP_POLY_ORDER 5
 #define EXP2_POLY_ORDER 5
 extern const struct exp_data
 {
-  double invln2N;
-  double shift;
-  double negln2hiN;
-  double negln2loN;
-  double poly[4]; /* Last four coefficients.  */
-  double exp2_shift;
-  double exp2_poly[EXP2_POLY_ORDER];
+  vector double invln2N;
+  vector double shift;
+  vector double negln2hiN;
+  vector double negln2loN;
+  vector double poly[4]; /* Last four coefficients.  */
+  vector double exp2_shift;
+  vector double exp2_poly[EXP2_POLY_ORDER];
   uint64_t tab[2*(1 << EXP_TABLE_BITS)];
 } __exp_data attribute_hidden;
 
 #define LOG_TABLE_BITS 7
 #define LOG_POLY_ORDER 6
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/test-double-vlen2-wrappers.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/test-double-vlen2-wrappers.c
index c9e2dbfdd0..f368e47947 100644
--- a/sysdeps/powerpc/powerpc64/fpu/multiarch/test-double-vlen2-wrappers.c
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/test-double-vlen2-wrappers.c
@@ -22,6 +22,7 @@
 #define VEC_TYPE vector double
 
 VECTOR_WRAPPER (WRAPPER_NAME (cos), _ZGVbN2v_cos)
 VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVbN2v_sin)
 VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVbN2v_log)
+VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVbN2v_exp)
 VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincos), _ZGVbN2vvv_sincos)
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c
new file mode 100644
index 0000000000..5d59cc429d
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c
@@ -0,0 +1,204 @@
+/* Double-precision vector exp function.
+   Copyright 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+/* Based on sysdeps/ieee754/dbl-64/e_exp.c which came from
+   Szabolcs Nagy at ARM Ltd.  */
+#include <altivec.h>
+#include <math.h>
+#include <math-barriers.h>
+#include <math-narrow-eval.h>
+
+#include "math_config_dbl.h"
+
+typedef vector long long unsigned v64u;
+typedef union {
+  vector unsigned u;
+  vector float f;
+  v64u l;
+  vector double d;
+} u;
+typedef union {
+  double d;
+  uint64_t l;
+  unsigned u;
+  float f;
+} us;
+
+#define N (1 << EXP_TABLE_BITS)
+#define InvLn2N __exp_data.invln2N
+#define NegLn2hiN __exp_data.negln2hiN
+#define NegLn2loN __exp_data.negln2loN
+#define Shift __exp_data.shift
+#define T __exp_data.tab
+#define C2 __exp_data.poly[5 - EXP_POLY_ORDER]
+#define C3 __exp_data.poly[6 - EXP_POLY_ORDER]
+#define C4 __exp_data.poly[7 - EXP_POLY_ORDER]
+#define C5 __exp_data.poly[8 - EXP_POLY_ORDER]
+
+/* Handle cases that may overflow or underflow when computing the result that
+   is scale*(1+TMP) without intermediate rounding.  The bit representation of
+   scale is in SBITS, however it has a computed exponent that may have
+   overflown into the sign bit so that needs to be adjusted before using it as
+   a double.  (int32_t)KI is the k used in the argument reduction and exponent
+   adjustment of scale, positive k here means the result may overflow and
+   negative k means the result may underflow.  */
+static inline double
+specialcase
+(double_t tmp, uint64_t sbits, uint64_t ki)
+{
+  double_t scale, y;
+
+  if ((ki & 0x80000000) == 0)
+    {
+      /* k > 0, the exponent of scale might have overflowed by <= 460.  */
+      sbits -= 1009ull << 52;
+      scale = asdouble (sbits);
+      y = 0x1p1009 * (scale + scale * tmp);
+      return y;
+    }
+  /* k < 0, need special care in the subnormal range.  */
+  sbits += 1022ull << 52;
+  scale = asdouble (sbits);
+  y = scale + scale * tmp;
+  if (y < 1.0)
+    {
+      /* Round y to the right precision before scaling it into the subnormal
+	 range to avoid double rounding that can cause 0.5+E/2 ulp error where
+	 E is the worst-case ulp error outside the subnormal range.  So this
+	 is only useful if the goal is better than 1 ulp worst-case error.  */
+      double_t hi, lo;
+      lo = scale - y + scale * tmp;
+      hi = 1.0 + y;
+      lo = 1.0 - hi + y + lo;
+      y = math_narrow_eval (hi + lo) - 1.0;
+      /* Avoid -0.0 with downward rounding.  */
+      if (WANT_ROUNDING && y == 0.0)
+	y = 0.0;
+      /* The underflow exception needs to be signaled explicitly.  */
+      math_force_eval (math_opt_barrier (0x1p-1022) * 0x1p-1022);
+    }
+  y = 0x1p-1022 * y;
+  return y;
+}
+
+/* Top 12 bits of a double (sign and exponent bits).  */
+static inline uint32_t
+top12
+(double x)
+{
+  return asuint64 (x) >> 52;
+}
+
+vector double
+_ZGVbN2v_exp
+(vector double x)
+{
+  v64u abstop, is_special_case, is_special_case2, idx, top, sbits;
+  vector double kd, z, r, r2, scale, tail, tmp;
+  u t, t2;
+  u res;
+  u xu;
+  xu.d = x;
+  v64u zero = {0, 0};
+  u c512;
+  vector double load = {512.0, 512.0};
+  c512.d = load;
+  u normalize;
+  vector double load2 = {0x1p-54, 0x1p-54};
+  normalize.d = load2;
+  c512.l &= 0x7ff0000000000000;
+  normalize.l &= 0x7ff0000000000000;
+  abstop = xu.l & 0x7ff0000000000000;
+  is_special_case = (v64u) vec_cmpge (xu.l - normalize.l, c512.l - normalize.l);
+  if (__glibc_unlikely (!vec_all_eq (is_special_case, zero)))
+    {
+      for (int m=0;m<2;++m)
+	{
+	  if (!is_special_case[m])
+	    continue;
+	  double v;
+	  uint32_t abstops = abstop[m] >> 52;
+	  if (!is_special_case[m])
+	    continue;
+	  v = x[m];
+	  if (abstops - top12 (0x1p-54) >= 0x80000000)
+	      /* Avoid spurious underflow for tiny x.  */
+	      /* Note: 0 is common input.  */
+	      res.d[m] = WANT_ROUNDING ? 1.0 + v : 1.0;
+	  if (abstops >= top12 (1024.0))
+	    {
+	      if (asuint64 (v) == asuint64 (-INFINITY))
+		  res.d[m] = 0.0;
+	      else if (abstops >= top12 (INFINITY))
+		  res.d[m] = 1.0 + v;
+	      else if (asuint64 (v) >> 63)
+		  res.d[m] = __math_uflow (0);
+	      else
+		  res.d[m] = __math_oflow (0);
+	      /* Prevent is_special_case2 from matching.  */
+	      abstop[m] = 1;
+	    }
+	  else
+	    {
+	      /* Large x is special cased below.  */
+	      abstop[m] = 0;
+	      is_special_case[m] = 0;
+	    }
+	}
+    }
+  /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)].  */
+  /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N].  */
+  z = InvLn2N * x;
+  u kdu;
+  /* z - kd is in [-1, 1] in non-nearest rounding modes.  */
+  kd = z + Shift;
+  kdu.d = kd;
+  kd -= Shift;
+  r = x + kd * NegLn2hiN + kd * NegLn2loN;
+  /* 2^(k/N) ~= scale * (1 + tail).  */
+  idx = 2 * (kdu.l % N);
+  top = kdu.l << (52 - EXP_TABLE_BITS);
+  v64u load3 = {T[idx[0]], T[idx[1]]};
+  t.l = load3;
+  v64u load4 = {T[idx[0] + 1], T[idx[1] + 1]};
+  t2.l = load4;
+  tail = t.d;
+  /* This is only a valid scale when -1023*N < k < 1024*N.  */
+  sbits = t2.l + top;
+  /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (tail + exp(r) - 1).  */
+  /* Evaluation is optimized assuming superscalar pipelined execution.  */
+  r2 = r * r;
+  /* Without fma the worst case error is 0.25/N ulp larger.  */
+  /* Worst case error is less than 0.5+1.11/N+(abs poly error * 2^53) ulp.  */
+  tmp = tail + r + r2 * (C2 + r * C3) + r2 * r2 * (C4 + r * C5);
+  is_special_case2 = (v64u) vec_cmpeq (abstop, zero);
+  if (__glibc_unlikely (!vec_all_eq (is_special_case2, zero)))
+    {
+      u sc2;
+      for (int m=0;m<2;++m)
+	{
+	  sc2.d[m] = specialcase (tmp[m], sbits[m], kdu.l[m]);
+	}
+      res.l = vec_sel (res.l, sc2.l, is_special_case2);
+    }
+  u sbitsu;
+  sbitsu.l = sbits;
+  scale = sbitsu.d;
+  u res2;
+  res2.d = scale + scale * tmp;
+  /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
+     is no spurious underflow here even without fma.  */
+  res.l = vec_sel (res.l, res2.l, ~is_special_case & ~is_special_case2);
+  return res.d;
+}
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c
new file mode 100644
index 0000000000..8038a7b592
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c
@@ -0,0 +1,211 @@
+/* Shared data between exp, exp2 and pow.
+
+   Copyright 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+/* Based on ./sysdeps/ieee754/dbl-64/e_exp_data.c which came from
+   Szabolcs Nagy at ARM Ltd.  */
+
+#include "math_config_dbl.h"
+
+#define N (1 << EXP_TABLE_BITS)
+
+const struct exp_data __exp_data = {
+// N/ln2
+.invln2N = {
+  0x1.71547652b82fep0 * N,
+  0x1.71547652b82fep0 * N,
+},
+// -ln2/N
+.negln2hiN = {
+  -0x1.62e42fefa0000p-8,
+  -0x1.62e42fefa0000p-8,
+},
+.negln2loN = {
+  -0x1.cf79abc9e3b3ap-47,
+  -0x1.cf79abc9e3b3ap-47,
+},
+// Used for rounding when !TOINT_INTRINSICS
+#if EXP_USE_TOINT_NARROW
+.shift = {
+  0x1800000000.8p0,
+  0x1800000000.8p0,
+},
+#else
+.shift = {
+  0x1.8p52,
+  0x1.8p52,
+},
+#endif
+// exp polynomial coefficients.
+.poly = {
+// abs error: 1.555*2^-66
+// ulp error: 0.509 (0.511 without fma)
+// if |x| < ln2/256+eps
+// abs error if |x| < ln2/256+0x1p-15: 1.09*2^-65
+// abs error if |x| < ln2/128: 1.7145*2^-56
+{0x1.ffffffffffdbdp-2, 0x1.ffffffffffdbdp-2},
+{0x1.555555555543cp-3, 0x1.555555555543cp-3},
+{0x1.55555cf172b91p-5, 0x1.55555cf172b91p-5},
+{0x1.1111167a4d017p-7, 0x1.1111167a4d017p-7},
+},
+.exp2_shift = {
+  0x1.8p52 / N,
+  0x1.8p52 / N,
+},
+// exp2 polynomial coefficients.
+.exp2_poly = {
+// abs error: 1.2195*2^-65
+// ulp error: 0.507 (0.511 without fma)
+// if |x| < 1/256
+// abs error if |x| < 1/128: 1.9941*2^-56
+{0x1.62e42fefa39efp-1, 0x1.62e42fefa39efp-1},
+{0x1.ebfbdff82c424p-3, 0x1.ebfbdff82c424p-3},
+{0x1.c6b08d70cf4b5p-5, 0x1.c6b08d70cf4b5p-5},
+{0x1.3b2abd24650ccp-7, 0x1.3b2abd24650ccp-7},
+{0x1.5d7e09b4e3a84p-10, 0x1.5d7e09b4e3a84p-10},
+},
+// 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N)
+// tab[2*k] = asuint64(T[k])
+// tab[2*k+1] = asuint64(H[k]) - (k << 52)/N
+.tab = {
+0x0, 0x3ff0000000000000,
+0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335,
+0xbc7160139cd8dc5d, 0x3fefec9a3e778061,
+0xbc905e7a108766d1, 0x3fefe315e86e7f85,
+0x3c8cd2523567f613, 0x3fefd9b0d3158574,
+0xbc8bce8023f98efa, 0x3fefd06b29ddf6de,
+0x3c60f74e61e6c861, 0x3fefc74518759bc8,
+0x3c90a3e45b33d399, 0x3fefbe3ecac6f383,
+0x3c979aa65d837b6d, 0x3fefb5586cf9890f,
+0x3c8eb51a92fdeffc, 0x3fefac922b7247f7,
+0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2,
+0xbc6a033489906e0b, 0x3fef9b66affed31b,
+0xbc9556522a2fbd0e, 0x3fef9301d0125b51,
+0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc,
+0xbc91c923b9d5f416, 0x3fef829aaea92de0,
+0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51,
+0xbc801b15eaa59348, 0x3fef72b83c7d517b,
+0xbc8f1ff055de323d, 0x3fef6af9388c8dea,
+0x3c8b898c3f1353bf, 0x3fef635beb6fcb75,
+0xbc96d99c7611eb26, 0x3fef5be084045cd4,
+0x3c9aecf73e3a2f60, 0x3fef54873168b9aa,
+0xbc8fe782cb86389d, 0x3fef4d5022fcd91d,
+0x3c8a6f4144a6c38d, 0x3fef463b88628cd6,
+0x3c807a05b0e4047d, 0x3fef3f49917ddc96,
+0x3c968efde3a8a894, 0x3fef387a6e756238,
+0x3c875e18f274487d, 0x3fef31ce4fb2a63f,
+0x3c80472b981fe7f2, 0x3fef2b4565e27cdd,
+0xbc96b87b3f71085e, 0x3fef24dfe1f56381,
+0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1,
+0xbc3d219b1a6fbffa, 0x3fef187fd0dad990,
+0x3c8b3782720c0ab4, 0x3fef1285a6e4030b,
+0x3c6e149289cecb8f, 0x3fef0cafa93e2f56,
+0x3c834d754db0abb6, 0x3fef06fe0a31b715,
+0x3c864201e2ac744c, 0x3fef0170fc4cd831,
+0x3c8fdd395dd3f84a, 0x3feefc08b26416ff,
+0xbc86a3803b8e5b04, 0x3feef6c55f929ff1,
+0xbc924aedcc4b5068, 0x3feef1a7373aa9cb,
+0xbc9907f81b512d8e, 0x3feeecae6d05d866,
+0xbc71d1e83e9436d2, 0x3feee7db34e59ff7,
+0xbc991919b3ce1b15, 0x3feee32dc313a8e5,
+0x3c859f48a72a4c6d, 0x3feedea64c123422,
+0xbc9312607a28698a, 0x3feeda4504ac801c,
+0xbc58a78f4817895b, 0x3feed60a21f72e2a,
+0xbc7c2c9b67499a1b, 0x3feed1f5d950a897,
+0x3c4363ed60c2ac11, 0x3feece086061892d,
+0x3c9666093b0664ef, 0x3feeca41ed1d0057,
+0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0,
+0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de,
+0x3c7690cebb7aafb0, 0x3feebfdad5362a27,
+0x3c931dbdeb54e077, 0x3feebcb299fddd0d,
+0xbc8f94340071a38e, 0x3feeb9b2769d2ca7,
+0xbc87deccdc93a349, 0x3feeb6daa2cf6642,
+0xbc78dec6bd0f385f, 0x3feeb42b569d4f82,
+0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f,
+0x3c93350518fdd78e, 0x3feeaf4736b527da,
+0x3c7b98b72f8a9b05, 0x3feead12d497c7fd,
+0x3c9063e1e21c5409, 0x3feeab07dd485429,
+0x3c34c7855019c6ea, 0x3feea9268a5946b7,
+0x3c9432e62b64c035, 0x3feea76f15ad2148,
+0xbc8ce44a6199769f, 0x3feea5e1b976dc09,
+0xbc8c33c53bef4da8, 0x3feea47eb03a5585,
+0xbc845378892be9ae, 0x3feea34634ccc320,
+0xbc93cedd78565858, 0x3feea23882552225,
+0x3c5710aa807e1964, 0x3feea155d44ca973,
+0xbc93b3efbf5e2228, 0x3feea09e667f3bcd,
+0xbc6a12ad8734b982, 0x3feea012750bdabf,
+0xbc6367efb86da9ee, 0x3fee9fb23c651a2f,
+0xbc80dc3d54e08851, 0x3fee9f7df9519484,
+0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74,
+0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174,
+0xbc8619321e55e68a, 0x3fee9feb564267c9,
+0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f,
+0xbc7b32dcb94da51d, 0x3feea11473eb0187,
+0x3c94ecfd5467c06b, 0x3feea1ed0130c132,
+0x3c65ebe1abd66c55, 0x3feea2f336cf4e62,
+0xbc88a1c52fb3cf42, 0x3feea427543e1a12,
+0xbc9369b6f13b3734, 0x3feea589994cce13,
+0xbc805e843a19ff1e, 0x3feea71a4623c7ad,
+0xbc94d450d872576e, 0x3feea8d99b4492ed,
+0x3c90ad675b0e8a00, 0x3feeaac7d98a6699,
+0x3c8db72fc1f0eab4, 0x3feeace5422aa0db,
+0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c,
+0x3c7bf68359f35f44, 0x3feeb1ae99157736,
+0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6,
+0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5,
+0xbc6c23f97c90b959, 0x3feeba44cbc8520f,
+0xbc92434322f4f9aa, 0x3feebd829fde4e50,
+0xbc85ca6cd7668e4b, 0x3feec0f170ca07ba,
+0x3c71affc2b91ce27, 0x3feec49182a3f090,
+0x3c6dd235e10a73bb, 0x3feec86319e32323,
+0xbc87c50422622263, 0x3feecc667b5de565,
+0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33,
+0xbc91bbd1d3bcbb15, 0x3feed503b23e255d,
+0x3c90cc319cee31d2, 0x3feed99e1330b358,
+0x3c8469846e735ab3, 0x3feede6b5579fdbf,
+0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a,
+0x3c8c1a7792cb3387, 0x3feee89f995ad3ad,
+0xbc907b8f4ad1d9fa, 0x3feeee07298db666,
+0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb,
+0xbc90a40e3da6f640, 0x3feef9728de5593a,
+0xbc68d6f438ad9334, 0x3feeff76f2fb5e47,
+0xbc91eee26b588a35, 0x3fef05b030a1064a,
+0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2,
+0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09,
+0x3c736eae30af0cb3, 0x3fef199bdd85529c,
+0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a,
+0x3c84e08fd10959ac, 0x3fef27f12e57d14b,
+0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5,
+0x3c676b2c6c921968, 0x3fef3720dcef9069,
+0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa,
+0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c,
+0xbc900dae3875a949, 0x3fef4f87080d89f2,
+0x3c74a385a63d07a7, 0x3fef5818dcfba487,
+0xbc82919e2040220f, 0x3fef60e316c98398,
+0x3c8e5a50d5c192ac, 0x3fef69e603db3285,
+0x3c843a59ac016b4b, 0x3fef7321f301b460,
+0xbc82d52107b43e1f, 0x3fef7c97337b9b5f,
+0xbc892ab93b470dc9, 0x3fef864614f5a129,
+0x3c74b604603a88d3, 0x3fef902ee78b3ff6,
+0x3c83c5ec519d7271, 0x3fef9a51fbc74c83,
+0xbc8ff7128fd391f0, 0x3fefa4afa2a490da,
+0xbc8dae98e223747d, 0x3fefaf482d8e67f1,
+0x3c8ec3bc41aa2008, 0x3fefba1bee615a27,
+0x3c842b94c3a9eb32, 0x3fefc52b376bba97,
+0x3c8a64a931d185ee, 0x3fefd0765b6e4540,
+0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14,
+0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8,
+0x3c5305c14160cc89, 0x3feff3c22b8f71f1,
+},
+};
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log2_vsx.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log2_vsx.c
index d7588a7929..3a9fb05aee 100644
--- a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log2_vsx.c
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log2_vsx.c
@@ -19,11 +19,11 @@
 #include <math.h>
 #include <stdint.h>
 #include <math/math-svid-compat.h>
 #include <shlib-compat.h>
 #include <libm-alias-double.h>
-#include "math_config.h"
+#include "math_config_dbl.h"
 #include <altivec.h>
 
 #define T __log_data.tab
 #define T2 __log_data.tab2
 #define B __log_data.poly1
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log_data.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log_data.c
index 2fe569576c..f1299404c4 100644
--- a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log_data.c
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log_data.c
@@ -14,11 +14,11 @@
 
    You should have received a copy of the GNU Lesser General Public
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include "math_config.h"
+#include "math_config_dbl.h"
 
 #define N (1 << LOG_TABLE_BITS)
 
 const struct log_data __log_data = {
 .ln2hi = {0x1.62e42fefa3800p-1, 0x1.62e42fefa3800p-1},
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_err.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_err.c
new file mode 100644
index 0000000000..2f88b81ab2
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_err.c
@@ -0,0 +1,41 @@
+/* Double-precision math error handling.
+   Copyright (C) 2017-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fpu/math-barriers.h>
+#include "math_config_dbl.h"
+
+/* NOINLINE reduces code size.  */
+NOINLINE static double
+xflow (uint32_t sign, double y)
+{
+  y = math_opt_barrier (sign ? -y : y) * y;
+  return y;
+}
+
+attribute_hidden double
+__math_uflow (uint32_t sign)
+{
+  return xflow (sign, 0x1p-767);
+}
+
+attribute_hidden double
+__math_oflow (uint32_t sign)
+{
+  return xflow (sign, 0x1p769);
+}
+
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
index 63770c8da3..26175bb1a0 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
@@ -4,6 +4,7 @@ GLIBC_2.30 _ZGVbN2v_sin F
 GLIBC_2.30 _ZGVbN2vvv_sincos F
 GLIBC_2.30 _ZGVbN4v_cosf F
 GLIBC_2.30 _ZGVbN4v_logf F
 GLIBC_2.30 _ZGVbN4v_sinf F
 GLIBC_2.30 _ZGVbN4vvv_sincosf F
+GLIBC_2.30 _ZGVbN2v_exp F
 GLIBC_2.30 _ZGVbN4v_expf F
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [v2 1/2] PPC64: Add libmvec SIMD single-precision natural exponent function.
  2019-05-27 17:47 ` [v2 " Shawn Landden
  2019-05-27 17:47   ` [v2 2/2] PPC64: Add libmvec SIMD double-precision " Shawn Landden
@ 2019-05-27 18:00   ` Shawn Landden
  2019-05-28 18:20   ` Tulio Magno Quites Machado Filho
  2 siblings, 0 replies; 15+ messages in thread
From: Shawn Landden @ 2019-05-27 18:00 UTC (permalink / raw)
  To: libc-alpha; +Cc: Tulio Magno Quites Machado Filho

On Mon, May 27, 2019 at 12:47 PM Shawn Landden <shawn@git.icu> wrote:
>
> Passes all tests.
>
> Based off the ./sysdeps/ieee754/dbl-64/e_exp.c implementation,
> and thus provides identical results.
>
> Unlike other libmvec functions, this sets the underflow and overflow bits.
> The caller can check these flags, and possibly re-run the calculations with
> scalar expf to figure out what is causing the overflow or underflow.
>
> Suprisingly the special-case path performs as well as the normal path.
> (both of which are vectorized)
> Running 20 times over 32MiB
> vector: mean 432.263032 MiB/s (sd 0.486733)
> scalar: mean 178.646197 MiB/s (sd 0.050013)
>
> v2: NFC, fixed some style issues, note that this is based on another impl.
>
> 2019-05-11  Shawn Landden  <shawn@git.icu>
>
>         [BZ #24209]
>         * NEWS: Noted the addition of PPC64 vector expf function.
>         * sysdeps/powerpc/bits/math-vector.h: Added entry for vector expf.
>         * sysdeps/powerpc/powerpc64/fpu/Versions: Added vector expf entry.
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile:
>         (libmvec-sysdep_routines, CFLAGS-vec_s_expf4_vsx.c):
>         (CFLAGS-vec_s_exp2f_data.c): Added build of VSX SIMD expf
>         function and tests. Added vec_math_errf.c to build.
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_flt.h: Modified for expf.
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/test-float-vlen4-wrappers.c:
>         Added entry for vector expf.
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_expf4_vsx.c: New file.
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_exp2f_data.c: Likewise.
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c: Likewise.
>         * sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist: SIMD expf
>         added.
> ---
>  NEWS                                          |   1 +
>  sysdeps/powerpc/bits/math-vector.h            |   2 +
>  sysdeps/powerpc/powerpc64/fpu/Versions        |   1 +
>  .../powerpc/powerpc64/fpu/multiarch/Makefile  |   6 +-
>  .../powerpc64/fpu/multiarch/math_config_flt.h |  64 +++++--
>  .../fpu/multiarch/test-float-vlen4-wrappers.c |   1 +
>  .../powerpc64/fpu/multiarch/vec_math_errf.c   |  39 +++++
>  .../fpu/multiarch/vec_s_exp2f_data.c          |  58 +++++++
>  .../powerpc64/fpu/multiarch/vec_s_expf4_vsx.c | 159 ++++++++++++++++++
>  .../linux/powerpc/powerpc64/libmvec.abilist   |   1 +
>  10 files changed, 321 insertions(+), 11 deletions(-)
>  create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c
>  create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_exp2f_data.c
>  create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_expf4_vsx.c
>
> diff --git a/NEWS b/NEWS
> index 6f9af0e24e..26f5c67f75 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -18,10 +18,11 @@ Major new features:
>    - single-precision logarithm: logf
>    - double-precision sine: sin
>    - single-precision sine: sinf
>    - double-precision sincos: sincos
>    - single-precision sincos: sincosf
> +  - single-precision exp: expf
>
>    GCC support for auto-vectorization of functions on PPC64 is not yet
>    available. Until that is done, the new vector math functions are
>    inaccessible to applications.
>
> diff --git a/sysdeps/powerpc/bits/math-vector.h b/sysdeps/powerpc/bits/math-vector.h
> index 6dc62480fd..bbc9db0dd9 100644
> --- a/sysdeps/powerpc/bits/math-vector.h
> +++ b/sysdeps/powerpc/bits/math-vector.h
> @@ -42,8 +42,10 @@
>  #  define __DECL_SIMD_logf __DECL_SIMD_PPC64
>  #  undef __DECL_SIMD_sincos
>  #  define __DECL_SIMD_sincos __DECL_SIMD_PPC64
>  #  undef __DECL_SIMD_sincosf
>  #  define __DECL_SIMD_sincosf __DECL_SIMD_PPC64
> +#  undef __DECL_SIMD_expf
> +#  define __DECL_SIMD_expf __DECL_SIMD_PPC64
>
>  # endif
>  #endif
> diff --git a/sysdeps/powerpc/powerpc64/fpu/Versions b/sysdeps/powerpc/powerpc64/fpu/Versions
> index c083d9f938..7c45e76074 100644
> --- a/sysdeps/powerpc/powerpc64/fpu/Versions
> +++ b/sysdeps/powerpc/powerpc64/fpu/Versions
> @@ -1,6 +1,7 @@
>  libmvec {
>    GLIBC_2.30 {
>      _ZGVbN2v_cos; _ZGVbN4v_cosf; _ZGVbN2v_sin; _ZGVbN4v_sinf;
>      _ZGVbN2v_log; _ZGVbN4v_logf; _ZGVbN2vvv_sincos; _ZGVbN4vvv_sincosf;
> +    _ZGVbN4v_expf;
>    }
>  }
> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
> index a4f0d1694e..ad3c29b1ab 100644
> --- a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
> @@ -46,10 +46,12 @@ endif
>  ifeq ($(subdir),mathvec)
>  libmvec-sysdep_routines += vec_d_cos2_vsx vec_s_cosf4_vsx \
>                            vec_d_sin2_vsx vec_s_sinf4_vsx \
>                            vec_d_log2_vsx vec_d_log_data \
>                            vec_s_logf4_vsx vec_s_logf_data \
> +                          vec_s_expf4_vsx vec_s_exp2f_data \
> +                          vec_math_errf \
>                            vec_d_sincos2_vsx vec_s_sincosf4_vsx
>  CFLAGS-vec_d_cos2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
>  CFLAGS-vec_d_log2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
>  CFLAGS-vec_d_log_data.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
>  CFLAGS-vec_s_cosf4_vsx.c += -mabi=altivec -maltivec -mvsx
> @@ -57,19 +59,21 @@ CFLAGS-vec_s_logf4_vsx.c += -mabi=altivec -maltivec -mvsx
>  CFLAGS-vec_s_logf_data.c += -mabi=altivec -maltivec -mvsx
>  CFLAGS-vec_d_sin2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
>  CFLAGS-vec_s_sinf4_vsx.c += -mabi=altivec -maltivec -mvsx
>  CFLAGS-vec_d_sincos2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
>  CFLAGS-vec_s_sincosf4_vsx.c += -mabi=altivec -maltivec -mvsx
> +CFLAGS-vec_s_expf4_vsx.c += -mabi=altivec -maltivec -mvsx
> +CFLAGS-vec_s_exp2f_data.c += -mabi=altivec -maltivec -mvsx
>  endif
>
>  # Variables for libmvec tests.
>  ifeq ($(subdir),math)
>  ifeq ($(build-mathvec),yes)
>  libmvec-tests += double-vlen2 float-vlen4
>
>  double-vlen2-funcs = cos sin sincos log
> -float-vlen4-funcs = cos sin sincos log
> +float-vlen4-funcs = cos sin sincos log exp
>
>  double-vlen2-arch-ext-cflags = -mabi=altivec -maltivec -mvsx -DREQUIRE_VSX
>  float-vlen4-arch-ext-cflags = -mabi=altivec -maltivec -mvsx -DREQUIRE_VSX
>
>  CFLAGS-test-double-vlen2-wrappers.c += -mabi=altivec -maltivec -mvsx -DREQUIRE_VSX
> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_flt.h b/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_flt.h
> index b5fbe02d91..18f34197e2 100644
> --- a/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_flt.h
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_flt.h
> @@ -46,20 +46,42 @@
>
>  #if TOINT_INTRINSICS
>  /* Round x to nearest int in all rounding modes, ties have to be rounded
>     consistently with converttoint so the results match.  If the result
>     would be outside of [-2^31, 2^31-1] then the semantics is unspecified.  */
> -static inline double_t
> -roundtoint (double_t x);
> +static inline vector double
> +roundtoint (vector double x);
>
>  /* Convert x to nearest int in all rounding modes, ties have to be rounded
>     consistently with roundtoint.  If the result is not representible in an
>     int32_t then the semantics is unspecified.  */
> -static inline int32_t
> -converttoint (double_t x);
> +static inline vector unsigned long long
> +converttoint (vector double x);
>  #endif
>
> +static inline vector unsigned
> +vasuint (vector float f)
> +{
> +  union
> +  {
> +    vector float f;
> +    vector unsigned i;
> +  } u = {f};
> +  return u.i;
> +}
> +
> +static inline vector float
> +vasfloat (vector unsigned i)
> +{
> +  union
> +  {
> +    vector unsigned i;
> +    vector float f;
> +  } u = {i};
> +  return u.f;
> +}
> +
>  static inline uint32_t
>  asuint (float f)
>  {
>    union
>    {
> @@ -78,10 +100,32 @@ asfloat (uint32_t i)
>      float f;
>    } u = {i};
>    return u.f;
>  }
>
> +static inline vector unsigned long long
> +vasuint64 (vector double f)
> +{
> +  union
> +  {
> +    vector double f;
> +    vector unsigned long long i;
> +  } u = {f};
> +  return u.i;
> +}
> +
> +static inline vector double
> +vasdouble (vector unsigned long long i)
> +{
> +  union
> +  {
> +    vector unsigned long long i;
> +    vector double f;
> +  } u = {i};
> +  return u.f;
> +}
> +
>  static inline uint64_t
>  asuint64 (double f)
>  {
>    union
>    {
> @@ -122,16 +166,16 @@ attribute_hidden float __math_invalidf (float);
>  /* Shared between expf, exp2f and powf.  */
>  #define EXP2F_TABLE_BITS 5
>  #define EXP2F_POLY_ORDER 3
>  extern const struct exp2f_data
>  {
> -  uint64_t tab[1 << EXP2F_TABLE_BITS];
> -  double shift_scaled;
> -  double poly[EXP2F_POLY_ORDER];
> -  double shift;
> -  double invln2_scaled;
> -  double poly_scaled[EXP2F_POLY_ORDER];
> +  long long unsigned tab[1 << EXP2F_TABLE_BITS];
> +  vector double shift_scaled;
> +  vector double poly[EXP2F_POLY_ORDER];
> +  vector double shift;
> +  vector double invln2_scaled;
> +  vector double poly_scaled[EXP2F_POLY_ORDER];
>  } __exp2f_data attribute_hidden;
>
>  #define LOGF_TABLE_BITS 4
>  #define LOGF_POLY_ORDER 4
>  extern const struct logf_data
> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/test-float-vlen4-wrappers.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/test-float-vlen4-wrappers.c
> index b4721b1dac..37886d1a45 100644
> --- a/sysdeps/powerpc/powerpc64/fpu/multiarch/test-float-vlen4-wrappers.c
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/test-float-vlen4-wrappers.c
> @@ -22,6 +22,7 @@
>  #define VEC_TYPE vector float
>
>  VECTOR_WRAPPER (WRAPPER_NAME (cosf), _ZGVbN4v_cosf)
>  VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVbN4v_sinf)
>  VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVbN4v_logf)
> +VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVbN4v_expf)
>  VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincosf), _ZGVbN4vvv_sincosf)
> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c
> new file mode 100644
> index 0000000000..22b3a62fcf
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c
> @@ -0,0 +1,39 @@
> +/* Single-precision vector math error handling.
> +   Copyright 2019 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include "math_config_flt.h"
> +/* NOINLINE prevents fenv semantics breaking optimizations.  */
> +NOINLINE static float
> +xflowf (uint32_t sign, float y)
> +{
> +  y = (sign ? -y : y) * y;
> +  return y;
> +}
> +
> +attribute_hidden float
> +__math_uflowf (uint32_t sign)
> +{
> +  return xflowf (sign, 0x1p-95f);
> +}
> +
> +attribute_hidden float
> +__math_oflowf (uint32_t sign)
> +{
> +  return xflowf (sign, 0x1p97f);
> +}
> +
> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_exp2f_data.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_exp2f_data.c
> new file mode 100644
> index 0000000000..b2232713a9
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_exp2f_data.c
> @@ -0,0 +1,58 @@
> +/* Data for exp, rearranged for vector operations
> +   Copyright (C) 2019 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include "math_config_flt.h"
> +
> +#define N (1 << EXP2F_TABLE_BITS)
> +
> +const struct exp2f_data __exp2f_data = {
> +  /* tab[i] = uint(2^(i/N)) - (i << 52-BITS)
> +     used for computing 2^(k/N) for an int |k| < 150 N as
> +     double(tab[k%N] + (k << 52-BITS)) */
> +  .tab = {
> +0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51,
> +0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1,
> +0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d,
> +0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585,
> +0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13,
> +0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d,
> +0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069,
> +0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,
> +  },
> +  .shift_scaled = {
> +    0x1.8p+52 / N,
> +    0x1.8p+52 / N },
> +  .poly = {
> +  {0x1.c6af84b912394p-5, 0x1.c6af84b912394p-5},
> +  {0x1.ebfce50fac4f3p-3, 0x1.ebfce50fac4f3p-3},
> +  {0x1.62e42ff0c52d6p-1, 0x1.62e42ff0c52d6p-1},
> +  },
> +  .shift = {
> +    0x1.8p+52,
> +    0x1.8p+52,
> +  },
> +  .invln2_scaled = {
> +    0x1.71547652b82fep+0 * N,
> +    0x1.71547652b82fep+0 * N,
> +  },
> +  .poly_scaled = {
> +  {0x1.c6af84b912394p-5/N/N/N, 0x1.c6af84b912394p-5/N/N/N},
> +  {0x1.ebfce50fac4f3p-3/N/N, 0x1.ebfce50fac4f3p-3/N/N},
> +  {0x1.62e42ff0c52d6p-1/N, 0x1.62e42ff0c52d6p-1/N},
> +  },
> +};
> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_expf4_vsx.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_expf4_vsx.c
> new file mode 100644
> index 0000000000..d118428a66
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_expf4_vsx.c
> @@ -0,0 +1,159 @@
> +/* Single-precision vector expf function.
> +   Copyright (C) 2019 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +/* Based off ./sysdeps/ieee754/flt-32/e_expf.c
> +   which was contributed by Szabolcs Nagy of ARM Ltd.  */
> +#include <altivec.h>
> +#include <math.h>
> +
> +#include "math_config_flt.h"
> +
> +typedef vector long long unsigned v64u;
> +typedef union {
> +  vector unsigned u;
> +  vector float f;
> +  v64u l;
> +  vector double d;
> +} u;
> +typedef union {
> +  double d;
> +  int64_t l;
> +  unsigned u;
> +  float f;
> +} us;
> +
> +#define N (1 << EXP2F_TABLE_BITS)
> +#define InvLn2N __exp2f_data.invln2_scaled
> +#define T __exp2f_data.tab
> +#define C __exp2f_data.poly_scaled
> +
> +vector float
> +_ZGVbN4v_expf (vector float x)
> +{
> +  u res;
> +  u xu;
> +  xu.f = x;
> +  us c88;
> +  c88.f = 88.0f;
> +  us inf;
> +  inf.f = INFINITY;
> +  us ninf;
> +  ninf.f = -INFINITY;
> +  vector unsigned constants = {(c88.u & 0xfff00000) << 1,
> +                               ninf.u, inf.u, 0};
> +  vector float constants2 = {0x1.62e42ep6f, -0x1.9fe368p6f, 0, 0};
> +  vector unsigned zero = {0, 0, 0, 0};
> +  vector unsigned v88 = vec_splat (constants, 0);
> +  vector unsigned is_special_case
> +       = (vector unsigned) vec_cmpge (xu.u << 1, v88);
> +  vector unsigned is_special_not_covered = is_special_case;
> +  if (__glibc_unlikely (!vec_all_eq (is_special_case, zero)))
> +    {
> +      vector unsigned inf = vec_splat (constants, 2);
> +      vector unsigned is_inf_or_ninf_or_nan
> +       = (vector unsigned) vec_cmpge (xu.u, inf);
> +      u xpx;
> +      xpx.f = x + x;
> +      res.u = xpx.u;
> +      is_special_not_covered &= ~is_inf_or_ninf_or_nan;
> +      vector unsigned ninf = vec_splat (constants, 1);
> +      vector unsigned is_ninf
> +       = (vector unsigned) vec_cmpeq (xu.u, ninf);
> +      res.u = vec_sel (res.u, zero, is_ninf);
> +
> +      vector float overflow_v = vec_splat (constants2, 0);
> +      vector unsigned is_overflow
> +       = (vector unsigned) vec_cmpgt (xu.f, overflow_v);
> +      if (__glibc_unlikely (!vec_all_eq (is_overflow, zero)))
> +         {
> +         // This branch is because we are generating an
> +         // overflow fp flag
> +         us ofu;
> +         ofu.f = __math_oflowf (0);
> +         vector unsigned of = {ofu.u, ofu.u, ofu.u, ofu.u};
> +         res.u = vec_sel (res.u, of, is_overflow);
> +         is_special_not_covered &= ~is_overflow;
> +       }
> +      vector float underflow_v = vec_splat (constants2, 1);
> +      vector unsigned is_underflow
> +       = (vector unsigned) vec_cmplt (xu.f, underflow_v);
> +      if (__glibc_unlikely (!vec_all_eq (is_underflow, zero)))
> +       {
> +         // This branch is because we are generating an
> +         // underflow fp flag
> +         us ufu;
> +         ufu.f = __math_uflowf (0);
> +         vector unsigned uf = {ufu.u, ufu.u, ufu.u, ufu.u};
> +         res.u = vec_sel (res.u, uf, is_underflow);
> +         is_special_not_covered &= ~is_underflow;
> +       }
> +     }
> +
> +#if __GNUC__ >= 8
|| __clang_major__ >= 6
https://reviews.llvm.org/rG41e14c4dfa63c45271cb42368d09b5a94ee16e4f#change-sAqzIEx8bz0L
> +  vector double xl = vec_unpackh (x);
> +  vector double xr = vec_unpackl (x);
> +#else
> +  vector double xl = {x[0], x[1]};
> +  vector double xr = {x[2], x[3]};
> +#endif
> +  vector double zl = InvLn2N * xl;
> +  vector double zr = InvLn2N * xr;
> +#if TOINT_INTRINSICS
> +  vector double kdl = roundtoint (zl);
> +  vector double kdl = roundtoint (zl);
> +  v64u kil = converttoint (zl);
> +  v64u kir = converttoint (zr);
> +#else
> +#define SHIFT __exp2f_data.shift
> +  vector double kdl = zl + SHIFT;
> +  vector double kdr = zr + SHIFT;
> +  u kilu;
> +  kilu.d = kdl;
> +  u kiru;
> +  kiru.d = kdr;
> +  v64u kil = kilu.l;
> +  v64u kir = kiru.l;
> +  kdl -= SHIFT;
> +  kdr -= SHIFT;
> +#endif
> +  vector double rl = zl - kdl;
> +  vector double rr = zr - kdr;
> +
> +  v64u tl = {T[kil[0] % N], T[kil[1] % N]};
> +  v64u tr = {T[kir[0] % N], T[kir[1] % N]};
> +  tl += (kil << (52 - EXP2F_TABLE_BITS));
> +  tr += (kir << (52 - EXP2F_TABLE_BITS));
> +  u slu;
> +  slu.l = tl;
> +  u sru;
> +  sru.l = tr;
> +  // This cast is obnoxious, but there is no vec_ld for double
> +  zl = C[0] * rl + C[1];
> +  zr = C[0] * rr + C[1];
> +  vector double r2l = rl * rl;
> +  vector double r2r = rr * rr;
> +  vector double yl = C[2] * rl + 1;
> +  vector double yr = C[2] * rr + 1;
> +  yl = zl * r2l + yl;
> +  yr = zr * r2r + yr;
> +  yl = yl * slu.d;
> +  yr = yr * sru.d;
> +  vector float restmp = {(float) yl[0], (float) yl[1],
> +       (float) yr[0], (float) yr[1]};
> +  return vec_sel (restmp, res.f,
> +       is_special_case & ~is_special_not_covered);
> +}
> diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
> index bb9f0beea4..63770c8da3 100644
> --- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
> +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
> @@ -4,5 +4,6 @@ GLIBC_2.30 _ZGVbN2v_sin F
>  GLIBC_2.30 _ZGVbN2vvv_sincos F
>  GLIBC_2.30 _ZGVbN4v_cosf F
>  GLIBC_2.30 _ZGVbN4v_logf F
>  GLIBC_2.30 _ZGVbN4v_sinf F
>  GLIBC_2.30 _ZGVbN4vvv_sincosf F
> +GLIBC_2.30 _ZGVbN4v_expf F
> --
> 2.20.1
>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [v2 1/2] PPC64: Add libmvec SIMD single-precision natural exponent function.
  2019-05-27 17:47 ` [v2 " Shawn Landden
  2019-05-27 17:47   ` [v2 2/2] PPC64: Add libmvec SIMD double-precision " Shawn Landden
  2019-05-27 18:00   ` [v2 1/2] PPC64: Add libmvec SIMD single-precision " Shawn Landden
@ 2019-05-28 18:20   ` Tulio Magno Quites Machado Filho
  2 siblings, 0 replies; 15+ messages in thread
From: Tulio Magno Quites Machado Filho @ 2019-05-28 18:20 UTC (permalink / raw)
  To: Shawn Landden, libc-alpha

Shawn Landden <shawn@git.icu> writes:

> 2019-05-11  Shawn Landden  <shawn@git.icu>
>
> 	[BZ #24209]
>         * NEWS: Noted the addition of PPC64 vector expf function.
>         * sysdeps/powerpc/bits/math-vector.h: Added entry for vector expf.
>         * sysdeps/powerpc/powerpc64/fpu/Versions: Added vector expf entry.
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile:
>         (libmvec-sysdep_routines, CFLAGS-vec_s_expf4_vsx.c):
>         (CFLAGS-vec_s_exp2f_data.c): Added build of VSX SIMD expf
>         function and tests. Added vec_math_errf.c to build.
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_flt.h: Modified for expf.
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/test-float-vlen4-wrappers.c:
>         Added entry for vector expf.
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_expf4_vsx.c: New file.
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_exp2f_data.c: Likewise.
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c: Likewise.
>         * sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist: SIMD expf
>         added.

These entries are still indented with spaces.  Fixed.

> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
> index a4f0d1694e..ad3c29b1ab 100644
> --- a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
> @@ -46,10 +46,12 @@ endif
>  ifeq ($(subdir),mathvec)
>  libmvec-sysdep_routines += vec_d_cos2_vsx vec_s_cosf4_vsx \
>  			   vec_d_sin2_vsx vec_s_sinf4_vsx \
>  			   vec_d_log2_vsx vec_d_log_data \
>  			   vec_s_logf4_vsx vec_s_logf_data \
> +			   vec_s_expf4_vsx vec_s_exp2f_data \
> +			   vec_math_errf \
>  			   vec_d_sincos2_vsx vec_s_sincosf4_vsx
>  CFLAGS-vec_d_cos2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
>  CFLAGS-vec_d_log2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
>  CFLAGS-vec_d_log_data.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
>  CFLAGS-vec_s_cosf4_vsx.c += -mabi=altivec -maltivec -mvsx
> @@ -57,19 +59,21 @@ CFLAGS-vec_s_logf4_vsx.c += -mabi=altivec -maltivec -mvsx
>  CFLAGS-vec_s_logf_data.c += -mabi=altivec -maltivec -mvsx
>  CFLAGS-vec_d_sin2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
>  CFLAGS-vec_s_sinf4_vsx.c += -mabi=altivec -maltivec -mvsx
>  CFLAGS-vec_d_sincos2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
>  CFLAGS-vec_s_sincosf4_vsx.c += -mabi=altivec -maltivec -mvsx
> +CFLAGS-vec_s_expf4_vsx.c += -mabi=altivec -maltivec -mvsx
> +CFLAGS-vec_s_exp2f_data.c += -mabi=altivec -maltivec -mvsx

vec_math_errf.c indirectly depends on altivec.h, which requires to use the same
flags.

Fixed locally.

> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c
> new file mode 100644
> index 0000000000..22b3a62fcf
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c
> @@ -0,0 +1,39 @@
> ...
> +attribute_hidden float
> +__math_oflowf (uint32_t sign)
> +{
> +  return xflowf (sign, 0x1p97f);
> +}
> +

Extra line at EOF here.  Fixed.

I'm merging this patch with the fixes I mentioned to branch tuliom/libmvec.

Thanks!

-- 
Tulio Magno

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [v2 2/2] PPC64: Add libmvec SIMD double-precision natural exponent function.
  2019-05-27 17:47   ` [v2 2/2] PPC64: Add libmvec SIMD double-precision " Shawn Landden
@ 2019-05-29 13:18     ` Tulio Magno Quites Machado Filho
  2019-05-29 19:11       ` [v3 PATCH] " Shawn Landden
  0 siblings, 1 reply; 15+ messages in thread
From: Tulio Magno Quites Machado Filho @ 2019-05-29 13:18 UTC (permalink / raw)
  To: Shawn Landden, libc-alpha

Hi Shawn,

This patch is a lot better.

Shawn Landden <shawn@git.icu> writes:

> Passes all tests.
>
> Unlike other libmvec functions, this sets the onderflow and overflow bits.

s/onderflow/underflow/

> The caller can check these flags, and possibly re-run the calculations with
> scalar expf to figure out what is causing the overflow or underflow.
>
> The special-case path is not vectorized, and performs much woorse than

s/woorse/worse/

> 2019-05-11  Shawn Landden  <shawn@git.icu>
>
> 	[BZ #24209]
> 	* NEWS: Noted the addition of PPC64 vector exp function.
> 	* sysdeps/powerpc/bits/math-vector.h: Added entry for vector exp.
> 	* sysdeps/powerpc/powerpc64/fpu/Versions: Added vector exp entry.
> 	* sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile:
> 	(libmvec-sysdep_routines, CFLAGS-vec_d_exp2_vsx.c):
> 	(CFLAGS-vec_d_exp_data.c): Added build of VSX SIMD expf
> 	function and tests. Added vec_math_err.c to build.
> 	* sysdeps/powerpc/powerpc64/fpu/multiarch/math_config.h: Renamed to ...
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_dbl.h: ... and
>         modified for exp.
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/test-double-vlen2-wrappers.c:
>         Added entry for vector exp.
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c: New file.
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c: Likewise.
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_err.c: Likewise.
>         * sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist: SIMD expf
>         added.
> 	* sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log_data.c: Change to
>         reflect rename of math_config.h
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log2_vsx.c: Likewise

There are still 11 lines indented with spaces instead of tab here.
You can identify easily in this quote.

> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
> index ad3c29b1ab..aceaedee0a 100644
> --- a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
> @@ -47,11 +47,12 @@ ifeq ($(subdir),mathvec)
>  libmvec-sysdep_routines += vec_d_cos2_vsx vec_s_cosf4_vsx \
>  			   vec_d_sin2_vsx vec_s_sinf4_vsx \
>  			   vec_d_log2_vsx vec_d_log_data \
>  			   vec_s_logf4_vsx vec_s_logf_data \
>  			   vec_s_expf4_vsx vec_s_exp2f_data \
> -			   vec_math_errf \
> +			   vec_math_errf vec_math_err \
> +			   vec_d_exp2_vsx vec_d_exp_data \
>  			   vec_d_sincos2_vsx vec_s_sincosf4_vsx
>  CFLAGS-vec_d_cos2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
>  CFLAGS-vec_d_log2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
>  CFLAGS-vec_d_log_data.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
>  CFLAGS-vec_s_cosf4_vsx.c += -mabi=altivec -maltivec -mvsx
> @@ -61,18 +62,20 @@ CFLAGS-vec_d_sin2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
>  CFLAGS-vec_s_sinf4_vsx.c += -mabi=altivec -maltivec -mvsx
>  CFLAGS-vec_d_sincos2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
>  CFLAGS-vec_s_sincosf4_vsx.c += -mabi=altivec -maltivec -mvsx
>  CFLAGS-vec_s_expf4_vsx.c += -mabi=altivec -maltivec -mvsx
>  CFLAGS-vec_s_exp2f_data.c += -mabi=altivec -maltivec -mvsx
> +CFLAGS-vec_d_exp2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
> +CFLAGS-vec_d_exp_data.c += -mabi=altivec -maltivec -mvsx

vec_math_err.c includes altivec.h indirectly, requiring the same CFLAGS.

> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c
> new file mode 100644
> index 0000000000..5d59cc429d
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c
> @@ -0,0 +1,204 @@
> +/* Double-precision vector exp function.
> +   Copyright 2019 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */

We need to reuse the copyright template that is available in the file that
you used as a base for this one.  You just need to update the year.

That means to keep "(C)".

> +/* Based on sysdeps/ieee754/dbl-64/e_exp.c which came from
> +   Szabolcs Nagy at ARM Ltd.  */

Thanks for adding this.

> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c
> new file mode 100644
> index 0000000000..8038a7b592
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c
> @@ -0,0 +1,211 @@
> +/* Shared data between exp, exp2 and pow.

Should this be just exp?

> +
> +   Copyright 2019 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */

Same comment here.

> +/* Based on ./sysdeps/ieee754/dbl-64/e_exp_data.c which came from
> +   Szabolcs Nagy at ARM Ltd.  */

Thanks again.

> +// Used for rounding when !TOINT_INTRINSICS
> +#if EXP_USE_TOINT_NARROW

This usage is still lost here and causes a build failure.

> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_err.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_err.c
> new file mode 100644
> index 0000000000..2f88b81ab2
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_err.c
> @@ -0,0 +1,41 @@
> ...
> +attribute_hidden double
> +__math_oflow (uint32_t sign)
> +{
> +  return xflow (sign, 0x1p769);
> +}
> +

Extra line at EOF.

> diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
> index 63770c8da3..26175bb1a0 100644
> --- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
> +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
> @@ -4,6 +4,7 @@ GLIBC_2.30 _ZGVbN2v_sin F
>  GLIBC_2.30 _ZGVbN2vvv_sincos F
>  GLIBC_2.30 _ZGVbN4v_cosf F
>  GLIBC_2.30 _ZGVbN4v_logf F
>  GLIBC_2.30 _ZGVbN4v_sinf F
>  GLIBC_2.30 _ZGVbN4vvv_sincosf F
> +GLIBC_2.30 _ZGVbN2v_exp F

This is still causing an issue with mathvec/check-abi-libmvec

-- 
Tulio Magno

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [v3 PATCH] PPC64: Add libmvec SIMD double-precision natural exponent function.
  2019-05-29 13:18     ` Tulio Magno Quites Machado Filho
@ 2019-05-29 19:11       ` Shawn Landden
  2019-05-29 19:12         ` Shawn Landden
                           ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: Shawn Landden @ 2019-05-29 19:11 UTC (permalink / raw)
  Cc: libc-alpha, Tulio Magno Quites Machado Filho, Shawn Landden

Passes all tests.

Unlike other libmvec functions, this sets the underflow and overflow bits.
The caller can check these flags, and possibly re-run the calculations with
scalar expf to figure out what is causing the overflow or underflow.

The special-case path is not vectorized, and performs much worse than
the scalar code.
Normalized data: 1 to 2^32 converted to double
Running 20 times over 32MiB
vector: mean 563.807107 MiB/s (sd 0.390922)
scalar: mean 226.527824 MiB/s (sd 0.077406)

Random data:
vector: mean 80.175986 MiB/s (sd 1.110948)
scalar: mean 244.738130 MiB/s (sd 0.029561)

v2: Fix some serious problems with the algorithm. Not
    sure how I missed those problems.
v3: Style problems.  Fix symbol order.

2019-05-11  Shawn Landden  <shawn@git.icu>

	[BZ #24209]
	* NEWS: Noted the addition of PPC64 vector exp function.
	* sysdeps/powerpc/bits/math-vector.h: Added entry for vector exp.
	* sysdeps/powerpc/powerpc64/fpu/Versions: Added vector exp entry.
	* sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile:
	(libmvec-sysdep_routines, CFLAGS-vec_d_exp2_vsx.c):
	(CFLAGS-vec_d_exp_data.c): Added build of VSX SIMD expf
	function and tests. Added vec_math_err.c to build.
	* sysdeps/powerpc/powerpc64/fpu/multiarch/(math_config.h)->(math_config_dbl.h)
	Renamed, and modified for exp.
	* sysdeps/powerpc/powerpc64/fpu/multiarch/test-double-vlen2-wrappers.c:
	Added entry for vector exp.
	* sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c: New file.
	* sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c: Likewise.
	* sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_err.c: Likewise.
	* sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist: SIMD expf
	added.
	* sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log_data.c: Rename of math_config.h
	* sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log2_vsx.c: Likewise
---
 NEWS                                          |   1 +
 sysdeps/powerpc/bits/math-vector.h            |   2 +
 sysdeps/powerpc/powerpc64/fpu/Versions        |   2 +-
 .../powerpc/powerpc64/fpu/multiarch/Makefile  |   8 +-
 .../{math_config.h => math_config_dbl.h}      |  15 +-
 .../multiarch/test-double-vlen2-wrappers.c    |   1 +
 .../powerpc64/fpu/multiarch/vec_d_exp2_vsx.c  | 197 +++++++++++++++++
 .../powerpc64/fpu/multiarch/vec_d_exp_data.c  | 202 ++++++++++++++++++
 .../powerpc64/fpu/multiarch/vec_d_log2_vsx.c  |   2 +-
 .../powerpc64/fpu/multiarch/vec_d_log_data.c  |   2 +-
 .../powerpc64/fpu/multiarch/vec_math_err.c    |  40 ++++
 .../linux/powerpc/powerpc64/libmvec.abilist   |   1 +
 12 files changed, 461 insertions(+), 12 deletions(-)
 rename sysdeps/powerpc/powerpc64/fpu/multiarch/{math_config.h => math_config_dbl.h} (95%)
 create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c
 create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c
 create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_err.c

diff --git a/NEWS b/NEWS
index 26f5c67f75..a33af8c316 100644
--- a/NEWS
+++ b/NEWS
@@ -20,6 +20,7 @@ Major new features:
   - single-precision sine: sinf
   - double-precision sincos: sincos
   - single-precision sincos: sincosf
+  - double-precision exp: exp
   - single-precision exp: expf
 
   GCC support for auto-vectorization of functions on PPC64 is not yet
diff --git a/sysdeps/powerpc/bits/math-vector.h b/sysdeps/powerpc/bits/math-vector.h
index bbc9db0dd9..e7846b4bfa 100644
--- a/sysdeps/powerpc/bits/math-vector.h
+++ b/sysdeps/powerpc/bits/math-vector.h
@@ -46,6 +46,8 @@
 #  define __DECL_SIMD_sincosf __DECL_SIMD_PPC64
 #  undef __DECL_SIMD_expf
 #  define __DECL_SIMD_expf __DECL_SIMD_PPC64
+#  undef __DECL_SIMD_exp
+#  define __DECL_SIMD_exp __DECL_SIMD_PPC64
 
 # endif
 #endif
diff --git a/sysdeps/powerpc/powerpc64/fpu/Versions b/sysdeps/powerpc/powerpc64/fpu/Versions
index 7c45e76074..361edee9e7 100644
--- a/sysdeps/powerpc/powerpc64/fpu/Versions
+++ b/sysdeps/powerpc/powerpc64/fpu/Versions
@@ -2,6 +2,6 @@ libmvec {
   GLIBC_2.30 {
     _ZGVbN2v_cos; _ZGVbN4v_cosf; _ZGVbN2v_sin; _ZGVbN4v_sinf;
     _ZGVbN2v_log; _ZGVbN4v_logf; _ZGVbN2vvv_sincos; _ZGVbN4vvv_sincosf;
-    _ZGVbN4v_expf;
+    _ZGVbN4v_expf; _ZGVbN2v_exp;
   }
 }
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
index d15183d5ee..dd982bb068 100644
--- a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
@@ -49,7 +49,8 @@ libmvec-sysdep_routines += vec_d_cos2_vsx vec_s_cosf4_vsx \
 			   vec_d_log2_vsx vec_d_log_data \
 			   vec_s_logf4_vsx vec_s_logf_data \
 			   vec_s_expf4_vsx vec_s_exp2f_data \
-			   vec_math_errf \
+			   vec_math_errf vec_math_err \
+			   vec_d_exp2_vsx vec_d_exp_data \
 			   vec_d_sincos2_vsx vec_s_sincosf4_vsx
 CFLAGS-vec_d_cos2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
 CFLAGS-vec_d_log2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
@@ -64,6 +65,9 @@ CFLAGS-vec_d_sincos2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
 CFLAGS-vec_s_sincosf4_vsx.c += -mabi=altivec -maltivec -mvsx
 CFLAGS-vec_s_expf4_vsx.c += -mabi=altivec -maltivec -mvsx
 CFLAGS-vec_s_exp2f_data.c += -mabi=altivec -maltivec -mvsx
+CFLAGS-vec_d_exp2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
+CFLAGS-vec_d_exp_data.c += -mabi=altivec -maltivec -mvsx
+CFLAGS-vec_math_err.c += -mabi=altivec -maltivec -mvsx
 endif
 
 # Variables for libmvec tests.
@@ -71,7 +75,7 @@ ifeq ($(subdir),math)
 ifeq ($(build-mathvec),yes)
 libmvec-tests += double-vlen2 float-vlen4
 
-double-vlen2-funcs = cos sin sincos log
+double-vlen2-funcs = cos sin sincos log exp
 float-vlen4-funcs = cos sin sincos log exp
 
 double-vlen2-arch-ext-cflags = -mabi=altivec -maltivec -mvsx -DREQUIRE_VSX
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config.h b/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_dbl.h
similarity index 95%
rename from sysdeps/powerpc/powerpc64/fpu/multiarch/math_config.h
rename to sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_dbl.h
index bd37906ecb..7c94f5b9b2 100644
--- a/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config.h
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_dbl.h
@@ -24,6 +24,7 @@
 #include <nan-high-order-bit.h>
 #include <stdint.h>
 #include <altivec.h>
+#include <math-narrow-eval.h>
 
 #ifndef WANT_ROUNDING
 /* Correct special case results in non-nearest rounding modes.  */
@@ -134,13 +135,13 @@ check_uflow (double x)
 #define EXP2_POLY_ORDER 5
 extern const struct exp_data
 {
-  double invln2N;
-  double shift;
-  double negln2hiN;
-  double negln2loN;
-  double poly[4]; /* Last four coefficients.  */
-  double exp2_shift;
-  double exp2_poly[EXP2_POLY_ORDER];
+  vector double invln2N;
+  vector double shift;
+  vector double negln2hiN;
+  vector double negln2loN;
+  vector double poly[4]; /* Last four coefficients.  */
+  vector double exp2_shift;
+  vector double exp2_poly[EXP2_POLY_ORDER];
   uint64_t tab[2*(1 << EXP_TABLE_BITS)];
 } __exp_data attribute_hidden;
 
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/test-double-vlen2-wrappers.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/test-double-vlen2-wrappers.c
index c9e2dbfdd0..f368e47947 100644
--- a/sysdeps/powerpc/powerpc64/fpu/multiarch/test-double-vlen2-wrappers.c
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/test-double-vlen2-wrappers.c
@@ -24,4 +24,5 @@
 VECTOR_WRAPPER (WRAPPER_NAME (cos), _ZGVbN2v_cos)
 VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVbN2v_sin)
 VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVbN2v_log)
+VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVbN2v_exp)
 VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincos), _ZGVbN2vvv_sincos)
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c
new file mode 100644
index 0000000000..376531e32b
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c
@@ -0,0 +1,197 @@
+/* Double-precision vector exp(x) function.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+/* Based on sysdeps/ieee754/dbl-64/e_exp.c which came from
+   Szabolcs Nagy at ARM Ltd. */
+#include <altivec.h>
+#include <math.h>
+#include <math-barriers.h>
+#include <math-narrow-eval.h>
+
+#include "math_config_dbl.h"
+
+typedef vector long long unsigned v64u;
+typedef union {
+  vector unsigned u;
+  vector float f;
+  v64u l;
+  vector double d;
+} u;
+typedef union {
+  double d;
+  uint64_t l;
+  unsigned u;
+  float f;
+} us;
+
+#define N (1 << EXP_TABLE_BITS)
+#define InvLn2N __exp_data.invln2N
+#define NegLn2hiN __exp_data.negln2hiN
+#define NegLn2loN __exp_data.negln2loN
+#define Shift __exp_data.shift
+#define T __exp_data.tab
+#define C2 __exp_data.poly[5 - EXP_POLY_ORDER]
+#define C3 __exp_data.poly[6 - EXP_POLY_ORDER]
+#define C4 __exp_data.poly[7 - EXP_POLY_ORDER]
+#define C5 __exp_data.poly[8 - EXP_POLY_ORDER]
+
+/* Handle cases that may overflow or underflow when computing the result that
+   is scale*(1+TMP) without intermediate rounding.  The bit representation of
+   scale is in SBITS, however it has a computed exponent that may have
+   overflown into the sign bit so that needs to be adjusted before using it as
+   a double.  (int32_t)KI is the k used in the argument reduction and exponent
+   adjustment of scale, positive k here means the result may overflow and
+   negative k means the result may underflow.  */
+static inline double
+specialcase (double_t tmp, uint64_t sbits, uint64_t ki)
+{
+  double_t scale, y;
+
+  if ((ki & 0x80000000) == 0)
+    {
+      /* k > 0, the exponent of scale might have overflowed by <= 460.  */
+      sbits -= 1009ull << 52;
+      scale = asdouble (sbits);
+      y = 0x1p1009 * (scale + scale * tmp);
+      return y;
+    }
+  /* k < 0, need special care in the subnormal range.  */
+  sbits += 1022ull << 52;
+  scale = asdouble (sbits);
+  y = scale + scale * tmp;
+  if (y < 1.0)
+    {
+      /* Round y to the right precision before scaling it into the subnormal
+	 range to avoid double rounding that can cause 0.5+E/2 ulp error where
+	 E is the worst-case ulp error outside the subnormal range.  So this
+	 is only useful if the goal is better than 1 ulp worst-case error.  */
+      double_t hi, lo;
+      lo = scale - y + scale * tmp;
+      hi = 1.0 + y;
+      lo = 1.0 - hi + y + lo;
+      y = math_narrow_eval (hi + lo) - 1.0;
+      /* Avoid -0.0 with downward rounding.  */
+      if (WANT_ROUNDING && y == 0.0)
+	y = 0.0;
+      /* The underflow exception needs to be signaled explicitly.  */
+      math_force_eval (math_opt_barrier (0x1p-1022) * 0x1p-1022);
+    }
+  y = 0x1p-1022 * y;
+  return y;
+}
+
+/* Top 12 bits of a double (sign and exponent bits).  */
+static inline uint32_t top12(double x)
+{
+	return asuint64(x) >> 52;
+}
+
+vector double
+_ZGVbN2v_exp (vector double x)
+{
+	v64u abstop, is_special_case, is_special_case2, idx, top, sbits;
+	vector double kd, z, r, r2, scale, tail, tmp;
+	u t, t2;
+	u res;
+	u xu;
+	xu.d = x;
+	v64u zero = {0, 0};
+	u c512;
+	vector double load = {512.0, 512.0};
+	c512.d = load;
+	u normalize;
+	vector double load2 = {0x1p-54, 0x1p-54};
+	normalize.d = load2;
+	c512.l &= 0x7ff0000000000000;
+	normalize.l &= 0x7ff0000000000000;
+	abstop = xu.l & 0x7ff0000000000000;
+	is_special_case = (v64u) vec_cmpge (xu.l - normalize.l, c512.l - normalize.l);
+	if (__glibc_unlikely (!vec_all_eq (is_special_case, zero)))
+	{
+		for (int m=0;m<2;++m)
+		{
+			if (!is_special_case[m])
+				continue;
+			double v;
+			uint32_t abstops = abstop[m] >> 52;
+			if (!is_special_case[m])
+				continue;
+			v = x[m];
+			if (abstops - top12(0x1p-54) >= 0x80000000)
+				/* Avoid spurious underflow for tiny x.  */
+				/* Note: 0 is common input.  */
+				res.d[m] = WANT_ROUNDING ? 1.0 + v : 1.0;
+			if (abstops >= top12(1024.0)) {
+				if (asuint64(v) == asuint64(-INFINITY))
+					res.d[m] = 0.0;
+				else if (abstops >= top12(INFINITY))
+					res.d[m] = 1.0 + v;
+				else if (asuint64(v) >> 63)
+					res.d[m] = __math_uflow(0);
+				else
+					res.d[m] = __math_oflow(0);
+				/* prevent is_special_case2 from matching */
+				abstop[m] = 1;
+			} else {
+	 			/* Large x is special cased below.  */
+ 				abstop[m] = 0;
+				is_special_case[m] = 0;
+			}
+		}
+	}
+	/* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)].  */
+	/* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N].  */
+	z = InvLn2N * x;
+	u kdu;
+	/* z - kd is in [-1, 1] in non-nearest rounding modes.  */
+	kd = z + Shift;
+	kdu.d = kd;
+	kd -= Shift;
+	r = x + kd * NegLn2hiN + kd * NegLn2loN;
+	/* 2^(k/N) ~= scale * (1 + tail).  */
+	idx = 2 * (kdu.l % N);
+	top = kdu.l << (52 - EXP_TABLE_BITS);
+	v64u load3 = {T[idx[0]], T[idx[1]]};
+	t.l = load3;
+	v64u load4 = {T[idx[0] + 1], T[idx[1] + 1]};
+	t2.l = load4;
+	tail = t.d;
+	/* This is only a valid scale when -1023*N < k < 1024*N.  */
+	sbits = t2.l + top;
+	/* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (tail + exp(r) - 1).  */
+	/* Evaluation is optimized assuming superscalar pipelined execution.  */
+	r2 = r * r;
+	/* Without fma the worst case error is 0.25/N ulp larger.  */
+	/* Worst case error is less than 0.5+1.11/N+(abs poly error * 2^53) ulp.  */
+	tmp = tail + r + r2 * (C2 + r * C3) + r2 * r2 * (C4 + r * C5);
+	is_special_case2 = (v64u) vec_cmpeq (abstop, zero);
+	if (__glibc_unlikely (!vec_all_eq (is_special_case2, zero)))
+	{
+		u sc2;
+		for (int m=0;m<2;++m)
+		{
+			sc2.d[m] = specialcase(tmp[m], sbits[m], kdu.l[m]);
+		}
+		res.l = vec_sel (res.l, sc2.l, is_special_case2);
+	}
+	u sbitsu;
+	sbitsu.l = sbits;
+	scale = sbitsu.d;
+	u res2;
+	res2.d = scale + scale * tmp;
+	/* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
+	   is no spurious underflow here even without fma.  */
+	res.l = vec_sel (res.l, res2.l, ~is_special_case & ~is_special_case2);
+	return res.d;
+}
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c
new file mode 100644
index 0000000000..208b897d04
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c
@@ -0,0 +1,202 @@
+/* Shared data between vector exp, exp2 and pow
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+/* Based on sysdeps/ieee754/dbl-64/e_exp.c which came from
+   Szabolcs Nagy at ARM Ltd. */
+
+#include "math_config_dbl.h"
+
+#define N (1 << EXP_TABLE_BITS)
+
+const struct exp_data __exp_data = {
+// N/ln2
+.invln2N = {
+  0x1.71547652b82fep0 * N,
+  0x1.71547652b82fep0 * N,
+},
+// -ln2/N
+.negln2hiN = {
+  -0x1.62e42fefa0000p-8,
+  -0x1.62e42fefa0000p-8,
+},
+.negln2loN = {
+  -0x1.cf79abc9e3b3ap-47,
+  -0x1.cf79abc9e3b3ap-47,
+},
+.shift = {
+  0x1.8p52,
+  0x1.8p52,
+},
+// exp polynomial coefficients.
+.poly = {
+// abs error: 1.555*2^-66
+// ulp error: 0.509 (0.511 without fma)
+// if |x| < ln2/256+eps
+// abs error if |x| < ln2/256+0x1p-15: 1.09*2^-65
+// abs error if |x| < ln2/128: 1.7145*2^-56
+{0x1.ffffffffffdbdp-2, 0x1.ffffffffffdbdp-2},
+{0x1.555555555543cp-3, 0x1.555555555543cp-3},
+{0x1.55555cf172b91p-5, 0x1.55555cf172b91p-5},
+{0x1.1111167a4d017p-7, 0x1.1111167a4d017p-7},
+},
+.exp2_shift = {
+  0x1.8p52 / N,
+  0x1.8p52 / N,
+},
+// exp2 polynomial coefficients.
+.exp2_poly = {
+// abs error: 1.2195*2^-65
+// ulp error: 0.507 (0.511 without fma)
+// if |x| < 1/256
+// abs error if |x| < 1/128: 1.9941*2^-56
+{0x1.62e42fefa39efp-1, 0x1.62e42fefa39efp-1},
+{0x1.ebfbdff82c424p-3, 0x1.ebfbdff82c424p-3},
+{0x1.c6b08d70cf4b5p-5, 0x1.c6b08d70cf4b5p-5},
+{0x1.3b2abd24650ccp-7, 0x1.3b2abd24650ccp-7},
+{0x1.5d7e09b4e3a84p-10, 0x1.5d7e09b4e3a84p-10},
+},
+// 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N)
+// tab[2*k] = asuint64(T[k])
+// tab[2*k+1] = asuint64(H[k]) - (k << 52)/N
+.tab = {
+0x0, 0x3ff0000000000000,
+0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335,
+0xbc7160139cd8dc5d, 0x3fefec9a3e778061,
+0xbc905e7a108766d1, 0x3fefe315e86e7f85,
+0x3c8cd2523567f613, 0x3fefd9b0d3158574,
+0xbc8bce8023f98efa, 0x3fefd06b29ddf6de,
+0x3c60f74e61e6c861, 0x3fefc74518759bc8,
+0x3c90a3e45b33d399, 0x3fefbe3ecac6f383,
+0x3c979aa65d837b6d, 0x3fefb5586cf9890f,
+0x3c8eb51a92fdeffc, 0x3fefac922b7247f7,
+0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2,
+0xbc6a033489906e0b, 0x3fef9b66affed31b,
+0xbc9556522a2fbd0e, 0x3fef9301d0125b51,
+0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc,
+0xbc91c923b9d5f416, 0x3fef829aaea92de0,
+0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51,
+0xbc801b15eaa59348, 0x3fef72b83c7d517b,
+0xbc8f1ff055de323d, 0x3fef6af9388c8dea,
+0x3c8b898c3f1353bf, 0x3fef635beb6fcb75,
+0xbc96d99c7611eb26, 0x3fef5be084045cd4,
+0x3c9aecf73e3a2f60, 0x3fef54873168b9aa,
+0xbc8fe782cb86389d, 0x3fef4d5022fcd91d,
+0x3c8a6f4144a6c38d, 0x3fef463b88628cd6,
+0x3c807a05b0e4047d, 0x3fef3f49917ddc96,
+0x3c968efde3a8a894, 0x3fef387a6e756238,
+0x3c875e18f274487d, 0x3fef31ce4fb2a63f,
+0x3c80472b981fe7f2, 0x3fef2b4565e27cdd,
+0xbc96b87b3f71085e, 0x3fef24dfe1f56381,
+0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1,
+0xbc3d219b1a6fbffa, 0x3fef187fd0dad990,
+0x3c8b3782720c0ab4, 0x3fef1285a6e4030b,
+0x3c6e149289cecb8f, 0x3fef0cafa93e2f56,
+0x3c834d754db0abb6, 0x3fef06fe0a31b715,
+0x3c864201e2ac744c, 0x3fef0170fc4cd831,
+0x3c8fdd395dd3f84a, 0x3feefc08b26416ff,
+0xbc86a3803b8e5b04, 0x3feef6c55f929ff1,
+0xbc924aedcc4b5068, 0x3feef1a7373aa9cb,
+0xbc9907f81b512d8e, 0x3feeecae6d05d866,
+0xbc71d1e83e9436d2, 0x3feee7db34e59ff7,
+0xbc991919b3ce1b15, 0x3feee32dc313a8e5,
+0x3c859f48a72a4c6d, 0x3feedea64c123422,
+0xbc9312607a28698a, 0x3feeda4504ac801c,
+0xbc58a78f4817895b, 0x3feed60a21f72e2a,
+0xbc7c2c9b67499a1b, 0x3feed1f5d950a897,
+0x3c4363ed60c2ac11, 0x3feece086061892d,
+0x3c9666093b0664ef, 0x3feeca41ed1d0057,
+0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0,
+0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de,
+0x3c7690cebb7aafb0, 0x3feebfdad5362a27,
+0x3c931dbdeb54e077, 0x3feebcb299fddd0d,
+0xbc8f94340071a38e, 0x3feeb9b2769d2ca7,
+0xbc87deccdc93a349, 0x3feeb6daa2cf6642,
+0xbc78dec6bd0f385f, 0x3feeb42b569d4f82,
+0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f,
+0x3c93350518fdd78e, 0x3feeaf4736b527da,
+0x3c7b98b72f8a9b05, 0x3feead12d497c7fd,
+0x3c9063e1e21c5409, 0x3feeab07dd485429,
+0x3c34c7855019c6ea, 0x3feea9268a5946b7,
+0x3c9432e62b64c035, 0x3feea76f15ad2148,
+0xbc8ce44a6199769f, 0x3feea5e1b976dc09,
+0xbc8c33c53bef4da8, 0x3feea47eb03a5585,
+0xbc845378892be9ae, 0x3feea34634ccc320,
+0xbc93cedd78565858, 0x3feea23882552225,
+0x3c5710aa807e1964, 0x3feea155d44ca973,
+0xbc93b3efbf5e2228, 0x3feea09e667f3bcd,
+0xbc6a12ad8734b982, 0x3feea012750bdabf,
+0xbc6367efb86da9ee, 0x3fee9fb23c651a2f,
+0xbc80dc3d54e08851, 0x3fee9f7df9519484,
+0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74,
+0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174,
+0xbc8619321e55e68a, 0x3fee9feb564267c9,
+0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f,
+0xbc7b32dcb94da51d, 0x3feea11473eb0187,
+0x3c94ecfd5467c06b, 0x3feea1ed0130c132,
+0x3c65ebe1abd66c55, 0x3feea2f336cf4e62,
+0xbc88a1c52fb3cf42, 0x3feea427543e1a12,
+0xbc9369b6f13b3734, 0x3feea589994cce13,
+0xbc805e843a19ff1e, 0x3feea71a4623c7ad,
+0xbc94d450d872576e, 0x3feea8d99b4492ed,
+0x3c90ad675b0e8a00, 0x3feeaac7d98a6699,
+0x3c8db72fc1f0eab4, 0x3feeace5422aa0db,
+0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c,
+0x3c7bf68359f35f44, 0x3feeb1ae99157736,
+0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6,
+0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5,
+0xbc6c23f97c90b959, 0x3feeba44cbc8520f,
+0xbc92434322f4f9aa, 0x3feebd829fde4e50,
+0xbc85ca6cd7668e4b, 0x3feec0f170ca07ba,
+0x3c71affc2b91ce27, 0x3feec49182a3f090,
+0x3c6dd235e10a73bb, 0x3feec86319e32323,
+0xbc87c50422622263, 0x3feecc667b5de565,
+0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33,
+0xbc91bbd1d3bcbb15, 0x3feed503b23e255d,
+0x3c90cc319cee31d2, 0x3feed99e1330b358,
+0x3c8469846e735ab3, 0x3feede6b5579fdbf,
+0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a,
+0x3c8c1a7792cb3387, 0x3feee89f995ad3ad,
+0xbc907b8f4ad1d9fa, 0x3feeee07298db666,
+0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb,
+0xbc90a40e3da6f640, 0x3feef9728de5593a,
+0xbc68d6f438ad9334, 0x3feeff76f2fb5e47,
+0xbc91eee26b588a35, 0x3fef05b030a1064a,
+0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2,
+0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09,
+0x3c736eae30af0cb3, 0x3fef199bdd85529c,
+0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a,
+0x3c84e08fd10959ac, 0x3fef27f12e57d14b,
+0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5,
+0x3c676b2c6c921968, 0x3fef3720dcef9069,
+0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa,
+0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c,
+0xbc900dae3875a949, 0x3fef4f87080d89f2,
+0x3c74a385a63d07a7, 0x3fef5818dcfba487,
+0xbc82919e2040220f, 0x3fef60e316c98398,
+0x3c8e5a50d5c192ac, 0x3fef69e603db3285,
+0x3c843a59ac016b4b, 0x3fef7321f301b460,
+0xbc82d52107b43e1f, 0x3fef7c97337b9b5f,
+0xbc892ab93b470dc9, 0x3fef864614f5a129,
+0x3c74b604603a88d3, 0x3fef902ee78b3ff6,
+0x3c83c5ec519d7271, 0x3fef9a51fbc74c83,
+0xbc8ff7128fd391f0, 0x3fefa4afa2a490da,
+0xbc8dae98e223747d, 0x3fefaf482d8e67f1,
+0x3c8ec3bc41aa2008, 0x3fefba1bee615a27,
+0x3c842b94c3a9eb32, 0x3fefc52b376bba97,
+0x3c8a64a931d185ee, 0x3fefd0765b6e4540,
+0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14,
+0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8,
+0x3c5305c14160cc89, 0x3feff3c22b8f71f1,
+},
+};
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log2_vsx.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log2_vsx.c
index d7588a7929..3a9fb05aee 100644
--- a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log2_vsx.c
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log2_vsx.c
@@ -21,7 +21,7 @@
 #include <math/math-svid-compat.h>
 #include <shlib-compat.h>
 #include <libm-alias-double.h>
-#include "math_config.h"
+#include "math_config_dbl.h"
 #include <altivec.h>
 
 #define T __log_data.tab
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log_data.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log_data.c
index 2fe569576c..f1299404c4 100644
--- a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log_data.c
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log_data.c
@@ -16,7 +16,7 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include "math_config.h"
+#include "math_config_dbl.h"
 
 #define N (1 << LOG_TABLE_BITS)
 
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_err.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_err.c
new file mode 100644
index 0000000000..e60b0d232f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_err.c
@@ -0,0 +1,40 @@
+/* Double-precision math error handling.
+   Copyright (C) 2017-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fpu/math-barriers.h>
+#include "math_config_dbl.h"
+
+/* NOINLINE reduces code size.  */
+NOINLINE static double
+xflow (uint32_t sign, double y)
+{
+  y = math_opt_barrier (sign ? -y : y) * y;
+  return y;
+}
+
+attribute_hidden double
+__math_uflow (uint32_t sign)
+{
+  return xflow (sign, 0x1p-767);
+}
+
+attribute_hidden double
+__math_oflow (uint32_t sign)
+{
+  return xflow (sign, 0x1p769);
+}
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
index bfbd340626..46558dbd77 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
@@ -1,4 +1,5 @@
 GLIBC_2.30 _ZGVbN2v_cos F
+GLIBC_2.30 _ZGVbN2v_exp F
 GLIBC_2.30 _ZGVbN2v_log F
 GLIBC_2.30 _ZGVbN2v_sin F
 GLIBC_2.30 _ZGVbN2vvv_sincos F
-- 
2.21.0.1020.gf2820cf01a


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [v3 PATCH] PPC64: Add libmvec SIMD double-precision natural exponent function.
  2019-05-29 19:11       ` [v3 PATCH] " Shawn Landden
@ 2019-05-29 19:12         ` Shawn Landden
  2019-05-29 19:19           ` Florian Weimer
  2019-06-04  2:43         ` Shawn Landden
  2019-06-04 20:00         ` Tulio Magno Quites Machado Filho
  2 siblings, 1 reply; 15+ messages in thread
From: Shawn Landden @ 2019-05-29 19:12 UTC (permalink / raw)
  Cc: libc-alpha, Tulio Magno Quites Machado Filho

On Wed, May 29, 2019 at 2:11 PM Shawn Landden <shawn@git.icu> wrote:
>
> Passes all tests.
I change this but forgot to regenerate the patch. I am getting a bunch
of seemingly unrelated test failures.
FAIL: dlfcn/bug-atexit3
FAIL: elf/check-abi-libc
XPASS: elf/tst-protected1a
XPASS: elf/tst-protected1b
FAIL: elf/tst-unwind-main
FAIL: iconv/tst-iconv-mt
FAIL: inet/tst-idna_name_classify
FAIL: malloc/tst-malloc-stats-cancellation
UNSUPPORTED: math/test-fesetexcept-traps
UNSUPPORTED: math/test-fexcept-traps
FAIL: misc/tst-gettid-kill
FAIL: misc/tst-pkey
UNSUPPORTED: nptl/test-cond-printers
UNSUPPORTED: nptl/test-condattr-printers
FAIL: nptl/test-get_hwcap
UNSUPPORTED: nptl/test-mutex-printers
UNSUPPORTED: nptl/test-mutexattr-printers
UNSUPPORTED: nptl/test-rwlock-printers
UNSUPPORTED: nptl/test-rwlockattr-printers
FAIL: nptl/tst-basic3
FAIL: nptl/tst-basic4
FAIL: nptl/tst-call-once
FAIL: nptl/tst-cancel-self
FAIL: nptl/tst-cancel-self-cancelstate
FAIL: nptl/tst-cancel-self-canceltype
FAIL: nptl/tst-cancel-self-testcancel
FAIL: nptl/tst-cancel1
FAIL: nptl/tst-cancel10
FAIL: nptl/tst-cancel11
FAIL: nptl/tst-cancel12
FAIL: nptl/tst-cancel13
FAIL: nptl/tst-cancel14
FAIL: nptl/tst-cancel15
FAIL: nptl/tst-cancel16
FAIL: nptl/tst-cancel17
FAIL: nptl/tst-cancel18
FAIL: nptl/tst-cancel2
FAIL: nptl/tst-cancel20
FAIL: nptl/tst-cancel21
FAIL: nptl/tst-cancel22
FAIL: nptl/tst-cancel23
FAIL: nptl/tst-cancel24
FAIL: nptl/tst-cancel25
FAIL: nptl/tst-cancel3
FAIL: nptl/tst-cancel4
FAIL: nptl/tst-cancel4_1
FAIL: nptl/tst-cancel4_2
FAIL: nptl/tst-cancel5
FAIL: nptl/tst-cancel6
FAIL: nptl/tst-cancel7
FAIL: nptl/tst-cancel8
FAIL: nptl/tst-cancel9
FAIL: nptl/tst-cancelx10
FAIL: nptl/tst-cancelx11
FAIL: nptl/tst-cancelx12
FAIL: nptl/tst-cancelx13
FAIL: nptl/tst-cancelx14
FAIL: nptl/tst-cancelx15
FAIL: nptl/tst-cancelx16
FAIL: nptl/tst-cancelx17
FAIL: nptl/tst-cancelx18
FAIL: nptl/tst-cancelx2
FAIL: nptl/tst-cancelx20
FAIL: nptl/tst-cancelx21
FAIL: nptl/tst-cancelx3
FAIL: nptl/tst-cancelx4
FAIL: nptl/tst-cancelx5
FAIL: nptl/tst-cancelx6
FAIL: nptl/tst-cancelx7
FAIL: nptl/tst-cancelx8
FAIL: nptl/tst-cancelx9
FAIL: nptl/tst-cleanup0
FAIL: nptl/tst-cleanup0-cmp
FAIL: nptl/tst-cleanup1
FAIL: nptl/tst-cleanup3
FAIL: nptl/tst-cleanup4
FAIL: nptl/tst-cleanupx0
FAIL: nptl/tst-cleanupx1
FAIL: nptl/tst-cleanupx3
FAIL: nptl/tst-cleanupx4
FAIL: nptl/tst-cnd-basic
FAIL: nptl/tst-cnd-broadcast
FAIL: nptl/tst-cnd-timedwait
FAIL: nptl/tst-cond-except
FAIL: nptl/tst-cond22
FAIL: nptl/tst-cond25
FAIL: nptl/tst-cond7
FAIL: nptl/tst-cond8
FAIL: nptl/tst-default-attr
FAIL: nptl/tst-exec5
FAIL: nptl/tst-execstack
FAIL: nptl/tst-exit2
FAIL: nptl/tst-exit3
FAIL: nptl/tst-fini1
FAIL: nptl/tst-join1
FAIL: nptl/tst-join5
FAIL: nptl/tst-join6
FAIL: nptl/tst-join8
FAIL: nptl/tst-join9
FAIL: nptl/tst-key3
FAIL: nptl/tst-minstack-cancel
FAIL: nptl/tst-minstack-exit
FAIL: nptl/tst-minstack-throw
FAIL: nptl/tst-mtx-basic
FAIL: nptl/tst-mtx-timedlock
FAIL: nptl/tst-mtx-trylock
FAIL: nptl/tst-mutex8
FAIL: nptl/tst-mutexpi8
FAIL: nptl/tst-once3
FAIL: nptl/tst-once4
FAIL: nptl/tst-oncex3
FAIL: nptl/tst-oncex4
FAIL: nptl/tst-robust1
FAIL: nptl/tst-robust2
FAIL: nptl/tst-robust3
FAIL: nptl/tst-robust4
FAIL: nptl/tst-robust5
FAIL: nptl/tst-robust6
FAIL: nptl/tst-robust7
FAIL: nptl/tst-robustpi1
FAIL: nptl/tst-robustpi2
FAIL: nptl/tst-robustpi3
FAIL: nptl/tst-robustpi4
FAIL: nptl/tst-robustpi5
FAIL: nptl/tst-robustpi6
FAIL: nptl/tst-robustpi7
FAIL: nptl/tst-sem11
FAIL: nptl/tst-sem12
FAIL: nptl/tst-sem16
FAIL: nptl/tst-stack4
FAIL: nptl/tst-thrd-detach
FAIL: nptl/tst-thrd-sleep
FAIL: nptl/tst-thread-exit-clobber
FAIL: nptl/tst-thread_local1
FAIL: nptl/tst-tsd5
FAIL: nptl/tst-tss-basic
FAIL: nptl/tst-unwind-thread
FAIL: nss/tst-cancel-getpwuid_r
FAIL: nss/tst-nss-files-hosts-long
FAIL: nss/tst-nss-test3
FAIL: posix/tst-getopt-cancel
UNSUPPORTED: posix/tst-spawn4-compat
UNSUPPORTED: resolv/tst-resolv-ai_idn
UNSUPPORTED: resolv/tst-resolv-ai_idn-latin1
FAIL: rt/tst-cpuclock2
FAIL: rt/tst-mqueue8
FAIL: rt/tst-mqueue8x
FAIL: rt/tst-shm-cancel
FAIL: stdlib/tst-quick_exit
FAIL: stdlib/tst-thread-quick_exit

>
> Unlike other libmvec functions, this sets the underflow and overflow bits.
> The caller can check these flags, and possibly re-run the calculations with
> scalar expf to figure out what is causing the overflow or underflow.
>
> The special-case path is not vectorized, and performs much worse than
> the scalar code.
> Normalized data: 1 to 2^32 converted to double
> Running 20 times over 32MiB
> vector: mean 563.807107 MiB/s (sd 0.390922)
> scalar: mean 226.527824 MiB/s (sd 0.077406)
>
> Random data:
> vector: mean 80.175986 MiB/s (sd 1.110948)
> scalar: mean 244.738130 MiB/s (sd 0.029561)
>
> v2: Fix some serious problems with the algorithm. Not
>     sure how I missed those problems.
> v3: Style problems.  Fix symbol order.
>
> 2019-05-11  Shawn Landden  <shawn@git.icu>
>
>         [BZ #24209]
>         * NEWS: Noted the addition of PPC64 vector exp function.
>         * sysdeps/powerpc/bits/math-vector.h: Added entry for vector exp.
>         * sysdeps/powerpc/powerpc64/fpu/Versions: Added vector exp entry.
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile:
>         (libmvec-sysdep_routines, CFLAGS-vec_d_exp2_vsx.c):
>         (CFLAGS-vec_d_exp_data.c): Added build of VSX SIMD expf
>         function and tests. Added vec_math_err.c to build.
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/(math_config.h)->(math_config_dbl.h)
>         Renamed, and modified for exp.
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/test-double-vlen2-wrappers.c:
>         Added entry for vector exp.
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c: New file.
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c: Likewise.
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_err.c: Likewise.
>         * sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist: SIMD expf
>         added.
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log_data.c: Rename of math_config.h
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log2_vsx.c: Likewise
> ---
>  NEWS                                          |   1 +
>  sysdeps/powerpc/bits/math-vector.h            |   2 +
>  sysdeps/powerpc/powerpc64/fpu/Versions        |   2 +-
>  .../powerpc/powerpc64/fpu/multiarch/Makefile  |   8 +-
>  .../{math_config.h => math_config_dbl.h}      |  15 +-
>  .../multiarch/test-double-vlen2-wrappers.c    |   1 +
>  .../powerpc64/fpu/multiarch/vec_d_exp2_vsx.c  | 197 +++++++++++++++++
>  .../powerpc64/fpu/multiarch/vec_d_exp_data.c  | 202 ++++++++++++++++++
>  .../powerpc64/fpu/multiarch/vec_d_log2_vsx.c  |   2 +-
>  .../powerpc64/fpu/multiarch/vec_d_log_data.c  |   2 +-
>  .../powerpc64/fpu/multiarch/vec_math_err.c    |  40 ++++
>  .../linux/powerpc/powerpc64/libmvec.abilist   |   1 +
>  12 files changed, 461 insertions(+), 12 deletions(-)
>  rename sysdeps/powerpc/powerpc64/fpu/multiarch/{math_config.h => math_config_dbl.h} (95%)
>  create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c
>  create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c
>  create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_err.c
>
> diff --git a/NEWS b/NEWS
> index 26f5c67f75..a33af8c316 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -20,6 +20,7 @@ Major new features:
>    - single-precision sine: sinf
>    - double-precision sincos: sincos
>    - single-precision sincos: sincosf
> +  - double-precision exp: exp
>    - single-precision exp: expf
>
>    GCC support for auto-vectorization of functions on PPC64 is not yet
> diff --git a/sysdeps/powerpc/bits/math-vector.h b/sysdeps/powerpc/bits/math-vector.h
> index bbc9db0dd9..e7846b4bfa 100644
> --- a/sysdeps/powerpc/bits/math-vector.h
> +++ b/sysdeps/powerpc/bits/math-vector.h
> @@ -46,6 +46,8 @@
>  #  define __DECL_SIMD_sincosf __DECL_SIMD_PPC64
>  #  undef __DECL_SIMD_expf
>  #  define __DECL_SIMD_expf __DECL_SIMD_PPC64
> +#  undef __DECL_SIMD_exp
> +#  define __DECL_SIMD_exp __DECL_SIMD_PPC64
>
>  # endif
>  #endif
> diff --git a/sysdeps/powerpc/powerpc64/fpu/Versions b/sysdeps/powerpc/powerpc64/fpu/Versions
> index 7c45e76074..361edee9e7 100644
> --- a/sysdeps/powerpc/powerpc64/fpu/Versions
> +++ b/sysdeps/powerpc/powerpc64/fpu/Versions
> @@ -2,6 +2,6 @@ libmvec {
>    GLIBC_2.30 {
>      _ZGVbN2v_cos; _ZGVbN4v_cosf; _ZGVbN2v_sin; _ZGVbN4v_sinf;
>      _ZGVbN2v_log; _ZGVbN4v_logf; _ZGVbN2vvv_sincos; _ZGVbN4vvv_sincosf;
> -    _ZGVbN4v_expf;
> +    _ZGVbN4v_expf; _ZGVbN2v_exp;
>    }
>  }
> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
> index d15183d5ee..dd982bb068 100644
> --- a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
> @@ -49,7 +49,8 @@ libmvec-sysdep_routines += vec_d_cos2_vsx vec_s_cosf4_vsx \
>                            vec_d_log2_vsx vec_d_log_data \
>                            vec_s_logf4_vsx vec_s_logf_data \
>                            vec_s_expf4_vsx vec_s_exp2f_data \
> -                          vec_math_errf \
> +                          vec_math_errf vec_math_err \
> +                          vec_d_exp2_vsx vec_d_exp_data \
>                            vec_d_sincos2_vsx vec_s_sincosf4_vsx
>  CFLAGS-vec_d_cos2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
>  CFLAGS-vec_d_log2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
> @@ -64,6 +65,9 @@ CFLAGS-vec_d_sincos2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
>  CFLAGS-vec_s_sincosf4_vsx.c += -mabi=altivec -maltivec -mvsx
>  CFLAGS-vec_s_expf4_vsx.c += -mabi=altivec -maltivec -mvsx
>  CFLAGS-vec_s_exp2f_data.c += -mabi=altivec -maltivec -mvsx
> +CFLAGS-vec_d_exp2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
> +CFLAGS-vec_d_exp_data.c += -mabi=altivec -maltivec -mvsx
> +CFLAGS-vec_math_err.c += -mabi=altivec -maltivec -mvsx
>  endif
>
>  # Variables for libmvec tests.
> @@ -71,7 +75,7 @@ ifeq ($(subdir),math)
>  ifeq ($(build-mathvec),yes)
>  libmvec-tests += double-vlen2 float-vlen4
>
> -double-vlen2-funcs = cos sin sincos log
> +double-vlen2-funcs = cos sin sincos log exp
>  float-vlen4-funcs = cos sin sincos log exp
>
>  double-vlen2-arch-ext-cflags = -mabi=altivec -maltivec -mvsx -DREQUIRE_VSX
> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config.h b/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_dbl.h
> similarity index 95%
> rename from sysdeps/powerpc/powerpc64/fpu/multiarch/math_config.h
> rename to sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_dbl.h
> index bd37906ecb..7c94f5b9b2 100644
> --- a/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config.h
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_dbl.h
> @@ -24,6 +24,7 @@
>  #include <nan-high-order-bit.h>
>  #include <stdint.h>
>  #include <altivec.h>
> +#include <math-narrow-eval.h>
>
>  #ifndef WANT_ROUNDING
>  /* Correct special case results in non-nearest rounding modes.  */
> @@ -134,13 +135,13 @@ check_uflow (double x)
>  #define EXP2_POLY_ORDER 5
>  extern const struct exp_data
>  {
> -  double invln2N;
> -  double shift;
> -  double negln2hiN;
> -  double negln2loN;
> -  double poly[4]; /* Last four coefficients.  */
> -  double exp2_shift;
> -  double exp2_poly[EXP2_POLY_ORDER];
> +  vector double invln2N;
> +  vector double shift;
> +  vector double negln2hiN;
> +  vector double negln2loN;
> +  vector double poly[4]; /* Last four coefficients.  */
> +  vector double exp2_shift;
> +  vector double exp2_poly[EXP2_POLY_ORDER];
>    uint64_t tab[2*(1 << EXP_TABLE_BITS)];
>  } __exp_data attribute_hidden;
>
> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/test-double-vlen2-wrappers.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/test-double-vlen2-wrappers.c
> index c9e2dbfdd0..f368e47947 100644
> --- a/sysdeps/powerpc/powerpc64/fpu/multiarch/test-double-vlen2-wrappers.c
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/test-double-vlen2-wrappers.c
> @@ -24,4 +24,5 @@
>  VECTOR_WRAPPER (WRAPPER_NAME (cos), _ZGVbN2v_cos)
>  VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVbN2v_sin)
>  VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVbN2v_log)
> +VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVbN2v_exp)
>  VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincos), _ZGVbN2vvv_sincos)
> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c
> new file mode 100644
> index 0000000000..376531e32b
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c
> @@ -0,0 +1,197 @@
> +/* Double-precision vector exp(x) function.
> +   Copyright (C) 2019 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +/* Based on sysdeps/ieee754/dbl-64/e_exp.c which came from
> +   Szabolcs Nagy at ARM Ltd. */
> +#include <altivec.h>
> +#include <math.h>
> +#include <math-barriers.h>
> +#include <math-narrow-eval.h>
> +
> +#include "math_config_dbl.h"
> +
> +typedef vector long long unsigned v64u;
> +typedef union {
> +  vector unsigned u;
> +  vector float f;
> +  v64u l;
> +  vector double d;
> +} u;
> +typedef union {
> +  double d;
> +  uint64_t l;
> +  unsigned u;
> +  float f;
> +} us;
> +
> +#define N (1 << EXP_TABLE_BITS)
> +#define InvLn2N __exp_data.invln2N
> +#define NegLn2hiN __exp_data.negln2hiN
> +#define NegLn2loN __exp_data.negln2loN
> +#define Shift __exp_data.shift
> +#define T __exp_data.tab
> +#define C2 __exp_data.poly[5 - EXP_POLY_ORDER]
> +#define C3 __exp_data.poly[6 - EXP_POLY_ORDER]
> +#define C4 __exp_data.poly[7 - EXP_POLY_ORDER]
> +#define C5 __exp_data.poly[8 - EXP_POLY_ORDER]
> +
> +/* Handle cases that may overflow or underflow when computing the result that
> +   is scale*(1+TMP) without intermediate rounding.  The bit representation of
> +   scale is in SBITS, however it has a computed exponent that may have
> +   overflown into the sign bit so that needs to be adjusted before using it as
> +   a double.  (int32_t)KI is the k used in the argument reduction and exponent
> +   adjustment of scale, positive k here means the result may overflow and
> +   negative k means the result may underflow.  */
> +static inline double
> +specialcase (double_t tmp, uint64_t sbits, uint64_t ki)
> +{
> +  double_t scale, y;
> +
> +  if ((ki & 0x80000000) == 0)
> +    {
> +      /* k > 0, the exponent of scale might have overflowed by <= 460.  */
> +      sbits -= 1009ull << 52;
> +      scale = asdouble (sbits);
> +      y = 0x1p1009 * (scale + scale * tmp);
> +      return y;
> +    }
> +  /* k < 0, need special care in the subnormal range.  */
> +  sbits += 1022ull << 52;
> +  scale = asdouble (sbits);
> +  y = scale + scale * tmp;
> +  if (y < 1.0)
> +    {
> +      /* Round y to the right precision before scaling it into the subnormal
> +        range to avoid double rounding that can cause 0.5+E/2 ulp error where
> +        E is the worst-case ulp error outside the subnormal range.  So this
> +        is only useful if the goal is better than 1 ulp worst-case error.  */
> +      double_t hi, lo;
> +      lo = scale - y + scale * tmp;
> +      hi = 1.0 + y;
> +      lo = 1.0 - hi + y + lo;
> +      y = math_narrow_eval (hi + lo) - 1.0;
> +      /* Avoid -0.0 with downward rounding.  */
> +      if (WANT_ROUNDING && y == 0.0)
> +       y = 0.0;
> +      /* The underflow exception needs to be signaled explicitly.  */
> +      math_force_eval (math_opt_barrier (0x1p-1022) * 0x1p-1022);
> +    }
> +  y = 0x1p-1022 * y;
> +  return y;
> +}
> +
> +/* Top 12 bits of a double (sign and exponent bits).  */
> +static inline uint32_t top12(double x)
> +{
> +       return asuint64(x) >> 52;
> +}
> +
> +vector double
> +_ZGVbN2v_exp (vector double x)
> +{
> +       v64u abstop, is_special_case, is_special_case2, idx, top, sbits;
> +       vector double kd, z, r, r2, scale, tail, tmp;
> +       u t, t2;
> +       u res;
> +       u xu;
> +       xu.d = x;
> +       v64u zero = {0, 0};
> +       u c512;
> +       vector double load = {512.0, 512.0};
> +       c512.d = load;
> +       u normalize;
> +       vector double load2 = {0x1p-54, 0x1p-54};
> +       normalize.d = load2;
> +       c512.l &= 0x7ff0000000000000;
> +       normalize.l &= 0x7ff0000000000000;
> +       abstop = xu.l & 0x7ff0000000000000;
> +       is_special_case = (v64u) vec_cmpge (xu.l - normalize.l, c512.l - normalize.l);
> +       if (__glibc_unlikely (!vec_all_eq (is_special_case, zero)))
> +       {
> +               for (int m=0;m<2;++m)
> +               {
> +                       if (!is_special_case[m])
> +                               continue;
> +                       double v;
> +                       uint32_t abstops = abstop[m] >> 52;
> +                       if (!is_special_case[m])
> +                               continue;
> +                       v = x[m];
> +                       if (abstops - top12(0x1p-54) >= 0x80000000)
> +                               /* Avoid spurious underflow for tiny x.  */
> +                               /* Note: 0 is common input.  */
> +                               res.d[m] = WANT_ROUNDING ? 1.0 + v : 1.0;
> +                       if (abstops >= top12(1024.0)) {
> +                               if (asuint64(v) == asuint64(-INFINITY))
> +                                       res.d[m] = 0.0;
> +                               else if (abstops >= top12(INFINITY))
> +                                       res.d[m] = 1.0 + v;
> +                               else if (asuint64(v) >> 63)
> +                                       res.d[m] = __math_uflow(0);
> +                               else
> +                                       res.d[m] = __math_oflow(0);
> +                               /* prevent is_special_case2 from matching */
> +                               abstop[m] = 1;
> +                       } else {
> +                               /* Large x is special cased below.  */
> +                               abstop[m] = 0;
> +                               is_special_case[m] = 0;
> +                       }
> +               }
> +       }
> +       /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)].  */
> +       /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N].  */
> +       z = InvLn2N * x;
> +       u kdu;
> +       /* z - kd is in [-1, 1] in non-nearest rounding modes.  */
> +       kd = z + Shift;
> +       kdu.d = kd;
> +       kd -= Shift;
> +       r = x + kd * NegLn2hiN + kd * NegLn2loN;
> +       /* 2^(k/N) ~= scale * (1 + tail).  */
> +       idx = 2 * (kdu.l % N);
> +       top = kdu.l << (52 - EXP_TABLE_BITS);
> +       v64u load3 = {T[idx[0]], T[idx[1]]};
> +       t.l = load3;
> +       v64u load4 = {T[idx[0] + 1], T[idx[1] + 1]};
> +       t2.l = load4;
> +       tail = t.d;
> +       /* This is only a valid scale when -1023*N < k < 1024*N.  */
> +       sbits = t2.l + top;
> +       /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (tail + exp(r) - 1).  */
> +       /* Evaluation is optimized assuming superscalar pipelined execution.  */
> +       r2 = r * r;
> +       /* Without fma the worst case error is 0.25/N ulp larger.  */
> +       /* Worst case error is less than 0.5+1.11/N+(abs poly error * 2^53) ulp.  */
> +       tmp = tail + r + r2 * (C2 + r * C3) + r2 * r2 * (C4 + r * C5);
> +       is_special_case2 = (v64u) vec_cmpeq (abstop, zero);
> +       if (__glibc_unlikely (!vec_all_eq (is_special_case2, zero)))
> +       {
> +               u sc2;
> +               for (int m=0;m<2;++m)
> +               {
> +                       sc2.d[m] = specialcase(tmp[m], sbits[m], kdu.l[m]);
> +               }
> +               res.l = vec_sel (res.l, sc2.l, is_special_case2);
> +       }
> +       u sbitsu;
> +       sbitsu.l = sbits;
> +       scale = sbitsu.d;
> +       u res2;
> +       res2.d = scale + scale * tmp;
> +       /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
> +          is no spurious underflow here even without fma.  */
> +       res.l = vec_sel (res.l, res2.l, ~is_special_case & ~is_special_case2);
> +       return res.d;
> +}
> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c
> new file mode 100644
> index 0000000000..208b897d04
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c
> @@ -0,0 +1,202 @@
> +/* Shared data between vector exp, exp2 and pow
> +   Copyright (C) 2019 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +/* Based on sysdeps/ieee754/dbl-64/e_exp.c which came from
> +   Szabolcs Nagy at ARM Ltd. */
> +
> +#include "math_config_dbl.h"
> +
> +#define N (1 << EXP_TABLE_BITS)
> +
> +const struct exp_data __exp_data = {
> +// N/ln2
> +.invln2N = {
> +  0x1.71547652b82fep0 * N,
> +  0x1.71547652b82fep0 * N,
> +},
> +// -ln2/N
> +.negln2hiN = {
> +  -0x1.62e42fefa0000p-8,
> +  -0x1.62e42fefa0000p-8,
> +},
> +.negln2loN = {
> +  -0x1.cf79abc9e3b3ap-47,
> +  -0x1.cf79abc9e3b3ap-47,
> +},
> +.shift = {
> +  0x1.8p52,
> +  0x1.8p52,
> +},
> +// exp polynomial coefficients.
> +.poly = {
> +// abs error: 1.555*2^-66
> +// ulp error: 0.509 (0.511 without fma)
> +// if |x| < ln2/256+eps
> +// abs error if |x| < ln2/256+0x1p-15: 1.09*2^-65
> +// abs error if |x| < ln2/128: 1.7145*2^-56
> +{0x1.ffffffffffdbdp-2, 0x1.ffffffffffdbdp-2},
> +{0x1.555555555543cp-3, 0x1.555555555543cp-3},
> +{0x1.55555cf172b91p-5, 0x1.55555cf172b91p-5},
> +{0x1.1111167a4d017p-7, 0x1.1111167a4d017p-7},
> +},
> +.exp2_shift = {
> +  0x1.8p52 / N,
> +  0x1.8p52 / N,
> +},
> +// exp2 polynomial coefficients.
> +.exp2_poly = {
> +// abs error: 1.2195*2^-65
> +// ulp error: 0.507 (0.511 without fma)
> +// if |x| < 1/256
> +// abs error if |x| < 1/128: 1.9941*2^-56
> +{0x1.62e42fefa39efp-1, 0x1.62e42fefa39efp-1},
> +{0x1.ebfbdff82c424p-3, 0x1.ebfbdff82c424p-3},
> +{0x1.c6b08d70cf4b5p-5, 0x1.c6b08d70cf4b5p-5},
> +{0x1.3b2abd24650ccp-7, 0x1.3b2abd24650ccp-7},
> +{0x1.5d7e09b4e3a84p-10, 0x1.5d7e09b4e3a84p-10},
> +},
> +// 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N)
> +// tab[2*k] = asuint64(T[k])
> +// tab[2*k+1] = asuint64(H[k]) - (k << 52)/N
> +.tab = {
> +0x0, 0x3ff0000000000000,
> +0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335,
> +0xbc7160139cd8dc5d, 0x3fefec9a3e778061,
> +0xbc905e7a108766d1, 0x3fefe315e86e7f85,
> +0x3c8cd2523567f613, 0x3fefd9b0d3158574,
> +0xbc8bce8023f98efa, 0x3fefd06b29ddf6de,
> +0x3c60f74e61e6c861, 0x3fefc74518759bc8,
> +0x3c90a3e45b33d399, 0x3fefbe3ecac6f383,
> +0x3c979aa65d837b6d, 0x3fefb5586cf9890f,
> +0x3c8eb51a92fdeffc, 0x3fefac922b7247f7,
> +0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2,
> +0xbc6a033489906e0b, 0x3fef9b66affed31b,
> +0xbc9556522a2fbd0e, 0x3fef9301d0125b51,
> +0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc,
> +0xbc91c923b9d5f416, 0x3fef829aaea92de0,
> +0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51,
> +0xbc801b15eaa59348, 0x3fef72b83c7d517b,
> +0xbc8f1ff055de323d, 0x3fef6af9388c8dea,
> +0x3c8b898c3f1353bf, 0x3fef635beb6fcb75,
> +0xbc96d99c7611eb26, 0x3fef5be084045cd4,
> +0x3c9aecf73e3a2f60, 0x3fef54873168b9aa,
> +0xbc8fe782cb86389d, 0x3fef4d5022fcd91d,
> +0x3c8a6f4144a6c38d, 0x3fef463b88628cd6,
> +0x3c807a05b0e4047d, 0x3fef3f49917ddc96,
> +0x3c968efde3a8a894, 0x3fef387a6e756238,
> +0x3c875e18f274487d, 0x3fef31ce4fb2a63f,
> +0x3c80472b981fe7f2, 0x3fef2b4565e27cdd,
> +0xbc96b87b3f71085e, 0x3fef24dfe1f56381,
> +0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1,
> +0xbc3d219b1a6fbffa, 0x3fef187fd0dad990,
> +0x3c8b3782720c0ab4, 0x3fef1285a6e4030b,
> +0x3c6e149289cecb8f, 0x3fef0cafa93e2f56,
> +0x3c834d754db0abb6, 0x3fef06fe0a31b715,
> +0x3c864201e2ac744c, 0x3fef0170fc4cd831,
> +0x3c8fdd395dd3f84a, 0x3feefc08b26416ff,
> +0xbc86a3803b8e5b04, 0x3feef6c55f929ff1,
> +0xbc924aedcc4b5068, 0x3feef1a7373aa9cb,
> +0xbc9907f81b512d8e, 0x3feeecae6d05d866,
> +0xbc71d1e83e9436d2, 0x3feee7db34e59ff7,
> +0xbc991919b3ce1b15, 0x3feee32dc313a8e5,
> +0x3c859f48a72a4c6d, 0x3feedea64c123422,
> +0xbc9312607a28698a, 0x3feeda4504ac801c,
> +0xbc58a78f4817895b, 0x3feed60a21f72e2a,
> +0xbc7c2c9b67499a1b, 0x3feed1f5d950a897,
> +0x3c4363ed60c2ac11, 0x3feece086061892d,
> +0x3c9666093b0664ef, 0x3feeca41ed1d0057,
> +0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0,
> +0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de,
> +0x3c7690cebb7aafb0, 0x3feebfdad5362a27,
> +0x3c931dbdeb54e077, 0x3feebcb299fddd0d,
> +0xbc8f94340071a38e, 0x3feeb9b2769d2ca7,
> +0xbc87deccdc93a349, 0x3feeb6daa2cf6642,
> +0xbc78dec6bd0f385f, 0x3feeb42b569d4f82,
> +0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f,
> +0x3c93350518fdd78e, 0x3feeaf4736b527da,
> +0x3c7b98b72f8a9b05, 0x3feead12d497c7fd,
> +0x3c9063e1e21c5409, 0x3feeab07dd485429,
> +0x3c34c7855019c6ea, 0x3feea9268a5946b7,
> +0x3c9432e62b64c035, 0x3feea76f15ad2148,
> +0xbc8ce44a6199769f, 0x3feea5e1b976dc09,
> +0xbc8c33c53bef4da8, 0x3feea47eb03a5585,
> +0xbc845378892be9ae, 0x3feea34634ccc320,
> +0xbc93cedd78565858, 0x3feea23882552225,
> +0x3c5710aa807e1964, 0x3feea155d44ca973,
> +0xbc93b3efbf5e2228, 0x3feea09e667f3bcd,
> +0xbc6a12ad8734b982, 0x3feea012750bdabf,
> +0xbc6367efb86da9ee, 0x3fee9fb23c651a2f,
> +0xbc80dc3d54e08851, 0x3fee9f7df9519484,
> +0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74,
> +0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174,
> +0xbc8619321e55e68a, 0x3fee9feb564267c9,
> +0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f,
> +0xbc7b32dcb94da51d, 0x3feea11473eb0187,
> +0x3c94ecfd5467c06b, 0x3feea1ed0130c132,
> +0x3c65ebe1abd66c55, 0x3feea2f336cf4e62,
> +0xbc88a1c52fb3cf42, 0x3feea427543e1a12,
> +0xbc9369b6f13b3734, 0x3feea589994cce13,
> +0xbc805e843a19ff1e, 0x3feea71a4623c7ad,
> +0xbc94d450d872576e, 0x3feea8d99b4492ed,
> +0x3c90ad675b0e8a00, 0x3feeaac7d98a6699,
> +0x3c8db72fc1f0eab4, 0x3feeace5422aa0db,
> +0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c,
> +0x3c7bf68359f35f44, 0x3feeb1ae99157736,
> +0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6,
> +0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5,
> +0xbc6c23f97c90b959, 0x3feeba44cbc8520f,
> +0xbc92434322f4f9aa, 0x3feebd829fde4e50,
> +0xbc85ca6cd7668e4b, 0x3feec0f170ca07ba,
> +0x3c71affc2b91ce27, 0x3feec49182a3f090,
> +0x3c6dd235e10a73bb, 0x3feec86319e32323,
> +0xbc87c50422622263, 0x3feecc667b5de565,
> +0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33,
> +0xbc91bbd1d3bcbb15, 0x3feed503b23e255d,
> +0x3c90cc319cee31d2, 0x3feed99e1330b358,
> +0x3c8469846e735ab3, 0x3feede6b5579fdbf,
> +0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a,
> +0x3c8c1a7792cb3387, 0x3feee89f995ad3ad,
> +0xbc907b8f4ad1d9fa, 0x3feeee07298db666,
> +0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb,
> +0xbc90a40e3da6f640, 0x3feef9728de5593a,
> +0xbc68d6f438ad9334, 0x3feeff76f2fb5e47,
> +0xbc91eee26b588a35, 0x3fef05b030a1064a,
> +0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2,
> +0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09,
> +0x3c736eae30af0cb3, 0x3fef199bdd85529c,
> +0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a,
> +0x3c84e08fd10959ac, 0x3fef27f12e57d14b,
> +0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5,
> +0x3c676b2c6c921968, 0x3fef3720dcef9069,
> +0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa,
> +0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c,
> +0xbc900dae3875a949, 0x3fef4f87080d89f2,
> +0x3c74a385a63d07a7, 0x3fef5818dcfba487,
> +0xbc82919e2040220f, 0x3fef60e316c98398,
> +0x3c8e5a50d5c192ac, 0x3fef69e603db3285,
> +0x3c843a59ac016b4b, 0x3fef7321f301b460,
> +0xbc82d52107b43e1f, 0x3fef7c97337b9b5f,
> +0xbc892ab93b470dc9, 0x3fef864614f5a129,
> +0x3c74b604603a88d3, 0x3fef902ee78b3ff6,
> +0x3c83c5ec519d7271, 0x3fef9a51fbc74c83,
> +0xbc8ff7128fd391f0, 0x3fefa4afa2a490da,
> +0xbc8dae98e223747d, 0x3fefaf482d8e67f1,
> +0x3c8ec3bc41aa2008, 0x3fefba1bee615a27,
> +0x3c842b94c3a9eb32, 0x3fefc52b376bba97,
> +0x3c8a64a931d185ee, 0x3fefd0765b6e4540,
> +0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14,
> +0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8,
> +0x3c5305c14160cc89, 0x3feff3c22b8f71f1,
> +},
> +};
> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log2_vsx.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log2_vsx.c
> index d7588a7929..3a9fb05aee 100644
> --- a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log2_vsx.c
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log2_vsx.c
> @@ -21,7 +21,7 @@
>  #include <math/math-svid-compat.h>
>  #include <shlib-compat.h>
>  #include <libm-alias-double.h>
> -#include "math_config.h"
> +#include "math_config_dbl.h"
>  #include <altivec.h>
>
>  #define T __log_data.tab
> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log_data.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log_data.c
> index 2fe569576c..f1299404c4 100644
> --- a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log_data.c
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log_data.c
> @@ -16,7 +16,7 @@
>     License along with the GNU C Library; if not, see
>     <http://www.gnu.org/licenses/>.  */
>
> -#include "math_config.h"
> +#include "math_config_dbl.h"
>
>  #define N (1 << LOG_TABLE_BITS)
>
> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_err.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_err.c
> new file mode 100644
> index 0000000000..e60b0d232f
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_err.c
> @@ -0,0 +1,40 @@
> +/* Double-precision math error handling.
> +   Copyright (C) 2017-2019 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <fpu/math-barriers.h>
> +#include "math_config_dbl.h"
> +
> +/* NOINLINE reduces code size.  */
> +NOINLINE static double
> +xflow (uint32_t sign, double y)
> +{
> +  y = math_opt_barrier (sign ? -y : y) * y;
> +  return y;
> +}
> +
> +attribute_hidden double
> +__math_uflow (uint32_t sign)
> +{
> +  return xflow (sign, 0x1p-767);
> +}
> +
> +attribute_hidden double
> +__math_oflow (uint32_t sign)
> +{
> +  return xflow (sign, 0x1p769);
> +}
> diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
> index bfbd340626..46558dbd77 100644
> --- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
> +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
> @@ -1,4 +1,5 @@
>  GLIBC_2.30 _ZGVbN2v_cos F
> +GLIBC_2.30 _ZGVbN2v_exp F
>  GLIBC_2.30 _ZGVbN2v_log F
>  GLIBC_2.30 _ZGVbN2v_sin F
>  GLIBC_2.30 _ZGVbN2vvv_sincos F
> --
> 2.21.0.1020.gf2820cf01a
>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [v3 PATCH] PPC64: Add libmvec SIMD double-precision natural exponent function.
  2019-05-29 19:12         ` Shawn Landden
@ 2019-05-29 19:19           ` Florian Weimer
  0 siblings, 0 replies; 15+ messages in thread
From: Florian Weimer @ 2019-05-29 19:19 UTC (permalink / raw)
  To: Shawn Landden; +Cc: libc-alpha, Tulio Magno Quites Machado Filho

* Shawn Landden:

> On Wed, May 29, 2019 at 2:11 PM Shawn Landden <shawn@git.icu> wrote:
>>
>> Passes all tests.

> I change this but forgot to regenerate the patch. I am getting a bunch
> of seemingly unrelated test failures.

Did you rebuild everything from scratch?  Incremental builds often
result in bizarre failures. 8-(

Thanks,
Florian

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [v3 PATCH] PPC64: Add libmvec SIMD double-precision natural exponent function.
  2019-05-29 19:11       ` [v3 PATCH] " Shawn Landden
  2019-05-29 19:12         ` Shawn Landden
@ 2019-06-04  2:43         ` Shawn Landden
  2019-06-04 20:00         ` Tulio Magno Quites Machado Filho
  2 siblings, 0 replies; 15+ messages in thread
From: Shawn Landden @ 2019-06-04  2:43 UTC (permalink / raw)
  Cc: libc-alpha, Tulio Magno Quites Machado Filho

Ping

On Wed, May 29, 2019 at 2:11 PM Shawn Landden <shawn@git.icu> wrote:
>
> Passes all tests.
>
> Unlike other libmvec functions, this sets the underflow and overflow bits.
> The caller can check these flags, and possibly re-run the calculations with
> scalar expf to figure out what is causing the overflow or underflow.
>
> The special-case path is not vectorized, and performs much worse than
> the scalar code.
> Normalized data: 1 to 2^32 converted to double
> Running 20 times over 32MiB
> vector: mean 563.807107 MiB/s (sd 0.390922)
> scalar: mean 226.527824 MiB/s (sd 0.077406)
>
> Random data:
> vector: mean 80.175986 MiB/s (sd 1.110948)
> scalar: mean 244.738130 MiB/s (sd 0.029561)
>
> v2: Fix some serious problems with the algorithm. Not
>     sure how I missed those problems.
> v3: Style problems.  Fix symbol order.
>
> 2019-05-11  Shawn Landden  <shawn@git.icu>
>
>         [BZ #24209]
>         * NEWS: Noted the addition of PPC64 vector exp function.
>         * sysdeps/powerpc/bits/math-vector.h: Added entry for vector exp.
>         * sysdeps/powerpc/powerpc64/fpu/Versions: Added vector exp entry.
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile:
>         (libmvec-sysdep_routines, CFLAGS-vec_d_exp2_vsx.c):
>         (CFLAGS-vec_d_exp_data.c): Added build of VSX SIMD expf
>         function and tests. Added vec_math_err.c to build.
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/(math_config.h)->(math_config_dbl.h)
>         Renamed, and modified for exp.
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/test-double-vlen2-wrappers.c:
>         Added entry for vector exp.
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c: New file.
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c: Likewise.
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_err.c: Likewise.
>         * sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist: SIMD expf
>         added.
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log_data.c: Rename of math_config.h
>         * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log2_vsx.c: Likewise
> ---
>  NEWS                                          |   1 +
>  sysdeps/powerpc/bits/math-vector.h            |   2 +
>  sysdeps/powerpc/powerpc64/fpu/Versions        |   2 +-
>  .../powerpc/powerpc64/fpu/multiarch/Makefile  |   8 +-
>  .../{math_config.h => math_config_dbl.h}      |  15 +-
>  .../multiarch/test-double-vlen2-wrappers.c    |   1 +
>  .../powerpc64/fpu/multiarch/vec_d_exp2_vsx.c  | 197 +++++++++++++++++
>  .../powerpc64/fpu/multiarch/vec_d_exp_data.c  | 202 ++++++++++++++++++
>  .../powerpc64/fpu/multiarch/vec_d_log2_vsx.c  |   2 +-
>  .../powerpc64/fpu/multiarch/vec_d_log_data.c  |   2 +-
>  .../powerpc64/fpu/multiarch/vec_math_err.c    |  40 ++++
>  .../linux/powerpc/powerpc64/libmvec.abilist   |   1 +
>  12 files changed, 461 insertions(+), 12 deletions(-)
>  rename sysdeps/powerpc/powerpc64/fpu/multiarch/{math_config.h => math_config_dbl.h} (95%)
>  create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c
>  create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c
>  create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_err.c
>
> diff --git a/NEWS b/NEWS
> index 26f5c67f75..a33af8c316 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -20,6 +20,7 @@ Major new features:
>    - single-precision sine: sinf
>    - double-precision sincos: sincos
>    - single-precision sincos: sincosf
> +  - double-precision exp: exp
>    - single-precision exp: expf
>
>    GCC support for auto-vectorization of functions on PPC64 is not yet
> diff --git a/sysdeps/powerpc/bits/math-vector.h b/sysdeps/powerpc/bits/math-vector.h
> index bbc9db0dd9..e7846b4bfa 100644
> --- a/sysdeps/powerpc/bits/math-vector.h
> +++ b/sysdeps/powerpc/bits/math-vector.h
> @@ -46,6 +46,8 @@
>  #  define __DECL_SIMD_sincosf __DECL_SIMD_PPC64
>  #  undef __DECL_SIMD_expf
>  #  define __DECL_SIMD_expf __DECL_SIMD_PPC64
> +#  undef __DECL_SIMD_exp
> +#  define __DECL_SIMD_exp __DECL_SIMD_PPC64
>
>  # endif
>  #endif
> diff --git a/sysdeps/powerpc/powerpc64/fpu/Versions b/sysdeps/powerpc/powerpc64/fpu/Versions
> index 7c45e76074..361edee9e7 100644
> --- a/sysdeps/powerpc/powerpc64/fpu/Versions
> +++ b/sysdeps/powerpc/powerpc64/fpu/Versions
> @@ -2,6 +2,6 @@ libmvec {
>    GLIBC_2.30 {
>      _ZGVbN2v_cos; _ZGVbN4v_cosf; _ZGVbN2v_sin; _ZGVbN4v_sinf;
>      _ZGVbN2v_log; _ZGVbN4v_logf; _ZGVbN2vvv_sincos; _ZGVbN4vvv_sincosf;
> -    _ZGVbN4v_expf;
> +    _ZGVbN4v_expf; _ZGVbN2v_exp;
>    }
>  }
> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
> index d15183d5ee..dd982bb068 100644
> --- a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
> @@ -49,7 +49,8 @@ libmvec-sysdep_routines += vec_d_cos2_vsx vec_s_cosf4_vsx \
>                            vec_d_log2_vsx vec_d_log_data \
>                            vec_s_logf4_vsx vec_s_logf_data \
>                            vec_s_expf4_vsx vec_s_exp2f_data \
> -                          vec_math_errf \
> +                          vec_math_errf vec_math_err \
> +                          vec_d_exp2_vsx vec_d_exp_data \
>                            vec_d_sincos2_vsx vec_s_sincosf4_vsx
>  CFLAGS-vec_d_cos2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
>  CFLAGS-vec_d_log2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
> @@ -64,6 +65,9 @@ CFLAGS-vec_d_sincos2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
>  CFLAGS-vec_s_sincosf4_vsx.c += -mabi=altivec -maltivec -mvsx
>  CFLAGS-vec_s_expf4_vsx.c += -mabi=altivec -maltivec -mvsx
>  CFLAGS-vec_s_exp2f_data.c += -mabi=altivec -maltivec -mvsx
> +CFLAGS-vec_d_exp2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector
> +CFLAGS-vec_d_exp_data.c += -mabi=altivec -maltivec -mvsx
> +CFLAGS-vec_math_err.c += -mabi=altivec -maltivec -mvsx
>  endif
>
>  # Variables for libmvec tests.
> @@ -71,7 +75,7 @@ ifeq ($(subdir),math)
>  ifeq ($(build-mathvec),yes)
>  libmvec-tests += double-vlen2 float-vlen4
>
> -double-vlen2-funcs = cos sin sincos log
> +double-vlen2-funcs = cos sin sincos log exp
>  float-vlen4-funcs = cos sin sincos log exp
>
>  double-vlen2-arch-ext-cflags = -mabi=altivec -maltivec -mvsx -DREQUIRE_VSX
> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config.h b/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_dbl.h
> similarity index 95%
> rename from sysdeps/powerpc/powerpc64/fpu/multiarch/math_config.h
> rename to sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_dbl.h
> index bd37906ecb..7c94f5b9b2 100644
> --- a/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config.h
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/math_config_dbl.h
> @@ -24,6 +24,7 @@
>  #include <nan-high-order-bit.h>
>  #include <stdint.h>
>  #include <altivec.h>
> +#include <math-narrow-eval.h>
>
>  #ifndef WANT_ROUNDING
>  /* Correct special case results in non-nearest rounding modes.  */
> @@ -134,13 +135,13 @@ check_uflow (double x)
>  #define EXP2_POLY_ORDER 5
>  extern const struct exp_data
>  {
> -  double invln2N;
> -  double shift;
> -  double negln2hiN;
> -  double negln2loN;
> -  double poly[4]; /* Last four coefficients.  */
> -  double exp2_shift;
> -  double exp2_poly[EXP2_POLY_ORDER];
> +  vector double invln2N;
> +  vector double shift;
> +  vector double negln2hiN;
> +  vector double negln2loN;
> +  vector double poly[4]; /* Last four coefficients.  */
> +  vector double exp2_shift;
> +  vector double exp2_poly[EXP2_POLY_ORDER];
>    uint64_t tab[2*(1 << EXP_TABLE_BITS)];
>  } __exp_data attribute_hidden;
>
> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/test-double-vlen2-wrappers.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/test-double-vlen2-wrappers.c
> index c9e2dbfdd0..f368e47947 100644
> --- a/sysdeps/powerpc/powerpc64/fpu/multiarch/test-double-vlen2-wrappers.c
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/test-double-vlen2-wrappers.c
> @@ -24,4 +24,5 @@
>  VECTOR_WRAPPER (WRAPPER_NAME (cos), _ZGVbN2v_cos)
>  VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVbN2v_sin)
>  VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVbN2v_log)
> +VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVbN2v_exp)
>  VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincos), _ZGVbN2vvv_sincos)
> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c
> new file mode 100644
> index 0000000000..376531e32b
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c
> @@ -0,0 +1,197 @@
> +/* Double-precision vector exp(x) function.
> +   Copyright (C) 2019 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +/* Based on sysdeps/ieee754/dbl-64/e_exp.c which came from
> +   Szabolcs Nagy at ARM Ltd. */
> +#include <altivec.h>
> +#include <math.h>
> +#include <math-barriers.h>
> +#include <math-narrow-eval.h>
> +
> +#include "math_config_dbl.h"
> +
> +typedef vector long long unsigned v64u;
> +typedef union {
> +  vector unsigned u;
> +  vector float f;
> +  v64u l;
> +  vector double d;
> +} u;
> +typedef union {
> +  double d;
> +  uint64_t l;
> +  unsigned u;
> +  float f;
> +} us;
> +
> +#define N (1 << EXP_TABLE_BITS)
> +#define InvLn2N __exp_data.invln2N
> +#define NegLn2hiN __exp_data.negln2hiN
> +#define NegLn2loN __exp_data.negln2loN
> +#define Shift __exp_data.shift
> +#define T __exp_data.tab
> +#define C2 __exp_data.poly[5 - EXP_POLY_ORDER]
> +#define C3 __exp_data.poly[6 - EXP_POLY_ORDER]
> +#define C4 __exp_data.poly[7 - EXP_POLY_ORDER]
> +#define C5 __exp_data.poly[8 - EXP_POLY_ORDER]
> +
> +/* Handle cases that may overflow or underflow when computing the result that
> +   is scale*(1+TMP) without intermediate rounding.  The bit representation of
> +   scale is in SBITS, however it has a computed exponent that may have
> +   overflown into the sign bit so that needs to be adjusted before using it as
> +   a double.  (int32_t)KI is the k used in the argument reduction and exponent
> +   adjustment of scale, positive k here means the result may overflow and
> +   negative k means the result may underflow.  */
> +static inline double
> +specialcase (double_t tmp, uint64_t sbits, uint64_t ki)
> +{
> +  double_t scale, y;
> +
> +  if ((ki & 0x80000000) == 0)
> +    {
> +      /* k > 0, the exponent of scale might have overflowed by <= 460.  */
> +      sbits -= 1009ull << 52;
> +      scale = asdouble (sbits);
> +      y = 0x1p1009 * (scale + scale * tmp);
> +      return y;
> +    }
> +  /* k < 0, need special care in the subnormal range.  */
> +  sbits += 1022ull << 52;
> +  scale = asdouble (sbits);
> +  y = scale + scale * tmp;
> +  if (y < 1.0)
> +    {
> +      /* Round y to the right precision before scaling it into the subnormal
> +        range to avoid double rounding that can cause 0.5+E/2 ulp error where
> +        E is the worst-case ulp error outside the subnormal range.  So this
> +        is only useful if the goal is better than 1 ulp worst-case error.  */
> +      double_t hi, lo;
> +      lo = scale - y + scale * tmp;
> +      hi = 1.0 + y;
> +      lo = 1.0 - hi + y + lo;
> +      y = math_narrow_eval (hi + lo) - 1.0;
> +      /* Avoid -0.0 with downward rounding.  */
> +      if (WANT_ROUNDING && y == 0.0)
> +       y = 0.0;
> +      /* The underflow exception needs to be signaled explicitly.  */
> +      math_force_eval (math_opt_barrier (0x1p-1022) * 0x1p-1022);
> +    }
> +  y = 0x1p-1022 * y;
> +  return y;
> +}
> +
> +/* Top 12 bits of a double (sign and exponent bits).  */
> +static inline uint32_t top12(double x)
> +{
> +       return asuint64(x) >> 52;
> +}
> +
> +vector double
> +_ZGVbN2v_exp (vector double x)
> +{
> +       v64u abstop, is_special_case, is_special_case2, idx, top, sbits;
> +       vector double kd, z, r, r2, scale, tail, tmp;
> +       u t, t2;
> +       u res;
> +       u xu;
> +       xu.d = x;
> +       v64u zero = {0, 0};
> +       u c512;
> +       vector double load = {512.0, 512.0};
> +       c512.d = load;
> +       u normalize;
> +       vector double load2 = {0x1p-54, 0x1p-54};
> +       normalize.d = load2;
> +       c512.l &= 0x7ff0000000000000;
> +       normalize.l &= 0x7ff0000000000000;
> +       abstop = xu.l & 0x7ff0000000000000;
> +       is_special_case = (v64u) vec_cmpge (xu.l - normalize.l, c512.l - normalize.l);
> +       if (__glibc_unlikely (!vec_all_eq (is_special_case, zero)))
> +       {
> +               for (int m=0;m<2;++m)
> +               {
> +                       if (!is_special_case[m])
> +                               continue;
> +                       double v;
> +                       uint32_t abstops = abstop[m] >> 52;
> +                       if (!is_special_case[m])
> +                               continue;
> +                       v = x[m];
> +                       if (abstops - top12(0x1p-54) >= 0x80000000)
> +                               /* Avoid spurious underflow for tiny x.  */
> +                               /* Note: 0 is common input.  */
> +                               res.d[m] = WANT_ROUNDING ? 1.0 + v : 1.0;
> +                       if (abstops >= top12(1024.0)) {
> +                               if (asuint64(v) == asuint64(-INFINITY))
> +                                       res.d[m] = 0.0;
> +                               else if (abstops >= top12(INFINITY))
> +                                       res.d[m] = 1.0 + v;
> +                               else if (asuint64(v) >> 63)
> +                                       res.d[m] = __math_uflow(0);
> +                               else
> +                                       res.d[m] = __math_oflow(0);
> +                               /* prevent is_special_case2 from matching */
> +                               abstop[m] = 1;
> +                       } else {
> +                               /* Large x is special cased below.  */
> +                               abstop[m] = 0;
> +                               is_special_case[m] = 0;
> +                       }
> +               }
> +       }
> +       /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)].  */
> +       /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N].  */
> +       z = InvLn2N * x;
> +       u kdu;
> +       /* z - kd is in [-1, 1] in non-nearest rounding modes.  */
> +       kd = z + Shift;
> +       kdu.d = kd;
> +       kd -= Shift;
> +       r = x + kd * NegLn2hiN + kd * NegLn2loN;
> +       /* 2^(k/N) ~= scale * (1 + tail).  */
> +       idx = 2 * (kdu.l % N);
> +       top = kdu.l << (52 - EXP_TABLE_BITS);
> +       v64u load3 = {T[idx[0]], T[idx[1]]};
> +       t.l = load3;
> +       v64u load4 = {T[idx[0] + 1], T[idx[1] + 1]};
> +       t2.l = load4;
> +       tail = t.d;
> +       /* This is only a valid scale when -1023*N < k < 1024*N.  */
> +       sbits = t2.l + top;
> +       /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (tail + exp(r) - 1).  */
> +       /* Evaluation is optimized assuming superscalar pipelined execution.  */
> +       r2 = r * r;
> +       /* Without fma the worst case error is 0.25/N ulp larger.  */
> +       /* Worst case error is less than 0.5+1.11/N+(abs poly error * 2^53) ulp.  */
> +       tmp = tail + r + r2 * (C2 + r * C3) + r2 * r2 * (C4 + r * C5);
> +       is_special_case2 = (v64u) vec_cmpeq (abstop, zero);
> +       if (__glibc_unlikely (!vec_all_eq (is_special_case2, zero)))
> +       {
> +               u sc2;
> +               for (int m=0;m<2;++m)
> +               {
> +                       sc2.d[m] = specialcase(tmp[m], sbits[m], kdu.l[m]);
> +               }
> +               res.l = vec_sel (res.l, sc2.l, is_special_case2);
> +       }
> +       u sbitsu;
> +       sbitsu.l = sbits;
> +       scale = sbitsu.d;
> +       u res2;
> +       res2.d = scale + scale * tmp;
> +       /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
> +          is no spurious underflow here even without fma.  */
> +       res.l = vec_sel (res.l, res2.l, ~is_special_case & ~is_special_case2);
> +       return res.d;
> +}
> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c
> new file mode 100644
> index 0000000000..208b897d04
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c
> @@ -0,0 +1,202 @@
> +/* Shared data between vector exp, exp2 and pow
> +   Copyright (C) 2019 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +/* Based on sysdeps/ieee754/dbl-64/e_exp.c which came from
> +   Szabolcs Nagy at ARM Ltd. */
> +
> +#include "math_config_dbl.h"
> +
> +#define N (1 << EXP_TABLE_BITS)
> +
> +const struct exp_data __exp_data = {
> +// N/ln2
> +.invln2N = {
> +  0x1.71547652b82fep0 * N,
> +  0x1.71547652b82fep0 * N,
> +},
> +// -ln2/N
> +.negln2hiN = {
> +  -0x1.62e42fefa0000p-8,
> +  -0x1.62e42fefa0000p-8,
> +},
> +.negln2loN = {
> +  -0x1.cf79abc9e3b3ap-47,
> +  -0x1.cf79abc9e3b3ap-47,
> +},
> +.shift = {
> +  0x1.8p52,
> +  0x1.8p52,
> +},
> +// exp polynomial coefficients.
> +.poly = {
> +// abs error: 1.555*2^-66
> +// ulp error: 0.509 (0.511 without fma)
> +// if |x| < ln2/256+eps
> +// abs error if |x| < ln2/256+0x1p-15: 1.09*2^-65
> +// abs error if |x| < ln2/128: 1.7145*2^-56
> +{0x1.ffffffffffdbdp-2, 0x1.ffffffffffdbdp-2},
> +{0x1.555555555543cp-3, 0x1.555555555543cp-3},
> +{0x1.55555cf172b91p-5, 0x1.55555cf172b91p-5},
> +{0x1.1111167a4d017p-7, 0x1.1111167a4d017p-7},
> +},
> +.exp2_shift = {
> +  0x1.8p52 / N,
> +  0x1.8p52 / N,
> +},
> +// exp2 polynomial coefficients.
> +.exp2_poly = {
> +// abs error: 1.2195*2^-65
> +// ulp error: 0.507 (0.511 without fma)
> +// if |x| < 1/256
> +// abs error if |x| < 1/128: 1.9941*2^-56
> +{0x1.62e42fefa39efp-1, 0x1.62e42fefa39efp-1},
> +{0x1.ebfbdff82c424p-3, 0x1.ebfbdff82c424p-3},
> +{0x1.c6b08d70cf4b5p-5, 0x1.c6b08d70cf4b5p-5},
> +{0x1.3b2abd24650ccp-7, 0x1.3b2abd24650ccp-7},
> +{0x1.5d7e09b4e3a84p-10, 0x1.5d7e09b4e3a84p-10},
> +},
> +// 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N)
> +// tab[2*k] = asuint64(T[k])
> +// tab[2*k+1] = asuint64(H[k]) - (k << 52)/N
> +.tab = {
> +0x0, 0x3ff0000000000000,
> +0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335,
> +0xbc7160139cd8dc5d, 0x3fefec9a3e778061,
> +0xbc905e7a108766d1, 0x3fefe315e86e7f85,
> +0x3c8cd2523567f613, 0x3fefd9b0d3158574,
> +0xbc8bce8023f98efa, 0x3fefd06b29ddf6de,
> +0x3c60f74e61e6c861, 0x3fefc74518759bc8,
> +0x3c90a3e45b33d399, 0x3fefbe3ecac6f383,
> +0x3c979aa65d837b6d, 0x3fefb5586cf9890f,
> +0x3c8eb51a92fdeffc, 0x3fefac922b7247f7,
> +0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2,
> +0xbc6a033489906e0b, 0x3fef9b66affed31b,
> +0xbc9556522a2fbd0e, 0x3fef9301d0125b51,
> +0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc,
> +0xbc91c923b9d5f416, 0x3fef829aaea92de0,
> +0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51,
> +0xbc801b15eaa59348, 0x3fef72b83c7d517b,
> +0xbc8f1ff055de323d, 0x3fef6af9388c8dea,
> +0x3c8b898c3f1353bf, 0x3fef635beb6fcb75,
> +0xbc96d99c7611eb26, 0x3fef5be084045cd4,
> +0x3c9aecf73e3a2f60, 0x3fef54873168b9aa,
> +0xbc8fe782cb86389d, 0x3fef4d5022fcd91d,
> +0x3c8a6f4144a6c38d, 0x3fef463b88628cd6,
> +0x3c807a05b0e4047d, 0x3fef3f49917ddc96,
> +0x3c968efde3a8a894, 0x3fef387a6e756238,
> +0x3c875e18f274487d, 0x3fef31ce4fb2a63f,
> +0x3c80472b981fe7f2, 0x3fef2b4565e27cdd,
> +0xbc96b87b3f71085e, 0x3fef24dfe1f56381,
> +0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1,
> +0xbc3d219b1a6fbffa, 0x3fef187fd0dad990,
> +0x3c8b3782720c0ab4, 0x3fef1285a6e4030b,
> +0x3c6e149289cecb8f, 0x3fef0cafa93e2f56,
> +0x3c834d754db0abb6, 0x3fef06fe0a31b715,
> +0x3c864201e2ac744c, 0x3fef0170fc4cd831,
> +0x3c8fdd395dd3f84a, 0x3feefc08b26416ff,
> +0xbc86a3803b8e5b04, 0x3feef6c55f929ff1,
> +0xbc924aedcc4b5068, 0x3feef1a7373aa9cb,
> +0xbc9907f81b512d8e, 0x3feeecae6d05d866,
> +0xbc71d1e83e9436d2, 0x3feee7db34e59ff7,
> +0xbc991919b3ce1b15, 0x3feee32dc313a8e5,
> +0x3c859f48a72a4c6d, 0x3feedea64c123422,
> +0xbc9312607a28698a, 0x3feeda4504ac801c,
> +0xbc58a78f4817895b, 0x3feed60a21f72e2a,
> +0xbc7c2c9b67499a1b, 0x3feed1f5d950a897,
> +0x3c4363ed60c2ac11, 0x3feece086061892d,
> +0x3c9666093b0664ef, 0x3feeca41ed1d0057,
> +0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0,
> +0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de,
> +0x3c7690cebb7aafb0, 0x3feebfdad5362a27,
> +0x3c931dbdeb54e077, 0x3feebcb299fddd0d,
> +0xbc8f94340071a38e, 0x3feeb9b2769d2ca7,
> +0xbc87deccdc93a349, 0x3feeb6daa2cf6642,
> +0xbc78dec6bd0f385f, 0x3feeb42b569d4f82,
> +0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f,
> +0x3c93350518fdd78e, 0x3feeaf4736b527da,
> +0x3c7b98b72f8a9b05, 0x3feead12d497c7fd,
> +0x3c9063e1e21c5409, 0x3feeab07dd485429,
> +0x3c34c7855019c6ea, 0x3feea9268a5946b7,
> +0x3c9432e62b64c035, 0x3feea76f15ad2148,
> +0xbc8ce44a6199769f, 0x3feea5e1b976dc09,
> +0xbc8c33c53bef4da8, 0x3feea47eb03a5585,
> +0xbc845378892be9ae, 0x3feea34634ccc320,
> +0xbc93cedd78565858, 0x3feea23882552225,
> +0x3c5710aa807e1964, 0x3feea155d44ca973,
> +0xbc93b3efbf5e2228, 0x3feea09e667f3bcd,
> +0xbc6a12ad8734b982, 0x3feea012750bdabf,
> +0xbc6367efb86da9ee, 0x3fee9fb23c651a2f,
> +0xbc80dc3d54e08851, 0x3fee9f7df9519484,
> +0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74,
> +0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174,
> +0xbc8619321e55e68a, 0x3fee9feb564267c9,
> +0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f,
> +0xbc7b32dcb94da51d, 0x3feea11473eb0187,
> +0x3c94ecfd5467c06b, 0x3feea1ed0130c132,
> +0x3c65ebe1abd66c55, 0x3feea2f336cf4e62,
> +0xbc88a1c52fb3cf42, 0x3feea427543e1a12,
> +0xbc9369b6f13b3734, 0x3feea589994cce13,
> +0xbc805e843a19ff1e, 0x3feea71a4623c7ad,
> +0xbc94d450d872576e, 0x3feea8d99b4492ed,
> +0x3c90ad675b0e8a00, 0x3feeaac7d98a6699,
> +0x3c8db72fc1f0eab4, 0x3feeace5422aa0db,
> +0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c,
> +0x3c7bf68359f35f44, 0x3feeb1ae99157736,
> +0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6,
> +0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5,
> +0xbc6c23f97c90b959, 0x3feeba44cbc8520f,
> +0xbc92434322f4f9aa, 0x3feebd829fde4e50,
> +0xbc85ca6cd7668e4b, 0x3feec0f170ca07ba,
> +0x3c71affc2b91ce27, 0x3feec49182a3f090,
> +0x3c6dd235e10a73bb, 0x3feec86319e32323,
> +0xbc87c50422622263, 0x3feecc667b5de565,
> +0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33,
> +0xbc91bbd1d3bcbb15, 0x3feed503b23e255d,
> +0x3c90cc319cee31d2, 0x3feed99e1330b358,
> +0x3c8469846e735ab3, 0x3feede6b5579fdbf,
> +0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a,
> +0x3c8c1a7792cb3387, 0x3feee89f995ad3ad,
> +0xbc907b8f4ad1d9fa, 0x3feeee07298db666,
> +0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb,
> +0xbc90a40e3da6f640, 0x3feef9728de5593a,
> +0xbc68d6f438ad9334, 0x3feeff76f2fb5e47,
> +0xbc91eee26b588a35, 0x3fef05b030a1064a,
> +0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2,
> +0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09,
> +0x3c736eae30af0cb3, 0x3fef199bdd85529c,
> +0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a,
> +0x3c84e08fd10959ac, 0x3fef27f12e57d14b,
> +0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5,
> +0x3c676b2c6c921968, 0x3fef3720dcef9069,
> +0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa,
> +0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c,
> +0xbc900dae3875a949, 0x3fef4f87080d89f2,
> +0x3c74a385a63d07a7, 0x3fef5818dcfba487,
> +0xbc82919e2040220f, 0x3fef60e316c98398,
> +0x3c8e5a50d5c192ac, 0x3fef69e603db3285,
> +0x3c843a59ac016b4b, 0x3fef7321f301b460,
> +0xbc82d52107b43e1f, 0x3fef7c97337b9b5f,
> +0xbc892ab93b470dc9, 0x3fef864614f5a129,
> +0x3c74b604603a88d3, 0x3fef902ee78b3ff6,
> +0x3c83c5ec519d7271, 0x3fef9a51fbc74c83,
> +0xbc8ff7128fd391f0, 0x3fefa4afa2a490da,
> +0xbc8dae98e223747d, 0x3fefaf482d8e67f1,
> +0x3c8ec3bc41aa2008, 0x3fefba1bee615a27,
> +0x3c842b94c3a9eb32, 0x3fefc52b376bba97,
> +0x3c8a64a931d185ee, 0x3fefd0765b6e4540,
> +0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14,
> +0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8,
> +0x3c5305c14160cc89, 0x3feff3c22b8f71f1,
> +},
> +};
> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log2_vsx.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log2_vsx.c
> index d7588a7929..3a9fb05aee 100644
> --- a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log2_vsx.c
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log2_vsx.c
> @@ -21,7 +21,7 @@
>  #include <math/math-svid-compat.h>
>  #include <shlib-compat.h>
>  #include <libm-alias-double.h>
> -#include "math_config.h"
> +#include "math_config_dbl.h"
>  #include <altivec.h>
>
>  #define T __log_data.tab
> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log_data.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log_data.c
> index 2fe569576c..f1299404c4 100644
> --- a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log_data.c
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log_data.c
> @@ -16,7 +16,7 @@
>     License along with the GNU C Library; if not, see
>     <http://www.gnu.org/licenses/>.  */
>
> -#include "math_config.h"
> +#include "math_config_dbl.h"
>
>  #define N (1 << LOG_TABLE_BITS)
>
> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_err.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_err.c
> new file mode 100644
> index 0000000000..e60b0d232f
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_err.c
> @@ -0,0 +1,40 @@
> +/* Double-precision math error handling.
> +   Copyright (C) 2017-2019 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <fpu/math-barriers.h>
> +#include "math_config_dbl.h"
> +
> +/* NOINLINE reduces code size.  */
> +NOINLINE static double
> +xflow (uint32_t sign, double y)
> +{
> +  y = math_opt_barrier (sign ? -y : y) * y;
> +  return y;
> +}
> +
> +attribute_hidden double
> +__math_uflow (uint32_t sign)
> +{
> +  return xflow (sign, 0x1p-767);
> +}
> +
> +attribute_hidden double
> +__math_oflow (uint32_t sign)
> +{
> +  return xflow (sign, 0x1p769);
> +}
> diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
> index bfbd340626..46558dbd77 100644
> --- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
> +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist
> @@ -1,4 +1,5 @@
>  GLIBC_2.30 _ZGVbN2v_cos F
> +GLIBC_2.30 _ZGVbN2v_exp F
>  GLIBC_2.30 _ZGVbN2v_log F
>  GLIBC_2.30 _ZGVbN2v_sin F
>  GLIBC_2.30 _ZGVbN2vvv_sincos F
> --
> 2.21.0.1020.gf2820cf01a
>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [v3 PATCH] PPC64: Add libmvec SIMD double-precision natural exponent function.
  2019-05-29 19:11       ` [v3 PATCH] " Shawn Landden
  2019-05-29 19:12         ` Shawn Landden
  2019-06-04  2:43         ` Shawn Landden
@ 2019-06-04 20:00         ` Tulio Magno Quites Machado Filho
  2 siblings, 0 replies; 15+ messages in thread
From: Tulio Magno Quites Machado Filho @ 2019-06-04 20:00 UTC (permalink / raw)
  To: Shawn Landden, libc-alpha

Shawn Landden <shawn@git.icu> writes:

> 	[BZ #24209]
> 	* NEWS: Noted the addition of PPC64 vector exp function.
> 	* sysdeps/powerpc/bits/math-vector.h: Added entry for vector exp.
> 	* sysdeps/powerpc/powerpc64/fpu/Versions: Added vector exp entry.
> 	* sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile:
> 	(libmvec-sysdep_routines, CFLAGS-vec_d_exp2_vsx.c):
> 	(CFLAGS-vec_d_exp_data.c): Added build of VSX SIMD expf
> 	function and tests. Added vec_math_err.c to build.
> 	* sysdeps/powerpc/powerpc64/fpu/multiarch/(math_config.h)->(math_config_dbl.h)
> 	Renamed, and modified for exp.
> 	* sysdeps/powerpc/powerpc64/fpu/multiarch/test-double-vlen2-wrappers.c:
> 	Added entry for vector exp.
> 	* sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c: New file.
> 	* sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp_data.c: Likewise.
> 	* sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_err.c: Likewise.
> 	* sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist: SIMD expf
> 	added.
> 	* sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log_data.c: Rename of math_config.h
> 	* sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_log2_vsx.c: Likewise

Looks like some of the issues you fixed in v2 got reverted here.
Fixed.

> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c
> new file mode 100644
> index 0000000000..376531e32b
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_exp2_vsx.c
> ...
> +/* Top 12 bits of a double (sign and exponent bits).  */
> +static inline uint32_t top12(double x)
> +{
> +	return asuint64(x) >> 52;
> +}

Something happened here as well and all the coding style issues that you fixed
in v2 got lost.  Starting from this function.
Fixed.

I'm pushing this patch with the fixes I mentioned to branch tuliom/libmvec.

Thanks!

-- 
Tulio Magno

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2019-06-04 20:00 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-05-12  3:28 [PATCH 1/2] PPC64: Add libmvec SIMD single-precision natural exponent function Shawn Landden
2019-05-12  3:28 ` [PATCH 2/2] PPC64: Add libmvec SIMD double-precision " Shawn Landden
2019-05-24 21:41   ` Tulio Magno Quites Machado Filho
     [not found]     ` <CA+49okpEUryebPwnwJha-wJ49=w2qo5r_+JUWvCD-e495UDH5g@mail.gmail.com>
2019-05-24 22:25       ` Tulio Magno Quites Machado Filho
2019-05-24 20:04 ` [PATCH 1/2] PPC64: Add libmvec SIMD single-precision " Tulio Magno Quites Machado Filho
2019-05-27 17:47 ` [v2 " Shawn Landden
2019-05-27 17:47   ` [v2 2/2] PPC64: Add libmvec SIMD double-precision " Shawn Landden
2019-05-29 13:18     ` Tulio Magno Quites Machado Filho
2019-05-29 19:11       ` [v3 PATCH] " Shawn Landden
2019-05-29 19:12         ` Shawn Landden
2019-05-29 19:19           ` Florian Weimer
2019-06-04  2:43         ` Shawn Landden
2019-06-04 20:00         ` Tulio Magno Quites Machado Filho
2019-05-27 18:00   ` [v2 1/2] PPC64: Add libmvec SIMD single-precision " Shawn Landden
2019-05-28 18:20   ` Tulio Magno Quites Machado Filho

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).