unofficial mirror of libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* [PATCH v1 01/27] x86/fpu: Create helper file for common data macros
@ 2022-12-07  8:52 Noah Goldstein via Libc-alpha
  2022-12-07  8:52 ` [PATCH v1 02/27] x86/fpu: Add file for common data used across svml_s_*_avx2.S files Noah Goldstein via Libc-alpha
                   ` (26 more replies)
  0 siblings, 27 replies; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-07  8:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, andrey.kolesov, carlos

The macros are useful for creating .rodata definitions and checking
that the offset is correct.
---
 .../x86_64/fpu/svml_common_data_macros.h.S    | 50 +++++++++++++++++++
 1 file changed, 50 insertions(+)
 create mode 100644 sysdeps/x86_64/fpu/svml_common_data_macros.h.S

diff --git a/sysdeps/x86_64/fpu/svml_common_data_macros.h.S b/sysdeps/x86_64/fpu/svml_common_data_macros.h.S
new file mode 100644
index 0000000000..31bd66835d
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_common_data_macros.h.S
@@ -0,0 +1,50 @@
+/* Helper macros for creating rodata
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   https://www.gnu.org/licenses/.  */
+
+#ifndef _SVML_COMMON_DATA_MACROS_H_S
+#define _SVML_COMMON_DATA_MACROS_H_S	1
+
+
+.macro check_offset data_section offset
+	.if	.-\data_section != \offset
+	.err
+	.endif
+.endm
+
+
+/* Only used in floating point functions at the moment.  */
+.macro float_vectorN data_section N offset value
+	check_offset \data_section \offset
+	.rept	\N
+	.long	\value
+	.endr
+.endm
+
+#define float_block(data_section, offset, ...)	\
+	check_offset data_section offset;	\
+	.long	__VA_ARGS__
+
+
+#define float_vector16(data_section, offset, value)	\
+	float_vectorN data_section 4 offset value
+#define float_vector32(data_section, offset, value)	\
+	float_vectorN data_section 8 offset value
+#define float_vector64(data_section, offset, value)	\
+	float_vectorN data_section 16 offset value
+
+#endif
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v1 02/27] x86/fpu: Add file for common data used across svml_s_*_avx2.S files
  2022-12-07  8:52 [PATCH v1 01/27] x86/fpu: Create helper file for common data macros Noah Goldstein via Libc-alpha
@ 2022-12-07  8:52 ` Noah Goldstein via Libc-alpha
  2022-12-07  8:52 ` [PATCH v1 03/27] x86/fpu: Add file for common data used across svml_s_*_avx512.S files Noah Goldstein via Libc-alpha
                   ` (25 subsequent siblings)
  26 siblings, 0 replies; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-07  8:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, andrey.kolesov, carlos

All rodata that is used by more than one implementation can be defined
here to save .rodata size.

At the moment, definitions have been added only for the following
implementations:
    atanf
    atanhf
    log
    log10f
    log2
    tanf
---
 .../x86_64/fpu/svml_s_common_avx2_rodata.S    | 28 +++++++++++++++++++
 .../fpu/svml_s_common_avx2_rodata_offsets.h   | 26 +++++++++++++++++
 2 files changed, 54 insertions(+)
 create mode 100644 sysdeps/x86_64/fpu/svml_s_common_avx2_rodata.S
 create mode 100644 sysdeps/x86_64/fpu/svml_s_common_avx2_rodata_offsets.h

diff --git a/sysdeps/x86_64/fpu/svml_s_common_avx2_rodata.S b/sysdeps/x86_64/fpu/svml_s_common_avx2_rodata.S
new file mode 100644
index 0000000000..986e8ca685
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_s_common_avx2_rodata.S
@@ -0,0 +1,28 @@
+#include "svml_s_common_avx2_rodata_offsets.h"
+
+	.section .rodata.avx2, "a"
+	.align	32
+	.globl	COMMON_DATA_NAME
+
+COMMON_DATA_NAME:
+	/* Used By: atanf, atanhf, log, log10f, log2, tanf.  */
+	DATA_VEC(COMMON_DATA_NAME, _OneF, 0x3f800000)
+	/* Used By: atanhf, log, log10f, log2.  */
+	DATA_VEC(COMMON_DATA_NAME, _NotiOffExpoMask, 0xff800000)
+	/* Used By: atanhf, log, log10f, log2.  */
+	DATA_VEC(COMMON_DATA_NAME, _IBrkValue, 0x3f2aaaab)
+	/* Used By: atanf, atanhf, tanf, tanhf.  */
+	DATA_VEC(COMMON_DATA_NAME, _AbsMask, 0x7fffffff)
+	/* Used By: log, log10f, log2.  */
+	DATA_VEC(COMMON_DATA_NAME, _ILoRange, 0x01000000)
+	/* Used By: atanhf, log.  */
+	DATA_VEC(COMMON_DATA_NAME, _Neg5F, 0xbf000000)
+	/* Used By: atanhf, log.  */
+	DATA_VEC(COMMON_DATA_NAME, _Ln2, 0x3f317218)
+	/* Used By: atanhf, tanf.  */
+	DATA_VEC(COMMON_DATA_NAME, _Neg4096, 0xfffff000)
+	/* Used By: atanf, tanf.  */
+	DATA_VEC(COMMON_DATA_NAME, _TanSPI1_FMA, 0x3fc90fdb)
+
+	.type	COMMON_DATA_NAME, @object
+	.size	COMMON_DATA_NAME, .-COMMON_DATA_NAME
diff --git a/sysdeps/x86_64/fpu/svml_s_common_avx2_rodata_offsets.h b/sysdeps/x86_64/fpu/svml_s_common_avx2_rodata_offsets.h
new file mode 100644
index 0000000000..6b18b721b1
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_s_common_avx2_rodata_offsets.h
@@ -0,0 +1,26 @@
+#ifndef _SVML_S_COMMON_AVX2_RODATA_OFFSETS_H
+#define _SVML_S_COMMON_AVX2_RODATA_OFFSETS_H	1
+
+#include "svml_common_data_macros.h.S"
+
+#define COMMON_DATA_NAME __svml_s_common_avx2_data
+
+#define _OneF            0
+#define _NotiOffExpoMask 32
+#define _IBrkValue       64
+#define _AbsMask         96
+#define _ILoRange        128
+#define _Neg5F           160
+#define _Ln2             192
+#define _Neg4096         224
+#define _TanSPI1_FMA     256
+
+#define COMMON_DATA(offset) ((offset) + (COMMON_DATA_NAME))
+#define DATA_VEC(data_name, offset, value)                                     \
+    float_vector32(data_name, offset, value)
+
+#ifdef LOCAL_DATA_NAME
+# define LOCAL_DATA(offset) ((offset) + (LOCAL_DATA_NAME))
+#endif
+
+#endif
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v1 03/27] x86/fpu: Add file for common data used across svml_s_*_avx512.S files
  2022-12-07  8:52 [PATCH v1 01/27] x86/fpu: Create helper file for common data macros Noah Goldstein via Libc-alpha
  2022-12-07  8:52 ` [PATCH v1 02/27] x86/fpu: Add file for common data used across svml_s_*_avx2.S files Noah Goldstein via Libc-alpha
@ 2022-12-07  8:52 ` Noah Goldstein via Libc-alpha
  2022-12-07  8:52 ` [PATCH v1 04/27] x86/fpu: Add file for common data used across svml_s_*_sse4.S files Noah Goldstein via Libc-alpha
                   ` (24 subsequent siblings)
  26 siblings, 0 replies; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-07  8:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, andrey.kolesov, carlos

All rodata that is used by more than one implementation can be defined
here to save .rodata size.

At the moment, definitions have been added only for the following
implementations:
    atanf
    atanhf
    log
    log10f
    log2
    tanf
---
 .../x86_64/fpu/svml_s_common_evex512_rodata.S | 19 +++++++++++++
 .../svml_s_common_evex512_rodata_offsets.h    | 27 +++++++++++++++++++
 2 files changed, 46 insertions(+)
 create mode 100644 sysdeps/x86_64/fpu/svml_s_common_evex512_rodata.S
 create mode 100644 sysdeps/x86_64/fpu/svml_s_common_evex512_rodata_offsets.h

diff --git a/sysdeps/x86_64/fpu/svml_s_common_evex512_rodata.S b/sysdeps/x86_64/fpu/svml_s_common_evex512_rodata.S
new file mode 100644
index 0000000000..4652b7be41
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_s_common_evex512_rodata.S
@@ -0,0 +1,19 @@
+#include "svml_s_common_evex512_rodata_offsets.h"
+
+	.section .rodata.evex512, "a"
+	.align	64
+	.globl	COMMON_DATA_NAME
+COMMON_DATA_NAME:
+	/* Used By: atanf, atanhf, log10f, log2f, logf, tanf.  */
+	DATA_VEC (COMMON_DATA_NAME, _OneF, 0x3f800000)
+	/* Used By: logf, tanf.  */
+	DATA_VEC (COMMON_DATA_NAME, _NotiOffExpoMask, 0xff800000)
+	/* Used By: atanhf, logf.  */
+	DATA_VEC (COMMON_DATA_NAME, _Neg5F, 0xbf000000)
+	/* Used By: tanf, atanhf.  */
+	DATA_VEC (COMMON_DATA_NAME, _AbsMask, 0x7fffffff)
+	/* Used By: atanf, tanhf.  */
+	DATA_VEC (COMMON_DATA_NAME, _SignMask, 0x80000000)
+
+	.type	COMMON_DATA_NAME, @object
+	.size	COMMON_DATA_NAME, .-COMMON_DATA_NAME
diff --git a/sysdeps/x86_64/fpu/svml_s_common_evex512_rodata_offsets.h b/sysdeps/x86_64/fpu/svml_s_common_evex512_rodata_offsets.h
new file mode 100644
index 0000000000..f41020f760
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_s_common_evex512_rodata_offsets.h
@@ -0,0 +1,27 @@
+#ifndef _SVML_S_COMMON_EVEX512_RODATA_OFFSETS_H
+#define _SVML_S_COMMON_EVEX512_RODATA_OFFSETS_H	1
+
+#include "svml_common_data_macros.h.S"
+
+#define COMMON_DATA_NAME __svml_s_common_evex512_data
+
+#define _OneF            0
+#define _NotiOffExpoMask 64
+#define _Neg5F           128
+#define _AbsMask         192
+#define _SignMask        256
+
+
+#define COMMON_DATA(offset) ((offset) + (COMMON_DATA_NAME))
+#define DATA_VEC(data_name, offset, value)                                     \
+    float_vector64(data_name, offset, value)
+
+#ifdef LOCAL_DATA_NAME
+# define LOCAL_DATA(offset) ((offset) + (LOCAL_DATA_NAME))
+#endif
+
+#ifdef LOCAL_DATA_NAME_UNALIGNED
+# define LOCAL_DATA_UNALIGNED(offset) ((offset) + (LOCAL_DATA_NAME_UNALIGNED))
+#endif
+
+#endif
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v1 04/27] x86/fpu: Add file for common data used across svml_s_*_sse4.S files
  2022-12-07  8:52 [PATCH v1 01/27] x86/fpu: Create helper file for common data macros Noah Goldstein via Libc-alpha
  2022-12-07  8:52 ` [PATCH v1 02/27] x86/fpu: Add file for common data used across svml_s_*_avx2.S files Noah Goldstein via Libc-alpha
  2022-12-07  8:52 ` [PATCH v1 03/27] x86/fpu: Add file for common data used across svml_s_*_avx512.S files Noah Goldstein via Libc-alpha
@ 2022-12-07  8:52 ` Noah Goldstein via Libc-alpha
  2022-12-07  8:52 ` [PATCH v1 05/27] x86/fpu: Build common data files for svml_s_*_{avx512, avx2, sse4}.S Noah Goldstein via Libc-alpha
                   ` (23 subsequent siblings)
  26 siblings, 0 replies; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-07  8:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, andrey.kolesov, carlos

All rodata that is used by more than one implementation can be defined
here to save .rodata size.

At the moment, definitions have been added only for the following
implementations:
    atanf
    atanhf
    log
    log10f
    log2
    tanf
---
 .../x86_64/fpu/svml_s_common_sse4_rodata.S    | 27 +++++++++++++++++++
 .../fpu/svml_s_common_sse4_rodata_offsets.h   | 25 +++++++++++++++++
 2 files changed, 52 insertions(+)
 create mode 100644 sysdeps/x86_64/fpu/svml_s_common_sse4_rodata.S
 create mode 100644 sysdeps/x86_64/fpu/svml_s_common_sse4_rodata_offsets.h

diff --git a/sysdeps/x86_64/fpu/svml_s_common_sse4_rodata.S b/sysdeps/x86_64/fpu/svml_s_common_sse4_rodata.S
new file mode 100644
index 0000000000..58337635fd
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_s_common_sse4_rodata.S
@@ -0,0 +1,27 @@
+#include "svml_s_common_sse4_rodata_offsets.h"
+
+	.section .rodata.sse4, "a"
+	.align	16
+	.globl	COMMON_DATA_NAME
+
+COMMON_DATA_NAME:
+	/* Used By: atanf, atanhf, log10f, log2f, logf, tanf.  */
+	DATA_VEC(COMMON_DATA_NAME, _OneF, 0x3f800000)
+	/* Used By: atanhf, log10f, log2f, logf, tanf.  */
+	DATA_VEC(COMMON_DATA_NAME, _NotiOffExpoMask, 0xff800000)
+	/* Used By: atanhf, log10f, log2f, logf.  */
+	DATA_VEC(COMMON_DATA_NAME, _IBrkValue, 0x3f2aaaab)
+	/* Used By: log10f, log2f, logf.  */
+	DATA_VEC(COMMON_DATA_NAME, _ILoRange, 0x01000000)
+	/* Used By: atanhf, logf.  */
+	DATA_VEC(COMMON_DATA_NAME, _Neg5F, 0xbf000000)
+	/* Used By: atanhf, logf.  */
+	DATA_VEC(COMMON_DATA_NAME, _Ln2, 0x3f317218)
+	/* Used By: atanhf, tanf.  */
+	DATA_VEC(COMMON_DATA_NAME, _AbsMask, 0x7fffffff)
+	/* Used By: atanhf, tanf.  */
+	DATA_VEC(COMMON_DATA_NAME, _Neg4096, 0xfffff000)
+
+
+	.type	COMMON_DATA_NAME, @object
+	.size	COMMON_DATA_NAME, .-COMMON_DATA_NAME
diff --git a/sysdeps/x86_64/fpu/svml_s_common_sse4_rodata_offsets.h b/sysdeps/x86_64/fpu/svml_s_common_sse4_rodata_offsets.h
new file mode 100644
index 0000000000..8f985565d1
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_s_common_sse4_rodata_offsets.h
@@ -0,0 +1,25 @@
+#ifndef _SVML_S_COMMON_SSE4_RODATA_OFFSETS_H
+#define _SVML_S_COMMON_SSE4_RODATA_OFFSETS_H	1
+
+#include "svml_common_data_macros.h.S"
+
+#define COMMON_DATA_NAME __svml_s_common_sse4_data
+
+#define _OneF	0
+#define _NotiOffExpoMask	16
+#define _IBrkValue	32
+#define _ILoRange	48
+#define _Neg5F	64
+#define _Ln2	80
+#define _AbsMask	96
+#define _Neg4096	112
+
+#define COMMON_DATA(offset) ((offset) + (COMMON_DATA_NAME))
+#define DATA_VEC(data_name, offset, value)                                     \
+    float_vector16(data_name, offset, value)
+
+#ifdef LOCAL_DATA_NAME
+# define LOCAL_DATA(offset) ((offset) + (LOCAL_DATA_NAME))
+#endif
+
+#endif
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v1 05/27] x86/fpu: Build common data files for svml_s_*_{avx512, avx2, sse4}.S
  2022-12-07  8:52 [PATCH v1 01/27] x86/fpu: Create helper file for common data macros Noah Goldstein via Libc-alpha
                   ` (2 preceding siblings ...)
  2022-12-07  8:52 ` [PATCH v1 04/27] x86/fpu: Add file for common data used across svml_s_*_sse4.S files Noah Goldstein via Libc-alpha
@ 2022-12-07  8:52 ` Noah Goldstein via Libc-alpha
  2022-12-07  8:52 ` [PATCH v1 06/27] x86/fpu: Update rodata usage in svml_s_tanhf_*_{avx2, sse4} Noah Goldstein via Libc-alpha
                   ` (22 subsequent siblings)
  26 siblings, 0 replies; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-07  8:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, andrey.kolesov, carlos

Build them so they can start to be used in the implementations.
---
 sysdeps/x86_64/fpu/Makefile | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sysdeps/x86_64/fpu/Makefile b/sysdeps/x86_64/fpu/Makefile
index 7233174ede..e7e747e920 100644
--- a/sysdeps/x86_64/fpu/Makefile
+++ b/sysdeps/x86_64/fpu/Makefile
@@ -14,6 +14,9 @@ libmvec-support += \
   svml_d_log_data \
   svml_d_pow_data \
   svml_d_trig_data \
+  svml_s_common_avx2_rodata \
+  svml_s_common_evex512_rodata \
+  svml_s_common_sse4_rodata \
   svml_s_expf_data \
   svml_s_logf_data \
   svml_s_powf_data \
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v1 06/27] x86/fpu: Update rodata usage in svml_s_tanhf_*_{avx2, sse4}
  2022-12-07  8:52 [PATCH v1 01/27] x86/fpu: Create helper file for common data macros Noah Goldstein via Libc-alpha
                   ` (3 preceding siblings ...)
  2022-12-07  8:52 ` [PATCH v1 05/27] x86/fpu: Build common data files for svml_s_*_{avx512, avx2, sse4}.S Noah Goldstein via Libc-alpha
@ 2022-12-07  8:52 ` Noah Goldstein via Libc-alpha
  2022-12-07  8:52 ` [PATCH v1 07/27] x86/fpu: Update rodata usage in svml_s_tanhf16_core_avx512.S Noah Goldstein via Libc-alpha
                   ` (21 subsequent siblings)
  26 siblings, 0 replies; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-07  8:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, andrey.kolesov, carlos

No changes to the logic, just change how rodata is handled.

1. Define the rodatas using the new macros so they check that the
   offset is correct.

2. Use common data where applicable.
---
 .../fpu/multiarch/svml_s_tanhf4_core_sse4.S   |  72 +-
 .../fpu/multiarch/svml_s_tanhf8_core_avx2.S   | 105 +--
 .../fpu/multiarch/svml_s_tanhf_rodata.S       | 621 ------------------
 .../fpu/multiarch/svml_s_tanhf_rodata.h.S     | 591 +++++++++++++++++
 4 files changed, 680 insertions(+), 709 deletions(-)
 delete mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_tanhf_rodata.S
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_tanhf_rodata.h.S

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
index bf7687d8ba..824862eb7f 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
@@ -70,21 +70,22 @@
  *
  */
 
+#include "svml_s_common_sse4_rodata_offsets.h"
 
-#include <sysdep.h>
+/* tanhf data tables for avx2 and sse4 implementations
+   defined here.  */
+#define AVX2_SHARED_OFFSETS
+#include "svml_s_tanhf_rodata.h.S"
 
-/* tanhf data tables for avx2 and sse4 implementatins defined here.
- */
-#define ONLY_DECL_OFFSET
-#include "svml_s_tanhf_rodata.S"
+#include <sysdep.h>
 
 	.section .text.sse4, "ax", @progbits
 ENTRY(_ZGVbN4v_tanhf_sse4)
 	/* Save copy of input in xmm12.  */
 	movaps	%xmm0, %xmm12
 
-	/* Here huge arguments, INF and NaNs are filtered out to callout. */
-	movdqu	TANHF_DATA(_iExpMantMask)(%rip), %xmm3
+	/* Here huge arguments, INF and NaNs are filtered out to callout.  */
+	movdqu	AVX2_SHARED_DATA(_iExpMantMask)(%rip), %xmm3
 	pand	%xmm0, %xmm3
 
 
@@ -92,9 +93,9 @@ ENTRY(_ZGVbN4v_tanhf_sse4)
 	pxor	%xmm7, %xmm7
 	/* Save xmm3 for special values check at end.  */
 	movdqa	%xmm3, %xmm8
-	psubd	TANHF_DATA(_iMinIdxOfsMask)(%rip), %xmm3
+	psubd	AVX2_SHARED_DATA(_iMinIdxOfsMask)(%rip), %xmm3
 	pmaxsd	%xmm7, %xmm3
-	pminsd	TANHF_DATA(_iMaxIdxMask)(%rip), %xmm3
+	pminsd	AVX2_SHARED_DATA(_iMaxIdxMask)(%rip), %xmm3
 	psrld	$14, %xmm3
 
 	movq	%xmm3, %rcx
@@ -106,17 +107,14 @@ ENTRY(_ZGVbN4v_tanhf_sse4)
 	movl	%edi, %esi
 	shrq	$32, %rdi
 
-	movaps	TANHF_DATA(_sAbsMask)(%rip), %xmm1
+	movaps	COMMON_DATA(_AbsMask)(%rip), %xmm1
 	andps	%xmm1, %xmm0
 
-	leaq	TANHF_DATA(_lookupTable)(%rip), %rax
+	/* small table specific variables Constant loading.  */
+	leaq	AVX2_SHARED_DATA(_lookupTable)(%rip), %rax
 	movups	(%rdx, %rax), %xmm2
 	movups	(%rcx, %rax), %xmm6
 
-	/*
-	 *  small table specific variables *
-	 *  Constant loading
-	 */
 	movaps	%xmm2, %xmm4
 	movlhps	%xmm6, %xmm4
 	unpckhpd %xmm6, %xmm2
@@ -175,45 +173,45 @@ ENTRY(_ZGVbN4v_tanhf_sse4)
 	orps	%xmm1, %xmm0
 
 	/* xmm8 contains mask of special values.  */
-	pcmpgtd	TANHF_DATA(_iExpMask)(%rip), %xmm8
+	pcmpgtd	AVX2_SHARED_DATA(_iExpMask)(%rip), %xmm8
 
 	movmskps %xmm8, %edx
 	testl	%edx, %edx
 
-	/* Go to special inputs processing branch */
+	/* Go to special inputs processing branch.  */
 	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx rbp r12 r13 r14 r15 xmm0
+
 	/* No stack restoration on the fastpath.  */
 	ret
 
 	/* Cold case. edx has 1s where there was a special value that
 	   needs to be handled by a tanhf call. Optimize for code size
-	   more so than speed here. */
+	   more so than speed here.  */
 L(SPECIAL_VALUES_BRANCH):
-	# LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm12
+
 	/* Stack coming in 16-byte aligned. Set 8-byte misaligned so on
-       call entry will be 16-byte aligned. */
+	   call entry will be 16-byte aligned.  */
 	subq	$56, %rsp
-	cfi_def_cfa_offset(64)
+	cfi_def_cfa_offset (64)
 	movups	%xmm0, 24(%rsp)
 	movups	%xmm12, 40(%rsp)
 
 	/* Use rbx/rbp for callee save registers as they get short
-       encoding for many instructions (as compared with r12/r13). */
+	   encoding for many instructions (as compared with r12/r13).  */
 	movq	%rbx, (%rsp)
-	cfi_offset(rbx, -64)
+	cfi_offset (rbx, -64)
 	movq	%rbp, 8(%rsp)
-	cfi_offset(rbp, -56)
-	/* edx has 1s where there was a special value that needs to be handled
-	   by a tanhf call.  */
+	cfi_offset (rbp, -56)
+	/* edx has 1s where there was a special value that needs to be
+	   handled by a tanhf call.  */
 	movl	%edx, %ebx
 L(SPECIAL_VALUES_LOOP):
-	# LOE rbx rbp r12 r13 r14 r15
-	/* use rbp as index for special value that is saved across calls to
-	   tanhf. We technically don't need a callee save register here as offset
-	   to rsp is always [0, 12] so we can restore rsp by realigning to 64.
-	   Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
-	   in the loop.  */
+
+	/* use rbp as index for special value that is saved across calls
+	   to tanhf. We technically don't need a callee save register
+	   here as offset to rsp is always [0, 12] so we can restore
+	   rsp by realigning to 64. Essentially the tradeoff is 1 extra
+	   save/restore vs 2 extra instructions in the loop.  */
 	xorl	%ebp, %ebp
 	bsfl	%ebx, %ebp
 
@@ -228,14 +226,14 @@ L(SPECIAL_VALUES_LOOP):
 	leal	-1(%rbx), %eax
 	andl	%eax, %ebx
 	jnz	L(SPECIAL_VALUES_LOOP)
-	# LOE r12 r13 r14 r15
+
 	/* All results have been written to 24(%rsp).  */
 	movups	24(%rsp), %xmm0
 	movq	(%rsp), %rbx
-	cfi_restore(rbx)
+	cfi_restore (rbx)
 	movq	8(%rsp), %rbp
-	cfi_restore(rbp)
+	cfi_restore (rbp)
 	addq	$56, %rsp
-	cfi_def_cfa_offset(8)
+	cfi_def_cfa_offset (8)
 	ret
 END(_ZGVbN4v_tanhf_sse4)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
index ea3e9f4210..9c524f5df4 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
@@ -70,42 +70,44 @@
  *
  */
 
-#include <sysdep.h>
+#include "svml_s_common_avx2_rodata_offsets.h"
 
-/* tanhf data tables for avx2 and sse4 implementatins defined here.
- */
-#include "svml_s_tanhf_rodata.S"
+/* tanhf data tables for avx2 and sse4 implementations
+   defined here.  */
+#define AVX2_SHARED_TABLE
+#include "svml_s_tanhf_rodata.h.S"
+
+#include <sysdep.h>
 
 	.section .text.avx2, "ax", @progbits
 ENTRY(_ZGVdN8v_tanhf_avx2)
-	/* Here huge arguments, INF and NaNs are filtered out to callout. */
-	vpand	TANHF_DATA(_iExpMantMask)(%rip), %ymm0, %ymm4
-	vpsubd	TANHF_DATA(_iMinIdxOfsMask)(%rip), %ymm4, %ymm2
+	/* Here huge arguments, INF and NaNs are filtered out to callout.  */
+	vpand	AVX2_SHARED_DATA(_iExpMantMask)(%rip), %ymm0, %ymm4
+	vpsubd	AVX2_SHARED_DATA(_iMinIdxOfsMask)(%rip), %ymm4, %ymm2
 
 	/* Selection of arguments between [0, 0x04280000] into ymm2.  */
 	vpxor	%ymm3, %ymm3, %ymm3
 	vpmaxsd	%ymm3, %ymm2, %ymm2
-	vpminsd	TANHF_DATA(_iMaxIdxMask)(%rip), %ymm2, %ymm2
+	vpminsd	AVX2_SHARED_DATA(_iMaxIdxMask)(%rip), %ymm2, %ymm2
 
-	/*
-	 *  small table specific variables *
-	 *  Constant loading
-	 */
+	/* small table specific variables *
+	   Constant loading.  */
 	vpsrld	$14, %ymm2, %ymm1
 
 	/* We are splitting xmm1 into 8 GPRs. This may be faster to do with
 	   store/load as we can take advantage of store-forwarding.  */
 	vmovq	%xmm1, %r8
-	/* We have eliminated all negative values for ymm1 so no need to sign
-	   extend.  */
+	/* We have eliminated all negative values for ymm1 so no need to
+	   sign extend.  */
 	movl	%r8d, %r9d
 	shrq	$32, %r8
 
 	/* Store base of lookup table in rax.  */
-	leaq	TANHF_DATA(_lookupTable)(%rip), %rax
+	leaq	AVX2_SHARED_DATA(_lookupTable)(%rip), %rax
 
-	/* Instead of using cross-lane permutes on ymm vectors, use vpinsertf128
-	   with memory operand. This helps alleviate bottleneck on p5.  */
+	/* Instead of using cross-lane permutes on ymm vectors, use
+	   vpinsertf128 with memory operand. This helps alleviate
+	   bottleneck on p5.  */
 	vmovupd	16(%r9, %rax), %xmm5
 
 	vpextrq	$1, %xmm1, %rsi
@@ -138,9 +140,9 @@ ENTRY(_ZGVdN8v_tanhf_avx2)
 	vunpcklpd %ymm1, %ymm5, %ymm3
 	vunpckhpd %ymm1, %ymm5, %ymm1
 
-	vmovaps	TANHF_DATA(_sAbsMask)(%rip), %ymm11
+	vmovaps	COMMON_DATA(_AbsMask)(%rip), %ymm11
 	/* Store special cases in ymm15.  */
-	vpcmpgtd TANHF_DATA(_iExpMask)(%rip), %ymm4, %ymm15
+	vpcmpgtd AVX2_SHARED_DATA(_iExpMask)(%rip), %ymm4, %ymm15
 
 	vandps	%ymm11, %ymm0, %ymm4
 
@@ -184,9 +186,9 @@ ENTRY(_ZGVdN8v_tanhf_avx2)
 	vmovmskps %ymm15, %edx
 	vandnps	%ymm0, %ymm11, %ymm2
 	testl	%edx, %edx
-	/* Go to special inputs processing branch */
+	/* Go to special inputs processing branch.  */
 	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx r12 r13 r14 r15 ymm0 ymm1 ymm2
+
 	/* Wait until after branch of write over ymm0.  */
 	vorps	%ymm2, %ymm1, %ymm0
 	/* No stack restoration on the fastpath.  */
@@ -195,24 +197,24 @@ ENTRY(_ZGVdN8v_tanhf_avx2)
 
 	/* Cold case. edx has 1s where there was a special value that
 	   needs to be handled by a tanhf call. Optimize for code size
-	   more so than speed here. */
+	   more so than speed here.  */
 L(SPECIAL_VALUES_BRANCH):
-	# LOE rbx rdx r12 r13 r14 r15 ymm0 ymm1 ymm2
-    /* Use r13 to save/restore the stack. This allows us to use rbp as
-       callee save register saving code size. */
+
+	/* Use r13 to save/restore the stack. This allows us to use rbp as
+	   callee save register saving code size.  */
 	pushq	%r13
-	cfi_adjust_cfa_offset(8)
-	cfi_offset(r13, -16)
-	/* Need to callee save registers to preserve state across tanhf calls.
-	 */
+	cfi_adjust_cfa_offset (8)
+	cfi_offset (r13, -16)
+	/* Need to callee save registers to preserve state across tanhf
+	   calls.  */
 	pushq	%rbx
-	cfi_adjust_cfa_offset(8)
-	cfi_offset(rbx, -24)
+	cfi_adjust_cfa_offset (8)
+	cfi_offset (rbx, -24)
 	pushq	%rbp
-	cfi_adjust_cfa_offset(8)
-	cfi_offset(rbp, -32)
+	cfi_adjust_cfa_offset (8)
+	cfi_offset (rbp, -32)
 	movq	%rsp, %r13
-	cfi_def_cfa_register(r13)
+	cfi_def_cfa_register (r13)
 
 	/* Align stack and make room for 2x ymm vectors.  */
 	andq	$-32, %rsp
@@ -226,16 +228,17 @@ L(SPECIAL_VALUES_BRANCH):
 
 	vzeroupper
 
-	/* edx has 1s where there was a special value that needs to be handled
-	   by a tanhf call.  */
+	/* edx has 1s where there was a special value that needs to be
+	   handled by a tanhf call.  */
 	movl	%edx, %ebx
 L(SPECIAL_VALUES_LOOP):
-	# LOE rbx rbp r12 r13 r14 r15
-	/* use rbp as index for special value that is saved across calls to
-	   tanhf. We technically don't need a callee save register here as offset
-	   to rsp is always [0, 28] so we can restore rsp by realigning to 64.
-	   Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
-	   in the loop. Realigning also costs more code size.  */
+
+	/* use rbp as index for special value that is saved across calls
+	   to tanhf. We technically don't need a callee save register
+	   here as offset to rsp is always [0, 28] so we can restore
+	   rsp by realigning to 64. Essentially the tradeoff is 1 extra
+	   save/restore vs 2 extra instructions in the loop. Realigning
+	   also costs more code size.  */
 	xorl	%ebp, %ebp
 	tzcntl	%ebx, %ebp
 
@@ -248,25 +251,25 @@ L(SPECIAL_VALUES_LOOP):
 	   serialized stack/callee save restoration.  */
 	vmovss	%xmm0, (%rsp, %rbp, 4)
 
-	blsrl   %ebx, %ebx
+	blsrl	%ebx, %ebx
 	jnz	L(SPECIAL_VALUES_LOOP)
-	# LOE r12 r13 r14 r15
+
 
 
 	/* All results have been written to (%rsp).  */
 	vmovups	(%rsp), %ymm0
 	/* Restore rsp.  */
 	movq	%r13, %rsp
-	cfi_def_cfa_register(rsp)
+	cfi_def_cfa_register (rsp)
 	/* Restore callee save registers.  */
 	popq	%rbp
-	cfi_adjust_cfa_offset(-8)
-	cfi_restore(rbp)
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (rbp)
 	popq	%rbx
-	cfi_adjust_cfa_offset(-8)
-	cfi_restore(rbp)
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (rbp)
 	popq	%r13
-	cfi_adjust_cfa_offset(-8)
-	cfi_restore(r13)
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (r13)
 	ret
 END(_ZGVdN8v_tanhf_avx2)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf_rodata.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf_rodata.S
deleted file mode 100644
index 904fe5f588..0000000000
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf_rodata.S
+++ /dev/null
@@ -1,621 +0,0 @@
-/* Datatables for  tanhf AVX2 and tanhf SSE4.
-   Copyright (C) 2021-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   https://www.gnu.org/licenses/.  */
-
-/* Offsets are ordered by use in the function. On cold-starts this
-   might help the prefetcher. If the streaming prefetchers kick in it
-   will prefetch into the lookup table.  */
-#define _iExpMantMask			0
-#define _iMinIdxOfsMask			32
-#define _iMaxIdxMask			64
-#define _sAbsMask			96
-#define _iExpMask			128
-#define _lookupTable			160
-
-#define TANHF_DATA(offset)		((offset)+__svml_stanh_data_internal_avx2)
-#ifndef ONLY_DECL_OFFSET
-	.section .rodata, "a"
-	.align	32
-
-# ifdef __svml_stanh_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct
-	{
-	__declspec(align(32)) VUINT32 _iExpMantMask[8][1];
-	__declspec(align(32)) VUINT32 _iMinIdxOfsMask[8][1];
-	__declspec(align(32)) VUINT32 _iMaxIdxMask[8][1];
-	__declspec(align(32)) VUINT32 _sAbsMask[8][1];
-	__declspec(align(32)) VUINT32 _iExpMask[8][1];
-	__declspec(align(32)) VUINT32 _lookupTable[(134*4)][2];
-} __svml_stanh_data_internal;
-# endif
-
-
-__svml_stanh_data_internal:
-	.globl	__svml_stanh_data_internal_avx2
-__svml_stanh_data_internal_avx2:
-	.align	32
-	/* _iExpMantMask.  */
-	.long	0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000
-	.long	0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000
-
-	.align	32
-	/* _iMinIdxOfsMask.  */
-	.long	0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000
-	.long	0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000
-
-	.align	32
-	/* _iMaxIdxMask.  */
-	.long	0x04280000, 0x04280000, 0x04280000, 0x04280000
-	.long	0x04280000, 0x04280000, 0x04280000, 0x04280000
-
-	.align	32
-	/* _sAbsMask.  */
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
-
-	.align	32
-	/* _iExpMask.  */
-	.long	0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000
-	.long	0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000
-
-	.align	32
-	/* _lookupTable.  */
-	/* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500].  */
-	.quad	0x0000000000000000 /* A00 = +0.000000000000000000000e-01.  */
-	.quad	0x3FF00000022C70EB /* A01 = +1.000000008097283510367e+00.  */
-	.quad	0xBED00E878CFFA194 /* A02 = -3.828228912518614443549e-06.  */
-	.quad	0xBFD551766D0607A9 /* A03 = -3.330970825846813476723e-01.  */
-	.quad	0xBE53D60CE3E4C297 /* A00 = -1.847383956330407336230e-08.  */
-	.quad	0x3FF000024177CF5C /* A01 = +1.000002151235967140508e+00.  */
-	.quad	0xBF1758BC94A51A25 /* A02 = -8.906031613262943753568e-05.  */
-	.quad	0xBFD53EAE67E0D4F0 /* A03 = -3.319507612644221339337e-01.  */
-	.quad	0xBE5A9E47EF32D6FE /* A00 = -2.479020984039698285657e-08.  */
-	.quad	0x3FF00002DA983057 /* A01 = +1.000002721676556793895e+00.  */
-	.quad	0xBF1BD953509E94AA /* A02 = -1.062352277175377670507e-04.  */
-	.quad	0xBFD53BDB562EEDD5 /* A03 = -3.317783681520414806876e-01.  */
-	.quad	0xBE6191BBE496D294 /* A00 = -3.272532162914017685901e-08.  */
-	.quad	0x3FF0000390492017 /* A01 = +1.000003398528866105366e+00.  */
-	.quad	0xBF20727E814A57CE /* A02 = -1.254825043772153972919e-04.  */
-	.quad	0xBFD538DE060A6F22 /* A03 = -3.315959033004550748913e-01.  */
-	.quad	0xBE66DAFA2A893A25 /* A00 = -4.257146219278012568149e-08.  */
-	.quad	0x3FF0000465E08CD1 /* A01 = +1.000004194219219266770e+00.  */
-	.quad	0xBF2341C765EF91B6 /* A02 = -1.469188600530365522261e-04.  */
-	.quad	0xBFD535B6841FAF9E /* A03 = -3.314033785124993469751e-01.  */
-	.quad	0xBE6D5794E361E964 /* A00 = -5.465394929765249413434e-08.  */
-	.quad	0x3FF000055EE2A0CB /* A01 = +1.000005121846742950353e+00.  */
-	.quad	0xBF265E6C77E66C8B /* A02 = -1.706607253709506650304e-04.  */
-	.quad	0xBFD53264DDCCEDA6 /* A03 = -3.312008062382240103361e-01.  */
-	.quad	0xBE729C844D374A6E /* A00 = -6.933284462462096107184e-08.  */
-	.quad	0x3FF000067F019093 /* A01 = +1.000006195180536350264e+00.  */
-	.quad	0xBF29CC5348D6DCE5 /* A02 = -1.968242326435338705130e-04.  */
-	.quad	0xBFD52EE92121ED35 /* A03 = -3.309881995734998416658e-01.  */
-	.quad	0xBE775AEA17EAA872 /* A00 = -8.700465590574974405858e-08.  */
-	.quad	0x3FF00007CA1D66B8 /* A01 = +1.000007428656699559610e+00.  */
-	.quad	0xBF2D8F5EB98A2637 /* A02 = -2.255252009216044881395e-04.  */
-	.quad	0xBFD52B435CDF9128 /* A03 = -3.307655722585587376727e-01.  */
-	.quad	0xBE7D04DA28C343F0 /* A00 = -1.081040272327705484794e-07.  */
-	.quad	0x3FF000094443CCF5 /* A01 = +1.000008837375216730337e+00.  */
-	.quad	0xBF30D5B76C947AE5 /* A02 = -2.568791210978817814332e-04.  */
-	.quad	0xBFD52773A0776FAD /* A03 = -3.305329386764651045105e-01.  */
-	.quad	0xBE81DD77A12C51C7 /* A00 = -1.331054169875768625701e-07.  */
-	.quad	0x3FF0000AF1AFD2DA /* A01 = +1.000010437096696680470e+00.  */
-	.quad	0xBF331230624C1680 /* A02 = -2.910011410651516805537e-04.  */
-	.quad	0xBFD52379FC0B61DF /* A03 = -3.302903138515186909352e-01.  */
-	.quad	0xBE85D04EEEB3C435 /* A00 = -1.625247628488202841012e-07.  */
-	.quad	0x3FF0000CD6C9B1F2 /* A01 = +1.000012244238970726684e+00.  */
-	.quad	0xBF357F0742FADDD4 /* A02 = -3.280060509313874068243e-04.  */
-	.quad	0xBFD51F56806D0E81 /* A03 = -3.300377134475880880338e-01.  */
-	.quad	0xBE8A6E289B59681B /* A00 = -1.969211333326924655065e-07.  */
-	.quad	0x3FF0000EF8268F72 /* A01 = +1.000014275873550406715e+00.  */
-	.quad	0xBF381E277A1B747A /* A02 = -3.680082682942575423093e-04.  */
-	.quad	0xBFD51B093F1D6FD4 /* A03 = -3.297751537663746734808e-01.  */
-	.quad	0xBE8FCBC40EE9ABD5 /* A00 = -2.368983653301529373887e-07.  */
-	.quad	0x3FF000115A883B6C /* A01 = +1.000016549721943981410e+00.  */
-	.quad	0xBF3AF17AC974B3D9 /* A02 = -4.111218235774406434303e-04.  */
-	.quad	0xBFD516924A4C549C /* A03 = -3.295026517456081105450e-01.  */
-	.quad	0xBE92FFBC60A3F956 /* A00 = -2.831066871072026054144e-07.  */
-	.quad	0x3FF0001402DCED8A /* A01 = +1.000019084151832604590e+00.  */
-	.quad	0xBF3DFAE9390C4801 /* A02 = -4.574603454311488280083e-04.  */
-	.quad	0xBFD511F1B4D7DC3A /* A03 = -3.292202249571719585575e-01.  */
-	.quad	0xBE9690A22F96D5AD /* A00 = -3.362443262393081632612e-07.  */
-	.quad	0x3FF00016F63EFF5D /* A01 = +1.000021898173108825247e+00.  */
-	.quad	0xBF409E2C839605BB /* A02 = -5.071370461992499986334e-04.  */
-	.quad	0xBFD50D27924BEE00 /* A03 = -3.289278916051614487515e-01.  */
-	.quad	0xBE9AA56C65E72A73 /* A00 = -3.970591019557469835586e-07.  */
-	.quad	0x3FF0001A39F4A43E /* A01 = +1.000025011433776978009e+00.  */
-	.quad	0xBF425BD74C3D6667 /* A02 = -5.602647074553602319844e-04.  */
-	.quad	0xBFD50833F6E1ABA2 /* A03 = -3.286256705238718156536e-01.  */
-	.quad	0xBE9F4BD4FF1A83B0 /* A00 = -4.663500013744687071912e-07.  */
-	.quad	0x3FF0001DD36F9EC2 /* A01 = +1.000028444215715683896e+00.  */
-	.quad	0xBF44376634149405 /* A02 = -6.169556656102642569831e-04.  */
-	.quad	0xBFD50316F77EDEE5 /* A03 = -3.283135811757190158922e-01.  */
-	.quad	0xBEA3B625387BB079 /* A00 = -5.874486399249461304297e-07.  */
-	.quad	0x3FF00023E14CFBA9 /* A01 = +1.000034217911642153709e+00.  */
-	.quad	0xBF47392F923218D2 /* A02 = -7.087213783883111826306e-04.  */
-	.quad	0xBFD4FB1FACDEB938 /* A03 = -3.278273761924483942209e-01.  */
-	.quad	0xBEAA6E24F543500A /* A00 = -7.876828740601738750574e-07.  */
-	.quad	0x3FF0002D5C6E8412 /* A01 = +1.000043259679163742959e+00.  */
-	.quad	0xBF4BAF02BD7FDD70 /* A02 = -8.448375110664940040861e-04.  */
-	.quad	0xBFD4EFEE6527A7DE /* A03 = -3.271442401734229177279e-01.  */
-	.quad	0xBEB16E3EBE2157D0 /* A00 = -1.038947396133402500647e-06.  */
-	.quad	0x3FF00038990FEE2F /* A01 = +1.000053975962952312884e+00.  */
-	.quad	0xBF50569481C574CB /* A02 = -9.972048056490652716971e-04.  */
-	.quad	0xBFD4E419278DA2B4 /* A03 = -3.264220129263251113372e-01.  */
-	.quad	0xBEB6A7B6723165D4 /* A00 = -1.350350836279403750524e-06.  */
-	.quad	0x3FF00045CAB4158E /* A01 = +1.000066558657042303793e+00.  */
-	.quad	0xBF531D7C9C849108 /* A02 = -1.166698160951775212202e-03.  */
-	.quad	0xBFD4D7A0BB33B152 /* A03 = -3.256608799117844954552e-01.  */
-	.quad	0xBEBD0EE2A8654AFD /* A00 = -1.732000471561702711532e-06.  */
-	.quad	0x3FF00055276F18D6 /* A01 = +1.000081209219890521211e+00.  */
-	.quad	0xBF562FDBA3FB6C6C /* A02 = -1.354183666925102939860e-03.  */
-	.quad	0xBFD4CA85F1B93DB2 /* A03 = -3.248610363561638125773e-01.  */
-	.quad	0xBEC269D4036A207E /* A00 = -2.195047297096822741730e-06.  */
-	.quad	0x3FF00066E7DA6E4E /* A01 = +1.000098138500919997540e+00.  */
-	.quad	0xBF5991499FC36B3A /* A02 = -1.560518167983372759405e-03.  */
-	.quad	0xBFD4BCC9A72283D6 /* A03 = -3.240226871658341556426e-01.  */
-	.quad	0xBEC7154B6C09CFE1 /* A00 = -2.751729738565190291276e-06.  */
-	.quad	0x3FF0007B47086B80 /* A01 = +1.000117566559055148900e+00.  */
-	.quad	0xBF5D455433B4F8F4 /* A02 = -1.786548832412968197680e-03.  */
-	.quad	0xBFD4AE6CC1BFE145 /* A03 = -3.231460468373550942722e-01.  */
-	.quad	0xBECCA68CC64A0F8A /* A00 = -3.415415948561670285790e-06.  */
-	.quad	0x3FF00092827742F7 /* A01 = +1.000139722473418535387e+00.  */
-	.quad	0xBF60A7BF15A527AF /* A02 = -2.033112728132522705610e-03.  */
-	.quad	0xBFD49F703214084C /* A03 = -3.222313393636155876010e-01.  */
-	.quad	0xBED19E68676B241B /* A00 = -4.200644630977303616698e-06.  */
-	.quad	0x3FF000ACDA037B26 /* A01 = +1.000164844146362863597e+00.  */
-	.quad	0xBF62D99F836A02F8 /* A02 = -2.301036405072284102280e-03.  */
-	.quad	0xBFD48FD4F2B91B28 /* A03 = -3.212787981359945810311e-01.  */
-	.quad	0xBED57CF4B0C7AA54 /* A00 = -5.123164339408145209103e-06.  */
-	.quad	0x3FF000CA8FD9E1A1 /* A01 = +1.000193178099017865534e+00.  */
-	.quad	0xBF653A014548E686 /* A02 = -2.591135484433962181405e-03.  */
-	.quad	0xBFD47F9C0844B38F /* A03 = -3.202886658426046806447e-01.  */
-	.quad	0xBEDA012B1B1A41E2 /* A00 = -6.199971197454598722328e-06.  */
-	.quad	0x3FF000EBE868FDF4 /* A01 = +1.000224979259539459520e+00.  */
-	.quad	0xBF67CA9427E0A544 /* A02 = -2.904214255086275467410e-03.  */
-	.quad	0xBFD46EC6812ADB37 /* A03 = -3.192611943626845749655e-01.  */
-	.quad	0xBEDF3EAC5BF12194 /* A00 = -7.449344990702664567927e-06.  */
-	.quad	0x3FF001112A520784 /* A01 = +1.000260510744255704196e+00.  */
-	.quad	0xBF6A8D01ABDA4DC4 /* A02 = -3.241065277345108255891e-03.  */
-	.quad	0xBFD45D55759FFA4A /* A03 = -3.181966446572103146551e-01.  */
-	.quad	0xBEE2A541BC274267 /* A00 = -8.890883582164319970972e-06.  */
-	.quad	0x3FF0013A9E5961F2 /* A01 = +1.000300043631906721231e+00.  */
-	.quad	0xBF6D82ECD080C540 /* A02 = -3.602468994380686462264e-03.  */
-	.quad	0xBFD44B4A0779C0AD /* A03 = -3.170952866557950611259e-01.  */
-	.quad	0xBEE61D97609A27F4 /* A00 = -1.054553560499505625520e-05.  */
-	.quad	0x3FF001688F56A3AF /* A01 = +1.000343856731187974773e+00.  */
-	.quad	0xBF7056F8EFB683EC /* A02 = -3.989193351487490407647e-03.  */
-	.quad	0xBFD438A5620F0F74 /* A03 = -3.159573991399533543500e-01.  */
-	.quad	0xBEEA145429EDD370 /* A00 = -1.243563138839952927732e-05.  */
-	.quad	0x3FF0019B4A242A67 /* A01 = +1.000392236341804297339e+00.  */
-	.quad	0xBF7207D31CA78D9B /* A02 = -4.401993423445739288258e-03.  */
-	.quad	0xBFD42568BA16E7CD /* A03 = -3.147832696228050619602e-01.  */
-	.quad	0xBEEE96370D52680F /* A00 = -1.458491207477835326165e-05.  */
-	.quad	0x3FF001D31D8E4115 /* A01 = +1.000445476009251821736e+00.  */
-	.quad	0xBF73D4CC11EDC094 /* A02 = -4.841611050196221316400e-03.  */
-	.quad	0xBFD411954D8664E7 /* A03 = -3.135731942252974469021e-01.  */
-	.quad	0xBEF338C046215EF8 /* A00 = -1.833122622260562810219e-05.  */
-	.quad	0x3FF00230C32C2EC1 /* A01 = +1.000534784691737621998e+00.  */
-	.quad	0xBF76BD019BCC5DAF /* A02 = -5.551344188254799492943e-03.  */
-	.quad	0xBFD3F2C7156DC21E /* A03 = -3.116929730668135389848e-01.  */
-	.quad	0xBEF9B15EAE411EAE /* A00 = -2.450261207822986676092e-05.  */
-	.quad	0x3FF002C2DF057A4D /* A01 = +1.000674124886830940184e+00.  */
-	.quad	0xBF7B08CCD9AC1E30 /* A02 = -6.600189396301511801646e-03.  */
-	.quad	0xBFD3C7A7A114FED8 /* A03 = -3.090609620157755976777e-01.  */
-	.quad	0xBF00E36483C373B3 /* A00 = -3.221178528332122595812e-05.  */
-	.quad	0x3FF0036F419480D7 /* A01 = +1.000838524028997644777e+00.  */
-	.quad	0xBF7FD255D1777007 /* A02 = -7.768950679260206403087e-03.  */
-	.quad	0xBFD39A453911D6CE /* A03 = -3.062909180947429588215e-01.  */
-	.quad	0xBF05DFA04DD12059 /* A00 = -4.172046622180685472624e-05.  */
-	.quad	0x3FF00438B2A03D8D /* A01 = +1.001030633695197069599e+00.  */
-	.quad	0xBF828F8DBB4A9D10 /* A02 = -9.062869337255224921890e-03.  */
-	.quad	0xBFD36AAB704697D9 /* A03 = -3.033856007044711255993e-01.  */
-	.quad	0xBF0BF3E0C647DEFB /* A00 = -5.331544597092331081714e-05.  */
-	.quad	0x3FF005221063D36D /* A01 = +1.001253189109060359741e+00.  */
-	.quad	0xBF857A2CB3C96102 /* A02 = -1.048693584122917590862e-02.  */
-	.quad	0xBFD338E65BBB4FEC /* A03 = -3.003478904549854444639e-01.  */
-	.quad	0xBF11A506ED7C9D31 /* A00 = -6.730894835681591541979e-05.  */
-	.quad	0x3FF0062E4D0EA92A /* A01 = +1.001508999829250345925e+00.  */
-	.quad	0xBF88AB82C2761AF3 /* A02 = -1.204588085125866091241e-02.  */
-	.quad	0xBFD305028D6BD206 /* A03 = -2.971807843271395688234e-01.  */
-	.quad	0xBF1607C0922D9BF1 /* A00 = -8.403885708006799337092e-05.  */
-	.quad	0x3FF007606C341961 /* A01 = +1.001800940198869449560e+00.  */
-	.quad	0xBF8C25E6DA487BCF /* A02 = -1.374416688582682892494e-02.  */
-	.quad	0xBFD2CF0D0EE8F7B5 /* A03 = -2.938873906713255768075e-01.  */
-	.quad	0xBF1B3A8480A0A16D /* A00 = -1.038688061788578038307e-04.  */
-	.quad	0x3FF008BB802D02D6 /* A01 = +1.002131939589323561535e+00.  */
-	.quad	0xBF8FEB8AE99FD100 /* A02 = -1.558598065819483124983e-02.  */
-	.quad	0xBFD297135BD0911B /* A03 = -2.904709240558688843059e-01.  */
-	.quad	0xBF20ABB9BDB75C65 /* A00 = -1.271881327357976163798e-04.  */
-	.quad	0x3FF00A42A76D8CD1 /* A01 = +1.002504972472525901495e+00.  */
-	.quad	0xBF91FF3D752BB9E6 /* A02 = -1.757522609380570560722e-02.  */
-	.quad	0xBFD25D235C1F88B4 /* A03 = -2.869346999779154305799e-01.  */
-	.quad	0xBF243D3254425461 /* A00 = -1.544116913733432829448e-04.  */
-	.quad	0x3FF00BF909D1795E /* A01 = +1.002923048355647051011e+00.  */
-	.quad	0xBF94304E04D44942 /* A02 = -1.971551804042204897316e-02.  */
-	.quad	0xBFD2214B5E61CFA6 /* A03 = -2.832821294498394371075e-01.  */
-	.quad	0xBF286070011B61CE /* A00 = -1.859795307186510085994e-04.  */
-	.quad	0x3FF00DE1D5E1627E /* A01 = +1.003389201612804537689e+00.  */
-	.quad	0xBF9689D5F4163F59 /* A02 = -2.201017668045266231780e-02.  */
-	.quad	0xBFD1E39A11C3B42C /* A03 = -2.795167134743816728104e-01.  */
-	.quad	0xBF2D250B366A79E8 /* A00 = -2.223564326486314902259e-04.  */
-	.quad	0x3FF010003E134001 /* A01 = +1.003906481248123094829e+00.  */
-	.quad	0xBF990C9FF91F6F81 /* A02 = -2.446222265267250853271e-02.  */
-	.quad	0xBFD1A41E80084CDC /* A03 = -2.756420374218586655246e-01.  */
-	.quad	0xBF314DB5DDC2A30E /* A00 = -2.640313157465248123865e-04.  */
-	.quad	0x3FF012577608921B /* A01 = +1.004477940624503018441e+00.  */
-	.quad	0xBF9BB9626875B0C9 /* A02 = -2.707437288829409385849e-02.  */
-	.quad	0xBFD162E80768A9D0 /* A03 = -2.716617653228725615122e-01.  */
-	.quad	0xBF346A6133808864 /* A00 = -3.115165050094957730625e-04.  */
-	.quad	0x3FF014EAAFCC88A3 /* A01 = +1.005106627192198898157e+00.  */
-	.quad	0xBF9E90BEF9BF7419 /* A02 = -2.984903716411588595059e-02.  */
-	.quad	0xBFD12006545F7FAD /* A03 = -2.675796340899932457269e-01.  */
-	.quad	0xBF37F180DC3848EA /* A00 = -3.653468704395550778821e-04.  */
-	.quad	0x3FF017BD19147861 /* A01 = +1.005795572250939295955e+00.  */
-	.quad	0xBFA0C9A14C702E07 /* A02 = -3.278831537326359207851e-02.  */
-	.quad	0xBFD0DB895B650092 /* A03 = -2.633994476818851682154e-01.  */
-	.quad	0xBF3BEC6AAC6D7635 /* A00 = -4.260788377246944457107e-04.  */
-	.quad	0x3FF01AD1D884E719 /* A01 = +1.006547780778822565040e+00.  */
-	.quad	0xBFA260B2A1B1434A /* A02 = -3.589399551186163439542e-02.  */
-	.quad	0xBFD09581529E93D6 /* A03 = -2.591250712233067465817e-01.  */
-	.quad	0xBF4164E26167882B /* A00 = -5.308251737086202562063e-04.  */
-	.quad	0x3FF01FEF14B62B81 /* A01 = +1.007796364693348545316e+00.  */
-	.quad	0xBFA4EB014538AA42 /* A02 = -4.085544557559163403315e-02.  */
-	.quad	0xBFD029D36FEAF41F /* A03 = -2.525528519580024222613e-01.  */
-	.quad	0xBF46F6FFF4E53DC8 /* A00 = -7.008313930700277652464e-04.  */
-	.quad	0x3FF027CBB51CBBA0 /* A01 = +1.009715754956893363214e+00.  */
-	.quad	0xBFA89DEC9FEC112E /* A02 = -4.807986690687680864098e-02.  */
-	.quad	0xBFCF2A99464D0DB4 /* A03 = -2.434875100390009317053e-01.  */
-	.quad	0xBF4DCC9C4F66A4D9 /* A00 = -9.094012482836712945103e-04.  */
-	.quad	0x3FF030E7CFCCD583 /* A01 = +1.011939822882909068014e+00.  */
-	.quad	0xBFACAA3B95814081 /* A02 = -5.598627281199331645611e-02.  */
-	.quad	0xBFCDF78F156BE7CF /* A03 = -2.341173987004467604844e-01.  */
-	.quad	0xBF5308ED74E5C7A6 /* A00 = -1.161796466103906435435e-03.  */
-	.quad	0x3FF03B5986412ECB /* A01 = +1.014489674026594512313e+00.  */
-	.quad	0xBFB087EBA88DCC3F /* A02 = -6.457398285947223148806e-02.  */
-	.quad	0xBFCCBB9BD134862F /* A03 = -2.244753619680052991736e-01.  */
-	.quad	0xBF57FA23C00DF4B5 /* A00 = -1.463446533505758208674e-03.  */
-	.quad	0x3FF0473558A1BCC0 /* A01 = +1.017384859292903342975e+00.  */
-	.quad	0xBFB2E702BC6360EF /* A02 = -7.383744334527241048871e-02.  */
-	.quad	0xBFCB77D546379288 /* A03 = -2.145945160729250122955e-01.  */
-	.quad	0xBF5DD12971557F71 /* A00 = -1.819887610814388068450e-03.  */
-	.quad	0x3FF0548DDF5000A8 /* A01 = +1.020643112482540360020e+00.  */
-	.quad	0xBFB571B63DA186E1 /* A02 = -8.376635555898871710045e-02.  */
-	.quad	0xBFCA2D5202605148 /* A03 = -2.045080672838912594358e-01.  */
-	.quad	0xBF6252B1AD5D4F17 /* A00 = -2.236697221556737096709e-03.  */
-	.quad	0x3FF063738A910BF7 /* A01 = +1.024280110622155737232e+00.  */
-	.quad	0xBFB8270C8E6B601B /* A02 = -9.434584118878357184013e-02.  */
-	.quad	0xBFC8DD27D950A07E /* A03 = -1.942491351230763441116e-01.  */
-	.quad	0xBF66470C91730CFC /* A00 = -2.719425723258004842786e-03.  */
-	.quad	0x3FF073F468FCF331 /* A01 = +1.028309259519300633556e+00.  */
-	.quad	0xBFBB05C2952191E4 /* A02 = -1.055566419686964629854e-01.  */
-	.quad	0xBFC7886A770DE2BD /* A03 = -1.838505822486435070662e-01.  */
-	.quad	0xBF6AD114AC8E98EC /* A00 = -3.273525599485007861467e-03.  */
-	.quad	0x3FF0861BF53E5226 /* A01 = +1.032741506559554434119e+00.  */
-	.quad	0xBFBE0C4F9B461507 /* A02 = -1.173753503881763554650e-01.  */
-	.quad	0xBFC6302A037CDE3A /* A03 = -1.733448521642786954722e-01.  */
-	.quad	0xBF6FFBDE2A6C2AF8 /* A00 = -3.904279630096648551207e-03.  */
-	.quad	0x3FF099F2EB8E7DA3 /* A01 = +1.037585182326304034106e+00.  */
-	.quad	0xBFC09C74D192DDF0 /* A02 = -1.297746680554463516444e-01.  */
-	.quad	0xBFC4D571D8E3079F /* A03 = -1.627638157861470424859e-01.  */
-	.quad	0xBF72E8FDC0B952AA /* A00 = -4.616728994353872309042e-03.  */
-	.quad	0x3FF0AF7F273C9533 /* A01 = +1.042845872181101141152e+00.  */
-	.quad	0xBFC244C512736F10 /* A02 = -1.427236881344176033792e-01.  */
-	.quad	0xBFC379474F58B902 /* A03 = -1.521386277613104298645e-01.  */
-	.quad	0xBF762EABAF17395B /* A00 = -5.415602341101023557701e-03.  */
-	.quad	0x3FF0C6C3886F63FB /* A01 = +1.048526318502125631582e+00.  */
-	.quad	0xBFC3FDF9918EA12A /* A02 = -1.561881981590514389957e-01.  */
-	.quad	0xBFC21CA89ECAB895 /* A03 = -1.414995932913753196036e-01.  */
-	.quad	0xBF79D387CE5B2BAE /* A00 = -6.305246822828998107258e-03.  */
-	.quad	0x3FF0DFBFE2346376 /* A01 = +1.054626353847394337748e+00.  */
-	.quad	0xBFC5C6DA43602620 /* A02 = -1.701309994680721970894e-01.  */
-	.quad	0xBFC0C08BD8DB6631 /* A03 = -1.308760460731704100557e-01.  */
-	.quad	0xBF7DDBA8E8DA9060 /* A00 = -7.289562037531366334164e-03.  */
-	.quad	0x3FF0FA70F0D1B464 /* A01 = +1.061142864894713433443e+00.  */
-	.quad	0xBFC79E18D92BAA7C /* A02 = -1.845122394946264732241e-01.  */
-	.quad	0xBFBECBBBF74C2669 /* A03 = -1.202962378266875381749e-01.  */
-	.quad	0xBF81254E76EA25DA /* A00 = -8.371937755572145950511e-03.  */
-	.quad	0x3FF116D05835EBD0 /* A01 = +1.068069786618014660462e+00.  */
-	.quad	0xBFC982539E2ED224 /* A02 = -1.992897531869327609755e-01.  */
-	.quad	0xBFBC1B043C350159 /* A03 = -1.097872397413132278254e-01.  */
-	.quad	0xBF8391ACBA863403 /* A00 = -9.555196230190082448686e-03.  */
-	.quad	0x3FF134D4AA477FE2 /* A01 = +1.075398125794884141015e+00.  */
-	.quad	0xBFCB7218609FEAFB /* A02 = -2.144194099235717521079e-01.  */
-	.quad	0xBFB970A16CB88329 /* A03 = -9.937485603633135211599e-02.  */
-	.quad	0xBF87935088E48E8B /* A00 = -1.151144902957603431692e-02.  */
-	.quad	0x3FF1649892AD7DD3 /* A01 = +1.087059567413110938716e+00.  */
-	.quad	0xBFCE6971DDE75409 /* A02 = -2.375929196847723912089e-01.  */
-	.quad	0xBFB58291E88CB251 /* A03 = -8.402358939628952472223e-02.  */
-	.quad	0xBF8DB3A62C325325 /* A00 = -1.450280973794233242702e-02.  */
-	.quad	0x3FF1A9C900C6DEEA /* A01 = +1.103951457056548068891e+00.  */
-	.quad	0xBFD13DBC65B0E08E /* A02 = -2.693930619311765140012e-01.  */
-	.quad	0xBFB06696F62696D1 /* A03 = -6.406539449252625362252e-02.  */
-	.quad	0xBF92583699F2E27A /* A00 = -1.791463198307716858659e-02.  */
-	.quad	0x3FF1F451B85AA9F0 /* A01 = +1.122148246892376022288e+00.  */
-	.quad	0xBFD34FD5F8288180 /* A02 = -3.017477916164565954205e-01.  */
-	.quad	0xBFA6FB692825B683 /* A03 = -4.488686194495718900788e-02.  */
-	.quad	0xBF9641C26E673D6F /* A00 = -2.173522757385398448959e-02.  */
-	.quad	0x3FF24364DA5E2B07 /* A01 = +1.141453602790251542487e+00.  */
-	.quad	0xBFD564A5A5EF5890 /* A02 = -3.342680092295120530821e-01.  */
-	.quad	0xBF9B43712011A982 /* A03 = -2.662445791467283467968e-02.  */
-	.quad	0xBF9A901038EC2F39 /* A00 = -2.594018313816024226548e-02.  */
-	.quad	0x3FF2961356DFFEBA /* A01 = +1.161639537196534011088e+00.  */
-	.quad	0xBFD775EBB17198C7 /* A02 = -3.665723069046972759644e-01.  */
-	.quad	0xBF833B1A926CD462 /* A03 = -9.390075295963199591975e-03.  */
-	.quad	0xBF9F396A6A461B91 /* A00 = -3.049246095317987084727e-02.  */
-	.quad	0x3FF2EB53BAEF534B /* A01 = +1.182452898229899629357e+00.  */
-	.quad	0xBFD97DABF8AD8BBD /* A02 = -3.982953957076310058660e-01.  */
-	.quad	0x3F7B8F6A3E0F8837 /* A03 = +6.728568086119371925713e-03.  */
-	.quad	0xBFA21878590F8BAA /* A00 = -3.534294211546946951064e-02.  */
-	.quad	0x3FF34209790236E1 /* A01 = +1.203622315111197105253e+00.  */
-	.quad	0xBFDB764C0E71BECB /* A02 = -4.290952817018306997277e-01.  */
-	.quad	0x3F962FE0C03F84C0 /* A03 = +2.166701482190513949888e-02.  */
-	.quad	0xBFA4B36B9AD27ECC /* A00 = -4.043136849327097492868e-02.  */
-	.quad	0x3FF3990C5B12FC16 /* A01 = +1.224865298994477935679e+00.  */
-	.quad	0xBFDD5AABB0D01390 /* A02 = -4.586590983092770912322e-01.  */
-	.quad	0x3FA21DAF5CA162DB /* A03 = +3.538272863142363083844e-02.  */
-	.quad	0xBFA7645E4D7BF28B /* A00 = -4.568762489177399105378e-02.  */
-	.quad	0x3FF3EF2FD51C0D9F /* A01 = +1.245895225962932562069e+00.  */
-	.quad	0xBFDF26377E1B686E /* A02 = -4.867075664057044503963e-01.  */
-	.quad	0x3FA8803E756EE812 /* A03 = +4.785342391501513914509e-02.  */
-	.quad	0xBFAA210925C64413 /* A00 = -5.103329263796054643398e-02.  */
-	.quad	0x3FF44349F897D8E7 /* A01 = +1.266427966181760345066e+00.  */
-	.quad	0xBFE06A7B02C6D8E2 /* A02 = -5.129981092675530707226e-01.  */
-	.quad	0x3FAE3F194734F5D0 /* A03 = +5.907515520309980505687e-02.  */
-	.quad	0xBFACDE48F8A19BBB /* A00 = -5.638340029764018351832e-02.  */
-	.quad	0x3FF49439D5466582 /* A01 = +1.286187966447272845727e+00.  */
-	.quad	0xBFE131C7C1063DDC /* A02 = -5.373266954429101183166e-01.  */
-	.quad	0x3FB1ADEEC36AD805 /* A03 = +6.906025191241844940482e-02.  */
-	.quad	0xBFAF905D8F585680 /* A00 = -6.164829611604449866036e-02.  */
-	.quad	0x3FF4E0ED1FD27F99 /* A01 = +1.304913639360142818546e+00.  */
-	.quad	0xBFE1E7A859DC1D3D /* A02 = -5.595285182070380836095e-01.  */
-	.quad	0x3FB3ED018E4642A1 /* A03 = +7.783517573831001679086e-02.  */
-	.quad	0xBFB11595104160BA /* A00 = -6.673556944713512906198e-02.  */
-	.quad	0x3FF528650340490B /* A01 = +1.322361958217302513319e+00.  */
-	.quad	0xBFE28B14B40BC974 /* A02 = -5.794776455425521000109e-01.  */
-	.quad	0x3FB5DF49F5BAF6D7 /* A03 = +8.543836831355676453281e-02.  */
-	.quad	0xBFB2513A97344BA4 /* A00 = -7.155195418844911836587e-02.  */
-	.quad	0x3FF569BA0DB5EE14 /* A01 = +1.338312200124055273420e+00.  */
-	.quad	0xBFE31B53A8B67B20 /* A02 = -5.970857901737396389308e-01.  */
-	.quad	0x3FB787F297BB0544 /* A03 = +9.191814617499455275507e-02.  */
-	.quad	0xBFB37512E848FAFA /* A00 = -7.600515528700305112331e-02.  */
-	.quad	0x3FF5A41F33B403C8 /* A01 = +1.352568819013173495591e+00.  */
-	.quad	0xBFE397F6EA9A58A5 /* A02 = -6.123003561103997904880e-01.  */
-	.quad	0x3FB8EAA9FF25CA06 /* A03 = +9.733068923177520814782e-02.  */
-	.quad	0xBFB47B3E603AFC5D /* A00 = -8.000554894805263217439e-02.  */
-	.quad	0x3FF5D6E3EDE40487 /* A01 = +1.364963464031718975988e+00.  */
-	.quad	0xBFE400D5BCA6D631 /* A02 = -6.251019177058819709103e-01.  */
-	.quad	0x3FBA0B830ED567FE /* A03 = +1.017381583418739132707e-01.  */
-	.quad	0xBFB5BBFE8AC90496 /* A00 = -8.489981544791400103200e-02.  */
-	.quad	0x3FF612BA70107E95 /* A01 = +1.379572332145390989311e+00.  */
-	.quad	0xBFE477EAF1FA7693 /* A02 = -6.396383978023599814478e-01.  */
-	.quad	0x3FBB4784B7C08A95 /* A03 = +1.065600346196709652391e-01.  */
-	.quad	0xBFB6D5D940743939 /* A00 = -8.920057128509463473254e-02.  */
-	.quad	0x3FF644A8748F70CE /* A01 = +1.391762214006166953340e+00.  */
-	.quad	0xBFE4D646AB07EA37 /* A02 = -6.511567440459832267763e-01.  */
-	.quad	0x3FBC354F4E1D5292 /* A03 = +1.101884427747086558913e-01.  */
-	.quad	0xBFB7223D19E4F3D1 /* A00 = -9.036619074045339206069e-02.  */
-	.quad	0x3FF6518FEB42B7FA /* A01 = +1.394912642466350494175e+00.  */
-	.quad	0xBFE4ED86CB87498C /* A02 = -6.539949393430091184598e-01.  */
-	.quad	0x3FBC6D29F28CCA9B /* A03 = +1.110407082713131127205e-01.  */
-	.quad	0xBFB6878652FF6312 /* A00 = -8.800544287022329936754e-02.  */
-	.quad	0x3FF63948C302D040 /* A01 = +1.388985406648330922508e+00.  */
-	.quad	0xBFE4C4E2E7904E17 /* A02 = -6.490339777687407218920e-01.  */
-	.quad	0x3FBC127356CA1ABE /* A03 = +1.096565329445224612481e-01.  */
-	.quad	0xBFB4F5D18B0C91D6 /* A00 = -8.187589306596207427980e-02.  */
-	.quad	0x3FF5FD27EB7DD0B8 /* A01 = +1.374305648697413673176e+00.  */
-	.quad	0xBFE464E01A2B2FC6 /* A02 = -6.373138915164353601739e-01.  */
-	.quad	0x3FBB460547674A30 /* A03 = +1.065371798825160976065e-01.  */
-	.quad	0xBFB26642FA16A685 /* A00 = -7.187288861919156890412e-02.  */
-	.quad	0x3FF59F9BEDE1C95A /* A01 = +1.351467065073470141812e+00.  */
-	.quad	0xBFE3D67920C8FBEA /* A02 = -6.199308052381387046381e-01.  */
-	.quad	0x3FBA24F6A8D3CBC1 /* A03 = +1.021265184570401413078e-01.  */
-	.quad	0xBFADB5294794F097 /* A00 = -5.802277563859197656582e-02.  */
-	.quad	0x3FF523EA7B9CF453 /* A01 = +1.321268542159732772845e+00.  */
-	.quad	0xBFE322A8B55E35DB /* A02 = -5.979808370918208160205e-01.  */
-	.quad	0x3FB8C8673B1B3E37 /* A03 = +9.680791085269722928697e-02.  */
-	.quad	0xBFA4B7D661965C6A /* A00 = -4.046506825687219699450e-02.  */
-	.quad	0x3FF48DE3E2CE3122 /* A01 = +1.284641157110919085227e+00.  */
-	.quad	0xBFE251FED1A7F445 /* A02 = -5.725092024655472622285e-01.  */
-	.quad	0x3FB745699FCABDB9 /* A03 = +9.090290213747821701507e-02.  */
-	.quad	0xBF93E60456E4EE1D /* A00 = -1.943213253365004902773e-02.  */
-	.quad	0x3FF3E1A14E628A59 /* A01 = +1.242585474196536532432e+00.  */
-	.quad	0xBFE16C5AB660E876 /* A02 = -5.444768488007543094653e-01.  */
-	.quad	0x3FB5AD33AA8C188F /* A03 = +8.467410005332197397987e-02.  */
-	.quad	0x3F738C17C47C7961 /* A00 = +4.772274820224659853951e-03.  */
-	.quad	0x3FF3234DDE3BD146 /* A01 = +1.196119182682268355933e+00.  */
-	.quad	0xBFE078C0D77A9D3B /* A02 = -5.147403915952176722826e-01.  */
-	.quad	0x3FB40D74B3E276B8 /* A03 = +7.833032027925923568290e-02.  */
-	.quad	0x3FA0474BECC689C7 /* A00 = +3.179394975019849550746e-02.  */
-	.quad	0x3FF256FB4FA7D18A /* A01 = +1.146235762743432307076e+00.  */
-	.quad	0xBFDEFA8E3FB285E2 /* A02 = -4.840427038235174395098e-01.  */
-	.quad	0x3FB270C007493D59 /* A03 = +7.203293016322244446403e-02.  */
-	.quad	0x3FAF5BD51E479BDC /* A00 = +6.124750132203590768931e-02.  */
-	.quad	0x3FF18081D0B53BC5 /* A01 = +1.093873801484492647162e+00.  */
-	.quad	0xBFDCFE2439BD0C03 /* A02 = -4.530115665294831006626e-01.  */
-	.quad	0x3FB0DEFE5A45AFDD /* A03 = +6.590261176978580437424e-02.  */
-	.quad	0x3FB7BD5D2806EA26 /* A00 = +9.273321368429118805032e-02.  */
-	.quad	0x3FF0A369E35B4440 /* A01 = +1.039895904647224256223e+00.  */
-	.quad	0xBFDB04BC5C9951E7 /* A02 = -4.221640495573226181669e-01.  */
-	.quad	0x3FAEBBBAA9D6DEEF /* A03 = +6.002600978120919278380e-02.  */
-	.quad	0x3FC01BE411098DBC /* A00 = +1.258511622610124502941e-01.  */
-	.quad	0x3FEF85BDABC031C1 /* A01 = +9.850757936961188621083e-01.  */
-	.quad	0xBFD91521375097C2 /* A02 = -3.919146576102968682065e-01.  */
-	.quad	0x3FABE26F0086D982 /* A03 = +5.446192628317005068883e-02.  */
-	.quad	0x3FC481D7FF5776B9 /* A00 = +1.602125164781023347604e-01.  */
-	.quad	0x3FEDC3506C1E7218 /* A01 = +9.300920592973538347792e-01.  */
-	.quad	0xBFD7349A88DA7D4F /* A02 = -3.625856720409119104964e-01.  */
-	.quad	0x3FA936E2DFF8E2AE /* A03 = +4.924687370334389358018e-02.  */
-	.quad	0x3FC90471F96FA27A /* A00 = +1.954481571149420671141e-01.  */
-	.quad	0x3FEC0451601987A2 /* A01 = +8.755270840595026360376e-01.  */
-	.quad	0xBFD5671CD4B898DC /* A02 = -3.344184949259110251063e-01.  */
-	.quad	0x3FA6BB9594603B67 /* A03 = +4.439990459660841243261e-02.  */
-	.quad	0x3FCFD8ADB9ED944C /* A00 = +2.488000066615846384011e-01.  */
-	.quad	0x3FE978C073F6809A /* A01 = +7.959902062321078108909e-01.  */
-	.quad	0xBFD2DF7E00BCD5A9 /* A02 = -2.948908812716931060471e-01.  */
-	.quad	0x3FA3614033D490B2 /* A03 = +3.785133965200894456959e-02.  */
-	.quad	0x3FD4846A12AFE5A0 /* A00 = +3.205819303981005674586e-01.  */
-	.quad	0x3FE63A1147D40472 /* A01 = +6.945883181471244061100e-01.  */
-	.quad	0xBFCFA2268AD34450 /* A02 = -2.471359422548027318101e-01.  */
-	.quad	0x3F9F150201D9FFE0 /* A03 = +3.035357605267552383310e-02.  */
-	.quad	0x3FD9018641F82BEB /* A00 = +3.907180446846598154131e-01.  */
-	.quad	0x3FE33B7C220FFBDC /* A01 = +6.010113396913498995389e-01.  */
-	.quad	0xBFCA4E4187E29C86 /* A02 = -2.055131829740483584423e-01.  */
-	.quad	0x3F98C30CED19F8F4 /* A03 = +2.418155858185229434287e-02.  */
-	.quad	0x3FDD4B8255BEB078 /* A00 = +4.577337109901757905561e-01.  */
-	.quad	0x3FE0858B19D3A49B /* A01 = +5.163016800335243905451e-01.  */
-	.quad	0xBFC5BC929EACE564 /* A02 = -1.698172831327539045176e-01.  */
-	.quad	0x3F93A083CE57DE2B /* A03 = +1.916700312537337677621e-02.  */
-	.quad	0x3FE0A8E5E039295C /* A00 = +5.206174258576470315063e-01.  */
-	.quad	0x3FDC35E1234583FE /* A01 = +4.407885403107342225937e-01.  */
-	.quad	0xBFC1DE034E31AEB9 /* A02 = -1.395877963835710222629e-01.  */
-	.quad	0x3F8EFDEBB3471BDC /* A03 = +1.513275280821162888101e-02.  */
-	.quad	0x3FE2851B603CB2A5 /* A00 = +5.787484054213406503564e-01.  */
-	.quad	0x3FD7F4A44ABBB286 /* A01 = +3.743067483726821853551e-01.  */
-	.quad	0xBFBD3EEB67087DE7 /* A02 = -1.142413260026767657385e-01.  */
-	.quad	0x3F8864F38329E8BD /* A03 = +1.191129917173260922836e-02.  */
-	.quad	0x3FE437DBE3C34AC1 /* A00 = +6.318187187665317283702e-01.  */
-	.quad	0x3FD43F6F789441B5 /* A01 = +3.163717916040938438194e-01.  */
-	.quad	0xBFB7D92E7901B9A4 /* A02 = -9.315767721429907277653e-02.  */
-	.quad	0x3F8327ED342308E1 /* A03 = +9.353497651663324544136e-03.  */
-	.quad	0x3FE5C0977766D55C /* A00 = +6.797597248138731451661e-01.  */
-	.quad	0x3FD10B42A764D8F9 /* A01 = +2.663122782427219115142e-01.  */
-	.quad	0xBFB3633351D3D70F /* A02 = -7.573242900602060456716e-02.  */
-	.quad	0x3F7E079E30FF899C /* A03 = +7.331483779099558922843e-03.  */
-	.quad	0x3FE7202CE08A88C4 /* A00 = +7.226776490754436288455e-01.  */
-	.quad	0x3FCC973EB5662B01 /* A01 = +2.233656297433626314319e-01.  */
-	.quad	0xBFAF70A455F9920B /* A02 = -6.140626477716545211782e-02.  */
-	.quad	0x3F77812411CE99B6 /* A03 = +5.738392731393584730859e-03.  */
-	.quad	0x3FE85879424095B1 /* A00 = +7.608000082006382003286e-01.  */
-	.quad	0x3FC7E73BD1674D84 /* A01 = +1.867441914060742336190e-01.  */
-	.quad	0xBFA96F84E4BF333B /* A02 = -4.967894832916504993525e-02.  */
-	.quad	0x3F72606DDCA6E117 /* A03 = +4.486493251924870105662e-03.  */
-	.quad	0x3FE96BFE4957F4DD /* A00 = +7.944327766887472330737e-01.  */
-	.quad	0x3FC3ED4780D25478 /* A01 = +1.556786898624158421711e-01.  */
-	.quad	0xBFA489C5F9A56B58 /* A02 = -4.011362717093075458408e-02.  */
-	.quad	0x3F6CB5DC17E9AD2A /* A03 = +3.504686231556104931972e-03.  */
-	.quad	0x3FEA5D9CB2F41234 /* A00 = +8.239272589858672724006e-01.  */
-	.quad	0x3FC091A758374DCF /* A01 = +1.294449978582705440555e-01.  */
-	.quad	0xBFA08E436D4B5CE0 /* A02 = -3.233538350257858517978e-02.  */
-	.quad	0x3F666997AD53E6B7 /* A03 = +2.735897297154145629133e-03.  */
-	.quad	0x3FEB3060342CB850 /* A00 = +8.496552485501158713532e-01.  */
-	.quad	0x3FBB7D30BBC7DC1B /* A01 = +1.073790033768634993860e-01.  */
-	.quad	0xBF9AA6BA3443D9E3 /* A02 = -2.602663940430173170060e-02.  */
-	.quad	0x3F617CA764B7850B /* A03 = +2.134634914668814050648e-03.  */
-	.quad	0x3FEBE759A6A0C7B8 /* A00 = +8.719909910635044170135e-01.  */
-	.quad	0x3FB6C10DE6A703FF /* A01 = +8.888327485239243264115e-02.  */
-	.quad	0xBF956C566D8BE1F6 /* A02 = -2.092108768099084498138e-02.  */
-	.quad	0x3F5B46D1A4A59CF8 /* A03 = +1.664833764687232917079e-03.  */
-	.quad	0x3FEC858494887A04 /* A00 = +8.912985707318630268503e-01.  */
-	.quad	0x3FB2CC31F543394D /* A01 = +7.342827070099140762682e-02.  */
-	.quad	0xBF9133477FF69137 /* A02 = -1.679717749142747504343e-02.  */
-	.quad	0x3F5544482FBB4DA5 /* A03 = +1.298017973501022466823e-03.  */
-	.quad	0x3FED0DB59D0E32E9 /* A00 = +9.079235141267335551518e-01.  */
-	.quad	0x3FAF006BAFFC6EF4 /* A01 = +6.055008433597022787787e-02.  */
-	.quad	0xBF8B97146FA2B97A /* A02 = -1.347175565419144252499e-02.  */
-	.quad	0x3F5093B01F4CDC69 /* A03 = +1.011774057770665211434e-03.  */
-	.quad	0x3FEDB487C3EC457C /* A00 = +9.282873942012623835751e-01.  */
-	.quad	0x3FA7390C09D0BD1D /* A01 = +4.535710925881118044112e-02.  */
-	.quad	0xBF83D9F7C3181106 /* A02 = -9.693084374710735778846e-03.  */
-	.quad	0x3F46E34A0A3C0E64 /* A03 = +6.984817050299072134500e-04.  */
-	.quad	0x3FEE5FFCB4E6EB00 /* A00 = +9.492171796076434020506e-01.  */
-	.quad	0x3F9F4913ED00AADF /* A01 = +3.055220731782070861526e-02.  */
-	.quad	0xBF79670BD0E59B5C /* A02 = -6.201788097633133961528e-03.  */
-	.quad	0x3F3BC998EBCAF96D /* A03 = +4.240034429975534616304e-04.  */
-	.quad	0x3FEEDBA41E9542FE /* A00 = +9.643116566968215064293e-01.  */
-	.quad	0x3F94F5DD18D9C24D /* A01 = +2.046914543319848858727e-02.  */
-	.quad	0xBF7034896AA122B9 /* A02 = -3.956352980886528904192e-03.  */
-	.quad	0x3F30DCCB47810B39 /* A03 = +2.573009765038273091199e-04.  */
-	.quad	0x3FEF33F2882520ED /* A00 = +9.750912341196716903724e-01.  */
-	.quad	0x3F8BF37F2CF553FF /* A01 = +1.364802699996836392315e-02.  */
-	.quad	0xBF649F6F05A69619 /* A02 = -2.517430152880317534986e-03.  */
-	.quad	0x3F247623C950AAC9 /* A03 = +1.561087307505231250044e-04.  */
-	.quad	0x3FEF727757751741 /* A00 = +9.827229221489021115943e-01.  */
-	.quad	0x3F828E67912C4400 /* A01 = +9.060677640748693306705e-03.  */
-	.quad	0xBF5A2F51A806CC2C /* A02 = -1.598195784123355826789e-03.  */
-	.quad	0x3F18D35D7687E613 /* A03 = +9.470231965016282719549e-05.  */
-	.quad	0x3FEF9E6325C5942A /* A00 = +9.880843866091073568469e-01.  */
-	.quad	0x3F788AB117618F76 /* A01 = +5.991641772286606867914e-03.  */
-	.quad	0xBF5096EAB0B1EA89 /* A02 = -1.012543859160305046233e-03.  */
-	.quad	0x3F0E1E50EC4435AB /* A03 = +5.744633156910412119652e-05.  */
-	.quad	0x3FEFBD0784049369 /* A00 = +9.918248728250605994461e-01.  */
-	.quad	0x3F702BBD8294035F /* A01 = +3.947963975634432264028e-03.  */
-	.quad	0xBF44FB55E0F00593 /* A02 = -6.403130845457509273330e-04.  */
-	.quad	0x3F0244DCD723230A /* A03 = +3.484534217219031730379e-05.  */
-	.quad	0x3FEFD245E2366A43 /* A00 = +9.944180887426415926811e-01.  */
-	.quad	0x3F653D82EC088433 /* A01 = +2.592807490387838333795e-03.  */
-	.quad	0xBF3A7DF75E013CB8 /* A02 = -4.042366908878036561859e-04.  */
-	.quad	0x3EF6298E69F991CD /* A03 = +2.113564425911141559972e-05.  */
-	.quad	0x3FEFE0EAA508BC69 /* A00 = +9.962056372950317539861e-01.  */
-	.quad	0x3F5BD0771AF3FDDA /* A01 = +1.697651208644282514598e-03.  */
-	.quad	0xBF30B2E1254DE571 /* A02 = -2.548026725928887099328e-04.  */
-	.quad	0x3EEAE28B70EC0256 /* A03 = +1.281973848454955042307e-05.  */
-	.quad	0x3FEFEAF5303D7F96 /* A00 = +9.974313680831865536192e-01.  */
-	.quad	0x3F5229111365657E /* A01 = +1.108423877289460134782e-03.  */
-	.quad	0xBF250572D04DFE66 /* A02 = -1.603796628408704519168e-04.  */
-	.quad	0x3EE04E89BB57C981 /* A03 = +7.775682983689149966743e-06.  */
-	.quad	0x3FEFF1CF52F1CF44 /* A00 = +9.982678051005469122003e-01.  */
-	.quad	0x3F47A71316147CEB /* A01 = +7.218211359577819110842e-04.  */
-	.quad	0xBF1A6D7604055719 /* A02 = -1.008132248946049582547e-04.  */
-	.quad	0x3ED3C8047586A85C /* A03 = +4.716233739913014633626e-06.  */
-	.quad	0x3FEFF6770369EF69 /* A00 = +9.988360468555416149528e-01.  */
-	.quad	0x3F3EBB261180FBF0 /* A01 = +4.689186039321105101130e-04.  */
-	.quad	0xBF1097754FE19D7F /* A02 = -6.329206004950480057066e-05.  */
-	.quad	0x3EC7FEFF83BCA0A7 /* A03 = +2.860556404988488738366e-06.  */
-	.quad	0x3FEFF99D42371AC4 /* A00 = +9.992204945818561334647e-01.  */
-	.quad	0x3F33EB2AEC271F59 /* A01 = +3.039340773764907474054e-04.  */
-	.quad	0xBF04CF18E0FC0D79 /* A02 = -3.968996690952969588805e-05.  */
-	.quad	0x3EBD1BDBD6019BE9 /* A03 = +1.735021065507727833886e-06.  */
-	.quad	0x3FEFFBBCA32B0D91 /* A00 = +9.994795977476532700123e-01.  */
-	.quad	0x3F29C41E1615110A /* A01 = +1.965796209707565346710e-04.  */
-	.quad	0xBEFA11F93D9DCB5A /* A02 = -2.486248909101414873235e-05.  */
-	.quad	0x3EB1A7CA4546F7A7 /* A03 = +1.052345642723709228769e-06.  */
-	.quad	0x3FEFFD298B8E8DE2 /* A00 = +9.996535993308806045121e-01.  */
-	.quad	0x3F20A1C42D523C5B /* A01 = +1.268913244172078754520e-04.  */
-	.quad	0xBEF0507A364AFAE4 /* A02 = -1.555859070622834605755e-05.  */
-	.quad	0x3EA56ACA17E7CDF4 /* A03 = +6.382806956848098872313e-07.  */
-	.quad	0x3FEFFE1DC82BA5A3 /* A00 = +9.997700604991915929176e-01.  */
-	.quad	0x3F156E73B90F1769 /* A01 = +8.175450626798714452801e-05.  */
-	.quad	0xBEE4663579D0A09F /* A02 = -9.727122057226747625365e-06.  */
-	.quad	0x3E99FAF6FEC5D4C1 /* A03 = +3.871371052824002996020e-07.  */
-	.quad	0x3FEFFEF8D0BB5E81 /* A00 = +9.998745037837154514548e-01.  */
-	.quad	0x3F06686DA18D39C3 /* A01 = +4.273972098777251447726e-05.  */
-	.quad	0xBED46BC298073E90 /* A02 = -4.868731025855742842491e-06.  */
-	.quad	0x3E88E42286B9D0FD /* A03 = +1.854535328530838170114e-07.  */
-	.quad	0x3FEFFF8DBC68DDC7 /* A00 = +9.999455146670975791423e-01.  */
-	.quad	0x3EF26B2953A80AF0 /* A01 = +1.756534514108903368909e-05.  */
-	.quad	0xBEBFC4472D580F83 /* A02 = -1.893443529411295465239e-06.  */
-	.quad	0x3E72505B4553D19F /* A03 = +6.822456673547912277047e-08.  */
-	.quad	0x3FEFFFCED1276609 /* A00 = +9.999765477215883935358e-01.  */
-	.quad	0x3EDE1A94C7CC58F5 /* A01 = +7.177313020153979672606e-06.  */
-	.quad	0xBEA8A2C988744E57 /* A02 = -7.342066660497443762363e-07.  */
-	.quad	0x3E5AF30036BBBAF4 /* A03 = +2.509841882843541084885e-08.  */
-	.quad	0x3FEFFFEAFE70FCFC /* A00 = +9.999899835164849370983e-01.  */
-	.quad	0x3EC879175E3549F5 /* A01 = +2.917410471128503564412e-06.  */
-	.quad	0xBE930E36677D1813 /* A02 = -2.839493400307523115929e-07.  */
-	.quad	0x3E43D4005B42D48F /* A03 = +9.233192745401904898013e-09.  */
-	.quad	0x3ff0000000000000
-	.quad	0x0000000000000000
-	.quad	0x0000000000000000
-	.quad	0x0000000000000000
-
-	.align	32
-	.type	__svml_stanh_data_internal_avx2, @object
-	.size	__svml_stanh_data_internal_avx2, .-__svml_stanh_data_internal_avx2
-	.type	__svml_stanh_data_internal, @object
-	.size	__svml_stanh_data_internal, .-__svml_stanh_data_internal
-#endif
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf_rodata.h.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf_rodata.h.S
new file mode 100644
index 0000000000..3634950e97
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf_rodata.h.S
@@ -0,0 +1,591 @@
+/* Datatables for tanhf_{avx2,sse4}
+   Copyright (C) 2021-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   https://www.gnu.org/licenses/.  */
+
+#include "svml_common_data_macros.h.S"
+
+#if (defined AVX2_SHARED_OFFSETS) || (defined AVX2_SHARED_TABLE)
+
+/* Offsets are ordered by use in the function. On cold-starts this
+   might help the prefetcher. If the streaming prefetchers kick in it
+   will prefetch into the lookup table.  */
+# define _iExpMantMask	0
+# define _iMinIdxOfsMask	32
+# define _iMaxIdxMask	64
+# define _iExpMask	96
+# define _lookupTable	128
+
+# define AVX2_SHARED_DATA(offset)		((offset) + __svml_stanh_common_data_avx2)
+
+# if (defined AVX2_SHARED_TABLE)
+
+	.section .rodata.avx2, "a"
+
+	.align	32
+	.globl	__svml_stanh_common_data_avx2
+__svml_stanh_common_data_avx2:
+	/* _iExpMantMask.  */
+	float_vector32 (__svml_stanh_common_data_avx2, _iExpMantMask, 0x7ff80000)
+	float_vector32 (__svml_stanh_common_data_avx2, _iMinIdxOfsMask, 0x3cf80000)
+	float_vector32 (__svml_stanh_common_data_avx2, _iMaxIdxMask, 0x04280000)
+	float_vector32 (__svml_stanh_common_data_avx2, _iExpMask, 0x7f000000)
+
+	/* _lookupTable.  */
+	/* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500].  */
+	float_block (__svml_stanh_common_data_avx2, _lookupTable,
+		0x00000000, 0x00000000,	/* A00 = +0.000000000000000000000e-01. */
+		0x022C70EB, 0x3FF00000,	/* A01 = +1.000000008097283510367e+00. */
+		0x8CFFA194, 0xBED00E87,	/* A02 = -3.828228912518614443549e-06. */
+		0x6D0607A9, 0xBFD55176,	/* A03 = -3.330970825846813476723e-01. */
+		0xE3E4C297, 0xBE53D60C,	/* A00 = -1.847383956330407336230e-08. */
+		0x4177CF5C, 0x3FF00002,	/* A01 = +1.000002151235967140508e+00. */
+		0x94A51A25, 0xBF1758BC,	/* A02 = -8.906031613262943753568e-05. */
+		0x67E0D4F0, 0xBFD53EAE,	/* A03 = -3.319507612644221339337e-01. */
+		0xEF32D6FE, 0xBE5A9E47,	/* A00 = -2.479020984039698285657e-08. */
+		0xDA983057, 0x3FF00002,	/* A01 = +1.000002721676556793895e+00. */
+		0x509E94AA, 0xBF1BD953,	/* A02 = -1.062352277175377670507e-04. */
+		0x562EEDD5, 0xBFD53BDB,	/* A03 = -3.317783681520414806876e-01. */
+		0xE496D294, 0xBE6191BB,	/* A00 = -3.272532162914017685901e-08. */
+		0x90492017, 0x3FF00003,	/* A01 = +1.000003398528866105366e+00. */
+		0x814A57CE, 0xBF20727E,	/* A02 = -1.254825043772153972919e-04. */
+		0x060A6F22, 0xBFD538DE,	/* A03 = -3.315959033004550748913e-01. */
+		0x2A893A25, 0xBE66DAFA,	/* A00 = -4.257146219278012568149e-08. */
+		0x65E08CD1, 0x3FF00004,	/* A01 = +1.000004194219219266770e+00. */
+		0x65EF91B6, 0xBF2341C7,	/* A02 = -1.469188600530365522261e-04. */
+		0x841FAF9E, 0xBFD535B6,	/* A03 = -3.314033785124993469751e-01. */
+		0xE361E964, 0xBE6D5794,	/* A00 = -5.465394929765249413434e-08. */
+		0x5EE2A0CB, 0x3FF00005,	/* A01 = +1.000005121846742950353e+00. */
+		0x77E66C8B, 0xBF265E6C,	/* A02 = -1.706607253709506650304e-04. */
+		0xDDCCEDA6, 0xBFD53264,	/* A03 = -3.312008062382240103361e-01. */
+		0x4D374A6E, 0xBE729C84,	/* A00 = -6.933284462462096107184e-08. */
+		0x7F019093, 0x3FF00006,	/* A01 = +1.000006195180536350264e+00. */
+		0x48D6DCE5, 0xBF29CC53,	/* A02 = -1.968242326435338705130e-04. */
+		0x2121ED35, 0xBFD52EE9,	/* A03 = -3.309881995734998416658e-01. */
+		0x17EAA872, 0xBE775AEA,	/* A00 = -8.700465590574974405858e-08. */
+		0xCA1D66B8, 0x3FF00007,	/* A01 = +1.000007428656699559610e+00. */
+		0xB98A2637, 0xBF2D8F5E,	/* A02 = -2.255252009216044881395e-04. */
+		0x5CDF9128, 0xBFD52B43,	/* A03 = -3.307655722585587376727e-01. */
+		0x28C343F0, 0xBE7D04DA,	/* A00 = -1.081040272327705484794e-07. */
+		0x4443CCF5, 0x3FF00009,	/* A01 = +1.000008837375216730337e+00. */
+		0x6C947AE5, 0xBF30D5B7,	/* A02 = -2.568791210978817814332e-04. */
+		0xA0776FAD, 0xBFD52773,	/* A03 = -3.305329386764651045105e-01. */
+		0xA12C51C7, 0xBE81DD77,	/* A00 = -1.331054169875768625701e-07. */
+		0xF1AFD2DA, 0x3FF0000A,	/* A01 = +1.000010437096696680470e+00. */
+		0x624C1680, 0xBF331230,	/* A02 = -2.910011410651516805537e-04. */
+		0xFC0B61DF, 0xBFD52379,	/* A03 = -3.302903138515186909352e-01. */
+		0xEEB3C435, 0xBE85D04E,	/* A00 = -1.625247628488202841012e-07. */
+		0xD6C9B1F2, 0x3FF0000C,	/* A01 = +1.000012244238970726684e+00. */
+		0x42FADDD4, 0xBF357F07,	/* A02 = -3.280060509313874068243e-04. */
+		0x806D0E81, 0xBFD51F56,	/* A03 = -3.300377134475880880338e-01. */
+		0x9B59681B, 0xBE8A6E28,	/* A00 = -1.969211333326924655065e-07. */
+		0xF8268F72, 0x3FF0000E,	/* A01 = +1.000014275873550406715e+00. */
+		0x7A1B747A, 0xBF381E27,	/* A02 = -3.680082682942575423093e-04. */
+		0x3F1D6FD4, 0xBFD51B09,	/* A03 = -3.297751537663746734808e-01. */
+		0x0EE9ABD5, 0xBE8FCBC4,	/* A00 = -2.368983653301529373887e-07. */
+		0x5A883B6C, 0x3FF00011,	/* A01 = +1.000016549721943981410e+00. */
+		0xC974B3D9, 0xBF3AF17A,	/* A02 = -4.111218235774406434303e-04. */
+		0x4A4C549C, 0xBFD51692,	/* A03 = -3.295026517456081105450e-01. */
+		0x60A3F956, 0xBE92FFBC,	/* A00 = -2.831066871072026054144e-07. */
+		0x02DCED8A, 0x3FF00014,	/* A01 = +1.000019084151832604590e+00. */
+		0x390C4801, 0xBF3DFAE9,	/* A02 = -4.574603454311488280083e-04. */
+		0xB4D7DC3A, 0xBFD511F1,	/* A03 = -3.292202249571719585575e-01. */
+		0x2F96D5AD, 0xBE9690A2,	/* A00 = -3.362443262393081632612e-07. */
+		0xF63EFF5D, 0x3FF00016,	/* A01 = +1.000021898173108825247e+00. */
+		0x839605BB, 0xBF409E2C,	/* A02 = -5.071370461992499986334e-04. */
+		0x924BEE00, 0xBFD50D27,	/* A03 = -3.289278916051614487515e-01. */
+		0x65E72A73, 0xBE9AA56C,	/* A00 = -3.970591019557469835586e-07. */
+		0x39F4A43E, 0x3FF0001A,	/* A01 = +1.000025011433776978009e+00. */
+		0x4C3D6667, 0xBF425BD7,	/* A02 = -5.602647074553602319844e-04. */
+		0xF6E1ABA2, 0xBFD50833,	/* A03 = -3.286256705238718156536e-01. */
+		0xFF1A83B0, 0xBE9F4BD4,	/* A00 = -4.663500013744687071912e-07. */
+		0xD36F9EC2, 0x3FF0001D,	/* A01 = +1.000028444215715683896e+00. */
+		0x34149405, 0xBF443766,	/* A02 = -6.169556656102642569831e-04. */
+		0xF77EDEE5, 0xBFD50316,	/* A03 = -3.283135811757190158922e-01. */
+		0x387BB079, 0xBEA3B625,	/* A00 = -5.874486399249461304297e-07. */
+		0xE14CFBA9, 0x3FF00023,	/* A01 = +1.000034217911642153709e+00. */
+		0x923218D2, 0xBF47392F,	/* A02 = -7.087213783883111826306e-04. */
+		0xACDEB938, 0xBFD4FB1F,	/* A03 = -3.278273761924483942209e-01. */
+		0xF543500A, 0xBEAA6E24,	/* A00 = -7.876828740601738750574e-07. */
+		0x5C6E8412, 0x3FF0002D,	/* A01 = +1.000043259679163742959e+00. */
+		0xBD7FDD70, 0xBF4BAF02,	/* A02 = -8.448375110664940040861e-04. */
+		0x6527A7DE, 0xBFD4EFEE,	/* A03 = -3.271442401734229177279e-01. */
+		0xBE2157D0, 0xBEB16E3E,	/* A00 = -1.038947396133402500647e-06. */
+		0x990FEE2F, 0x3FF00038,	/* A01 = +1.000053975962952312884e+00. */
+		0x81C574CB, 0xBF505694,	/* A02 = -9.972048056490652716971e-04. */
+		0x278DA2B4, 0xBFD4E419,	/* A03 = -3.264220129263251113372e-01. */
+		0x723165D4, 0xBEB6A7B6,	/* A00 = -1.350350836279403750524e-06. */
+		0xCAB4158E, 0x3FF00045,	/* A01 = +1.000066558657042303793e+00. */
+		0x9C849108, 0xBF531D7C,	/* A02 = -1.166698160951775212202e-03. */
+		0xBB33B152, 0xBFD4D7A0,	/* A03 = -3.256608799117844954552e-01. */
+		0xA8654AFD, 0xBEBD0EE2,	/* A00 = -1.732000471561702711532e-06. */
+		0x276F18D6, 0x3FF00055,	/* A01 = +1.000081209219890521211e+00. */
+		0xA3FB6C6C, 0xBF562FDB,	/* A02 = -1.354183666925102939860e-03. */
+		0xF1B93DB2, 0xBFD4CA85,	/* A03 = -3.248610363561638125773e-01. */
+		0x036A207E, 0xBEC269D4,	/* A00 = -2.195047297096822741730e-06. */
+		0xE7DA6E4E, 0x3FF00066,	/* A01 = +1.000098138500919997540e+00. */
+		0x9FC36B3A, 0xBF599149,	/* A02 = -1.560518167983372759405e-03. */
+		0xA72283D6, 0xBFD4BCC9,	/* A03 = -3.240226871658341556426e-01. */
+		0x6C09CFE1, 0xBEC7154B,	/* A00 = -2.751729738565190291276e-06. */
+		0x47086B80, 0x3FF0007B,	/* A01 = +1.000117566559055148900e+00. */
+		0x33B4F8F4, 0xBF5D4554,	/* A02 = -1.786548832412968197680e-03. */
+		0xC1BFE145, 0xBFD4AE6C,	/* A03 = -3.231460468373550942722e-01. */
+		0xC64A0F8A, 0xBECCA68C,	/* A00 = -3.415415948561670285790e-06. */
+		0x827742F7, 0x3FF00092,	/* A01 = +1.000139722473418535387e+00. */
+		0x15A527AF, 0xBF60A7BF,	/* A02 = -2.033112728132522705610e-03. */
+		0x3214084C, 0xBFD49F70,	/* A03 = -3.222313393636155876010e-01. */
+		0x676B241B, 0xBED19E68,	/* A00 = -4.200644630977303616698e-06. */
+		0xDA037B26, 0x3FF000AC,	/* A01 = +1.000164844146362863597e+00. */
+		0x836A02F8, 0xBF62D99F,	/* A02 = -2.301036405072284102280e-03. */
+		0xF2B91B28, 0xBFD48FD4,	/* A03 = -3.212787981359945810311e-01. */
+		0xB0C7AA54, 0xBED57CF4,	/* A00 = -5.123164339408145209103e-06. */
+		0x8FD9E1A1, 0x3FF000CA,	/* A01 = +1.000193178099017865534e+00. */
+		0x4548E686, 0xBF653A01,	/* A02 = -2.591135484433962181405e-03. */
+		0x0844B38F, 0xBFD47F9C,	/* A03 = -3.202886658426046806447e-01. */
+		0x1B1A41E2, 0xBEDA012B,	/* A00 = -6.199971197454598722328e-06. */
+		0xE868FDF4, 0x3FF000EB,	/* A01 = +1.000224979259539459520e+00. */
+		0x27E0A544, 0xBF67CA94,	/* A02 = -2.904214255086275467410e-03. */
+		0x812ADB37, 0xBFD46EC6,	/* A03 = -3.192611943626845749655e-01. */
+		0x5BF12194, 0xBEDF3EAC,	/* A00 = -7.449344990702664567927e-06. */
+		0x2A520784, 0x3FF00111,	/* A01 = +1.000260510744255704196e+00. */
+		0xABDA4DC4, 0xBF6A8D01,	/* A02 = -3.241065277345108255891e-03. */
+		0x759FFA4A, 0xBFD45D55,	/* A03 = -3.181966446572103146551e-01. */
+		0xBC274267, 0xBEE2A541,	/* A00 = -8.890883582164319970972e-06. */
+		0x9E5961F2, 0x3FF0013A,	/* A01 = +1.000300043631906721231e+00. */
+		0xD080C540, 0xBF6D82EC,	/* A02 = -3.602468994380686462264e-03. */
+		0x0779C0AD, 0xBFD44B4A,	/* A03 = -3.170952866557950611259e-01. */
+		0x609A27F4, 0xBEE61D97,	/* A00 = -1.054553560499505625520e-05. */
+		0x8F56A3AF, 0x3FF00168,	/* A01 = +1.000343856731187974773e+00. */
+		0xEFB683EC, 0xBF7056F8,	/* A02 = -3.989193351487490407647e-03. */
+		0x620F0F74, 0xBFD438A5,	/* A03 = -3.159573991399533543500e-01. */
+		0x29EDD370, 0xBEEA1454,	/* A00 = -1.243563138839952927732e-05. */
+		0x4A242A67, 0x3FF0019B,	/* A01 = +1.000392236341804297339e+00. */
+		0x1CA78D9B, 0xBF7207D3,	/* A02 = -4.401993423445739288258e-03. */
+		0xBA16E7CD, 0xBFD42568,	/* A03 = -3.147832696228050619602e-01. */
+		0x0D52680F, 0xBEEE9637,	/* A00 = -1.458491207477835326165e-05. */
+		0x1D8E4115, 0x3FF001D3,	/* A01 = +1.000445476009251821736e+00. */
+		0x11EDC094, 0xBF73D4CC,	/* A02 = -4.841611050196221316400e-03. */
+		0x4D8664E7, 0xBFD41195,	/* A03 = -3.135731942252974469021e-01. */
+		0x46215EF8, 0xBEF338C0,	/* A00 = -1.833122622260562810219e-05. */
+		0xC32C2EC1, 0x3FF00230,	/* A01 = +1.000534784691737621998e+00. */
+		0x9BCC5DAF, 0xBF76BD01,	/* A02 = -5.551344188254799492943e-03. */
+		0x156DC21E, 0xBFD3F2C7,	/* A03 = -3.116929730668135389848e-01. */
+		0xAE411EAE, 0xBEF9B15E,	/* A00 = -2.450261207822986676092e-05. */
+		0xDF057A4D, 0x3FF002C2,	/* A01 = +1.000674124886830940184e+00. */
+		0xD9AC1E30, 0xBF7B08CC,	/* A02 = -6.600189396301511801646e-03. */
+		0xA114FED8, 0xBFD3C7A7,	/* A03 = -3.090609620157755976777e-01. */
+		0x83C373B3, 0xBF00E364,	/* A00 = -3.221178528332122595812e-05. */
+		0x419480D7, 0x3FF0036F,	/* A01 = +1.000838524028997644777e+00. */
+		0xD1777007, 0xBF7FD255,	/* A02 = -7.768950679260206403087e-03. */
+		0x3911D6CE, 0xBFD39A45,	/* A03 = -3.062909180947429588215e-01. */
+		0x4DD12059, 0xBF05DFA0,	/* A00 = -4.172046622180685472624e-05. */
+		0xB2A03D8D, 0x3FF00438,	/* A01 = +1.001030633695197069599e+00. */
+		0xBB4A9D10, 0xBF828F8D,	/* A02 = -9.062869337255224921890e-03. */
+		0x704697D9, 0xBFD36AAB,	/* A03 = -3.033856007044711255993e-01. */
+		0xC647DEFB, 0xBF0BF3E0,	/* A00 = -5.331544597092331081714e-05. */
+		0x1063D36D, 0x3FF00522,	/* A01 = +1.001253189109060359741e+00. */
+		0xB3C96102, 0xBF857A2C,	/* A02 = -1.048693584122917590862e-02. */
+		0x5BBB4FEC, 0xBFD338E6,	/* A03 = -3.003478904549854444639e-01. */
+		0xED7C9D31, 0xBF11A506,	/* A00 = -6.730894835681591541979e-05. */
+		0x4D0EA92A, 0x3FF0062E,	/* A01 = +1.001508999829250345925e+00. */
+		0xC2761AF3, 0xBF88AB82,	/* A02 = -1.204588085125866091241e-02. */
+		0x8D6BD206, 0xBFD30502,	/* A03 = -2.971807843271395688234e-01. */
+		0x922D9BF1, 0xBF1607C0,	/* A00 = -8.403885708006799337092e-05. */
+		0x6C341961, 0x3FF00760,	/* A01 = +1.001800940198869449560e+00. */
+		0xDA487BCF, 0xBF8C25E6,	/* A02 = -1.374416688582682892494e-02. */
+		0x0EE8F7B5, 0xBFD2CF0D,	/* A03 = -2.938873906713255768075e-01. */
+		0x80A0A16D, 0xBF1B3A84,	/* A00 = -1.038688061788578038307e-04. */
+		0x802D02D6, 0x3FF008BB,	/* A01 = +1.002131939589323561535e+00. */
+		0xE99FD100, 0xBF8FEB8A,	/* A02 = -1.558598065819483124983e-02. */
+		0x5BD0911B, 0xBFD29713,	/* A03 = -2.904709240558688843059e-01. */
+		0xBDB75C65, 0xBF20ABB9,	/* A00 = -1.271881327357976163798e-04. */
+		0xA76D8CD1, 0x3FF00A42,	/* A01 = +1.002504972472525901495e+00. */
+		0x752BB9E6, 0xBF91FF3D,	/* A02 = -1.757522609380570560722e-02. */
+		0x5C1F88B4, 0xBFD25D23,	/* A03 = -2.869346999779154305799e-01. */
+		0x54425461, 0xBF243D32,	/* A00 = -1.544116913733432829448e-04. */
+		0x09D1795E, 0x3FF00BF9,	/* A01 = +1.002923048355647051011e+00. */
+		0x04D44942, 0xBF94304E,	/* A02 = -1.971551804042204897316e-02. */
+		0x5E61CFA6, 0xBFD2214B,	/* A03 = -2.832821294498394371075e-01. */
+		0x011B61CE, 0xBF286070,	/* A00 = -1.859795307186510085994e-04. */
+		0xD5E1627E, 0x3FF00DE1,	/* A01 = +1.003389201612804537689e+00. */
+		0xF4163F59, 0xBF9689D5,	/* A02 = -2.201017668045266231780e-02. */
+		0x11C3B42C, 0xBFD1E39A,	/* A03 = -2.795167134743816728104e-01. */
+		0x366A79E8, 0xBF2D250B,	/* A00 = -2.223564326486314902259e-04. */
+		0x3E134001, 0x3FF01000,	/* A01 = +1.003906481248123094829e+00. */
+		0xF91F6F81, 0xBF990C9F,	/* A02 = -2.446222265267250853271e-02. */
+		0x80084CDC, 0xBFD1A41E,	/* A03 = -2.756420374218586655246e-01. */
+		0xDDC2A30E, 0xBF314DB5,	/* A00 = -2.640313157465248123865e-04. */
+		0x7608921B, 0x3FF01257,	/* A01 = +1.004477940624503018441e+00. */
+		0x6875B0C9, 0xBF9BB962,	/* A02 = -2.707437288829409385849e-02. */
+		0x0768A9D0, 0xBFD162E8,	/* A03 = -2.716617653228725615122e-01. */
+		0x33808864, 0xBF346A61,	/* A00 = -3.115165050094957730625e-04. */
+		0xAFCC88A3, 0x3FF014EA,	/* A01 = +1.005106627192198898157e+00. */
+		0xF9BF7419, 0xBF9E90BE,	/* A02 = -2.984903716411588595059e-02. */
+		0x545F7FAD, 0xBFD12006,	/* A03 = -2.675796340899932457269e-01. */
+		0xDC3848EA, 0xBF37F180,	/* A00 = -3.653468704395550778821e-04. */
+		0x19147861, 0x3FF017BD,	/* A01 = +1.005795572250939295955e+00. */
+		0x4C702E07, 0xBFA0C9A1,	/* A02 = -3.278831537326359207851e-02. */
+		0x5B650092, 0xBFD0DB89,	/* A03 = -2.633994476818851682154e-01. */
+		0xAC6D7635, 0xBF3BEC6A,	/* A00 = -4.260788377246944457107e-04. */
+		0xD884E719, 0x3FF01AD1,	/* A01 = +1.006547780778822565040e+00. */
+		0xA1B1434A, 0xBFA260B2,	/* A02 = -3.589399551186163439542e-02. */
+		0x529E93D6, 0xBFD09581,	/* A03 = -2.591250712233067465817e-01. */
+		0x6167882B, 0xBF4164E2,	/* A00 = -5.308251737086202562063e-04. */
+		0x14B62B81, 0x3FF01FEF,	/* A01 = +1.007796364693348545316e+00. */
+		0x4538AA42, 0xBFA4EB01,	/* A02 = -4.085544557559163403315e-02. */
+		0x6FEAF41F, 0xBFD029D3,	/* A03 = -2.525528519580024222613e-01. */
+		0xF4E53DC8, 0xBF46F6FF,	/* A00 = -7.008313930700277652464e-04. */
+		0xB51CBBA0, 0x3FF027CB,	/* A01 = +1.009715754956893363214e+00. */
+		0x9FEC112E, 0xBFA89DEC,	/* A02 = -4.807986690687680864098e-02. */
+		0x464D0DB4, 0xBFCF2A99,	/* A03 = -2.434875100390009317053e-01. */
+		0x4F66A4D9, 0xBF4DCC9C,	/* A00 = -9.094012482836712945103e-04. */
+		0xCFCCD583, 0x3FF030E7,	/* A01 = +1.011939822882909068014e+00. */
+		0x95814081, 0xBFACAA3B,	/* A02 = -5.598627281199331645611e-02. */
+		0x156BE7CF, 0xBFCDF78F,	/* A03 = -2.341173987004467604844e-01. */
+		0x74E5C7A6, 0xBF5308ED,	/* A00 = -1.161796466103906435435e-03. */
+		0x86412ECB, 0x3FF03B59,	/* A01 = +1.014489674026594512313e+00. */
+		0xA88DCC3F, 0xBFB087EB,	/* A02 = -6.457398285947223148806e-02. */
+		0xD134862F, 0xBFCCBB9B,	/* A03 = -2.244753619680052991736e-01. */
+		0xC00DF4B5, 0xBF57FA23,	/* A00 = -1.463446533505758208674e-03. */
+		0x58A1BCC0, 0x3FF04735,	/* A01 = +1.017384859292903342975e+00. */
+		0xBC6360EF, 0xBFB2E702,	/* A02 = -7.383744334527241048871e-02. */
+		0x46379288, 0xBFCB77D5,	/* A03 = -2.145945160729250122955e-01. */
+		0x71557F71, 0xBF5DD129,	/* A00 = -1.819887610814388068450e-03. */
+		0xDF5000A8, 0x3FF0548D,	/* A01 = +1.020643112482540360020e+00. */
+		0x3DA186E1, 0xBFB571B6,	/* A02 = -8.376635555898871710045e-02. */
+		0x02605148, 0xBFCA2D52,	/* A03 = -2.045080672838912594358e-01. */
+		0xAD5D4F17, 0xBF6252B1,	/* A00 = -2.236697221556737096709e-03. */
+		0x8A910BF7, 0x3FF06373,	/* A01 = +1.024280110622155737232e+00. */
+		0x8E6B601B, 0xBFB8270C,	/* A02 = -9.434584118878357184013e-02. */
+		0xD950A07E, 0xBFC8DD27,	/* A03 = -1.942491351230763441116e-01. */
+		0x91730CFC, 0xBF66470C,	/* A00 = -2.719425723258004842786e-03. */
+		0x68FCF331, 0x3FF073F4,	/* A01 = +1.028309259519300633556e+00. */
+		0x952191E4, 0xBFBB05C2,	/* A02 = -1.055566419686964629854e-01. */
+		0x770DE2BD, 0xBFC7886A,	/* A03 = -1.838505822486435070662e-01. */
+		0xAC8E98EC, 0xBF6AD114,	/* A00 = -3.273525599485007861467e-03. */
+		0xF53E5226, 0x3FF0861B,	/* A01 = +1.032741506559554434119e+00. */
+		0x9B461507, 0xBFBE0C4F,	/* A02 = -1.173753503881763554650e-01. */
+		0x037CDE3A, 0xBFC6302A,	/* A03 = -1.733448521642786954722e-01. */
+		0x2A6C2AF8, 0xBF6FFBDE,	/* A00 = -3.904279630096648551207e-03. */
+		0xEB8E7DA3, 0x3FF099F2,	/* A01 = +1.037585182326304034106e+00. */
+		0xD192DDF0, 0xBFC09C74,	/* A02 = -1.297746680554463516444e-01. */
+		0xD8E3079F, 0xBFC4D571,	/* A03 = -1.627638157861470424859e-01. */
+		0xC0B952AA, 0xBF72E8FD,	/* A00 = -4.616728994353872309042e-03. */
+		0x273C9533, 0x3FF0AF7F,	/* A01 = +1.042845872181101141152e+00. */
+		0x12736F10, 0xBFC244C5,	/* A02 = -1.427236881344176033792e-01. */
+		0x4F58B902, 0xBFC37947,	/* A03 = -1.521386277613104298645e-01. */
+		0xAF17395B, 0xBF762EAB,	/* A00 = -5.415602341101023557701e-03. */
+		0x886F63FB, 0x3FF0C6C3,	/* A01 = +1.048526318502125631582e+00. */
+		0x918EA12A, 0xBFC3FDF9,	/* A02 = -1.561881981590514389957e-01. */
+		0x9ECAB895, 0xBFC21CA8,	/* A03 = -1.414995932913753196036e-01. */
+		0xCE5B2BAE, 0xBF79D387,	/* A00 = -6.305246822828998107258e-03. */
+		0xE2346376, 0x3FF0DFBF,	/* A01 = +1.054626353847394337748e+00. */
+		0x43602620, 0xBFC5C6DA,	/* A02 = -1.701309994680721970894e-01. */
+		0xD8DB6631, 0xBFC0C08B,	/* A03 = -1.308760460731704100557e-01. */
+		0xE8DA9060, 0xBF7DDBA8,	/* A00 = -7.289562037531366334164e-03. */
+		0xF0D1B464, 0x3FF0FA70,	/* A01 = +1.061142864894713433443e+00. */
+		0xD92BAA7C, 0xBFC79E18,	/* A02 = -1.845122394946264732241e-01. */
+		0xF74C2669, 0xBFBECBBB,	/* A03 = -1.202962378266875381749e-01. */
+		0x76EA25DA, 0xBF81254E,	/* A00 = -8.371937755572145950511e-03. */
+		0x5835EBD0, 0x3FF116D0,	/* A01 = +1.068069786618014660462e+00. */
+		0x9E2ED224, 0xBFC98253,	/* A02 = -1.992897531869327609755e-01. */
+		0x3C350159, 0xBFBC1B04,	/* A03 = -1.097872397413132278254e-01. */
+		0xBA863403, 0xBF8391AC,	/* A00 = -9.555196230190082448686e-03. */
+		0xAA477FE2, 0x3FF134D4,	/* A01 = +1.075398125794884141015e+00. */
+		0x609FEAFB, 0xBFCB7218,	/* A02 = -2.144194099235717521079e-01. */
+		0x6CB88329, 0xBFB970A1,	/* A03 = -9.937485603633135211599e-02. */
+		0x88E48E8B, 0xBF879350,	/* A00 = -1.151144902957603431692e-02. */
+		0x92AD7DD3, 0x3FF16498,	/* A01 = +1.087059567413110938716e+00. */
+		0xDDE75409, 0xBFCE6971,	/* A02 = -2.375929196847723912089e-01. */
+		0xE88CB251, 0xBFB58291,	/* A03 = -8.402358939628952472223e-02. */
+		0x2C325325, 0xBF8DB3A6,	/* A00 = -1.450280973794233242702e-02. */
+		0x00C6DEEA, 0x3FF1A9C9,	/* A01 = +1.103951457056548068891e+00. */
+		0x65B0E08E, 0xBFD13DBC,	/* A02 = -2.693930619311765140012e-01. */
+		0xF62696D1, 0xBFB06696,	/* A03 = -6.406539449252625362252e-02. */
+		0x99F2E27A, 0xBF925836,	/* A00 = -1.791463198307716858659e-02. */
+		0xB85AA9F0, 0x3FF1F451,	/* A01 = +1.122148246892376022288e+00. */
+		0xF8288180, 0xBFD34FD5,	/* A02 = -3.017477916164565954205e-01. */
+		0x2825B683, 0xBFA6FB69,	/* A03 = -4.488686194495718900788e-02. */
+		0x6E673D6F, 0xBF9641C2,	/* A00 = -2.173522757385398448959e-02. */
+		0xDA5E2B07, 0x3FF24364,	/* A01 = +1.141453602790251542487e+00. */
+		0xA5EF5890, 0xBFD564A5,	/* A02 = -3.342680092295120530821e-01. */
+		0x2011A982, 0xBF9B4371,	/* A03 = -2.662445791467283467968e-02. */
+		0x38EC2F39, 0xBF9A9010,	/* A00 = -2.594018313816024226548e-02. */
+		0x56DFFEBA, 0x3FF29613,	/* A01 = +1.161639537196534011088e+00. */
+		0xB17198C7, 0xBFD775EB,	/* A02 = -3.665723069046972759644e-01. */
+		0x926CD462, 0xBF833B1A,	/* A03 = -9.390075295963199591975e-03. */
+		0x6A461B91, 0xBF9F396A,	/* A00 = -3.049246095317987084727e-02. */
+		0xBAEF534B, 0x3FF2EB53,	/* A01 = +1.182452898229899629357e+00. */
+		0xF8AD8BBD, 0xBFD97DAB,	/* A02 = -3.982953957076310058660e-01. */
+		0x3E0F8837, 0x3F7B8F6A,	/* A03 = +6.728568086119371925713e-03. */
+		0x590F8BAA, 0xBFA21878,	/* A00 = -3.534294211546946951064e-02. */
+		0x790236E1, 0x3FF34209,	/* A01 = +1.203622315111197105253e+00. */
+		0x0E71BECB, 0xBFDB764C,	/* A02 = -4.290952817018306997277e-01. */
+		0xC03F84C0, 0x3F962FE0,	/* A03 = +2.166701482190513949888e-02. */
+		0x9AD27ECC, 0xBFA4B36B,	/* A00 = -4.043136849327097492868e-02. */
+		0x5B12FC16, 0x3FF3990C,	/* A01 = +1.224865298994477935679e+00. */
+		0xB0D01390, 0xBFDD5AAB,	/* A02 = -4.586590983092770912322e-01. */
+		0x5CA162DB, 0x3FA21DAF,	/* A03 = +3.538272863142363083844e-02. */
+		0x4D7BF28B, 0xBFA7645E,	/* A00 = -4.568762489177399105378e-02. */
+		0xD51C0D9F, 0x3FF3EF2F,	/* A01 = +1.245895225962932562069e+00. */
+		0x7E1B686E, 0xBFDF2637,	/* A02 = -4.867075664057044503963e-01. */
+		0x756EE812, 0x3FA8803E,	/* A03 = +4.785342391501513914509e-02. */
+		0x25C64413, 0xBFAA2109,	/* A00 = -5.103329263796054643398e-02. */
+		0xF897D8E7, 0x3FF44349,	/* A01 = +1.266427966181760345066e+00. */
+		0x02C6D8E2, 0xBFE06A7B,	/* A02 = -5.129981092675530707226e-01. */
+		0x4734F5D0, 0x3FAE3F19,	/* A03 = +5.907515520309980505687e-02. */
+		0xF8A19BBB, 0xBFACDE48,	/* A00 = -5.638340029764018351832e-02. */
+		0xD5466582, 0x3FF49439,	/* A01 = +1.286187966447272845727e+00. */
+		0xC1063DDC, 0xBFE131C7,	/* A02 = -5.373266954429101183166e-01. */
+		0xC36AD805, 0x3FB1ADEE,	/* A03 = +6.906025191241844940482e-02. */
+		0x8F585680, 0xBFAF905D,	/* A00 = -6.164829611604449866036e-02. */
+		0x1FD27F99, 0x3FF4E0ED,	/* A01 = +1.304913639360142818546e+00. */
+		0x59DC1D3D, 0xBFE1E7A8,	/* A02 = -5.595285182070380836095e-01. */
+		0x8E4642A1, 0x3FB3ED01,	/* A03 = +7.783517573831001679086e-02. */
+		0x104160BA, 0xBFB11595,	/* A00 = -6.673556944713512906198e-02. */
+		0x0340490B, 0x3FF52865,	/* A01 = +1.322361958217302513319e+00. */
+		0xB40BC974, 0xBFE28B14,	/* A02 = -5.794776455425521000109e-01. */
+		0xF5BAF6D7, 0x3FB5DF49,	/* A03 = +8.543836831355676453281e-02. */
+		0x97344BA4, 0xBFB2513A,	/* A00 = -7.155195418844911836587e-02. */
+		0x0DB5EE14, 0x3FF569BA,	/* A01 = +1.338312200124055273420e+00. */
+		0xA8B67B20, 0xBFE31B53,	/* A02 = -5.970857901737396389308e-01. */
+		0x97BB0544, 0x3FB787F2,	/* A03 = +9.191814617499455275507e-02. */
+		0xE848FAFA, 0xBFB37512,	/* A00 = -7.600515528700305112331e-02. */
+		0x33B403C8, 0x3FF5A41F,	/* A01 = +1.352568819013173495591e+00. */
+		0xEA9A58A5, 0xBFE397F6,	/* A02 = -6.123003561103997904880e-01. */
+		0xFF25CA06, 0x3FB8EAA9,	/* A03 = +9.733068923177520814782e-02. */
+		0x603AFC5D, 0xBFB47B3E,	/* A00 = -8.000554894805263217439e-02. */
+		0xEDE40487, 0x3FF5D6E3,	/* A01 = +1.364963464031718975988e+00. */
+		0xBCA6D631, 0xBFE400D5,	/* A02 = -6.251019177058819709103e-01. */
+		0x0ED567FE, 0x3FBA0B83,	/* A03 = +1.017381583418739132707e-01. */
+		0x8AC90496, 0xBFB5BBFE,	/* A00 = -8.489981544791400103200e-02. */
+		0x70107E95, 0x3FF612BA,	/* A01 = +1.379572332145390989311e+00. */
+		0xF1FA7693, 0xBFE477EA,	/* A02 = -6.396383978023599814478e-01. */
+		0xB7C08A95, 0x3FBB4784,	/* A03 = +1.065600346196709652391e-01. */
+		0x40743939, 0xBFB6D5D9,	/* A00 = -8.920057128509463473254e-02. */
+		0x748F70CE, 0x3FF644A8,	/* A01 = +1.391762214006166953340e+00. */
+		0xAB07EA37, 0xBFE4D646,	/* A02 = -6.511567440459832267763e-01. */
+		0x4E1D5292, 0x3FBC354F,	/* A03 = +1.101884427747086558913e-01. */
+		0x19E4F3D1, 0xBFB7223D,	/* A00 = -9.036619074045339206069e-02. */
+		0xEB42B7FA, 0x3FF6518F,	/* A01 = +1.394912642466350494175e+00. */
+		0xCB87498C, 0xBFE4ED86,	/* A02 = -6.539949393430091184598e-01. */
+		0xF28CCA9B, 0x3FBC6D29,	/* A03 = +1.110407082713131127205e-01. */
+		0x52FF6312, 0xBFB68786,	/* A00 = -8.800544287022329936754e-02. */
+		0xC302D040, 0x3FF63948,	/* A01 = +1.388985406648330922508e+00. */
+		0xE7904E17, 0xBFE4C4E2,	/* A02 = -6.490339777687407218920e-01. */
+		0x56CA1ABE, 0x3FBC1273,	/* A03 = +1.096565329445224612481e-01. */
+		0x8B0C91D6, 0xBFB4F5D1,	/* A00 = -8.187589306596207427980e-02. */
+		0xEB7DD0B8, 0x3FF5FD27,	/* A01 = +1.374305648697413673176e+00. */
+		0x1A2B2FC6, 0xBFE464E0,	/* A02 = -6.373138915164353601739e-01. */
+		0x47674A30, 0x3FBB4605,	/* A03 = +1.065371798825160976065e-01. */
+		0xFA16A685, 0xBFB26642,	/* A00 = -7.187288861919156890412e-02. */
+		0xEDE1C95A, 0x3FF59F9B,	/* A01 = +1.351467065073470141812e+00. */
+		0x20C8FBEA, 0xBFE3D679,	/* A02 = -6.199308052381387046381e-01. */
+		0xA8D3CBC1, 0x3FBA24F6,	/* A03 = +1.021265184570401413078e-01. */
+		0x4794F097, 0xBFADB529,	/* A00 = -5.802277563859197656582e-02. */
+		0x7B9CF453, 0x3FF523EA,	/* A01 = +1.321268542159732772845e+00. */
+		0xB55E35DB, 0xBFE322A8,	/* A02 = -5.979808370918208160205e-01. */
+		0x3B1B3E37, 0x3FB8C867,	/* A03 = +9.680791085269722928697e-02. */
+		0x61965C6A, 0xBFA4B7D6,	/* A00 = -4.046506825687219699450e-02. */
+		0xE2CE3122, 0x3FF48DE3,	/* A01 = +1.284641157110919085227e+00. */
+		0xD1A7F445, 0xBFE251FE,	/* A02 = -5.725092024655472622285e-01. */
+		0x9FCABDB9, 0x3FB74569,	/* A03 = +9.090290213747821701507e-02. */
+		0x56E4EE1D, 0xBF93E604,	/* A00 = -1.943213253365004902773e-02. */
+		0x4E628A59, 0x3FF3E1A1,	/* A01 = +1.242585474196536532432e+00. */
+		0xB660E876, 0xBFE16C5A,	/* A02 = -5.444768488007543094653e-01. */
+		0xAA8C188F, 0x3FB5AD33,	/* A03 = +8.467410005332197397987e-02. */
+		0xC47C7961, 0x3F738C17,	/* A00 = +4.772274820224659853951e-03. */
+		0xDE3BD146, 0x3FF3234D,	/* A01 = +1.196119182682268355933e+00. */
+		0xD77A9D3B, 0xBFE078C0,	/* A02 = -5.147403915952176722826e-01. */
+		0xB3E276B8, 0x3FB40D74,	/* A03 = +7.833032027925923568290e-02. */
+		0xECC689C7, 0x3FA0474B,	/* A00 = +3.179394975019849550746e-02. */
+		0x4FA7D18A, 0x3FF256FB,	/* A01 = +1.146235762743432307076e+00. */
+		0x3FB285E2, 0xBFDEFA8E,	/* A02 = -4.840427038235174395098e-01. */
+		0x07493D59, 0x3FB270C0,	/* A03 = +7.203293016322244446403e-02. */
+		0x1E479BDC, 0x3FAF5BD5,	/* A00 = +6.124750132203590768931e-02. */
+		0xD0B53BC5, 0x3FF18081,	/* A01 = +1.093873801484492647162e+00. */
+		0x39BD0C03, 0xBFDCFE24,	/* A02 = -4.530115665294831006626e-01. */
+		0x5A45AFDD, 0x3FB0DEFE,	/* A03 = +6.590261176978580437424e-02. */
+		0x2806EA26, 0x3FB7BD5D,	/* A00 = +9.273321368429118805032e-02. */
+		0xE35B4440, 0x3FF0A369,	/* A01 = +1.039895904647224256223e+00. */
+		0x5C9951E7, 0xBFDB04BC,	/* A02 = -4.221640495573226181669e-01. */
+		0xA9D6DEEF, 0x3FAEBBBA,	/* A03 = +6.002600978120919278380e-02. */
+		0x11098DBC, 0x3FC01BE4,	/* A00 = +1.258511622610124502941e-01. */
+		0xABC031C1, 0x3FEF85BD,	/* A01 = +9.850757936961188621083e-01. */
+		0x375097C2, 0xBFD91521,	/* A02 = -3.919146576102968682065e-01. */
+		0x0086D982, 0x3FABE26F,	/* A03 = +5.446192628317005068883e-02. */
+		0xFF5776B9, 0x3FC481D7,	/* A00 = +1.602125164781023347604e-01. */
+		0x6C1E7218, 0x3FEDC350,	/* A01 = +9.300920592973538347792e-01. */
+		0x88DA7D4F, 0xBFD7349A,	/* A02 = -3.625856720409119104964e-01. */
+		0xDFF8E2AE, 0x3FA936E2,	/* A03 = +4.924687370334389358018e-02. */
+		0xF96FA27A, 0x3FC90471,	/* A00 = +1.954481571149420671141e-01. */
+		0x601987A2, 0x3FEC0451,	/* A01 = +8.755270840595026360376e-01. */
+		0xD4B898DC, 0xBFD5671C,	/* A02 = -3.344184949259110251063e-01. */
+		0x94603B67, 0x3FA6BB95,	/* A03 = +4.439990459660841243261e-02. */
+		0xB9ED944C, 0x3FCFD8AD,	/* A00 = +2.488000066615846384011e-01. */
+		0x73F6809A, 0x3FE978C0,	/* A01 = +7.959902062321078108909e-01. */
+		0x00BCD5A9, 0xBFD2DF7E,	/* A02 = -2.948908812716931060471e-01. */
+		0x33D490B2, 0x3FA36140,	/* A03 = +3.785133965200894456959e-02. */
+		0x12AFE5A0, 0x3FD4846A,	/* A00 = +3.205819303981005674586e-01. */
+		0x47D40472, 0x3FE63A11,	/* A01 = +6.945883181471244061100e-01. */
+		0x8AD34450, 0xBFCFA226,	/* A02 = -2.471359422548027318101e-01. */
+		0x01D9FFE0, 0x3F9F1502,	/* A03 = +3.035357605267552383310e-02. */
+		0x41F82BEB, 0x3FD90186,	/* A00 = +3.907180446846598154131e-01. */
+		0x220FFBDC, 0x3FE33B7C,	/* A01 = +6.010113396913498995389e-01. */
+		0x87E29C86, 0xBFCA4E41,	/* A02 = -2.055131829740483584423e-01. */
+		0xED19F8F4, 0x3F98C30C,	/* A03 = +2.418155858185229434287e-02. */
+		0x55BEB078, 0x3FDD4B82,	/* A00 = +4.577337109901757905561e-01. */
+		0x19D3A49B, 0x3FE0858B,	/* A01 = +5.163016800335243905451e-01. */
+		0x9EACE564, 0xBFC5BC92,	/* A02 = -1.698172831327539045176e-01. */
+		0xCE57DE2B, 0x3F93A083,	/* A03 = +1.916700312537337677621e-02. */
+		0xE039295C, 0x3FE0A8E5,	/* A00 = +5.206174258576470315063e-01. */
+		0x234583FE, 0x3FDC35E1,	/* A01 = +4.407885403107342225937e-01. */
+		0x4E31AEB9, 0xBFC1DE03,	/* A02 = -1.395877963835710222629e-01. */
+		0xB3471BDC, 0x3F8EFDEB,	/* A03 = +1.513275280821162888101e-02. */
+		0x603CB2A5, 0x3FE2851B,	/* A00 = +5.787484054213406503564e-01. */
+		0x4ABBB286, 0x3FD7F4A4,	/* A01 = +3.743067483726821853551e-01. */
+		0x67087DE7, 0xBFBD3EEB,	/* A02 = -1.142413260026767657385e-01. */
+		0x8329E8BD, 0x3F8864F3,	/* A03 = +1.191129917173260922836e-02. */
+		0xE3C34AC1, 0x3FE437DB,	/* A00 = +6.318187187665317283702e-01. */
+		0x789441B5, 0x3FD43F6F,	/* A01 = +3.163717916040938438194e-01. */
+		0x7901B9A4, 0xBFB7D92E,	/* A02 = -9.315767721429907277653e-02. */
+		0x342308E1, 0x3F8327ED,	/* A03 = +9.353497651663324544136e-03. */
+		0x7766D55C, 0x3FE5C097,	/* A00 = +6.797597248138731451661e-01. */
+		0xA764D8F9, 0x3FD10B42,	/* A01 = +2.663122782427219115142e-01. */
+		0x51D3D70F, 0xBFB36333,	/* A02 = -7.573242900602060456716e-02. */
+		0x30FF899C, 0x3F7E079E,	/* A03 = +7.331483779099558922843e-03. */
+		0xE08A88C4, 0x3FE7202C,	/* A00 = +7.226776490754436288455e-01. */
+		0xB5662B01, 0x3FCC973E,	/* A01 = +2.233656297433626314319e-01. */
+		0x55F9920B, 0xBFAF70A4,	/* A02 = -6.140626477716545211782e-02. */
+		0x11CE99B6, 0x3F778124,	/* A03 = +5.738392731393584730859e-03. */
+		0x424095B1, 0x3FE85879,	/* A00 = +7.608000082006382003286e-01. */
+		0xD1674D84, 0x3FC7E73B,	/* A01 = +1.867441914060742336190e-01. */
+		0xE4BF333B, 0xBFA96F84,	/* A02 = -4.967894832916504993525e-02. */
+		0xDCA6E117, 0x3F72606D,	/* A03 = +4.486493251924870105662e-03. */
+		0x4957F4DD, 0x3FE96BFE,	/* A00 = +7.944327766887472330737e-01. */
+		0x80D25478, 0x3FC3ED47,	/* A01 = +1.556786898624158421711e-01. */
+		0xF9A56B58, 0xBFA489C5,	/* A02 = -4.011362717093075458408e-02. */
+		0x17E9AD2A, 0x3F6CB5DC,	/* A03 = +3.504686231556104931972e-03. */
+		0xB2F41234, 0x3FEA5D9C,	/* A00 = +8.239272589858672724006e-01. */
+		0x58374DCF, 0x3FC091A7,	/* A01 = +1.294449978582705440555e-01. */
+		0x6D4B5CE0, 0xBFA08E43,	/* A02 = -3.233538350257858517978e-02. */
+		0xAD53E6B7, 0x3F666997,	/* A03 = +2.735897297154145629133e-03. */
+		0x342CB850, 0x3FEB3060,	/* A00 = +8.496552485501158713532e-01. */
+		0xBBC7DC1B, 0x3FBB7D30,	/* A01 = +1.073790033768634993860e-01. */
+		0x3443D9E3, 0xBF9AA6BA,	/* A02 = -2.602663940430173170060e-02. */
+		0x64B7850B, 0x3F617CA7,	/* A03 = +2.134634914668814050648e-03. */
+		0xA6A0C7B8, 0x3FEBE759,	/* A00 = +8.719909910635044170135e-01. */
+		0xE6A703FF, 0x3FB6C10D,	/* A01 = +8.888327485239243264115e-02. */
+		0x6D8BE1F6, 0xBF956C56,	/* A02 = -2.092108768099084498138e-02. */
+		0xA4A59CF8, 0x3F5B46D1,	/* A03 = +1.664833764687232917079e-03. */
+		0x94887A04, 0x3FEC8584,	/* A00 = +8.912985707318630268503e-01. */
+		0xF543394D, 0x3FB2CC31,	/* A01 = +7.342827070099140762682e-02. */
+		0x7FF69137, 0xBF913347,	/* A02 = -1.679717749142747504343e-02. */
+		0x2FBB4DA5, 0x3F554448,	/* A03 = +1.298017973501022466823e-03. */
+		0x9D0E32E9, 0x3FED0DB5,	/* A00 = +9.079235141267335551518e-01. */
+		0xAFFC6EF4, 0x3FAF006B,	/* A01 = +6.055008433597022787787e-02. */
+		0x6FA2B97A, 0xBF8B9714,	/* A02 = -1.347175565419144252499e-02. */
+		0x1F4CDC69, 0x3F5093B0,	/* A03 = +1.011774057770665211434e-03. */
+		0xC3EC457C, 0x3FEDB487,	/* A00 = +9.282873942012623835751e-01. */
+		0x09D0BD1D, 0x3FA7390C,	/* A01 = +4.535710925881118044112e-02. */
+		0xC3181106, 0xBF83D9F7,	/* A02 = -9.693084374710735778846e-03. */
+		0x0A3C0E64, 0x3F46E34A,	/* A03 = +6.984817050299072134500e-04. */
+		0xB4E6EB00, 0x3FEE5FFC,	/* A00 = +9.492171796076434020506e-01. */
+		0xED00AADF, 0x3F9F4913,	/* A01 = +3.055220731782070861526e-02. */
+		0xD0E59B5C, 0xBF79670B,	/* A02 = -6.201788097633133961528e-03. */
+		0xEBCAF96D, 0x3F3BC998,	/* A03 = +4.240034429975534616304e-04. */
+		0x1E9542FE, 0x3FEEDBA4,	/* A00 = +9.643116566968215064293e-01. */
+		0x18D9C24D, 0x3F94F5DD,	/* A01 = +2.046914543319848858727e-02. */
+		0x6AA122B9, 0xBF703489,	/* A02 = -3.956352980886528904192e-03. */
+		0x47810B39, 0x3F30DCCB,	/* A03 = +2.573009765038273091199e-04. */
+		0x882520ED, 0x3FEF33F2,	/* A00 = +9.750912341196716903724e-01. */
+		0x2CF553FF, 0x3F8BF37F,	/* A01 = +1.364802699996836392315e-02. */
+		0x05A69619, 0xBF649F6F,	/* A02 = -2.517430152880317534986e-03. */
+		0xC950AAC9, 0x3F247623,	/* A03 = +1.561087307505231250044e-04. */
+		0x57751741, 0x3FEF7277,	/* A00 = +9.827229221489021115943e-01. */
+		0x912C4400, 0x3F828E67,	/* A01 = +9.060677640748693306705e-03. */
+		0xA806CC2C, 0xBF5A2F51,	/* A02 = -1.598195784123355826789e-03. */
+		0x7687E613, 0x3F18D35D,	/* A03 = +9.470231965016282719549e-05. */
+		0x25C5942A, 0x3FEF9E63,	/* A00 = +9.880843866091073568469e-01. */
+		0x17618F76, 0x3F788AB1,	/* A01 = +5.991641772286606867914e-03. */
+		0xB0B1EA89, 0xBF5096EA,	/* A02 = -1.012543859160305046233e-03. */
+		0xEC4435AB, 0x3F0E1E50,	/* A03 = +5.744633156910412119652e-05. */
+		0x84049369, 0x3FEFBD07,	/* A00 = +9.918248728250605994461e-01. */
+		0x8294035F, 0x3F702BBD,	/* A01 = +3.947963975634432264028e-03. */
+		0xE0F00593, 0xBF44FB55,	/* A02 = -6.403130845457509273330e-04. */
+		0xD723230A, 0x3F0244DC,	/* A03 = +3.484534217219031730379e-05. */
+		0xE2366A43, 0x3FEFD245,	/* A00 = +9.944180887426415926811e-01. */
+		0xEC088433, 0x3F653D82,	/* A01 = +2.592807490387838333795e-03. */
+		0x5E013CB8, 0xBF3A7DF7,	/* A02 = -4.042366908878036561859e-04. */
+		0x69F991CD, 0x3EF6298E,	/* A03 = +2.113564425911141559972e-05. */
+		0xA508BC69, 0x3FEFE0EA,	/* A00 = +9.962056372950317539861e-01. */
+		0x1AF3FDDA, 0x3F5BD077,	/* A01 = +1.697651208644282514598e-03. */
+		0x254DE571, 0xBF30B2E1,	/* A02 = -2.548026725928887099328e-04. */
+		0x70EC0256, 0x3EEAE28B,	/* A03 = +1.281973848454955042307e-05. */
+		0x303D7F96, 0x3FEFEAF5,	/* A00 = +9.974313680831865536192e-01. */
+		0x1365657E, 0x3F522911,	/* A01 = +1.108423877289460134782e-03. */
+		0xD04DFE66, 0xBF250572,	/* A02 = -1.603796628408704519168e-04. */
+		0xBB57C981, 0x3EE04E89,	/* A03 = +7.775682983689149966743e-06. */
+		0x52F1CF44, 0x3FEFF1CF,	/* A00 = +9.982678051005469122003e-01. */
+		0x16147CEB, 0x3F47A713,	/* A01 = +7.218211359577819110842e-04. */
+		0x04055719, 0xBF1A6D76,	/* A02 = -1.008132248946049582547e-04. */
+		0x7586A85C, 0x3ED3C804,	/* A03 = +4.716233739913014633626e-06. */
+		0x0369EF69, 0x3FEFF677,	/* A00 = +9.988360468555416149528e-01. */
+		0x1180FBF0, 0x3F3EBB26,	/* A01 = +4.689186039321105101130e-04. */
+		0x4FE19D7F, 0xBF109775,	/* A02 = -6.329206004950480057066e-05. */
+		0x83BCA0A7, 0x3EC7FEFF,	/* A03 = +2.860556404988488738366e-06. */
+		0x42371AC4, 0x3FEFF99D,	/* A00 = +9.992204945818561334647e-01. */
+		0xEC271F59, 0x3F33EB2A,	/* A01 = +3.039340773764907474054e-04. */
+		0xE0FC0D79, 0xBF04CF18,	/* A02 = -3.968996690952969588805e-05. */
+		0xD6019BE9, 0x3EBD1BDB,	/* A03 = +1.735021065507727833886e-06. */
+		0xA32B0D91, 0x3FEFFBBC,	/* A00 = +9.994795977476532700123e-01. */
+		0x1615110A, 0x3F29C41E,	/* A01 = +1.965796209707565346710e-04. */
+		0x3D9DCB5A, 0xBEFA11F9,	/* A02 = -2.486248909101414873235e-05. */
+		0x4546F7A7, 0x3EB1A7CA,	/* A03 = +1.052345642723709228769e-06. */
+		0x8B8E8DE2, 0x3FEFFD29,	/* A00 = +9.996535993308806045121e-01. */
+		0x2D523C5B, 0x3F20A1C4,	/* A01 = +1.268913244172078754520e-04. */
+		0x364AFAE4, 0xBEF0507A,	/* A02 = -1.555859070622834605755e-05. */
+		0x17E7CDF4, 0x3EA56ACA,	/* A03 = +6.382806956848098872313e-07. */
+		0xC82BA5A3, 0x3FEFFE1D,	/* A00 = +9.997700604991915929176e-01. */
+		0xB90F1769, 0x3F156E73,	/* A01 = +8.175450626798714452801e-05. */
+		0x79D0A09F, 0xBEE46635,	/* A02 = -9.727122057226747625365e-06. */
+		0xFEC5D4C1, 0x3E99FAF6,	/* A03 = +3.871371052824002996020e-07. */
+		0xD0BB5E81, 0x3FEFFEF8,	/* A00 = +9.998745037837154514548e-01. */
+		0xA18D39C3, 0x3F06686D,	/* A01 = +4.273972098777251447726e-05. */
+		0x98073E90, 0xBED46BC2,	/* A02 = -4.868731025855742842491e-06. */
+		0x86B9D0FD, 0x3E88E422,	/* A03 = +1.854535328530838170114e-07. */
+		0xBC68DDC7, 0x3FEFFF8D,	/* A00 = +9.999455146670975791423e-01. */
+		0x53A80AF0, 0x3EF26B29,	/* A01 = +1.756534514108903368909e-05. */
+		0x2D580F83, 0xBEBFC447,	/* A02 = -1.893443529411295465239e-06. */
+		0x4553D19F, 0x3E72505B,	/* A03 = +6.822456673547912277047e-08. */
+		0xD1276609, 0x3FEFFFCE,	/* A00 = +9.999765477215883935358e-01. */
+		0xC7CC58F5, 0x3EDE1A94,	/* A01 = +7.177313020153979672606e-06. */
+		0x88744E57, 0xBEA8A2C9,	/* A02 = -7.342066660497443762363e-07. */
+		0x36BBBAF4, 0x3E5AF300,	/* A03 = +2.509841882843541084885e-08. */
+		0xFE70FCFC, 0x3FEFFFEA,	/* A00 = +9.999899835164849370983e-01. */
+		0x5E3549F5, 0x3EC87917,	/* A01 = +2.917410471128503564412e-06. */
+		0x677D1813, 0xBE930E36,	/* A02 = -2.839493400307523115929e-07. */
+		0x5B42D48F, 0x3E43D400,	/* A03 = +9.233192745401904898013e-09. */
+		0x00000000, 0x3ff00000,
+		0x00000000, 0x00000000,
+		0x00000000, 0x00000000,
+		0x00000000, 0x00000000)
+
+	.align	32
+	.type	__svml_stanh_common_data_avx2, @object
+	.size	__svml_stanh_common_data_avx2, .-__svml_stanh_common_data_avx2
+# endif
+#endif
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v1 07/27] x86/fpu: Update rodata usage in svml_s_tanhf16_core_avx512.S
  2022-12-07  8:52 [PATCH v1 01/27] x86/fpu: Create helper file for common data macros Noah Goldstein via Libc-alpha
                   ` (4 preceding siblings ...)
  2022-12-07  8:52 ` [PATCH v1 06/27] x86/fpu: Update rodata usage in svml_s_tanhf_*_{avx2, sse4} Noah Goldstein via Libc-alpha
@ 2022-12-07  8:52 ` Noah Goldstein via Libc-alpha
  2022-12-16 17:05   ` H.J. Lu via Libc-alpha
  2022-12-07  8:52 ` [PATCH v1 08/27] x86/fpu: Update rodata usage in svml_s_atanhf16_core_avx512.S Noah Goldstein via Libc-alpha
                   ` (20 subsequent siblings)
  26 siblings, 1 reply; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-07  8:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, andrey.kolesov, carlos

No changes to the logic, just change how rodata is handled.

 1. Define the rodata objects using the new macros so that they
    check that the offsets are correct.

2. Use common data where applicable.
---
 .../multiarch/svml_s_tanhf16_core_avx512.S    | 450 ++++++++----------
 1 file changed, 197 insertions(+), 253 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
index d74fc7731d..765e9ed7f7 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
@@ -70,94 +70,99 @@
  *
  */
 
-/* Offsets for data table __svml_stanh_data_internal_avx512. Ordered
-   by use in the function. On cold-starts this might help the
-   prefetcher. Possibly a better idea is to interleave start/end so
-   that the prefetcher is less likely to detect a stream and pull
-   irrelivant lines into cache.  */
 
-/* Offsets for data table __svml_stanh_data_internal. 4 bytes each.
- */
+
+#define LOCAL_DATA_NAME	__svml_stanh_data_internal
+#define LOCAL_DATA_NAME_UNALIGNED	__svml_stanh_data_internal_unaligned
+#include "svml_s_common_evex512_rodata_offsets.h"
+
+/* Offsets for data table __svml_stanh_data_internal_unaligned.
+   4 bytes each.  */
 #define _iExpMantMask_UISA		0
 #define _iMinIdxOfsMask_UISA		4
 #define _iMaxIdxMask_UISA		8
 #define _iExpMask			12
 
-/* Offsets for data table __svml_stanh_data_internal_al64. 64 bytes
-   each.  */
-#define _sC_lo				0
-#define _sC_hi				64
-#define _sP7_lo				128
-#define _sP7_hi				192
-#define _sSignMask			256
-#define _sP6_lo				320
-#define _sP6_hi				384
-#define _sP5_lo				448
-#define _sP5_hi				512
-#define _sP4_lo				576
-#define _sP4_hi				640
-#define _sP3_lo				704
-#define _sP3_hi				768
-#define _sP2_lo				832
-#define _sP2_hi				896
-#define _sP0_lo				960
-#define _sP0_hi				1024
+/* Offsets for data table __svml_stanh_data_internal. Ordered
+   by use in the function. On cold-starts this might help the
+   prefetcher. Possibly a better idea is to interleave start/end so
+   that the prefetcher is less likely to detect a stream and pull
+   irrelevant lines into cache.  */
+
+/* Offsets for data table __svml_stanh_data_internal.
+   64 bytes each.  */
+#define _sC_lo	0
+#define _sC_hi	64
+#define _sP7_lo	128
+#define _sP7_hi	192
+#define _sP6_lo	256
+#define _sP6_hi	320
+#define _sP5_lo	384
+#define _sP5_hi	448
+#define _sP4_lo	512
+#define _sP4_hi	576
+#define _sP3_lo	640
+#define _sP3_hi	704
+#define _sP2_lo	768
+#define _sP2_hi	832
+#define _sP0_lo	896
+#define _sP0_hi	960
+
 
 #include <sysdep.h>
-#define TANHF_DATA(x)			((x)+__svml_stanh_data_internal_al64)
-#define TANHF_DATA_UNALIGNED(x)		((x)+__svml_stanh_data_internal)
 
 	.section .text.evex512, "ax", @progbits
 ENTRY(_ZGVeN16v_tanhf_skx)
-	/* Here huge arguments, INF and NaNs are filtered out to callout. */
-	vpandd	TANHF_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
-	vpsubd	TANHF_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
+	/* Here huge arguments, INF and NaNs are filtered out to
+	   callout.  */
+	vpandd	LOCAL_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
+	vpsubd	LOCAL_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
 
 	/* Selection arguments between [0, 0x03e00000] into zmm3.  */
 	vpxord	%zmm3, %zmm3, %zmm3
 	vpmaxsd	%zmm3, %zmm2, %zmm3
-	vpminsd	TANHF_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
+	vpminsd	LOCAL_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
 
 	/* Setup permute indices in zmm3.  */
 	vpsrld	$21, %zmm3, %zmm3
 
 	/* Store if there are any special cases in k1.  */
-	vpcmpd	$6, TANHF_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
+	vpcmpd	$6, LOCAL_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
 
-	vmovaps	TANHF_DATA(_sC_lo)(%rip), %zmm5
-	vpermt2ps TANHF_DATA(_sC_hi)(%rip), %zmm3, %zmm5
+	vmovaps	LOCAL_DATA(_sC_lo)(%rip), %zmm5
+	vpermt2ps LOCAL_DATA(_sC_hi)(%rip), %zmm3, %zmm5
 
-	vmovaps	TANHF_DATA(_sP7_lo)(%rip), %zmm2
-	vpermt2ps TANHF_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
+	vmovaps	LOCAL_DATA(_sP7_lo)(%rip), %zmm2
+	vpermt2ps LOCAL_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
 
 	/* Store absolute values of inputs in zmm1.  */
-	vmovaps	TANHF_DATA(_sSignMask)(%rip), %zmm4
+	vmovaps	COMMON_DATA(_SignMask)(%rip), %zmm4
 	vandnps	%zmm0, %zmm4, %zmm1
 	vsubps	{rn-sae}, %zmm5, %zmm1, %zmm1
 
-	vmovaps	TANHF_DATA(_sP6_lo)(%rip), %zmm5
-	vpermt2ps TANHF_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
+	vmovaps	LOCAL_DATA(_sP6_lo)(%rip), %zmm5
+	vpermt2ps LOCAL_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
 
-	vmovaps	TANHF_DATA(_sP5_lo)(%rip), %zmm6
-	vpermt2ps TANHF_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
+	vmovaps	LOCAL_DATA(_sP5_lo)(%rip), %zmm6
+	vpermt2ps LOCAL_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
 
 	vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm2
 	vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm2
 
-	vmovaps	TANHF_DATA(_sP4_lo)(%rip), %zmm7
-	vpermt2ps TANHF_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
+	vmovaps	LOCAL_DATA(_sP4_lo)(%rip), %zmm7
+	vpermt2ps LOCAL_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
 
-	vmovaps	TANHF_DATA(_sP3_lo)(%rip), %zmm8
-	vpermt2ps TANHF_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
+	vmovaps	LOCAL_DATA(_sP3_lo)(%rip), %zmm8
+	vpermt2ps LOCAL_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
 
 	vfmadd213ps {rn-sae}, %zmm7, %zmm1, %zmm2
 	vfmadd213ps {rn-sae}, %zmm8, %zmm1, %zmm2
 
-	vmovaps	TANHF_DATA(_sP2_lo)(%rip), %zmm9
-	vpermt2ps TANHF_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
+	vmovaps	LOCAL_DATA(_sP2_lo)(%rip), %zmm9
+	vpermt2ps LOCAL_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
 
-	vmovaps	TANHF_DATA(_sP0_lo)(%rip), %zmm10
-	vpermt2ps TANHF_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
+	vmovaps	LOCAL_DATA(_sP0_lo)(%rip), %zmm10
+	vpermt2ps LOCAL_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
 
 	vfmadd213ps {rn-sae}, %zmm9, %zmm1, %zmm2
 	vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm2
@@ -167,7 +172,7 @@ ENTRY(_ZGVeN16v_tanhf_skx)
 
 	/* Go to special inputs processing branch.  */
 	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx r12 r13 r14 r15 zmm0 zmm2 zmm4
+
 	/* Wait until after branch of write over zmm0.  */
 	vpternlogd $0xec, %zmm4, %zmm2, %zmm0
 
@@ -176,24 +181,24 @@ ENTRY(_ZGVeN16v_tanhf_skx)
 
 	/* Cold case. edx has 1s where there was a special value that
 	   needs to be handled by a tanhf call. Optimize for code size
-	   more so than speed here. */
+	   more so than speed here.  */
 L(SPECIAL_VALUES_BRANCH):
-	# LOE rbx rdx r12 r13 r14 r15 zmm0 zmm2 zmm4
-    /* Use r13 to save/restore the stack. This allows us to use rbp as
-       callee save register saving code size. */
+
+	/* Use r13 to save/restore the stack. This allows us to use rbp
+	   as callee save register saving code size.  */
 	pushq	%r13
-	cfi_adjust_cfa_offset(8)
-	cfi_offset(r13, -16)
-	/* Need to callee save registers to preserve state across tanhf calls.
-	 */
+	cfi_adjust_cfa_offset (8)
+	cfi_offset (r13, -16)
+	/* Need to callee save registers to preserve state across tanhf
+	   calls.  */
 	pushq	%rbx
-	cfi_adjust_cfa_offset(8)
-	cfi_offset(rbx, -24)
+	cfi_adjust_cfa_offset (8)
+	cfi_offset (rbx, -24)
 	pushq	%rbp
-	cfi_adjust_cfa_offset(8)
-	cfi_offset(rbp, -32)
+	cfi_adjust_cfa_offset (8)
+	cfi_offset (rbp, -32)
 	movq	%rsp, %r13
-	cfi_def_cfa_register(r13)
+	cfi_def_cfa_register (r13)
 
 	/* Align stack and make room for 2x zmm vectors.  */
 	andq	$-64, %rsp
@@ -207,16 +212,17 @@ L(SPECIAL_VALUES_BRANCH):
 
 	vzeroupper
 
-	/* edx has 1s where there was a special value that needs to be handled
-	   by a tanhf call.  */
+	/* edx has 1s where there was a special value that needs to be
+	   handled by a tanhf call.  */
 	movl	%edx, %ebx
 L(SPECIAL_VALUES_LOOP):
-	# LOE rbx rbp r12 r13 r14 r15
-	/* use rbp as index for special value that is saved across calls to
-	   tanhf. We technically don't need a callee save register here as offset
-	   to rsp is always [0, 56] so we can restore rsp by realigning to 64.
-	   Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
-	   in the loop. Realigning also costs more code size.  */
+
+	/* use rbp as index for special value that is saved across calls
+	   to tanhf. We technically don't need a callee save register
+	   here as offset to rsp is always [0, 56] so we can restore
+	   rsp by realigning to 64. Essentially the tradeoff is 1 extra
+	   save/restore vs 2 extra instructions in the loop. Realigning
+	   also costs more code size.  */
 	xorl	%ebp, %ebp
 	tzcntl	%ebx, %ebp
 
@@ -224,203 +230,141 @@ L(SPECIAL_VALUES_LOOP):
 	vmovss	64(%rsp, %rbp, 4), %xmm0
 	call	tanhf@PLT
 
-	/* No good way to avoid the store-forwarding fault this will cause on
-	   return. `lfence` avoids the SF fault but at greater cost as it
-	   serialized stack/callee save restoration.  */
+	/* No good way to avoid the store-forwarding fault this will
+	   cause on return. `lfence` avoids the SF fault but at greater
+	   cost as it serializes stack/callee save restoration.  */
 	vmovss	%xmm0, (%rsp, %rbp, 4)
 
-	blsrl   %ebx, %ebx
+	blsrl	%ebx, %ebx
 	jnz	L(SPECIAL_VALUES_LOOP)
-	# LOE r12 r13 r14 r15
+
 
 	/* All results have been written to (%rsp).  */
 	vmovaps	(%rsp), %zmm0
 	/* Restore rsp.  */
 	movq	%r13, %rsp
-	cfi_def_cfa_register(rsp)
+	cfi_def_cfa_register (rsp)
 	/* Restore callee save registers.  */
 	popq	%rbp
-	cfi_adjust_cfa_offset(-8)
-	cfi_restore(rbp)
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (rbp)
 	popq	%rbx
-	cfi_adjust_cfa_offset(-8)
-	cfi_restore(rbp)
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (rbx)
 	popq	%r13
-	cfi_adjust_cfa_offset(-8)
-	cfi_restore(r13)
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (r13)
 	ret
 END(_ZGVeN16v_tanhf_skx)
 
-	.section .rodata, "a"
+	.section .rodata.evex512, "a"
 	.align	16
-#ifdef __svml_stanh_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct
-	{
-	__declspec(align(4)) VUINT32 _iExpMantMask_UISA[1][1];
-	__declspec(align(4)) VUINT32 _iMinIdxOfsMask_UISA[1][1];
-	__declspec(align(4)) VUINT32 _iMaxIdxMask_UISA[1][1];
-	__declspec(align(4)) VUINT32 _iExpMask[1][1];
-	__declspec(align(64)) VUINT32 _sC_lo[16][1];
-	__declspec(align(64)) VUINT32 _sC_hi[16][1];
-	__declspec(align(64)) VUINT32 _sP7_lo[16][1];
-	__declspec(align(64)) VUINT32 _sP7_hi[16][1];
-	__declspec(align(64)) VUINT32 _sSignMask[16][1];
-	__declspec(align(64)) VUINT32 _sP6_lo[16][1];
-	__declspec(align(64)) VUINT32 _sP6_hi[16][1];
-	__declspec(align(64)) VUINT32 _sP5_lo[16][1];
-	__declspec(align(64)) VUINT32 _sP5_hi[16][1];
-	__declspec(align(64)) VUINT32 _sP4_lo[16][1];
-	__declspec(align(64)) VUINT32 _sP4_hi[16][1];
-	__declspec(align(64)) VUINT32 _sP3_lo[16][1];
-	__declspec(align(64)) VUINT32 _sP3_hi[16][1];
-	__declspec(align(64)) VUINT32 _sP2_lo[16][1];
-	__declspec(align(64)) VUINT32 _sP2_hi[16][1];
-	__declspec(align(64)) VUINT32 _sP0_lo[16][1];
-	__declspec(align(64)) VUINT32 _sP0_hi[16][1];
-} __svml_stanh_data_internal;
-#endif
-
-__svml_stanh_data_internal:
-	.align	4
-	/* _iExpMantMask_UISA */
-	.long	0x7fe00000
-
-	.align	4
-	/* _iMinIdxOfsMask_UISA */
-	.long	0x3d400000
-
-	.align	4
-	/* _iMaxIdxMask_UISA */
-	.long	0x03e00000
-
-	.align	4
-	/* _iExpMask */
-	.long	0x7f000000
-
-	.align	64
-__svml_stanh_data_internal_al64:
-	.align	64
-	/* _sC_lo */
-	.long	0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
-	.long	0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
-	.long	0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
-	.long	0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
-
-	.align	64
-	/* _sC_hi */
-	.long	0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
-	.long	0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
-	.long	0x40500000, 0x40700000, 0x40900000, 0x40b00000
-	.long	0x40d00000, 0x40f00000, 0x41100000, 0x00000000
-
-	.align	64
-	/* _sP7_lo */
-	.long	0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
-	.long	0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
-	.long	0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
-	.long	0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
-
-	.align	64
-	/* _sP7_hi */
-	.long	0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
-	.long	0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
-	.long	0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
-	.long	0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
 
-	.align	64
-	/* _sSignMask */
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000
-
-	.align	64
-	/* _sP6_lo */
-	.long	0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
-	.long	0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
-	.long	0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
-	.long	0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
-
-	.align	64
-	/* _sP6_hi */
-	.long	0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
-	.long	0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
-	.long	0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
-	.long	0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
-
-	.align	64
-	/* _sP5_lo */
-	.long	0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
-	.long	0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
-	.long	0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
-	.long	0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
-
-	.align	64
-	/* _sP5_hi */
-	.long	0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
-	.long	0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
-	.long	0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
-	.long	0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
-
-	.align	64
-	/* _sP4_lo */
-	.long	0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
-	.long	0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
-	.long	0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
-	.long	0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
-
-	.align	64
-	/* _sP4_hi */
-	.long	0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
-	.long	0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
-	.long	0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
-	.long	0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
-
-	.align	64
-	/* _sP3_lo */
-	.long	0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
-	.long	0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
-	.long	0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
-	.long	0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
-
-	.align	64
-	/* _sP3_hi */
-	.long	0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
-	.long	0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
-	.long	0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
-	.long	0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
-
-	.align	64
-	/* _sP2_lo */
-	.long	0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
-	.long	0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
-	.long	0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
-	.long	0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
-
-	.align	64
-	/* _sP2_hi */
-	.long	0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
-	.long	0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
-	.long	0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
-	.long	0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
-
-	.align	64
-	/* _sP0_lo */
-	.long	0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
-	.long	0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
-	.long	0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
-	.long	0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
-
-	.align	64
-	/* _sP0_hi */
-	.long	0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
-	.long	0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
-	.long	0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
-	.long	0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
+LOCAL_DATA_NAME_UNALIGNED:
+	float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMantMask_UISA, 0x7fe00000)
+	float_block (LOCAL_DATA_NAME_UNALIGNED, _iMinIdxOfsMask_UISA, 0x3d400000)
+	float_block (LOCAL_DATA_NAME_UNALIGNED, _iMaxIdxMask_UISA, 0x03e00000)
+	float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMask, 0x7f000000)
+	.type	LOCAL_DATA_NAME_UNALIGNED, @object
+	.size	LOCAL_DATA_NAME_UNALIGNED, .-LOCAL_DATA_NAME_UNALIGNED
 
 	.align	64
-	.type	__svml_stanh_data_internal_al64, @object
-	.size	__svml_stanh_data_internal_al64, .-__svml_stanh_data_internal_al64
-	.type	__svml_stanh_data_internal, @object
-	.size	__svml_stanh_data_internal, .-__svml_stanh_data_internal
+LOCAL_DATA_NAME:
+	float_block (LOCAL_DATA_NAME, _sC_lo,
+		0x00000000, 0x3d700000, 0x3d900000, 0x3db00000,
+		0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000,
+		0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000,
+		0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000)
+
+	float_block (LOCAL_DATA_NAME, _sC_hi,
+		0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000,
+		0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000,
+		0x40500000, 0x40700000, 0x40900000, 0x40b00000,
+		0x40d00000, 0x40f00000, 0x41100000, 0x00000000)
+
+	float_block (LOCAL_DATA_NAME, _sP7_lo,
+		0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e,
+		0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57,
+		0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f,
+		0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0)
+
+	float_block (LOCAL_DATA_NAME, _sP7_hi,
+		0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b,
+		0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22,
+		0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950,
+		0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000)
+
+	float_block (LOCAL_DATA_NAME, _sP6_lo,
+		0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756,
+		0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0,
+		0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17,
+		0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad)
+
+	float_block (LOCAL_DATA_NAME, _sP6_hi,
+		0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63,
+		0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66,
+		0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3,
+		0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000)
+
+	float_block (LOCAL_DATA_NAME, _sP5_lo,
+		0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d,
+		0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670,
+		0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405,
+		0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4)
+
+	float_block (LOCAL_DATA_NAME, _sP5_hi,
+		0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9,
+		0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd,
+		0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232,
+		0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000)
+
+	float_block (LOCAL_DATA_NAME, _sP4_lo,
+		0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120,
+		0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a,
+		0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88,
+		0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e)
+
+	float_block (LOCAL_DATA_NAME, _sP4_hi,
+		0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96,
+		0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67,
+		0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9,
+		0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000)
+
+	float_block (LOCAL_DATA_NAME, _sP3_lo,
+		0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d,
+		0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3,
+		0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca,
+		0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92)
+
+	float_block (LOCAL_DATA_NAME, _sP3_hi,
+		0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704,
+		0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06,
+		0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2,
+		0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000)
+
+	float_block (LOCAL_DATA_NAME, _sP2_lo,
+		0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f,
+		0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580,
+		0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92,
+		0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360)
+
+	float_block (LOCAL_DATA_NAME, _sP2_hi,
+		0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2,
+		0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4,
+		0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b,
+		0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000)
+
+	float_block (LOCAL_DATA_NAME, _sP0_lo,
+		0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169,
+		0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984,
+		0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163,
+		0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0)
+
+	float_block (LOCAL_DATA_NAME, _sP0_hi,
+		0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53,
+		0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85,
+		0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0,
+		0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000)
+
+	.type	LOCAL_DATA_NAME, @object
+	.size	LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v1 08/27] x86/fpu: Update rodata usage in svml_s_atanhf16_core_avx512.S
  2022-12-07  8:52 [PATCH v1 01/27] x86/fpu: Create helper file for common data macros Noah Goldstein via Libc-alpha
                   ` (5 preceding siblings ...)
  2022-12-07  8:52 ` [PATCH v1 07/27] x86/fpu: Update rodata usage in svml_s_tanhf16_core_avx512.S Noah Goldstein via Libc-alpha
@ 2022-12-07  8:52 ` Noah Goldstein via Libc-alpha
  2022-12-07  8:52 ` [PATCH v1 09/27] x86/fpu: Update rodata usage in svml_s_atanhf4_core_sse4.S Noah Goldstein via Libc-alpha
                   ` (19 subsequent siblings)
  26 siblings, 0 replies; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-07  8:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, andrey.kolesov, carlos

No changes to the logic, just change how rodata is handled.

1. Define the rodatas using the new macros so they check that the
   offset is correct.

2. Use common data where applicable.
---
 .../multiarch/svml_s_atanhf16_core_avx512.S   | 416 +++++++-----------
 1 file changed, 171 insertions(+), 245 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
index 07094cc82f..4e6af8ddf9 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
@@ -31,101 +31,96 @@
  *
  */
 
-/* Offsets for data table __svml_satanh_data_internal_avx512 and
-   __svml_satanh_data_internal_avx512_al64. Ordered by use in the
-   function. On cold-starts this might help the prefetcher. Possibly
-   a better idea is to interleave start/end so that the prefetcher is
-   less likely to detect a stream and pull irrelivant lines into
-   cache.  */
-
-/* Offset into __svml_satanh_data_internal_avx512. 4-byte aligned as
-   the memory is broadcast to {1to16}.  */
-#define AbsMask				0
-
-/* Offset into __svml_satanh_data_internal_avx512_al64. The full value
-   is used here.  */
-#define One				0
-#define AddB5				64
-#define RcpBitMask			128
-#define Log_tbl_L_lo			192
-#define Log_tbl_L_hi			256
-#define Log_tbl_H_lo			320
-#define Log_tbl_H_hi			384
-#define L2H				448
-#define L2L				512
-#define poly_coeff3			576
-#define poly_coeff2			640
-#define poly_coeff1			704
+#define LOCAL_DATA_NAME	__svml_satanh_data_internal
+#include "svml_s_common_evex512_rodata_offsets.h"
+
+/* Offsets for data table __svml_satanh_data_internal are ordered by
+   use in the function. On cold-starts this might help the
+   prefetcher. Possibly a better idea is to interleave start/end so
+   that the prefetcher is less likely to detect a stream and pull
+   irrelevant lines into cache.  */
+
+/* Offset into __svml_satanh_data_internal.  64 bytes each.  */
+#define AddB5	0
+#define RcpBitMask	64
+#define Log_tbl_L_lo	128
+#define Log_tbl_L_hi	192
+#define Log_tbl_H_lo	256
+#define Log_tbl_H_hi	320
+#define L2H	384
+#define L2L	448
+#define poly_coeff3	512
+#define poly_coeff2	576
+#define poly_coeff1	640
 
-#include <sysdep.h>
 
-#define ATANHF_DATA(x)			((x)+__svml_satanh_data_internal_avx512_al64)
+#include <sysdep.h>
 
 	.section .text.evex512, "ax", @progbits
 ENTRY(_ZGVeN16v_atanhf_skx)
-	vandps	AbsMask+__svml_satanh_data_internal_avx512(%rip){1to16}, %zmm0, %zmm6
-	vmovups	ATANHF_DATA(One)(%rip), %zmm4
+	vandps	COMMON_DATA(_AbsMask)(%rip){1to16}, %zmm0, %zmm6
+	vmovups	COMMON_DATA(_OneF)(%rip), %zmm4
 
-	/* 1+y */
+	/* 1+y.  */
 	vaddps	{rn-sae}, %zmm4, %zmm6, %zmm9
 
-	/* 1-y */
+	/* 1-y.  */
 	vsubps	{rn-sae}, %zmm6, %zmm4, %zmm8
 
-	/* round reciprocals to 1+5b mantissas */
-	vmovups	ATANHF_DATA(AddB5)(%rip), %zmm14
-	vmovups	ATANHF_DATA(RcpBitMask)(%rip), %zmm1
+	/* round reciprocals to 1+5b mantissas.  */
+	vmovups	LOCAL_DATA(AddB5)(%rip), %zmm14
+	vmovups	LOCAL_DATA(RcpBitMask)(%rip), %zmm1
 
-	/* RcpP ~ 1/Yp */
+	/* RcpP ~ 1/Yp.  */
 	vrcp14ps %zmm9, %zmm12
 
-	/* RcpM ~ 1/Ym */
+	/* RcpM ~ 1/Ym.  */
 	vrcp14ps %zmm8, %zmm13
 
-	/* Yp_high */
+	/* Yp_high.  */
 	vsubps	{rn-sae}, %zmm4, %zmm9, %zmm2
 
-	/* -Ym_high */
+	/* -Ym_high.  */
 	vsubps	{rn-sae}, %zmm4, %zmm8, %zmm5
 
 
-	/* input outside (-1, 1) ? */
+	/* input outside (-1, 1) ?  */
 	vpaddd	%zmm14, %zmm12, %zmm15
 	vpaddd	%zmm14, %zmm13, %zmm12
 
-	/* Yp_low */
+	/* Yp_low.  */
 	vsubps	{rn-sae}, %zmm2, %zmm6, %zmm3
 	vandps	%zmm1, %zmm15, %zmm7
 	vandps	%zmm1, %zmm12, %zmm12
 
-	/* Ym_low */
+	/* Ym_low.  */
 	vaddps	{rn-sae}, %zmm5, %zmm6, %zmm5
 
-	/* Reduced argument: Rp = (RcpP*Yp - 1)+RcpP*Yp_low */
+	/* Reduced argument: Rp = (RcpP*Yp - 1)+RcpP*Yp_low.  */
 	vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9
 
-	/* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low */
+	/* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low.  */
 	vfmsub213ps {rn-sae}, %zmm4, %zmm12, %zmm8
 
-	vmovups	ATANHF_DATA(Log_tbl_L_lo)(%rip), %zmm10
-	vmovups	ATANHF_DATA(Log_tbl_L_hi)(%rip), %zmm13
+	vmovups	LOCAL_DATA(Log_tbl_L_lo)(%rip), %zmm10
+	vmovups	LOCAL_DATA(Log_tbl_L_hi)(%rip), %zmm13
 
-	/* exponents */
+	/* exponents.  */
 	vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
 	vgetexpps {sae}, %zmm7, %zmm15
 
 
-	/* Table lookups */
+	/* Table lookups.  */
 	vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm8
 	vgetexpps {sae}, %zmm12, %zmm14
 
 
-	/* Prepare table index */
+	/* Prepare table index.  */
 	vpsrld	$18, %zmm7, %zmm3
 	vpsrld	$18, %zmm12, %zmm2
-	vmovups	ATANHF_DATA(Log_tbl_H_lo)(%rip), %zmm11
-	vmovups	ATANHF_DATA(Log_tbl_H_hi)(%rip), %zmm7
-	/* Km-Kp */
+	vmovups	LOCAL_DATA(Log_tbl_H_lo)(%rip), %zmm11
+	vmovups	LOCAL_DATA(Log_tbl_H_hi)(%rip), %zmm7
+	/* Km-Kp.  */
 
 	vmovaps	%zmm3, %zmm5
 	vpermi2ps %zmm13, %zmm10, %zmm3
@@ -135,35 +130,35 @@ ENTRY(_ZGVeN16v_atanhf_skx)
 	vsubps	{rn-sae}, %zmm15, %zmm14, %zmm1
 	vsubps	{rn-sae}, %zmm3, %zmm10, %zmm7
 
-	/* K*L2H + Th */
-	vmovups	ATANHF_DATA(L2H)(%rip), %zmm2
+	/* K*L2H + Th.  */
+	vmovups	LOCAL_DATA(L2H)(%rip), %zmm2
 
-	/* K*L2L + Tl */
-	vmovups	ATANHF_DATA(L2L)(%rip), %zmm3
+	/* K*L2L + Tl.  */
+	vmovups	LOCAL_DATA(L2L)(%rip), %zmm3
 
-	/* table values */
+	/* table values.  */
 	vsubps	{rn-sae}, %zmm5, %zmm11, %zmm5
 	vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm5
 	vfmadd213ps {rn-sae}, %zmm7, %zmm3, %zmm1
-	/* polynomials */
-	vmovups	ATANHF_DATA(poly_coeff3)(%rip), %zmm7
-	vmovups	ATANHF_DATA(poly_coeff2)(%rip), %zmm10
+	/* polynomials.  */
+	vmovups	LOCAL_DATA(poly_coeff3)(%rip), %zmm7
+	vmovups	LOCAL_DATA(poly_coeff2)(%rip), %zmm10
 	vmovaps	%zmm10, %zmm14
 	vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm10
 	vfmadd231ps {rn-sae}, %zmm8, %zmm7, %zmm14
-	vmovups	ATANHF_DATA(poly_coeff1)(%rip), %zmm12
+	vmovups	COMMON_DATA(_Neg5F)(%rip), %zmm12
 	vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm10
 	vfmadd213ps {rn-sae}, %zmm12, %zmm8, %zmm14
 	vfmadd213ps {rn-sae}, %zmm4, %zmm9, %zmm10
 	vfmadd213ps {rn-sae}, %zmm4, %zmm8, %zmm14
 
-	/* (K*L2L + Tl) + Rp*PolyP */
+	/* (K*L2L + Tl) + Rp*PolyP.  */
 	vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm10
 
 	/* zmm12 = zmm12 & (zmm4 | zmm0).  */
 	vpternlogq $0xe0, %zmm0, %zmm4, %zmm12
 
-	/* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM */
+	/* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM.  */
 	vfnmadd213ps {rn-sae}, %zmm5, %zmm8, %zmm14
 	vaddps	{rn-sae}, %zmm14, %zmm10, %zmm8
 
@@ -171,9 +166,9 @@ ENTRY(_ZGVeN16v_atanhf_skx)
 	kmovw	%k0, %edx
 	testl	%edx, %edx
 
-	/* Go to special inputs processing branch */
+	/* Go to special inputs processing branch.  */
 	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx r12 r13 r14 r15 zmm0 zmm8 zmm12
+
 	vmulps	{rn-sae}, %zmm12, %zmm8, %zmm0
 
 	/* No register to restore on fast path.  */
@@ -181,24 +176,24 @@ ENTRY(_ZGVeN16v_atanhf_skx)
 
 	/* Cold case. edx has 1s where there was a special value that
 	   needs to be handled by a atanhf call. Optimize for code size
-	   more so than speed here. */
+	   more so than speed here.  */
 L(SPECIAL_VALUES_BRANCH):
-	# LOE rbx rdx r12 r13 r14 r15 zmm0 zmm8 zmm12
-    /* Use r13 to save/restore the stack. This allows us to use rbp as
-       callee save register saving code size. */
+
+	/* Use r13 to save/restore the stack. This allows us to use rbp as
+	   callee save register saving code size.  */
 	pushq	%r13
-	cfi_adjust_cfa_offset(8)
-	cfi_offset(r13, -16)
-	/* Need to callee save registers to preserve state across tanhf calls.
-	 */
+	cfi_adjust_cfa_offset (8)
+	cfi_offset (r13, -16)
+	/* Need to callee save registers to preserve state across tanhf
+	   calls.  */
 	pushq	%rbx
-	cfi_adjust_cfa_offset(8)
-	cfi_offset(rbx, -24)
+	cfi_adjust_cfa_offset (8)
+	cfi_offset (rbx, -24)
 	pushq	%rbp
-	cfi_adjust_cfa_offset(8)
-	cfi_offset(rbp, -32)
+	cfi_adjust_cfa_offset (8)
+	cfi_offset (rbp, -32)
 	movq	%rsp, %r13
-	cfi_def_cfa_register(r13)
+	cfi_def_cfa_register (r13)
 
 	/* Align stack and make room for 2x zmm vectors.  */
 	andq	$-64, %rsp
@@ -208,16 +203,17 @@ L(SPECIAL_VALUES_BRANCH):
 	vmovaps	%zmm0, 64(%rsp)
 	vzeroupper
 
-	/* edx has 1s where there was a special value that needs to be handled
-	   by a atanhf call.  */
+	/* edx has 1s where there was a special value that needs to be
+	   handled by a atanhf call.  */
 	movl	%edx, %ebx
 L(SPECIAL_VALUES_LOOP):
-	# LOE rbx rbp r12 r13 r14 r15
-	/* use rbp as index for special value that is saved across calls to
-	   atanhf. We technically don't need a callee save register here as offset
-	   to rsp is always [0, 56] so we can restore rsp by realigning to 64.
-	   Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
-	   in the loop. Realigning also costs more code size.  */
+
+	/* use rbp as index for special value that is saved across calls
+	   to atanhf. We technically don't need a callee save register
+	   here as offset to rsp is always [0, 56] so we can restore
+	   rsp by realigning to 64. Essentially the tradeoff is 1 extra
+	   save/restore vs 2 extra instructions in the loop. Realigning
+	   also costs more code size.  */
 	xorl	%ebp, %ebp
 	tzcntl	%ebx, %ebp
 
@@ -230,177 +226,107 @@ L(SPECIAL_VALUES_LOOP):
 	   serialized stack/callee save restoration.  */
 	vmovss	%xmm0, (%rsp, %rbp, 4)
 
-	blsrl   %ebx, %ebx
+	blsrl	%ebx, %ebx
 	jnz	L(SPECIAL_VALUES_LOOP)
-	# LOE r12 r13 r14 r15
+
 
 	/* All results have been written to (%rsp).  */
 	vmovaps	(%rsp), %zmm0
 	/* Restore rsp.  */
 	movq	%r13, %rsp
-	cfi_def_cfa_register(rsp)
+	cfi_def_cfa_register (rsp)
 	/* Restore callee save registers.  */
 	popq	%rbp
-	cfi_adjust_cfa_offset(-8)
-	cfi_restore(rbp)
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (rbp)
 	popq	%rbx
-	cfi_adjust_cfa_offset(-8)
-	cfi_restore(rbp)
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (rbp)
 	popq	%r13
-	cfi_adjust_cfa_offset(-8)
-	cfi_restore(r13)
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (r13)
 	ret
 END(_ZGVeN16v_atanhf_skx)
 
-	.section .rodata, "a"
-	.align	4
-#ifdef __svml_satanh_data_internal_avx512_typedef
-typedef unsigned int VUINT32;
-typedef struct{
-	__declspec(align(4)) VUINT32 AbsMask[1][1];
-	__declspec(align(64)) VUINT32 One[16][1];
-	__declspec(align(64)) VUINT32 AddB5[16][1];
-	__declspec(align(64)) VUINT32 RcpBitMask[16][1];
-	__declspec(align(64)) VUINT32 Log_tbl_L_lo[16][1];
-	__declspec(align(64)) VUINT32 Log_tbl_L_hi[16][1];
-	__declspec(align(64)) VUINT32 Log_tbl_H_lo[16][1];
-	__declspec(align(64)) VUINT32 Log_tbl_H_hi[16][1];
-	__declspec(align(64)) VUINT32 L2H[16][1];
-	__declspec(align(64)) VUINT32 L2L[16][1];
-	__declspec(align(64)) VUINT32 poly_coeff3[16][1];
-	__declspec(align(64)) VUINT32 poly_coeff2[16][1];
-	__declspec(align(64)) VUINT32 poly_coeff1[16][1];
-} __svml_satanh_data_internal_avx512;
-#endif
-__svml_satanh_data_internal_avx512:
-	/* Leave this at front so we can potentially save space due to
-	   smaller alignment constraint.  */
-	.align	4
-    /* AbsMask */
-	.long	0x7fffffff
-	.align	64
-__svml_satanh_data_internal_avx512_al64:
-	/* One */
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	/* AddB5 */
-	.align	64
-	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
-	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
-	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
-	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
-	/* RcpBitMask */
-	.align	64
-	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
-	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
-	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
-	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
-	/* Log_tbl_L_lo */
-	.align	64
-	.long	0x00000000
-	.long	0x3726c39e
-	.long	0x38a30c01
-	.long	0x37528ae5
-	.long	0x38e0edc5
-	.long	0xb8ab41f8
-	.long	0xb7cf8f58
-	.long	0x3896a73d
-	.long	0xb5838656
-	.long	0x380c36af
-	.long	0xb8235454
-	.long	0x3862bae1
-	.long	0x38c5e10e
-	.long	0x38dedfac
-	.long	0x38ebfb5e
-	.long	0xb8e63c9f
-	/* Log_tbl_L_hi */
-	.align	64
-	.long	0xb85c1340
-	.long	0x38777bcd
-	.long	0xb6038656
-	.long	0x37d40984
-	.long	0xb8b85028
-	.long	0xb8ad5a5a
-	.long	0x3865c84a
-	.long	0x38c3d2f5
-	.long	0x383ebce1
-	.long	0xb8a1ed76
-	.long	0xb7a332c4
-	.long	0xb779654f
-	.long	0xb8602f73
-	.long	0x38f85db0
-	.long	0x37b4996f
-	.long	0xb8bfb3ca
-	/* Log_tbl_H_lo */
-	.align	64
-	.long	0x00000000
-	.long	0x3cfc0000
-	.long	0x3d780000
-	.long	0x3db78000
-	.long	0x3df10000
-	.long	0x3e14c000
-	.long	0x3e300000
-	.long	0x3e4a8000
-	.long	0x3e648000
-	.long	0x3e7dc000
-	.long	0x3e8b4000
-	.long	0x3e974000
-	.long	0x3ea30000
-	.long	0x3eae8000
-	.long	0x3eb9c000
-	.long	0x3ec4e000
-	/* Log_tbl_H_hi */
-	.align	64
-	.long	0x3ecfa000
-	.long	0x3eda2000
-	.long	0x3ee48000
-	.long	0x3eeea000
-	.long	0x3ef8a000
-	.long	0x3f013000
-	.long	0x3f05f000
-	.long	0x3f0aa000
-	.long	0x3f0f4000
-	.long	0x3f13d000
-	.long	0x3f184000
-	.long	0x3f1ca000
-	.long	0x3f20f000
-	.long	0x3f252000
-	.long	0x3f295000
-	.long	0x3f2d7000
-	/* L2H = log(2)_high */
-	.align	64
-	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
-	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
-	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
-	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
-	/* L2L = log(2)_low */
-	.align	64
-	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
-	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
-	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
-	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
-	/* poly_coeff3 */
-	.align	64
-	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
-	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
-	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
-	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
-	/* poly_coeff2 */
-	.align	64
-	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
-	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
-	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
-	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
-	/* poly_coeff1 */
-	.align	64
-	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
-	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
-	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
-	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
+	.section .rodata.evex512, "a"
 	.align	64
-	.type	__svml_satanh_data_internal_avx512_al64, @object
-	.size	__svml_satanh_data_internal_avx512_al64, .-__svml_satanh_data_internal_avx512_al64
-	.type	__svml_satanh_data_internal_avx512, @object
-	.size	__svml_satanh_data_internal_avx512, .-__svml_satanh_data_internal_avx512
+LOCAL_DATA_NAME:
+	DATA_VEC (LOCAL_DATA_NAME, AddB5, 0x00020000)
+	DATA_VEC (LOCAL_DATA_NAME, RcpBitMask, 0xfffc0000)
+
+	float_block (LOCAL_DATA_NAME, Log_tbl_L_lo,
+		0x00000000,
+		0x3726c39e,
+		0x38a30c01,
+		0x37528ae5,
+		0x38e0edc5,
+		0xb8ab41f8,
+		0xb7cf8f58,
+		0x3896a73d,
+		0xb5838656,
+		0x380c36af,
+		0xb8235454,
+		0x3862bae1,
+		0x38c5e10e,
+		0x38dedfac,
+		0x38ebfb5e,
+		0xb8e63c9f)
+	float_block (LOCAL_DATA_NAME, Log_tbl_L_hi,
+		0xb85c1340,
+		0x38777bcd,
+		0xb6038656,
+		0x37d40984,
+		0xb8b85028,
+		0xb8ad5a5a,
+		0x3865c84a,
+		0x38c3d2f5,
+		0x383ebce1,
+		0xb8a1ed76,
+		0xb7a332c4,
+		0xb779654f,
+		0xb8602f73,
+		0x38f85db0,
+		0x37b4996f,
+		0xb8bfb3ca)
+	float_block (LOCAL_DATA_NAME, Log_tbl_H_lo,
+		0x00000000,
+		0x3cfc0000,
+		0x3d780000,
+		0x3db78000,
+		0x3df10000,
+		0x3e14c000,
+		0x3e300000,
+		0x3e4a8000,
+		0x3e648000,
+		0x3e7dc000,
+		0x3e8b4000,
+		0x3e974000,
+		0x3ea30000,
+		0x3eae8000,
+		0x3eb9c000,
+		0x3ec4e000)
+	float_block (LOCAL_DATA_NAME, Log_tbl_H_hi,
+		0x3ecfa000,
+		0x3eda2000,
+		0x3ee48000,
+		0x3eeea000,
+		0x3ef8a000,
+		0x3f013000,
+		0x3f05f000,
+		0x3f0aa000,
+		0x3f0f4000,
+		0x3f13d000,
+		0x3f184000,
+		0x3f1ca000,
+		0x3f20f000,
+		0x3f252000,
+		0x3f295000,
+		0x3f2d7000)
+
+	DATA_VEC (LOCAL_DATA_NAME, L2H, 0x3f317000)
+	DATA_VEC (LOCAL_DATA_NAME, L2L, 0x3805fdf4)
+	DATA_VEC (LOCAL_DATA_NAME, poly_coeff3, 0xbe800810)
+	DATA_VEC (LOCAL_DATA_NAME, poly_coeff2, 0x3eaab11e)
+
+	.type	LOCAL_DATA_NAME, @object
+	.size	LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v1 09/27] x86/fpu: Update rodata usage in svml_s_atanhf4_core_sse4.S
  2022-12-07  8:52 [PATCH v1 01/27] x86/fpu: Create helper file for common data macros Noah Goldstein via Libc-alpha
                   ` (6 preceding siblings ...)
  2022-12-07  8:52 ` [PATCH v1 08/27] x86/fpu: Update rodata usage in svml_s_atanhf16_core_avx512.S Noah Goldstein via Libc-alpha
@ 2022-12-07  8:52 ` Noah Goldstein via Libc-alpha
  2022-12-07  8:52 ` [PATCH v1 10/27] x86/fpu: Update rodata usage in svml_s_atanhf8_core_avx2.S Noah Goldstein via Libc-alpha
                   ` (18 subsequent siblings)
  26 siblings, 0 replies; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-07  8:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, andrey.kolesov, carlos

No changes to the logic, just change how rodata is handled.

1. Define the rodatas using the new macros so they check that the
   offset is correct.

2. Use common data where applicable.
---
 .../fpu/multiarch/svml_s_atanhf4_core_sse4.S  | 240 ++++++++----------
 1 file changed, 102 insertions(+), 138 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
index 37200b3601..da5744506f 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
@@ -30,46 +30,45 @@
  *
  */
 
-/* Offsets for data table __svml_satanh_data_internal_avx512. Ordered
-   by use in the function. On cold-starts this might help the
-   prefetcher. Possibly a better idea is to interleave start/end so
-   that the prefetcher is less likely to detect a stream and pull
-   irrelivant lines into cache.  */
-#define sOne				0
-#define SgnMask				16
-#define sTopMask12			32
-#define iBrkValue			48
-#define iOffExpoMask			64
-#define sPoly				80
-#define sLn2				208
-#define TinyRange			224
+#define LOCAL_DATA_NAME	__svml_satanh_data_internal
+#include "svml_s_common_sse4_rodata_offsets.h"
+
+/* Offsets for data table __svml_satanh_data_internal.  */
+#define _Poly_1	0
+#define _Poly_2	16
+#define _Poly_3	32
+#define _Poly_4	48
+#define _Poly_5	64
+#define _Poly_6	80
+#define _Poly_7	96
+#define _TinyRange	112
 
 #include <sysdep.h>
-#define ATANHF_DATA(x)			((x)+__svml_satanh_data_internal)
 
 	.section .text.sse4, "ax", @progbits
 ENTRY(_ZGVbN4v_atanhf_sse4)
 	movaps	%xmm0, %xmm5
 
-	/* Load constants including One = 1 */
-	movups	ATANHF_DATA(sOne)(%rip), %xmm4
+	/* Load constants including One = 1.  */
+	movups	COMMON_DATA(_OneF)(%rip), %xmm4
 	movaps	%xmm5, %xmm3
 
-	/* Strip off the sign, so treat X as positive until right at the end */
-	movups	ATANHF_DATA(SgnMask)(%rip), %xmm1
+	/* Strip off the sign, so treat X as positive until right at the
+	   end.  */
+	movups	COMMON_DATA(_AbsMask)(%rip), %xmm1
 	movaps	%xmm4, %xmm2
 	andps	%xmm1, %xmm0
 	movaps	%xmm4, %xmm10
-	movups	ATANHF_DATA(sTopMask12)(%rip), %xmm11
+	movups	COMMON_DATA(_Neg4096)(%rip), %xmm11
 	movaps	%xmm4, %xmm14
 	movaps	%xmm11, %xmm9
 
 
-	/*
-	 * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
-	 * the upper part UHi being <= 12 bits long. Then we have
-	 * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
-	 */
+	/* Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two
+	   pieces, the upper part UHi being <= 12 bits long. Then we
+	   have:
+	    atanh(X) = 1/2 * log((1 + X) / (1 - X))
+	             = 1/2 * log1p(V / (UHi + ULo)).  */
 	movaps	%xmm0, %xmm6
 	mulps	%xmm5, %xmm3
 	subps	%xmm0, %xmm2
@@ -80,65 +79,61 @@ ENTRY(_ZGVbN4v_atanhf_sse4)
 	andps	%xmm2, %xmm9
 
 
-	/*
-	 * Check whether |X| < 1, in which case we use the main function.
-	 * Otherwise set the rangemask so that the callout will get used.
-	 * Note that this will also use the callout for NaNs since not(NaN < 1).
-	 */
+	/* Check whether |X| < 1, in which case we use the main
+	   function. Otherwise set the rangemask so that the callout
+	   will get used. Note that this will also use the callout for
+	   NaNs since not(NaN < 1).  */
 	rcpps	%xmm9, %xmm7
 	subps	%xmm9, %xmm2
 	andps	%xmm11, %xmm7
 
 
-	/*
-	 * Split V as well into upper 12 bits and lower part, so that we can get
-	 * a preliminary quotient estimate without rounding error.
-	 */
+	/* Split V as well into upper 12 bits and lower part, so that we
+	   can get a preliminary quotient estimate without rounding
+	   error.  */
 	andps	%xmm6, %xmm11
 	mulps	%xmm7, %xmm9
 	addps	%xmm2, %xmm10
 	subps	%xmm11, %xmm6
 
-	/* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
+	/* Hence get initial quotient estimate:
+	   QHi + QLo = R * VHi + R * VLo.  */
 	mulps	%xmm7, %xmm11
 	mulps	%xmm7, %xmm10
 	subps	%xmm9, %xmm14
 	mulps	%xmm6, %xmm7
 	subps	%xmm10, %xmm14
 
-	/* Compute D = E + E^2 */
+	/* Compute D = E + E^2.  */
 	movaps	%xmm14, %xmm13
 	movaps	%xmm4, %xmm8
 	mulps	%xmm14, %xmm13
 
-	/* reduction: compute r,n */
-	movdqu	ATANHF_DATA(iBrkValue)(%rip), %xmm9
+	/* reduction: compute r,n.  */
+	movdqu	COMMON_DATA(_IBrkValue)(%rip), %xmm9
 	addps	%xmm13, %xmm14
 
-	/*
-	 * Compute R * (VHi + VLo) * (1 + E + E^2)
-	 * = R *  (VHi + VLo) * (1 + D)
-	 * = QHi + (QHi * D + QLo + QLo * D)
-	 */
+	/* Compute R * (VHi + VLo) * (1 + E + E^2)
+	   = R *  (VHi + VLo) * (1 + D)
+	   = QHi + (QHi * D + QLo + QLo * D).  */
 	movaps	%xmm14, %xmm2
 	mulps	%xmm7, %xmm14
 	mulps	%xmm11, %xmm2
 	addps	%xmm14, %xmm7
-	movdqu	ATANHF_DATA(iOffExpoMask)(%rip), %xmm12
+	movdqu	COMMON_DATA(_NotiOffExpoMask)(%rip), %xmm12
 	movaps	%xmm4, %xmm14
 
-	/* Record the sign for eventual reincorporation. */
+	/* Record the sign for eventual reincorporation.  */
 	addps	%xmm7, %xmm2
 
 
-	/*
-	 * Now finally accumulate the high and low parts of the
-	 * argument to log1p, H + L, with a final compensated summation.
-	 */
+	/* Now finally accumulate the high and low parts of the
+	   argument to log1p, H + L, with a final compensated summation.  */
 	movaps	%xmm2, %xmm6
 	andnps	%xmm5, %xmm1
 	movaps	%xmm4, %xmm7
-	/* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
+	/* Or the sign bit in with the tiny result to handle atanh(-0)
+	   correctly.  */
 	addps	%xmm11, %xmm6
 	maxps	%xmm6, %xmm7
 	minps	%xmm6, %xmm8
@@ -149,43 +144,43 @@ ENTRY(_ZGVbN4v_atanhf_sse4)
 	subps	%xmm10, %xmm7
 	psubd	%xmm9, %xmm10
 	addps	%xmm8, %xmm7
-	pand	%xmm10, %xmm12
+	pandn	%xmm10, %xmm12
 	psrad	$23, %xmm10
 	cvtdq2ps %xmm10, %xmm13
 	addps	%xmm7, %xmm2
 
-	/* final reconstruction */
+	/* final reconstruction.  */
 	pslld	$23, %xmm10
 	paddd	%xmm9, %xmm12
 	psubd	%xmm10, %xmm14
 
-	/* polynomial evaluation */
+	/* polynomial evaluation.  */
 	subps	%xmm4, %xmm12
 	mulps	%xmm14, %xmm2
-	movups	ATANHF_DATA(sPoly+0)(%rip), %xmm7
+	movups	LOCAL_DATA(_Poly_1)(%rip), %xmm7
 	addps	%xmm12, %xmm2
 	mulps	%xmm2, %xmm7
 
 
-	/* Finally, halve the result and reincorporate the sign */
-	addps	ATANHF_DATA(sPoly+16)(%rip), %xmm7
+	/* Finally, halve the result and reincorporate the sign.  */
+	addps	LOCAL_DATA(_Poly_2)(%rip), %xmm7
 	mulps	%xmm2, %xmm7
-	addps	ATANHF_DATA(sPoly+32)(%rip), %xmm7
+	addps	LOCAL_DATA(_Poly_3)(%rip), %xmm7
 	mulps	%xmm2, %xmm7
-	addps	ATANHF_DATA(sPoly+48)(%rip), %xmm7
+	addps	LOCAL_DATA(_Poly_4)(%rip), %xmm7
 	mulps	%xmm2, %xmm7
-	addps	ATANHF_DATA(sPoly+64)(%rip), %xmm7
+	addps	LOCAL_DATA(_Poly_5)(%rip), %xmm7
 	mulps	%xmm2, %xmm7
-	addps	ATANHF_DATA(sPoly+80)(%rip), %xmm7
+	addps	LOCAL_DATA(_Poly_6)(%rip), %xmm7
 	mulps	%xmm2, %xmm7
-	addps	ATANHF_DATA(sPoly+96)(%rip), %xmm7
+	addps	LOCAL_DATA(_Poly_7)(%rip), %xmm7
 	mulps	%xmm2, %xmm7
-	movaps	ATANHF_DATA(sPoly+112)(%rip), %xmm6
+	movaps	COMMON_DATA(_Neg5F)(%rip), %xmm6
 	addps	%xmm6, %xmm7
 	mulps	%xmm2, %xmm7
 	mulps	%xmm2, %xmm7
-	mulps	ATANHF_DATA(sLn2)(%rip), %xmm13
-	/* We can build `sHalf` with `sPoly & sOne`.  */
+	mulps	COMMON_DATA(_Ln2)(%rip), %xmm13
+	/* We can build `sHalf` with `_Neg5F & _OneF`.  */
 	andps	%xmm4, %xmm6
 	orps	%xmm1, %xmm3
 	xorps	%xmm6, %xmm1
@@ -197,7 +192,7 @@ ENTRY(_ZGVbN4v_atanhf_sse4)
 	/* Finish check of NaNs.  */
 	cmpleps	%xmm0, %xmm4
 	movmskps %xmm4, %edx
-	cmpltps	ATANHF_DATA(TinyRange)(%rip), %xmm0
+	cmpltps	LOCAL_DATA(_TinyRange)(%rip), %xmm0
 
 	andps	%xmm0, %xmm3
 	andnps	%xmm1, %xmm0
@@ -206,115 +201,84 @@ ENTRY(_ZGVbN4v_atanhf_sse4)
 	testl	%edx, %edx
 	/* Go to special inputs processing branch.  */
 	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx rbp r12 r13 r14 r15 xmm0
+
 	/* No registers to restore on fast path.  */
 	ret
 
 
 	/* Cold case. edx has 1s where there was a special value that
 	   needs to be handled by a atanhf call. Optimize for code size
-	   more so than speed here. */
+	   more so than speed here.  */
 L(SPECIAL_VALUES_BRANCH):
-	# LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm5
+
 	/* Stack coming in 16-byte aligned. Set 8-byte misaligned so on
-       call entry will be 16-byte aligned. */
+	   call entry will be 16-byte aligned.  */
 	subq	$56, %rsp
-	cfi_def_cfa_offset(64)
+	cfi_def_cfa_offset (64)
 	movups	%xmm0, 24(%rsp)
 	movups	%xmm5, 40(%rsp)
 
 	/* Use rbx/rbp for callee save registers as they get short
-       encoding for many instructions (as compared with r12/r13). */
+	   encoding for many instructions (as compared with r12/r13).  */
 	movq	%rbx, (%rsp)
-	cfi_offset(rbx, -64)
+	cfi_offset (rbx, -64)
 	movq	%rbp, 8(%rsp)
-	cfi_offset(rbp, -56)
-	/* edx has 1s where there was a special value that needs to be handled
-	   by a tanhf call.  */
+	cfi_offset (rbp, -56)
+	/* edx has 1s where there was a special value that needs to be
+	   handled by a tanhf call.  */
 	movl	%edx, %ebx
 L(SPECIAL_VALUES_LOOP):
-	# LOE rbx rbp r12 r13 r14 r15
-	/* use rbp as index for special value that is saved across calls to
-	   tanhf. We technically don't need a callee save register here as offset
-	   to rsp is always [0, 12] so we can restore rsp by realigning to 64.
-	   Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
-	   in the loop.  */
+
+	/* use rbp as index for special value that is saved across calls
+	   to tanhf. We technically don't need a callee save register
+	   here as offset to rsp is always [0, 12] so we can restore
+	   rsp by realigning to 64. Essentially the tradeoff is 1 extra
+	   save/restore vs 2 extra instructions in the loop.  */
 	xorl	%ebp, %ebp
 	bsfl	%ebx, %ebp
 
 	/* Scalar math fucntion call to process special input.  */
 	movss	40(%rsp, %rbp, 4), %xmm0
 	call	atanhf@PLT
-	/* No good way to avoid the store-forwarding fault this will cause on
-	   return. `lfence` avoids the SF fault but at greater cost as it
-	   serialized stack/callee save restoration.  */
+	/* No good way to avoid the store-forwarding fault this will
+	   cause on return. `lfence` avoids the SF fault but at greater
+	   cost as it serialized stack/callee save restoration.  */
 	movss	%xmm0, 24(%rsp, %rbp, 4)
 
 	leal	-1(%rbx), %eax
 	andl	%eax, %ebx
 	jnz	L(SPECIAL_VALUES_LOOP)
-	# LOE r12 r13 r14 r15
+
 	/* All results have been written to 24(%rsp).  */
 	movups	24(%rsp), %xmm0
 	movq	(%rsp), %rbx
-	cfi_restore(rbx)
+	cfi_restore (rbx)
 	movq	8(%rsp), %rbp
-	cfi_restore(rbp)
+	cfi_restore (rbp)
 	addq	$56, %rsp
-	cfi_def_cfa_offset(8)
+	cfi_def_cfa_offset (8)
 	ret
 END(_ZGVbN4v_atanhf_sse4)
 
-	.section .rodata, "a"
+	.section .rodata.sse4, "a"
 	.align	16
 
-#ifdef __svml_satanh_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct{
-	__declspec(align(16)) VUINT32 sOne[4][1];
-	__declspec(align(16)) VUINT32 SgnMask[4][1];
-	__declspec(align(16)) VUINT32 sTopMask12[4][1];
-	__declspec(align(16)) VUINT32 iBrkValue[4][1];
-	__declspec(align(16)) VUINT32 iOffExpoMask[4][1];
-	__declspec(align(16)) VUINT32 sPoly[8][4][1];
-	__declspec(align(16)) VUINT32 sLn2[4][1];
-	__declspec(align(16)) VUINT32 TinyRange[4][1];
-} __svml_satanh_data_internal;
-#endif
-
-__svml_satanh_data_internal:
-	/* sOne = SP 1.0 */
-	.align	16
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	/* SgnMask */
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
-	/* sTopMask12 */
-	.align	16
-	.long	0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
-	/* iBrkValue = SP 2/3 */
-	.align	16
-	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
-	/* iOffExpoMask = SP significand mask ==*/
-	.align	16
-	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
-
-	/* sPoly[] = SP polynomial */
-	.align	16
-	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
-	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
-	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
-	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
-	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
-	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
-	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
-	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
-
-	/* sLn2 = SP ln(2) */
-	.align	16
-	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
-	/* TinyRange */
-	.align	16
-	.long	0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
-	.align	16
-	.type	__svml_satanh_data_internal, @object
-	.size	__svml_satanh_data_internal, .-__svml_satanh_data_internal
+LOCAL_DATA_NAME:
+	/* _Poly[] = SP polynomial.  */
+	/* 1.3820238411426544189453125e-01 P7.  */
+	DATA_VEC (LOCAL_DATA_NAME, _Poly_1, 0x3e0d84ed)
+	/* -1.5122179687023162841796875e-01 P6.  */
+	DATA_VEC (LOCAL_DATA_NAME, _Poly_2, 0xbe1ad9e3)
+	/* 1.4042308926582336425781250e-01 P5.  */
+	DATA_VEC (LOCAL_DATA_NAME, _Poly_3, 0x3e0fcb12)
+	/* -1.6472326219081878662109375e-01 P4.  */
+	DATA_VEC (LOCAL_DATA_NAME, _Poly_4, 0xbe28ad37)
+	/* 2.0007920265197753906250000e-01 P3.  */
+	DATA_VEC (LOCAL_DATA_NAME, _Poly_5, 0x3e4ce190)
+	/* -2.5004237890243530273437500e-01 P2.  */
+	DATA_VEC (LOCAL_DATA_NAME, _Poly_6, 0xbe80058e)
+	/* 3.3333265781402587890625000e-01 P1.  */
+	DATA_VEC (LOCAL_DATA_NAME, _Poly_7, 0x3eaaaa94)
+	DATA_VEC (LOCAL_DATA_NAME, _TinyRange, 0x0C000000)
+	.type	LOCAL_DATA_NAME, @object
+	.size	LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v1 10/27] x86/fpu: Update rodata usage in svml_s_atanhf8_core_avx2.S
  2022-12-07  8:52 [PATCH v1 01/27] x86/fpu: Create helper file for common data macros Noah Goldstein via Libc-alpha
                   ` (7 preceding siblings ...)
  2022-12-07  8:52 ` [PATCH v1 09/27] x86/fpu: Update rodata usage in svml_s_atanhf4_core_sse4.S Noah Goldstein via Libc-alpha
@ 2022-12-07  8:52 ` Noah Goldstein via Libc-alpha
  2022-12-07  8:52 ` [PATCH v1 11/27] x86/fpu: Optimize svml_s_atanf16_core_avx512.S Noah Goldstein via Libc-alpha
                   ` (17 subsequent siblings)
  26 siblings, 0 replies; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-07  8:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, andrey.kolesov, carlos

No changes to the logic, just change how rodata is handled.

1. Define the rodatas using the new macros so they check that the
   offset is correct.

2. Use common data where applicable.
---
 .../fpu/multiarch/svml_s_atanhf8_core_avx2.S  | 308 ++++++++----------
 1 file changed, 129 insertions(+), 179 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
index 49ffd7a9b2..0c93da0166 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
@@ -1,5 +1,5 @@
 /* Function atanhf vectorized with AVX2.
-   Copyright (C) 2021-2022 Free Software Foundation, Inc.
+   Copyright (C) 2021-2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -30,181 +30,178 @@
  *
  */
 
-/* Offsets for data table __svml_satanh_data_internal_avx512. Ordered
+
+#define LOCAL_DATA_NAME	__svml_satanh_data_internal
+#include "svml_s_common_avx2_rodata_offsets.h"
+
+/* Offsets for data table __svml_satanh_data_internal. Ordered
    by use in the function. On cold-starts this might hhelp the
    prefetcher. Possibly a better idea is to interleave start/end so
    that the prefetcher is less likely to detect a stream and pull
    irrelivant lines into cache.  */
-#define SgnMask				0
-#define sOne				32
-#define sTopMask12			64
-#define TinyRange			96
-#define iBrkValue			128
-#define iOffExpoMask			160
-#define sPoly				192
-#define sLn2				448
-#define sHalf				480
+#define _TinyRange	0
+#define _Poly_1	32
+#define _Poly_2	64
+#define _Poly_3	96
+#define _Poly_4	128
+#define _Poly_5	160
+#define _Poly_6	192
+#define _Poly_7	224
+#define _Half	256
 
 #include <sysdep.h>
-#define ATANHF_DATA(x)			((x)+__svml_satanh_data_internal)
 
 	.section .text.avx2, "ax", @progbits
 ENTRY(_ZGVdN8v_atanhf_avx2)
-	/* Strip off the sign, so treat X as positive until right at the end */
-	vmovaps	ATANHF_DATA(SgnMask)(%rip), %ymm2
+	/* Strip off the sign, so treat X as positive until right at the end.  */
+	vmovaps	COMMON_DATA(_AbsMask)(%rip), %ymm2
 	vandps	%ymm2, %ymm0, %ymm3
-	/* Load constants including One = 1 */
-	vmovups	ATANHF_DATA(sOne)(%rip), %ymm5
+	/* Load constants including One = 1.  */
+	vmovups	COMMON_DATA(_OneF)(%rip), %ymm5
 	vsubps	%ymm3, %ymm5, %ymm1
-	vmovups	ATANHF_DATA(sTopMask12)(%rip), %ymm4
+	vmovups	COMMON_DATA(_Neg4096)(%rip), %ymm4
 
 	vrcpps	%ymm1, %ymm7
 	vsubps	%ymm1, %ymm5, %ymm9
 	vandps	%ymm4, %ymm7, %ymm6
 	vsubps	%ymm3, %ymm9, %ymm7
 
-	/* No need to split sU when FMA is available */
+	/* No need to split sU when FMA is available.  */
 	vfnmadd213ps %ymm5, %ymm6, %ymm1
 	vmovaps	%ymm0, %ymm8
 	vfmadd213ps %ymm0, %ymm0, %ymm0
 	vfnmadd231ps %ymm6, %ymm7, %ymm1
 
-	/*
-	 * Check whether |X| < 1, in which case we use the main function.
-	 * Otherwise set the rangemask so that the callout will get used.
-	 * Note that this will also use the callout for NaNs since not(NaN < 1).
-	 */
+	/* Check whether |X| < 1, in which case we use the main
+	   function. Otherwise set the rangemask so that the callout
+	   will get used. Note that this will also use the callout for
+	   NaNs since not(NaN < 1).  */
 	vcmpnlt_uqps %ymm5, %ymm3, %ymm14
-	vcmplt_oqps ATANHF_DATA(TinyRange)(%rip), %ymm3, %ymm15
+	vcmplt_oqps LOCAL_DATA(_TinyRange)(%rip), %ymm3, %ymm15
 
-	/*
-	 * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
-	 * the upper part UHi being <= 12 bits long. Then we have
-	 * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
-	 */
+	/* Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two
+	   pieces, the upper part UHi being <= 12 bits long. Then we
+	   have atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V
+	   / (UHi + ULo)).  */
 	vaddps	%ymm3, %ymm3, %ymm3
 
-	/*
-	 * Split V as well into upper 12 bits and lower part, so that we can get
-	 * a preliminary quotient estimate without rounding error.
-	 */
+	/* Split V as well into upper 12 bits and lower part, so that we
+	   can get a preliminary quotient estimate without rounding
+	   error.  */
 	vandps	%ymm4, %ymm3, %ymm4
 	vsubps	%ymm4, %ymm3, %ymm7
 
-	/* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
+	/* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo.  */
 	vmulps	%ymm4, %ymm6, %ymm4
 
-	/* Compute D = E + E^2 */
+	/* Compute D = E + E^2.  */
 	vfmadd213ps %ymm1, %ymm1, %ymm1
 
 	/* Record the sign for eventual reincorporation.  */
 	vandnps	%ymm8, %ymm2, %ymm3
 
-	/* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
+	/* Or the sign bit in with the tiny result to handle atanh(-0)
+	   correctly.  */
 	vorps	%ymm3, %ymm0, %ymm13
 	vmulps	%ymm7, %ymm6, %ymm2
 
-	/*
-	 * Compute R * (VHi + VLo) * (1 + E + E^2)
-	 * = R *  (VHi + VLo) * (1 + D)
-	 * = QHi + (QHi * D + QLo + QLo * D)
-	 */
-
-	/*
-	 * If less precision is acceptable the `vmulps %ymm1, %ymm4, %ymm9;
-	 * vaddps %ymm1, %ymm9, %ymm1` can be replaced with
-	 * `vfmadd231ps %ymm1, %ymm4, %ymm4`.
-	 */
+	/* Compute R * (VHi + VLo) * (1 + E + E^2)
+	   = R *  (VHi + VLo) * (1 + D)
+	   = QHi + (QHi * D + QLo + QLo * D).  */
+
+	/* If less precision is acceptable the:
+	   `vmulps %ymm1, %ymm4, %ymm6; vaddps %ymm1, %ymm9, %ymm1`
+	   can be replaced with:
+	   `vfmadd231ps %ymm1, %ymm4, %ymm4`.  */
 	vmulps	%ymm1, %ymm4, %ymm6
 	vfmadd213ps %ymm2, %ymm2, %ymm1
 	vaddps	%ymm1, %ymm6, %ymm1
 
-	/*
-	 * Now finally accumulate the high and low parts of the
-	 * argument to log1p, H + L, with a final compensated summation.
-	 */
+	/* Now finally accumulate the high and low parts of the
+	   argument to log1p, H + L, with a final compensated summation.  */
 	vaddps	%ymm1, %ymm4, %ymm2
 
-	/* reduction: compute r, n */
-	vmovups	ATANHF_DATA(iBrkValue)(%rip), %ymm9
+	/* reduction: compute r, n.  */
+	vmovups	COMMON_DATA(_IBrkValue)(%rip), %ymm9
 
-	/*
-	 * Now we feed into the log1p code, using H in place of _VARG1 and
-	 * later incorporating L into the reduced argument.
-	 * compute 1+x as high, low parts
-	 */
+	/* Now we feed into the log1p code, using H in place of _VARG1 and
+	   later incorporating L into the reduced argument.
+	   compute 1+x as high, low parts.  */
 	vmaxps	%ymm2, %ymm5, %ymm0
 	vminps	%ymm2, %ymm5, %ymm6
 
-	/* This is needed for rounding (see `vaddps %ymm1, %ymm4, %ymm2`).  */
+	/* This is needed for rounding (see `vaddps %ymm1, %ymm4,
+	   %ymm2`).  */
 	vsubps	%ymm2, %ymm4, %ymm2
 	vaddps	%ymm6, %ymm0, %ymm4
 	vpsubd	%ymm9, %ymm4, %ymm7
 	vsubps	%ymm4, %ymm0, %ymm4
 	vaddps	%ymm2, %ymm1, %ymm2
-	vmovaps	ATANHF_DATA(iOffExpoMask)(%rip), %ymm1
+	vmovaps	COMMON_DATA(_NotiOffExpoMask)(%rip), %ymm1
 
-	vandps	%ymm1, %ymm7, %ymm0
+	vandnps	%ymm7, %ymm1, %ymm0
 	vaddps	%ymm4, %ymm6, %ymm4
-	vandnps	%ymm7, %ymm1, %ymm6
-	vmovups	ATANHF_DATA(sPoly+0)(%rip), %ymm1
+	vandps	%ymm7, %ymm1, %ymm6
+
+	vmovups	LOCAL_DATA(_Poly_1)(%rip), %ymm1
 	vpaddd	%ymm9, %ymm0, %ymm0
 	vaddps	%ymm4, %ymm2, %ymm4
 	vpsubd	%ymm6, %ymm5, %ymm6
 
-	/* polynomial evaluation */
+	/* polynomial evaluation.  */
 	vsubps	%ymm5, %ymm0, %ymm2
 	vfmadd231ps %ymm4, %ymm6, %ymm2
-	vfmadd213ps ATANHF_DATA(sPoly+32)(%rip), %ymm2, %ymm1
-	vfmadd213ps ATANHF_DATA(sPoly+64)(%rip), %ymm2, %ymm1
-	vfmadd213ps ATANHF_DATA(sPoly+96)(%rip), %ymm2, %ymm1
-	vfmadd213ps ATANHF_DATA(sPoly+128)(%rip), %ymm2, %ymm1
-	vfmadd213ps ATANHF_DATA(sPoly+160)(%rip), %ymm2, %ymm1
-	vfmadd213ps ATANHF_DATA(sPoly+192)(%rip), %ymm2, %ymm1
-	vfmadd213ps ATANHF_DATA(sPoly+224)(%rip), %ymm2, %ymm1
+
+	vfmadd213ps LOCAL_DATA(_Poly_2)(%rip), %ymm2, %ymm1
+	vfmadd213ps LOCAL_DATA(_Poly_3)(%rip), %ymm2, %ymm1
+	vfmadd213ps LOCAL_DATA(_Poly_4)(%rip), %ymm2, %ymm1
+	vfmadd213ps LOCAL_DATA(_Poly_5)(%rip), %ymm2, %ymm1
+	vfmadd213ps LOCAL_DATA(_Poly_6)(%rip), %ymm2, %ymm1
+	vfmadd213ps LOCAL_DATA(_Poly_7)(%rip), %ymm2, %ymm1
+	vfmadd213ps COMMON_DATA(_Neg5F)(%rip), %ymm2, %ymm1
 
 	vmulps	%ymm1, %ymm2, %ymm1
 	vfmadd213ps %ymm2, %ymm2, %ymm1
 
-	/* final reconstruction */
+	/* final reconstruction.  */
 	vpsrad	$23, %ymm7, %ymm6
 	vcvtdq2ps %ymm6, %ymm2
-	vfmadd132ps ATANHF_DATA(sLn2)(%rip), %ymm1, %ymm2
+	vfmadd132ps COMMON_DATA(_Ln2)(%rip), %ymm1, %ymm2
 
-	/* Finally, halve the result and reincorporate the sign */
-	vxorps	ATANHF_DATA(sHalf)(%rip), %ymm3, %ymm3
+	/* Finally, halve the result and reincorporate the sign.  */
+	vxorps	LOCAL_DATA(_Half)(%rip), %ymm3, %ymm3
 	vmulps	%ymm2, %ymm3, %ymm2
 	vmovmskps %ymm14, %edx
 	testl	%edx, %edx
 
 	vblendvps %ymm15, %ymm13, %ymm2, %ymm0
-	/* Go to special inputs processing branch */
+	/* Go to special inputs processing branch.  */
 	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx rdx r12 r13 r14 r15 ymm0
+
 	/* No registers to restore on fast path.  */
 	ret
 
 
 	/* Cold case. edx has 1s where there was a special value that
 	   needs to be handled by a atanhf call. Optimize for code size
-	   more so than speed here. */
+	   more so than speed here.  */
 L(SPECIAL_VALUES_BRANCH):
-	# LOE rbx rdx r12 r13 r14 r15 ymm0 ymm8
-    /* Use r13 to save/restore the stack. This allows us to use rbp as
-       callee save register saving code size. */
+
+	/* Use r13 to save/restore the stack. This allows us to use rbp
+	   as callee save register saving code size.  */
 	pushq	%r13
-	cfi_adjust_cfa_offset(8)
-	cfi_offset(r13, -16)
-	/* Need to callee save registers to preserve state across tanhf calls.
-	 */
+	cfi_adjust_cfa_offset (8)
+	cfi_offset (r13, -16)
+	/* Need to callee save registers to preserve state across tanhf
+	   calls.  */
 	pushq	%rbx
-	cfi_adjust_cfa_offset(8)
-	cfi_offset(rbx, -24)
+	cfi_adjust_cfa_offset (8)
+	cfi_offset (rbx, -24)
 	pushq	%rbp
-	cfi_adjust_cfa_offset(8)
-	cfi_offset(rbp, -32)
+	cfi_adjust_cfa_offset (8)
+	cfi_offset (rbp, -32)
 	movq	%rsp, %r13
-	cfi_def_cfa_register(r13)
+	cfi_def_cfa_register (r13)
 
 	/* Align stack and make room for 2x ymm vectors.  */
 	andq	$-32, %rsp
@@ -217,16 +214,17 @@ L(SPECIAL_VALUES_BRANCH):
 
 	vzeroupper
 
-	/* edx has 1s where there was a special value that needs to be handled
-	   by a atanhf call.  */
+	/* edx has 1s where there was a special value that needs to be
+	   handled by a atanhf call.  */
 	movl	%edx, %ebx
 L(SPECIAL_VALUES_LOOP):
-	# LOE rbx rbp r12 r13 r14 r15
-	/* use rbp as index for special value that is saved across calls to
-	   atanhf. We technically don't need a callee save register here as offset
-	   to rsp is always [0, 28] so we can restore rsp by realigning to 64.
-	   Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
-	   in the loop. Realigning also costs more code size.  */
+
+	/* use rbp as index for special value that is saved across calls
+	   to atanhf. We technically don't need a callee save register
+	   here as offset to rsp is always [0, 28] so we can restore
+	   rsp by realigning to 64. Essentially the tradeoff is 1 extra
+	   save/restore vs 2 extra instructions in the loop. Realigning
+	   also costs more code size.  */
 	xorl	%ebp, %ebp
 	tzcntl	%ebx, %ebp
 
@@ -234,100 +232,52 @@ L(SPECIAL_VALUES_LOOP):
 	vmovss	32(%rsp, %rbp, 4), %xmm0
 	call	atanhf@PLT
 
-	/* No good way to avoid the store-forwarding fault this will cause on
-	   return. `lfence` avoids the SF fault but at greater cost as it
-	   serialized stack/callee save restoration.  */
+	/* No good way to avoid the store-forwarding fault this will
+	   cause on return. `lfence` avoids the SF fault but at greater
+	   cost as it serialized stack/callee save restoration.  */
 	vmovss	%xmm0, (%rsp, %rbp, 4)
 
-	blsrl   %ebx, %ebx
+	blsrl	%ebx, %ebx
 	jnz	L(SPECIAL_VALUES_LOOP)
-	# LOE r12 r13 r14 r15
-
 
 	/* All results have been written to (%rsp).  */
 	vmovups	(%rsp), %ymm0
 	/* Restore rsp.  */
 	movq	%r13, %rsp
-	cfi_def_cfa_register(rsp)
+	cfi_def_cfa_register (rsp)
 	/* Restore callee save registers.  */
 	popq	%rbp
-	cfi_adjust_cfa_offset(-8)
-	cfi_restore(rbp)
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (rbp)
 	popq	%rbx
-	cfi_adjust_cfa_offset(-8)
-	cfi_restore(rbp)
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (rbx)
 	popq	%r13
-	cfi_adjust_cfa_offset(-8)
-	cfi_restore(r13)
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (r13)
 	ret
 END(_ZGVdN8v_atanhf_avx2)
 
-	.section .rodata, "a"
-	.align	32
-#ifdef __svml_satanh_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct{
-	__declspec(align(32)) VUINT32 SgnMask[8][1];
-	__declspec(align(32)) VUINT32 sOne[8][1];
-	__declspec(align(32)) VUINT32 sTopMask12[8][1];
-	__declspec(align(32)) VUINT32 TinyRange[8][1];
-	__declspec(align(32)) VUINT32 iBrkValue[8][1];
-	__declspec(align(32)) VUINT32 iOffExpoMask[8][1];
-	__declspec(align(32)) VUINT32 sPoly[8][8][1];
-	__declspec(align(32)) VUINT32 sLn2[8][1];
-	__declspec(align(32)) VUINT32 sHalf[8][1];
-} __svml_satanh_data_internal;
-#endif
-__svml_satanh_data_internal:
-	/* SgnMask */
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
-	/* sOne = SP 1.0 */
+	.section .rodata.avx2, "a"
 	.align	32
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	/* sTopMask12 */
-	.align	32
-	.long	0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
-	.long	0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
-	/* TinyRange */
-	.align	32
-	.long	0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
-	.long	0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
-	/* iBrkValue = SP 2/3 */
-	.align	32
-	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
-	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
-	/* iOffExpoMask = SP significand mask */
-	.align	32
-	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
-	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
-	/* sPoly[] = SP polynomial */
-	.align	32
-	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed
-	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
-	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3
-	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
-	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12
-	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
-	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37
-	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
-	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190
-	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
-	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e
-	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
-	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94
-	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
-	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
-	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
-	/* sLn2 = SP ln(2) */
-	.align	32
-	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
-	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
-	/* sHalf */
-	.align	32
-	.long	0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
-	.long	0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
-	.align	32
-	.type	__svml_satanh_data_internal, @object
-	.size	__svml_satanh_data_internal, .-__svml_satanh_data_internal
+
+LOCAL_DATA_NAME:
+	DATA_VEC (LOCAL_DATA_NAME, _TinyRange, 0x0C000000)
+	/* _Poly[] = SP polynomial.  */
+	/* 1.3820238411426544189453125e-01 P7.  */
+	DATA_VEC (LOCAL_DATA_NAME, _Poly_1, 0x3e0d84ed)
+	/* -1.5122179687023162841796875e-01 P6.  */
+	DATA_VEC (LOCAL_DATA_NAME, _Poly_2, 0xbe1ad9e3)
+	/* 1.4042308926582336425781250e-01 P5.  */
+	DATA_VEC (LOCAL_DATA_NAME, _Poly_3, 0x3e0fcb12)
+	/* -1.6472326219081878662109375e-01 P4.  */
+	DATA_VEC (LOCAL_DATA_NAME, _Poly_4, 0xbe28ad37)
+	/* 2.0007920265197753906250000e-01 P3.  */
+	DATA_VEC (LOCAL_DATA_NAME, _Poly_5, 0x3e4ce190)
+	/* -2.5004237890243530273437500e-01 P2.  */
+	DATA_VEC (LOCAL_DATA_NAME, _Poly_6, 0xbe80058e)
+	/* 3.3333265781402587890625000e-01 P1.  */
+	DATA_VEC (LOCAL_DATA_NAME, _Poly_7, 0x3eaaaa94)
+	DATA_VEC (LOCAL_DATA_NAME, _Half, 0x3F000000)
+	.type	LOCAL_DATA_NAME, @object
+	.size	LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v1 11/27] x86/fpu: Optimize svml_s_atanf16_core_avx512.S
  2022-12-07  8:52 [PATCH v1 01/27] x86/fpu: Create helper file for common data macros Noah Goldstein via Libc-alpha
                   ` (8 preceding siblings ...)
  2022-12-07  8:52 ` [PATCH v1 10/27] x86/fpu: Update rodata usage in svml_s_atanhf8_core_avx2.S Noah Goldstein via Libc-alpha
@ 2022-12-07  8:52 ` Noah Goldstein via Libc-alpha
  2022-12-07  8:52 ` [PATCH v1 12/27] x86/fpu: Optimize svml_s_atanf4_core_sse4.S Noah Goldstein via Libc-alpha
                   ` (16 subsequent siblings)
  26 siblings, 0 replies; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-07  8:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, andrey.kolesov, carlos

1. Change the algorithm used to match the avx2 implementation which
   seems to be faster.
2. Cleanup some missed optimizations in instruction selection /
   unnecessary repeated rodata references.
3. Remove unused rodata.
4. Use common data definitions where possible.

Changing the algorithm (1) causes a slight ULP error increase (exact
same as the avx2 version).

Before:

   ulp:
	 0: 4127324924 (0.9610)
	 1:  167635550 (0.0390)
	 2:       6822 (0.0000)
	 3:          0 (0.0000)
	 4:          0 (0.0000)

After:
   ulp:
	 0: 4088299128 (0.9519)
	 1:  206531674 (0.0481)
	 2:     136494 (0.0000)
	 3:          0 (0.0000)
	 4:          0 (0.0000)

Since the max ULP is the same and the distribution matches the avx2
implementation this seems like an acceptable "regression" as it
doesn't seem feasible any application could have been relying on
the precision distribution.

Code Size Change: -79 Bytes (193 - 272)

Perf Changes:
Input                                 New Time / Old Time
0F          (0x00000000)           -> 0.7612
0F          (0x0000ffff, Denorm)   -> 1.3234
.1F         (0x3dcccccd)           -> 0.7690
5F          (0x40a00000)           -> 0.7752
2315255808F (0x4f0a0000)           -> 0.7712
-NaN        (0xffffffff)           -> 0.7824

Note the ~32% regression in the denorm case is because of
additional micro-code assists (from the algorithm shift).
This generally seems worth it for the ~23-24% perf improvement
in other cases as denormal inputs are almost certainly cold cases.
---
 .../multiarch/svml_s_atanf16_core_avx512.S    | 199 ++++++------------
 1 file changed, 67 insertions(+), 132 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S
index 88b44a989c..abb3c76209 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S
@@ -28,146 +28,81 @@
  *
  */
 
-/* Offsets for data table __svml_satan_data_internal_avx512
- */
-#define AbsMask				0
-#define Shifter				64
-#define MaxThreshold			128
-#define MOne				192
-#define One				256
-#define LargeX				320
-#define Zero				384
-#define Tbl_H				448
-#define Pi2				576
-#define coeff_1				640
-#define coeff_2				704
-#define coeff_3				768
+#define LOCAL_DATA_NAME	__svml_satan_data_internal
+#include "svml_s_common_evex512_rodata_offsets.h"
+/* Offsets for data table __svml_satan_data_internal.  */
+#define _sPC8	0
+#define _sPC7	64
+#define _sPC6	128
+#define _sPC5	192
+#define _sPC4	256
+#define _sPC3	320
+#define _sPC2	384
+#define _sPC1	448
+#define _sPIO2	512
 
 #include <sysdep.h>
 
 	.section .text.evex512, "ax", @progbits
 ENTRY(_ZGVeN16v_atanf_skx)
-	vandps	__svml_satan_data_internal_avx512(%rip), %zmm0, %zmm7
-	vmovups	MaxThreshold+__svml_satan_data_internal_avx512(%rip), %zmm3
-	vmovups	One+__svml_satan_data_internal_avx512(%rip), %zmm8
-
-	/* round to 2 bits after binary point */
-	vreduceps $40, {sae}, %zmm7, %zmm5
-
-	/* saturate X range */
-	vmovups	LargeX+__svml_satan_data_internal_avx512(%rip), %zmm6
-	vmovups	Shifter+__svml_satan_data_internal_avx512(%rip), %zmm2
-	vcmpps	$29, {sae}, %zmm3, %zmm7, %k1
-
-	/* table lookup sequence */
-	vmovups	Tbl_H+__svml_satan_data_internal_avx512(%rip), %zmm3
-	vsubps	{rn-sae}, %zmm5, %zmm7, %zmm4
-	vaddps	{rn-sae}, %zmm2, %zmm7, %zmm1
-	vxorps	%zmm0, %zmm7, %zmm0
-	vfmadd231ps {rn-sae}, %zmm7, %zmm4, %zmm8
-	vmovups	coeff_2+__svml_satan_data_internal_avx512(%rip), %zmm4
-
-	/* if|X|>=MaxThreshold, set DiffX=-1 */
-	vblendmps MOne+__svml_satan_data_internal_avx512(%rip), %zmm5, %zmm9{%k1}
-	vmovups	coeff_3+__svml_satan_data_internal_avx512(%rip), %zmm5
-
-	/* if|X|>=MaxThreshold, set Y=X */
-	vminps	{sae}, %zmm7, %zmm6, %zmm8{%k1}
-
-	/* R+Rl = DiffX/Y */
-	vgetmantps $0, {sae}, %zmm9, %zmm12
-	vgetexpps {sae}, %zmm9, %zmm10
-	vpermt2ps Tbl_H+64+__svml_satan_data_internal_avx512(%rip), %zmm1, %zmm3
-	vgetmantps $0, {sae}, %zmm8, %zmm15
-	vgetexpps {sae}, %zmm8, %zmm11
-	vmovups	coeff_1+__svml_satan_data_internal_avx512(%rip), %zmm1
-
-	/* set table value to Pi/2 for large X */
-	vblendmps Pi2+__svml_satan_data_internal_avx512(%rip), %zmm3, %zmm9{%k1}
-	vrcp14ps %zmm15, %zmm13
-	vsubps	{rn-sae}, %zmm11, %zmm10, %zmm2
-	vmulps	{rn-sae}, %zmm13, %zmm12, %zmm14
-	vfnmadd213ps {rn-sae}, %zmm12, %zmm14, %zmm15
-	vfmadd213ps {rn-sae}, %zmm14, %zmm13, %zmm15
-	vscalefps {rn-sae}, %zmm2, %zmm15, %zmm7
-
-	/* polynomial evaluation */
-	vmulps	{rn-sae}, %zmm7, %zmm7, %zmm8
-	vmulps	{rn-sae}, %zmm7, %zmm8, %zmm6
-	vfmadd231ps {rn-sae}, %zmm8, %zmm1, %zmm4
-	vfmadd213ps {rn-sae}, %zmm5, %zmm4, %zmm8
-	vfmadd213ps {rn-sae}, %zmm7, %zmm6, %zmm8
-	vaddps	{rn-sae}, %zmm9, %zmm8, %zmm10
-	vxorps	%zmm0, %zmm10, %zmm0
+	/* 1) If x>1,      then r=-1/x, PIO2=Pi/2
+	   2) If -1<=x<=1, then r=x,    PIO2=0
+	   3) If x<-1,     then r=-1/x, PIO2=-Pi/2.  */
+	vmovups	COMMON_DATA(_OneF)(%rip), %zmm2
+	vmovups	COMMON_DATA(_SignMask)(%rip), %zmm7
+
+
+	/* Use minud\maxud operations for argument reduction.  */
+	vandnps	%zmm0, %zmm7, %zmm3
+	vpcmpgtd %zmm2, %zmm3, %k1
+
+	vpmaxud	%zmm3, %zmm2, %zmm4
+	vpminud	%zmm3, %zmm2, %zmm5
+
+	vdivps	%zmm4, %zmm5, %zmm4
+
+	vandps	%zmm7, %zmm0, %zmm3
+	vmovdqa32 %zmm7, %zmm7{%k1}{z}
+
+	vmulps	%zmm4, %zmm4, %zmm1
+	vpternlogq $0x96, %zmm3, %zmm4, %zmm7
+
+	/* Polynomial.  */
+
+	vmovups	LOCAL_DATA(_sPC8)(%rip), %zmm0
+	vmovups	LOCAL_DATA(_sPC7)(%rip), %zmm4
+
+	vmulps	%zmm1, %zmm1, %zmm5
+
+	vfmadd213ps LOCAL_DATA(_sPC6)(%rip), %zmm5, %zmm0
+	vfmadd213ps LOCAL_DATA(_sPC5)(%rip), %zmm5, %zmm4
+	vfmadd213ps LOCAL_DATA(_sPC4)(%rip), %zmm5, %zmm0
+	vfmadd213ps LOCAL_DATA(_sPC3)(%rip), %zmm5, %zmm4
+	vfmadd213ps LOCAL_DATA(_sPC2)(%rip), %zmm5, %zmm0
+	vfmadd213ps LOCAL_DATA(_sPC1)(%rip), %zmm5, %zmm4
+	vfmadd213ps %zmm4, %zmm1, %zmm0
+	vfmadd213ps %zmm2, %zmm1, %zmm0
+	vorps	LOCAL_DATA(_sPIO2)(%rip), %zmm3, %zmm3{%k1}
+
+	/* Reconstruction.  */
+	vfmadd213ps %zmm3, %zmm7, %zmm0
 	ret
 
 END(_ZGVeN16v_atanf_skx)
 
-	.section .rodata, "a"
+	.section .rodata.evex512, "a"
 	.align	64
 
-#ifdef __svml_satan_data_internal_avx512_typedef
-typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(64)) VUINT32 AbsMask[16][1];
-	__declspec(align(64)) VUINT32 Shifter[16][1];
-	__declspec(align(64)) VUINT32 MaxThreshold[16][1];
-	__declspec(align(64)) VUINT32 MOne[16][1];
-	__declspec(align(64)) VUINT32 One[16][1];
-	__declspec(align(64)) VUINT32 LargeX[16][1];
-	__declspec(align(64)) VUINT32 Zero[16][1];
-	__declspec(align(64)) VUINT32 Tbl_H[32][1];
-	__declspec(align(64)) VUINT32 Pi2[16][1];
-	__declspec(align(64)) VUINT32 coeff[3][16][1];
-} __svml_satan_data_internal_avx512;
-#endif
-__svml_satan_data_internal_avx512:
-	/* AbsMask */
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
-	/* Shifter */
-	.align	64
-	.long	0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000
-	/* MaxThreshold */
-	.align	64
-	.long	0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000
-	/* MOne */
-	.align	64
-	.long	0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
-	/* One */
-	.align	64
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	/* LargeX */
-	.align	64
-	.long	0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000
-	/* Zero */
-	.align	64
-	.long	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
-	/* Tbl_H */
-	.align	64
-	.long	0x00000000, 0x3e7adbb0
-	.long	0x3eed6338, 0x3f24bc7d
-	.long	0x3f490fdb, 0x3f6563e3
-	.long	0x3f7b985f, 0x3f869c79
-	.long	0x3f8db70d, 0x3f93877b
-	.long	0x3f985b6c, 0x3f9c6b53
-	.long	0x3f9fe0bb, 0x3fa2daa4
-	.long	0x3fa57088, 0x3fa7b46f
-	.long	0x3fa9b465, 0x3fab7b7a
-	.long	0x3fad1283, 0x3fae809e
-	.long	0x3fafcb99, 0x3fb0f836
-	.long	0x3fb20a6a, 0x3fb30581
-	.long	0x3fb3ec43, 0x3fb4c10a
-	.long	0x3fb585d7, 0x3fb63c64
-	.long	0x3fb6e62c, 0x3fb78478
-	.long	0x3fb81868, 0x3fb8a2f5
-	/* Pi2 */
-	.align	64
-	.long	0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
-	/* coeff3 */
-	.align	64
-	.long	0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de
-	.long	0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2
-	.long	0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa
-	.align	64
-	.type	__svml_satan_data_internal_avx512, @object
-	.size	__svml_satan_data_internal_avx512, .-__svml_satan_data_internal_avx512
+LOCAL_DATA_NAME:
+	DATA_VEC (LOCAL_DATA_NAME, _sPC8, 0x3B322CC0)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC7, 0xBC7F2631)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC6, 0x3D2BC384)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC5, 0xBD987629)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC4, 0x3DD96474)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC3, 0xBE1161F8)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC2, 0x3E4CB79F)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC1, 0xBEAAAA49)
+	DATA_VEC (LOCAL_DATA_NAME, _sPIO2, 0x3FC90FDB)
+
+	.type	LOCAL_DATA_NAME, @object
+	.size	LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v1 12/27] x86/fpu: Optimize svml_s_atanf4_core_sse4.S
  2022-12-07  8:52 [PATCH v1 01/27] x86/fpu: Create helper file for common data macros Noah Goldstein via Libc-alpha
                   ` (9 preceding siblings ...)
  2022-12-07  8:52 ` [PATCH v1 11/27] x86/fpu: Optimize svml_s_atanf16_core_avx512.S Noah Goldstein via Libc-alpha
@ 2022-12-07  8:52 ` Noah Goldstein via Libc-alpha
  2022-12-07  8:52 ` [PATCH v1 13/27] x86/fpu: Optimize svml_s_atanf8_core_avx2.S Noah Goldstein via Libc-alpha
                   ` (15 subsequent siblings)
  26 siblings, 0 replies; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-07  8:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, andrey.kolesov, carlos

1. Cleanup some missed optimizations in instruction selection /
   unnecessary repeated rodata references.
2. Remove unused rodata.
3. Use common data definitions where possible.

Code Size Change: -31 Bytes (173 - 204)

Input                                 New Time / Old Time
0F          (0x00000000)           -> 0.9446
0F          (0x0000ffff, Denorm)   -> 0.9977
.1F         (0x3dcccccd)           -> 0.9380
5F          (0x40a00000)           -> 0.9542
2315255808F (0x4f0a0000)           -> 1.0115
-NaN        (0xffffffff)           -> 0.9232
---
 .../fpu/multiarch/svml_s_atanf4_core_sse4.S   | 198 +++++++-----------
 1 file changed, 75 insertions(+), 123 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core_sse4.S
index 83cecb8ee5..2ab599f7a8 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core_sse4.S
@@ -28,136 +28,88 @@
  *
  */
 
-/* Offsets for data table __svml_satan_data_internal
- */
-#define _sSIGN_MASK			0
-#define _sABS_MASK			16
-#define _sONE				32
-#define _sPIO2				48
-#define _sPC8				64
-#define _sPC7				80
-#define _sPC6				96
-#define _sPC5				112
-#define _sPC4				128
-#define _sPC3				144
-#define _sPC2				160
-#define _sPC1				176
-#define _sPC0				192
+#define LOCAL_DATA_NAME	__svml_satan_data_internal
+#include "svml_s_common_sse4_rodata_offsets.h"
+/* Offsets for data table __svml_satan_data_internal.  */
+#define _SignMask	0
+#define _sPIO2	16
+#define _sPC7	32
+#define _sPC5	48
+#define _sPC3	64
+#define _sPC1	80
+#define _sPC8	96
+#define _sPC6	112
+#define _sPC4	128
+#define _sPC2	144
+#define _sPC0	160
 
 #include <sysdep.h>
 
 	.section .text.sse4, "ax", @progbits
 ENTRY(_ZGVbN4v_atanf_sse4)
-	/*
-	 * To use minps\maxps operations for argument reduction
-	 * uncomment _AT_USEMINMAX_ definition
-	 *  Declarations
-	 * Variables
-	 * Constants
-	 */
-	movups	_sABS_MASK+__svml_satan_data_internal(%rip), %xmm2
-
-	/*
-	 * 1) If x>1,      then r=-1/x, PIO2=Pi/2
-	 * 2) If -1<=x<=1, then r=x,    PIO2=0
-	 * 3) If x<-1,     then r=-1/x, PIO2=-Pi/2
-	 */
-	movups	_sONE+__svml_satan_data_internal(%rip), %xmm1
-	andps	%xmm0, %xmm2
-	movaps	%xmm2, %xmm9
-	movaps	%xmm1, %xmm3
-	cmpleps	%xmm1, %xmm9
-	maxps	%xmm2, %xmm3
-	minps	%xmm2, %xmm1
-	divps	%xmm3, %xmm1
-	movups	__svml_satan_data_internal(%rip), %xmm4
-	movaps	%xmm9, %xmm10
-	andps	%xmm4, %xmm0
-	andnps	%xmm4, %xmm9
-	pxor	%xmm0, %xmm9
-	pxor	%xmm1, %xmm9
-
-	/* Polynomial. */
-	movaps	%xmm9, %xmm8
-	mulps	%xmm9, %xmm8
-	movaps	%xmm8, %xmm7
-	mulps	%xmm8, %xmm7
-	movups	_sPC8+__svml_satan_data_internal(%rip), %xmm6
-	mulps	%xmm7, %xmm6
-	movups	_sPC7+__svml_satan_data_internal(%rip), %xmm5
-	mulps	%xmm7, %xmm5
-	addps	_sPC6+__svml_satan_data_internal(%rip), %xmm6
-	mulps	%xmm7, %xmm6
-	addps	_sPC5+__svml_satan_data_internal(%rip), %xmm5
-	mulps	%xmm7, %xmm5
-	addps	_sPC4+__svml_satan_data_internal(%rip), %xmm6
-	mulps	%xmm7, %xmm6
-	addps	_sPC3+__svml_satan_data_internal(%rip), %xmm5
-	mulps	%xmm5, %xmm7
-	addps	_sPC2+__svml_satan_data_internal(%rip), %xmm6
-	mulps	%xmm8, %xmm6
-	addps	_sPC1+__svml_satan_data_internal(%rip), %xmm7
-	andnps	_sPIO2+__svml_satan_data_internal(%rip), %xmm10
-	addps	%xmm6, %xmm7
-	mulps	%xmm7, %xmm8
-	pxor	%xmm0, %xmm10
-	addps	_sPC0+__svml_satan_data_internal(%rip), %xmm8
-
-	/* Reconstruction. */
-	mulps	%xmm8, %xmm9
-	addps	%xmm9, %xmm10
-	movaps	%xmm10, %xmm0
+	/* 1) If x>1,      then r=-1/x, PIO2=Pi/2
+	   2) If -1<=x<=1, then r=x,    PIO2=0
+	   3) If x<-1,     then r=-1/x, PIO2=-Pi/2.  */
+	movups	COMMON_DATA(_OneF)(%rip), %xmm1
+	/* use minud\maxud operations for argument reduction.  */
+	movups	LOCAL_DATA(_SignMask)(%rip), %xmm5
+	movaps	%xmm5, %xmm6
+	andnps	%xmm0, %xmm5
+	andps	%xmm6, %xmm0
+	movaps	%xmm5, %xmm7
+
+	movaps	%xmm1, %xmm4
+	pminud	%xmm5, %xmm1
+	pmaxud	%xmm4, %xmm7
+	pcmpgtd	%xmm1, %xmm5
+	divps	%xmm7, %xmm1
+
+	andps	%xmm5, %xmm6
+	pxor	%xmm0, %xmm6
+	andps	LOCAL_DATA(_sPIO2)(%rip), %xmm5
+	pxor	%xmm0, %xmm5
+	pxor	%xmm1, %xmm6
+	/* Polynomial.  */
+	mulps	%xmm1, %xmm1
+	movaps	%xmm1, %xmm0
+	mulps	%xmm1, %xmm1
+	movups	LOCAL_DATA(_sPC7)(%rip), %xmm2
+	mulps	%xmm1, %xmm2
+	addps	LOCAL_DATA(_sPC5)(%rip), %xmm2
+	mulps	%xmm1, %xmm2
+	addps	LOCAL_DATA(_sPC3)(%rip), %xmm2
+	mulps	%xmm1, %xmm2
+	addps	LOCAL_DATA(_sPC1)(%rip), %xmm2
+	movups	LOCAL_DATA(_sPC8)(%rip), %xmm3
+	mulps	%xmm1, %xmm3
+	addps	LOCAL_DATA(_sPC6)(%rip), %xmm3
+	mulps	%xmm1, %xmm3
+	addps	LOCAL_DATA(_sPC4)(%rip), %xmm3
+	mulps	%xmm1, %xmm3
+	addps	LOCAL_DATA(_sPC2)(%rip), %xmm3
+	mulps	%xmm0, %xmm3
+	addps	%xmm3, %xmm2
+	mulps	%xmm2, %xmm0
+	addps	%xmm4, %xmm0
+	/* Reconstruction.  */
+	mulps	%xmm6, %xmm0
+	addps	%xmm5, %xmm0
 	ret
-
 END(_ZGVbN4v_atanf_sse4)
 
-	.section .rodata, "a"
+	.section .rodata.sse4, "a"
 	.align	16
+LOCAL_DATA_NAME:
+	DATA_VEC (LOCAL_DATA_NAME, _SignMask, 0x80000000)
+	DATA_VEC (LOCAL_DATA_NAME, _sPIO2, 0x3fc90fdb)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC7, 0xBC7F2631)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC5, 0xBD987629)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC3, 0xBE1161F8)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC1, 0xBEAAAA49)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC8, 0x3B322CC0)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC6, 0x3D2BC384)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC4, 0x3DD96474)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC2, 0x3E4CB79F)
 
-#ifdef __svml_satan_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(16)) VUINT32 _sSIGN_MASK[4][1];
-	__declspec(align(16)) VUINT32 _sABS_MASK[4][1];
-	__declspec(align(16)) VUINT32 _sONE[4][1];
-	__declspec(align(16)) VUINT32 _sPIO2[4][1];
-	__declspec(align(16)) VUINT32 _sPC8[4][1];
-	__declspec(align(16)) VUINT32 _sPC7[4][1];
-	__declspec(align(16)) VUINT32 _sPC6[4][1];
-	__declspec(align(16)) VUINT32 _sPC5[4][1];
-	__declspec(align(16)) VUINT32 _sPC4[4][1];
-	__declspec(align(16)) VUINT32 _sPC3[4][1];
-	__declspec(align(16)) VUINT32 _sPC2[4][1];
-	__declspec(align(16)) VUINT32 _sPC1[4][1];
-	__declspec(align(16)) VUINT32 _sPC0[4][1];
-} __svml_satan_data_internal;
-#endif
-__svml_satan_data_internal:
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000 // _sSIGN_MASK
-	.align	16
-	.long	0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF // _sABS_MASK
-	.align	16
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 // _sONE
-	.align	16
-	.long	0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB // _sPIO2
-	.align	16
-	.long	0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0 // _sPC8
-	.align	16
-	.long	0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631 // _sPC7
-	.align	16
-	.long	0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384 // _sPC6
-	.align	16
-	.long	0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629 // _sPC5
-	.align	16
-	.long	0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474 // _sPC4
-	.align	16
-	.long	0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8 // _sPC3
-	.align	16
-	.long	0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F // _sPC2
-	.align	16
-	.long	0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49 // _sPC1
-	.align	16
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 // _sPC0
-	.align	16
-	.type	__svml_satan_data_internal, @object
-	.size	__svml_satan_data_internal, .-__svml_satan_data_internal
+	.type	LOCAL_DATA_NAME, @object
+	.size	LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v1 13/27] x86/fpu: Optimize svml_s_atanf8_core_avx2.S
  2022-12-07  8:52 [PATCH v1 01/27] x86/fpu: Create helper file for common data macros Noah Goldstein via Libc-alpha
                   ` (10 preceding siblings ...)
  2022-12-07  8:52 ` [PATCH v1 12/27] x86/fpu: Optimize svml_s_atanf4_core_sse4.S Noah Goldstein via Libc-alpha
@ 2022-12-07  8:52 ` Noah Goldstein via Libc-alpha
  2022-12-07  8:52 ` [PATCH v1 14/27] x86/fpu: Add common rodata file for svml_s_tanf_*_{avx512, avx2, sse4}.S Noah Goldstein via Libc-alpha
                   ` (14 subsequent siblings)
  26 siblings, 0 replies; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-07  8:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, andrey.kolesov, carlos

1. Cleanup some missed optimizations in instruction selection /
   unnecessary repeated rodata references.
2. Remove unused rodata.
3. Use common data definitions where possible.

Code Size Change: -12 Bytes (163 - 175)

Input                                 New Time / Old Time
0F          (0x00000000)           -> 0.8484
0F          (0x0000ffff, Denorm)   -> 0.9993
.1F         (0x3dcccccd)           -> 0.9368
5F          (0x40a00000)           -> 0.9476
2315255808F (0x4f0a0000)           -> 0.9454
-NaN        (0xffffffff)           -> 0.9193
---
 .../fpu/multiarch/svml_s_atanf8_core_avx2.S   | 162 +++++++-----------
 1 file changed, 58 insertions(+), 104 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf8_core_avx2.S
index ee49a3e10e..649277c682 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf8_core_avx2.S
@@ -28,120 +28,74 @@
  *
  */
 
-/* Offsets for data table __svml_satan_data_internal
- */
-#define _sSIGN_MASK			0
-#define _sABS_MASK			32
-#define _sONE				64
-#define _sPIO2				96
-#define _sPC8				128
-#define _sPC7				160
-#define _sPC6				192
-#define _sPC5				224
-#define _sPC4				256
-#define _sPC3				288
-#define _sPC2				320
-#define _sPC1				352
-#define _sPC0				384
+
+#define LOCAL_DATA_NAME	__svml_satan_data_internal
+#include "svml_s_common_avx2_rodata_offsets.h"
+/* Offsets for data table __svml_satan_data_internal.  */
+#define _sPC8	0
+#define _sPC7	32
+#define _sPC6	64
+#define _sPC5	96
+#define _sPC4	128
+#define _sPC3	160
+#define _sPC2	192
+#define _sPC1	224
+#define _sPC0	256
 
 #include <sysdep.h>
 
 	.section .text.avx2, "ax", @progbits
 ENTRY(_ZGVdN8v_atanf_avx2)
-	/*
-	 * 1) If x>1,      then r=-1/x, PIO2=Pi/2
-	 * 2) If -1<=x<=1, then r=x,    PIO2=0
-	 * 3) If x<-1,     then r=-1/x, PIO2=-Pi/2
-	 */
-	vmovups	_sONE+__svml_satan_data_internal(%rip), %ymm2
-	vmovups	__svml_satan_data_internal(%rip), %ymm7
-	vmovups	_sPC7+__svml_satan_data_internal(%rip), %ymm13
+	/* 1) If x>1,      then r=-1/x, PIO2=Pi/2
+	   2) If -1<=x<=1, then r=x,    PIO2=0
+	   3) If x<-1,     then r=-1/x, PIO2=-Pi/2.  */
+	vmovups	COMMON_DATA(_AbsMask)(%rip), %ymm7
+	vmovups	COMMON_DATA(_OneF)(%rip), %ymm2
+
+	vandps	%ymm0, %ymm7, %ymm3
+	/* Use minud\maxud operations for argument reduction.  */
+	vpmaxud	%ymm3, %ymm2, %ymm5
+	vpminud	%ymm3, %ymm2, %ymm4
 
-	/*
-	 * To use minps\maxps operations for argument reduction
-	 * uncomment _AT_USEMINMAX_ definition
-	 *  Declarations
-	 * Variables
-	 * Constants
-	 */
-	vandps	_sABS_MASK+__svml_satan_data_internal(%rip), %ymm0, %ymm3
-	vmaxps	%ymm3, %ymm2, %ymm5
-	vminps	%ymm3, %ymm2, %ymm4
-	vcmple_oqps %ymm2, %ymm3, %ymm6
-	vdivps	%ymm5, %ymm4, %ymm11
-	vandps	%ymm7, %ymm0, %ymm9
-	vandnps	%ymm7, %ymm6, %ymm8
-	vxorps	%ymm9, %ymm8, %ymm10
-	vxorps	%ymm11, %ymm10, %ymm15
+	vdivps	%ymm5, %ymm4, %ymm4
 
-	/* Polynomial. */
-	vmulps	%ymm15, %ymm15, %ymm14
-	vmovups	_sPC8+__svml_satan_data_internal(%rip), %ymm0
-	vmulps	%ymm14, %ymm14, %ymm12
-	vfmadd213ps _sPC6+__svml_satan_data_internal(%rip), %ymm12, %ymm0
-	vfmadd213ps _sPC5+__svml_satan_data_internal(%rip), %ymm12, %ymm13
-	vfmadd213ps _sPC4+__svml_satan_data_internal(%rip), %ymm12, %ymm0
-	vfmadd213ps _sPC3+__svml_satan_data_internal(%rip), %ymm12, %ymm13
-	vfmadd213ps _sPC2+__svml_satan_data_internal(%rip), %ymm12, %ymm0
-	vfmadd213ps _sPC1+__svml_satan_data_internal(%rip), %ymm12, %ymm13
-	vfmadd213ps %ymm13, %ymm14, %ymm0
-	vfmadd213ps _sPC0+__svml_satan_data_internal(%rip), %ymm14, %ymm0
-	vandnps	_sPIO2+__svml_satan_data_internal(%rip), %ymm6, %ymm1
-	vxorps	%ymm9, %ymm1, %ymm1
+	vpcmpgtd %ymm2, %ymm3, %ymm6
+	vandnps	%ymm0, %ymm7, %ymm3
+	vandnps	%ymm6, %ymm7, %ymm7
+	vxorps	%ymm3, %ymm7, %ymm5
+	vxorps	%ymm4, %ymm5, %ymm7
+	/* Polynomial.  */
+	vmulps	%ymm4, %ymm4, %ymm1
+	vmovups	LOCAL_DATA(_sPC8)(%rip), %ymm0
+	vmovups	LOCAL_DATA(_sPC7)(%rip), %ymm4
+	vmulps	%ymm1, %ymm1, %ymm5
+	vfmadd213ps LOCAL_DATA(_sPC6)(%rip), %ymm5, %ymm0
+	vfmadd213ps LOCAL_DATA(_sPC5)(%rip), %ymm5, %ymm4
+	vfmadd213ps LOCAL_DATA(_sPC4)(%rip), %ymm5, %ymm0
+	vfmadd213ps LOCAL_DATA(_sPC3)(%rip), %ymm5, %ymm4
+	vfmadd213ps LOCAL_DATA(_sPC2)(%rip), %ymm5, %ymm0
+	vfmadd213ps LOCAL_DATA(_sPC1)(%rip), %ymm5, %ymm4
+	vfmadd213ps %ymm4, %ymm1, %ymm0
+	vfmadd213ps %ymm2, %ymm1, %ymm0
+	vandps	COMMON_DATA(_TanSPI1_FMA)(%rip), %ymm6, %ymm1
+	vxorps	%ymm3, %ymm1, %ymm1
 
-	/* Reconstruction. */
-	vfmadd213ps %ymm1, %ymm15, %ymm0
+	/* Reconstruction.  */
+	vfmadd213ps %ymm1, %ymm7, %ymm0
 	ret
 
 END(_ZGVdN8v_atanf_avx2)
 
-	.section .rodata, "a"
-	.align	32
-
-#ifdef __svml_satan_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(32)) VUINT32 _sSIGN_MASK[8][1];
-	__declspec(align(32)) VUINT32 _sABS_MASK[8][1];
-	__declspec(align(32)) VUINT32 _sONE[8][1];
-	__declspec(align(32)) VUINT32 _sPIO2[8][1];
-	__declspec(align(32)) VUINT32 _sPC8[8][1];
-	__declspec(align(32)) VUINT32 _sPC7[8][1];
-	__declspec(align(32)) VUINT32 _sPC6[8][1];
-	__declspec(align(32)) VUINT32 _sPC5[8][1];
-	__declspec(align(32)) VUINT32 _sPC4[8][1];
-	__declspec(align(32)) VUINT32 _sPC3[8][1];
-	__declspec(align(32)) VUINT32 _sPC2[8][1];
-	__declspec(align(32)) VUINT32 _sPC1[8][1];
-	__declspec(align(32)) VUINT32 _sPC0[8][1];
-} __svml_satan_data_internal;
-#endif
-__svml_satan_data_internal:
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 // _sSIGN_MASK
-	.align	32
-	.long	0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF // _sABS_MASK
-	.align	32
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 // _sONE
-	.align	32
-	.long	0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB // _sPIO2
-	.align	32
-	.long	0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0 // _sPC8
-	.align	32
-	.long	0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631 // _sPC7
-	.align	32
-	.long	0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384 // _sPC6
-	.align	32
-	.long	0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629 // _sPC5
-	.align	32
-	.long	0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474 // _sPC4
-	.align	32
-	.long	0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8 // _sPC3
-	.align	32
-	.long	0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F // _sPC2
-	.align	32
-	.long	0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49 // _sPC1
-	.align	32
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 // _sPC0
+	.section .rodata.avx2, "a"
 	.align	32
-	.type	__svml_satan_data_internal, @object
-	.size	__svml_satan_data_internal, .-__svml_satan_data_internal
+LOCAL_DATA_NAME:
+	DATA_VEC (LOCAL_DATA_NAME, _sPC8, 0x3B322CC0)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC7, 0xBC7F2631)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC6, 0x3D2BC384)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC5, 0xBD987629)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC4, 0x3DD96474)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC3, 0xBE1161F8)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC2, 0x3E4CB79F)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC1, 0xBEAAAA49)
+	.type	LOCAL_DATA_NAME, @object
+	.size	LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v1 14/27] x86/fpu: Add common rodata file for svml_s_tanf_*_{avx512, avx2, sse4}.S
  2022-12-07  8:52 [PATCH v1 01/27] x86/fpu: Create helper file for common data macros Noah Goldstein via Libc-alpha
                   ` (11 preceding siblings ...)
  2022-12-07  8:52 ` [PATCH v1 13/27] x86/fpu: Optimize svml_s_atanf8_core_avx2.S Noah Goldstein via Libc-alpha
@ 2022-12-07  8:52 ` Noah Goldstein via Libc-alpha
  2022-12-07  8:52 ` [PATCH v1 15/27] x86/fpu: Optimize svml_s_tanf16_core_avx512.S Noah Goldstein via Libc-alpha
                   ` (13 subsequent siblings)
  26 siblings, 0 replies; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-07  8:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, andrey.kolesov, carlos

The avx2/sse4 implementation share many definitions including the
larger `_Coeffs` table. Sharing this data across avx2/sse4
implementations saves more than 5kb.

As well all three implementations use the `_Reduction` table. Sharing
this table across all three implementations saves more than 6kb.
---
 .../fpu/multiarch/svml_s_tanf_rodata.h.S      | 1641 +++++++++++++++++
 1 file changed, 1641 insertions(+)
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_tanf_rodata.h.S

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanf_rodata.h.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanf_rodata.h.S
new file mode 100644
index 0000000000..d45125ff13
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanf_rodata.h.S
@@ -0,0 +1,1641 @@
+/* Datatables for tanf_{avx512,avx2,sse4}
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   https://www.gnu.org/licenses/.  */
+
+#include "svml_common_data_macros.h.S"
+
+#if (defined AVX2_SHARED_OFFSETS) || (defined AVX2_SHARED_TABLE)
+
+/* Offsets are ordered by use in the function. On cold-starts this
+   might help the prefetcher. If the streaming prefetchers kick in it
+   will prefetch into the lookup table.  */
+# define _sInvPi	0
+# define _sRShifter	32
+# define _sQ2	64
+# define _sP1	96
+# define _sQ1	128
+# define _sP0	160
+# define _sRangeReductionVal	192
+# define _Low16	224
+# define _SH_FLT_1	256
+# define _SH_FLT_2	288
+# define _Low18	320
+# define _Low9	352
+# define _SH_FLT_3	384
+# define _SH_FLT_4	416
+# define _Low7	448
+# define _Coeffs	480
+
+# define AVX2_SHARED_DATA(offset)	((offset) + (__svml_stan_common_data_avx2))
+
+# if (defined AVX2_SHARED_TABLE)
+	.section .rodata.avx2, "a"
+	.align	32
+	.globl	__svml_stan_common_data_avx2
+__svml_stan_common_data_avx2:
+	float_vector32 (__svml_stan_common_data_avx2, _sInvPi, 0x3F22F983)
+	float_vector32 (__svml_stan_common_data_avx2, _sRShifter, 0x4B400000)
+	float_vector32 (__svml_stan_common_data_avx2, _sQ2, 0x3C1F336B)
+	float_vector32 (__svml_stan_common_data_avx2, _sP1, 0xBDC433B4)
+	float_vector32 (__svml_stan_common_data_avx2, _sQ1, 0xBEDBB7AB)
+	float_vector32 (__svml_stan_common_data_avx2, _sP0, 0x3F7FFFFC)
+	float_vector32 (__svml_stan_common_data_avx2, _sRangeReductionVal, 0x46010000)
+	float_vector32 (__svml_stan_common_data_avx2, _Low16, 0x0000ffff)
+	float_vector32 (__svml_stan_common_data_avx2, _SH_FLT_1, 0x47400000)
+	float_vector32 (__svml_stan_common_data_avx2, _SH_FLT_2, 0x28800000)
+	float_vector32 (__svml_stan_common_data_avx2, _Low18, 0x0003ffff)
+	float_vector32 (__svml_stan_common_data_avx2, _Low9, 0x000001ff)
+	float_vector32 (__svml_stan_common_data_avx2, _SH_FLT_3, 0x34000000)
+	float_vector32 (__svml_stan_common_data_avx2, _SH_FLT_4, 0x40c90fdb)
+	float_vector32 (__svml_stan_common_data_avx2, _Low7, 0x0000007f)
+
+	/* _Coeffs Breakpoint B = 0 * pi/128, function tan(B + x).  */
+	float_block (__svml_stan_common_data_avx2, _Coeffs,
+		0x3FC90FDB,	/* B' = pi/2 - B (high single). */
+		0xB33BBD2E,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x00000000,	/* c0 (high single). */
+		0x00000000,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x00000000,	/* c1 (low single). */
+		0x00000000,	/* c2. */
+		0x3EAAACDD,	/* c3. */
+		0x00000000,	/* c4. */
+		0x3FC5EB9B,	/* B' = pi/2 - B (high single). */
+		0x32DE638C,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3CC91A31,	/* c0 (high single). */
+		0x2F8E8D1A,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3A1DFA00,	/* c1 (low single). */
+		0x3CC9392D,	/* c2. */
+		0x3EAB1889,	/* c3. */
+		0x3C885D3B,	/* c4. */
+		0x3FC2C75C,	/* B' = pi/2 - B (high single). */
+		0xB2CBBE8A,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3D49393C,	/* c0 (high single). */
+		0x30A39F5B,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3B1E2B00,	/* c1 (low single). */
+		0x3D49B5D4,	/* c2. */
+		0x3EAC4F10,	/* c3. */
+		0x3CFD9425,	/* c4. */
+		0x3FBFA31C,	/* B' = pi/2 - B (high single). */
+		0x33450FB0,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3D9711CE,	/* c0 (high single). */
+		0x314FEB28,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3BB24C00,	/* c1 (low single). */
+		0x3D97E43A,	/* c2. */
+		0x3EAE6A89,	/* c3. */
+		0x3D4D07E0,	/* c4. */
+		0x3FBC7EDD,	/* B' = pi/2 - B (high single). */
+		0xB1800ADD,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3DC9B5DC,	/* c0 (high single). */
+		0x3145AD86,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3C1EEF20,	/* c1 (low single). */
+		0x3DCBAAEA,	/* c2. */
+		0x3EB14E5E,	/* c3. */
+		0x3D858BB2,	/* c4. */
+		0x3FB95A9E,	/* B' = pi/2 - B (high single). */
+		0xB3651267,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3DFC98C2,	/* c0 (high single). */
+		0xB0AE525C,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3C793D20,	/* c1 (low single). */
+		0x3E003845,	/* c2. */
+		0x3EB5271F,	/* c3. */
+		0x3DAC669E,	/* c4. */
+		0x3FB6365E,	/* B' = pi/2 - B (high single). */
+		0x328BB91C,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3E17E564,	/* c0 (high single). */
+		0xB1C5A2E4,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3CB440D0,	/* c1 (low single). */
+		0x3E1B3D00,	/* c2. */
+		0x3EB9F664,	/* c3. */
+		0x3DD647C0,	/* c4. */
+		0x3FB3121F,	/* B' = pi/2 - B (high single). */
+		0xB30F347D,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3E31AE4D,	/* c0 (high single). */
+		0xB1F32251,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3CF6A500,	/* c1 (low single). */
+		0x3E3707DA,	/* c2. */
+		0x3EBFA489,	/* c3. */
+		0x3DFBD9C7,	/* c4. */
+		0x3FAFEDDF,	/* B' = pi/2 - B (high single). */
+		0x331BBA77,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3E4BAFAF,	/* c0 (high single). */
+		0x2F2A29E0,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3D221018,	/* c1 (low single). */
+		0x3E53BED0,	/* c2. */
+		0x3EC67E26,	/* c3. */
+		0x3E1568E2,	/* c4. */
+		0x3FACC9A0,	/* B' = pi/2 - B (high single). */
+		0xB2655A50,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3E65F267,	/* c0 (high single). */
+		0x31B4B1DF,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3D4E8B90,	/* c1 (low single). */
+		0x3E718ACA,	/* c2. */
+		0x3ECE7164,	/* c3. */
+		0x3E2DC161,	/* c4. */
+		0x3FA9A560,	/* B' = pi/2 - B (high single). */
+		0x33719861,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3E803FD4,	/* c0 (high single). */
+		0xB2279E66,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3D807FC8,	/* c1 (low single). */
+		0x3E884BD4,	/* c2. */
+		0x3ED7812D,	/* c3. */
+		0x3E4636EB,	/* c4. */
+		0x3FA68121,	/* B' = pi/2 - B (high single). */
+		0x31E43AAC,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3E8DB082,	/* c0 (high single). */
+		0xB132A234,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3D9CD7D0,	/* c1 (low single). */
+		0x3E988A60,	/* c2. */
+		0x3EE203E3,	/* c3. */
+		0x3E63582C,	/* c4. */
+		0x3FA35CE2,	/* B' = pi/2 - B (high single). */
+		0xB33889B6,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3E9B5042,	/* c0 (high single). */
+		0xB22A3AEE,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3DBC7490,	/* c1 (low single). */
+		0x3EA99AF5,	/* c2. */
+		0x3EEDE107,	/* c3. */
+		0x3E80E9AA,	/* c4. */
+		0x3FA038A2,	/* B' = pi/2 - B (high single). */
+		0x32E4CA7E,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3EA92457,	/* c0 (high single). */
+		0x30B80830,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3DDF8200,	/* c1 (low single). */
+		0x3EBB99E9,	/* c2. */
+		0x3EFB4AA8,	/* c3. */
+		0x3E9182BE,	/* c4. */
+		0x3F9D1463,	/* B' = pi/2 - B (high single). */
+		0xB2C55799,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3EB73250,	/* c0 (high single). */
+		0xB2028823,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3E0318F8,	/* c1 (low single). */
+		0x3ECEA678,	/* c2. */
+		0x3F053C67,	/* c3. */
+		0x3EA41E53,	/* c4. */
+		0x3F99F023,	/* B' = pi/2 - B (high single). */
+		0x33484328,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3EC5800D,	/* c0 (high single). */
+		0xB214C3C1,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3E185E54,	/* c1 (low single). */
+		0x3EE2E342,	/* c2. */
+		0x3F0DCA73,	/* c3. */
+		0x3EB8CC21,	/* c4. */
+		0x3F96CBE4,	/* B' = pi/2 - B (high single). */
+		0xB14CDE2E,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3ED413CD,	/* c0 (high single). */
+		0xB1C06152,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3E2FB0CC,	/* c1 (low single). */
+		0x3EF876CB,	/* c2. */
+		0x3F177807,	/* c3. */
+		0x3ED08437,	/* c4. */
+		0x3F93A7A5,	/* B' = pi/2 - B (high single). */
+		0xB361DEEE,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3EE2F439,	/* c0 (high single). */
+		0xB1F4399E,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3E49341C,	/* c1 (low single). */
+		0x3F07C61A,	/* c2. */
+		0x3F22560F,	/* c3. */
+		0x3EEAA81E,	/* c4. */
+		0x3F908365,	/* B' = pi/2 - B (high single). */
+		0x3292200D,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3EF22870,	/* c0 (high single). */
+		0x325271F4,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3E65107A,	/* c1 (low single). */
+		0x3F1429F0,	/* c2. */
+		0x3F2E8AFC,	/* c3. */
+		0x3F040498,	/* c4. */
+		0x3F8D5F26,	/* B' = pi/2 - B (high single). */
+		0xB30C0105,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3F00DC0D,	/* c0 (high single). */
+		0xB214AF72,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3E81B994,	/* c1 (low single). */
+		0x3F218233,	/* c2. */
+		0x3F3C4531,	/* c3. */
+		0x3F149688,	/* c4. */
+		0x3F8A3AE6,	/* B' = pi/2 - B (high single). */
+		0x331EEDF0,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3F08D5B9,	/* c0 (high single). */
+		0xB25EF98E,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3E92478D,	/* c1 (low single). */
+		0x3F2FEDC9,	/* c2. */
+		0x3F4BCD58,	/* c3. */
+		0x3F27AE9E,	/* c4. */
+		0x3F8716A7,	/* B' = pi/2 - B (high single). */
+		0xB2588C6D,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3F1105AF,	/* c0 (high single). */
+		0x32F045B0,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3EA44EE2,	/* c1 (low single). */
+		0x3F3F8FDB,	/* c2. */
+		0x3F5D3FD0,	/* c3. */
+		0x3F3D0A23,	/* c4. */
+		0x3F83F267,	/* B' = pi/2 - B (high single). */
+		0x3374CBD9,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3F1970C4,	/* c0 (high single). */
+		0x32904848,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3EB7EFF8,	/* c1 (low single). */
+		0x3F50907C,	/* c2. */
+		0x3F710FEA,	/* c3. */
+		0x3F561FED,	/* c4. */
+		0x3F80CE28,	/* B' = pi/2 - B (high single). */
+		0x31FDD672,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3F221C37,	/* c0 (high single). */
+		0xB20C61DC,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3ECD4F71,	/* c1 (low single). */
+		0x3F631DAA,	/* c2. */
+		0x3F83B471,	/* c3. */
+		0x3F7281EA,	/* c4. */
+		0x3F7B53D1,	/* B' = pi/2 - B (high single). */
+		0x32955386,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3F2B0DC1,	/* c0 (high single). */
+		0x32AB7EBA,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3EE496C2,	/* c1 (low single). */
+		0x3F776C40,	/* c2. */
+		0x3F9065C1,	/* c3. */
+		0x3F89AFB6,	/* c4. */
+		0x3F750B52,	/* B' = pi/2 - B (high single). */
+		0x32EB316F,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3F344BA9,	/* c0 (high single). */
+		0xB2B8B0EA,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3EFDF4F7,	/* c1 (low single). */
+		0x3F86DCA8,	/* c2. */
+		0x3F9ED53B,	/* c3. */
+		0x3F9CBEDE,	/* c4. */
+		0x3F6EC2D4,	/* B' = pi/2 - B (high single). */
+		0xB2BEF0A7,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3F3DDCCF,	/* c0 (high single). */
+		0x32D29606,	/* c0 (low single). */
+		0x40000000,	/* c1 (high 1 bit). */
+		0xBEE6606F,	/* c1 (low single). */
+		0x3F9325D6,	/* c2. */
+		0x3FAF4E69,	/* c3. */
+		0x3FB3080C,	/* c4. */
+		0x3F687A55,	/* B' = pi/2 - B (high single). */
+		0xB252257B,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3F47C8CC,	/* c0 (high single). */
+		0xB200F51A,	/* c0 (low single). */
+		0x40000000,	/* c1 (high 1 bit). */
+		0xBEC82C6C,	/* c1 (low single). */
+		0x3FA0BAE9,	/* c2. */
+		0x3FC2252F,	/* c3. */
+		0x3FCD24C7,	/* c4. */
+		0x3F6231D6,	/* B' = pi/2 - B (high single). */
+		0xB119A6A2,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3F521801,	/* c0 (high single). */
+		0x32AE4178,	/* c0 (low single). */
+		0x40000000,	/* c1 (high 1 bit). */
+		0xBEA72938,	/* c1 (low single). */
+		0x3FAFCC22,	/* c2. */
+		0x3FD7BD4A,	/* c3. */
+		0x3FEBB01B,	/* c4. */
+		0x3F5BE957,	/* B' = pi/2 - B (high single). */
+		0x3205522A,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3F5CD3BE,	/* c0 (high single). */
+		0x31460308,	/* c0 (low single). */
+		0x40000000,	/* c1 (high 1 bit). */
+		0xBE8306C5,	/* c1 (low single). */
+		0x3FC09232,	/* c2. */
+		0x3FF09632,	/* c3. */
+		0x4007DB00,	/* c4. */
+		0x3F55A0D8,	/* B' = pi/2 - B (high single). */
+		0x329886FF,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3F68065E,	/* c0 (high single). */
+		0x32670D1A,	/* c0 (low single). */
+		0x40000000,	/* c1 (high 1 bit). */
+		0xBE36D1D6,	/* c1 (low single). */
+		0x3FD35007,	/* c2. */
+		0x4006A861,	/* c3. */
+		0x401D4BDA,	/* c4. */
+		0x3F4F5859,	/* B' = pi/2 - B (high single). */
+		0x32EE64E8,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0x3F73BB75,	/* c0 (high single). */
+		0x32FC908D,	/* c0 (low single). */
+		0x40000000,	/* c1 (high 1 bit). */
+		0xBDBF94B0,	/* c1 (low single). */
+		0x3FE8550F,	/* c2. */
+		0x40174F67,	/* c3. */
+		0x4036C608,	/* c4. */
+		0x3F490FDB,	/* B' = pi/2 - B (high single). */
+		0xB2BBBD2E,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBE8BE60E,	/* c0 (high single). */
+		0x320D8D84,	/* c0 (low single). */
+		0x3F000000,	/* c1 (high 1 bit). */
+		0xBDF817B1,	/* c1 (low single). */
+		0xBD8345EB,	/* c2. */
+		0x3D1DFDAC,	/* c3. */
+		0xBC52CF6F,	/* c4. */
+		0x3F42C75C,	/* B' = pi/2 - B (high single). */
+		0xB24BBE8A,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBE87283F,	/* c0 (high single). */
+		0xB268B966,	/* c0 (low single). */
+		0x3F000000,	/* c1 (high 1 bit). */
+		0xBDFE6529,	/* c1 (low single). */
+		0xBD7B1953,	/* c2. */
+		0x3D18E109,	/* c3. */
+		0xBC4570B0,	/* c4. */
+		0x3F3C7EDD,	/* B' = pi/2 - B (high single). */
+		0xB1000ADD,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBE827420,	/* c0 (high single). */
+		0x320B8B4D,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DFB9428,	/* c1 (low single). */
+		0xBD7002B4,	/* c2. */
+		0x3D142A6C,	/* c3. */
+		0xBC3A47FF,	/* c4. */
+		0x3F36365E,	/* B' = pi/2 - B (high single). */
+		0x320BB91C,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBE7B9282,	/* c0 (high single). */
+		0xB13383D2,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DF5D211,	/* c1 (low single). */
+		0xBD6542B3,	/* c2. */
+		0x3D0FE5E5,	/* c3. */
+		0xBC31FB14,	/* c4. */
+		0x3F2FEDDF,	/* B' = pi/2 - B (high single). */
+		0x329BBA77,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBE724E73,	/* c0 (high single). */
+		0x3120C3E2,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DF05283,	/* c1 (low single). */
+		0xBD5AD45E,	/* c2. */
+		0x3D0BAFBF,	/* c3. */
+		0xBC27B8BB,	/* c4. */
+		0x3F29A560,	/* B' = pi/2 - B (high single). */
+		0x32F19861,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBE691B44,	/* c0 (high single). */
+		0x31F18936,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DEB138B,	/* c1 (low single). */
+		0xBD50B2F7,	/* c2. */
+		0x3D07BE3A,	/* c3. */
+		0xBC1E46A7,	/* c4. */
+		0x3F235CE2,	/* B' = pi/2 - B (high single). */
+		0xB2B889B6,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBE5FF82C,	/* c0 (high single). */
+		0xB170723A,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DE61354,	/* c1 (low single). */
+		0xBD46DA06,	/* c2. */
+		0x3D0401F8,	/* c3. */
+		0xBC14E013,	/* c4. */
+		0x3F1D1463,	/* B' = pi/2 - B (high single). */
+		0xB2455799,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBE56E46B,	/* c0 (high single). */
+		0x31E3F001,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DE15025,	/* c1 (low single). */
+		0xBD3D4550,	/* c2. */
+		0x3D00462D,	/* c3. */
+		0xBC092C98,	/* c4. */
+		0x3F16CBE4,	/* B' = pi/2 - B (high single). */
+		0xB0CCDE2E,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBE4DDF41,	/* c0 (high single). */
+		0xB1AEA094,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DDCC85C,	/* c1 (low single). */
+		0xBD33F0BE,	/* c2. */
+		0x3CFA23B0,	/* c3. */
+		0xBC01FCF7,	/* c4. */
+		0x3F108365,	/* B' = pi/2 - B (high single). */
+		0x3212200D,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBE44E7F8,	/* c0 (high single). */
+		0xB1CAA3CB,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DD87A74,	/* c1 (low single). */
+		0xBD2AD885,	/* c2. */
+		0x3CF3C785,	/* c3. */
+		0xBBF1E348,	/* c4. */
+		0x3F0A3AE6,	/* B' = pi/2 - B (high single). */
+		0x329EEDF0,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBE3BFDDC,	/* c0 (high single). */
+		0xB132521A,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DD464FC,	/* c1 (low single). */
+		0xBD21F8F1,	/* c2. */
+		0x3CEE3076,	/* c3. */
+		0xBBE6D263,	/* c4. */
+		0x3F03F267,	/* B' = pi/2 - B (high single). */
+		0x32F4CBD9,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBE33203E,	/* c0 (high single). */
+		0x31FEF5BE,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DD0869C,	/* c1 (low single). */
+		0xBD194E8C,	/* c2. */
+		0x3CE8DCA9,	/* c3. */
+		0xBBDADA55,	/* c4. */
+		0x3EFB53D1,	/* B' = pi/2 - B (high single). */
+		0x32155386,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBE2A4E71,	/* c0 (high single). */
+		0xB19CFCEC,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DCCDE11,	/* c1 (low single). */
+		0xBD10D605,	/* c2. */
+		0x3CE382A7,	/* c3. */
+		0xBBC8BD97,	/* c4. */
+		0x3EEEC2D4,	/* B' = pi/2 - B (high single). */
+		0xB23EF0A7,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBE2187D0,	/* c0 (high single). */
+		0xB1B7C7F7,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DC96A2B,	/* c1 (low single). */
+		0xBD088C22,	/* c2. */
+		0x3CDE950E,	/* c3. */
+		0xBBB89AD1,	/* c4. */
+		0x3EE231D6,	/* B' = pi/2 - B (high single). */
+		0xB099A6A2,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBE18CBB7,	/* c0 (high single). */
+		0xAFE28430,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DC629CE,	/* c1 (low single). */
+		0xBD006DCD,	/* c2. */
+		0x3CDA5A2C,	/* c3. */
+		0xBBB0B3D2,	/* c4. */
+		0x3ED5A0D8,	/* B' = pi/2 - B (high single). */
+		0x321886FF,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBE101985,	/* c0 (high single). */
+		0xB02FB2B8,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DC31BF3,	/* c1 (low single). */
+		0xBCF0F04D,	/* c2. */
+		0x3CD60BC7,	/* c3. */
+		0xBBA138BA,	/* c4. */
+		0x3EC90FDB,	/* B' = pi/2 - B (high single). */
+		0xB23BBD2E,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBE07709D,	/* c0 (high single). */
+		0xB18A2A83,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DC03FA2,	/* c1 (low single). */
+		0xBCE15096,	/* c2. */
+		0x3CD26472,	/* c3. */
+		0xBB9A1270,	/* c4. */
+		0x3EBC7EDD,	/* B' = pi/2 - B (high single). */
+		0xB0800ADD,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBDFDA0CB,	/* c0 (high single). */
+		0x2F14FCA0,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DBD93F7,	/* c1 (low single). */
+		0xBCD1F71B,	/* c2. */
+		0x3CCEDD2B,	/* c3. */
+		0xBB905946,	/* c4. */
+		0x3EAFEDDF,	/* B' = pi/2 - B (high single). */
+		0x321BBA77,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBDEC708C,	/* c0 (high single). */
+		0xB14895C4,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DBB181E,	/* c1 (low single). */
+		0xBCC2DEA6,	/* c2. */
+		0x3CCB5027,	/* c3. */
+		0xBB7F3969,	/* c4. */
+		0x3EA35CE2,	/* B' = pi/2 - B (high single). */
+		0xB23889B6,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBDDB4F55,	/* c0 (high single). */
+		0x30F6437E,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DB8CB52,	/* c1 (low single). */
+		0xBCB40210,	/* c2. */
+		0x3CC82D45,	/* c3. */
+		0xBB643075,	/* c4. */
+		0x3E96CBE4,	/* B' = pi/2 - B (high single). */
+		0xB04CDE2E,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBDCA3BFF,	/* c0 (high single). */
+		0x311C95EA,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DB6ACDE,	/* c1 (low single). */
+		0xBCA55C5B,	/* c2. */
+		0x3CC5BC04,	/* c3. */
+		0xBB63A969,	/* c4. */
+		0x3E8A3AE6,	/* B' = pi/2 - B (high single). */
+		0x321EEDF0,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBDB93569,	/* c0 (high single). */
+		0xAFB9ED00,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DB4BC1F,	/* c1 (low single). */
+		0xBC96E905,	/* c2. */
+		0x3CC2E6F5,	/* c3. */
+		0xBB3E10A6,	/* c4. */
+		0x3E7B53D1,	/* B' = pi/2 - B (high single). */
+		0x31955386,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBDA83A77,	/* c0 (high single). */
+		0x316D967A,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DB2F87C,	/* c1 (low single). */
+		0xBC88A31F,	/* c2. */
+		0x3CC0E763,	/* c3. */
+		0xBB3F1666,	/* c4. */
+		0x3E6231D6,	/* B' = pi/2 - B (high single). */
+		0xB019A6A2,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBD974A0D,	/* c0 (high single). */
+		0xB14F365B,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DB1616F,	/* c1 (low single). */
+		0xBC750CD8,	/* c2. */
+		0x3CBEB595,	/* c3. */
+		0xBB22B883,	/* c4. */
+		0x3E490FDB,	/* B' = pi/2 - B (high single). */
+		0xB1BBBD2E,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBD866317,	/* c0 (high single). */
+		0xAFF02140,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DAFF67D,	/* c1 (low single). */
+		0xBC591CD0,	/* c2. */
+		0x3CBCBEAD,	/* c3. */
+		0xBB04BBEC,	/* c4. */
+		0x3E2FEDDF,	/* B' = pi/2 - B (high single). */
+		0x319BBA77,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBD6B08FF,	/* c0 (high single). */
+		0xB0EED236,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DAEB739,	/* c1 (low single). */
+		0xBC3D6D51,	/* c2. */
+		0x3CBB485D,	/* c3. */
+		0xBAFFF5BA,	/* c4. */
+		0x3E16CBE4,	/* B' = pi/2 - B (high single). */
+		0xAFCCDE2E,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBD495A6C,	/* c0 (high single). */
+		0xB0A427BD,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DADA345,	/* c1 (low single). */
+		0xBC21F648,	/* c2. */
+		0x3CB9D1B4,	/* c3. */
+		0xBACB5567,	/* c4. */
+		0x3DFB53D1,	/* B' = pi/2 - B (high single). */
+		0x31155386,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBD27B856,	/* c0 (high single). */
+		0xB0F7EE91,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DACBA4E,	/* c1 (low single). */
+		0xBC06AEE3,	/* c2. */
+		0x3CB8E5DC,	/* c3. */
+		0xBAEC00EE,	/* c4. */
+		0x3DC90FDB,	/* B' = pi/2 - B (high single). */
+		0xB13BBD2E,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBD0620A3,	/* c0 (high single). */
+		0xB0ECAB40,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DABFC11,	/* c1 (low single). */
+		0xBBD7200F,	/* c2. */
+		0x3CB79475,	/* c3. */
+		0xBA2B0ADC,	/* c4. */
+		0x3D96CBE4,	/* B' = pi/2 - B (high single). */
+		0xAF4CDE2E,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBCC92278,	/* c0 (high single). */
+		0x302F2E68,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DAB6854,	/* c1 (low single). */
+		0xBBA1214F,	/* c2. */
+		0x3CB6C1E9,	/* c3. */
+		0x3843C2F3,	/* c4. */
+		0x3D490FDB,	/* B' = pi/2 - B (high single). */
+		0xB0BBBD2E,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBC861015,	/* c0 (high single). */
+		0xAFD68E2E,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DAAFEEB,	/* c1 (low single). */
+		0xBB569F3F,	/* c2. */
+		0x3CB6A84E,	/* c3. */
+		0xBAC64194,	/* c4. */
+		0x3CC90FDB,	/* B' = pi/2 - B (high single). */
+		0xB03BBD2E,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0xBC060BF3,	/* c0 (high single). */
+		0x2FE251AE,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DAABFB9,	/* c1 (low single). */
+		0xBAD67C60,	/* c2. */
+		0x3CB64CA5,	/* c3. */
+		0xBACDE881,	/* c4. */
+		0x00000000,	/* B' = pi/2 - B (high single). */
+		0x00000000,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x00000000,	/* c0 (high single). */
+		0x00000000,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DAAAAAB,	/* c1 (low single). */
+		0x00000000,	/* c2. */
+		0x3CB5E28B,	/* c3. */
+		0x00000000,	/* c4. */
+		0xBCC90FDB,	/* B' = pi/2 - B (high single). */
+		0x303BBD2E,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3C060BF3,	/* c0 (high single). */
+		0xAFE251AE,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DAABFB9,	/* c1 (low single). */
+		0x3AD67C60,	/* c2. */
+		0x3CB64CA5,	/* c3. */
+		0x3ACDE881,	/* c4. */
+		0xBD490FDB,	/* B' = pi/2 - B (high single). */
+		0x30BBBD2E,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3C861015,	/* c0 (high single). */
+		0x2FD68E2E,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DAAFEEB,	/* c1 (low single). */
+		0x3B569F3F,	/* c2. */
+		0x3CB6A84E,	/* c3. */
+		0x3AC64194,	/* c4. */
+		0xBD96CBE4,	/* B' = pi/2 - B (high single). */
+		0x2F4CDE2E,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3CC92278,	/* c0 (high single). */
+		0xB02F2E68,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DAB6854,	/* c1 (low single). */
+		0x3BA1214F,	/* c2. */
+		0x3CB6C1E9,	/* c3. */
+		0xB843C2F2,	/* c4. */
+		0xBDC90FDB,	/* B' = pi/2 - B (high single). */
+		0x313BBD2E,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3D0620A3,	/* c0 (high single). */
+		0x30ECAB40,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DABFC11,	/* c1 (low single). */
+		0x3BD7200F,	/* c2. */
+		0x3CB79475,	/* c3. */
+		0x3A2B0ADC,	/* c4. */
+		0xBDFB53D1,	/* B' = pi/2 - B (high single). */
+		0xB1155386,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3D27B856,	/* c0 (high single). */
+		0x30F7EE91,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DACBA4E,	/* c1 (low single). */
+		0x3C06AEE3,	/* c2. */
+		0x3CB8E5DC,	/* c3. */
+		0x3AEC00EE,	/* c4. */
+		0xBE16CBE4,	/* B' = pi/2 - B (high single). */
+		0x2FCCDE2E,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3D495A6C,	/* c0 (high single). */
+		0x30A427BD,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DADA345,	/* c1 (low single). */
+		0x3C21F648,	/* c2. */
+		0x3CB9D1B4,	/* c3. */
+		0x3ACB5567,	/* c4. */
+		0xBE2FEDDF,	/* B' = pi/2 - B (high single). */
+		0xB19BBA77,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3D6B08FF,	/* c0 (high single). */
+		0x30EED236,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DAEB739,	/* c1 (low single). */
+		0x3C3D6D51,	/* c2. */
+		0x3CBB485D,	/* c3. */
+		0x3AFFF5BA,	/* c4. */
+		0xBE490FDB,	/* B' = pi/2 - B (high single). */
+		0x31BBBD2E,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3D866317,	/* c0 (high single). */
+		0x2FF02140,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DAFF67D,	/* c1 (low single). */
+		0x3C591CD0,	/* c2. */
+		0x3CBCBEAD,	/* c3. */
+		0x3B04BBEC,	/* c4. */
+		0xBE6231D6,	/* B' = pi/2 - B (high single). */
+		0x3019A6A2,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3D974A0D,	/* c0 (high single). */
+		0x314F365B,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DB1616F,	/* c1 (low single). */
+		0x3C750CD8,	/* c2. */
+		0x3CBEB595,	/* c3. */
+		0x3B22B883,	/* c4. */
+		0xBE7B53D1,	/* B' = pi/2 - B (high single). */
+		0xB1955386,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3DA83A77,	/* c0 (high single). */
+		0xB16D967A,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DB2F87C,	/* c1 (low single). */
+		0x3C88A31F,	/* c2. */
+		0x3CC0E763,	/* c3. */
+		0x3B3F1666,	/* c4. */
+		0xBE8A3AE6,	/* B' = pi/2 - B (high single). */
+		0xB21EEDF0,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3DB93569,	/* c0 (high single). */
+		0x2FB9ED00,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DB4BC1F,	/* c1 (low single). */
+		0x3C96E905,	/* c2. */
+		0x3CC2E6F5,	/* c3. */
+		0x3B3E10A6,	/* c4. */
+		0xBE96CBE4,	/* B' = pi/2 - B (high single). */
+		0x304CDE2E,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3DCA3BFF,	/* c0 (high single). */
+		0xB11C95EA,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DB6ACDE,	/* c1 (low single). */
+		0x3CA55C5B,	/* c2. */
+		0x3CC5BC04,	/* c3. */
+		0x3B63A969,	/* c4. */
+		0xBEA35CE2,	/* B' = pi/2 - B (high single). */
+		0x323889B6,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3DDB4F55,	/* c0 (high single). */
+		0xB0F6437E,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DB8CB52,	/* c1 (low single). */
+		0x3CB40210,	/* c2. */
+		0x3CC82D45,	/* c3. */
+		0x3B643075,	/* c4. */
+		0xBEAFEDDF,	/* B' = pi/2 - B (high single). */
+		0xB21BBA77,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3DEC708C,	/* c0 (high single). */
+		0x314895C4,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DBB181E,	/* c1 (low single). */
+		0x3CC2DEA6,	/* c2. */
+		0x3CCB5027,	/* c3. */
+		0x3B7F3969,	/* c4. */
+		0xBEBC7EDD,	/* B' = pi/2 - B (high single). */
+		0x30800ADD,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3DFDA0CB,	/* c0 (high single). */
+		0xAF14FCA0,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DBD93F7,	/* c1 (low single). */
+		0x3CD1F71B,	/* c2. */
+		0x3CCEDD2B,	/* c3. */
+		0x3B905946,	/* c4. */
+		0xBEC90FDB,	/* B' = pi/2 - B (high single). */
+		0x323BBD2E,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3E07709D,	/* c0 (high single). */
+		0x318A2A83,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DC03FA2,	/* c1 (low single). */
+		0x3CE15096,	/* c2. */
+		0x3CD26472,	/* c3. */
+		0x3B9A1270,	/* c4. */
+		0xBED5A0D8,	/* B' = pi/2 - B (high single). */
+		0xB21886FF,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3E101985,	/* c0 (high single). */
+		0x302FB2B8,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DC31BF3,	/* c1 (low single). */
+		0x3CF0F04D,	/* c2. */
+		0x3CD60BC7,	/* c3. */
+		0x3BA138BA,	/* c4. */
+		0xBEE231D6,	/* B' = pi/2 - B (high single). */
+		0x3099A6A2,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3E18CBB7,	/* c0 (high single). */
+		0x2FE28430,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DC629CE,	/* c1 (low single). */
+		0x3D006DCD,	/* c2. */
+		0x3CDA5A2C,	/* c3. */
+		0x3BB0B3D2,	/* c4. */
+		0xBEEEC2D4,	/* B' = pi/2 - B (high single). */
+		0x323EF0A7,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3E2187D0,	/* c0 (high single). */
+		0x31B7C7F7,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DC96A2B,	/* c1 (low single). */
+		0x3D088C22,	/* c2. */
+		0x3CDE950E,	/* c3. */
+		0x3BB89AD1,	/* c4. */
+		0xBEFB53D1,	/* B' = pi/2 - B (high single). */
+		0xB2155386,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3E2A4E71,	/* c0 (high single). */
+		0x319CFCEC,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DCCDE11,	/* c1 (low single). */
+		0x3D10D605,	/* c2. */
+		0x3CE382A7,	/* c3. */
+		0x3BC8BD97,	/* c4. */
+		0xBF03F267,	/* B' = pi/2 - B (high single). */
+		0xB2F4CBD9,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3E33203E,	/* c0 (high single). */
+		0xB1FEF5BE,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DD0869C,	/* c1 (low single). */
+		0x3D194E8C,	/* c2. */
+		0x3CE8DCA9,	/* c3. */
+		0x3BDADA55,	/* c4. */
+		0xBF0A3AE6,	/* B' = pi/2 - B (high single). */
+		0xB29EEDF0,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3E3BFDDC,	/* c0 (high single). */
+		0x3132521A,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DD464FC,	/* c1 (low single). */
+		0x3D21F8F1,	/* c2. */
+		0x3CEE3076,	/* c3. */
+		0x3BE6D263,	/* c4. */
+		0xBF108365,	/* B' = pi/2 - B (high single). */
+		0xB212200D,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3E44E7F8,	/* c0 (high single). */
+		0x31CAA3CB,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DD87A74,	/* c1 (low single). */
+		0x3D2AD885,	/* c2. */
+		0x3CF3C785,	/* c3. */
+		0x3BF1E348,	/* c4. */
+		0xBF16CBE4,	/* B' = pi/2 - B (high single). */
+		0x30CCDE2E,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3E4DDF41,	/* c0 (high single). */
+		0x31AEA094,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DDCC85C,	/* c1 (low single). */
+		0x3D33F0BE,	/* c2. */
+		0x3CFA23B0,	/* c3. */
+		0x3C01FCF7,	/* c4. */
+		0xBF1D1463,	/* B' = pi/2 - B (high single). */
+		0x32455799,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3E56E46B,	/* c0 (high single). */
+		0xB1E3F001,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DE15025,	/* c1 (low single). */
+		0x3D3D4550,	/* c2. */
+		0x3D00462D,	/* c3. */
+		0x3C092C98,	/* c4. */
+		0xBF235CE2,	/* B' = pi/2 - B (high single). */
+		0x32B889B6,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3E5FF82C,	/* c0 (high single). */
+		0x3170723A,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DE61354,	/* c1 (low single). */
+		0x3D46DA06,	/* c2. */
+		0x3D0401F8,	/* c3. */
+		0x3C14E013,	/* c4. */
+		0xBF29A560,	/* B' = pi/2 - B (high single). */
+		0xB2F19861,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3E691B44,	/* c0 (high single). */
+		0xB1F18936,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DEB138B,	/* c1 (low single). */
+		0x3D50B2F7,	/* c2. */
+		0x3D07BE3A,	/* c3. */
+		0x3C1E46A7,	/* c4. */
+		0xBF2FEDDF,	/* B' = pi/2 - B (high single). */
+		0xB29BBA77,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3E724E73,	/* c0 (high single). */
+		0xB120C3E2,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DF05283,	/* c1 (low single). */
+		0x3D5AD45E,	/* c2. */
+		0x3D0BAFBF,	/* c3. */
+		0x3C27B8BB,	/* c4. */
+		0xBF36365E,	/* B' = pi/2 - B (high single). */
+		0xB20BB91C,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3E7B9282,	/* c0 (high single). */
+		0x313383D2,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DF5D211,	/* c1 (low single). */
+		0x3D6542B3,	/* c2. */
+		0x3D0FE5E5,	/* c3. */
+		0x3C31FB14,	/* c4. */
+		0xBF3C7EDD,	/* B' = pi/2 - B (high single). */
+		0x31000ADD,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3E827420,	/* c0 (high single). */
+		0xB20B8B4D,	/* c0 (low single). */
+		0x3E800000,	/* c1 (high 1 bit). */
+		0x3DFB9428,	/* c1 (low single). */
+		0x3D7002B4,	/* c2. */
+		0x3D142A6C,	/* c3. */
+		0x3C3A47FF,	/* c4. */
+		0xBF42C75C,	/* B' = pi/2 - B (high single). */
+		0x324BBE8A,	/* B' = pi/2 - B (low single). */
+		0x3F800000,	/* tau (1 for cot path). */
+		0x3E87283F,	/* c0 (high single). */
+		0x3268B966,	/* c0 (low single). */
+		0x3F000000,	/* c1 (high 1 bit). */
+		0xBDFE6529,	/* c1 (low single). */
+		0x3D7B1953,	/* c2. */
+		0x3D18E109,	/* c3. */
+		0x3C4570B0,	/* c4. */
+		0xBF490FDB,	/* B' = pi/2 - B (high single). */
+		0x32BBBD2E,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBF800000,	/* c0 (high single). */
+		0x2B410000,	/* c0 (low single). */
+		0x40000000,	/* c1 (high 1 bit). */
+		0xB3000000,	/* c1 (low single). */
+		0xC0000000,	/* c2. */
+		0x402AB7C8,	/* c3. */
+		0xC05561DB,	/* c4. */
+		0xBF4F5859,	/* B' = pi/2 - B (high single). */
+		0xB2EE64E8,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBF73BB75,	/* c0 (high single). */
+		0xB2FC908D,	/* c0 (low single). */
+		0x40000000,	/* c1 (high 1 bit). */
+		0xBDBF94B0,	/* c1 (low single). */
+		0xBFE8550F,	/* c2. */
+		0x40174F67,	/* c3. */
+		0xC036C608,	/* c4. */
+		0xBF55A0D8,	/* B' = pi/2 - B (high single). */
+		0xB29886FF,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBF68065E,	/* c0 (high single). */
+		0xB2670D1A,	/* c0 (low single). */
+		0x40000000,	/* c1 (high 1 bit). */
+		0xBE36D1D6,	/* c1 (low single). */
+		0xBFD35007,	/* c2. */
+		0x4006A861,	/* c3. */
+		0xC01D4BDA,	/* c4. */
+		0xBF5BE957,	/* B' = pi/2 - B (high single). */
+		0xB205522A,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBF5CD3BE,	/* c0 (high single). */
+		0xB1460308,	/* c0 (low single). */
+		0x40000000,	/* c1 (high 1 bit). */
+		0xBE8306C5,	/* c1 (low single). */
+		0xBFC09232,	/* c2. */
+		0x3FF09632,	/* c3. */
+		0xC007DB00,	/* c4. */
+		0xBF6231D6,	/* B' = pi/2 - B (high single). */
+		0x3119A6A2,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBF521801,	/* c0 (high single). */
+		0xB2AE4178,	/* c0 (low single). */
+		0x40000000,	/* c1 (high 1 bit). */
+		0xBEA72938,	/* c1 (low single). */
+		0xBFAFCC22,	/* c2. */
+		0x3FD7BD4A,	/* c3. */
+		0xBFEBB01B,	/* c4. */
+		0xBF687A55,	/* B' = pi/2 - B (high single). */
+		0x3252257B,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBF47C8CC,	/* c0 (high single). */
+		0x3200F51A,	/* c0 (low single). */
+		0x40000000,	/* c1 (high 1 bit). */
+		0xBEC82C6C,	/* c1 (low single). */
+		0xBFA0BAE9,	/* c2. */
+		0x3FC2252F,	/* c3. */
+		0xBFCD24C7,	/* c4. */
+		0xBF6EC2D4,	/* B' = pi/2 - B (high single). */
+		0x32BEF0A7,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBF3DDCCF,	/* c0 (high single). */
+		0xB2D29606,	/* c0 (low single). */
+		0x40000000,	/* c1 (high 1 bit). */
+		0xBEE6606F,	/* c1 (low single). */
+		0xBF9325D6,	/* c2. */
+		0x3FAF4E69,	/* c3. */
+		0xBFB3080C,	/* c4. */
+		0xBF750B52,	/* B' = pi/2 - B (high single). */
+		0xB2EB316F,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBF344BA9,	/* c0 (high single). */
+		0x32B8B0EA,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3EFDF4F7,	/* c1 (low single). */
+		0xBF86DCA8,	/* c2. */
+		0x3F9ED53B,	/* c3. */
+		0xBF9CBEDE,	/* c4. */
+		0xBF7B53D1,	/* B' = pi/2 - B (high single). */
+		0xB2955386,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBF2B0DC1,	/* c0 (high single). */
+		0xB2AB7EBA,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3EE496C2,	/* c1 (low single). */
+		0xBF776C40,	/* c2. */
+		0x3F9065C1,	/* c3. */
+		0xBF89AFB6,	/* c4. */
+		0xBF80CE28,	/* B' = pi/2 - B (high single). */
+		0xB1FDD672,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBF221C37,	/* c0 (high single). */
+		0x320C61DC,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3ECD4F71,	/* c1 (low single). */
+		0xBF631DAA,	/* c2. */
+		0x3F83B471,	/* c3. */
+		0xBF7281EA,	/* c4. */
+		0xBF83F267,	/* B' = pi/2 - B (high single). */
+		0xB374CBD9,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBF1970C4,	/* c0 (high single). */
+		0xB2904848,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3EB7EFF8,	/* c1 (low single). */
+		0xBF50907C,	/* c2. */
+		0x3F710FEA,	/* c3. */
+		0xBF561FED,	/* c4. */
+		0xBF8716A7,	/* B' = pi/2 - B (high single). */
+		0x32588C6D,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBF1105AF,	/* c0 (high single). */
+		0xB2F045B0,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3EA44EE2,	/* c1 (low single). */
+		0xBF3F8FDB,	/* c2. */
+		0x3F5D3FD0,	/* c3. */
+		0xBF3D0A23,	/* c4. */
+		0xBF8A3AE6,	/* B' = pi/2 - B (high single). */
+		0xB31EEDF0,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBF08D5B9,	/* c0 (high single). */
+		0x325EF98E,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3E92478D,	/* c1 (low single). */
+		0xBF2FEDC9,	/* c2. */
+		0x3F4BCD58,	/* c3. */
+		0xBF27AE9E,	/* c4. */
+		0xBF8D5F26,	/* B' = pi/2 - B (high single). */
+		0x330C0105,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBF00DC0D,	/* c0 (high single). */
+		0x3214AF72,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3E81B994,	/* c1 (low single). */
+		0xBF218233,	/* c2. */
+		0x3F3C4531,	/* c3. */
+		0xBF149688,	/* c4. */
+		0xBF908365,	/* B' = pi/2 - B (high single). */
+		0xB292200D,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBEF22870,	/* c0 (high single). */
+		0xB25271F4,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3E65107A,	/* c1 (low single). */
+		0xBF1429F0,	/* c2. */
+		0x3F2E8AFC,	/* c3. */
+		0xBF040498,	/* c4. */
+		0xBF93A7A5,	/* B' = pi/2 - B (high single). */
+		0x3361DEEE,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBEE2F439,	/* c0 (high single). */
+		0x31F4399E,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3E49341C,	/* c1 (low single). */
+		0xBF07C61A,	/* c2. */
+		0x3F22560F,	/* c3. */
+		0xBEEAA81E,	/* c4. */
+		0xBF96CBE4,	/* B' = pi/2 - B (high single). */
+		0x314CDE2E,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBED413CD,	/* c0 (high single). */
+		0x31C06152,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3E2FB0CC,	/* c1 (low single). */
+		0xBEF876CB,	/* c2. */
+		0x3F177807,	/* c3. */
+		0xBED08437,	/* c4. */
+		0xBF99F023,	/* B' = pi/2 - B (high single). */
+		0xB3484328,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBEC5800D,	/* c0 (high single). */
+		0x3214C3C1,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3E185E54,	/* c1 (low single). */
+		0xBEE2E342,	/* c2. */
+		0x3F0DCA73,	/* c3. */
+		0xBEB8CC21,	/* c4. */
+		0xBF9D1463,	/* B' = pi/2 - B (high single). */
+		0x32C55799,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBEB73250,	/* c0 (high single). */
+		0x32028823,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3E0318F8,	/* c1 (low single). */
+		0xBECEA678,	/* c2. */
+		0x3F053C67,	/* c3. */
+		0xBEA41E53,	/* c4. */
+		0xBFA038A2,	/* B' = pi/2 - B (high single). */
+		0xB2E4CA7E,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBEA92457,	/* c0 (high single). */
+		0xB0B80830,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3DDF8200,	/* c1 (low single). */
+		0xBEBB99E9,	/* c2. */
+		0x3EFB4AA8,	/* c3. */
+		0xBE9182BE,	/* c4. */
+		0xBFA35CE2,	/* B' = pi/2 - B (high single). */
+		0x333889B6,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBE9B5042,	/* c0 (high single). */
+		0x322A3AEE,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3DBC7490,	/* c1 (low single). */
+		0xBEA99AF5,	/* c2. */
+		0x3EEDE107,	/* c3. */
+		0xBE80E9AA,	/* c4. */
+		0xBFA68121,	/* B' = pi/2 - B (high single). */
+		0xB1E43AAC,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBE8DB082,	/* c0 (high single). */
+		0x3132A234,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3D9CD7D0,	/* c1 (low single). */
+		0xBE988A60,	/* c2. */
+		0x3EE203E3,	/* c3. */
+		0xBE63582C,	/* c4. */
+		0xBFA9A560,	/* B' = pi/2 - B (high single). */
+		0xB3719861,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBE803FD4,	/* c0 (high single). */
+		0x32279E66,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3D807FC8,	/* c1 (low single). */
+		0xBE884BD4,	/* c2. */
+		0x3ED7812D,	/* c3. */
+		0xBE4636EB,	/* c4. */
+		0xBFACC9A0,	/* B' = pi/2 - B (high single). */
+		0x32655A50,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBE65F267,	/* c0 (high single). */
+		0xB1B4B1DF,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3D4E8B90,	/* c1 (low single). */
+		0xBE718ACA,	/* c2. */
+		0x3ECE7164,	/* c3. */
+		0xBE2DC161,	/* c4. */
+		0xBFAFEDDF,	/* B' = pi/2 - B (high single). */
+		0xB31BBA77,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBE4BAFAF,	/* c0 (high single). */
+		0xAF2A29E0,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3D221018,	/* c1 (low single). */
+		0xBE53BED0,	/* c2. */
+		0x3EC67E26,	/* c3. */
+		0xBE1568E2,	/* c4. */
+		0xBFB3121F,	/* B' = pi/2 - B (high single). */
+		0x330F347D,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBE31AE4D,	/* c0 (high single). */
+		0x31F32251,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3CF6A500,	/* c1 (low single). */
+		0xBE3707DA,	/* c2. */
+		0x3EBFA489,	/* c3. */
+		0xBDFBD9C7,	/* c4. */
+		0xBFB6365E,	/* B' = pi/2 - B (high single). */
+		0xB28BB91C,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBE17E564,	/* c0 (high single). */
+		0x31C5A2E4,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3CB440D0,	/* c1 (low single). */
+		0xBE1B3D00,	/* c2. */
+		0x3EB9F664,	/* c3. */
+		0xBDD647C0,	/* c4. */
+		0xBFB95A9E,	/* B' = pi/2 - B (high single). */
+		0x33651267,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBDFC98C2,	/* c0 (high single). */
+		0x30AE525C,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3C793D20,	/* c1 (low single). */
+		0xBE003845,	/* c2. */
+		0x3EB5271F,	/* c3. */
+		0xBDAC669E,	/* c4. */
+		0xBFBC7EDD,	/* B' = pi/2 - B (high single). */
+		0x31800ADD,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBDC9B5DC,	/* c0 (high single). */
+		0xB145AD86,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3C1EEF20,	/* c1 (low single). */
+		0xBDCBAAEA,	/* c2. */
+		0x3EB14E5E,	/* c3. */
+		0xBD858BB2,	/* c4. */
+		0xBFBFA31C,	/* B' = pi/2 - B (high single). */
+		0xB3450FB0,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBD9711CE,	/* c0 (high single). */
+		0xB14FEB28,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3BB24C00,	/* c1 (low single). */
+		0xBD97E43A,	/* c2. */
+		0x3EAE6A89,	/* c3. */
+		0xBD4D07E0,	/* c4. */
+		0xBFC2C75C,	/* B' = pi/2 - B (high single). */
+		0x32CBBE8A,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBD49393C,	/* c0 (high single). */
+		0xB0A39F5B,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3B1E2B00,	/* c1 (low single). */
+		0xBD49B5D4,	/* c2. */
+		0x3EAC4F10,	/* c3. */
+		0xBCFD9425,	/* c4. */
+		0xBFC5EB9B,	/* B' = pi/2 - B (high single). */
+		0xB2DE638C,	/* B' = pi/2 - B (low single). */
+		0x00000000,	/* tau (1 for cot path). */
+		0xBCC91A31,	/* c0 (high single). */
+		0xAF8E8D1A,	/* c0 (low single). */
+		0x3F800000,	/* c1 (high 1 bit). */
+		0x3A1DFA00,	/* c1 (low single). */
+		0xBCC9392D,	/* c2. */
+		0x3EAB1889,	/* c3. */
+		0xBC885D3B,	/* c4. */
+		/* So it is safe to read 32 bytes past the end.  */
+		0x00000000,
+		0x00000000,
+		0x00000000,
+		0x00000000,
+		0x00000000,
+		0x00000000,
+		0x00000000,
+		0x00000000)
+	.type	__svml_stan_common_data_avx2, @object
+	.size	__svml_stan_common_data_avx2, .-__svml_stan_common_data_avx2
+
+# endif
+#endif
+
+#if (defined AVX512_SHARED_OFFSETS) || (defined AVX512_SHARED_TABLE)
+
+# define _Reduction	0
+# define AVX512_SHARED_DATA(offset)	\
+	((offset) + (__svml_stan_common_data_avx512))
+
+# if (defined AVX512_SHARED_TABLE)
+
+	.section .rodata.evex512, "a"
+	.align	64
+	.globl	__svml_stan_common_data_avx512
+__svml_stan_common_data_avx512:
+	float_block (__svml_stan_common_data_avx512, _Reduction,
+		/* P_hi     P_med       P_lo.  */
+		0x00000000, 0x00000000, 0x00000000,	/* 0 */
+		0x00000000, 0x00000000, 0x00000000,	/* 1 */
+		0x00000000, 0x00000000, 0x00000000,	/* 2 */
+		0x00000000, 0x00000000, 0x00000000,	/* 3 */
+		0x00000000, 0x00000000, 0x00000000,	/* 4 */
+		0x00000000, 0x00000000, 0x00000000,	/* 5 */
+		0x00000000, 0x00000000, 0x00000000,	/* 6 */
+		0x00000000, 0x00000000, 0x00000000,	/* 7 */
+		0x00000000, 0x00000000, 0x00000000,	/* 8 */
+		0x00000000, 0x00000000, 0x00000000,	/* 9 */
+		0x00000000, 0x00000000, 0x00000000,	/* 10 */
+		0x00000000, 0x00000000, 0x00000000,	/* 11 */
+		0x00000000, 0x00000000, 0x00000000,	/* 12 */
+		0x00000000, 0x00000000, 0x00000000,	/* 13 */
+		0x00000000, 0x00000000, 0x00000000,	/* 14 */
+		0x00000000, 0x00000000, 0x00000000,	/* 15 */
+		0x00000000, 0x00000000, 0x00000000,	/* 16 */
+		0x00000000, 0x00000000, 0x00000000,	/* 17 */
+		0x00000000, 0x00000000, 0x00000000,	/* 18 */
+		0x00000000, 0x00000000, 0x00000000,	/* 19 */
+		0x00000000, 0x00000000, 0x00000000,	/* 20 */
+		0x00000000, 0x00000000, 0x00000000,	/* 21 */
+		0x00000000, 0x00000000, 0x00000000,	/* 22 */
+		0x00000000, 0x00000000, 0x00000000,	/* 23 */
+		0x00000000, 0x00000000, 0x00000000,	/* 24 */
+		0x00000000, 0x00000000, 0x00000000,	/* 25 */
+		0x00000000, 0x00000000, 0x00000000,	/* 26 */
+		0x00000000, 0x00000000, 0x00000000,	/* 27 */
+		0x00000000, 0x00000000, 0x00000000,	/* 28 */
+		0x00000000, 0x00000000, 0x00000000,	/* 29 */
+		0x00000000, 0x00000000, 0x00000000,	/* 30 */
+		0x00000000, 0x00000000, 0x00000000,	/* 31 */
+		0x00000000, 0x00000000, 0x00000000,	/* 32 */
+		0x00000000, 0x00000000, 0x00000000,	/* 33 */
+		0x00000000, 0x00000000, 0x00000000,	/* 34 */
+		0x00000000, 0x00000000, 0x00000000,	/* 35 */
+		0x00000000, 0x00000000, 0x00000000,	/* 36 */
+		0x00000000, 0x00000000, 0x00000000,	/* 37 */
+		0x00000000, 0x00000000, 0x00000000,	/* 38 */
+		0x00000000, 0x00000000, 0x00000000,	/* 39 */
+		0x00000000, 0x00000000, 0x00000000,	/* 40 */
+		0x00000000, 0x00000000, 0x00000000,	/* 41 */
+		0x00000000, 0x00000000, 0x00000000,	/* 42 */
+		0x00000000, 0x00000000, 0x00000000,	/* 43 */
+		0x00000000, 0x00000000, 0x00000000,	/* 44 */
+		0x00000000, 0x00000000, 0x00000000,	/* 45 */
+		0x00000000, 0x00000000, 0x00000000,	/* 46 */
+		0x00000000, 0x00000000, 0x00000000,	/* 47 */
+		0x00000000, 0x00000000, 0x00000000,	/* 48 */
+		0x00000000, 0x00000000, 0x00000000,	/* 49 */
+		0x00000000, 0x00000000, 0x00000000,	/* 50 */
+		0x00000000, 0x00000000, 0x00000000,	/* 51 */
+		0x00000000, 0x00000000, 0x00000000,	/* 52 */
+		0x00000000, 0x00000000, 0x00000000,	/* 53 */
+		0x00000000, 0x00000000, 0x00000000,	/* 54 */
+		0x00000000, 0x00000000, 0x00000000,	/* 55 */
+		0x00000000, 0x00000000, 0x00000000,	/* 56 */
+		0x00000000, 0x00000000, 0x00000001,	/* 57 */
+		0x00000000, 0x00000000, 0x00000002,	/* 58 */
+		0x00000000, 0x00000000, 0x00000005,	/* 59 */
+		0x00000000, 0x00000000, 0x0000000A,	/* 60 */
+		0x00000000, 0x00000000, 0x00000014,	/* 61 */
+		0x00000000, 0x00000000, 0x00000028,	/* 62 */
+		0x00000000, 0x00000000, 0x00000051,	/* 63 */
+		0x00000000, 0x00000000, 0x000000A2,	/* 64 */
+		0x00000000, 0x00000000, 0x00000145,	/* 65 */
+		0x00000000, 0x00000000, 0x0000028B,	/* 66 */
+		0x00000000, 0x00000000, 0x00000517,	/* 67 */
+		0x00000000, 0x00000000, 0x00000A2F,	/* 68 */
+		0x00000000, 0x00000000, 0x0000145F,	/* 69 */
+		0x00000000, 0x00000000, 0x000028BE,	/* 70 */
+		0x00000000, 0x00000000, 0x0000517C,	/* 71 */
+		0x00000000, 0x00000000, 0x0000A2F9,	/* 72 */
+		0x00000000, 0x00000000, 0x000145F3,	/* 73 */
+		0x00000000, 0x00000000, 0x00028BE6,	/* 74 */
+		0x00000000, 0x00000000, 0x000517CC,	/* 75 */
+		0x00000000, 0x00000000, 0x000A2F98,	/* 76 */
+		0x00000000, 0x00000000, 0x00145F30,	/* 77 */
+		0x00000000, 0x00000000, 0x0028BE60,	/* 78 */
+		0x00000000, 0x00000000, 0x00517CC1,	/* 79 */
+		0x00000000, 0x00000000, 0x00A2F983,	/* 80 */
+		0x00000000, 0x00000000, 0x0145F306,	/* 81 */
+		0x00000000, 0x00000000, 0x028BE60D,	/* 82 */
+		0x00000000, 0x00000000, 0x0517CC1B,	/* 83 */
+		0x00000000, 0x00000000, 0x0A2F9836,	/* 84 */
+		0x00000000, 0x00000000, 0x145F306D,	/* 85 */
+		0x00000000, 0x00000000, 0x28BE60DB,	/* 86 */
+		0x00000000, 0x00000000, 0x517CC1B7,	/* 87 */
+		0x00000000, 0x00000000, 0xA2F9836E,	/* 88 */
+		0x00000000, 0x00000001, 0x45F306DC,	/* 89 */
+		0x00000000, 0x00000002, 0x8BE60DB9,	/* 90 */
+		0x00000000, 0x00000005, 0x17CC1B72,	/* 91 */
+		0x00000000, 0x0000000A, 0x2F9836E4,	/* 92 */
+		0x00000000, 0x00000014, 0x5F306DC9,	/* 93 */
+		0x00000000, 0x00000028, 0xBE60DB93,	/* 94 */
+		0x00000000, 0x00000051, 0x7CC1B727,	/* 95 */
+		0x00000000, 0x000000A2, 0xF9836E4E,	/* 96 */
+		0x00000000, 0x00000145, 0xF306DC9C,	/* 97 */
+		0x00000000, 0x0000028B, 0xE60DB939,	/* 98 */
+		0x00000000, 0x00000517, 0xCC1B7272,	/* 99 */
+		0x00000000, 0x00000A2F, 0x9836E4E4,	/* 100 */
+		0x00000000, 0x0000145F, 0x306DC9C8,	/* 101 */
+		0x00000000, 0x000028BE, 0x60DB9391,	/* 102 */
+		0x00000000, 0x0000517C, 0xC1B72722,	/* 103 */
+		0x00000000, 0x0000A2F9, 0x836E4E44,	/* 104 */
+		0x00000000, 0x000145F3, 0x06DC9C88,	/* 105 */
+		0x00000000, 0x00028BE6, 0x0DB93910,	/* 106 */
+		0x00000000, 0x000517CC, 0x1B727220,	/* 107 */
+		0x00000000, 0x000A2F98, 0x36E4E441,	/* 108 */
+		0x00000000, 0x00145F30, 0x6DC9C882,	/* 109 */
+		0x00000000, 0x0028BE60, 0xDB939105,	/* 110 */
+		0x00000000, 0x00517CC1, 0xB727220A,	/* 111 */
+		0x00000000, 0x00A2F983, 0x6E4E4415,	/* 112 */
+		0x00000000, 0x0145F306, 0xDC9C882A,	/* 113 */
+		0x00000000, 0x028BE60D, 0xB9391054,	/* 114 */
+		0x00000000, 0x0517CC1B, 0x727220A9,	/* 115 */
+		0x00000000, 0x0A2F9836, 0xE4E44152,	/* 116 */
+		0x00000000, 0x145F306D, 0xC9C882A5,	/* 117 */
+		0x00000000, 0x28BE60DB, 0x9391054A,	/* 118 */
+		0x00000000, 0x517CC1B7, 0x27220A94,	/* 119 */
+		0x00000000, 0xA2F9836E, 0x4E441529,	/* 120 */
+		0x00000001, 0x45F306DC, 0x9C882A53,	/* 121 */
+		0x00000002, 0x8BE60DB9, 0x391054A7,	/* 122 */
+		0x00000005, 0x17CC1B72, 0x7220A94F,	/* 123 */
+		0x0000000A, 0x2F9836E4, 0xE441529F,	/* 124 */
+		0x00000014, 0x5F306DC9, 0xC882A53F,	/* 125 */
+		0x00000028, 0xBE60DB93, 0x91054A7F,	/* 126 */
+		0x00000051, 0x7CC1B727, 0x220A94FE,	/* 127 */
+		0x000000A2, 0xF9836E4E, 0x441529FC,	/* 128 */
+		0x00000145, 0xF306DC9C, 0x882A53F8,	/* 129 */
+		0x0000028B, 0xE60DB939, 0x1054A7F0,	/* 130 */
+		0x00000517, 0xCC1B7272, 0x20A94FE1,	/* 131 */
+		0x00000A2F, 0x9836E4E4, 0x41529FC2,	/* 132 */
+		0x0000145F, 0x306DC9C8, 0x82A53F84,	/* 133 */
+		0x000028BE, 0x60DB9391, 0x054A7F09,	/* 134 */
+		0x0000517C, 0xC1B72722, 0x0A94FE13,	/* 135 */
+		0x0000A2F9, 0x836E4E44, 0x1529FC27,	/* 136 */
+		0x000145F3, 0x06DC9C88, 0x2A53F84E,	/* 137 */
+		0x00028BE6, 0x0DB93910, 0x54A7F09D,	/* 138 */
+		0x000517CC, 0x1B727220, 0xA94FE13A,	/* 139 */
+		0x000A2F98, 0x36E4E441, 0x529FC275,	/* 140 */
+		0x00145F30, 0x6DC9C882, 0xA53F84EA,	/* 141 */
+		0x0028BE60, 0xDB939105, 0x4A7F09D5,	/* 142 */
+		0x00517CC1, 0xB727220A, 0x94FE13AB,	/* 143 */
+		0x00A2F983, 0x6E4E4415, 0x29FC2757,	/* 144 */
+		0x0145F306, 0xDC9C882A, 0x53F84EAF,	/* 145 */
+		0x028BE60D, 0xB9391054, 0xA7F09D5F,	/* 146 */
+		0x0517CC1B, 0x727220A9, 0x4FE13ABE,	/* 147 */
+		0x0A2F9836, 0xE4E44152, 0x9FC2757D,	/* 148 */
+		0x145F306D, 0xC9C882A5, 0x3F84EAFA,	/* 149 */
+		0x28BE60DB, 0x9391054A, 0x7F09D5F4,	/* 150 */
+		0x517CC1B7, 0x27220A94, 0xFE13ABE8,	/* 151 */
+		0xA2F9836E, 0x4E441529, 0xFC2757D1,	/* 152 */
+		0x45F306DC, 0x9C882A53, 0xF84EAFA3,	/* 153 */
+		0x8BE60DB9, 0x391054A7, 0xF09D5F47,	/* 154 */
+		0x17CC1B72, 0x7220A94F, 0xE13ABE8F,	/* 155 */
+		0x2F9836E4, 0xE441529F, 0xC2757D1F,	/* 156 */
+		0x5F306DC9, 0xC882A53F, 0x84EAFA3E,	/* 157 */
+		0xBE60DB93, 0x91054A7F, 0x09D5F47D,	/* 158 */
+		0x7CC1B727, 0x220A94FE, 0x13ABE8FA,	/* 159 */
+		0xF9836E4E, 0x441529FC, 0x2757D1F5,	/* 160 */
+		0xF306DC9C, 0x882A53F8, 0x4EAFA3EA,	/* 161 */
+		0xE60DB939, 0x1054A7F0, 0x9D5F47D4,	/* 162 */
+		0xCC1B7272, 0x20A94FE1, 0x3ABE8FA9,	/* 163 */
+		0x9836E4E4, 0x41529FC2, 0x757D1F53,	/* 164 */
+		0x306DC9C8, 0x82A53F84, 0xEAFA3EA6,	/* 165 */
+		0x60DB9391, 0x054A7F09, 0xD5F47D4D,	/* 166 */
+		0xC1B72722, 0x0A94FE13, 0xABE8FA9A,	/* 167 */
+		0x836E4E44, 0x1529FC27, 0x57D1F534,	/* 168 */
+		0x06DC9C88, 0x2A53F84E, 0xAFA3EA69,	/* 169 */
+		0x0DB93910, 0x54A7F09D, 0x5F47D4D3,	/* 170 */
+		0x1B727220, 0xA94FE13A, 0xBE8FA9A6,	/* 171 */
+		0x36E4E441, 0x529FC275, 0x7D1F534D,	/* 172 */
+		0x6DC9C882, 0xA53F84EA, 0xFA3EA69B,	/* 173 */
+		0xDB939105, 0x4A7F09D5, 0xF47D4D37,	/* 174 */
+		0xB727220A, 0x94FE13AB, 0xE8FA9A6E,	/* 175 */
+		0x6E4E4415, 0x29FC2757, 0xD1F534DD,	/* 176 */
+		0xDC9C882A, 0x53F84EAF, 0xA3EA69BB,	/* 177 */
+		0xB9391054, 0xA7F09D5F, 0x47D4D377,	/* 178 */
+		0x727220A9, 0x4FE13ABE, 0x8FA9A6EE,	/* 179 */
+		0xE4E44152, 0x9FC2757D, 0x1F534DDC,	/* 180 */
+		0xC9C882A5, 0x3F84EAFA, 0x3EA69BB8,	/* 181 */
+		0x9391054A, 0x7F09D5F4, 0x7D4D3770,	/* 182 */
+		0x27220A94, 0xFE13ABE8, 0xFA9A6EE0,	/* 183 */
+		0x4E441529, 0xFC2757D1, 0xF534DDC0,	/* 184 */
+		0x9C882A53, 0xF84EAFA3, 0xEA69BB81,	/* 185 */
+		0x391054A7, 0xF09D5F47, 0xD4D37703,	/* 186 */
+		0x7220A94F, 0xE13ABE8F, 0xA9A6EE06,	/* 187 */
+		0xE441529F, 0xC2757D1F, 0x534DDC0D,	/* 188 */
+		0xC882A53F, 0x84EAFA3E, 0xA69BB81B,	/* 189 */
+		0x91054A7F, 0x09D5F47D, 0x4D377036,	/* 190 */
+		0x220A94FE, 0x13ABE8FA, 0x9A6EE06D,	/* 191 */
+		0x441529FC, 0x2757D1F5, 0x34DDC0DB,	/* 192 */
+		0x882A53F8, 0x4EAFA3EA, 0x69BB81B6,	/* 193 */
+		0x1054A7F0, 0x9D5F47D4, 0xD377036D,	/* 194 */
+		0x20A94FE1, 0x3ABE8FA9, 0xA6EE06DB,	/* 195 */
+		0x41529FC2, 0x757D1F53, 0x4DDC0DB6,	/* 196 */
+		0x82A53F84, 0xEAFA3EA6, 0x9BB81B6C,	/* 197 */
+		0x054A7F09, 0xD5F47D4D, 0x377036D8,	/* 198 */
+		0x0A94FE13, 0xABE8FA9A, 0x6EE06DB1,	/* 199 */
+		0x1529FC27, 0x57D1F534, 0xDDC0DB62,	/* 200 */
+		0x2A53F84E, 0xAFA3EA69, 0xBB81B6C5,	/* 201 */
+		0x54A7F09D, 0x5F47D4D3, 0x77036D8A,	/* 202 */
+		0xA94FE13A, 0xBE8FA9A6, 0xEE06DB14,	/* 203 */
+		0x529FC275, 0x7D1F534D, 0xDC0DB629,	/* 204 */
+		0xA53F84EA, 0xFA3EA69B, 0xB81B6C52,	/* 205 */
+		0x4A7F09D5, 0xF47D4D37, 0x7036D8A5,	/* 206 */
+		0x94FE13AB, 0xE8FA9A6E, 0xE06DB14A,	/* 207 */
+		0x29FC2757, 0xD1F534DD, 0xC0DB6295,	/* 208 */
+		0x53F84EAF, 0xA3EA69BB, 0x81B6C52B,	/* 209 */
+		0xA7F09D5F, 0x47D4D377, 0x036D8A56,	/* 210 */
+		0x4FE13ABE, 0x8FA9A6EE, 0x06DB14AC,	/* 211 */
+		0x9FC2757D, 0x1F534DDC, 0x0DB62959,	/* 212 */
+		0x3F84EAFA, 0x3EA69BB8, 0x1B6C52B3,	/* 213 */
+		0x7F09D5F4, 0x7D4D3770, 0x36D8A566,	/* 214 */
+		0xFE13ABE8, 0xFA9A6EE0, 0x6DB14ACC,	/* 215 */
+		0xFC2757D1, 0xF534DDC0, 0xDB629599,	/* 216 */
+		0xF84EAFA3, 0xEA69BB81, 0xB6C52B32,	/* 217 */
+		0xF09D5F47, 0xD4D37703, 0x6D8A5664,	/* 218 */
+		0xE13ABE8F, 0xA9A6EE06, 0xDB14ACC9,	/* 219 */
+		0xC2757D1F, 0x534DDC0D, 0xB6295993,	/* 220 */
+		0x84EAFA3E, 0xA69BB81B, 0x6C52B327,	/* 221 */
+		0x09D5F47D, 0x4D377036, 0xD8A5664F,	/* 222 */
+		0x13ABE8FA, 0x9A6EE06D, 0xB14ACC9E,	/* 223 */
+		0x2757D1F5, 0x34DDC0DB, 0x6295993C,	/* 224 */
+		0x4EAFA3EA, 0x69BB81B6, 0xC52B3278,	/* 225 */
+		0x9D5F47D4, 0xD377036D, 0x8A5664F1,	/* 226 */
+		0x3ABE8FA9, 0xA6EE06DB, 0x14ACC9E2,	/* 227 */
+		0x757D1F53, 0x4DDC0DB6, 0x295993C4,	/* 228 */
+		0xEAFA3EA6, 0x9BB81B6C, 0x52B32788,	/* 229 */
+		0xD5F47D4D, 0x377036D8, 0xA5664F10,	/* 230 */
+		0xABE8FA9A, 0x6EE06DB1, 0x4ACC9E21,	/* 231 */
+		0x57D1F534, 0xDDC0DB62, 0x95993C43,	/* 232 */
+		0xAFA3EA69, 0xBB81B6C5, 0x2B327887,	/* 233 */
+		0x5F47D4D3, 0x77036D8A, 0x5664F10E,	/* 234 */
+		0xBE8FA9A6, 0xEE06DB14, 0xACC9E21C,	/* 235 */
+		0x7D1F534D, 0xDC0DB629, 0x5993C439,	/* 236 */
+		0xFA3EA69B, 0xB81B6C52, 0xB3278872,	/* 237 */
+		0xF47D4D37, 0x7036D8A5, 0x664F10E4,	/* 238 */
+		0xE8FA9A6E, 0xE06DB14A, 0xCC9E21C8,	/* 239 */
+		0xD1F534DD, 0xC0DB6295, 0x993C4390,	/* 240 */
+		0xA3EA69BB, 0x81B6C52B, 0x32788720,	/* 241 */
+		0x47D4D377, 0x036D8A56, 0x64F10E41,	/* 242 */
+		0x8FA9A6EE, 0x06DB14AC, 0xC9E21C82,	/* 243 */
+		0x1F534DDC, 0x0DB62959, 0x93C43904,	/* 244 */
+		0x3EA69BB8, 0x1B6C52B3, 0x27887208,	/* 245 */
+		0x7D4D3770, 0x36D8A566, 0x4F10E410,	/* 246 */
+		0xFA9A6EE0, 0x6DB14ACC, 0x9E21C820,	/* 247 */
+		0xF534DDC0, 0xDB629599, 0x3C439041,	/* 248 */
+		0xEA69BB81, 0xB6C52B32, 0x78872083,	/* 249 */
+		0xD4D37703, 0x6D8A5664, 0xF10E4107,	/* 250 */
+		0xA9A6EE06, 0xDB14ACC9, 0xE21C820F,	/* 251 */
+		0x534DDC0D, 0xB6295993, 0xC439041F,	/* 252 */
+		0xA69BB81B, 0x6C52B327, 0x8872083F,	/* 253 */
+		0x4D377036, 0xD8A5664F, 0x10E4107F,	/* 254 */
+		0x9A6EE06D, 0xB14ACC9E, 0x21C820FF,	/* 255 */
+		/* Exists so that it is safe to read 32 bytes past
+		   the end.  */
+		0x00000000, 0x00000000, 0x00000000,
+		0x00000000, 0x00000000, 0x00000000,
+		0x00000000, 0x00000000)
+	.type	__svml_stan_common_data_avx512, @object
+	.size	__svml_stan_common_data_avx512, .-__svml_stan_common_data_avx512
+# endif
+#endif
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v1 15/27] x86/fpu: Optimize svml_s_tanf16_core_avx512.S
  2022-12-07  8:52 [PATCH v1 01/27] x86/fpu: Create helper file for common data macros Noah Goldstein via Libc-alpha
                   ` (12 preceding siblings ...)
  2022-12-07  8:52 ` [PATCH v1 14/27] x86/fpu: Add common rodata file for svml_s_tanf_*_{avx512, avx2, sse4}.S Noah Goldstein via Libc-alpha
@ 2022-12-07  8:52 ` Noah Goldstein via Libc-alpha
  2022-12-07  8:52 ` [PATCH v1 16/27] x86/fpu: Optimize svml_s_tanf4_core_sse4.S Noah Goldstein via Libc-alpha
                   ` (12 subsequent siblings)
  26 siblings, 0 replies; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-07  8:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, andrey.kolesov, carlos

1. Add option to sacrifice some precision to save instructions.
    - Three precision levels that can be set by defining `PRECISION`.
    - Lower setting gets better perf but higher average ULP error.
        - All settings stay in 4ulp bound.
2. Cleanup some missed optimizations in instruction selection /
   unnecessary repeated rodata references.
3. Remove unused rodata.
4. Use common data definitions where possible.

ULP Error results for the three `PRECISION` values:

   PRECISION == 0:
       ulp:
       0  :  3374033104 (0.7856)
       1  :   893707604 (0.2081)
       2  :    26831634 (0.0062)
       3  :      393466 (0.0001)
       4  :        1488 (0.0000)

   PRECISION == 1:
       ulp:
       0  : 3677094430 (0.8561)
       1  :  609296734 (0.1419)
       2  :    8347192 (0.0019)
       3  :     228138 (0.0001)
       4  :        802 (0.0000)

   PRECISION == 2 (Same dist as current impl):
       ulp:
       error breakdown:
       0  :  3722920128 (0.8668)
       1  :   566817724 (0.1320)
       2  :     5022802 (0.0012)
       3  :      205902 (0.0000)
       4  :         740 (0.0000)

Currently leaving `PRECISION` set at zero as the function stays in the
4 ulp limit and it gets the best performance.

Code Size Change: -176 Bytes (1130 - 1306)

Input                                 New Time / Old Time
0F          (0x00000000)           -> 0.6867
0F          (0x0000ffff, Denorm)   -> 0.5873
.1F         (0x3dcccccd)           -> 0.6561
5F          (0x40a00000)           -> 0.6486
2315255808F (0x4f0a0000)           -> 0.7996
-NaN        (0xffffffff)           -> 0.8154
---
 .../fpu/multiarch/svml_s_tanf16_core_avx512.S | 1352 +++++++----------
 1 file changed, 522 insertions(+), 830 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S
index da3477f16e..26362a673d 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S
@@ -32,893 +32,585 @@
  *
  */
 
+#define LOCAL_DATA_NAME	__svml_stan_data_internal
+#define LOCAL_DATA_NAME_UNALIGNED	__svml_stan_data_internal_unaligned
+#include "svml_s_common_evex512_rodata_offsets.h"
+
+#define AVX512_SHARED_TABLE
+#include "svml_s_tanf_rodata.h.S"
+
+/* Offsets for data table __svml_stan_data_internal_unaligned
+ */
+#define _FLT_1_1to16	0
+#define _FLT_2_1to16	4
+#define _FLT_3_1to16	8
+
+
 /* Offsets for data table __svml_stan_data_internal
  */
-#define _sInvPI_uisa			0
-#define _sPI1_uisa			64
-#define _sPI2_uisa			128
-#define _sPI3_uisa			192
-#define Th_tbl_uisa			256
-#define _sPC3_uisa			384
-#define _sPC5_uisa			448
-#define _sRangeReductionVal_uisa	512
-#define _sAbsMask			576
-#define _sRangeVal			640
-#define _sRShifter			704
-#define _sOne				768
-#define _sRangeReductionVal		832
-#define _sPI1				896
-#define _sPI2				960
-#define _sPI3				1024
+#define _sInvPI_uisa	0
+#define _sRShifter	64
+#define _sPI1_uisa	128
+#define _sPI2_uisa	192
+#define _sPI3_uisa	256
+#define _sRangeReductionVal_uisa	320
+#define _sPC5_uisa	384
+#define _sPC3_uisa	448
+#define _Th_tbl_uisa_lo	512
+#define _Th_tbl_uisa_hi	576
+#define _sRangeVal	640
+#define _FLT_1	704
+#define _FLT_2	768
+#define _FLT_3	832
+#define _FLT_4	896
+#define _FLT_5	960
+#define _FLT_6	1024
+#define _FLT_7	1088
+
+#define PRECISION	0
+/* 0, 1, or 2. The following values get the following
+   ULP breakdowns:
+   PRECISION == 0:
+       ulp:
+       0  :  3374033104 (0.7856)
+       1  :   893707604 (0.2081)
+       2  :    26831634 (0.0062)
+       3  :      393466 (0.0001)
+       4  :        1488 (0.0000)
+       Avg: 0.2209
+
+   PRECISION == 1:
+       ulp:
+       0  : 3677094430 (0.8561)
+       1  :  609296734 (0.1419)
+       2  :    8347192 (0.0019)
+       3  :     228138 (0.0001)
+       4  :        802 (0.0000)
+       Avg: 0.1459
+
+   PRECISION == 2:
+       ulp:
+       error breakdown:
+       0  :  3722920128 (0.8668)
+       1  :   566817724 (0.1320)
+       2  :     5022802 (0.0012)
+       3  :      205902 (0.0000)
+       4  :         740 (0.0000)
+       Avg: 0.1345  */
 
 #include <sysdep.h>
 
 	.section .text.evex512, "ax", @progbits
 ENTRY(_ZGVeN16v_tanf_skx)
-	pushq	%rbp
-	cfi_def_cfa_offset(16)
-	movq	%rsp, %rbp
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-	andq	$-64, %rsp
-	subq	$192, %rsp
-	xorl	%edx, %edx
-
-	/* Large values check */
-	vmovups	_sRangeReductionVal_uisa+__svml_stan_data_internal(%rip), %zmm10
-
-	/*
-	 *
-	 * Main path
-	 *
-	 * start arg. reduction
-	 */
-	vmovups	_sRShifter+__svml_stan_data_internal(%rip), %zmm1
-	vmovups	_sPI1_uisa+__svml_stan_data_internal(%rip), %zmm4
-	vmovups	_sPI2_uisa+__svml_stan_data_internal(%rip), %zmm2
-	vmovups	_sPI3_uisa+__svml_stan_data_internal(%rip), %zmm3
-	vmovaps	%zmm0, %zmm11
-	vandps	_sAbsMask+__svml_stan_data_internal(%rip), %zmm11, %zmm0
-	vcmpps	$22, {sae}, %zmm10, %zmm0, %k6
-	vmovups	__svml_stan_data_internal(%rip), %zmm10
-
-	/*
-	 *
-	 * End of main path
-	 */
-
-	kortestw %k6, %k6
-	vfmadd213ps {rn-sae}, %zmm1, %zmm11, %zmm10
+	/* Main path start arg. reduction.  */
+	vmovups	LOCAL_DATA(_sInvPI_uisa)(%rip), %zmm10
+	vmovups	LOCAL_DATA(_sRShifter)(%rip), %zmm1
+	vfmadd213ps {rn-sae}, %zmm1, %zmm0, %zmm10
 	vsubps	{rn-sae}, %zmm1, %zmm10, %zmm5
-	vfnmadd213ps {rn-sae}, %zmm11, %zmm5, %zmm4
+	vmovups	LOCAL_DATA(_sPI1_uisa)(%rip), %zmm4
+	vfnmadd213ps {rn-sae}, %zmm0, %zmm5, %zmm4
+	vmovups	LOCAL_DATA(_sPI2_uisa)(%rip), %zmm2
 	vfnmadd231ps {rn-sae}, %zmm5, %zmm2, %zmm4
+	vmovups	LOCAL_DATA(_sPI3_uisa)(%rip), %zmm3
 	vfnmadd213ps {rn-sae}, %zmm4, %zmm3, %zmm5
 
-	/* Go to auxilary branch */
+
+	/* Reused throughout in large case.  */
+	vmovaps	COMMON_DATA(_AbsMask)(%rip), %zmm7
+
+	/* Large values check.  */
+	vmovups	LOCAL_DATA(_sRangeReductionVal_uisa)(%rip), %zmm6
+	vandps	%zmm7, %zmm0, %zmm11
+	vcmpps	$22, {sae}, %zmm6, %zmm11, %k6
+
+	ktestd	%k6, %k6
+	/* Go to auxiliary branch.  */
 	jne	L(AUX_BRANCH)
-	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm5 zmm10 zmm11 k6
 
-	/* Return from auxilary branch
-	 * for out of main path inputs
-	 */
 
-L(AUX_BRANCH_RETURN):
-	/* Table lookup */
-	vmovups	Th_tbl_uisa+__svml_stan_data_internal(%rip), %zmm3
-	vmovups	_sPC3_uisa+__svml_stan_data_internal(%rip), %zmm0
+	/* Table lookup.  */
 	vmulps	{rn-sae}, %zmm5, %zmm5, %zmm1
-	vpermt2ps Th_tbl_uisa+64+__svml_stan_data_internal(%rip), %zmm10, %zmm3
-	vmovups	_sPC5_uisa+__svml_stan_data_internal(%rip), %zmm10
-	vfmadd231ps {rn-sae}, %zmm1, %zmm10, %zmm0
-	vmulps	{rn-sae}, %zmm5, %zmm0, %zmm4
-	vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm4
+	vmovups	LOCAL_DATA(_sPC5_uisa)(%rip), %zmm4
+	vmovups	LOCAL_DATA(_sPC3_uisa)(%rip), %zmm11
+	vfmadd231ps {rn-sae}, %zmm1, %zmm4, %zmm11
+	vmulps	{rn-sae}, %zmm5, %zmm1, %zmm2
+	vmovups	LOCAL_DATA(_Th_tbl_uisa_lo)(%rip), %zmm3
+	vpermt2ps LOCAL_DATA(_Th_tbl_uisa_hi)(%rip), %zmm10, %zmm3
+	vfmadd213ps {rn-sae}, %zmm5, %zmm2, %zmm11
 
-	/*
-	 * Computer Denominator:
-	 * sDenominator - sDlow ~= 1-(sTh+sTl)*(sP+sPlow)
-	 */
-	vmovups	_sOne+__svml_stan_data_internal(%rip), %zmm5
-	vmulps	{rn-sae}, %zmm4, %zmm3, %zmm7
 
-	/*
-	 * Compute Numerator:
-	 * sNumerator + sNlow ~= sTh+sTl+sP+sPlow
-	 */
-	vaddps	{rn-sae}, %zmm3, %zmm4, %zmm8
+	/* Compute Denominator:
+	   sDenominator - sDlow ~= 1-(sTh+sTl) * (sP+sPlow).  */
+	vmulps	{rn-sae}, %zmm11, %zmm3, %zmm7
+
+
+	vmovups	COMMON_DATA(_OneF)(%rip), %zmm5
+
+	/* Compute Numerator:
+	   sNumerator + sNlow ~= sTh+sTl+sP+sPlow.  */
+	vaddps	{rn-sae}, %zmm3, %zmm11, %zmm8
 	vsubps	{rn-sae}, %zmm7, %zmm5, %zmm9
-	vsubps	{rn-sae}, %zmm3, %zmm8, %zmm2
 
-	/*
-	 * Now computes (sNumerator + sNlow)/(sDenominator - sDlow)
-	 * Choose NR iteration instead of hardware division
-	 */
+#if PRECISION >= 2
+	/* High Precision Version.  */
 	vrcp14ps %zmm9, %zmm14
+	vsubps	{rn-sae}, %zmm3, %zmm8, %zmm2
+
 	vsubps	{rn-sae}, %zmm5, %zmm9, %zmm6
-	vsubps	{rn-sae}, %zmm2, %zmm4, %zmm13
 	vmulps	{rn-sae}, %zmm8, %zmm14, %zmm15
-	vaddps	{rn-sae}, %zmm7, %zmm6, %zmm12
 
-	/* One NR iteration to refine sQuotient */
+	/* One NR iteration to refine sQuotient.  */
 	vfmsub213ps {rn-sae}, %zmm8, %zmm15, %zmm9
+	vaddps	{rn-sae}, %zmm7, %zmm6, %zmm12
 	vfnmadd213ps {rn-sae}, %zmm9, %zmm15, %zmm12
+	vsubps	{rn-sae}, %zmm2, %zmm11, %zmm13
 	vsubps	{rn-sae}, %zmm13, %zmm12, %zmm0
-	vfnmadd213ps {rn-sae}, %zmm15, %zmm14, %zmm0
-	testl	%edx, %edx
-
-	/* Go to special inputs processing branch */
-	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm11
-
-	/* Restore registers
-	 * and exit the function
-	 */
 
-L(EXIT):
-	movq	%rbp, %rsp
-	popq	%rbp
-	cfi_def_cfa(7, 8)
-	cfi_restore(6)
+	vfnmadd213ps {rn-sae}, %zmm15, %zmm14, %zmm0
+#else
+	/* Low Precision Version.  */
+	vdivps	{rn-sae}, %zmm9, %zmm8, %zmm0
+#endif
 	ret
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-
-	/* Branch to process
-	 * special inputs
-	 */
-
-L(SPECIAL_VALUES_BRANCH):
-	vmovups	%zmm11, 64(%rsp)
-	vmovups	%zmm0, 128(%rsp)
-	# LOE rbx r12 r13 r14 r15 edx zmm0
-
-	xorl	%eax, %eax
-	# LOE rbx r12 r13 r14 r15 eax edx
-
-	vzeroupper
-	movq	%r12, 16(%rsp)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
-	movl	%eax, %r12d
-	movq	%r13, 8(%rsp)
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-	movl	%edx, %r13d
-	movq	%r14, (%rsp)
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r15 r12d r13d
-
-	/* Range mask
-	 * bits check
-	 */
-
-L(RANGEMASK_CHECK):
-	btl	%r12d, %r13d
-
-	/* Call scalar math function */
-	jc	L(SCALAR_MATH_CALL)
-	# LOE rbx r15 r12d r13d
-
-	/* Special inputs
-	 * processing loop
-	 */
-
-L(SPECIAL_VALUES_LOOP):
-	incl	%r12d
-	cmpl	$16, %r12d
-
-	/* Check bits in range mask */
-	jl	L(RANGEMASK_CHECK)
-	# LOE rbx r15 r12d r13d
-
-	movq	16(%rsp), %r12
-	cfi_restore(12)
-	movq	8(%rsp), %r13
-	cfi_restore(13)
-	movq	(%rsp), %r14
-	cfi_restore(14)
-	vmovups	128(%rsp), %zmm0
-
-	/* Go to exit */
-	jmp	L(EXIT)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r12 r13 r14 r15 zmm0
-
-	/* Scalar math fucntion call
-	 * to process special input
-	 */
-
-L(SCALAR_MATH_CALL):
-	movl	%r12d, %r14d
-	vmovss	64(%rsp, %r14, 4), %xmm0
-	call	tanf@PLT
-	# LOE rbx r14 r15 r12d r13d xmm0
-
-	vmovss	%xmm0, 128(%rsp, %r14, 4)
-
-	/* Process special inputs in loop */
-	jmp	L(SPECIAL_VALUES_LOOP)
-	cfi_restore(12)
-	cfi_restore(13)
-	cfi_restore(14)
-	# LOE rbx r15 r12d r13d
-
-	/* Auxilary branch
-	 * for out of main path inputs
-	 */
 
+	.p2align 4
 L(AUX_BRANCH):
-	vmovups	_sRangeVal+__svml_stan_data_internal(%rip), %zmm6
-
-	/*
-	 * Get the (2^a / 2pi) mod 1 values from the table.
-	 * Because doesn't have I-type gather, we need a trivial cast
-	 */
-	lea	__svml_stan_reduction_data_internal(%rip), %rax
-	vmovups	%zmm5, (%rsp)
-	vandps	%zmm0, %zmm6, %zmm14
-	vcmpps	$0, {sae}, %zmm6, %zmm14, %k0
-
-	/*
-	 * Break the P_xxx and m into 16-bit chunks ready for
-	 * the long multiplication via 16x16->32 multiplications
-	 */
-	vmovups	.FLT_15(%rip), %zmm6
-	kxnorw	%k0, %k0, %k1
+	/* Hoping k0 doesn't have some long dependency chain attached to
+	   it. NB: We really don't need all 1s, we only need the `k6`
+	   mask. Currently `vpgatherdps` does not optimize out any
+	   loads at zero-bits for the mask.  */
 	kxnorw	%k0, %k0, %k2
-	kxnorw	%k0, %k0, %k3
-	kmovw	%k0, %edx
-	vpandd	.FLT_12(%rip), %zmm11, %zmm5
-	vpsrld	$23, %zmm5, %zmm7
-	vpslld	$1, %zmm7, %zmm8
-	vpaddd	%zmm7, %zmm8, %zmm9
-	vpslld	$2, %zmm9, %zmm4
-	vpxord	%zmm3, %zmm3, %zmm3
-	vpxord	%zmm15, %zmm15, %zmm15
-	vpxord	%zmm2, %zmm2, %zmm2
-	vgatherdps (%rax, %zmm4), %zmm3{%k1}
-	vgatherdps 4(%rax, %zmm4), %zmm15{%k2}
-	vgatherdps 8(%rax, %zmm4), %zmm2{%k3}
-	vpsrld	$16, %zmm3, %zmm5
-	vpsrld	$16, %zmm2, %zmm13
 
-	/*
-	 * Also get the significand as an integer
-	 * NB: adding in the integer bit is wrong for denorms!
-	 * To make this work for denorms we should do something slightly different
-	 */
-	vpandd	.FLT_13(%rip), %zmm11, %zmm0
-	vpaddd	.FLT_14(%rip), %zmm0, %zmm1
-	vpsrld	$16, %zmm15, %zmm0
-	vpsrld	$16, %zmm1, %zmm8
-	vpandd	%zmm6, %zmm3, %zmm9
-	vpandd	%zmm6, %zmm15, %zmm12
-	vpandd	%zmm6, %zmm2, %zmm7
-	vpandd	%zmm6, %zmm1, %zmm14
-
-	/* Now do the big multiplication and carry propagation */
-	vpmulld	%zmm9, %zmm8, %zmm4
-	vpmulld	%zmm0, %zmm8, %zmm3
-	vpmulld	%zmm12, %zmm8, %zmm2
-	vpmulld	%zmm13, %zmm8, %zmm1
-	vpmulld	%zmm7, %zmm8, %zmm8
-	vpmulld	%zmm5, %zmm14, %zmm7
-	vpmulld	%zmm9, %zmm14, %zmm5
-	vpmulld	%zmm0, %zmm14, %zmm9
-	vpmulld	%zmm12, %zmm14, %zmm0
-	vpmulld	%zmm13, %zmm14, %zmm12
-	vpsrld	$16, %zmm12, %zmm14
-	vpsrld	$16, %zmm0, %zmm13
-	vpsrld	$16, %zmm9, %zmm15
-	vpsrld	$16, %zmm5, %zmm12
-	vpsrld	$16, %zmm8, %zmm8
-	vpaddd	%zmm14, %zmm1, %zmm1
-	vpaddd	%zmm13, %zmm2, %zmm2
-	vpaddd	%zmm15, %zmm3, %zmm15
-	vpaddd	%zmm12, %zmm4, %zmm3
-	vpandd	%zmm6, %zmm0, %zmm13
-	vpaddd	%zmm1, %zmm13, %zmm4
-	vpaddd	%zmm4, %zmm8, %zmm14
-	vpsrld	$16, %zmm14, %zmm0
-	vpandd	%zmm6, %zmm9, %zmm9
-	vpaddd	%zmm2, %zmm9, %zmm1
-	vpaddd	%zmm1, %zmm0, %zmm8
+	/* Multiply indexes by 12. Note we could rearrange the data and
+	   then just shift down by 23 saving 2x instructions. This will
+	   probably look slightly better on microbenchmarks but as it
+	   is now we get some constructive cache interference between
+	   the gathers. As well this minimizes the total lines brought
+	   in. It's a judgement call but intuitively this will be better
+	   for applications. If someone has the time/inclination
+	   benchmarking this on some real applications may be worth it.  */
+	vpsrld	$23, %zmm11, %zmm8
+	vpaddd	%zmm8, %zmm8, %zmm1
+	vpaddd	%zmm1, %zmm8, %zmm14
+
+	/* Get the (2^a / 2pi) mod 1 values from the table.
+	   Because
+	   doesn't have I-type gather, we need a trivial cast.  */
+	lea	AVX512_SHARED_DATA(_Reduction)(%rip), %rax
+
+	/* Offset 4 gather has the most work based on it so we want it
+	   to be finished first to keep the backend busy.  */
+
+	/* NB: The dependency break is VERY important.  */
+	vpxor	%ymm4, %ymm4, %ymm4
+	vgatherdps 4(%rax, %zmm14, 4), %zmm4{%k2}
+
+
+	/* If the magnitude of the input is <= 2^-20, then
+	   just pass
+	   through the input, since no reduction will be needed and
+	   the main path will only work accurately if the reduced
+	   argument is
+	   about >= 2^-40 (which it is for all large pi
+	   multiples).  */
+	vmovups	LOCAL_DATA(_sRangeVal)(%rip), %zmm9
+	/* `zmm11` already has sign bit cast off. We are checking if the
+	   exp was 0xff so we can just use unsigned comparison.  */
+	vpcmpd	$5, %zmm9, %zmm11, %k1
+
+	/* Also get the significand as an integer
+	   NB: adding in the
+	   integer bit is wrong for denorms!
+	   To make this work for
+	   denorms we should do something slightly different.  */
+
+	/* zmm9 = zmm9 & (~zmm11) | _FLT_1_1to16(%rip).  */
+	vpternlogd $0xae, LOCAL_DATA_UNALIGNED(_FLT_1_1to16)(%rip){1to16}, %zmm11, %zmm9
+
+	/* Break the P_xxx and m into 16-bit chunks ready for
+	   the
+	   long multiplication via 16x16->32 multiplications.  */
+	movl	$0x55555555, %ecx
+	kmovd	%ecx, %k2
+
+	vpsrld	$16, %zmm9, %zmm8
+	vmovdqu16 %zmm11, %zmm9{%k2}{z}
+
+	vpsrld	$16, %zmm4, %zmm15
+	vmovdqu16 %zmm4, %zmm1{%k2}{z}
+
+	/* Now do the big multiplication and carry propagation.  */
+	vpmulld	%zmm15, %zmm9, %zmm2
+	vpmulld	%zmm1, %zmm9, %zmm12
+
+	vpmulld	%zmm15, %zmm8, %zmm15
+	vpmulld	%zmm1, %zmm8, %zmm1
+
+	vpsrld	$16, %zmm2, %zmm4
+
+	vpaddd	%zmm4, %zmm15, %zmm4
+	vmovdqu16 %zmm2, %zmm15{%k2}{z}
 
-	/*
-	 * Now round at the 2^-8 bit position for reduction mod pi/2^7
-	 * instead of the original 2pi (but still with the same 2pi scaling).
-	 * Use a shifter of 2^15 + 2^14.
-	 * The N we get is our final version; it has an offset of
-	 * 2^8 because of the implicit integer bit, and anyway for negative
-	 * starting value it's a 2s complement thing. But we need to mask
-	 * off the exponent part anyway so it's fine.
-	 */
-	vmovups	.FLT_18(%rip), %zmm1
-	vpandd	%zmm6, %zmm7, %zmm7
-	vpaddd	%zmm3, %zmm7, %zmm13
-	vpsrld	$16, %zmm8, %zmm3
-	vpandd	%zmm6, %zmm5, %zmm5
-	vpaddd	%zmm15, %zmm5, %zmm2
-	vpaddd	%zmm2, %zmm3, %zmm15
-	vpsrld	$16, %zmm15, %zmm12
-	vpaddd	%zmm13, %zmm12, %zmm5
-
-	/* Assemble reduced argument from the pieces */
-	vpandd	%zmm6, %zmm14, %zmm9
-	vpandd	%zmm6, %zmm15, %zmm7
-	vpslld	$16, %zmm5, %zmm6
-	vpslld	$16, %zmm8, %zmm5
-	vpaddd	%zmm7, %zmm6, %zmm4
-	vpaddd	%zmm9, %zmm5, %zmm9
-	vpsrld	$9, %zmm4, %zmm6
 
-	/*
-	 * We want to incorporate the original sign now too.
-	 * Do it here for convenience in getting the right N value,
-	 * though we could wait right to the end if we were prepared
-	 * to modify the sign of N later too.
-	 * So get the appropriate sign mask now (or sooner).
-	 */
-	vpandd	.FLT_16(%rip), %zmm11, %zmm0
-	vpandd	.FLT_21(%rip), %zmm9, %zmm13
-	vpslld	$5, %zmm13, %zmm14
+	kxnorw	%k0, %k0, %k3
+	vpxor	%ymm3, %ymm3, %ymm3
+	vgatherdps (%rax, %zmm14, 4), %zmm3{%k3}
+	vpsrld	$16, %zmm3, %zmm6
+	vmovdqu16 %zmm3, %zmm3{%k2}{z}
 
-	/*
-	 * Create floating-point high part, implicitly adding integer bit 1
-	 * Incorporate overall sign at this stage too.
-	 */
-	vpxord	.FLT_17(%rip), %zmm0, %zmm8
-	vpord	%zmm8, %zmm6, %zmm2
-	vaddps	{rn-sae}, %zmm2, %zmm1, %zmm12
-	vsubps	{rn-sae}, %zmm1, %zmm12, %zmm3
-	vsubps	{rn-sae}, %zmm3, %zmm2, %zmm7
 
-	/*
-	 * Create floating-point low and medium parts, respectively
-	 * lo_17, ... lo_0, 0, ..., 0
-	 * hi_8, ... hi_0, lo_31, ..., lo_18
-	 * then subtract off the implicitly added integer bits,
-	 * 2^-46 and 2^-23, respectively.
-	 * Put the original sign into all of them at this stage.
-	 */
-	vpxord	.FLT_20(%rip), %zmm0, %zmm6
-	vpord	%zmm6, %zmm14, %zmm15
-	vpandd	.FLT_23(%rip), %zmm4, %zmm4
-	vsubps	{rn-sae}, %zmm6, %zmm15, %zmm8
-	vandps	.FLT_26(%rip), %zmm11, %zmm15
-	vpsrld	$18, %zmm9, %zmm6
+	/* Do this comparison while `zmm11` still contains abs(input).  */
+	vmovups	LOCAL_DATA(_FLT_1)(%rip), %zmm2
+	vcmpps	$22, {sae}, %zmm2, %zmm11, %k5
 
-	/*
-	 * If the magnitude of the input is <= 2^-20, then
-	 * just pass through the input, since no reduction will be needed and
-	 * the main path will only work accurately if the reduced argument is
-	 * about >= 2^-40 (which it is for all large pi multiples)
-	 */
-	vmovups	.FLT_27(%rip), %zmm14
-	vcmpps	$26, {sae}, %zmm14, %zmm15, %k4
-	vcmpps	$22, {sae}, %zmm14, %zmm15, %k5
-	vpxord	.FLT_22(%rip), %zmm0, %zmm1
-	vpslld	$14, %zmm4, %zmm0
-	vpord	%zmm6, %zmm0, %zmm0
-	vpord	%zmm1, %zmm0, %zmm4
-	vsubps	{rn-sae}, %zmm1, %zmm4, %zmm2
-	vpternlogd $255, %zmm6, %zmm6, %zmm6
-
-	/* Now add them up into 2 reasonably aligned pieces */
-	vaddps	{rn-sae}, %zmm2, %zmm7, %zmm13
-	vsubps	{rn-sae}, %zmm13, %zmm7, %zmm7
-	vaddps	{rn-sae}, %zmm7, %zmm2, %zmm3
+	vpmulld	%zmm3, %zmm9, %zmm11
+	vpmulld	%zmm3, %zmm8, %zmm3
 
-	/*
-	 * The output is _VRES_R (high) + _VRES_E (low), and the integer part is _VRES_IND
-	 * Set sRp2 = _VRES_R^2 and then resume the original code.
-	 */
-	vmovups	.FLT_28(%rip), %zmm2
-	vaddps	{rn-sae}, %zmm8, %zmm3, %zmm1
-	vmovups	.FLT_25(%rip), %zmm8
+	kxnorw	%k0, %k0, %k4
+	vpxor	%ymm2, %ymm2, %ymm2
+	vgatherdps 8(%rax, %zmm14, 4), %zmm2{%k4}
+	vpsrld	$16, %zmm2, %zmm14
+	vpmulld	%zmm14, %zmm9, %zmm13
+	vpmulld	%zmm14, %zmm8, %zmm14
 
-	/* Grab our final N value as an integer, appropriately masked mod 2^8 */
-	vpandd	.FLT_19(%rip), %zmm12, %zmm5
+	vmovdqu16 %zmm2, %zmm2{%k2}{z}
+	vpmulld	%zmm2, %zmm8, %zmm8
+	/* We never take the upper half of zmm2.  */
+	vpmullw	%zmm6, %zmm9, %zmm2{%k2}{z}
 
-	/*
-	 * Now multiply those numbers all by 2 pi, reasonably accurately.
-	 * (RHi + RLo) * (pi_lead + pi_trail) ~=
-	 * RHi * pi_lead + (RHi * pi_trail + RLo * pi_lead)
-	 */
-	vmovups	.FLT_24(%rip), %zmm12
-	vmulps	{rn-sae}, %zmm12, %zmm13, %zmm0
-	vmovaps	%zmm12, %zmm9
-	vfmsub213ps {rn-sae}, %zmm0, %zmm13, %zmm9
-	vfmadd213ps {rn-sae}, %zmm9, %zmm8, %zmm13
-	vmovaps	%zmm6, %zmm8
-	vfmadd213ps {rn-sae}, %zmm13, %zmm12, %zmm1
-	vpandnd	%zmm15, %zmm15, %zmm8{%k4}
-	vpandnd	%zmm15, %zmm15, %zmm6{%k5}
-	vandps	%zmm11, %zmm6, %zmm14
-	vandps	%zmm0, %zmm8, %zmm15
-	vandps	%zmm1, %zmm8, %zmm12
-	vorps	%zmm15, %zmm14, %zmm6
-	vpsrld	$31, %zmm6, %zmm3
-	vpsubd	%zmm3, %zmm2, %zmm4
-	vpaddd	%zmm4, %zmm5, %zmm7
-	vpsrld	$2, %zmm7, %zmm13
-	vpslld	$2, %zmm13, %zmm9
+	vpsrld	$16, %zmm12, %zmm9
+	vpsrld	$16, %zmm11, %zmm6
+	vpsrld	$16, %zmm13, %zmm13
 
-	/*
-	 *
-	 * End of large arguments path
-	 *
-	 * Merge results from main and large paths:
-	 */
-	vblendmps %zmm13, %zmm10, %zmm10{%k6}
-	vpsubd	%zmm9, %zmm5, %zmm5
-	vmovups	.FLT_29(%rip), %zmm9
-	vcvtdq2ps {rn-sae}, %zmm5, %zmm0
-	vmovups	.FLT_30(%rip), %zmm5
-	vfmadd231ps {rn-sae}, %zmm0, %zmm5, %zmm12
-	vmovups	(%rsp), %zmm5
-	vaddps	{rn-sae}, %zmm6, %zmm12, %zmm6
-	vfmadd213ps {rn-sae}, %zmm6, %zmm9, %zmm0
-	vblendmps %zmm0, %zmm5, %zmm5{%k6}
-
-	/* Return to main vector processing path */
-	jmp	L(AUX_BRANCH_RETURN)
-	# LOE rbx r12 r13 r14 r15 edx zmm5 zmm10 zmm11
-END(_ZGVeN16v_tanf_skx)
+	vpaddd	%zmm9, %zmm1, %zmm9
+	vpaddd	%zmm6, %zmm3, %zmm6
+	vpaddd	%zmm13, %zmm14, %zmm14
 
-	.section .rodata, "a"
-	.align	64
+	vpsrld	$16, %zmm8, %zmm8
 
-.FLT_12:
-	.long	0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000
-	.type	.FLT_12, @object
-	.size	.FLT_12, 64
-	.align	64
+	vmovdqu16 %zmm12, %zmm13{%k2}{z}
+	vpaddd	%zmm14, %zmm13, %zmm3
+
+	vpaddd	%zmm9, %zmm15, %zmm14
+	vpaddd	%zmm3, %zmm8, %zmm9
+	vpsrld	$16, %zmm9, %zmm12
+
+	vpaddd	%zmm14, %zmm12, %zmm8
+
+	/* Now round at the 2^-8 bit position for reduction mod pi/2^7
+	   instead of the original 2pi (but still with the same 2pi
+	   scaling).
+	   Use a shifter of 2^15 + 2^14.
+	   The N we get is
+	   our final version; it has an offset of
+	   2^8 because of the
+	   implicit integer bit, and anyway for negative
+	   starting
+	   value it's a 2s complement thing. But we need to mask
+	   off
+	   the exponent part anyway so it's fine.  */
+
+	/* We already truncated zmm2.  */
+	vpaddd	%zmm6, %zmm2, %zmm13
+
+	vpsrld	$16, %zmm8, %zmm15
+	vmovdqu16 %zmm11, %zmm11{%k2}{z}
+	vpaddd	%zmm4, %zmm11, %zmm1
+
+
+	vpaddd	%zmm1, %zmm15, %zmm4
+	vpsrld	$16, %zmm4, %zmm12
+	vpaddd	%zmm13, %zmm12, %zmm11
+
+	/* Assemble reduced argument from the pieces.  */
+	vpslldq	$2, %zmm11, %zmm1
+	vpslldq	$2, %zmm8, %zmm11
+	vpblendmw %zmm4, %zmm1, %zmm3{%k2}
+	vmovdqu16 %zmm9, %zmm11{%k2}
+	vmovaps	COMMON_DATA(_OneF)(%rip), %zmm9
+	vmovups	LOCAL_DATA(_FLT_2)(%rip), %zmm14
+	vpsrld	$9, %zmm3, %zmm2
+
+
+	/* We want to incorporate the original sign now too.
+	   Do it
+	   here for convenience in getting the right N value,
+	   though
+	   we could wait right to the end if we were prepared
+	   to
+	   modify the sign of N later too.
+	   So get the appropriate
+	   sign mask now (or sooner).  */
+	vpandnd	%zmm0, %zmm7, %zmm1
+	vpslld	$5, %zmm11, %zmm13
+
+	/* Create floating-point high part, implicitly adding integer
+	   bit 1
+	   Incorporate overall sign at this stage too.  */
+	vpternlogd $0xfe, %zmm9, %zmm1, %zmm2
+	vaddps	{rn-sae}, %zmm2, %zmm14, %zmm12
+	vsubps	{rn-sae}, %zmm14, %zmm12, %zmm15
+	vsubps	{rn-sae}, %zmm15, %zmm2, %zmm2
+
+	/* Create floating-point low and medium parts, respectively
+	   lo_17, ... lo_0, 0, ..., 0
+	   hi_8, ... hi_0, lo_31, ...,
+	   lo_18
+	   then subtract off the implicitly added integer bits,
+	   2^-46 and 2^-23, respectively.
+	   Put the original sign into
+	   all of them at this stage.  */
+
+	/* Save code size by microfusing vpord _FLT_2_1to16, %zmm1. This
+	   increases the dependency chain on computing `zmm13` (we could
+	   use vptern).  */
+	vpord	LOCAL_DATA_UNALIGNED(_FLT_2_1to16)(%rip){1to16}, %zmm1, %zmm15
+	/* Don't need the full addition result.  */
+	vmovaps	LOCAL_DATA(_FLT_3)(%rip), %zmm6
+	vpandd	%zmm6, %zmm4, %zmm3
+	/* zmm13 = (zmm13 & ~_NotIOffExpoMask) | zmm15.  */
+	vpternlogd $0xdc, COMMON_DATA(_NotiOffExpoMask)(%rip){1to16}, %zmm15, %zmm13
+
+	vsubps	{rn-sae}, %zmm15, %zmm13, %zmm8
+	vpsrld	$18, %zmm11, %zmm15
+
+	vpxord	LOCAL_DATA_UNALIGNED(_FLT_3_1to16)(%rip){1to16}, %zmm1, %zmm14
+	vpslld	$14, %zmm3, %zmm1
+
+	vpternlogd $0xfe, %zmm15, %zmm14, %zmm1
+	vsubps	{rn-sae}, %zmm14, %zmm1, %zmm11
+
+
+	/* Now add them up into 2 reasonably aligned pieces.  */
+	vaddps	{rn-sae}, %zmm11, %zmm2, %zmm13
+	vsubps	{rn-sae}, %zmm13, %zmm2, %zmm2
+	/* `zmm15` is generally zero. Possibly place for optimization
+	   later on.  */
+	vaddps	{rn-sae}, %zmm2, %zmm11, %zmm15
 
-.FLT_13:
-	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
-	.type	.FLT_13, @object
-	.size	.FLT_13, 64
-	.align	64
-
-.FLT_14:
-	.long	0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000
-	.type	.FLT_14, @object
-	.size	.FLT_14, 64
-	.align	64
-
-.FLT_15:
-	.long	0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff
-	.type	.FLT_15, @object
-	.size	.FLT_15, 64
-	.align	64
+	/*
 
-.FLT_16:
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
-	.type	.FLT_16, @object
-	.size	.FLT_16, 64
-	.align	64
+	   The output is _VRES_R (high) + _VRES_E (low), and the
+	   integer part is _VRES_IND
+	   Set sRp2 = _VRES_R^2 and then
+	   resume the original code.  */
+	vaddps	{rn-sae}, %zmm8, %zmm15, %zmm15
+	vmovups	LOCAL_DATA(_FLT_4)(%rip), %zmm8
+
+	/* Grab our final N value as an integer, appropriately masked
+	   mod 2^8.  */
+	vpandd	%zmm6, %zmm12, %zmm6
+
+	/* Now multiply those numbers all by 2 pi, reasonably
+	   accurately.
+	   (RHi + RLo) * (pi_lead + pi_trail) ~=
+	   RHi * pi_lead + (RHi * pi_trail + RLo * pi_lead).  */
+	vmovups	LOCAL_DATA(_FLT_5)(%rip), %zmm12
+	vmulps	{rn-sae}, %zmm12, %zmm13, %zmm1
+	vblendmps %zmm1, %zmm0, %zmm14{%k5}
+	vfmsub231ps {rn-sae}, %zmm12, %zmm13, %zmm1
+	vfmadd213ps {rn-sae}, %zmm1, %zmm8, %zmm13
+	vfmadd213ps {rn-sae}, %zmm13, %zmm15, %zmm12{%k5}{z}
+
+
+	vpsrld	$31, %zmm14, %zmm15
+
+	vpsubd	%zmm7, %zmm6, %zmm2
+	vpaddd	%zmm7, %zmm15, %zmm3
+	vpsubd	%zmm3, %zmm2, %zmm2
+
+	vpsrld	$2, %zmm2, %zmm10{%k6}
+	vpslld	$2, %zmm10, %zmm11
+
+	/* End of large arguments path
+	   Merge results from main and
+	   large paths:.  */
+	vpsubd	%zmm11, %zmm6, %zmm6
+	vmovups	LOCAL_DATA(_FLT_6)(%rip), %zmm11
+	vcvtdq2ps {rn-sae}, %zmm6, %zmm1
+	vmovups	LOCAL_DATA(_FLT_7)(%rip), %zmm6
+	vfmadd231ps {rn-sae}, %zmm1, %zmm6, %zmm12
+	vaddps	{rn-sae}, %zmm14, %zmm12, %zmm5{%k6}
+	vfmadd231ps {rn-sae}, %zmm1, %zmm11, %zmm5{%k6}
+
+
+	/* Table lookup.  */
+	vmovups	LOCAL_DATA(_Th_tbl_uisa_lo)(%rip), %zmm3
+	vmovups	LOCAL_DATA(_sPC3_uisa)(%rip), %zmm4
+	vmulps	{rn-sae}, %zmm5, %zmm5, %zmm1
+	vpermt2ps LOCAL_DATA(_Th_tbl_uisa_hi)(%rip), %zmm10, %zmm3
+	vmovups	LOCAL_DATA(_sPC5_uisa)(%rip), %zmm10
+	vfmadd231ps {rn-sae}, %zmm1, %zmm10, %zmm4
+	vmulps	{rn-sae}, %zmm5, %zmm4, %zmm15
+	vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm15
+
+	/* Compute Denominator:
+	   sDenominator - sDlow ~= 1-(sTh+sTl) * (sP+sPlow).  */
+	vmulps	{rn-sae}, %zmm15, %zmm3, %zmm7
+
+	/* Compute Numerator:
+	   sNumerator + sNlow ~= sTh+sTl+sP+sPlow.  */
+	vaddps	{rn-sae}, %zmm3, %zmm15, %zmm8
+	vsubps	{rn-sae}, %zmm7, %zmm9, %zmm11
+
+#if PRECISION >= 1
+	/* High Precision Version.  */
+	vrcp14ps %zmm11, %zmm14
+	vsubps	{rn-sae}, %zmm3, %zmm8, %zmm2
+	vsubps	{rn-sae}, %zmm9, %zmm11, %zmm6
+	vsubps	{rn-sae}, %zmm2, %zmm15, %zmm13
+	vmulps	{rn-sae}, %zmm8, %zmm14, %zmm4
 
-.FLT_17:
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	.type	.FLT_17, @object
-	.size	.FLT_17, 64
-	.align	64
+	vaddps	{rn-sae}, %zmm7, %zmm6, %zmm12
+	/* One NR iteration to refine sQuotient.  */
+	vfmsub213ps {rn-sae}, %zmm8, %zmm4, %zmm11
+	vfnmadd213ps {rn-sae}, %zmm11, %zmm4, %zmm12
+	kmovw	%k1, %edx
+	testl	%edx, %edx
+	/* Go to special inputs processing branch.  */
+	jne	L(SPECIAL_VALUES_BRANCH)
 
-.FLT_18:
-	.long	0x47400000, 0x47400000, 0x47400000, 0x47400000, 0x47400000, 0x47400000, 0x47400000, 0x47400000, 0x47400000, 0x47400000, 0x47400000, 0x47400000, 0x47400000, 0x47400000, 0x47400000, 0x47400000
-	.type	.FLT_18, @object
-	.size	.FLT_18, 64
-	.align	64
+	vsubps	{rn-sae}, %zmm13, %zmm12, %zmm0
+	vfnmadd213ps {rn-sae}, %zmm4, %zmm14, %zmm0
+#else
+	/* Low Precision Version.  */
+	kmovw	%k1, %edx
+	testl	%edx, %edx
+	/* Go to special inputs processing branch.  */
+	jne	L(SPECIAL_VALUES_BRANCH)
+	vdivps	%zmm11, %zmm8, %zmm0
+#endif
+	/* Restore registers
+	   and exit the function.  */
+	ret
 
-.FLT_19:
-	.long	0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff
-	.type	.FLT_19, @object
-	.size	.FLT_19, 64
-	.align	64
 
-.FLT_20:
-	.long	0x28800000, 0x28800000, 0x28800000, 0x28800000, 0x28800000, 0x28800000, 0x28800000, 0x28800000, 0x28800000, 0x28800000, 0x28800000, 0x28800000, 0x28800000, 0x28800000, 0x28800000, 0x28800000
-	.type	.FLT_20, @object
-	.size	.FLT_20, 64
-	.align	64
+	/* Cold case. edx has 1s where there was a special value that
+	   needs to be handled by a tanf call. Optimize for code size
+	   moreso than speed here.  */
+L(SPECIAL_VALUES_BRANCH):
 
-.FLT_21:
-	.long	0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff
-	.type	.FLT_21, @object
-	.size	.FLT_21, 64
-	.align	64
+	/* Use r13 to save/restore the stack. This allows us to use rbp
+	   as callee save register saving code size.  */
+	pushq	%r13
+	cfi_def_cfa (rsp, 16)
+	/* Need to callee save registers to preserve state across tanf
+	   calls.  */
+	pushq	%rbx
+	cfi_def_cfa (rsp, 24)
+	pushq	%rbp
+	cfi_def_cfa (rsp, 32)
+	movq	%rsp, %r13
+	cfi_def_cfa (r13, 32)
+#if PRECISION >= 1
+	vsubps	{rn-sae}, %zmm13, %zmm12, %zmm1
+	vfnmadd213ps {rn-sae}, %zmm4, %zmm1, %zmm14
+#else
+	vdivps	%zmm11, %zmm8, %zmm14
+#endif
+	/* Align stack and make room for 2x zmm vectors.  */
+	andq	$-64, %rsp
+	addq	$-128, %rsp
 
-.FLT_22:
-	.long	0x34000000, 0x34000000, 0x34000000, 0x34000000, 0x34000000, 0x34000000, 0x34000000, 0x34000000, 0x34000000, 0x34000000, 0x34000000, 0x34000000, 0x34000000, 0x34000000, 0x34000000, 0x34000000
-	.type	.FLT_22, @object
-	.size	.FLT_22, 64
-	.align	64
 
-.FLT_23:
-	.long	0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff
-	.type	.FLT_23, @object
-	.size	.FLT_23, 64
-	.align	64
 
-.FLT_24:
-	.long	0x40c90fdb, 0x40c90fdb, 0x40c90fdb, 0x40c90fdb, 0x40c90fdb, 0x40c90fdb, 0x40c90fdb, 0x40c90fdb, 0x40c90fdb, 0x40c90fdb, 0x40c90fdb, 0x40c90fdb, 0x40c90fdb, 0x40c90fdb, 0x40c90fdb, 0x40c90fdb
-	.type	.FLT_24, @object
-	.size	.FLT_24, 64
-	.align	64
+	/* Save original input.  */
+	vmovaps	%zmm0, 64(%rsp)
+	/* Save all already computed inputs.  */
+	vmovaps	%zmm14, (%rsp)
 
-.FLT_25:
-	.long	0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e
-	.type	.FLT_25, @object
-	.size	.FLT_25, 64
-	.align	64
-
-.FLT_26:
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
-	.type	.FLT_26, @object
-	.size	.FLT_26, 64
-	.align	64
+	vzeroupper
 
-.FLT_27:
-	.long	0x35800000, 0x35800000, 0x35800000, 0x35800000, 0x35800000, 0x35800000, 0x35800000, 0x35800000, 0x35800000, 0x35800000, 0x35800000, 0x35800000, 0x35800000, 0x35800000, 0x35800000, 0x35800000
-	.type	.FLT_27, @object
-	.size	.FLT_27, 64
-	.align	64
+	/* edx has 1s where there was a special value that needs to be
+	   handled by a tanf call.  */
+	movl	%edx, %ebx
+L(SPECIAL_VALUES_LOOP):
 
-.FLT_28:
-	.long	0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002
-	.type	.FLT_28, @object
-	.size	.FLT_28, 64
-	.align	64
+	/* use rbp as index for special value that is saved across calls
+	   to tanf. We technically don't need a callee save register
+	   here as offset to rsp is always [0, 56] so we can restore
+	   rsp by realigning to 64. Essentially the tradeoff is 1 extra
+	   save/restore vs 2 extra instructions in the loop. Realigning
+	   also costs more code size.  */
+	xorl	%ebp, %ebp
+	tzcntl	%ebx, %ebp
+
+	/* Scalar math function call to process special input.  */
+	movss	64(%rsp, %rbp, 4), %xmm0
+	call	tanf@PLT
 
-.FLT_29:
-	.long	0x3cc90fdb, 0x3cc90fdb, 0x3cc90fdb, 0x3cc90fdb, 0x3cc90fdb, 0x3cc90fdb, 0x3cc90fdb, 0x3cc90fdb, 0x3cc90fdb, 0x3cc90fdb, 0x3cc90fdb, 0x3cc90fdb, 0x3cc90fdb, 0x3cc90fdb, 0x3cc90fdb, 0x3cc90fdb
-	.type	.FLT_29, @object
-	.size	.FLT_29, 64
-	.align	64
+	/* No good way to avoid the store-forwarding fault this will
+	   cause on return. `lfence` avoids the SF fault but at greater
+	   cost as it serializes stack/callee save restoration.  */
+	movss	%xmm0, (%rsp, %rbp, 4)
 
-.FLT_30:
-	.long	0xb03bbd2e, 0xb03bbd2e, 0xb03bbd2e, 0xb03bbd2e, 0xb03bbd2e, 0xb03bbd2e, 0xb03bbd2e, 0xb03bbd2e, 0xb03bbd2e, 0xb03bbd2e, 0xb03bbd2e, 0xb03bbd2e, 0xb03bbd2e, 0xb03bbd2e, 0xb03bbd2e, 0xb03bbd2e
-	.type	.FLT_30, @object
-	.size	.FLT_30, 64
-	.align	64
+	blsrl	%ebx, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
 
-#ifdef __svml_stan_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(64)) VUINT32 _sInvPI_uisa[16][1];
-	__declspec(align(64)) VUINT32 _sPI1_uisa[16][1];
-	__declspec(align(64)) VUINT32 _sPI2_uisa[16][1];
-	__declspec(align(64)) VUINT32 _sPI3_uisa[16][1];
-	__declspec(align(64)) VUINT32 Th_tbl_uisa[32][1];
-	__declspec(align(64)) VUINT32 _sPC3_uisa[16][1];
-	__declspec(align(64)) VUINT32 _sPC5_uisa[16][1];
-	__declspec(align(64)) VUINT32 _sRangeReductionVal_uisa[16][1];
-	__declspec(align(64)) VUINT32 _sAbsMask[16][1];
-	__declspec(align(64)) VUINT32 _sRangeVal[16][1];
-	__declspec(align(64)) VUINT32 _sRShifter[16][1];
-	__declspec(align(64)) VUINT32 _sOne[16][1];
-	__declspec(align(64)) VUINT32 _sRangeReductionVal[16][1];
-	__declspec(align(64)) VUINT32 _sPI1[16][1];
-	__declspec(align(64)) VUINT32 _sPI2[16][1];
-	__declspec(align(64)) VUINT32 _sPI3[16][1];
-} __svml_stan_data_internal;
-#endif
-__svml_stan_data_internal:
-	/* UISA */
-	.long	0x4122f983, 0x4122f983, 0x4122f983, 0x4122f983, 0x4122f983, 0x4122f983, 0x4122f983, 0x4122f983, 0x4122f983, 0x4122f983, 0x4122f983, 0x4122f983, 0x4122f983, 0x4122f983, 0x4122f983, 0x4122f983 /* _sInvPI_uisa */
-	.align	64
-	.long	0x3dc90fda, 0x3dc90fda, 0x3dc90fda, 0x3dc90fda, 0x3dc90fda, 0x3dc90fda, 0x3dc90fda, 0x3dc90fda, 0x3dc90fda, 0x3dc90fda, 0x3dc90fda, 0x3dc90fda, 0x3dc90fda, 0x3dc90fda, 0x3dc90fda, 0x3dc90fda /* _sPI1_uisa */
-	.align	64
-	.long	0x31a22168, 0x31a22168, 0x31a22168, 0x31a22168, 0x31a22168, 0x31a22168, 0x31a22168, 0x31a22168, 0x31a22168, 0x31a22168, 0x31a22168, 0x31a22168, 0x31a22168, 0x31a22168, 0x31a22168, 0x31a22168 /* _sPI2_uisa */
-	.align	64
-	.long	0x25c234c5, 0x25c234c5, 0x25c234c5, 0x25c234c5, 0x25c234c5, 0x25c234c5, 0x25c234c5, 0x25c234c5, 0x25c234c5, 0x25c234c5, 0x25c234c5, 0x25c234c5, 0x25c234c5, 0x25c234c5, 0x25c234c5, 0x25c234c5 /* _sPI3_uisa */
-	/* Th_tbl_uisa for i from 0 to 31 do printsingle(tan(i*Pi/32)); */
-	.align	64
-	.long	0x80000000, 0x3dc9b5dc, 0x3e4bafaf, 0x3e9b5042
-	.long	0x3ed413cd, 0x3f08d5b9, 0x3f2b0dc1, 0x3f521801
-	.long	0x3f800000, 0x3f9bf7ec, 0x3fbf90c7, 0x3fef789e
-	.long	0x401a827a, 0x4052facf, 0x40a0dff7, 0x41227363
-	.long	0xff7fffff, 0xc1227363, 0xc0a0dff7, 0xc052facf
-	.long	0xc01a827a, 0xbfef789e, 0xbfbf90c7, 0xbf9bf7ec
-	.long	0xbf800000, 0xbf521801, 0xbf2b0dc1, 0xbf08d5b9
-	.long	0xbed413cd, 0xbe9b5042, 0xbe4bafaf, 0xbdc9b5dc
-	.align	64
-	.long	0x3eaaaaa6, 0x3eaaaaa6, 0x3eaaaaa6, 0x3eaaaaa6, 0x3eaaaaa6, 0x3eaaaaa6, 0x3eaaaaa6, 0x3eaaaaa6, 0x3eaaaaa6, 0x3eaaaaa6, 0x3eaaaaa6, 0x3eaaaaa6, 0x3eaaaaa6, 0x3eaaaaa6, 0x3eaaaaa6, 0x3eaaaaa6 /* _sPC3_uisa */
-	.align	64
-	.long	0x3e08b888, 0x3e08b888, 0x3e08b888, 0x3e08b888, 0x3e08b888, 0x3e08b888, 0x3e08b888, 0x3e08b888, 0x3e08b888, 0x3e08b888, 0x3e08b888, 0x3e08b888, 0x3e08b888, 0x3e08b888, 0x3e08b888, 0x3e08b888 /* _sPC5_uisa */
-	.align	64
-	.long	0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000 /* _sRangeReductionVal_uisa */
-	.align	64
-	.long	0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF /* _sAbsMask */
-	.align	64
-	.long	0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 /* _sRangeVal */
-	.align	64
-	.long	0x4B400000, 0x4B400000, 0x4B400000, 0x4B400000, 0x4B400000, 0x4B400000, 0x4B400000, 0x4B400000, 0x4B400000, 0x4B400000, 0x4B400000, 0x4B400000, 0x4B400000, 0x4B400000, 0x4B400000, 0x4B400000 /* _sRShifter */
-	.align	64
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 /* _sOne */
-	.align	64
-	.long	0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000 /* _sRangeVal */
-	.align	64
-	.long	0x3FC90000, 0x3FC90000, 0x3FC90000, 0x3FC90000, 0x3FC90000, 0x3FC90000, 0x3FC90000, 0x3FC90000, 0x3FC90000, 0x3FC90000, 0x3FC90000, 0x3FC90000, 0x3FC90000, 0x3FC90000, 0x3FC90000, 0x3FC90000 /* _sPI1 */
-	.align	64
-	.long	0x39FDA000, 0x39FDA000, 0x39FDA000, 0x39FDA000, 0x39FDA000, 0x39FDA000, 0x39FDA000, 0x39FDA000, 0x39FDA000, 0x39FDA000, 0x39FDA000, 0x39FDA000, 0x39FDA000, 0x39FDA000, 0x39FDA000, 0x39FDA000 /* _sPI2 */
-	.align	64
-	.long	0x33A22000, 0x33A22000, 0x33A22000, 0x33A22000, 0x33A22000, 0x33A22000, 0x33A22000, 0x33A22000, 0x33A22000, 0x33A22000, 0x33A22000, 0x33A22000, 0x33A22000, 0x33A22000, 0x33A22000, 0x33A22000 /* _sPI3 */
-	.align	64
-	.type	__svml_stan_data_internal, @object
-	.size	__svml_stan_data_internal, .-__svml_stan_data_internal
-	.align	64
+	/* All results have been written to (%rsp).  */
+	vmovaps	(%rsp), %zmm0
+	/* Restore rsp.  */
+	movq	%r13, %rsp
+	cfi_def_cfa (rsp, 32)
+	/* Restore callee save registers.  */
+	popq	%rbp
+	cfi_def_cfa (rsp, 24)
+	popq	%rbx
+	cfi_def_cfa (rsp, 16)
+	popq	%r13
+	ret
+END(_ZGVeN16v_tanf_skx)
 
-#ifdef __svml_stan_reduction_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(64)) VUINT32 _sPtable[256][3][1];
-} __svml_stan_reduction_data_internal;
-#endif
-__svml_stan_reduction_data_internal:
-	/*     P_hi                  P_med               P_lo                */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 0 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 1 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 2 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 3 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 4 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 5 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 6 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 7 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 8 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 9 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 10 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 11 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 12 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 13 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 14 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 15 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 16 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 17 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 18 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 19 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 20 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 21 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 22 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 23 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 24 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 25 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 26 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 27 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 28 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 29 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 30 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 31 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 32 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 33 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 34 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 35 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 36 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 37 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 38 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 39 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 40 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 41 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 42 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 43 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 44 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 45 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 46 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 47 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 48 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 49 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 50 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 51 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 52 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 53 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 54 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 55 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 56 */
-	.long	0x00000000, 0x00000000, 0x00000001 /* 57 */
-	.long	0x00000000, 0x00000000, 0x00000002 /* 58 */
-	.long	0x00000000, 0x00000000, 0x00000005 /* 59 */
-	.long	0x00000000, 0x00000000, 0x0000000A /* 60 */
-	.long	0x00000000, 0x00000000, 0x00000014 /* 61 */
-	.long	0x00000000, 0x00000000, 0x00000028 /* 62 */
-	.long	0x00000000, 0x00000000, 0x00000051 /* 63 */
-	.long	0x00000000, 0x00000000, 0x000000A2 /* 64 */
-	.long	0x00000000, 0x00000000, 0x00000145 /* 65 */
-	.long	0x00000000, 0x00000000, 0x0000028B /* 66 */
-	.long	0x00000000, 0x00000000, 0x00000517 /* 67 */
-	.long	0x00000000, 0x00000000, 0x00000A2F /* 68 */
-	.long	0x00000000, 0x00000000, 0x0000145F /* 69 */
-	.long	0x00000000, 0x00000000, 0x000028BE /* 70 */
-	.long	0x00000000, 0x00000000, 0x0000517C /* 71 */
-	.long	0x00000000, 0x00000000, 0x0000A2F9 /* 72 */
-	.long	0x00000000, 0x00000000, 0x000145F3 /* 73 */
-	.long	0x00000000, 0x00000000, 0x00028BE6 /* 74 */
-	.long	0x00000000, 0x00000000, 0x000517CC /* 75 */
-	.long	0x00000000, 0x00000000, 0x000A2F98 /* 76 */
-	.long	0x00000000, 0x00000000, 0x00145F30 /* 77 */
-	.long	0x00000000, 0x00000000, 0x0028BE60 /* 78 */
-	.long	0x00000000, 0x00000000, 0x00517CC1 /* 79 */
-	.long	0x00000000, 0x00000000, 0x00A2F983 /* 80 */
-	.long	0x00000000, 0x00000000, 0x0145F306 /* 81 */
-	.long	0x00000000, 0x00000000, 0x028BE60D /* 82 */
-	.long	0x00000000, 0x00000000, 0x0517CC1B /* 83 */
-	.long	0x00000000, 0x00000000, 0x0A2F9836 /* 84 */
-	.long	0x00000000, 0x00000000, 0x145F306D /* 85 */
-	.long	0x00000000, 0x00000000, 0x28BE60DB /* 86 */
-	.long	0x00000000, 0x00000000, 0x517CC1B7 /* 87 */
-	.long	0x00000000, 0x00000000, 0xA2F9836E /* 88 */
-	.long	0x00000000, 0x00000001, 0x45F306DC /* 89 */
-	.long	0x00000000, 0x00000002, 0x8BE60DB9 /* 90 */
-	.long	0x00000000, 0x00000005, 0x17CC1B72 /* 91 */
-	.long	0x00000000, 0x0000000A, 0x2F9836E4 /* 92 */
-	.long	0x00000000, 0x00000014, 0x5F306DC9 /* 93 */
-	.long	0x00000000, 0x00000028, 0xBE60DB93 /* 94 */
-	.long	0x00000000, 0x00000051, 0x7CC1B727 /* 95 */
-	.long	0x00000000, 0x000000A2, 0xF9836E4E /* 96 */
-	.long	0x00000000, 0x00000145, 0xF306DC9C /* 97 */
-	.long	0x00000000, 0x0000028B, 0xE60DB939 /* 98 */
-	.long	0x00000000, 0x00000517, 0xCC1B7272 /* 99 */
-	.long	0x00000000, 0x00000A2F, 0x9836E4E4 /* 100 */
-	.long	0x00000000, 0x0000145F, 0x306DC9C8 /* 101 */
-	.long	0x00000000, 0x000028BE, 0x60DB9391 /* 102 */
-	.long	0x00000000, 0x0000517C, 0xC1B72722 /* 103 */
-	.long	0x00000000, 0x0000A2F9, 0x836E4E44 /* 104 */
-	.long	0x00000000, 0x000145F3, 0x06DC9C88 /* 105 */
-	.long	0x00000000, 0x00028BE6, 0x0DB93910 /* 106 */
-	.long	0x00000000, 0x000517CC, 0x1B727220 /* 107 */
-	.long	0x00000000, 0x000A2F98, 0x36E4E441 /* 108 */
-	.long	0x00000000, 0x00145F30, 0x6DC9C882 /* 109 */
-	.long	0x00000000, 0x0028BE60, 0xDB939105 /* 110 */
-	.long	0x00000000, 0x00517CC1, 0xB727220A /* 111 */
-	.long	0x00000000, 0x00A2F983, 0x6E4E4415 /* 112 */
-	.long	0x00000000, 0x0145F306, 0xDC9C882A /* 113 */
-	.long	0x00000000, 0x028BE60D, 0xB9391054 /* 114 */
-	.long	0x00000000, 0x0517CC1B, 0x727220A9 /* 115 */
-	.long	0x00000000, 0x0A2F9836, 0xE4E44152 /* 116 */
-	.long	0x00000000, 0x145F306D, 0xC9C882A5 /* 117 */
-	.long	0x00000000, 0x28BE60DB, 0x9391054A /* 118 */
-	.long	0x00000000, 0x517CC1B7, 0x27220A94 /* 119 */
-	.long	0x00000000, 0xA2F9836E, 0x4E441529 /* 120 */
-	.long	0x00000001, 0x45F306DC, 0x9C882A53 /* 121 */
-	.long	0x00000002, 0x8BE60DB9, 0x391054A7 /* 122 */
-	.long	0x00000005, 0x17CC1B72, 0x7220A94F /* 123 */
-	.long	0x0000000A, 0x2F9836E4, 0xE441529F /* 124 */
-	.long	0x00000014, 0x5F306DC9, 0xC882A53F /* 125 */
-	.long	0x00000028, 0xBE60DB93, 0x91054A7F /* 126 */
-	.long	0x00000051, 0x7CC1B727, 0x220A94FE /* 127 */
-	.long	0x000000A2, 0xF9836E4E, 0x441529FC /* 128 */
-	.long	0x00000145, 0xF306DC9C, 0x882A53F8 /* 129 */
-	.long	0x0000028B, 0xE60DB939, 0x1054A7F0 /* 130 */
-	.long	0x00000517, 0xCC1B7272, 0x20A94FE1 /* 131 */
-	.long	0x00000A2F, 0x9836E4E4, 0x41529FC2 /* 132 */
-	.long	0x0000145F, 0x306DC9C8, 0x82A53F84 /* 133 */
-	.long	0x000028BE, 0x60DB9391, 0x054A7F09 /* 134 */
-	.long	0x0000517C, 0xC1B72722, 0x0A94FE13 /* 135 */
-	.long	0x0000A2F9, 0x836E4E44, 0x1529FC27 /* 136 */
-	.long	0x000145F3, 0x06DC9C88, 0x2A53F84E /* 137 */
-	.long	0x00028BE6, 0x0DB93910, 0x54A7F09D /* 138 */
-	.long	0x000517CC, 0x1B727220, 0xA94FE13A /* 139 */
-	.long	0x000A2F98, 0x36E4E441, 0x529FC275 /* 140 */
-	.long	0x00145F30, 0x6DC9C882, 0xA53F84EA /* 141 */
-	.long	0x0028BE60, 0xDB939105, 0x4A7F09D5 /* 142 */
-	.long	0x00517CC1, 0xB727220A, 0x94FE13AB /* 143 */
-	.long	0x00A2F983, 0x6E4E4415, 0x29FC2757 /* 144 */
-	.long	0x0145F306, 0xDC9C882A, 0x53F84EAF /* 145 */
-	.long	0x028BE60D, 0xB9391054, 0xA7F09D5F /* 146 */
-	.long	0x0517CC1B, 0x727220A9, 0x4FE13ABE /* 147 */
-	.long	0x0A2F9836, 0xE4E44152, 0x9FC2757D /* 148 */
-	.long	0x145F306D, 0xC9C882A5, 0x3F84EAFA /* 149 */
-	.long	0x28BE60DB, 0x9391054A, 0x7F09D5F4 /* 150 */
-	.long	0x517CC1B7, 0x27220A94, 0xFE13ABE8 /* 151 */
-	.long	0xA2F9836E, 0x4E441529, 0xFC2757D1 /* 152 */
-	.long	0x45F306DC, 0x9C882A53, 0xF84EAFA3 /* 153 */
-	.long	0x8BE60DB9, 0x391054A7, 0xF09D5F47 /* 154 */
-	.long	0x17CC1B72, 0x7220A94F, 0xE13ABE8F /* 155 */
-	.long	0x2F9836E4, 0xE441529F, 0xC2757D1F /* 156 */
-	.long	0x5F306DC9, 0xC882A53F, 0x84EAFA3E /* 157 */
-	.long	0xBE60DB93, 0x91054A7F, 0x09D5F47D /* 158 */
-	.long	0x7CC1B727, 0x220A94FE, 0x13ABE8FA /* 159 */
-	.long	0xF9836E4E, 0x441529FC, 0x2757D1F5 /* 160 */
-	.long	0xF306DC9C, 0x882A53F8, 0x4EAFA3EA /* 161 */
-	.long	0xE60DB939, 0x1054A7F0, 0x9D5F47D4 /* 162 */
-	.long	0xCC1B7272, 0x20A94FE1, 0x3ABE8FA9 /* 163 */
-	.long	0x9836E4E4, 0x41529FC2, 0x757D1F53 /* 164 */
-	.long	0x306DC9C8, 0x82A53F84, 0xEAFA3EA6 /* 165 */
-	.long	0x60DB9391, 0x054A7F09, 0xD5F47D4D /* 166 */
-	.long	0xC1B72722, 0x0A94FE13, 0xABE8FA9A /* 167 */
-	.long	0x836E4E44, 0x1529FC27, 0x57D1F534 /* 168 */
-	.long	0x06DC9C88, 0x2A53F84E, 0xAFA3EA69 /* 169 */
-	.long	0x0DB93910, 0x54A7F09D, 0x5F47D4D3 /* 170 */
-	.long	0x1B727220, 0xA94FE13A, 0xBE8FA9A6 /* 171 */
-	.long	0x36E4E441, 0x529FC275, 0x7D1F534D /* 172 */
-	.long	0x6DC9C882, 0xA53F84EA, 0xFA3EA69B /* 173 */
-	.long	0xDB939105, 0x4A7F09D5, 0xF47D4D37 /* 174 */
-	.long	0xB727220A, 0x94FE13AB, 0xE8FA9A6E /* 175 */
-	.long	0x6E4E4415, 0x29FC2757, 0xD1F534DD /* 176 */
-	.long	0xDC9C882A, 0x53F84EAF, 0xA3EA69BB /* 177 */
-	.long	0xB9391054, 0xA7F09D5F, 0x47D4D377 /* 178 */
-	.long	0x727220A9, 0x4FE13ABE, 0x8FA9A6EE /* 179 */
-	.long	0xE4E44152, 0x9FC2757D, 0x1F534DDC /* 180 */
-	.long	0xC9C882A5, 0x3F84EAFA, 0x3EA69BB8 /* 181 */
-	.long	0x9391054A, 0x7F09D5F4, 0x7D4D3770 /* 182 */
-	.long	0x27220A94, 0xFE13ABE8, 0xFA9A6EE0 /* 183 */
-	.long	0x4E441529, 0xFC2757D1, 0xF534DDC0 /* 184 */
-	.long	0x9C882A53, 0xF84EAFA3, 0xEA69BB81 /* 185 */
-	.long	0x391054A7, 0xF09D5F47, 0xD4D37703 /* 186 */
-	.long	0x7220A94F, 0xE13ABE8F, 0xA9A6EE06 /* 187 */
-	.long	0xE441529F, 0xC2757D1F, 0x534DDC0D /* 188 */
-	.long	0xC882A53F, 0x84EAFA3E, 0xA69BB81B /* 189 */
-	.long	0x91054A7F, 0x09D5F47D, 0x4D377036 /* 190 */
-	.long	0x220A94FE, 0x13ABE8FA, 0x9A6EE06D /* 191 */
-	.long	0x441529FC, 0x2757D1F5, 0x34DDC0DB /* 192 */
-	.long	0x882A53F8, 0x4EAFA3EA, 0x69BB81B6 /* 193 */
-	.long	0x1054A7F0, 0x9D5F47D4, 0xD377036D /* 194 */
-	.long	0x20A94FE1, 0x3ABE8FA9, 0xA6EE06DB /* 195 */
-	.long	0x41529FC2, 0x757D1F53, 0x4DDC0DB6 /* 196 */
-	.long	0x82A53F84, 0xEAFA3EA6, 0x9BB81B6C /* 197 */
-	.long	0x054A7F09, 0xD5F47D4D, 0x377036D8 /* 198 */
-	.long	0x0A94FE13, 0xABE8FA9A, 0x6EE06DB1 /* 199 */
-	.long	0x1529FC27, 0x57D1F534, 0xDDC0DB62 /* 200 */
-	.long	0x2A53F84E, 0xAFA3EA69, 0xBB81B6C5 /* 201 */
-	.long	0x54A7F09D, 0x5F47D4D3, 0x77036D8A /* 202 */
-	.long	0xA94FE13A, 0xBE8FA9A6, 0xEE06DB14 /* 203 */
-	.long	0x529FC275, 0x7D1F534D, 0xDC0DB629 /* 204 */
-	.long	0xA53F84EA, 0xFA3EA69B, 0xB81B6C52 /* 205 */
-	.long	0x4A7F09D5, 0xF47D4D37, 0x7036D8A5 /* 206 */
-	.long	0x94FE13AB, 0xE8FA9A6E, 0xE06DB14A /* 207 */
-	.long	0x29FC2757, 0xD1F534DD, 0xC0DB6295 /* 208 */
-	.long	0x53F84EAF, 0xA3EA69BB, 0x81B6C52B /* 209 */
-	.long	0xA7F09D5F, 0x47D4D377, 0x036D8A56 /* 210 */
-	.long	0x4FE13ABE, 0x8FA9A6EE, 0x06DB14AC /* 211 */
-	.long	0x9FC2757D, 0x1F534DDC, 0x0DB62959 /* 212 */
-	.long	0x3F84EAFA, 0x3EA69BB8, 0x1B6C52B3 /* 213 */
-	.long	0x7F09D5F4, 0x7D4D3770, 0x36D8A566 /* 214 */
-	.long	0xFE13ABE8, 0xFA9A6EE0, 0x6DB14ACC /* 215 */
-	.long	0xFC2757D1, 0xF534DDC0, 0xDB629599 /* 216 */
-	.long	0xF84EAFA3, 0xEA69BB81, 0xB6C52B32 /* 217 */
-	.long	0xF09D5F47, 0xD4D37703, 0x6D8A5664 /* 218 */
-	.long	0xE13ABE8F, 0xA9A6EE06, 0xDB14ACC9 /* 219 */
-	.long	0xC2757D1F, 0x534DDC0D, 0xB6295993 /* 220 */
-	.long	0x84EAFA3E, 0xA69BB81B, 0x6C52B327 /* 221 */
-	.long	0x09D5F47D, 0x4D377036, 0xD8A5664F /* 222 */
-	.long	0x13ABE8FA, 0x9A6EE06D, 0xB14ACC9E /* 223 */
-	.long	0x2757D1F5, 0x34DDC0DB, 0x6295993C /* 224 */
-	.long	0x4EAFA3EA, 0x69BB81B6, 0xC52B3278 /* 225 */
-	.long	0x9D5F47D4, 0xD377036D, 0x8A5664F1 /* 226 */
-	.long	0x3ABE8FA9, 0xA6EE06DB, 0x14ACC9E2 /* 227 */
-	.long	0x757D1F53, 0x4DDC0DB6, 0x295993C4 /* 228 */
-	.long	0xEAFA3EA6, 0x9BB81B6C, 0x52B32788 /* 229 */
-	.long	0xD5F47D4D, 0x377036D8, 0xA5664F10 /* 230 */
-	.long	0xABE8FA9A, 0x6EE06DB1, 0x4ACC9E21 /* 231 */
-	.long	0x57D1F534, 0xDDC0DB62, 0x95993C43 /* 232 */
-	.long	0xAFA3EA69, 0xBB81B6C5, 0x2B327887 /* 233 */
-	.long	0x5F47D4D3, 0x77036D8A, 0x5664F10E /* 234 */
-	.long	0xBE8FA9A6, 0xEE06DB14, 0xACC9E21C /* 235 */
-	.long	0x7D1F534D, 0xDC0DB629, 0x5993C439 /* 236 */
-	.long	0xFA3EA69B, 0xB81B6C52, 0xB3278872 /* 237 */
-	.long	0xF47D4D37, 0x7036D8A5, 0x664F10E4 /* 238 */
-	.long	0xE8FA9A6E, 0xE06DB14A, 0xCC9E21C8 /* 239 */
-	.long	0xD1F534DD, 0xC0DB6295, 0x993C4390 /* 240 */
-	.long	0xA3EA69BB, 0x81B6C52B, 0x32788720 /* 241 */
-	.long	0x47D4D377, 0x036D8A56, 0x64F10E41 /* 242 */
-	.long	0x8FA9A6EE, 0x06DB14AC, 0xC9E21C82 /* 243 */
-	.long	0x1F534DDC, 0x0DB62959, 0x93C43904 /* 244 */
-	.long	0x3EA69BB8, 0x1B6C52B3, 0x27887208 /* 245 */
-	.long	0x7D4D3770, 0x36D8A566, 0x4F10E410 /* 246 */
-	.long	0xFA9A6EE0, 0x6DB14ACC, 0x9E21C820 /* 247 */
-	.long	0xF534DDC0, 0xDB629599, 0x3C439041 /* 248 */
-	.long	0xEA69BB81, 0xB6C52B32, 0x78872083 /* 249 */
-	.long	0xD4D37703, 0x6D8A5664, 0xF10E4107 /* 250 */
-	.long	0xA9A6EE06, 0xDB14ACC9, 0xE21C820F /* 251 */
-	.long	0x534DDC0D, 0xB6295993, 0xC439041F /* 252 */
-	.long	0xA69BB81B, 0x6C52B327, 0x8872083F /* 253 */
-	.long	0x4D377036, 0xD8A5664F, 0x10E4107F /* 254 */
-	.long	0x9A6EE06D, 0xB14ACC9E, 0x21C820FF /* 255 */
-	.align	64
-	.type	__svml_stan_reduction_data_internal, @object
-	.size	__svml_stan_reduction_data_internal, .-__svml_stan_reduction_data_internal
+	.section .rodata.evex512, "a"
+
+	/* Place the minimally aligned pieces at the begining so there
+	   is a chance they fit in aligning bytes.  */
+	.align	16
+LOCAL_DATA_NAME_UNALIGNED:
+	float_block (LOCAL_DATA_NAME_UNALIGNED, _FLT_1_1to16, 0x00800000)
+	float_block (LOCAL_DATA_NAME_UNALIGNED, _FLT_2_1to16, 0x28800000)
+	float_block (LOCAL_DATA_NAME_UNALIGNED, _FLT_3_1to16, 0x34000000)
+
+	.type	LOCAL_DATA_NAME_UNALIGNED, @object
+	.size	LOCAL_DATA_NAME_UNALIGNED, .-LOCAL_DATA_NAME_UNALIGNED
+
+
+	.align	64
+LOCAL_DATA_NAME:
+	DATA_VEC (LOCAL_DATA_NAME, _sInvPI_uisa, 0x4122f983)
+	DATA_VEC (LOCAL_DATA_NAME, _sRShifter, 0x4B400000)
+	DATA_VEC (LOCAL_DATA_NAME, _sPI1_uisa, 0x3dc90fda)
+	DATA_VEC (LOCAL_DATA_NAME, _sPI2_uisa, 0x31a22168)
+	DATA_VEC (LOCAL_DATA_NAME, _sPI3_uisa, 0x25c234c5)
+	DATA_VEC (LOCAL_DATA_NAME, _sRangeReductionVal_uisa, 0x46010000)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC5_uisa, 0x3e08b888)
+	DATA_VEC (LOCAL_DATA_NAME, _sPC3_uisa, 0x3eaaaaa6)
+
+	float_block (LOCAL_DATA_NAME, _Th_tbl_uisa_lo,
+		0x80000000, 0x3dc9b5dc, 0x3e4bafaf, 0x3e9b5042,
+		0x3ed413cd, 0x3f08d5b9, 0x3f2b0dc1, 0x3f521801,
+		0x3f800000, 0x3f9bf7ec, 0x3fbf90c7, 0x3fef789e,
+		0x401a827a, 0x4052facf, 0x40a0dff7, 0x41227363)
+
+	float_block (LOCAL_DATA_NAME, _Th_tbl_uisa_hi,
+		0xff7fffff, 0xc1227363, 0xc0a0dff7, 0xc052facf,
+		0xc01a827a, 0xbfef789e, 0xbfbf90c7, 0xbf9bf7ec,
+		0xbf800000, 0xbf521801, 0xbf2b0dc1, 0xbf08d5b9,
+		0xbed413cd, 0xbe9b5042, 0xbe4bafaf, 0xbdc9b5dc)
+
+	DATA_VEC (LOCAL_DATA_NAME, _sRangeVal, 0x7f800000)
+	DATA_VEC (LOCAL_DATA_NAME, _FLT_1, 0x35800000)
+	DATA_VEC (LOCAL_DATA_NAME, _FLT_2, 0x47400000)
+	DATA_VEC (LOCAL_DATA_NAME, _FLT_3, 0x000001ff)
+	DATA_VEC (LOCAL_DATA_NAME, _FLT_4, 0xb43bbd2e)
+	DATA_VEC (LOCAL_DATA_NAME, _FLT_5, 0x40c90fdb)
+	DATA_VEC (LOCAL_DATA_NAME, _FLT_6, 0x3cc90fdb)
+	DATA_VEC (LOCAL_DATA_NAME, _FLT_7, 0xb03bbd2e)
+
+
+	.type	LOCAL_DATA_NAME, @object
+	.size	LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v1 16/27] x86/fpu: Optimize svml_s_tanf4_core_sse4.S
  2022-12-07  8:52 [PATCH v1 01/27] x86/fpu: Create helper file for common data macros Noah Goldstein via Libc-alpha
                   ` (13 preceding siblings ...)
  2022-12-07  8:52 ` [PATCH v1 15/27] x86/fpu: Optimize svml_s_tanf16_core_avx512.S Noah Goldstein via Libc-alpha
@ 2022-12-07  8:52 ` Noah Goldstein via Libc-alpha
  2022-12-07  8:52 ` [PATCH v1 17/27] x86/fpu: Optimize svml_s_tanf8_core_avx2.S Noah Goldstein via Libc-alpha
                   ` (11 subsequent siblings)
  26 siblings, 0 replies; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-07  8:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, andrey.kolesov, carlos

1. Remove many unnecessary spills.
2. Clean up some missed optimizations in instruction selection /
   unnecessary repeated rodata references.
3. Remove unused rodata.
4. Use common data definitions where possible.

Code Size Change: -980 Bytes (1619 - 2599)

Input                                 New Time / Old Time
0F          (0x00000000)           -> 0.8527
0F          (0x0000ffff, Denorm)   -> 0.9879
.1F         (0x3dcccccd)           -> 0.8542
5F          (0x40a00000)           -> 0.8633
2315255808F (0x4f0a0000)           -> 0.7640
-NaN        (0xffffffff)           -> 0.7966
---
 .../fpu/multiarch/svml_s_tanf4_core_sse4.S    | 3031 +++--------------
 1 file changed, 531 insertions(+), 2500 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanf4_core_sse4.S
index 3dc82cae68..f3f0c867ef 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanf4_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanf4_core_sse4.S
@@ -45,2553 +45,584 @@
  *
  */
 
-/* Offsets for data table __svml_stan_data_internal
- */
-#define _sInvPI_uisa			0
-#define _sPI1_uisa			16
-#define _sPI2_uisa			32
-#define _sPI3_uisa			48
-#define _sPI2_ha_uisa			64
-#define _sPI3_ha_uisa			80
-#define Th_tbl_uisa			96
-#define Tl_tbl_uisa			224
-#define _sPC3_uisa			352
-#define _sPC5_uisa			368
-#define _sRangeReductionVal_uisa	384
-#define _sInvPi				400
-#define _sSignMask			416
-#define _sAbsMask			432
-#define _sRangeVal			448
-#define _sRShifter			464
-#define _sOne				480
-#define _sRangeReductionVal		496
-#define _sPI1				512
-#define _sPI2				528
-#define _sPI3				544
-#define _sPI4				560
-#define _sPI1_FMA			576
-#define _sPI2_FMA			592
-#define _sPI3_FMA			608
-#define _sP0				624
-#define _sP1				640
-#define _sQ0				656
-#define _sQ1				672
-#define _sQ2				688
-#define _sTwo				704
-#define _sCoeffs			720
+#define LOCAL_DATA_NAME	__svml_stan_data_internal
+#include "svml_s_common_sse4_rodata_offsets.h"
+
+#define AVX2_SHARED_OFFSETS
+#define AVX512_SHARED_OFFSETS
+#include "svml_s_tanf_rodata.h.S"
+
+/* Offsets for data table __svml_stan_data_internal.  */
+#define _sPI1	0
+#define _sPI2	16
+#define _sPI3	32
+#define _sPI4	48
+#define _sRangeVal	64
+#define _FLT_0	80
+#define _FLT_1	96
+
 
 #include <sysdep.h>
 
 	.section .text.sse4, "ax", @progbits
 ENTRY(_ZGVbN4v_tanf_sse4)
-	subq	$232, %rsp
-	cfi_def_cfa_offset(240)
-	movaps	%xmm0, %xmm13
-	movups	_sAbsMask+__svml_stan_data_internal(%rip), %xmm12
-
-	/*
-	 * Legacy Code
-	 * Here HW FMA can be unavailable
-	 */
-	xorl	%eax, %eax
-	movaps	%xmm12, %xmm4
-	pxor	%xmm10, %xmm10
-	movups	_sInvPi+__svml_stan_data_internal(%rip), %xmm2
-	andps	%xmm13, %xmm4
-	mulps	%xmm4, %xmm2
-
-	/* Range reduction */
-	movaps	%xmm4, %xmm1
-
-	/*
-	 *
-	 * Main path (_LA_ and _EP_)
-	 *
-	 * Octant calculation
-	 */
-	movups	_sRShifter+__svml_stan_data_internal(%rip), %xmm3
-
-	/* Large values check */
-	movaps	%xmm4, %xmm11
-	movups	_sPI1+__svml_stan_data_internal(%rip), %xmm5
-	andnps	%xmm13, %xmm12
-	movups	_sPI2+__svml_stan_data_internal(%rip), %xmm6
-	addps	%xmm3, %xmm2
-	cmpnleps _sRangeReductionVal+__svml_stan_data_internal(%rip), %xmm11
-	movaps	%xmm2, %xmm8
-	movups	_sPI3+__svml_stan_data_internal(%rip), %xmm7
-	subps	%xmm3, %xmm8
-	movmskps %xmm11, %edx
-	movups	_sPI4+__svml_stan_data_internal(%rip), %xmm9
-	mulps	%xmm8, %xmm5
-	mulps	%xmm8, %xmm6
-	mulps	%xmm8, %xmm7
-	subps	%xmm5, %xmm1
-	mulps	%xmm8, %xmm9
-	subps	%xmm6, %xmm1
-	movups	_sQ2+__svml_stan_data_internal(%rip), %xmm15
+	movaps	%xmm0, %xmm15
+	movups	COMMON_DATA(_AbsMask)(%rip), %xmm4
 
-	/* Inversion mask and sign calculation */
-	movaps	%xmm2, %xmm5
+	andps	%xmm0, %xmm4
 
-	/* Rational approximation */
-	movups	_sP1+__svml_stan_data_internal(%rip), %xmm14
-	pslld	$30, %xmm2
-	cmpneqps %xmm10, %xmm2
-	subps	%xmm7, %xmm1
+	movups	AVX2_SHARED_DATA(_sInvPi)(%rip), %xmm0
+	mulps	%xmm4, %xmm0
 
-	/* Exchanged numerator and denominator if necessary */
-	movaps	%xmm2, %xmm0
-	movaps	%xmm2, %xmm10
-	pslld	$31, %xmm5
-	subps	%xmm9, %xmm1
-	movaps	%xmm1, %xmm3
-	pxor	%xmm12, %xmm5
-	mulps	%xmm1, %xmm3
-	mulps	%xmm3, %xmm15
-	mulps	%xmm3, %xmm14
-	addps	_sQ1+__svml_stan_data_internal(%rip), %xmm15
-	addps	_sP0+__svml_stan_data_internal(%rip), %xmm14
-	mulps	%xmm15, %xmm3
-	mulps	%xmm14, %xmm1
-	addps	_sQ0+__svml_stan_data_internal(%rip), %xmm3
-	andnps	%xmm1, %xmm0
-	andps	%xmm3, %xmm10
-	andps	%xmm2, %xmm1
-	andnps	%xmm3, %xmm2
-	orps	%xmm10, %xmm0
-	orps	%xmm2, %xmm1
-
-	/* Division */
-	divps	%xmm1, %xmm0
-
-	/* Sign setting */
-	pxor	%xmm5, %xmm0
+	/* Range reduction.  */
+	movaps	%xmm4, %xmm1
 
 	/*
-	 *
-	 * End of main path (_LA_ and _EP_)
-	 */
+	   Main path (_LA_ and _EP_)
 
-	testl	%edx, %edx
+	   Octant calculation.  */
+	movups	AVX2_SHARED_DATA(_sRShifter)(%rip), %xmm3
 
-	/* Go to auxilary branch */
-	jne	L(AUX_BRANCH)
-	# LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm4 xmm11 xmm12 xmm13
-
-	/* Return from auxilary branch
-	 * for out of main path inputs
-	 */
+	/* Large values check.  */
+	movups	LOCAL_DATA(_sPI1)(%rip), %xmm5
+	movups	LOCAL_DATA(_sPI2)(%rip), %xmm6
+	addps	%xmm3, %xmm0
+	movaps	%xmm0, %xmm2
+	movups	LOCAL_DATA(_sPI3)(%rip), %xmm7
+	subps	%xmm3, %xmm2
 
-L(AUX_BRANCH_RETURN):
-	testl	%eax, %eax
+	mulps	%xmm2, %xmm5
+	mulps	%xmm2, %xmm6
+	mulps	%xmm2, %xmm7
 
-	/* Go to special inputs processing branch */
-	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm13
+	subps	%xmm5, %xmm1
+	mulps	LOCAL_DATA(_sPI4)(%rip), %xmm2
+	subps	%xmm6, %xmm1
+	movups	AVX2_SHARED_DATA(_sQ2)(%rip), %xmm6
 
-	/* Restore registers
-	 * and exit the function
-	 */
 
-L(EXIT):
-	addq	$232, %rsp
-	cfi_def_cfa_offset(8)
-	ret
-	cfi_def_cfa_offset(240)
+	/* Rational approximation.  */
+	movups	AVX2_SHARED_DATA(_sP1)(%rip), %xmm5
 
-	/* Branch to process
-	 * special inputs
-	 */
+	/* Inversion mask and sign calculation.  */
+	pslld	$31, %xmm0
+	subps	%xmm7, %xmm1
 
-L(SPECIAL_VALUES_BRANCH):
-	movups	%xmm13, 32(%rsp)
-	movups	%xmm0, 48(%rsp)
-	# LOE rbx rbp r12 r13 r14 r15 eax xmm0
-
-	xorl	%edx, %edx
-	movq	%r12, 16(%rsp)
-	cfi_offset(12, -224)
-	movl	%edx, %r12d
-	movq	%r13, 8(%rsp)
-	cfi_offset(13, -232)
-	movl	%eax, %r13d
-	movq	%r14, (%rsp)
-	cfi_offset(14, -240)
-	# LOE rbx rbp r15 r12d r13d
-
-	/* Range mask
-	 * bits check
-	 */
-
-L(RANGEMASK_CHECK):
-	btl	%r12d, %r13d
-
-	/* Call scalar math function */
-	jc	L(SCALAR_MATH_CALL)
-	# LOE rbx rbp r15 r12d r13d
-
-	/* Special inputs
-	 * processing loop
-	 */
+	/* Exchanged numerator and denominator if necessary.  */
+	subps	%xmm2, %xmm1
+	movaps	%xmm1, %xmm3
+	mulps	%xmm1, %xmm1
+	mulps	%xmm1, %xmm6
+	mulps	%xmm1, %xmm5
+	addps	AVX2_SHARED_DATA(_sQ1)(%rip), %xmm6
+	movups	AVX2_SHARED_DATA(_sP0)(%rip), %xmm2
+	addps	%xmm2, %xmm5
+	mulps	%xmm6, %xmm1
+	mulps	%xmm5, %xmm3
+	addps	%xmm2, %xmm1
 
-L(SPECIAL_VALUES_LOOP):
-	incl	%r12d
-	cmpl	$4, %r12d
-
-	/* Check bits in range mask */
-	jl	L(RANGEMASK_CHECK)
-	# LOE rbx rbp r15 r12d r13d
-
-	movq	16(%rsp), %r12
-	cfi_restore(12)
-	movq	8(%rsp), %r13
-	cfi_restore(13)
-	movq	(%rsp), %r14
-	cfi_restore(14)
-	movups	48(%rsp), %xmm0
-
-	/* Go to exit */
-	jmp	L(EXIT)
-	cfi_offset(12, -224)
-	cfi_offset(13, -232)
-	cfi_offset(14, -240)
-	# LOE rbx rbp r12 r13 r14 r15 xmm0
-
-	/* Scalar math fucntion call
-	 * to process special input
-	 */
-
-L(SCALAR_MATH_CALL):
-	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
-	call	tanf@PLT
-	# LOE rbx rbp r14 r15 r12d r13d xmm0
+	movaps	%xmm3, %xmm2
+	blendvps %xmm0, %xmm1, %xmm3
+	blendvps %xmm0, %xmm2, %xmm1
 
-	movss	%xmm0, 48(%rsp, %r14, 4)
+	/* Division.  */
+	divps	%xmm1, %xmm3
 
-	/* Process special inputs in loop */
-	jmp	L(SPECIAL_VALUES_LOOP)
-	cfi_restore(12)
-	cfi_restore(13)
-	cfi_restore(14)
-	# LOE rbx rbp r15 r12d r13d
+	/* Sign setting.  */
+	pxor	%xmm3, %xmm0
 
-	/* Auxilary branch
-	 * for out of main path inputs
-	 */
+	movaps	%xmm4, %xmm3
+	pcmpgtd	AVX2_SHARED_DATA(_sRangeReductionVal)(%rip), %xmm3
+	pmovmskb %xmm3, %edx
 
-L(AUX_BRANCH):
-	movl	$2139095040, %eax
+	/* End of main path (_LA_ and _EP_).  */
+	testl	%edx, %edx
+	/* Go to auxilary branch.  */
+	jne	L(AUX_BRANCH)
 
-	/*
-	 * Get the (2^a / 2pi) mod 1 values from the table.
-	 * Because doesn't have I-type gather, we need a trivial cast
-	 */
-	lea	__svml_stan_reduction_data_internal(%rip), %r8
-	movups	%xmm13, 64(%rsp)
+	/* Set sign.  */
+	andnps	%xmm15, %xmm4
+	pxor	%xmm4, %xmm0
+	ret
 
-	/*
-	 * Also get the significand as an integer
-	 * NB: adding in the integer bit is wrong for denorms!
-	 * To make this work for denorms we should do something slightly different
-	 */
-	movl	$8388607, %r9d
-	movups	%xmm12, 80(%rsp)
-	movl	$8388608, %r10d
-	movups	%xmm11, 96(%rsp)
+L(AUX_BRANCH):
+	movaps	%xmm3, %xmm14
+	andnps	%xmm0, %xmm3
 
-	/*
-	 * Break the P_xxx and m into 16-bit chunks ready for
-	 * the long multiplication via 16x16->32 multiplications
-	 */
-	movl	$65535, %r11d
-	movd	%eax, %xmm3
-	pshufd	$0, %xmm3, %xmm2
-	andps	%xmm2, %xmm13
-	cmpeqps	%xmm2, %xmm13
-	pand	%xmm4, %xmm2
-	psrld	$23, %xmm2
-	movdqa	%xmm2, %xmm12
-	pslld	$1, %xmm12
-	paddd	%xmm2, %xmm12
-	pslld	$2, %xmm12
-	pshufd	$1, %xmm12, %xmm10
-	pshufd	$2, %xmm12, %xmm11
-	pshufd	$3, %xmm12, %xmm14
-	movd	%xmm12, %edx
-	movd	%xmm10, %ecx
-	movd	%xmm11, %esi
-	movd	%r9d, %xmm11
-	movd	%xmm14, %edi
-	movd	4(%rdx, %r8), %xmm6
-	movd	4(%rcx, %r8), %xmm7
-	movd	4(%rsi, %r8), %xmm3
-	movl	$872415232, %r9d
-	movd	4(%rdi, %r8), %xmm5
-	punpckldq %xmm7, %xmm6
-	punpckldq %xmm5, %xmm3
-	movd	8(%rdi, %r8), %xmm10
-	movmskps %xmm13, %eax
-	punpcklqdq %xmm3, %xmm6
-	movd	8(%rdx, %r8), %xmm3
-	movd	8(%rcx, %r8), %xmm2
-	movd	8(%rsi, %r8), %xmm13
-	punpckldq %xmm2, %xmm3
-	punpckldq %xmm10, %xmm13
-	punpcklqdq %xmm13, %xmm3
-	pshufd	$0, %xmm11, %xmm13
-	movdqa	%xmm3, %xmm2
-	movups	%xmm4, 48(%rsp)
-	pand	%xmm4, %xmm13
-	movd	%r10d, %xmm4
-	psrld	$16, %xmm2
-	movd	(%rdx, %r8), %xmm9
+	/* Get the (2^a / 2pi) mod 1 values from the table.  */
+	movaps	%xmm4, %xmm1
+	psrld	$0x17, %xmm4
+	/* Compute indices in xmm5 (need 4x scale).  */
+	movaps	%xmm4, %xmm5
+	paddd	%xmm4, %xmm4
+	paddd	%xmm4, %xmm5
+
+	pextrq	$0x1, %xmm5, %rcx
+	movq	%xmm5, %rdx
+
+
+	/* Move indices into GPRs.  */
+	movl	%edx, %esi
+	movl	%ecx, %edi
+	shrq	$0x20, %rdx
+	shrq	$0x20, %rcx
+
+	lea	AVX512_SHARED_DATA(_Reduction)(%rip), %rax
+	movq	0(%rax, %rcx, 4), %xmm4
+	movq	0(%rax, %rdi, 4), %xmm5
+	punpckldq %xmm4, %xmm5
+	movq	0(%rax, %rsi, 4), %xmm4
+	movq	0(%rax, %rdx, 4), %xmm2
+	movaps	AVX2_SHARED_DATA(_Low16)(%rip), %xmm9
+	punpckldq %xmm2, %xmm4
+	movaps	%xmm4, %xmm2
+	punpcklqdq %xmm5, %xmm4
+	punpckhqdq %xmm5, %xmm2
+
+	/* Break the P_xxx and m into 16-bit chunks ready for
+	   the long multiplication via 16x16->32 multiplications.  */
+	movaps	%xmm4, %xmm5
+	pand	%xmm9, %xmm4
+	psrld	$0x10, %xmm5
+	movaps	%xmm4, %xmm6
+	psrlq	$0x20, %xmm4
+	movaps	COMMON_DATA(_NotiOffExpoMask)(%rip), %xmm8
+	pandn	%xmm1, %xmm8
+	/* Also get the significand as an integer
+	   NB: adding in the integer bit is wrong for denorms!
+	   To make this work for denorms we should do something
+	   slightly different.  */
+	movaps	LOCAL_DATA(_sRangeVal)(%rip), %xmm7
+	paddd	%xmm7, %xmm1
+	movmskps %xmm1, %r8d
 
-	/*
-	 * We want to incorporate the original sign now too.
-	 * Do it here for convenience in getting the right N value,
-	 * though we could wait right to the end if we were prepared
-	 * to modify the sign of N later too.
-	 * So get the appropriate sign mask now (or sooner).
-	 */
-	movl	$-2147483648, %edx
-	movd	(%rcx, %r8), %xmm8
+	por	%xmm8, %xmm7
 
-	/*
-	 * Create floating-point high part, implicitly adding integer bit 1
-	 * Incorporate overall sign at this stage too.
-	 */
-	movl	$1065353216, %ecx
-	movd	(%rsi, %r8), %xmm15
+	pand	%xmm9, %xmm8
+	movaps	%xmm8, %xmm1
 
-	/*
-	 * Now round at the 2^-8 bit position for reduction mod pi/2^7
-	 * instead of the original 2pi (but still with the same 2pi scaling).
-	 * Use a shifter of 2^15 + 2^14.
-	 * The N we get is our final version; it has an offset of
-	 * 2^8 because of the implicit integer bit, and anyway for negative
-	 * starting value it's a 2s complement thing. But we need to mask
-	 * off the exponent part anyway so it's fine.
-	 */
-	movl	$1195376640, %esi
-	movd	(%rdi, %r8), %xmm1
-	movl	$511, %r10d
-	movups	%xmm0, 112(%rsp)
-	movd	%r11d, %xmm0
-	pshufd	$0, %xmm4, %xmm12
-	movdqa	%xmm2, %xmm4
-	punpckldq %xmm8, %xmm9
-	paddd	%xmm12, %xmm13
-	punpckldq %xmm1, %xmm15
-	movdqa	%xmm13, %xmm12
-	pshufd	$0, %xmm0, %xmm8
-	movdqa	%xmm6, %xmm0
-	punpcklqdq %xmm15, %xmm9
-	pand	%xmm8, %xmm13
-	movdqa	%xmm9, %xmm14
-	pand	%xmm8, %xmm9
-	movdqa	%xmm13, %xmm10
-	psrld	$16, %xmm14
-	movdqu	%xmm14, 128(%rsp)
-
-	/* Now do the big multiplication and carry propagation */
-	movdqa	%xmm9, %xmm14
-	psrlq	$32, %xmm10
-	psrlq	$32, %xmm14
-	movdqa	%xmm13, %xmm15
-	movdqa	%xmm10, %xmm7
-	pmuludq	%xmm9, %xmm15
-	psrld	$16, %xmm0
-	pmuludq	%xmm14, %xmm7
-	movdqu	%xmm9, 144(%rsp)
-	psllq	$32, %xmm7
-	movdqu	.FLT_16(%rip), %xmm9
-	pand	%xmm8, %xmm6
-	pand	%xmm9, %xmm15
-	psrld	$16, %xmm12
-	movdqa	%xmm0, %xmm1
-	por	%xmm7, %xmm15
-	movdqa	%xmm13, %xmm7
-	pand	%xmm8, %xmm3
-	movdqu	%xmm0, 160(%rsp)
-	movdqa	%xmm12, %xmm11
-	movdqu	%xmm15, 208(%rsp)
-	psrlq	$32, %xmm1
-	pmuludq	%xmm0, %xmm7
-	movdqa	%xmm6, %xmm5
-	movdqa	%xmm10, %xmm15
-	movdqa	%xmm12, %xmm0
-	movdqu	%xmm14, 176(%rsp)
-	psrlq	$32, %xmm11
-	movdqu	%xmm1, 192(%rsp)
-	psrlq	$32, %xmm5
-	pmuludq	%xmm1, %xmm15
-	movdqa	%xmm13, %xmm1
-	pmuludq	%xmm3, %xmm0
+	psrlq	$0x20, %xmm8
+	movaps	%xmm8, %xmm10
+	pmuludq	%xmm4, %xmm8
+	psllq	$0x20, %xmm8
+	movaps	%xmm1, %xmm11
 	pmuludq	%xmm6, %xmm1
-	pmuludq	%xmm12, %xmm6
-	movdqa	%xmm10, %xmm14
-	psrlq	$32, %xmm3
-	pmuludq	%xmm5, %xmm14
-	pand	%xmm9, %xmm1
-	pmuludq	%xmm11, %xmm3
-	pmuludq	%xmm11, %xmm5
-	psllq	$32, %xmm14
-	pand	%xmm9, %xmm0
-	psllq	$32, %xmm3
-	psrlq	$32, %xmm4
-	por	%xmm14, %xmm1
-	por	%xmm3, %xmm0
-	movdqa	%xmm12, %xmm14
-	movdqa	%xmm11, %xmm3
-	pmuludq	%xmm2, %xmm14
-	pand	%xmm9, %xmm7
-	pmuludq	%xmm4, %xmm3
-	pmuludq	%xmm13, %xmm2
-	pmuludq	%xmm10, %xmm4
-	pand	%xmm9, %xmm2
-	psllq	$32, %xmm4
-	psllq	$32, %xmm15
-	pand	%xmm9, %xmm14
-	psllq	$32, %xmm3
-	por	%xmm4, %xmm2
-	por	%xmm15, %xmm7
-	por	%xmm3, %xmm14
-	psrld	$16, %xmm2
-	pand	%xmm9, %xmm6
-	psllq	$32, %xmm5
-	movdqa	%xmm1, %xmm15
-	paddd	%xmm2, %xmm14
-	movdqa	%xmm7, %xmm2
-	por	%xmm5, %xmm6
-	psrld	$16, %xmm1
-	pand	%xmm8, %xmm2
+	blendps	$0xaa, %xmm8, %xmm1
+	movaps	%xmm1, %xmm8
+	psrld	$0x10, %xmm1
+	pand	%xmm9, %xmm8
+	movaps	%xmm7, %xmm13
+	psrld	$0x10, %xmm7
+	psrlq	$0x30, %xmm13
+	pmuludq	%xmm7, %xmm6
+	pmuludq	%xmm13, %xmm4
+	psllq	$0x20, %xmm4
+	blendps	$0xaa, %xmm4, %xmm6
 	paddd	%xmm1, %xmm6
-	movdqu	160(%rsp), %xmm1
-	paddd	%xmm6, %xmm2
-	movdqu	192(%rsp), %xmm6
-	psrld	$16, %xmm7
-	pmuludq	%xmm12, %xmm1
-	pand	%xmm8, %xmm15
-	pmuludq	%xmm11, %xmm6
-	pmuludq	144(%rsp), %xmm12
-	pmuludq	176(%rsp), %xmm11
+	movaps	%xmm5, %xmm4
+	psrlq	$0x20, %xmm5
+	pmuludq	%xmm11, %xmm4
+	pmuludq	%xmm10, %xmm5
+	psllq	$0x20, %xmm5
+	blendps	$0xaa, %xmm5, %xmm4
+	pand	%xmm9, %xmm4
+	paddd	%xmm6, %xmm4
+	movaps	%xmm2, %xmm5
+	psrld	$0x10, %xmm2
+	movaps	%xmm11, %xmm6
+	pmuludq	%xmm2, %xmm11
+	pmuludq	%xmm7, %xmm2
+	movaps	%xmm5, %xmm1
+	psrlq	$0x30, %xmm5
 	pand	%xmm9, %xmm1
-	psllq	$32, %xmm6
-	por	%xmm6, %xmm1
-	psrld	$16, %xmm0
-	paddd	%xmm7, %xmm1
-	paddd	%xmm14, %xmm15
-	movdqu	128(%rsp), %xmm7
-	paddd	%xmm15, %xmm0
-	pmuludq	%xmm7, %xmm13
-	psrlq	$32, %xmm7
-	pmuludq	%xmm7, %xmm10
-	movdqa	%xmm0, %xmm14
-	pand	%xmm9, %xmm13
-	movdqu	208(%rsp), %xmm5
-	psrld	$16, %xmm14
-	paddd	%xmm2, %xmm14
-	movdqa	%xmm5, %xmm15
-	movdqa	%xmm14, %xmm3
-	pand	%xmm8, %xmm15
-	psrld	$16, %xmm3
-	paddd	%xmm1, %xmm15
-	psllq	$32, %xmm10
-	pand	%xmm9, %xmm12
-	psllq	$32, %xmm11
-	paddd	%xmm15, %xmm3
-	por	%xmm10, %xmm13
-	por	%xmm11, %xmm12
-	psrld	$16, %xmm5
-	movdqa	%xmm3, %xmm4
-	pand	%xmm8, %xmm13
+	movaps	%xmm10, %xmm12
+	pmuludq	%xmm5, %xmm10
+	psllq	$0x20, %xmm10
+	blendps	$0xaa, %xmm10, %xmm11
+	pmuludq	%xmm13, %xmm5
+	psllq	$0x20, %xmm5
+	blendps	$0xaa, %xmm5, %xmm2
+	movaps	%xmm11, %xmm5
+	pand	%xmm9, %xmm11
+	psrld	$0x10, %xmm5
+	paddd	%xmm5, %xmm2
+	paddd	%xmm2, %xmm8
+	movaps	%xmm6, %xmm5
+	pmuludq	%xmm1, %xmm6
+	movaps	%xmm1, %xmm2
+	psrlq	$0x20, %xmm1
+	pmuludq	%xmm7, %xmm2
+	movaps	%xmm12, %xmm10
+	pmuludq	%xmm1, %xmm12
+	psllq	$0x20, %xmm12
+	pmuludq	%xmm13, %xmm1
+	psllq	$0x20, %xmm1
+	blendps	$0xaa, %xmm1, %xmm2
+	blendps	$0xaa, %xmm12, %xmm6
+	movaps	%xmm6, %xmm1
+	psrld	$0x10, %xmm6
+	pand	%xmm9, %xmm1
+	paddd	%xmm6, %xmm2
+	paddd	%xmm2, %xmm11
+	movd	8(%rax, %rcx, 4), %xmm2
+	movd	8(%rax, %rdi, 4), %xmm6
+	punpckldq %xmm2, %xmm6
+	movd	8(%rax, %rdx, 4), %xmm2
+	movd	8(%rax, %rsi, 4), %xmm12
+	punpckldq %xmm2, %xmm12
+	punpcklqdq %xmm6, %xmm12
+	movaps	%xmm12, %xmm2
+	psrld	$0x10, %xmm12
+	pmuludq	%xmm12, %xmm5
+	pmuludq	%xmm7, %xmm12
+	movaps	%xmm2, %xmm6
+	psrlq	$0x30, %xmm2
+	pand	%xmm9, %xmm6
+	pmuludq	%xmm6, %xmm7
+	psrlq	$0x20, %xmm6
+	pmuludq	%xmm13, %xmm6
+	psllq	$0x20, %xmm6
+	blendps	$0xaa, %xmm6, %xmm7
+	psrld	$0x10, %xmm7
+	pmuludq	%xmm2, %xmm13
+	pmuludq	%xmm10, %xmm2
+	psllq	$0x20, %xmm2
+	psllq	$0x20, %xmm13
+	blendps	$0xaa, %xmm2, %xmm5
+	psrld	$0x10, %xmm5
+	blendps	$0xaa, %xmm13, %xmm12
 	paddd	%xmm5, %xmm12
-	psrld	$16, %xmm4
-	paddd	%xmm12, %xmm13
-	paddd	%xmm13, %xmm4
-	pand	%xmm8, %xmm3
-	pslld	$16, %xmm4
-	movd	%edx, %xmm9
-	movups	48(%rsp), %xmm15
-	paddd	%xmm3, %xmm4
-	pshufd	$0, %xmm9, %xmm7
-
-	/* Assemble reduced argument from the pieces */
-	pand	%xmm8, %xmm0
-	movd	%ecx, %xmm8
-	pand	%xmm15, %xmm7
-	pshufd	$0, %xmm8, %xmm1
-	movdqa	%xmm4, %xmm5
-	psrld	$9, %xmm5
-	pxor	%xmm7, %xmm1
-	por	%xmm1, %xmm5
-	movd	%esi, %xmm6
-	pshufd	$0, %xmm6, %xmm3
-	movdqa	%xmm5, %xmm6
-	movl	$262143, %r8d
-
-	/*
-	 * Create floating-point low and medium parts, respectively
-	 * lo_17, ... lo_0, 0, ..., 0
-	 * hi_8, ... hi_0, lo_31, ..., lo_18
-	 * then subtract off the implicitly added integer bits,
-	 * 2^-46 and 2^-23, respectively.
-	 * Put the original sign into all of them at this stage.
-	 */
-	movl	$679477248, %edi
-	movd	%r10d, %xmm13
-	pslld	$16, %xmm14
-	pshufd	$0, %xmm13, %xmm1
-	paddd	%xmm0, %xmm14
-	movd	%r9d, %xmm11
-	pand	%xmm4, %xmm1
-	movd	%r8d, %xmm9
-	movd	%edi, %xmm10
-	pshufd	$0, %xmm9, %xmm8
-	pslld	$14, %xmm1
-	pshufd	$0, %xmm10, %xmm0
-	pand	%xmm14, %xmm8
-	pshufd	$0, %xmm11, %xmm12
-	psrld	$18, %xmm14
-	pxor	%xmm7, %xmm0
-	pxor	%xmm12, %xmm7
-	por	%xmm14, %xmm1
-	pslld	$5, %xmm8
-	por	%xmm7, %xmm1
-
-	/*
-	 * Now multiply those numbers all by 2 pi, reasonably accurately.
-	 * The top part uses 2pi = s2pi_lead + s2pi_trail, where
-	 * s2pi_lead has 12 significant bits.
-	 */
-	movl	$1086918619, %r11d
-
-	/* Split RHi into 12-bit leading and trailing parts. */
-	movl	$-4096, %esi
-	por	%xmm0, %xmm8
-	movl	$1086918656, %edx
-	movl	$-1214941318, %ecx
-
-	/*
-	 * If the magnitude of the input is <= 2^-20, then
-	 * just pass through the input, since no reduction will be needed and
-	 * the main path will only work accurately if the reduced argument is
-	 * about >= 2^-40 (which it is for all large pi multiples)
-	 */
-	movl	$2147483647, %edi
-	addps	%xmm3, %xmm6
-	subps	%xmm7, %xmm1
-	subps	%xmm0, %xmm8
-	movaps	%xmm6, %xmm2
-	movd	%r11d, %xmm14
-	movd	%esi, %xmm4
-	movd	%edx, %xmm7
-	movl	$897581056, %r8d
-	subps	%xmm3, %xmm2
-
-	/* Grab our final N value as an integer, appropriately masked mod 2^8 */
-	movl	$255, %r9d
-	subps	%xmm2, %xmm5
-
-	/* Now add them up into 2 reasonably aligned pieces */
-	movaps	%xmm5, %xmm3
-
-	/*
-	 * The output is _VRES_R (high) + _VRES_E (low), and the integer part is _VRES_IND
-	 * Set sRp2 = _VRES_R^2 and then resume the original code.
-	 * Argument reduction is now finished: x = n * pi/128 + r
-	 * where n = iIndex and r = sR (high) + sE (low).
-	 * But we have n modulo 256, needed for sin/cos with period 2pi
-	 * but we want it modulo 128 since tan has period pi.
-	 */
-	movl	$127, %r10d
-	pshufd	$0, %xmm14, %xmm2
-	addps	%xmm1, %xmm3
-	pshufd	$0, %xmm4, %xmm14
-	movd	%r8d, %xmm4
-	pshufd	$0, %xmm4, %xmm9
-	subps	%xmm3, %xmm5
-	movdqa	%xmm9, %xmm11
-	addps	%xmm5, %xmm1
-	movd	%ecx, %xmm5
-	addps	%xmm1, %xmm8
-	pshufd	$0, %xmm7, %xmm1
-	movdqa	%xmm14, %xmm7
-	andps	%xmm3, %xmm7
+	paddd	%xmm12, %xmm1
+	paddd	%xmm1, %xmm7
+	movaps	%xmm7, %xmm5
+	psrld	$0x10, %xmm7
+	pand	%xmm9, %xmm5
+	paddd	%xmm11, %xmm7
+	movaps	%xmm7, %xmm2
+	psrld	$0x10, %xmm7
+	paddd	%xmm8, %xmm7
+	pslld	$0x10, %xmm2
+	paddd	%xmm5, %xmm2
+	pand	%xmm7, %xmm9
+	psrld	$0x10, %xmm7
+	paddd	%xmm4, %xmm7
+	pslld	$0x10, %xmm7
+	paddd	%xmm9, %xmm7
+	movaps	%xmm7, %xmm4
+	/* Assemble reduced argument from the pieces.  */
+	psrld	$0x9, %xmm7
+	/* Create floating-point high part, implicitly adding integer
+	   bit 1
+	   Incorporate overall sign at this stage too.  */
+	por	COMMON_DATA(_OneF)(%rip), %xmm7
+	movaps	AVX2_SHARED_DATA(_SH_FLT_1)(%rip), %xmm9
+	movaps	%xmm7, %xmm5
+	addps	%xmm9, %xmm7
+	movaps	%xmm7, %xmm6
+	subps	%xmm9, %xmm7
+	/* Grab our final N value as an integer, appropriately masked
+	   mod 2^8.  */
+	subps	%xmm7, %xmm5
+
+	movaps	%xmm2, %xmm9
+	psrld	$0x12, %xmm2
+	movaps	AVX2_SHARED_DATA(_Low9)(%rip), %xmm7
+	pand	%xmm4, %xmm7
+	pslld	$0xe, %xmm7
+	por	%xmm2, %xmm7
+	movaps	AVX2_SHARED_DATA(_SH_FLT_3)(%rip), %xmm4
+	por	%xmm4, %xmm7
+	subps	%xmm4, %xmm7
+	movaps	%xmm5, %xmm4
+	addps	%xmm7, %xmm5
 
-	/*
-	 * Do the multiplication as exact top part and "naive" low part.
-	 * This still maintains a similar level of offset and doesn't drop
-	 * the accuracy much below what we already have.
-	 */
-	movdqa	%xmm1, %xmm10
-	pshufd	$0, %xmm5, %xmm5
-	subps	%xmm7, %xmm3
-	mulps	%xmm7, %xmm10
+	/* Split RHi into 12-bit leading and trailing parts.  */
+	movaps	COMMON_DATA(_Neg4096)(%rip), %xmm0
+	subps	%xmm5, %xmm4
+	addps	%xmm4, %xmm7
+	movaps	%xmm0, %xmm4
+	andps	%xmm5, %xmm0
+	subps	%xmm0, %xmm5
+	/* Do the multiplication as exact top part and "naive" low.  */
+	movaps	LOCAL_DATA(_FLT_0)(%rip), %xmm2
+	movaps	%xmm2, %xmm8
+	mulps	%xmm5, %xmm2
+	movaps	AVX2_SHARED_DATA(_Low18)(%rip), %xmm10
+
+	mulps	%xmm0, %xmm8
+
+
+	pand	%xmm9, %xmm10
+	pslld	$0x5, %xmm10
+	movaps	AVX2_SHARED_DATA(_SH_FLT_2)(%rip), %xmm1
+
+	/* If the magnitude of the input is <= 2^-20, then
+	   just pass through the input, since no reduction will be needed and
+	   the main path will only work accurately if the reduced argument is
+	   about >= 2^-40 (which it is for all large pi multiples).  */
+
+	por	%xmm1, %xmm10
+	subps	%xmm1, %xmm10
+	addps	%xmm7, %xmm10
+
+	/* Now multiply those numbers all by 2 pi, reasonably accurately.
+	   The top part uses 2pi = s2pi_lead + s2pi_trail, where
+	   s2pi_lead has 12 significant bits.  */
+	movaps	AVX2_SHARED_DATA(_SH_FLT_4)(%rip), %xmm9
+	mulps	%xmm10, %xmm9
+	addps	%xmm2, %xmm9
+	/* Now add them up into 2 reasonably aligned pieces.  */
+	movaps	LOCAL_DATA(_FLT_1)(%rip), %xmm7
+	mulps	%xmm7, %xmm0
 	mulps	%xmm5, %xmm7
-	mulps	%xmm3, %xmm1
-	mulps	%xmm8, %xmm2
-	mulps	%xmm3, %xmm5
-	addps	%xmm7, %xmm1
-	addps	%xmm5, %xmm2
-	movd	%edi, %xmm8
-	addps	%xmm2, %xmm1
-
-	/*
-	 * Do another stage of compensated summation to get full offset
-	 * between the pieces sRedHi + sRedLo.
-	 * Depending on the later algorithm, we might avoid this stage.
-	 */
+	addps	%xmm8, %xmm7
+	addps	%xmm9, %xmm7
+	addps	%xmm7, %xmm0
+	lea	AVX2_SHARED_DATA(_Coeffs)(%rip), %rax
+
+	/* The output is _VRES_R (high) + _VRES_E (low), and the integer
+	   part is _VRES_IND Set sRp2 = _VRES_R^2 and then resume the
+	   original code. Argument reduction is now finished: x = n *
+	   pi/128 + r where n = iIndex and r = sR (high) + sE (low).
+	   But we have n modulo 256, needed for sin/cos with period 2pi
+	   but we want it modulo 128 since tan has period pi.  */
+	pand	AVX2_SHARED_DATA(_Low7)(%rip), %xmm6
+	movaps	%xmm6, %xmm9
+	/* Simply combine the two parts of the reduced argument
+	   since we can afford a few ulps in this case.  */
+	pslld	$0x2, %xmm6
+	paddd	%xmm9, %xmm6
+	movq	%xmm6, %rcx
+	movl	%ecx, %edx
+	shrq	$0x20, %rcx
+	pextrq	$0x1, %xmm6, %rsi
+	movl	%esi, %edi
+	shrq	$0x20, %rsi
+	movups	16(%rax, %rcx, 8), %xmm9
+	movups	16(%rax, %rdx, 8), %xmm7
+	movaps	%xmm7, %xmm5
+	punpckhdq %xmm9, %xmm7
+	punpckldq %xmm9, %xmm5
+	movups	16(%rax, %rsi, 8), %xmm9
+	movups	16(%rax, %rdi, 8), %xmm2
+	movaps	%xmm2, %xmm6
+	punpckhdq %xmm9, %xmm2
+	punpckldq %xmm9, %xmm6
+	movaps	%xmm7, %xmm9
+	punpckhqdq %xmm2, %xmm7
+	punpcklqdq %xmm2, %xmm9
+
+	/* Higher polynomial terms
+	   Stage 1 (with unlimited parallelism)
+	   P3 = C1_lo + C2 * Z.  */
+	mulps	%xmm0, %xmm7
+	addps	%xmm7, %xmm9
+	movq	32(%rax, %rsi, 8), %xmm7
+	movq	32(%rax, %rdi, 8), %xmm2
+	punpckldq %xmm7, %xmm2
+	movq	32(%rax, %rcx, 8), %xmm7
+	movq	32(%rax, %rdx, 8), %xmm8
+	punpckldq %xmm7, %xmm8
+	movaps	%xmm8, %xmm7
+	punpckhqdq %xmm2, %xmm8
+	punpcklqdq %xmm2, %xmm7
+	mulps	%xmm0, %xmm8
+	addps	%xmm8, %xmm7
+	movaps	%xmm0, %xmm2
+	mulps	%xmm0, %xmm0
+
+	mulps	%xmm0, %xmm7
+	addps	%xmm7, %xmm9
+	/* Final accumulation of low part.  */
+	mulps	%xmm2, %xmm9
+	movups	0(%rax, %rsi, 8), %xmm0
+	movups	0(%rax, %rdi, 8), %xmm7
+	movaps	%xmm7, %xmm8
+	punpckldq %xmm0, %xmm7
+	punpckhdq %xmm0, %xmm8
+	movups	0(%rax, %rcx, 8), %xmm0
+	movups	0(%rax, %rdx, 8), %xmm1
+	movaps	%xmm1, %xmm10
+	punpckldq %xmm0, %xmm1
+	punpckhdq %xmm0, %xmm10
 	movaps	%xmm1, %xmm0
-
-	/*  Load constants (not all needed at once)  */
-	lea	_sCoeffs+36+__svml_stan_data_internal(%rip), %rdi
-	pshufd	$0, %xmm8, %xmm8
-	addps	%xmm10, %xmm0
-	andps	%xmm15, %xmm8
-	subps	%xmm0, %xmm10
-	cmpltps	%xmm8, %xmm11
-	cmpleps	%xmm9, %xmm8
-	addps	%xmm10, %xmm1
-	andps	%xmm15, %xmm8
-	movd	%r9d, %xmm15
-	andps	%xmm11, %xmm0
-	andps	%xmm1, %xmm11
-	pshufd	$0, %xmm15, %xmm1
-	movd	%r10d, %xmm15
-	pshufd	$0, %xmm15, %xmm7
-	pand	%xmm1, %xmm6
-	pand	%xmm7, %xmm6
-	orps	%xmm0, %xmm8
+	punpcklqdq %xmm7, %xmm1
+	punpckhqdq %xmm7, %xmm0
+
+	/* Compute 2-part reciprocal component Construct a separate
+	   reduced argument modulo pi near pi/2 multiples. i.e. (pi/2 -
+	   x) mod pi, simply by subtracting the reduced argument from
+	   an accurate B_hi + B_lo = (128 - n) pi/128. Force the upper
+	   part of this reduced argument to half-length to simplify
+	   accurate reciprocation later on.  */
+	subps	%xmm2, %xmm1
+	movaps	%xmm4, %xmm7
+	andps	%xmm1, %xmm4
+	subps	%xmm4, %xmm1
+	addps	%xmm1, %xmm0
+
+	/* Now compute an approximate reciprocal to mix into the computation
+	   To avoid any danger of nonportability, force it to 12 bits,
+	   though I suspect it always is anyway on current platforms.  */
+	rcpps	%xmm4, %xmm1
+	andps	%xmm7, %xmm1
+	mulps	%xmm1, %xmm4
+	movaps	%xmm10, %xmm7
+	punpcklqdq %xmm8, %xmm10
+	punpckhqdq %xmm8, %xmm7
+	movaps	%xmm1, %xmm8
+	/* Finally, multiplex both parts so they are only used in
+	   cotangent path.  */
+	mulps	%xmm10, %xmm1
+	movaps	%xmm5, %xmm11
+	punpckhqdq %xmm6, %xmm5
+	punpcklqdq %xmm6, %xmm11
+
+	/* Compensated sum of dominant component(s) Compute C0_hi +
+	   C1_hi * Z + Recip_hi + Recip_lo = H4 (hi) + H9 (lo) H1 =
+	   C1_hi * Z (exact since C1_hi is 1 bit).  */
+	mulps	%xmm2, %xmm5
+	movaps	%xmm7, %xmm2
+	/* H2 = high(C0_hi + C1_hi * Z).  */
+	addps	%xmm5, %xmm7
+	/* H4 = high(H2 + Recip_hi).  */
+
+	subps	%xmm7, %xmm2
+	/* H5 = low(C0_hi + C1_hi * Z).  */
+	addps	%xmm2, %xmm5
+	movaps	%xmm7, %xmm2
+	addps	%xmm1, %xmm7
+
+	/* intermediate in compensated sum.  */
+	subps	%xmm7, %xmm1
+	/* H8 = low(H2 + Recip_hi).  */
+	addps	%xmm1, %xmm2
+
+	/* Get a better approximation to  1/sR_hi (not far short of an ulp)
+	   using a third-order polynomial approximation.  */
+	movups	COMMON_DATA(_OneF)(%rip), %xmm6
+	movaps	%xmm6, %xmm1
+	subps	%xmm4, %xmm6
 	movaps	%xmm6, %xmm4
-
-	/*
-	 * Simply combine the two parts of the reduced argument
-	 * since we can afford a few ulps in this case.
-	 */
-	addps	%xmm11, %xmm8
-	pslld	$2, %xmm4
-	paddd	%xmm6, %xmm4
-	pslld	$3, %xmm4
-	pshufd	$1, %xmm4, %xmm6
-	pshufd	$2, %xmm4, %xmm5
-	pshufd	$3, %xmm4, %xmm3
-	movd	%xmm4, %r11d
-	movd	%xmm6, %edx
-	movd	%xmm5, %ecx
-	movd	%xmm3, %esi
-	movd	-32(%r11, %rdi), %xmm15
-	movd	-32(%rdx, %rdi), %xmm12
-	movd	-32(%rcx, %rdi), %xmm7
-	movd	-32(%rsi, %rdi), %xmm13
-	punpckldq %xmm12, %xmm15
-	punpckldq %xmm13, %xmm7
-	movd	-28(%rsi, %rdi), %xmm5
-	punpcklqdq %xmm7, %xmm15
-	movd	-28(%r11, %rdi), %xmm7
-	movd	-28(%rdx, %rdi), %xmm6
-	movd	-28(%rcx, %rdi), %xmm4
-	movd	-36(%rcx, %rdi), %xmm9
-	movd	-36(%r11, %rdi), %xmm1
-	movd	-36(%rdx, %rdi), %xmm2
-	movd	-24(%rdx, %rdi), %xmm3
-	movd	-36(%rsi, %rdi), %xmm10
-	punpckldq %xmm6, %xmm7
-	punpckldq %xmm5, %xmm4
-	movd	-24(%r11, %rdi), %xmm6
-	punpckldq %xmm2, %xmm1
-	punpckldq %xmm10, %xmm9
-	punpcklqdq %xmm4, %xmm7
-	movd	-16(%r11, %rdi), %xmm4
-	punpckldq %xmm3, %xmm6
-	movd	-24(%rcx, %rdi), %xmm10
-	movd	-16(%rcx, %rdi), %xmm3
-	movd	-24(%rsi, %rdi), %xmm2
-	movd	-16(%rsi, %rdi), %xmm13
-	movd	-16(%rdx, %rdi), %xmm12
-	punpcklqdq %xmm9, %xmm1
-	movd	-20(%rdx, %rdi), %xmm9
-	punpckldq %xmm2, %xmm10
-	movd	-20(%r11, %rdi), %xmm5
-	movd	-20(%rcx, %rdi), %xmm11
-	movd	-20(%rsi, %rdi), %xmm0
-	punpckldq %xmm12, %xmm4
-	punpckldq %xmm13, %xmm3
-	punpcklqdq %xmm10, %xmm6
-	movd	-12(%rsi, %rdi), %xmm10
-	punpckldq %xmm9, %xmm5
-	punpckldq %xmm0, %xmm11
-	punpcklqdq %xmm3, %xmm4
-	movd	-12(%r11, %rdi), %xmm3
-	movd	-12(%rdx, %rdi), %xmm2
-	movd	-12(%rcx, %rdi), %xmm9
-	punpcklqdq %xmm11, %xmm5
-	punpckldq %xmm2, %xmm3
-	punpckldq %xmm10, %xmm9
-	movd	-8(%rcx, %rdi), %xmm10
-	movd	-8(%r11, %rdi), %xmm2
-	movd	-8(%rdx, %rdi), %xmm0
-	movd	-8(%rsi, %rdi), %xmm11
-	punpckldq %xmm0, %xmm2
-	punpckldq %xmm11, %xmm10
-	movd	-4(%rsi, %rdi), %xmm13
-	punpcklqdq %xmm9, %xmm3
-	punpcklqdq %xmm10, %xmm2
-	movd	-4(%r11, %rdi), %xmm10
-	movd	-4(%rdx, %rdi), %xmm12
-	movd	-4(%rcx, %rdi), %xmm9
-	punpckldq %xmm12, %xmm10
-	punpckldq %xmm13, %xmm9
-	punpcklqdq %xmm9, %xmm10
-
-	/*
-	 *  Compute 2-part reciprocal component
-	 * Construct a separate reduced argument modulo pi near pi/2 multiples.
-	 * i.e. (pi/2 - x) mod pi, simply by subtracting the reduced argument
-	 * from an accurate B_hi + B_lo = (128 - n) pi/128. Force the upper part
-	 * of this reduced argument to half-length to simplify accurate
-	 * reciprocation later on.
-	 */
-	movdqa	%xmm1, %xmm9
-	movd	(%r11, %rdi), %xmm13
-	subps	%xmm8, %xmm9
-	movd	(%rdx, %rdi), %xmm0
-	subps	%xmm9, %xmm1
-	punpckldq %xmm0, %xmm13
-	movdqa	%xmm14, %xmm0
-	andps	%xmm9, %xmm0
-	subps	%xmm8, %xmm1
-	subps	%xmm0, %xmm9
-	movd	(%rcx, %rdi), %xmm12
-	addps	%xmm9, %xmm15
-
-	/*
-	 * Now compute an approximate reciprocal to mix into the computation
-	 * To avoid any danger of nonportability, force it to 12 bits,
-	 * though I suspect it always is anyway on current platforms.
-	 */
-	rcpps	%xmm0, %xmm9
-	addps	%xmm15, %xmm1
-	andps	%xmm14, %xmm9
-	mulps	%xmm9, %xmm0
-
-	/*
-	 * Get a better approximation to  1/sR_hi (not far short of an ulp)
-	 * using a third-order polynomial approximation
-	 */
-	movaps	%xmm9, %xmm14
-	movd	(%rsi, %rdi), %xmm11
-
-	/*
-	 * Now compute the error sEr where sRecip_hi = (1/R_hi) * (1 - sEr)
-	 * so that we can compensate for it.
-	 */
-	movups	_sOne+__svml_stan_data_internal(%rip), %xmm15
-	punpckldq %xmm11, %xmm12
-	movaps	%xmm15, %xmm11
-	punpcklqdq %xmm12, %xmm13
-	subps	%xmm0, %xmm11
-	mulps	%xmm11, %xmm14
-	movups	%xmm11, (%rsp)
-	addps	%xmm9, %xmm14
-	mulps	%xmm11, %xmm11
-	movups	%xmm13, 32(%rsp)
-	movups	%xmm11, 16(%rsp)
-	movups	112(%rsp), %xmm0
-	movups	96(%rsp), %xmm11
-	movups	80(%rsp), %xmm12
-	movups	64(%rsp), %xmm13
-	# LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15
-
-	/*
-	 *  Compensated sum of dominant component(s)
-	 * Compute C0_hi + C1_hi * Z + Recip_hi + Recip_lo = H4 (hi) + H9 (lo)
-	 * H1 = C1_hi * Z (exact since C1_hi is 1 bit)
-	 */
-	mulps	%xmm8, %xmm4
-	addps	16(%rsp), %xmm15
-
-	/* Finally, multiplex both parts so they are only used in cotangent path */
-	mulps	%xmm7, %xmm9
-
-	/*
-	 *  Higher polynomial terms
-	 * Stage 1 (with unlimited parallelism)
-	 * P3 = C1_lo + C2 * Z
-	 */
-	mulps	%xmm8, %xmm2
-	mulps	%xmm15, %xmm14
-	addps	%xmm2, %xmm3
-
-	/*
-	 * Multiply by sRecip_ok to make sR_lo relative to sR_hi
-	 * Since sR_lo is shifted off by about 12 bits, this is accurate enough.
-	 */
-	mulps	%xmm14, %xmm1
-
-	/*
-	 * Now create a low reciprocal using
-	 * (Recip_hi + Er * Recip_ok) * (1 + sR_lo^2 - sR_lo)
-	 * =~= Recip_hi + Recip_ok * (Er + sR_lo^2 - sR_lo)
-	 */
-	movaps	%xmm1, %xmm15
-	mulps	%xmm1, %xmm1
-	subps	(%rsp), %xmm15
-
-	/* P4 = C3 + C4 * Z */
-	movups	32(%rsp), %xmm2
-	subps	%xmm15, %xmm1
-	mulps	%xmm8, %xmm2
-	mulps	%xmm1, %xmm14
+	mulps	%xmm6, %xmm6
+	addps	%xmm6, %xmm1
+	movaps	%xmm8, %xmm6
+	mulps	%xmm4, %xmm8
+	addps	%xmm6, %xmm8
+	mulps	%xmm1, %xmm8
+
+	/* Multiply by sRecip_ok to make sR_lo relative to sR_hi Since
+	   sR_lo is shifted off by about 12 bits, this is accurate
+	   enough.  */
+	mulps	%xmm8, %xmm0
+	movaps	%xmm0, %xmm6
+	subps	%xmm4, %xmm0
+
+	/* Now create a low reciprocal using
+	   (Recip_hi + Er * Recip_ok) * (1 + sR_lo^2 - sR_lo)
+	   =~= Recip_hi + Recip_ok * (Er + sR_lo^2 - sR_lo).  */
+	mulps	%xmm6, %xmm6
+	/* P4 = C3 + C4 * Z.  */
+	subps	%xmm0, %xmm6
+	mulps	%xmm6, %xmm8
+	mulps	%xmm8, %xmm10
+	/* H7 = low(C0_hi + C1_hi * Z) + Recip_lo.  */
+	addps	%xmm5, %xmm10
+	/* Z2 = Z^2.  */
+
+	/* Now H4 + H9 should be that part.  */
 	addps	%xmm2, %xmm10
-	mulps	%xmm14, %xmm7
-
-	/* H2 = high(C0_hi + C1_hi * Z) */
-	movdqa	%xmm6, %xmm14
-	addps	%xmm4, %xmm14
-
-	/* H4 = high(H2 + Recip_hi) */
-	movaps	%xmm14, %xmm1
-
-	/* intermediate in compensated sum */
-	subps	%xmm14, %xmm6
-	addps	%xmm9, %xmm1
-
-	/* H5 = low(C0_hi + C1_hi * Z) */
-	addps	%xmm6, %xmm4
-
-	/* intermediate in compensated sum */
-	subps	%xmm1, %xmm9
-
-	/* H7 = low(C0_hi + C1_hi * Z) + Recip_lo */
-	addps	%xmm4, %xmm7
-
-	/* H8 = low(H2 + Recip_hi) */
-	addps	%xmm9, %xmm14
-
-	/* Z2 = Z^2 */
-	movaps	%xmm8, %xmm4
+	/* P9 = trail(dominant part) + C0_lo.  */
+	addps	%xmm10, %xmm11
+	/* Merge results from main and large paths:.  */
+	addps	%xmm9, %xmm11
+	addps	%xmm7, %xmm11
+	/* And now the very final summation.  */
+	andps	%xmm14, %xmm11
+
+	/* The end of implementation (LA with huge args reduction)
+	   End of large arguments path (_HA_, _LA_ and _EP_).  */
+	orps	%xmm3, %xmm11
+	movups	COMMON_DATA(_AbsMask)(%rip), %xmm3
+	andnps	%xmm15, %xmm3
+
+	/* Incorperate original sign.  */
+	xorps	%xmm3, %xmm11
+	/* Return to main vector processing path.  */
+	testl	%r8d, %r8d
+	/* Go to special inputs processing branch.  */
+	jne	L(SPECIAL_VALUES_BRANCH)
+	movaps	%xmm11, %xmm0
+	ret
 
-	/* Now H4 + H9 should be that part */
-	addps	%xmm14, %xmm7
-	mulps	%xmm8, %xmm4
 
-	/* P9 = trail(dominant part) + C0_lo */
-	addps	%xmm7, %xmm5
-
-	/*
-	 * Stage 2 (with unlimited parallelism)
-	 * P6 = C1_lo + C2 * Z + C3 * Z^2 + C4 * Z^3
-	 */
-	mulps	%xmm4, %xmm10
-	addps	%xmm10, %xmm3
-
-	/* Final accumulation of low part */
-	mulps	%xmm3, %xmm8
+	/* Cold case. edx has 1s where there was a special value that
+	   needs to be handled by a tanhf call. Optimize for code size
+	   more so than speed here.  */
+L(SPECIAL_VALUES_BRANCH):
 
-	/* Merge results from main and large paths: */
-	movaps	%xmm11, %xmm3
-	andnps	%xmm0, %xmm3
-	addps	%xmm8, %xmm5
-	movaps	%xmm3, %xmm0
+	/* Stack coming in 16-byte aligned. Set 8-byte misaligned so on
+	   call entry will be 16-byte aligned.  */
+	subq	$56, %rsp
+	cfi_def_cfa_offset (64)
+	movups	%xmm11, 24(%rsp)
+	movups	%xmm15, 40(%rsp)
+
+	/* Use rbx/rbp for callee save registers as they get short
+	   encoding for many instructions (as compared with r12/r13).  */
+	movq	%rbx, (%rsp)
+	cfi_offset (rbx, -64)
+	movq	%rbp, 8(%rsp)
+	cfi_offset (rbp, -56)
+	/* r8d has 1s where there was a special value that needs to be
+	   handled by a tanf call.  */
+	movl	%r8d, %ebx
+L(SPECIAL_VALUES_LOOP):
 
-	/* And now the very final summation */
-	addps	%xmm5, %xmm1
+	/* use rbp as index for special value that is saved across calls
+	   to tanhf. We technically don't need a callee save register
+	   here as offset to rsp is always [0, 12] so we can restore
+	   rsp by realigning to 64. Essentially the tradeoff is 1 extra
+	   save/restore vs 2 extra instructions in the loop.  */
+	xorl	%ebp, %ebp
+	bsfl	%ebx, %ebp
 
-	/*
-	 *  The end of implementation (LA with huge args reduction)
-	 * End of large arguments path (_HA_, _LA_ and _EP_)
-	 */
+	/* Scalar math fucntion call to process special input.  */
+	movss	40(%rsp, %rbp, 4), %xmm0
+	call	tanf@PLT
 
-	pxor	%xmm12, %xmm1
-	andps	%xmm11, %xmm1
-	orps	%xmm1, %xmm0
+	/* No good way to avoid the store-forwarding fault this will
+	   cause on return. `lfence` avoids the SF fault but at greater
+	   cost as it serialized stack/callee save restoration.  */
+	movss	%xmm0, 24(%rsp, %rbp, 4)
+
+	leal	-1(%rbx), %eax
+	andl	%eax, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+
+	/* All results have been written to 24(%rsp).  */
+	movups	24(%rsp), %xmm0
+	movq	(%rsp), %rbx
+	cfi_restore (rbx)
+	movq	8(%rsp), %rbp
+	cfi_restore (rbp)
+	addq	$56, %rsp
+	cfi_def_cfa_offset (8)
+	ret
 
-	/* Return to main vector processing path */
-	jmp	L(AUX_BRANCH_RETURN)
-	# LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm13
 END(_ZGVbN4v_tanf_sse4)
 
-	.section .rodata, "a"
+	.section .rodata.sse4, "a"
 	.align	16
 
-#ifdef __svml_stan_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(16)) VUINT32 _sInvPI_uisa[4][1];
-	__declspec(align(16)) VUINT32 _sPI1_uisa[4][1];
-	__declspec(align(16)) VUINT32 _sPI2_uisa[4][1];
-	__declspec(align(16)) VUINT32 _sPI3_uisa[4][1];
-	__declspec(align(16)) VUINT32 _sPI2_ha_uisa[4][1];
-	__declspec(align(16)) VUINT32 _sPI3_ha_uisa[4][1];
-	__declspec(align(16)) VUINT32 Th_tbl_uisa[32][1];
-	__declspec(align(16)) VUINT32 Tl_tbl_uisa[32][1];
-	__declspec(align(16)) VUINT32 _sPC3_uisa[4][1];
-	__declspec(align(16)) VUINT32 _sPC5_uisa[4][1];
-	__declspec(align(16)) VUINT32 _sRangeReductionVal_uisa[4][1];
-	__declspec(align(16)) VUINT32 _sInvPi[4][1];
-	__declspec(align(16)) VUINT32 _sSignMask[4][1];
-	__declspec(align(16)) VUINT32 _sAbsMask[4][1];
-	__declspec(align(16)) VUINT32 _sRangeVal[4][1];
-	__declspec(align(16)) VUINT32 _sRShifter[4][1];
-	__declspec(align(16)) VUINT32 _sOne[4][1];
-	__declspec(align(16)) VUINT32 _sRangeReductionVal[4][1];
-	__declspec(align(16)) VUINT32 _sPI1[4][1];
-	__declspec(align(16)) VUINT32 _sPI2[4][1];
-	__declspec(align(16)) VUINT32 _sPI3[4][1];
-	__declspec(align(16)) VUINT32 _sPI4[4][1];
-	__declspec(align(16)) VUINT32 _sPI1_FMA[4][1];
-	__declspec(align(16)) VUINT32 _sPI2_FMA[4][1];
-	__declspec(align(16)) VUINT32 _sPI3_FMA[4][1];
-	__declspec(align(16)) VUINT32 _sP0[4][1];
-	__declspec(align(16)) VUINT32 _sP1[4][1];
-	__declspec(align(16)) VUINT32 _sQ0[4][1];
-	__declspec(align(16)) VUINT32 _sQ1[4][1];
-	__declspec(align(16)) VUINT32 _sQ2[4][1];
-	__declspec(align(16)) VUINT32 _sTwo[4][1];
-	__declspec(align(16)) VUINT32 _sCoeffs[128][10][1];
-} __svml_stan_data_internal;
-#endif
-__svml_stan_data_internal:
-	/* UISA */
-	.long	0x4122f983, 0x4122f983, 0x4122f983, 0x4122f983 /* _sInvPI_uisa */
-	.align	16
-	.long	0x3dc90fda, 0x3dc90fda, 0x3dc90fda, 0x3dc90fda /* _sPI1_uisa */
-	.align	16
-	.long	0x31a22168, 0x31a22168, 0x31a22168, 0x31a22168 /* _sPI2_uisa */
-	.align	16
-	.long	0x25c234c5, 0x25c234c5, 0x25c234c5, 0x25c234c5 /* _sPI3_uisa */
-	.align	16
-	.long	0x31a22000, 0x31a22000, 0x31a22000, 0x31a22000 /* _sPI2_ha_uisa */
-	.align	16
-	.long	0x2a34611a, 0x2a34611a, 0x2a34611a, 0x2a34611a /* _sPI3_ha_uisa */
-	/* Th_tbl_uisa for i from 0 to 31 do printsingle(tan(i*Pi/32)); */
-	.align	16
-	.long	0x80000000, 0x3dc9b5dc, 0x3e4bafaf, 0x3e9b5042
-	.long	0x3ed413cd, 0x3f08d5b9, 0x3f2b0dc1, 0x3f521801
-	.long	0x3f800000, 0x3f9bf7ec, 0x3fbf90c7, 0x3fef789e
-	.long	0x401a827a, 0x4052facf, 0x40a0dff7, 0x41227363
-	.long	0xff7fffff, 0xc1227363, 0xc0a0dff7, 0xc052facf
-	.long	0xc01a827a, 0xbfef789e, 0xbfbf90c7, 0xbf9bf7ec
-	.long	0xbf800000, 0xbf521801, 0xbf2b0dc1, 0xbf08d5b9
-	.long	0xbed413cd, 0xbe9b5042, 0xbe4bafaf, 0xbdc9b5dc
-	/* Tl_tbl_uisa for i from 0 to 31 do printsingle(tan(i*Pi/32)-round(tan(i*Pi/32), SG, RN)); */
-	.align	16
-	.long	0x80000000, 0x3145b2da, 0x2f2a62b0, 0xb22a39c2
-	.long	0xb1c0621a, 0xb25ef963, 0x32ab7f99, 0x32ae4285
-	.long	0x00000000, 0x33587608, 0x32169d18, 0xb30c3ec0
-	.long	0xb3cc0622, 0x3390600e, 0x331091dc, 0xb454a046
-	.long	0xf3800000, 0x3454a046, 0xb31091dc, 0xb390600e
-	.long	0x33cc0622, 0x330c3ec0, 0xb2169d18, 0xb3587608
-	.long	0x00000000, 0xb2ae4285, 0xb2ab7f99, 0x325ef963
-	.long	0x31c0621a, 0x322a39c2, 0xaf2a62b0, 0xb145b2da
-	.align	16
-	.long	0x3eaaaaa6, 0x3eaaaaa6, 0x3eaaaaa6, 0x3eaaaaa6 /* _sPC3_uisa */
-	.align	16
-	.long	0x3e08b888, 0x3e08b888, 0x3e08b888, 0x3e08b888 /* _sPC5_uisa */
-	.align	16
-	.long	0x46010000, 0x46010000, 0x46010000, 0x46010000 /* _sRangeReductionVal_uisa */
-	.align	16
-	.long	0x3F22F983, 0x3F22F983, 0x3F22F983, 0x3F22F983 /* _sInvPi */
-	.align	16
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSignMask */
-	.align	16
-	.long	0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF /* _sAbsMask */
-	.align	16
-	.long	0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 /* _sRangeVal */
-	.align	16
-	.long	0x4B400000, 0x4B400000, 0x4B400000, 0x4B400000 /* _sRShifter */
-	.align	16
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 /* _sOne */
-	.align	16
-	.long	0x46010000, 0x46010000, 0x46010000, 0x46010000 /* _sRangeVal */
-	.align	16
-	.long	0x3FC90000, 0x3FC90000, 0x3FC90000, 0x3FC90000 /* _sPI1 */
-	.align	16
-	.long	0x39FDA000, 0x39FDA000, 0x39FDA000, 0x39FDA000 /* _sPI2 */
-	.align	16
-	.long	0x33A22000, 0x33A22000, 0x33A22000, 0x33A22000 /* _sPI3 */
-	.align	16
-	.long	0x2C34611A, 0x2C34611A, 0x2C34611A, 0x2C34611A /* _sPI4 */
-	// PI1, PI2, and PI3 when FMA is available
-	.align	16
-	.long	0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB /* _sPI1_FMA */
-	.align	16
-	.long	0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E /* _sPI2_FMA */
-	.align	16
-	.long	0xA6F72CED, 0xA6F72CED, 0xA6F72CED, 0xA6F72CED /* _sPI3_FMA */
-	.align	16
-	.long	0x3F7FFFFC, 0x3F7FFFFC, 0x3F7FFFFC, 0x3F7FFFFC /* _sP0 */
-	.align	16
-	.long	0xBDC433B4, 0xBDC433B4, 0xBDC433B4, 0xBDC433B4 /* _sP1 */
-	.align	16
-	.long	0x3F7FFFFC, 0x3F7FFFFC, 0x3F7FFFFC, 0x3F7FFFFC /* _sQ0 */
-	.align	16
-	.long	0xBEDBB7AB, 0xBEDBB7AB, 0xBEDBB7AB, 0xBEDBB7AB /* _sQ1 */
-	.align	16
-	.long	0x3C1F336B, 0x3C1F336B, 0x3C1F336B, 0x3C1F336B /* _sQ2 */
-	.align	16
-	.long	0x40000000, 0x40000000, 0x40000000, 0x40000000 /* _sTwo */
-	// _sCoeffs Breakpoint B = 0 * pi/128, function tan(B + x)
-	.align	16
-	.long	0x3FC90FDB // B' = pi/2 - B (high single)
-	.long	0xB33BBD2E // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x00000000 // c0 (high single)
-	.long	0x00000000 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x00000000 // c1 (low single)
-	.long	0x00000000 // c2
-	.long	0x3EAAACDD // c3
-	.long	0x00000000 // c4
-	.long	0x3FC5EB9B // B' = pi/2 - B (high single)
-	.long	0x32DE638C // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3CC91A31 // c0 (high single)
-	.long	0x2F8E8D1A // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3A1DFA00 // c1 (low single)
-	.long	0x3CC9392D // c2
-	.long	0x3EAB1889 // c3
-	.long	0x3C885D3B // c4
-	.long	0x3FC2C75C // B' = pi/2 - B (high single)
-	.long	0xB2CBBE8A // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3D49393C // c0 (high single)
-	.long	0x30A39F5B // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3B1E2B00 // c1 (low single)
-	.long	0x3D49B5D4 // c2
-	.long	0x3EAC4F10 // c3
-	.long	0x3CFD9425 // c4
-	.long	0x3FBFA31C // B' = pi/2 - B (high single)
-	.long	0x33450FB0 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3D9711CE // c0 (high single)
-	.long	0x314FEB28 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3BB24C00 // c1 (low single)
-	.long	0x3D97E43A // c2
-	.long	0x3EAE6A89 // c3
-	.long	0x3D4D07E0 // c4
-	.long	0x3FBC7EDD // B' = pi/2 - B (high single)
-	.long	0xB1800ADD // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3DC9B5DC // c0 (high single)
-	.long	0x3145AD86 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3C1EEF20 // c1 (low single)
-	.long	0x3DCBAAEA // c2
-	.long	0x3EB14E5E // c3
-	.long	0x3D858BB2 // c4
-	.long	0x3FB95A9E // B' = pi/2 - B (high single)
-	.long	0xB3651267 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3DFC98C2 // c0 (high single)
-	.long	0xB0AE525C // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3C793D20 // c1 (low single)
-	.long	0x3E003845 // c2
-	.long	0x3EB5271F // c3
-	.long	0x3DAC669E // c4
-	.long	0x3FB6365E // B' = pi/2 - B (high single)
-	.long	0x328BB91C // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3E17E564 // c0 (high single)
-	.long	0xB1C5A2E4 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3CB440D0 // c1 (low single)
-	.long	0x3E1B3D00 // c2
-	.long	0x3EB9F664 // c3
-	.long	0x3DD647C0 // c4
-	.long	0x3FB3121F // B' = pi/2 - B (high single)
-	.long	0xB30F347D // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3E31AE4D // c0 (high single)
-	.long	0xB1F32251 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3CF6A500 // c1 (low single)
-	.long	0x3E3707DA // c2
-	.long	0x3EBFA489 // c3
-	.long	0x3DFBD9C7 // c4
-	.long	0x3FAFEDDF // B' = pi/2 - B (high single)
-	.long	0x331BBA77 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3E4BAFAF // c0 (high single)
-	.long	0x2F2A29E0 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3D221018 // c1 (low single)
-	.long	0x3E53BED0 // c2
-	.long	0x3EC67E26 // c3
-	.long	0x3E1568E2 // c4
-	.long	0x3FACC9A0 // B' = pi/2 - B (high single)
-	.long	0xB2655A50 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3E65F267 // c0 (high single)
-	.long	0x31B4B1DF // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3D4E8B90 // c1 (low single)
-	.long	0x3E718ACA // c2
-	.long	0x3ECE7164 // c3
-	.long	0x3E2DC161 // c4
-	.long	0x3FA9A560 // B' = pi/2 - B (high single)
-	.long	0x33719861 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3E803FD4 // c0 (high single)
-	.long	0xB2279E66 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3D807FC8 // c1 (low single)
-	.long	0x3E884BD4 // c2
-	.long	0x3ED7812D // c3
-	.long	0x3E4636EB // c4
-	.long	0x3FA68121 // B' = pi/2 - B (high single)
-	.long	0x31E43AAC // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3E8DB082 // c0 (high single)
-	.long	0xB132A234 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3D9CD7D0 // c1 (low single)
-	.long	0x3E988A60 // c2
-	.long	0x3EE203E3 // c3
-	.long	0x3E63582C // c4
-	.long	0x3FA35CE2 // B' = pi/2 - B (high single)
-	.long	0xB33889B6 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3E9B5042 // c0 (high single)
-	.long	0xB22A3AEE // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3DBC7490 // c1 (low single)
-	.long	0x3EA99AF5 // c2
-	.long	0x3EEDE107 // c3
-	.long	0x3E80E9AA // c4
-	.long	0x3FA038A2 // B' = pi/2 - B (high single)
-	.long	0x32E4CA7E // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3EA92457 // c0 (high single)
-	.long	0x30B80830 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3DDF8200 // c1 (low single)
-	.long	0x3EBB99E9 // c2
-	.long	0x3EFB4AA8 // c3
-	.long	0x3E9182BE // c4
-	.long	0x3F9D1463 // B' = pi/2 - B (high single)
-	.long	0xB2C55799 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3EB73250 // c0 (high single)
-	.long	0xB2028823 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3E0318F8 // c1 (low single)
-	.long	0x3ECEA678 // c2
-	.long	0x3F053C67 // c3
-	.long	0x3EA41E53 // c4
-	.long	0x3F99F023 // B' = pi/2 - B (high single)
-	.long	0x33484328 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3EC5800D // c0 (high single)
-	.long	0xB214C3C1 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3E185E54 // c1 (low single)
-	.long	0x3EE2E342 // c2
-	.long	0x3F0DCA73 // c3
-	.long	0x3EB8CC21 // c4
-	.long	0x3F96CBE4 // B' = pi/2 - B (high single)
-	.long	0xB14CDE2E // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3ED413CD // c0 (high single)
-	.long	0xB1C06152 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3E2FB0CC // c1 (low single)
-	.long	0x3EF876CB // c2
-	.long	0x3F177807 // c3
-	.long	0x3ED08437 // c4
-	.long	0x3F93A7A5 // B' = pi/2 - B (high single)
-	.long	0xB361DEEE // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3EE2F439 // c0 (high single)
-	.long	0xB1F4399E // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3E49341C // c1 (low single)
-	.long	0x3F07C61A // c2
-	.long	0x3F22560F // c3
-	.long	0x3EEAA81E // c4
-	.long	0x3F908365 // B' = pi/2 - B (high single)
-	.long	0x3292200D // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3EF22870 // c0 (high single)
-	.long	0x325271F4 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3E65107A // c1 (low single)
-	.long	0x3F1429F0 // c2
-	.long	0x3F2E8AFC // c3
-	.long	0x3F040498 // c4
-	.long	0x3F8D5F26 // B' = pi/2 - B (high single)
-	.long	0xB30C0105 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3F00DC0D // c0 (high single)
-	.long	0xB214AF72 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3E81B994 // c1 (low single)
-	.long	0x3F218233 // c2
-	.long	0x3F3C4531 // c3
-	.long	0x3F149688 // c4
-	.long	0x3F8A3AE6 // B' = pi/2 - B (high single)
-	.long	0x331EEDF0 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3F08D5B9 // c0 (high single)
-	.long	0xB25EF98E // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3E92478D // c1 (low single)
-	.long	0x3F2FEDC9 // c2
-	.long	0x3F4BCD58 // c3
-	.long	0x3F27AE9E // c4
-	.long	0x3F8716A7 // B' = pi/2 - B (high single)
-	.long	0xB2588C6D // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3F1105AF // c0 (high single)
-	.long	0x32F045B0 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3EA44EE2 // c1 (low single)
-	.long	0x3F3F8FDB // c2
-	.long	0x3F5D3FD0 // c3
-	.long	0x3F3D0A23 // c4
-	.long	0x3F83F267 // B' = pi/2 - B (high single)
-	.long	0x3374CBD9 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3F1970C4 // c0 (high single)
-	.long	0x32904848 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3EB7EFF8 // c1 (low single)
-	.long	0x3F50907C // c2
-	.long	0x3F710FEA // c3
-	.long	0x3F561FED // c4
-	.long	0x3F80CE28 // B' = pi/2 - B (high single)
-	.long	0x31FDD672 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3F221C37 // c0 (high single)
-	.long	0xB20C61DC // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3ECD4F71 // c1 (low single)
-	.long	0x3F631DAA // c2
-	.long	0x3F83B471 // c3
-	.long	0x3F7281EA // c4
-	.long	0x3F7B53D1 // B' = pi/2 - B (high single)
-	.long	0x32955386 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3F2B0DC1 // c0 (high single)
-	.long	0x32AB7EBA // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3EE496C2 // c1 (low single)
-	.long	0x3F776C40 // c2
-	.long	0x3F9065C1 // c3
-	.long	0x3F89AFB6 // c4
-	.long	0x3F750B52 // B' = pi/2 - B (high single)
-	.long	0x32EB316F // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3F344BA9 // c0 (high single)
-	.long	0xB2B8B0EA // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3EFDF4F7 // c1 (low single)
-	.long	0x3F86DCA8 // c2
-	.long	0x3F9ED53B // c3
-	.long	0x3F9CBEDE // c4
-	.long	0x3F6EC2D4 // B' = pi/2 - B (high single)
-	.long	0xB2BEF0A7 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3F3DDCCF // c0 (high single)
-	.long	0x32D29606 // c0 (low single)
-	.long	0x40000000 // c1 (high 1 bit)
-	.long	0xBEE6606F // c1 (low single)
-	.long	0x3F9325D6 // c2
-	.long	0x3FAF4E69 // c3
-	.long	0x3FB3080C // c4
-	.long	0x3F687A55 // B' = pi/2 - B (high single)
-	.long	0xB252257B // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3F47C8CC // c0 (high single)
-	.long	0xB200F51A // c0 (low single)
-	.long	0x40000000 // c1 (high 1 bit)
-	.long	0xBEC82C6C // c1 (low single)
-	.long	0x3FA0BAE9 // c2
-	.long	0x3FC2252F // c3
-	.long	0x3FCD24C7 // c4
-	.long	0x3F6231D6 // B' = pi/2 - B (high single)
-	.long	0xB119A6A2 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3F521801 // c0 (high single)
-	.long	0x32AE4178 // c0 (low single)
-	.long	0x40000000 // c1 (high 1 bit)
-	.long	0xBEA72938 // c1 (low single)
-	.long	0x3FAFCC22 // c2
-	.long	0x3FD7BD4A // c3
-	.long	0x3FEBB01B // c4
-	.long	0x3F5BE957 // B' = pi/2 - B (high single)
-	.long	0x3205522A // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3F5CD3BE // c0 (high single)
-	.long	0x31460308 // c0 (low single)
-	.long	0x40000000 // c1 (high 1 bit)
-	.long	0xBE8306C5 // c1 (low single)
-	.long	0x3FC09232 // c2
-	.long	0x3FF09632 // c3
-	.long	0x4007DB00 // c4
-	.long	0x3F55A0D8 // B' = pi/2 - B (high single)
-	.long	0x329886FF // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3F68065E // c0 (high single)
-	.long	0x32670D1A // c0 (low single)
-	.long	0x40000000 // c1 (high 1 bit)
-	.long	0xBE36D1D6 // c1 (low single)
-	.long	0x3FD35007 // c2
-	.long	0x4006A861 // c3
-	.long	0x401D4BDA // c4
-	.long	0x3F4F5859 // B' = pi/2 - B (high single)
-	.long	0x32EE64E8 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3F73BB75 // c0 (high single)
-	.long	0x32FC908D // c0 (low single)
-	.long	0x40000000 // c1 (high 1 bit)
-	.long	0xBDBF94B0 // c1 (low single)
-	.long	0x3FE8550F // c2
-	.long	0x40174F67 // c3
-	.long	0x4036C608 // c4
-	.long	0x3F490FDB // B' = pi/2 - B (high single)
-	.long	0xB2BBBD2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE8BE60E // c0 (high single)
-	.long	0x320D8D84 // c0 (low single)
-	.long	0x3F000000 // c1 (high 1 bit)
-	.long	0xBDF817B1 // c1 (low single)
-	.long	0xBD8345EB // c2
-	.long	0x3D1DFDAC // c3
-	.long	0xBC52CF6F // c4
-	.long	0x3F42C75C // B' = pi/2 - B (high single)
-	.long	0xB24BBE8A // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE87283F // c0 (high single)
-	.long	0xB268B966 // c0 (low single)
-	.long	0x3F000000 // c1 (high 1 bit)
-	.long	0xBDFE6529 // c1 (low single)
-	.long	0xBD7B1953 // c2
-	.long	0x3D18E109 // c3
-	.long	0xBC4570B0 // c4
-	.long	0x3F3C7EDD // B' = pi/2 - B (high single)
-	.long	0xB1000ADD // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE827420 // c0 (high single)
-	.long	0x320B8B4D // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DFB9428 // c1 (low single)
-	.long	0xBD7002B4 // c2
-	.long	0x3D142A6C // c3
-	.long	0xBC3A47FF // c4
-	.long	0x3F36365E // B' = pi/2 - B (high single)
-	.long	0x320BB91C // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE7B9282 // c0 (high single)
-	.long	0xB13383D2 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DF5D211 // c1 (low single)
-	.long	0xBD6542B3 // c2
-	.long	0x3D0FE5E5 // c3
-	.long	0xBC31FB14 // c4
-	.long	0x3F2FEDDF // B' = pi/2 - B (high single)
-	.long	0x329BBA77 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE724E73 // c0 (high single)
-	.long	0x3120C3E2 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DF05283 // c1 (low single)
-	.long	0xBD5AD45E // c2
-	.long	0x3D0BAFBF // c3
-	.long	0xBC27B8BB // c4
-	.long	0x3F29A560 // B' = pi/2 - B (high single)
-	.long	0x32F19861 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE691B44 // c0 (high single)
-	.long	0x31F18936 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DEB138B // c1 (low single)
-	.long	0xBD50B2F7 // c2
-	.long	0x3D07BE3A // c3
-	.long	0xBC1E46A7 // c4
-	.long	0x3F235CE2 // B' = pi/2 - B (high single)
-	.long	0xB2B889B6 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE5FF82C // c0 (high single)
-	.long	0xB170723A // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DE61354 // c1 (low single)
-	.long	0xBD46DA06 // c2
-	.long	0x3D0401F8 // c3
-	.long	0xBC14E013 // c4
-	.long	0x3F1D1463 // B' = pi/2 - B (high single)
-	.long	0xB2455799 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE56E46B // c0 (high single)
-	.long	0x31E3F001 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DE15025 // c1 (low single)
-	.long	0xBD3D4550 // c2
-	.long	0x3D00462D // c3
-	.long	0xBC092C98 // c4
-	.long	0x3F16CBE4 // B' = pi/2 - B (high single)
-	.long	0xB0CCDE2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE4DDF41 // c0 (high single)
-	.long	0xB1AEA094 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DDCC85C // c1 (low single)
-	.long	0xBD33F0BE // c2
-	.long	0x3CFA23B0 // c3
-	.long	0xBC01FCF7 // c4
-	.long	0x3F108365 // B' = pi/2 - B (high single)
-	.long	0x3212200D // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE44E7F8 // c0 (high single)
-	.long	0xB1CAA3CB // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DD87A74 // c1 (low single)
-	.long	0xBD2AD885 // c2
-	.long	0x3CF3C785 // c3
-	.long	0xBBF1E348 // c4
-	.long	0x3F0A3AE6 // B' = pi/2 - B (high single)
-	.long	0x329EEDF0 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE3BFDDC // c0 (high single)
-	.long	0xB132521A // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DD464FC // c1 (low single)
-	.long	0xBD21F8F1 // c2
-	.long	0x3CEE3076 // c3
-	.long	0xBBE6D263 // c4
-	.long	0x3F03F267 // B' = pi/2 - B (high single)
-	.long	0x32F4CBD9 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE33203E // c0 (high single)
-	.long	0x31FEF5BE // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DD0869C // c1 (low single)
-	.long	0xBD194E8C // c2
-	.long	0x3CE8DCA9 // c3
-	.long	0xBBDADA55 // c4
-	.long	0x3EFB53D1 // B' = pi/2 - B (high single)
-	.long	0x32155386 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE2A4E71 // c0 (high single)
-	.long	0xB19CFCEC // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DCCDE11 // c1 (low single)
-	.long	0xBD10D605 // c2
-	.long	0x3CE382A7 // c3
-	.long	0xBBC8BD97 // c4
-	.long	0x3EEEC2D4 // B' = pi/2 - B (high single)
-	.long	0xB23EF0A7 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE2187D0 // c0 (high single)
-	.long	0xB1B7C7F7 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DC96A2B // c1 (low single)
-	.long	0xBD088C22 // c2
-	.long	0x3CDE950E // c3
-	.long	0xBBB89AD1 // c4
-	.long	0x3EE231D6 // B' = pi/2 - B (high single)
-	.long	0xB099A6A2 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE18CBB7 // c0 (high single)
-	.long	0xAFE28430 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DC629CE // c1 (low single)
-	.long	0xBD006DCD // c2
-	.long	0x3CDA5A2C // c3
-	.long	0xBBB0B3D2 // c4
-	.long	0x3ED5A0D8 // B' = pi/2 - B (high single)
-	.long	0x321886FF // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE101985 // c0 (high single)
-	.long	0xB02FB2B8 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DC31BF3 // c1 (low single)
-	.long	0xBCF0F04D // c2
-	.long	0x3CD60BC7 // c3
-	.long	0xBBA138BA // c4
-	.long	0x3EC90FDB // B' = pi/2 - B (high single)
-	.long	0xB23BBD2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE07709D // c0 (high single)
-	.long	0xB18A2A83 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DC03FA2 // c1 (low single)
-	.long	0xBCE15096 // c2
-	.long	0x3CD26472 // c3
-	.long	0xBB9A1270 // c4
-	.long	0x3EBC7EDD // B' = pi/2 - B (high single)
-	.long	0xB0800ADD // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBDFDA0CB // c0 (high single)
-	.long	0x2F14FCA0 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DBD93F7 // c1 (low single)
-	.long	0xBCD1F71B // c2
-	.long	0x3CCEDD2B // c3
-	.long	0xBB905946 // c4
-	.long	0x3EAFEDDF // B' = pi/2 - B (high single)
-	.long	0x321BBA77 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBDEC708C // c0 (high single)
-	.long	0xB14895C4 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DBB181E // c1 (low single)
-	.long	0xBCC2DEA6 // c2
-	.long	0x3CCB5027 // c3
-	.long	0xBB7F3969 // c4
-	.long	0x3EA35CE2 // B' = pi/2 - B (high single)
-	.long	0xB23889B6 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBDDB4F55 // c0 (high single)
-	.long	0x30F6437E // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DB8CB52 // c1 (low single)
-	.long	0xBCB40210 // c2
-	.long	0x3CC82D45 // c3
-	.long	0xBB643075 // c4
-	.long	0x3E96CBE4 // B' = pi/2 - B (high single)
-	.long	0xB04CDE2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBDCA3BFF // c0 (high single)
-	.long	0x311C95EA // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DB6ACDE // c1 (low single)
-	.long	0xBCA55C5B // c2
-	.long	0x3CC5BC04 // c3
-	.long	0xBB63A969 // c4
-	.long	0x3E8A3AE6 // B' = pi/2 - B (high single)
-	.long	0x321EEDF0 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBDB93569 // c0 (high single)
-	.long	0xAFB9ED00 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DB4BC1F // c1 (low single)
-	.long	0xBC96E905 // c2
-	.long	0x3CC2E6F5 // c3
-	.long	0xBB3E10A6 // c4
-	.long	0x3E7B53D1 // B' = pi/2 - B (high single)
-	.long	0x31955386 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBDA83A77 // c0 (high single)
-	.long	0x316D967A // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DB2F87C // c1 (low single)
-	.long	0xBC88A31F // c2
-	.long	0x3CC0E763 // c3
-	.long	0xBB3F1666 // c4
-	.long	0x3E6231D6 // B' = pi/2 - B (high single)
-	.long	0xB019A6A2 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBD974A0D // c0 (high single)
-	.long	0xB14F365B // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DB1616F // c1 (low single)
-	.long	0xBC750CD8 // c2
-	.long	0x3CBEB595 // c3
-	.long	0xBB22B883 // c4
-	.long	0x3E490FDB // B' = pi/2 - B (high single)
-	.long	0xB1BBBD2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBD866317 // c0 (high single)
-	.long	0xAFF02140 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DAFF67D // c1 (low single)
-	.long	0xBC591CD0 // c2
-	.long	0x3CBCBEAD // c3
-	.long	0xBB04BBEC // c4
-	.long	0x3E2FEDDF // B' = pi/2 - B (high single)
-	.long	0x319BBA77 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBD6B08FF // c0 (high single)
-	.long	0xB0EED236 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DAEB739 // c1 (low single)
-	.long	0xBC3D6D51 // c2
-	.long	0x3CBB485D // c3
-	.long	0xBAFFF5BA // c4
-	.long	0x3E16CBE4 // B' = pi/2 - B (high single)
-	.long	0xAFCCDE2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBD495A6C // c0 (high single)
-	.long	0xB0A427BD // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DADA345 // c1 (low single)
-	.long	0xBC21F648 // c2
-	.long	0x3CB9D1B4 // c3
-	.long	0xBACB5567 // c4
-	.long	0x3DFB53D1 // B' = pi/2 - B (high single)
-	.long	0x31155386 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBD27B856 // c0 (high single)
-	.long	0xB0F7EE91 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DACBA4E // c1 (low single)
-	.long	0xBC06AEE3 // c2
-	.long	0x3CB8E5DC // c3
-	.long	0xBAEC00EE // c4
-	.long	0x3DC90FDB // B' = pi/2 - B (high single)
-	.long	0xB13BBD2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBD0620A3 // c0 (high single)
-	.long	0xB0ECAB40 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DABFC11 // c1 (low single)
-	.long	0xBBD7200F // c2
-	.long	0x3CB79475 // c3
-	.long	0xBA2B0ADC // c4
-	.long	0x3D96CBE4 // B' = pi/2 - B (high single)
-	.long	0xAF4CDE2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBCC92278 // c0 (high single)
-	.long	0x302F2E68 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DAB6854 // c1 (low single)
-	.long	0xBBA1214F // c2
-	.long	0x3CB6C1E9 // c3
-	.long	0x3843C2F3 // c4
-	.long	0x3D490FDB // B' = pi/2 - B (high single)
-	.long	0xB0BBBD2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBC861015 // c0 (high single)
-	.long	0xAFD68E2E // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DAAFEEB // c1 (low single)
-	.long	0xBB569F3F // c2
-	.long	0x3CB6A84E // c3
-	.long	0xBAC64194 // c4
-	.long	0x3CC90FDB // B' = pi/2 - B (high single)
-	.long	0xB03BBD2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBC060BF3 // c0 (high single)
-	.long	0x2FE251AE // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DAABFB9 // c1 (low single)
-	.long	0xBAD67C60 // c2
-	.long	0x3CB64CA5 // c3
-	.long	0xBACDE881 // c4
-	.long	0x00000000 // B' = pi/2 - B (high single)
-	.long	0x00000000 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x00000000 // c0 (high single)
-	.long	0x00000000 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DAAAAAB // c1 (low single)
-	.long	0x00000000 // c2
-	.long	0x3CB5E28B // c3
-	.long	0x00000000 // c4
-	.long	0xBCC90FDB // B' = pi/2 - B (high single)
-	.long	0x303BBD2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3C060BF3 // c0 (high single)
-	.long	0xAFE251AE // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DAABFB9 // c1 (low single)
-	.long	0x3AD67C60 // c2
-	.long	0x3CB64CA5 // c3
-	.long	0x3ACDE881 // c4
-	.long	0xBD490FDB // B' = pi/2 - B (high single)
-	.long	0x30BBBD2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3C861015 // c0 (high single)
-	.long	0x2FD68E2E // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DAAFEEB // c1 (low single)
-	.long	0x3B569F3F // c2
-	.long	0x3CB6A84E // c3
-	.long	0x3AC64194 // c4
-	.long	0xBD96CBE4 // B' = pi/2 - B (high single)
-	.long	0x2F4CDE2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3CC92278 // c0 (high single)
-	.long	0xB02F2E68 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DAB6854 // c1 (low single)
-	.long	0x3BA1214F // c2
-	.long	0x3CB6C1E9 // c3
-	.long	0xB843C2F2 // c4
-	.long	0xBDC90FDB // B' = pi/2 - B (high single)
-	.long	0x313BBD2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3D0620A3 // c0 (high single)
-	.long	0x30ECAB40 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DABFC11 // c1 (low single)
-	.long	0x3BD7200F // c2
-	.long	0x3CB79475 // c3
-	.long	0x3A2B0ADC // c4
-	.long	0xBDFB53D1 // B' = pi/2 - B (high single)
-	.long	0xB1155386 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3D27B856 // c0 (high single)
-	.long	0x30F7EE91 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DACBA4E // c1 (low single)
-	.long	0x3C06AEE3 // c2
-	.long	0x3CB8E5DC // c3
-	.long	0x3AEC00EE // c4
-	.long	0xBE16CBE4 // B' = pi/2 - B (high single)
-	.long	0x2FCCDE2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3D495A6C // c0 (high single)
-	.long	0x30A427BD // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DADA345 // c1 (low single)
-	.long	0x3C21F648 // c2
-	.long	0x3CB9D1B4 // c3
-	.long	0x3ACB5567 // c4
-	.long	0xBE2FEDDF // B' = pi/2 - B (high single)
-	.long	0xB19BBA77 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3D6B08FF // c0 (high single)
-	.long	0x30EED236 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DAEB739 // c1 (low single)
-	.long	0x3C3D6D51 // c2
-	.long	0x3CBB485D // c3
-	.long	0x3AFFF5BA // c4
-	.long	0xBE490FDB // B' = pi/2 - B (high single)
-	.long	0x31BBBD2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3D866317 // c0 (high single)
-	.long	0x2FF02140 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DAFF67D // c1 (low single)
-	.long	0x3C591CD0 // c2
-	.long	0x3CBCBEAD // c3
-	.long	0x3B04BBEC // c4
-	.long	0xBE6231D6 // B' = pi/2 - B (high single)
-	.long	0x3019A6A2 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3D974A0D // c0 (high single)
-	.long	0x314F365B // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DB1616F // c1 (low single)
-	.long	0x3C750CD8 // c2
-	.long	0x3CBEB595 // c3
-	.long	0x3B22B883 // c4
-	.long	0xBE7B53D1 // B' = pi/2 - B (high single)
-	.long	0xB1955386 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3DA83A77 // c0 (high single)
-	.long	0xB16D967A // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DB2F87C // c1 (low single)
-	.long	0x3C88A31F // c2
-	.long	0x3CC0E763 // c3
-	.long	0x3B3F1666 // c4
-	.long	0xBE8A3AE6 // B' = pi/2 - B (high single)
-	.long	0xB21EEDF0 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3DB93569 // c0 (high single)
-	.long	0x2FB9ED00 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DB4BC1F // c1 (low single)
-	.long	0x3C96E905 // c2
-	.long	0x3CC2E6F5 // c3
-	.long	0x3B3E10A6 // c4
-	.long	0xBE96CBE4 // B' = pi/2 - B (high single)
-	.long	0x304CDE2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3DCA3BFF // c0 (high single)
-	.long	0xB11C95EA // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DB6ACDE // c1 (low single)
-	.long	0x3CA55C5B // c2
-	.long	0x3CC5BC04 // c3
-	.long	0x3B63A969 // c4
-	.long	0xBEA35CE2 // B' = pi/2 - B (high single)
-	.long	0x323889B6 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3DDB4F55 // c0 (high single)
-	.long	0xB0F6437E // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DB8CB52 // c1 (low single)
-	.long	0x3CB40210 // c2
-	.long	0x3CC82D45 // c3
-	.long	0x3B643075 // c4
-	.long	0xBEAFEDDF // B' = pi/2 - B (high single)
-	.long	0xB21BBA77 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3DEC708C // c0 (high single)
-	.long	0x314895C4 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DBB181E // c1 (low single)
-	.long	0x3CC2DEA6 // c2
-	.long	0x3CCB5027 // c3
-	.long	0x3B7F3969 // c4
-	.long	0xBEBC7EDD // B' = pi/2 - B (high single)
-	.long	0x30800ADD // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3DFDA0CB // c0 (high single)
-	.long	0xAF14FCA0 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DBD93F7 // c1 (low single)
-	.long	0x3CD1F71B // c2
-	.long	0x3CCEDD2B // c3
-	.long	0x3B905946 // c4
-	.long	0xBEC90FDB // B' = pi/2 - B (high single)
-	.long	0x323BBD2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E07709D // c0 (high single)
-	.long	0x318A2A83 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DC03FA2 // c1 (low single)
-	.long	0x3CE15096 // c2
-	.long	0x3CD26472 // c3
-	.long	0x3B9A1270 // c4
-	.long	0xBED5A0D8 // B' = pi/2 - B (high single)
-	.long	0xB21886FF // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E101985 // c0 (high single)
-	.long	0x302FB2B8 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DC31BF3 // c1 (low single)
-	.long	0x3CF0F04D // c2
-	.long	0x3CD60BC7 // c3
-	.long	0x3BA138BA // c4
-	.long	0xBEE231D6 // B' = pi/2 - B (high single)
-	.long	0x3099A6A2 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E18CBB7 // c0 (high single)
-	.long	0x2FE28430 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DC629CE // c1 (low single)
-	.long	0x3D006DCD // c2
-	.long	0x3CDA5A2C // c3
-	.long	0x3BB0B3D2 // c4
-	.long	0xBEEEC2D4 // B' = pi/2 - B (high single)
-	.long	0x323EF0A7 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E2187D0 // c0 (high single)
-	.long	0x31B7C7F7 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DC96A2B // c1 (low single)
-	.long	0x3D088C22 // c2
-	.long	0x3CDE950E // c3
-	.long	0x3BB89AD1 // c4
-	.long	0xBEFB53D1 // B' = pi/2 - B (high single)
-	.long	0xB2155386 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E2A4E71 // c0 (high single)
-	.long	0x319CFCEC // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DCCDE11 // c1 (low single)
-	.long	0x3D10D605 // c2
-	.long	0x3CE382A7 // c3
-	.long	0x3BC8BD97 // c4
-	.long	0xBF03F267 // B' = pi/2 - B (high single)
-	.long	0xB2F4CBD9 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E33203E // c0 (high single)
-	.long	0xB1FEF5BE // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DD0869C // c1 (low single)
-	.long	0x3D194E8C // c2
-	.long	0x3CE8DCA9 // c3
-	.long	0x3BDADA55 // c4
-	.long	0xBF0A3AE6 // B' = pi/2 - B (high single)
-	.long	0xB29EEDF0 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E3BFDDC // c0 (high single)
-	.long	0x3132521A // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DD464FC // c1 (low single)
-	.long	0x3D21F8F1 // c2
-	.long	0x3CEE3076 // c3
-	.long	0x3BE6D263 // c4
-	.long	0xBF108365 // B' = pi/2 - B (high single)
-	.long	0xB212200D // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E44E7F8 // c0 (high single)
-	.long	0x31CAA3CB // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DD87A74 // c1 (low single)
-	.long	0x3D2AD885 // c2
-	.long	0x3CF3C785 // c3
-	.long	0x3BF1E348 // c4
-	.long	0xBF16CBE4 // B' = pi/2 - B (high single)
-	.long	0x30CCDE2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E4DDF41 // c0 (high single)
-	.long	0x31AEA094 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DDCC85C // c1 (low single)
-	.long	0x3D33F0BE // c2
-	.long	0x3CFA23B0 // c3
-	.long	0x3C01FCF7 // c4
-	.long	0xBF1D1463 // B' = pi/2 - B (high single)
-	.long	0x32455799 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E56E46B // c0 (high single)
-	.long	0xB1E3F001 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DE15025 // c1 (low single)
-	.long	0x3D3D4550 // c2
-	.long	0x3D00462D // c3
-	.long	0x3C092C98 // c4
-	.long	0xBF235CE2 // B' = pi/2 - B (high single)
-	.long	0x32B889B6 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E5FF82C // c0 (high single)
-	.long	0x3170723A // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DE61354 // c1 (low single)
-	.long	0x3D46DA06 // c2
-	.long	0x3D0401F8 // c3
-	.long	0x3C14E013 // c4
-	.long	0xBF29A560 // B' = pi/2 - B (high single)
-	.long	0xB2F19861 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E691B44 // c0 (high single)
-	.long	0xB1F18936 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DEB138B // c1 (low single)
-	.long	0x3D50B2F7 // c2
-	.long	0x3D07BE3A // c3
-	.long	0x3C1E46A7 // c4
-	.long	0xBF2FEDDF // B' = pi/2 - B (high single)
-	.long	0xB29BBA77 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E724E73 // c0 (high single)
-	.long	0xB120C3E2 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DF05283 // c1 (low single)
-	.long	0x3D5AD45E // c2
-	.long	0x3D0BAFBF // c3
-	.long	0x3C27B8BB // c4
-	.long	0xBF36365E // B' = pi/2 - B (high single)
-	.long	0xB20BB91C // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E7B9282 // c0 (high single)
-	.long	0x313383D2 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DF5D211 // c1 (low single)
-	.long	0x3D6542B3 // c2
-	.long	0x3D0FE5E5 // c3
-	.long	0x3C31FB14 // c4
-	.long	0xBF3C7EDD // B' = pi/2 - B (high single)
-	.long	0x31000ADD // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E827420 // c0 (high single)
-	.long	0xB20B8B4D // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DFB9428 // c1 (low single)
-	.long	0x3D7002B4 // c2
-	.long	0x3D142A6C // c3
-	.long	0x3C3A47FF // c4
-	.long	0xBF42C75C // B' = pi/2 - B (high single)
-	.long	0x324BBE8A // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E87283F // c0 (high single)
-	.long	0x3268B966 // c0 (low single)
-	.long	0x3F000000 // c1 (high 1 bit)
-	.long	0xBDFE6529 // c1 (low single)
-	.long	0x3D7B1953 // c2
-	.long	0x3D18E109 // c3
-	.long	0x3C4570B0 // c4
-	.long	0xBF490FDB // B' = pi/2 - B (high single)
-	.long	0x32BBBD2E // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBF800000 // c0 (high single)
-	.long	0x2B410000 // c0 (low single)
-	.long	0x40000000 // c1 (high 1 bit)
-	.long	0xB3000000 // c1 (low single)
-	.long	0xC0000000 // c2
-	.long	0x402AB7C8 // c3
-	.long	0xC05561DB // c4
-	.long	0xBF4F5859 // B' = pi/2 - B (high single)
-	.long	0xB2EE64E8 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBF73BB75 // c0 (high single)
-	.long	0xB2FC908D // c0 (low single)
-	.long	0x40000000 // c1 (high 1 bit)
-	.long	0xBDBF94B0 // c1 (low single)
-	.long	0xBFE8550F // c2
-	.long	0x40174F67 // c3
-	.long	0xC036C608 // c4
-	.long	0xBF55A0D8 // B' = pi/2 - B (high single)
-	.long	0xB29886FF // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBF68065E // c0 (high single)
-	.long	0xB2670D1A // c0 (low single)
-	.long	0x40000000 // c1 (high 1 bit)
-	.long	0xBE36D1D6 // c1 (low single)
-	.long	0xBFD35007 // c2
-	.long	0x4006A861 // c3
-	.long	0xC01D4BDA // c4
-	.long	0xBF5BE957 // B' = pi/2 - B (high single)
-	.long	0xB205522A // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBF5CD3BE // c0 (high single)
-	.long	0xB1460308 // c0 (low single)
-	.long	0x40000000 // c1 (high 1 bit)
-	.long	0xBE8306C5 // c1 (low single)
-	.long	0xBFC09232 // c2
-	.long	0x3FF09632 // c3
-	.long	0xC007DB00 // c4
-	.long	0xBF6231D6 // B' = pi/2 - B (high single)
-	.long	0x3119A6A2 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBF521801 // c0 (high single)
-	.long	0xB2AE4178 // c0 (low single)
-	.long	0x40000000 // c1 (high 1 bit)
-	.long	0xBEA72938 // c1 (low single)
-	.long	0xBFAFCC22 // c2
-	.long	0x3FD7BD4A // c3
-	.long	0xBFEBB01B // c4
-	.long	0xBF687A55 // B' = pi/2 - B (high single)
-	.long	0x3252257B // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBF47C8CC // c0 (high single)
-	.long	0x3200F51A // c0 (low single)
-	.long	0x40000000 // c1 (high 1 bit)
-	.long	0xBEC82C6C // c1 (low single)
-	.long	0xBFA0BAE9 // c2
-	.long	0x3FC2252F // c3
-	.long	0xBFCD24C7 // c4
-	.long	0xBF6EC2D4 // B' = pi/2 - B (high single)
-	.long	0x32BEF0A7 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBF3DDCCF // c0 (high single)
-	.long	0xB2D29606 // c0 (low single)
-	.long	0x40000000 // c1 (high 1 bit)
-	.long	0xBEE6606F // c1 (low single)
-	.long	0xBF9325D6 // c2
-	.long	0x3FAF4E69 // c3
-	.long	0xBFB3080C // c4
-	.long	0xBF750B52 // B' = pi/2 - B (high single)
-	.long	0xB2EB316F // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBF344BA9 // c0 (high single)
-	.long	0x32B8B0EA // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3EFDF4F7 // c1 (low single)
-	.long	0xBF86DCA8 // c2
-	.long	0x3F9ED53B // c3
-	.long	0xBF9CBEDE // c4
-	.long	0xBF7B53D1 // B' = pi/2 - B (high single)
-	.long	0xB2955386 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBF2B0DC1 // c0 (high single)
-	.long	0xB2AB7EBA // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3EE496C2 // c1 (low single)
-	.long	0xBF776C40 // c2
-	.long	0x3F9065C1 // c3
-	.long	0xBF89AFB6 // c4
-	.long	0xBF80CE28 // B' = pi/2 - B (high single)
-	.long	0xB1FDD672 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBF221C37 // c0 (high single)
-	.long	0x320C61DC // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3ECD4F71 // c1 (low single)
-	.long	0xBF631DAA // c2
-	.long	0x3F83B471 // c3
-	.long	0xBF7281EA // c4
-	.long	0xBF83F267 // B' = pi/2 - B (high single)
-	.long	0xB374CBD9 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBF1970C4 // c0 (high single)
-	.long	0xB2904848 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3EB7EFF8 // c1 (low single)
-	.long	0xBF50907C // c2
-	.long	0x3F710FEA // c3
-	.long	0xBF561FED // c4
-	.long	0xBF8716A7 // B' = pi/2 - B (high single)
-	.long	0x32588C6D // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBF1105AF // c0 (high single)
-	.long	0xB2F045B0 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3EA44EE2 // c1 (low single)
-	.long	0xBF3F8FDB // c2
-	.long	0x3F5D3FD0 // c3
-	.long	0xBF3D0A23 // c4
-	.long	0xBF8A3AE6 // B' = pi/2 - B (high single)
-	.long	0xB31EEDF0 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBF08D5B9 // c0 (high single)
-	.long	0x325EF98E // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3E92478D // c1 (low single)
-	.long	0xBF2FEDC9 // c2
-	.long	0x3F4BCD58 // c3
-	.long	0xBF27AE9E // c4
-	.long	0xBF8D5F26 // B' = pi/2 - B (high single)
-	.long	0x330C0105 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBF00DC0D // c0 (high single)
-	.long	0x3214AF72 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3E81B994 // c1 (low single)
-	.long	0xBF218233 // c2
-	.long	0x3F3C4531 // c3
-	.long	0xBF149688 // c4
-	.long	0xBF908365 // B' = pi/2 - B (high single)
-	.long	0xB292200D // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBEF22870 // c0 (high single)
-	.long	0xB25271F4 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3E65107A // c1 (low single)
-	.long	0xBF1429F0 // c2
-	.long	0x3F2E8AFC // c3
-	.long	0xBF040498 // c4
-	.long	0xBF93A7A5 // B' = pi/2 - B (high single)
-	.long	0x3361DEEE // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBEE2F439 // c0 (high single)
-	.long	0x31F4399E // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3E49341C // c1 (low single)
-	.long	0xBF07C61A // c2
-	.long	0x3F22560F // c3
-	.long	0xBEEAA81E // c4
-	.long	0xBF96CBE4 // B' = pi/2 - B (high single)
-	.long	0x314CDE2E // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBED413CD // c0 (high single)
-	.long	0x31C06152 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3E2FB0CC // c1 (low single)
-	.long	0xBEF876CB // c2
-	.long	0x3F177807 // c3
-	.long	0xBED08437 // c4
-	.long	0xBF99F023 // B' = pi/2 - B (high single)
-	.long	0xB3484328 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBEC5800D // c0 (high single)
-	.long	0x3214C3C1 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3E185E54 // c1 (low single)
-	.long	0xBEE2E342 // c2
-	.long	0x3F0DCA73 // c3
-	.long	0xBEB8CC21 // c4
-	.long	0xBF9D1463 // B' = pi/2 - B (high single)
-	.long	0x32C55799 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBEB73250 // c0 (high single)
-	.long	0x32028823 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3E0318F8 // c1 (low single)
-	.long	0xBECEA678 // c2
-	.long	0x3F053C67 // c3
-	.long	0xBEA41E53 // c4
-	.long	0xBFA038A2 // B' = pi/2 - B (high single)
-	.long	0xB2E4CA7E // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBEA92457 // c0 (high single)
-	.long	0xB0B80830 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3DDF8200 // c1 (low single)
-	.long	0xBEBB99E9 // c2
-	.long	0x3EFB4AA8 // c3
-	.long	0xBE9182BE // c4
-	.long	0xBFA35CE2 // B' = pi/2 - B (high single)
-	.long	0x333889B6 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBE9B5042 // c0 (high single)
-	.long	0x322A3AEE // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3DBC7490 // c1 (low single)
-	.long	0xBEA99AF5 // c2
-	.long	0x3EEDE107 // c3
-	.long	0xBE80E9AA // c4
-	.long	0xBFA68121 // B' = pi/2 - B (high single)
-	.long	0xB1E43AAC // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBE8DB082 // c0 (high single)
-	.long	0x3132A234 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3D9CD7D0 // c1 (low single)
-	.long	0xBE988A60 // c2
-	.long	0x3EE203E3 // c3
-	.long	0xBE63582C // c4
-	.long	0xBFA9A560 // B' = pi/2 - B (high single)
-	.long	0xB3719861 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBE803FD4 // c0 (high single)
-	.long	0x32279E66 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3D807FC8 // c1 (low single)
-	.long	0xBE884BD4 // c2
-	.long	0x3ED7812D // c3
-	.long	0xBE4636EB // c4
-	.long	0xBFACC9A0 // B' = pi/2 - B (high single)
-	.long	0x32655A50 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBE65F267 // c0 (high single)
-	.long	0xB1B4B1DF // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3D4E8B90 // c1 (low single)
-	.long	0xBE718ACA // c2
-	.long	0x3ECE7164 // c3
-	.long	0xBE2DC161 // c4
-	.long	0xBFAFEDDF // B' = pi/2 - B (high single)
-	.long	0xB31BBA77 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBE4BAFAF // c0 (high single)
-	.long	0xAF2A29E0 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3D221018 // c1 (low single)
-	.long	0xBE53BED0 // c2
-	.long	0x3EC67E26 // c3
-	.long	0xBE1568E2 // c4
-	.long	0xBFB3121F // B' = pi/2 - B (high single)
-	.long	0x330F347D // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBE31AE4D // c0 (high single)
-	.long	0x31F32251 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3CF6A500 // c1 (low single)
-	.long	0xBE3707DA // c2
-	.long	0x3EBFA489 // c3
-	.long	0xBDFBD9C7 // c4
-	.long	0xBFB6365E // B' = pi/2 - B (high single)
-	.long	0xB28BB91C // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBE17E564 // c0 (high single)
-	.long	0x31C5A2E4 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3CB440D0 // c1 (low single)
-	.long	0xBE1B3D00 // c2
-	.long	0x3EB9F664 // c3
-	.long	0xBDD647C0 // c4
-	.long	0xBFB95A9E // B' = pi/2 - B (high single)
-	.long	0x33651267 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBDFC98C2 // c0 (high single)
-	.long	0x30AE525C // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3C793D20 // c1 (low single)
-	.long	0xBE003845 // c2
-	.long	0x3EB5271F // c3
-	.long	0xBDAC669E // c4
-	.long	0xBFBC7EDD // B' = pi/2 - B (high single)
-	.long	0x31800ADD // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBDC9B5DC // c0 (high single)
-	.long	0xB145AD86 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3C1EEF20 // c1 (low single)
-	.long	0xBDCBAAEA // c2
-	.long	0x3EB14E5E // c3
-	.long	0xBD858BB2 // c4
-	.long	0xBFBFA31C // B' = pi/2 - B (high single)
-	.long	0xB3450FB0 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBD9711CE // c0 (high single)
-	.long	0xB14FEB28 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3BB24C00 // c1 (low single)
-	.long	0xBD97E43A // c2
-	.long	0x3EAE6A89 // c3
-	.long	0xBD4D07E0 // c4
-	.long	0xBFC2C75C // B' = pi/2 - B (high single)
-	.long	0x32CBBE8A // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBD49393C // c0 (high single)
-	.long	0xB0A39F5B // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3B1E2B00 // c1 (low single)
-	.long	0xBD49B5D4 // c2
-	.long	0x3EAC4F10 // c3
-	.long	0xBCFD9425 // c4
-	.long	0xBFC5EB9B // B' = pi/2 - B (high single)
-	.long	0xB2DE638C // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBCC91A31 // c0 (high single)
-	.long	0xAF8E8D1A // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3A1DFA00 // c1 (low single)
-	.long	0xBCC9392D // c2
-	.long	0x3EAB1889 // c3
-	.long	0xBC885D3B // c4
-	.align	16
-	.type	__svml_stan_data_internal, @object
-	.size	__svml_stan_data_internal, .-__svml_stan_data_internal
-	.space	16, 0x00
-	.align	16
-
-#ifdef __svml_stan_reduction_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(16)) VUINT32 _sPtable[256][3][1];
-} __svml_stan_reduction_data_internal;
-#endif
-__svml_stan_reduction_data_internal:
-	/*     P_hi                  P_med               P_lo                */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 0 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 1 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 2 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 3 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 4 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 5 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 6 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 7 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 8 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 9 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 10 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 11 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 12 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 13 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 14 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 15 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 16 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 17 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 18 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 19 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 20 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 21 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 22 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 23 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 24 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 25 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 26 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 27 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 28 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 29 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 30 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 31 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 32 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 33 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 34 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 35 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 36 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 37 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 38 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 39 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 40 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 41 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 42 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 43 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 44 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 45 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 46 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 47 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 48 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 49 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 50 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 51 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 52 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 53 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 54 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 55 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 56 */
-	.long	0x00000000, 0x00000000, 0x00000001 /* 57 */
-	.long	0x00000000, 0x00000000, 0x00000002 /* 58 */
-	.long	0x00000000, 0x00000000, 0x00000005 /* 59 */
-	.long	0x00000000, 0x00000000, 0x0000000A /* 60 */
-	.long	0x00000000, 0x00000000, 0x00000014 /* 61 */
-	.long	0x00000000, 0x00000000, 0x00000028 /* 62 */
-	.long	0x00000000, 0x00000000, 0x00000051 /* 63 */
-	.long	0x00000000, 0x00000000, 0x000000A2 /* 64 */
-	.long	0x00000000, 0x00000000, 0x00000145 /* 65 */
-	.long	0x00000000, 0x00000000, 0x0000028B /* 66 */
-	.long	0x00000000, 0x00000000, 0x00000517 /* 67 */
-	.long	0x00000000, 0x00000000, 0x00000A2F /* 68 */
-	.long	0x00000000, 0x00000000, 0x0000145F /* 69 */
-	.long	0x00000000, 0x00000000, 0x000028BE /* 70 */
-	.long	0x00000000, 0x00000000, 0x0000517C /* 71 */
-	.long	0x00000000, 0x00000000, 0x0000A2F9 /* 72 */
-	.long	0x00000000, 0x00000000, 0x000145F3 /* 73 */
-	.long	0x00000000, 0x00000000, 0x00028BE6 /* 74 */
-	.long	0x00000000, 0x00000000, 0x000517CC /* 75 */
-	.long	0x00000000, 0x00000000, 0x000A2F98 /* 76 */
-	.long	0x00000000, 0x00000000, 0x00145F30 /* 77 */
-	.long	0x00000000, 0x00000000, 0x0028BE60 /* 78 */
-	.long	0x00000000, 0x00000000, 0x00517CC1 /* 79 */
-	.long	0x00000000, 0x00000000, 0x00A2F983 /* 80 */
-	.long	0x00000000, 0x00000000, 0x0145F306 /* 81 */
-	.long	0x00000000, 0x00000000, 0x028BE60D /* 82 */
-	.long	0x00000000, 0x00000000, 0x0517CC1B /* 83 */
-	.long	0x00000000, 0x00000000, 0x0A2F9836 /* 84 */
-	.long	0x00000000, 0x00000000, 0x145F306D /* 85 */
-	.long	0x00000000, 0x00000000, 0x28BE60DB /* 86 */
-	.long	0x00000000, 0x00000000, 0x517CC1B7 /* 87 */
-	.long	0x00000000, 0x00000000, 0xA2F9836E /* 88 */
-	.long	0x00000000, 0x00000001, 0x45F306DC /* 89 */
-	.long	0x00000000, 0x00000002, 0x8BE60DB9 /* 90 */
-	.long	0x00000000, 0x00000005, 0x17CC1B72 /* 91 */
-	.long	0x00000000, 0x0000000A, 0x2F9836E4 /* 92 */
-	.long	0x00000000, 0x00000014, 0x5F306DC9 /* 93 */
-	.long	0x00000000, 0x00000028, 0xBE60DB93 /* 94 */
-	.long	0x00000000, 0x00000051, 0x7CC1B727 /* 95 */
-	.long	0x00000000, 0x000000A2, 0xF9836E4E /* 96 */
-	.long	0x00000000, 0x00000145, 0xF306DC9C /* 97 */
-	.long	0x00000000, 0x0000028B, 0xE60DB939 /* 98 */
-	.long	0x00000000, 0x00000517, 0xCC1B7272 /* 99 */
-	.long	0x00000000, 0x00000A2F, 0x9836E4E4 /* 100 */
-	.long	0x00000000, 0x0000145F, 0x306DC9C8 /* 101 */
-	.long	0x00000000, 0x000028BE, 0x60DB9391 /* 102 */
-	.long	0x00000000, 0x0000517C, 0xC1B72722 /* 103 */
-	.long	0x00000000, 0x0000A2F9, 0x836E4E44 /* 104 */
-	.long	0x00000000, 0x000145F3, 0x06DC9C88 /* 105 */
-	.long	0x00000000, 0x00028BE6, 0x0DB93910 /* 106 */
-	.long	0x00000000, 0x000517CC, 0x1B727220 /* 107 */
-	.long	0x00000000, 0x000A2F98, 0x36E4E441 /* 108 */
-	.long	0x00000000, 0x00145F30, 0x6DC9C882 /* 109 */
-	.long	0x00000000, 0x0028BE60, 0xDB939105 /* 110 */
-	.long	0x00000000, 0x00517CC1, 0xB727220A /* 111 */
-	.long	0x00000000, 0x00A2F983, 0x6E4E4415 /* 112 */
-	.long	0x00000000, 0x0145F306, 0xDC9C882A /* 113 */
-	.long	0x00000000, 0x028BE60D, 0xB9391054 /* 114 */
-	.long	0x00000000, 0x0517CC1B, 0x727220A9 /* 115 */
-	.long	0x00000000, 0x0A2F9836, 0xE4E44152 /* 116 */
-	.long	0x00000000, 0x145F306D, 0xC9C882A5 /* 117 */
-	.long	0x00000000, 0x28BE60DB, 0x9391054A /* 118 */
-	.long	0x00000000, 0x517CC1B7, 0x27220A94 /* 119 */
-	.long	0x00000000, 0xA2F9836E, 0x4E441529 /* 120 */
-	.long	0x00000001, 0x45F306DC, 0x9C882A53 /* 121 */
-	.long	0x00000002, 0x8BE60DB9, 0x391054A7 /* 122 */
-	.long	0x00000005, 0x17CC1B72, 0x7220A94F /* 123 */
-	.long	0x0000000A, 0x2F9836E4, 0xE441529F /* 124 */
-	.long	0x00000014, 0x5F306DC9, 0xC882A53F /* 125 */
-	.long	0x00000028, 0xBE60DB93, 0x91054A7F /* 126 */
-	.long	0x00000051, 0x7CC1B727, 0x220A94FE /* 127 */
-	.long	0x000000A2, 0xF9836E4E, 0x441529FC /* 128 */
-	.long	0x00000145, 0xF306DC9C, 0x882A53F8 /* 129 */
-	.long	0x0000028B, 0xE60DB939, 0x1054A7F0 /* 130 */
-	.long	0x00000517, 0xCC1B7272, 0x20A94FE1 /* 131 */
-	.long	0x00000A2F, 0x9836E4E4, 0x41529FC2 /* 132 */
-	.long	0x0000145F, 0x306DC9C8, 0x82A53F84 /* 133 */
-	.long	0x000028BE, 0x60DB9391, 0x054A7F09 /* 134 */
-	.long	0x0000517C, 0xC1B72722, 0x0A94FE13 /* 135 */
-	.long	0x0000A2F9, 0x836E4E44, 0x1529FC27 /* 136 */
-	.long	0x000145F3, 0x06DC9C88, 0x2A53F84E /* 137 */
-	.long	0x00028BE6, 0x0DB93910, 0x54A7F09D /* 138 */
-	.long	0x000517CC, 0x1B727220, 0xA94FE13A /* 139 */
-	.long	0x000A2F98, 0x36E4E441, 0x529FC275 /* 140 */
-	.long	0x00145F30, 0x6DC9C882, 0xA53F84EA /* 141 */
-	.long	0x0028BE60, 0xDB939105, 0x4A7F09D5 /* 142 */
-	.long	0x00517CC1, 0xB727220A, 0x94FE13AB /* 143 */
-	.long	0x00A2F983, 0x6E4E4415, 0x29FC2757 /* 144 */
-	.long	0x0145F306, 0xDC9C882A, 0x53F84EAF /* 145 */
-	.long	0x028BE60D, 0xB9391054, 0xA7F09D5F /* 146 */
-	.long	0x0517CC1B, 0x727220A9, 0x4FE13ABE /* 147 */
-	.long	0x0A2F9836, 0xE4E44152, 0x9FC2757D /* 148 */
-	.long	0x145F306D, 0xC9C882A5, 0x3F84EAFA /* 149 */
-	.long	0x28BE60DB, 0x9391054A, 0x7F09D5F4 /* 150 */
-	.long	0x517CC1B7, 0x27220A94, 0xFE13ABE8 /* 151 */
-	.long	0xA2F9836E, 0x4E441529, 0xFC2757D1 /* 152 */
-	.long	0x45F306DC, 0x9C882A53, 0xF84EAFA3 /* 153 */
-	.long	0x8BE60DB9, 0x391054A7, 0xF09D5F47 /* 154 */
-	.long	0x17CC1B72, 0x7220A94F, 0xE13ABE8F /* 155 */
-	.long	0x2F9836E4, 0xE441529F, 0xC2757D1F /* 156 */
-	.long	0x5F306DC9, 0xC882A53F, 0x84EAFA3E /* 157 */
-	.long	0xBE60DB93, 0x91054A7F, 0x09D5F47D /* 158 */
-	.long	0x7CC1B727, 0x220A94FE, 0x13ABE8FA /* 159 */
-	.long	0xF9836E4E, 0x441529FC, 0x2757D1F5 /* 160 */
-	.long	0xF306DC9C, 0x882A53F8, 0x4EAFA3EA /* 161 */
-	.long	0xE60DB939, 0x1054A7F0, 0x9D5F47D4 /* 162 */
-	.long	0xCC1B7272, 0x20A94FE1, 0x3ABE8FA9 /* 163 */
-	.long	0x9836E4E4, 0x41529FC2, 0x757D1F53 /* 164 */
-	.long	0x306DC9C8, 0x82A53F84, 0xEAFA3EA6 /* 165 */
-	.long	0x60DB9391, 0x054A7F09, 0xD5F47D4D /* 166 */
-	.long	0xC1B72722, 0x0A94FE13, 0xABE8FA9A /* 167 */
-	.long	0x836E4E44, 0x1529FC27, 0x57D1F534 /* 168 */
-	.long	0x06DC9C88, 0x2A53F84E, 0xAFA3EA69 /* 169 */
-	.long	0x0DB93910, 0x54A7F09D, 0x5F47D4D3 /* 170 */
-	.long	0x1B727220, 0xA94FE13A, 0xBE8FA9A6 /* 171 */
-	.long	0x36E4E441, 0x529FC275, 0x7D1F534D /* 172 */
-	.long	0x6DC9C882, 0xA53F84EA, 0xFA3EA69B /* 173 */
-	.long	0xDB939105, 0x4A7F09D5, 0xF47D4D37 /* 174 */
-	.long	0xB727220A, 0x94FE13AB, 0xE8FA9A6E /* 175 */
-	.long	0x6E4E4415, 0x29FC2757, 0xD1F534DD /* 176 */
-	.long	0xDC9C882A, 0x53F84EAF, 0xA3EA69BB /* 177 */
-	.long	0xB9391054, 0xA7F09D5F, 0x47D4D377 /* 178 */
-	.long	0x727220A9, 0x4FE13ABE, 0x8FA9A6EE /* 179 */
-	.long	0xE4E44152, 0x9FC2757D, 0x1F534DDC /* 180 */
-	.long	0xC9C882A5, 0x3F84EAFA, 0x3EA69BB8 /* 181 */
-	.long	0x9391054A, 0x7F09D5F4, 0x7D4D3770 /* 182 */
-	.long	0x27220A94, 0xFE13ABE8, 0xFA9A6EE0 /* 183 */
-	.long	0x4E441529, 0xFC2757D1, 0xF534DDC0 /* 184 */
-	.long	0x9C882A53, 0xF84EAFA3, 0xEA69BB81 /* 185 */
-	.long	0x391054A7, 0xF09D5F47, 0xD4D37703 /* 186 */
-	.long	0x7220A94F, 0xE13ABE8F, 0xA9A6EE06 /* 187 */
-	.long	0xE441529F, 0xC2757D1F, 0x534DDC0D /* 188 */
-	.long	0xC882A53F, 0x84EAFA3E, 0xA69BB81B /* 189 */
-	.long	0x91054A7F, 0x09D5F47D, 0x4D377036 /* 190 */
-	.long	0x220A94FE, 0x13ABE8FA, 0x9A6EE06D /* 191 */
-	.long	0x441529FC, 0x2757D1F5, 0x34DDC0DB /* 192 */
-	.long	0x882A53F8, 0x4EAFA3EA, 0x69BB81B6 /* 193 */
-	.long	0x1054A7F0, 0x9D5F47D4, 0xD377036D /* 194 */
-	.long	0x20A94FE1, 0x3ABE8FA9, 0xA6EE06DB /* 195 */
-	.long	0x41529FC2, 0x757D1F53, 0x4DDC0DB6 /* 196 */
-	.long	0x82A53F84, 0xEAFA3EA6, 0x9BB81B6C /* 197 */
-	.long	0x054A7F09, 0xD5F47D4D, 0x377036D8 /* 198 */
-	.long	0x0A94FE13, 0xABE8FA9A, 0x6EE06DB1 /* 199 */
-	.long	0x1529FC27, 0x57D1F534, 0xDDC0DB62 /* 200 */
-	.long	0x2A53F84E, 0xAFA3EA69, 0xBB81B6C5 /* 201 */
-	.long	0x54A7F09D, 0x5F47D4D3, 0x77036D8A /* 202 */
-	.long	0xA94FE13A, 0xBE8FA9A6, 0xEE06DB14 /* 203 */
-	.long	0x529FC275, 0x7D1F534D, 0xDC0DB629 /* 204 */
-	.long	0xA53F84EA, 0xFA3EA69B, 0xB81B6C52 /* 205 */
-	.long	0x4A7F09D5, 0xF47D4D37, 0x7036D8A5 /* 206 */
-	.long	0x94FE13AB, 0xE8FA9A6E, 0xE06DB14A /* 207 */
-	.long	0x29FC2757, 0xD1F534DD, 0xC0DB6295 /* 208 */
-	.long	0x53F84EAF, 0xA3EA69BB, 0x81B6C52B /* 209 */
-	.long	0xA7F09D5F, 0x47D4D377, 0x036D8A56 /* 210 */
-	.long	0x4FE13ABE, 0x8FA9A6EE, 0x06DB14AC /* 211 */
-	.long	0x9FC2757D, 0x1F534DDC, 0x0DB62959 /* 212 */
-	.long	0x3F84EAFA, 0x3EA69BB8, 0x1B6C52B3 /* 213 */
-	.long	0x7F09D5F4, 0x7D4D3770, 0x36D8A566 /* 214 */
-	.long	0xFE13ABE8, 0xFA9A6EE0, 0x6DB14ACC /* 215 */
-	.long	0xFC2757D1, 0xF534DDC0, 0xDB629599 /* 216 */
-	.long	0xF84EAFA3, 0xEA69BB81, 0xB6C52B32 /* 217 */
-	.long	0xF09D5F47, 0xD4D37703, 0x6D8A5664 /* 218 */
-	.long	0xE13ABE8F, 0xA9A6EE06, 0xDB14ACC9 /* 219 */
-	.long	0xC2757D1F, 0x534DDC0D, 0xB6295993 /* 220 */
-	.long	0x84EAFA3E, 0xA69BB81B, 0x6C52B327 /* 221 */
-	.long	0x09D5F47D, 0x4D377036, 0xD8A5664F /* 222 */
-	.long	0x13ABE8FA, 0x9A6EE06D, 0xB14ACC9E /* 223 */
-	.long	0x2757D1F5, 0x34DDC0DB, 0x6295993C /* 224 */
-	.long	0x4EAFA3EA, 0x69BB81B6, 0xC52B3278 /* 225 */
-	.long	0x9D5F47D4, 0xD377036D, 0x8A5664F1 /* 226 */
-	.long	0x3ABE8FA9, 0xA6EE06DB, 0x14ACC9E2 /* 227 */
-	.long	0x757D1F53, 0x4DDC0DB6, 0x295993C4 /* 228 */
-	.long	0xEAFA3EA6, 0x9BB81B6C, 0x52B32788 /* 229 */
-	.long	0xD5F47D4D, 0x377036D8, 0xA5664F10 /* 230 */
-	.long	0xABE8FA9A, 0x6EE06DB1, 0x4ACC9E21 /* 231 */
-	.long	0x57D1F534, 0xDDC0DB62, 0x95993C43 /* 232 */
-	.long	0xAFA3EA69, 0xBB81B6C5, 0x2B327887 /* 233 */
-	.long	0x5F47D4D3, 0x77036D8A, 0x5664F10E /* 234 */
-	.long	0xBE8FA9A6, 0xEE06DB14, 0xACC9E21C /* 235 */
-	.long	0x7D1F534D, 0xDC0DB629, 0x5993C439 /* 236 */
-	.long	0xFA3EA69B, 0xB81B6C52, 0xB3278872 /* 237 */
-	.long	0xF47D4D37, 0x7036D8A5, 0x664F10E4 /* 238 */
-	.long	0xE8FA9A6E, 0xE06DB14A, 0xCC9E21C8 /* 239 */
-	.long	0xD1F534DD, 0xC0DB6295, 0x993C4390 /* 240 */
-	.long	0xA3EA69BB, 0x81B6C52B, 0x32788720 /* 241 */
-	.long	0x47D4D377, 0x036D8A56, 0x64F10E41 /* 242 */
-	.long	0x8FA9A6EE, 0x06DB14AC, 0xC9E21C82 /* 243 */
-	.long	0x1F534DDC, 0x0DB62959, 0x93C43904 /* 244 */
-	.long	0x3EA69BB8, 0x1B6C52B3, 0x27887208 /* 245 */
-	.long	0x7D4D3770, 0x36D8A566, 0x4F10E410 /* 246 */
-	.long	0xFA9A6EE0, 0x6DB14ACC, 0x9E21C820 /* 247 */
-	.long	0xF534DDC0, 0xDB629599, 0x3C439041 /* 248 */
-	.long	0xEA69BB81, 0xB6C52B32, 0x78872083 /* 249 */
-	.long	0xD4D37703, 0x6D8A5664, 0xF10E4107 /* 250 */
-	.long	0xA9A6EE06, 0xDB14ACC9, 0xE21C820F /* 251 */
-	.long	0x534DDC0D, 0xB6295993, 0xC439041F /* 252 */
-	.long	0xA69BB81B, 0x6C52B327, 0x8872083F /* 253 */
-	.long	0x4D377036, 0xD8A5664F, 0x10E4107F /* 254 */
-	.long	0x9A6EE06D, 0xB14ACC9E, 0x21C820FF /* 255 */
-	.align	16
-	.type	__svml_stan_reduction_data_internal, @object
-	.size	__svml_stan_reduction_data_internal, .-__svml_stan_reduction_data_internal
-	.align	16
+LOCAL_DATA_NAME:
+	DATA_VEC (LOCAL_DATA_NAME, _sPI1, 0x3FC90000)
+	DATA_VEC (LOCAL_DATA_NAME, _sPI2, 0x39FDA000)
+	DATA_VEC (LOCAL_DATA_NAME, _sPI3, 0x33A22000)
+	DATA_VEC (LOCAL_DATA_NAME, _sPI4, 0x2C34611A)
+	DATA_VEC (LOCAL_DATA_NAME, _sRangeVal, 0x00800000)
+	DATA_VEC (LOCAL_DATA_NAME, _FLT_0, 0xb795777a)
+	DATA_VEC (LOCAL_DATA_NAME, _FLT_1, 0x40c91000)
 
-.FLT_16:
-	.long	0xffffffff, 0x00000000, 0xffffffff, 0x00000000
-	.type	.FLT_16, @object
-	.size	.FLT_16, 16
+	.type	LOCAL_DATA_NAME, @object
+	.size	LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v1 17/27] x86/fpu: Optimize svml_s_tanf8_core_avx2.S
  2022-12-07  8:52 [PATCH v1 01/27] x86/fpu: Create helper file for common data macros Noah Goldstein via Libc-alpha
                   ` (14 preceding siblings ...)
  2022-12-07  8:52 ` [PATCH v1 16/27] x86/fpu: Optimize svml_s_tanf4_core_sse4.S Noah Goldstein via Libc-alpha
@ 2022-12-07  8:52 ` Noah Goldstein via Libc-alpha
  2022-12-07  8:52 ` [PATCH v1 18/27] x86/fpu: Optimize svml_s_log10f16_core_avx512.S Noah Goldstein via Libc-alpha
                   ` (10 subsequent siblings)
  26 siblings, 0 replies; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-07  8:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, andrey.kolesov, carlos

1. Remove many unnecessary spills.
2. Clean up some missed optimizations in instruction selection /
   unnecessary repeated rodata references.
3. Remove unused rodata.
4. Use common data definitions where possible.

Code Size Change: 935 Bytes (1438 - 2373)

Input                                 New Time / Old Time
0F          (0x00000000)           -> 0.8508
0F          (0x0000ffff, Denorm)   -> 0.9556
.1F         (0x3dcccccd)           -> 0.8491
5F          (0x40a00000)           -> 0.7777
2315255808F (0x4f0a0000)           -> 0.7410
-NaN        (0xffffffff)           -> 0.7444
---
 .../fpu/multiarch/svml_s_tanf8_core_avx2.S    | 2967 +++--------------
 1 file changed, 503 insertions(+), 2464 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanf8_core_avx2.S
index d34e61ac41..de4c849c45 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanf8_core_avx2.S
@@ -45,2548 +45,587 @@
  *
  */
 
-/* Offsets for data table __svml_stan_data_internal
- */
-#define _sInvPI_uisa			0
-#define _sPI1_uisa			32
-#define _sPI2_uisa			64
-#define _sPI3_uisa			96
-#define _sPI2_ha_uisa			128
-#define _sPI3_ha_uisa			160
-#define Th_tbl_uisa			192
-#define Tl_tbl_uisa			320
-#define _sPC3_uisa			448
-#define _sPC5_uisa			480
-#define _sRangeReductionVal_uisa	512
-#define _sInvPi				544
-#define _sSignMask			576
-#define _sAbsMask			608
-#define _sRangeVal			640
-#define _sRShifter			672
-#define _sOne				704
-#define _sRangeReductionVal		736
-#define _sPI1				768
-#define _sPI2				800
-#define _sPI3				832
-#define _sPI4				864
-#define _sPI1_FMA			896
-#define _sPI2_FMA			928
-#define _sPI3_FMA			960
-#define _sP0				992
-#define _sP1				1024
-#define _sQ0				1056
-#define _sQ1				1088
-#define _sQ2				1120
-#define _sTwo				1152
-#define _sCoeffs			1184
+#define LOCAL_DATA_NAME	__svml_stan_data_internal
+#include "svml_s_common_avx2_rodata_offsets.h"
+
+#define AVX2_SHARED_TABLE
+#define AVX512_SHARED_OFFSETS
+#include "svml_s_tanf_rodata.h.S"
+
+/* Offsets for data table __svml_stan_data_internal.  */
+#define _sPI2_FMA	0
+#define _sPI3_FMA	32
+#define _FLT_0	64
+#define _FLT_1	96
+#define _FLT_2	128
+#define _FLT_3	160
 
 #include <sysdep.h>
 
 	.section .text.avx2, "ax", @progbits
 ENTRY(_ZGVdN8v_tanf_avx2)
-	pushq	%rbp
-	cfi_def_cfa_offset(16)
-	movq	%rsp, %rbp
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-	andq	$-32, %rsp
-	pushq	%rbx
-	subq	$184, %rsp
-
-	/*
-	 * Legacy Code
-	 * Here HW FMA can be unavailable
-	 */
-	xorl	%eax, %eax
-	vmovups	_sAbsMask+__svml_stan_data_internal(%rip), %ymm10
-
-	/*
-	 *
-	 * Main path (_LA_ and _EP_)
-	 *
-	 * Octant calculation
-	 */
-	vmovups	_sInvPi+__svml_stan_data_internal(%rip), %ymm5
-	vmovups	_sRShifter+__svml_stan_data_internal(%rip), %ymm2
-
-	/* Range reduction */
-	vmovups	_sPI1_FMA+__svml_stan_data_internal(%rip), %ymm3
-
-	/* Rational approximation */
-	vmovups	_sP1+__svml_stan_data_internal(%rip), %ymm9
-	vmovaps	%ymm0, %ymm12
-	vandps	%ymm10, %ymm12, %ymm1
+	vmovups	COMMON_DATA(_AbsMask)(%rip), %ymm6
+	/* Main path (_LA_ and _EP_)
+	   Octant calculation.  */
+	vmovups	AVX2_SHARED_DATA(_sInvPi)(%rip), %ymm5
+	vmovups	AVX2_SHARED_DATA(_sRShifter)(%rip), %ymm2
+
+
+	vandps	%ymm6, %ymm0, %ymm1
+
 	vfmadd213ps %ymm2, %ymm1, %ymm5
-	vsubps	%ymm2, %ymm5, %ymm8
-	vpslld	$30, %ymm5, %ymm6
-
-	/* Inversion mask and sign calculation */
-	vpslld	$31, %ymm5, %ymm4
-	vfnmadd213ps %ymm1, %ymm8, %ymm3
-	vfnmadd231ps _sPI2_FMA+__svml_stan_data_internal(%rip), %ymm8, %ymm3
-	vfnmadd132ps _sPI3_FMA+__svml_stan_data_internal(%rip), %ymm3, %ymm8
-	vmovups	_sQ2+__svml_stan_data_internal(%rip), %ymm3
-	vmulps	%ymm8, %ymm8, %ymm13
-	vfmadd213ps _sQ1+__svml_stan_data_internal(%rip), %ymm13, %ymm3
-	vfmadd213ps _sP0+__svml_stan_data_internal(%rip), %ymm13, %ymm9
-	vfmadd213ps _sQ0+__svml_stan_data_internal(%rip), %ymm13, %ymm3
-	vmulps	%ymm9, %ymm8, %ymm8
-	vxorps	%ymm7, %ymm7, %ymm7
-	vcmpneqps %ymm7, %ymm6, %ymm2
-	vandnps	%ymm12, %ymm10, %ymm11
-	vxorps	%ymm11, %ymm4, %ymm0
-
-	/* Exchanged numerator and denominator if necessary */
-	vandnps	%ymm8, %ymm2, %ymm14
-	vandps	%ymm3, %ymm2, %ymm15
-	vandps	%ymm8, %ymm2, %ymm4
-	vandnps	%ymm3, %ymm2, %ymm5
-	vorps	%ymm15, %ymm14, %ymm6
-	vorps	%ymm5, %ymm4, %ymm7
-
-	/* Division */
-	vdivps	%ymm7, %ymm6, %ymm9
-
-	/* Large values check */
-	vcmpnle_uqps _sRangeReductionVal+__svml_stan_data_internal(%rip), %ymm1, %ymm10
-	vmovmskps %ymm10, %edx
-
-	/* Sign setting */
-	vxorps	%ymm0, %ymm9, %ymm0
-
-	/*
-	 *
-	 * End of main path (_LA_ and _EP_)
-	 */
+	vsubps	%ymm2, %ymm5, %ymm7
+
+	/* Range reduction.  */
+	vmovups	COMMON_DATA(_TanSPI1_FMA)(%rip), %ymm3
+	vfnmadd213ps %ymm1, %ymm7, %ymm3
+
+	vfnmadd231ps LOCAL_DATA(_sPI2_FMA)(%rip), %ymm7, %ymm3
+	vfnmadd132ps LOCAL_DATA(_sPI3_FMA)(%rip), %ymm3, %ymm7
+	vmovups	AVX2_SHARED_DATA(_sQ2)(%rip), %ymm3
+	/* Rational approximation.  */
+	vmovups	AVX2_SHARED_DATA(_sP1)(%rip), %ymm4
 
+	vmulps	%ymm7, %ymm7, %ymm2
+	vfmadd213ps AVX2_SHARED_DATA(_sQ1)(%rip), %ymm2, %ymm3
+	vmovups	AVX2_SHARED_DATA(_sP0)(%rip), %ymm8
+	vfmadd213ps %ymm8, %ymm2, %ymm4
+	vfmadd213ps %ymm8, %ymm2, %ymm3
+	vmulps	%ymm4, %ymm7, %ymm4
+	/* Inversion mask and sign calculation.  */
+	vpslld	$31, %ymm5, %ymm2
+
+
+	vandnps	%ymm0, %ymm6, %ymm7
+
+	/* Exchanged numerator and denominator if necessary.  */
+	vblendvps %ymm2, %ymm3, %ymm4, %ymm6
+	vblendvps %ymm2, %ymm4, %ymm3, %ymm3
+
+	/* Large values check.  */
+	vpcmpgtd AVX2_SHARED_DATA(_sRangeReductionVal)(%rip), %ymm1, %ymm10
+	vpmovmskb %ymm10, %edx
+
+	/* Division.  */
+	vdivps	%ymm3, %ymm6, %ymm3
+
+	/* End of main path (_LA_ and _EP_).  */
 	testl	%edx, %edx
 
-	/* Go to auxilary branch */
+	/* Go to auxilary branch.  */
 	jne	L(AUX_BRANCH)
-	/*  DW_CFA_expression: r3 (rbx) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
-	# LOE r12 r13 r14 r15 eax ymm0 ymm1 ymm10 ymm11 ymm12
 
-	/* Return from auxilary branch
-	 * for out of main path inputs
-	 */
+	vxorps	%ymm2, %ymm7, %ymm7
+	/* Sign setting.  */
+	vxorps	%ymm7, %ymm3, %ymm0
+	ret
 
-L(AUX_BRANCH_RETURN):
-	testl	%eax, %eax
+L(AUX_BRANCH):
+	/* Sign setting. NB for all special case values this is
+	   equivilent to the input (ymm0).  */
+	vpandn	%ymm3, %ymm10, %ymm3
+	vpxor	%ymm3, %ymm7, %ymm12
+	vmovaps	%ymm0, %ymm11
 
-	/* Go to special inputs processing branch */
-	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE r12 r13 r14 r15 eax ymm0 ymm12
 
-	/* Restore registers
-	 * and exit the function
-	 */
+	/* Get the (2^a / 2pi) mod 1 values from the table.  */
+	lea	AVX512_SHARED_DATA(_Reduction)(%rip), %rdx
 
-L(EXIT):
-	addq	$184, %rsp
-	cfi_restore(3)
-	popq	%rbx
-	movq	%rbp, %rsp
-	popq	%rbp
-	cfi_def_cfa(7, 8)
-	cfi_restore(6)
-	ret
-	cfi_def_cfa(6, 16)
-	/*  DW_CFA_expression: r3 (rbx) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
-	cfi_offset(6, -16)
+	vpsrld	$23, %ymm1, %ymm6
+	vpaddd	%ymm6, %ymm6, %ymm2
 
-	/* Branch to process
-	 * special inputs
-	 */
+	vpaddd	%ymm6, %ymm2, %ymm3
 
-L(SPECIAL_VALUES_BRANCH):
-	vmovups	%ymm12, 32(%rsp)
-	vmovups	%ymm0, 64(%rsp)
-	# LOE r12 r13 r14 r15 eax ymm0
+	/* Collect indexes.  */
+	vmovq	%xmm3, %rax
+	movl	%eax, %ecx
+	shrq	$32, %rax
 
-	xorl	%ebx, %ebx
-	# LOE r12 r13 r14 r15 eax ebx
+	vmovq	(%rdx, %rcx, 4), %xmm4
+	vmovq	(%rdx, %rax, 4), %xmm5
+	vpunpckldq %xmm5, %xmm4, %xmm4
 
-	vzeroupper
-	movq	%r12, 8(%rsp)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-	movl	%eax, %r12d
-	movq	%r13, (%rsp)
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-	# LOE r14 r15 ebx r12d
-
-	/* Range mask
-	 * bits check
-	 */
-
-L(RANGEMASK_CHECK):
-	btl	%ebx, %r12d
-
-	/* Call scalar math function */
-	jc	L(SCALAR_MATH_CALL)
-	# LOE r14 r15 ebx r12d
-
-	/* Special inputs
-	 * processing loop
-	 */
+	vpextrq	$1, %xmm3, %rdi
+	movl	%edi, %esi
+	shrq	$32, %rdi
 
-L(SPECIAL_VALUES_LOOP):
-	incl	%ebx
-	cmpl	$8, %ebx
-
-	/* Check bits in range mask */
-	jl	L(RANGEMASK_CHECK)
-	# LOE r14 r15 ebx r12d
-
-	movq	8(%rsp), %r12
-	cfi_restore(12)
-	movq	(%rsp), %r13
-	cfi_restore(13)
-	vmovups	64(%rsp), %ymm0
-
-	/* Go to exit */
-	jmp	L(EXIT)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-	# LOE r12 r13 r14 r15 ymm0
-
-	/* Scalar math fucntion call
-	 * to process special input
-	 */
-
-L(SCALAR_MATH_CALL):
-	movl	%ebx, %r13d
-	vmovss	32(%rsp, %r13, 4), %xmm0
-	call	tanf@PLT
-	# LOE r13 r14 r15 ebx r12d xmm0
+	vmovq	(%rdx, %rsi, 4), %xmm2
+	vmovq	(%rdx, %rdi, 4), %xmm5
+	vpunpckldq %xmm5, %xmm2, %xmm2
 
-	vmovss	%xmm0, 64(%rsp, %r13, 4)
+	vextractf128 $1, %ymm3, %xmm7
 
-	/* Process special inputs in loop */
-	jmp	L(SPECIAL_VALUES_LOOP)
-	cfi_restore(12)
-	cfi_restore(13)
-	# LOE r14 r15 ebx r12d
+	vmovq	%xmm7, %r10
+	movl	%r10d, %r8d
+	shrq	$32, %r10
 
-	/* Auxilary branch
-	 * for out of main path inputs
-	 */
+	vmovq	(%rdx, %r8, 4), %xmm3
+	vmovq	(%rdx, %r10, 4), %xmm5
+	vpunpckldq %xmm5, %xmm3, %xmm3
 
-L(AUX_BRANCH):
-	vpand	.FLT_16(%rip), %ymm1, %ymm5
-
-	/*
-	 * Get the (2^a / 2pi) mod 1 values from the table.
-	 * Because doesn't have I-type gather, we need a trivial cast
-	 */
-	lea	__svml_stan_reduction_data_internal(%rip), %rdx
-	vmovups	%ymm11, 64(%rsp)
-	vmovups	.FLT_15(%rip), %ymm7
-	vmovups	%ymm10, 96(%rsp)
-	vmovups	%ymm0, 128(%rsp)
-	vpsrld	$23, %ymm5, %ymm6
-	vpslld	$1, %ymm6, %ymm11
-	vpaddd	%ymm6, %ymm11, %ymm13
-	vpslld	$2, %ymm13, %ymm15
-	vandps	%ymm7, %ymm12, %ymm14
-	vcmpeqps %ymm7, %ymm14, %ymm10
-	vmovmskps %ymm10, %eax
-	vextractf128 $1, %ymm15, %xmm7
-	vmovd	%xmm15, %ecx
-	vmovd	%xmm7, %r8d
-	vmovd	(%rcx, %rdx), %xmm8
-	vpextrd	$1, %xmm15, %ebx
-	vpextrd	$2, %xmm15, %esi
-	vpextrd	$3, %xmm15, %edi
-	vpextrd	$1, %xmm7, %r10d
-	vpextrd	$2, %xmm7, %r9d
-	vpextrd	$3, %xmm7, %r11d
-	vmovd	(%rbx, %rdx), %xmm3
-	vmovd	(%rsi, %rdx), %xmm2
-	vmovd	(%rdi, %rdx), %xmm14
-	vmovd	(%r8, %rdx), %xmm10
-	vmovd	(%r10, %rdx), %xmm5
-	vmovd	(%r9, %rdx), %xmm11
-	vmovd	(%r11, %rdx), %xmm6
-	vpunpckldq %xmm3, %xmm8, %xmm4
-	vpunpckldq %xmm14, %xmm2, %xmm0
-	vpunpckldq %xmm5, %xmm10, %xmm13
-	vpunpckldq %xmm6, %xmm11, %xmm15
-	vpunpcklqdq %xmm0, %xmm4, %xmm9
-	vmovd	4(%rcx, %rdx), %xmm3
-	vmovd	4(%rbx, %rdx), %xmm2
-	vmovd	4(%rsi, %rdx), %xmm14
-	vmovd	4(%rdi, %rdx), %xmm4
-	vpunpcklqdq %xmm15, %xmm13, %xmm8
-	vmovd	4(%r8, %rdx), %xmm5
-	vmovd	4(%r10, %rdx), %xmm6
-	vmovd	4(%r9, %rdx), %xmm13
-	vmovd	4(%r11, %rdx), %xmm15
-	vpunpckldq %xmm2, %xmm3, %xmm0
-	vpunpckldq %xmm4, %xmm14, %xmm7
-	vpunpckldq %xmm15, %xmm13, %xmm3
-	vpunpcklqdq %xmm7, %xmm0, %xmm10
-	vmovd	8(%rsi, %rdx), %xmm0
-	vmovd	8(%rdi, %rdx), %xmm7
-	vmovd	8(%rcx, %rdx), %xmm14
-	vmovd	8(%rbx, %rdx), %xmm4
-	vmovd	8(%r8, %rdx), %xmm15
-	vinsertf128 $1, %xmm8, %ymm9, %ymm11
-	vpunpckldq %xmm6, %xmm5, %xmm8
-	vpunpcklqdq %xmm3, %xmm8, %xmm2
-	vpunpckldq %xmm7, %xmm0, %xmm6
-
-	/*
-	 * Also get the significand as an integer
-	 * NB: adding in the integer bit is wrong for denorms!
-	 * To make this work for denorms we should do something slightly different
-	 */
-	vpand	.FLT_17(%rip), %ymm1, %ymm7
-	vmovd	8(%r10, %rdx), %xmm8
-	vmovd	8(%r9, %rdx), %xmm3
-	vpunpckldq %xmm4, %xmm14, %xmm5
-	vpunpckldq %xmm8, %xmm15, %xmm14
-
-	/*  Load constants (not all needed at once)  */
-	lea	_sCoeffs+36+__svml_stan_data_internal(%rip), %r9
-	vpunpcklqdq %xmm6, %xmm5, %xmm13
-	vpaddd	.FLT_18(%rip), %ymm7, %ymm5
-	vinsertf128 $1, %xmm2, %ymm10, %ymm9
-	vmovd	8(%r11, %rdx), %xmm2
-	vpunpckldq %xmm2, %xmm3, %xmm4
-	vpunpcklqdq %xmm4, %xmm14, %xmm0
-
-	/*
-	 * Break the P_xxx and m into 16-bit chunks ready for
-	 * the long multiplication via 16x16->32 multiplications
-	 */
-	vmovdqu	.FLT_19(%rip), %ymm14
-	vpsrld	$16, %ymm5, %ymm10
-	vpand	%ymm14, %ymm5, %ymm5
-	vpand	%ymm14, %ymm9, %ymm3
-	vpand	%ymm14, %ymm11, %ymm7
-	vpsrld	$16, %ymm11, %ymm11
-	vpmulld	%ymm3, %ymm5, %ymm8
-	vpmulld	%ymm3, %ymm10, %ymm3
+	vpextrq	$1, %xmm7, %r11
+	movl	%r11d, %r9d
+	shrq	$32, %r11
+
+	vmovq	(%rdx, %r9, 4), %xmm7
+	vmovq	(%rdx, %r11, 4), %xmm5
+	vpunpckldq %xmm5, %xmm7, %xmm7
+
+	vinsertf128 $1, %xmm3, %ymm4, %ymm4
+	vinsertf128 $1, %xmm7, %ymm2, %ymm2
+
+	vmovdqa	LOCAL_DATA(_FLT_0)(%rip), %ymm9
+
+
+	vpunpcklqdq %ymm2, %ymm4, %ymm7
+	vpunpckhqdq %ymm2, %ymm4, %ymm6
+
+	/* Break the P_xxx and m into 16-bit chunks ready for
+	   the long multiplication via 16x16->32 multiplications.  */
+	vpandn	%ymm1, %ymm9, %ymm5
+	vpsrld	$16, %ymm5, %ymm3
+
+	vpor	LOCAL_DATA(_FLT_1)(%rip), %ymm3, %ymm4
+	vmovd	8(%rdx, %rcx, 4), %xmm5
+	vmovd	8(%rdx, %rax, 4), %xmm2
+	vpunpckldq %xmm2, %xmm5, %xmm3
+
+	vmovd	8(%rdx, %rsi, 4), %xmm2
+	vmovd	8(%rdx, %rdi, 4), %xmm5
+	vpunpckldq %xmm5, %xmm2, %xmm2
+
+	vpunpcklqdq %xmm2, %xmm3, %xmm13
+
+	vmovd	8(%rdx, %r8, 4), %xmm3
+	vmovd	8(%rdx, %r10, 4), %xmm5
+	vpunpckldq %xmm5, %xmm3, %xmm0
+	/* Also get the significand as an integer
+	   NB: adding in the integer bit is wrong for denorms!
+	   To make this work for denorms we should do something
+	   slightly different.  */
+	vmovd	8(%rdx, %r9, 4), %xmm2
+	vmovd	8(%rdx, %r11, 4), %xmm5
+	vpunpckldq %xmm5, %xmm2, %xmm2
+
+	/* Better to use `vpand` than `vpblendw`.  */
+	vmovdqu	AVX2_SHARED_DATA(_Low16)(%rip), %ymm3
+
+	vpunpcklqdq %xmm2, %xmm0, %xmm0
 	vinsertf128 $1, %xmm0, %ymm13, %ymm13
+
+	vpand	%ymm3, %ymm1, %ymm5
+	vpand	%ymm3, %ymm6, %ymm0
+	vpsrld	$16, %ymm7, %ymm2
+	vpand	%ymm3, %ymm7, %ymm7
+	vpmulld	%ymm0, %ymm5, %ymm8
+	vpmulld	%ymm0, %ymm4, %ymm14
+	vpsrld	$16, %ymm6, %ymm0
+	vpmulld	%ymm2, %ymm5, %ymm2
+	vpand	%ymm3, %ymm2, %ymm15
 	vpsrld	$16, %ymm13, %ymm6
-	vpand	%ymm14, %ymm13, %ymm15
-	vpsrld	$16, %ymm9, %ymm0
-	vpmulld	%ymm6, %ymm10, %ymm13
+	vpand	%ymm3, %ymm13, %ymm2
+	vpmulld	%ymm6, %ymm4, %ymm13
 	vpmulld	%ymm6, %ymm5, %ymm6
 	vpsrld	$16, %ymm6, %ymm6
-	vpmulld	%ymm15, %ymm10, %ymm4
-	vpand	%ymm14, %ymm8, %ymm15
+	vpmulld	%ymm2, %ymm4, %ymm2
 	vpaddd	%ymm6, %ymm13, %ymm13
-	vpsrld	$16, %ymm4, %ymm4
+	vpsrld	$16, %ymm2, %ymm6
+	vpand	%ymm3, %ymm8, %ymm2
 	vpsrld	$16, %ymm8, %ymm8
-	vpaddd	%ymm13, %ymm15, %ymm15
+	vpaddd	%ymm13, %ymm2, %ymm13
 	vpmulld	%ymm0, %ymm5, %ymm2
-	vpaddd	%ymm15, %ymm4, %ymm13
-	vpand	%ymm14, %ymm2, %ymm4
-	vpaddd	%ymm8, %ymm3, %ymm15
+	vpaddd	%ymm13, %ymm6, %ymm13
+	vpaddd	%ymm8, %ymm14, %ymm14
+	vpand	%ymm3, %ymm2, %ymm8
 	vpsrld	$16, %ymm2, %ymm2
 	vpsrld	$16, %ymm13, %ymm6
 
-	/* Assemble reduced argument from the pieces */
-	vpand	%ymm14, %ymm13, %ymm13
-	vpaddd	%ymm15, %ymm4, %ymm8
-	vpmulld	%ymm7, %ymm5, %ymm9
-	vpmulld	%ymm0, %ymm10, %ymm0
-	vpaddd	%ymm8, %ymm6, %ymm4
-	vpand	%ymm14, %ymm9, %ymm6
-	vpaddd	%ymm2, %ymm0, %ymm8
-	vpsrld	$16, %ymm9, %ymm3
-	vpsrld	$16, %ymm4, %ymm15
-	vpslld	$16, %ymm4, %ymm4
-	vpaddd	%ymm8, %ymm6, %ymm6
-	vpaddd	%ymm6, %ymm15, %ymm0
-	vpmulld	%ymm11, %ymm5, %ymm6
-
-	/* Now do the big multiplication and carry propagation */
-	vpmulld	%ymm7, %ymm10, %ymm8
-	vpand	%ymm14, %ymm6, %ymm2
-	vpaddd	%ymm3, %ymm8, %ymm5
-	vpsrld	$16, %ymm0, %ymm15
-	vpand	%ymm14, %ymm0, %ymm0
-
-	/*
-	 * We want to incorporate the original sign now too.
-	 * Do it here for convenience in getting the right N value,
-	 * though we could wait right to the end if we were prepared
-	 * to modify the sign of N later too.
-	 * So get the appropriate sign mask now (or sooner).
-	 */
-	vpand	.FLT_20(%rip), %ymm1, %ymm3
-	vpaddd	%ymm5, %ymm2, %ymm7
-	vpaddd	%ymm13, %ymm4, %ymm8
-
-	/*
-	 * Now round at the 2^-8 bit position for reduction mod pi/2^7
-	 * instead of the original 2pi (but still with the same 2pi scaling).
-	 * Use a shifter of 2^15 + 2^14.
-	 * The N we get is our final version; it has an offset of
-	 * 2^8 because of the implicit integer bit, and anyway for negative
-	 * starting value it's a 2s complement thing. But we need to mask
-	 * off the exponent part anyway so it's fine.
-	 */
-	vmovups	.FLT_22(%rip), %ymm14
-	vpaddd	%ymm7, %ymm15, %ymm15
-
-	/*
-	 * Create floating-point high part, implicitly adding integer bit 1
-	 * Incorporate overall sign at this stage too.
-	 */
-	vpxor	.FLT_21(%rip), %ymm3, %ymm11
-
-	/*
-	 * Create floating-point low and medium parts, respectively
-	 * lo_17, ... lo_0, 0, ..., 0
-	 * hi_8, ... hi_0, lo_31, ..., lo_18
-	 * then subtract off the implicitly added integer bits,
-	 * 2^-46 and 2^-23, respectively.
-	 * Put the original sign into all of them at this stage.
-	 */
-	vpxor	.FLT_23(%rip), %ymm3, %ymm7
-	vpslld	$16, %ymm15, %ymm9
-	vpaddd	%ymm0, %ymm9, %ymm2
-	vpand	.FLT_24(%rip), %ymm8, %ymm0
+	/* Assemble reduced argument from the pieces.  */
+	vpand	%ymm3, %ymm13, %ymm13
+	vpaddd	%ymm14, %ymm8, %ymm8
+	vpmulld	%ymm7, %ymm5, %ymm5
+	vpmulld	%ymm0, %ymm4, %ymm0
+	vpaddd	%ymm8, %ymm6, %ymm8
+	vpand	%ymm3, %ymm5, %ymm6
+	vpaddd	%ymm2, %ymm0, %ymm0
+	vpsrld	$16, %ymm5, %ymm14
+	vpsrld	$16, %ymm8, %ymm5
+	vpslld	$16, %ymm8, %ymm8
+	vpaddd	%ymm0, %ymm6, %ymm6
+	vpaddd	%ymm6, %ymm5, %ymm0
+
+
+	/* Now do the big multiplication and carry propagation.  */
+	vpmulld	%ymm7, %ymm4, %ymm7
+	vpaddd	%ymm14, %ymm7, %ymm5
+	vpsrld	$16, %ymm0, %ymm2
+	vpand	%ymm3, %ymm0, %ymm0
+
+	vpaddd	%ymm5, %ymm15, %ymm7
+	vpaddd	%ymm13, %ymm8, %ymm8
+
+	/* Now round at the 2^-8 bit position for reduction mod pi/2^7
+	   instead of the original 2pi (but still with the same 2pi scaling).
+	   Use a shifter of 2^15 + 2^14.
+	   The N we get is our final version; it has an offset of
+	   2^8 because of the implicit integer bit, and anyway for negative
+	   starting value it's a 2s complement thing. But we need to mask
+	   off the exponent part anyway so it's fine.  */
+	vpaddd	%ymm7, %ymm2, %ymm2
+	vmovups	AVX2_SHARED_DATA(_SH_FLT_1)(%rip), %ymm14
+
+	/* Create floating-point low and medium parts, respectively
+	   lo_17, ... lo_0, 0, ..., 0
+	   hi_8, ... hi_0, lo_31, ..., lo_18
+	   then subtract off the implicitly added integer bits,
+	   2^-46 and 2^-23, respectively.
+	   Put the original sign into all of them at this stage.  */
+	vmovdqa	AVX2_SHARED_DATA(_SH_FLT_2)(%rip), %ymm7
+
+	vpslld	$16, %ymm2, %ymm2
+	vpaddd	%ymm0, %ymm2, %ymm2
+	vpand	AVX2_SHARED_DATA(_Low18)(%rip), %ymm8, %ymm0
 	vpsrld	$18, %ymm8, %ymm8
-	vpsrld	$9, %ymm2, %ymm10
+	vpsrld	$9, %ymm2, %ymm6
 	vpslld	$5, %ymm0, %ymm4
-	vpor	%ymm11, %ymm10, %ymm6
-	vpxor	.FLT_25(%rip), %ymm3, %ymm11
-	vpand	.FLT_26(%rip), %ymm2, %ymm3
+	vmovdqa	COMMON_DATA(_OneF)(%rip), %ymm15
+	vpor	%ymm15, %ymm6, %ymm6
+
+
+	vpand	AVX2_SHARED_DATA(_Low9)(%rip), %ymm2, %ymm3
 	vpor	%ymm7, %ymm4, %ymm5
+	vmovdqa	AVX2_SHARED_DATA(_SH_FLT_3)(%rip), %ymm4
 
-	/*
-	 * If the magnitude of the input is <= 2^-20, then
-	 * just pass through the input, since no reduction will be needed and
-	 * the main path will only work accurately if the reduced argument is
-	 * about >= 2^-40 (which it is for all large pi multiples)
-	 */
-	vmovups	.FLT_30(%rip), %ymm4
+	/* If the magnitude of the input is <= 2^-20, then
+	   just pass through the input, since no reduction will be needed
+	   and the main path will only work accurately if the reduced
+	   argument is about >= 2^-40 (which it is for all large pi
+	   multiples).  */
 	vpslld	$14, %ymm3, %ymm2
 
-	/*
-	 * Now multiply those numbers all by 2 pi, reasonably accurately.
-	 * (RHi + RLo) * (pi_lead + pi_trail) ~=
-	 * RHi * pi_lead + (RHi * pi_trail + RLo * pi_lead)
-	 */
-	vmovups	.FLT_27(%rip), %ymm3
+	/* Now multiply those numbers all by 2 pi, reasonably accurately.
+	   (RHi + RLo) * (pi_lead + pi_trail) ~=
+	   RHi * pi_lead + (RHi * pi_trail + RLo * pi_lead).  */
+	vmovups	AVX2_SHARED_DATA(_SH_FLT_4)(%rip), %ymm3
 	vaddps	%ymm14, %ymm6, %ymm13
-	vpor	%ymm8, %ymm2, %ymm9
-	vsubps	%ymm14, %ymm13, %ymm15
-
-	/* Grab our final N value as an integer, appropriately masked mod 2^8 */
-	vpand	.FLT_31(%rip), %ymm13, %ymm13
-	vpor	%ymm11, %ymm9, %ymm10
-	vsubps	%ymm15, %ymm6, %ymm6
-	vsubps	%ymm7, %ymm5, %ymm15
-	vsubps	%ymm11, %ymm10, %ymm14
-
-	/* Now add them up into 2 reasonably aligned pieces */
+	vpor	%ymm8, %ymm2, %ymm2
+	vsubps	%ymm14, %ymm13, %ymm0
+
+	/* Grab our final N value as an integer, appropriately masked
+	   mod 2^8.  */
+	vpor	%ymm4, %ymm2, %ymm2
+	vsubps	%ymm0, %ymm6, %ymm6
+	vsubps	%ymm7, %ymm5, %ymm0
+	vsubps	%ymm4, %ymm2, %ymm14
+
+	vmovups	LOCAL_DATA(_FLT_2)(%rip), %ymm4
+	/* Now add them up into 2 reasonably aligned pieces.  */
 	vaddps	%ymm14, %ymm6, %ymm2
 	vsubps	%ymm2, %ymm6, %ymm6
 	vmulps	%ymm2, %ymm3, %ymm7
 	vaddps	%ymm6, %ymm14, %ymm8
-	vaddps	%ymm8, %ymm15, %ymm8
-	vmovaps	%ymm3, %ymm15
-	vfmsub213ps %ymm7, %ymm2, %ymm15
-	vandps	.FLT_29(%rip), %ymm1, %ymm0
-	vfmadd132ps .FLT_28(%rip), %ymm15, %ymm2
-	vcmpgt_oqps %ymm4, %ymm0, %ymm9
-	vcmple_oqps %ymm4, %ymm0, %ymm5
-
-	/*
-	 * The output is _VRES_R (high) + _VRES_E (low), and the integer part is _VRES_IND
-	 * Set sRp2 = _VRES_R^2 and then resume the original code.
-	 * Argument reduction is now finished: x = n * pi/128 + r
-	 * where n = iIndex and r = sR (high) + sE (low).
-	 * But we have n modulo 256, needed for sin/cos with period 2pi
-	 * but we want it modulo 128 since tan has period pi.
-	 */
-	vpand	.FLT_32(%rip), %ymm13, %ymm0
+	vaddps	%ymm8, %ymm0, %ymm8
+	vmovaps	%ymm3, %ymm0
+	vfmsub213ps %ymm7, %ymm2, %ymm0
+
+
+	vfmadd132ps LOCAL_DATA(_FLT_3)(%rip), %ymm0, %ymm2
+	vpcmpgtd %ymm4, %ymm1, %ymm5
+
+	/* The output is _VRES_R (high) + _VRES_E (low), and the integer
+	   part is _VRES_IND.  Set sRp2 = _VRES_R^2 and then resume the
+	   original code. Argument reduction is now finished: x = n *
+	   pi/128 + r where n = iIndex and r = sR (high) + sE (low).
+	   But we have n modulo 256, needed for sin/cos with period 2pi
+	   but we want it modulo 128 since tan has period pi.  */
+	vpand	AVX2_SHARED_DATA(_Low7)(%rip), %ymm13, %ymm0
 	vfmadd213ps %ymm2, %ymm3, %ymm8
+
+
+
 	vpslld	$2, %ymm0, %ymm2
-	vandps	%ymm1, %ymm5, %ymm1
-	vandps	%ymm7, %ymm9, %ymm6
-	vorps	%ymm6, %ymm1, %ymm15
-	vpaddd	%ymm0, %ymm2, %ymm1
-	vpslld	$3, %ymm1, %ymm4
-	vandps	%ymm8, %ymm9, %ymm3
-
-	/*
-	 * Simply combine the two parts of the reduced argument
-	 * since we can afford a few ulps in this case.
-	 */
-	vaddps	%ymm3, %ymm15, %ymm6
-	vextractf128 $1, %ymm4, %xmm8
-	vmovd	%xmm4, %r10d
-	vmovd	%xmm8, %ebx
-	vmovd	-36(%r10, %r9), %xmm5
-	vmovd	-32(%r10, %r9), %xmm9
-	vpextrd	$1, %xmm4, %r8d
-	vpextrd	$2, %xmm4, %edi
-	vpextrd	$3, %xmm4, %esi
-	vpextrd	$1, %xmm8, %ecx
-	vpextrd	$2, %xmm8, %edx
-	vpextrd	$3, %xmm8, %r11d
-	vmovd	-36(%r8, %r9), %xmm7
-	vmovd	-36(%rdi, %r9), %xmm10
-	vmovd	-36(%rsi, %r9), %xmm11
-	vmovd	-36(%rbx, %r9), %xmm3
-	vmovd	-36(%rcx, %r9), %xmm2
-	vmovd	-36(%rdx, %r9), %xmm0
-	vmovd	-36(%r11, %r9), %xmm1
-	vpunpckldq %xmm7, %xmm5, %xmm14
-	vpunpckldq %xmm11, %xmm10, %xmm13
-	vpunpckldq %xmm2, %xmm3, %xmm4
-	vpunpckldq %xmm1, %xmm0, %xmm5
-	vpunpcklqdq %xmm13, %xmm14, %xmm15
-	vpunpcklqdq %xmm5, %xmm4, %xmm7
-	vmovd	-32(%r8, %r9), %xmm10
-	vmovd	-32(%rdi, %r9), %xmm11
-	vmovd	-32(%rsi, %r9), %xmm14
-	vmovd	-32(%rbx, %r9), %xmm2
-	vmovd	-32(%rcx, %r9), %xmm0
-	vmovd	-32(%rdx, %r9), %xmm1
-	vmovd	-32(%r11, %r9), %xmm4
-	vpunpckldq %xmm14, %xmm11, %xmm8
-	vpunpckldq %xmm0, %xmm2, %xmm5
-	vmovd	-28(%r8, %r9), %xmm11
-	vmovd	-28(%rdi, %r9), %xmm14
-	vinsertf128 $1, %xmm7, %ymm15, %ymm13
-	vpunpckldq %xmm10, %xmm9, %xmm15
-	vpunpckldq %xmm4, %xmm1, %xmm7
-	vpunpcklqdq %xmm8, %xmm15, %xmm3
-	vpunpcklqdq %xmm7, %xmm5, %xmm9
-	vmovd	-28(%r10, %r9), %xmm10
-	vmovd	-28(%rsi, %r9), %xmm8
-	vmovd	-28(%rbx, %r9), %xmm1
-	vmovd	-28(%rcx, %r9), %xmm4
-	vmovd	-28(%rdx, %r9), %xmm5
-	vmovd	-28(%r11, %r9), %xmm7
-	vpunpckldq %xmm8, %xmm14, %xmm2
-	vmovd	-24(%r10, %r9), %xmm14
-	vinsertf128 $1, %xmm9, %ymm3, %ymm15
-	vpunpckldq %xmm11, %xmm10, %xmm3
-	vpunpckldq %xmm4, %xmm1, %xmm9
-	vpunpckldq %xmm7, %xmm5, %xmm10
-	vpunpcklqdq %xmm2, %xmm3, %xmm0
-	vpunpcklqdq %xmm10, %xmm9, %xmm11
-	vmovd	-24(%r8, %r9), %xmm3
-	vmovd	-24(%rdi, %r9), %xmm2
-	vmovd	-24(%rbx, %r9), %xmm7
-	vmovd	-24(%rcx, %r9), %xmm9
-	vmovd	-24(%rdx, %r9), %xmm10
-	vpunpckldq %xmm3, %xmm14, %xmm1
-	vpunpckldq %xmm9, %xmm7, %xmm14
-	vmovd	-20(%rsi, %r9), %xmm7
-	vinsertf128 $1, %xmm11, %ymm0, %ymm8
-	vmovd	-24(%rsi, %r9), %xmm0
-	vmovd	-24(%r11, %r9), %xmm11
-	vpunpckldq %xmm0, %xmm2, %xmm4
-	vpunpckldq %xmm11, %xmm10, %xmm3
-	vpunpcklqdq %xmm4, %xmm1, %xmm5
-	vpunpcklqdq %xmm3, %xmm14, %xmm2
-	vmovd	-20(%r10, %r9), %xmm0
-	vmovd	-20(%r8, %r9), %xmm1
-	vmovd	-20(%rbx, %r9), %xmm14
-	vmovd	-20(%rdi, %r9), %xmm4
-	vpunpckldq %xmm1, %xmm0, %xmm9
-	vmovd	-20(%r11, %r9), %xmm0
-	vpunpckldq %xmm7, %xmm4, %xmm10
-	vpunpcklqdq %xmm10, %xmm9, %xmm11
-	vmovd	-16(%r10, %r9), %xmm9
-	vmovd	-16(%r8, %r9), %xmm10
-	vinsertf128 $1, %xmm2, %ymm5, %ymm3
-	vmovd	-20(%rcx, %r9), %xmm2
-	vpunpckldq %xmm2, %xmm14, %xmm1
-	vmovd	-20(%rdx, %r9), %xmm14
-	vpunpckldq %xmm0, %xmm14, %xmm4
-	vpunpcklqdq %xmm4, %xmm1, %xmm5
-	vmovd	-16(%rdi, %r9), %xmm2
-	vmovd	-16(%rsi, %r9), %xmm0
-	vpunpckldq %xmm10, %xmm9, %xmm1
-	vmovd	-16(%rcx, %r9), %xmm9
-	vmovd	-16(%rdx, %r9), %xmm10
-	vpunpckldq %xmm0, %xmm2, %xmm4
-	vinsertf128 $1, %xmm5, %ymm11, %ymm7
-	vmovups	%ymm7, 32(%rsp)
-	vmovd	-16(%rbx, %r9), %xmm7
-	vmovd	-16(%r11, %r9), %xmm11
-	vpunpckldq %xmm9, %xmm7, %xmm14
-	vpunpckldq %xmm11, %xmm10, %xmm2
-	vpunpcklqdq %xmm4, %xmm1, %xmm5
-	vpunpcklqdq %xmm2, %xmm14, %xmm0
-	vmovd	-12(%r10, %r9), %xmm1
-	vmovd	-12(%r8, %r9), %xmm4
-	vmovd	-12(%rdi, %r9), %xmm7
-	vmovd	-12(%rsi, %r9), %xmm9
-	vpunpckldq %xmm4, %xmm1, %xmm10
-	vmovd	-12(%rcx, %r9), %xmm1
-	vmovd	-12(%rdx, %r9), %xmm4
-	vpunpckldq %xmm9, %xmm7, %xmm11
-	vpunpcklqdq %xmm11, %xmm10, %xmm14
-	vinsertf128 $1, %xmm0, %ymm5, %ymm2
-	vmovd	-12(%rbx, %r9), %xmm0
-	vmovd	-12(%r11, %r9), %xmm5
-	vpunpckldq %xmm1, %xmm0, %xmm7
-	vpunpckldq %xmm5, %xmm4, %xmm9
-	vpunpcklqdq %xmm9, %xmm7, %xmm10
-	vmovd	-8(%r10, %r9), %xmm1
-	vmovd	-8(%r8, %r9), %xmm4
-	vmovups	128(%rsp), %ymm0
-	vinsertf128 $1, %xmm10, %ymm14, %ymm11
-	vmovups	%ymm11, (%rsp)
-	vmovups	96(%rsp), %ymm10
-	vmovups	64(%rsp), %ymm11
-	# LOE rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15 eax xmm1 xmm4 ymm0 ymm2 ymm3 ymm6 ymm8 ymm10 ymm11 ymm12 ymm13 ymm15
-
-	vmovd	-8(%rdi, %r9), %xmm7
-	vmovd	-8(%rsi, %r9), %xmm5
-	vpunpckldq %xmm4, %xmm1, %xmm4
-	vpunpckldq %xmm5, %xmm7, %xmm9
-	vpunpcklqdq %xmm9, %xmm4, %xmm7
-	vmovd	-8(%rbx, %r9), %xmm1
-	vmovd	-8(%rcx, %r9), %xmm14
-	vmovd	-8(%rdx, %r9), %xmm5
-	vmovd	-8(%r11, %r9), %xmm4
-	vpunpckldq %xmm14, %xmm1, %xmm9
-	vpunpckldq %xmm4, %xmm5, %xmm1
-	vpunpcklqdq %xmm1, %xmm9, %xmm14
-	vmovd	-4(%r10, %r9), %xmm5
-	vmovd	-4(%r8, %r9), %xmm4
-	vmovd	-4(%rdi, %r9), %xmm9
-	vmovd	-4(%rsi, %r9), %xmm1
-	vinsertf128 $1, %xmm14, %ymm7, %ymm7
-	vpunpckldq %xmm4, %xmm5, %xmm14
-	vpunpckldq %xmm1, %xmm9, %xmm5
-	vpunpcklqdq %xmm5, %xmm14, %xmm4
-	vmovd	-4(%rbx, %r9), %xmm9
-	vmovd	-4(%rcx, %r9), %xmm1
-	vmovd	-4(%rdx, %r9), %xmm14
-	vmovd	-4(%r11, %r9), %xmm5
-	vpunpckldq %xmm1, %xmm9, %xmm9
-	vpunpckldq %xmm5, %xmm14, %xmm1
-	vpunpcklqdq %xmm1, %xmm9, %xmm14
-	vmovd	(%r10, %r9), %xmm5
-	vmovd	(%r8, %r9), %xmm9
-	vmovd	(%rdi, %r9), %xmm1
-	vpunpckldq %xmm9, %xmm5, %xmm5
-
-	/*
-	 *  Higher polynomial terms
-	 * Stage 1 (with unlimited parallelism)
-	 * P3 = C1_lo + C2 * Z
-	 */
-	vfmadd213ps (%rsp), %ymm6, %ymm7
-	vinsertf128 $1, %xmm14, %ymm4, %ymm4
-	vmovd	(%rsi, %r9), %xmm14
-	vpunpckldq %xmm14, %xmm1, %xmm9
-	vmovd	(%rbx, %r9), %xmm1
-	vmovd	(%rcx, %r9), %xmm14
-	vpunpcklqdq %xmm9, %xmm5, %xmm9
-	vpunpckldq %xmm14, %xmm1, %xmm5
-	vmovd	(%rdx, %r9), %xmm1
-	vmovd	(%r11, %r9), %xmm14
-	vpunpckldq %xmm14, %xmm1, %xmm1
-	vpunpcklqdq %xmm1, %xmm5, %xmm5
-	vmovups	.FLT_33(%rip), %ymm1
-
-	/*
-	 *  Compute 2-part reciprocal component
-	 * Construct a separate reduced argument modulo pi near pi/2 multiples.
-	 * i.e. (pi/2 - x) mod pi, simply by subtracting the reduced argument
-	 * from an accurate B_hi + B_lo = (128 - n) pi/128. Force the upper part
-	 * of this reduced argument to half-length to simplify accurate
-	 * reciprocation later on.
-	 */
-	vsubps	%ymm6, %ymm13, %ymm14
-	vsubps	%ymm14, %ymm13, %ymm13
+	vpaddd	%ymm0, %ymm2, %ymm4
+
+	vpblendvb %ymm5, %ymm7, %ymm1, %ymm6
+
+	vandps	%ymm8, %ymm5, %ymm3
+	vaddps	%ymm3, %ymm6, %ymm6
+
+
+
+
+	/* Simply combine the two parts of the reduced argument
+	   since we can afford a few ulps in this case.  */
+
+	/* Load constants (not all needed at once).  */
+	lea	AVX2_SHARED_DATA(_Coeffs)(%rip), %rdx
+
+	vmovq	%xmm4, %rcx
+	movl	%ecx, %eax
+	shrq	$32, %rcx
+
+	vmovdqu	(%rdx, %rax, 8), %ymm5
+	vmovdqu	(%rdx, %rcx, 8), %ymm7
+	vpunpckldq %ymm7, %ymm5, %ymm3
+	vpunpckhdq %ymm7, %ymm5, %ymm7
+
+	vpextrq	$1, %xmm4, %rsi
+	movl	%esi, %edi
+	shrq	$32, %rsi
+
+	vmovdqu	(%rdx, %rdi, 8), %ymm5
+	vmovdqu	(%rdx, %rsi, 8), %ymm2
+	vpunpckldq %ymm2, %ymm5, %ymm0
+	vpunpckhdq %ymm2, %ymm5, %ymm2
+
+	vextractf128 $1, %ymm4, %xmm4
+
+	vmovq	%xmm4, %r8
+	movl	%r8d, %r10d
+	shrq	$32, %r8
+
+	vmovdqu	(%rdx, %r10, 8), %ymm8
+	vmovdqu	(%rdx, %r8, 8), %ymm5
+	vpunpckldq %ymm5, %ymm8, %ymm14
+	vpunpckhdq %ymm5, %ymm8, %ymm8
+
+
+	vpextrq	$1, %xmm4, %r11
+	movl	%r11d, %r9d
+	shrq	$32, %r11
+
+	vmovdqu	(%rdx, %r9, 8), %ymm5
+	vmovdqu	(%rdx, %r11, 8), %ymm4
+
+	vpunpckldq %ymm4, %ymm5, %ymm13
+	vpunpckhdq %ymm4, %ymm5, %ymm4
+
+	vpunpcklqdq %ymm0, %ymm3, %ymm5
+	vpunpckhqdq %ymm0, %ymm3, %ymm3
+
+	vpunpcklqdq %ymm13, %ymm14, %ymm0
+	vpunpckhqdq %ymm13, %ymm14, %ymm14
+
+	vinserti128 $0x1, %xmm0, %ymm5, %ymm13
+	vperm2i128 $0x31, %ymm0, %ymm5, %ymm5
+
+	vinserti128 $0x1, %xmm14, %ymm3, %ymm0
+	vperm2i128 $0x31, %ymm14, %ymm3, %ymm14
+
+	vpunpcklqdq %ymm2, %ymm7, %ymm3
+	vpunpckhqdq %ymm2, %ymm7, %ymm2
+
+	vpunpcklqdq %ymm4, %ymm8, %ymm7
+	vpunpckhqdq %ymm4, %ymm8, %ymm4
+
+	vinserti128 $0x1, %xmm7, %ymm3, %ymm8
+	vperm2i128 $0x31, %ymm7, %ymm3, %ymm3
+
+	vperm2i128 $0x31, %ymm4, %ymm2, %ymm7
+	vfmadd213ps %ymm3, %ymm6, %ymm7
+	vinserti128 $0x1, %xmm4, %ymm2, %ymm3
+
+	/* Compute 2-part reciprocal component.  Construct a separate
+	   reduced argument modulo pi near pi/2 multiples. i.e. (pi/2 -
+	   x) mod pi, simply by subtracting the reduced argument from
+	   an accurate B_hi + B_lo = (128 - n) pi/128. Force the upper
+	   part of this reduced argument to half-length to simplify
+	   accurate reciprocation later on.  */
+	vsubps	%ymm6, %ymm13, %ymm2
+	vsubps	%ymm2, %ymm13, %ymm13
 	vsubps	%ymm6, %ymm13, %ymm13
-	vinsertf128 $1, %xmm5, %ymm9, %ymm5
-	vandps	%ymm1, %ymm14, %ymm9
-	vsubps	%ymm9, %ymm14, %ymm14
-
-	/* P4 = C3 + C4 * Z */
-	vfmadd213ps %ymm4, %ymm6, %ymm5
-	vaddps	%ymm14, %ymm15, %ymm15
-	vaddps	%ymm15, %ymm13, %ymm15
-
-	/*
-	 * Now compute an approximate reciprocal to mix into the computation
-	 * To avoid any danger of nonportability, force it to 12 bits,
-	 * though I suspect it always is anyway on current platforms.
-	 */
-	vrcpps	%ymm9, %ymm13
-	vandps	%ymm1, %ymm13, %ymm13
 
-	/*
-	 * Now compute the error sEr where sRecip_hi = (1/R_hi) * (1 - sEr)
-	 * so that we can compensate for it.
-	 */
-	vmovups	_sOne+__svml_stan_data_internal(%rip), %ymm1
-	vfnmadd213ps %ymm1, %ymm13, %ymm9
-
-	/*
-	 * Get a better approximation to  1/sR_hi (not far short of an ulp)
-	 * using a third-order polynomial approximation
-	 */
-	vmovaps	%ymm13, %ymm14
-	vfmadd213ps %ymm13, %ymm9, %ymm14
-	vfmadd231ps %ymm9, %ymm9, %ymm1
-	vmulps	%ymm1, %ymm14, %ymm1
-
-	/*
-	 * Multiply by sRecip_ok to make sR_lo relative to sR_hi
-	 * Since sR_lo is shifted off by about 12 bits, this is accurate enough.
-	 */
-	vmulps	%ymm1, %ymm15, %ymm14
-
-	/*
-	 * Now create a low reciprocal using
-	 * (Recip_hi + Er * Recip_ok) * (1 + sR_lo^2 - sR_lo)
-	 * =~= Recip_hi + Recip_ok * (Er + sR_lo^2 - sR_lo)
-	 */
-	vsubps	%ymm9, %ymm14, %ymm9
-	vfmsub213ps %ymm9, %ymm14, %ymm14
-	vmulps	%ymm14, %ymm1, %ymm9
-	vmovaps	%ymm2, %ymm1
-	vfmadd213ps %ymm3, %ymm6, %ymm1
-	vsubps	%ymm1, %ymm3, %ymm15
-	vmovaps	%ymm8, %ymm3
-	vfmadd213ps %ymm1, %ymm13, %ymm3
-	vfmadd213ps %ymm15, %ymm6, %ymm2
-	vfmsub213ps %ymm3, %ymm8, %ymm13
-	vfmadd213ps %ymm2, %ymm8, %ymm9
-	vaddps	%ymm13, %ymm1, %ymm2
+	/* Higher polynomial terms
+	   Stage 1 (with unlimited parallelism)
+	   P3 = C1_lo + C2 * Z.  */
+	vmovq	32(%rdx, %rax, 8), %xmm4
+	vpcmpgtd %ymm1, %ymm9, %ymm9
+	vmovmskps %ymm9, %eax
 
-	/* Z2 = Z^2 */
-	vmulps	%ymm6, %ymm6, %ymm1
-	vaddps	%ymm2, %ymm9, %ymm8
+	vmovq	32(%rdx, %rcx, 8), %xmm1
 
-	/*
-	 * Stage 2 (with unlimited parallelism)
-	 * P6 = C1_lo + C2 * Z + C3 * Z^2 + C4 * Z^3
-	 */
-	vfmadd213ps %ymm7, %ymm5, %ymm1
+	vinsertf128 $1, 32(%rdx, %r10, 8), %ymm4, %ymm4
+	vinsertf128 $1, 32(%rdx, %r8, 8), %ymm1, %ymm1
+	vpunpckldq %ymm1, %ymm4, %ymm9
 
-	/* P9 = trail(dominant part) + C0_lo */
-	vaddps	32(%rsp), %ymm8, %ymm5
+	vmovq	32(%rdx, %rdi, 8), %xmm1
+	vmovq	32(%rdx, %rsi, 8), %xmm4
 
-	/* Final accumulation of low part */
-	vfmadd213ps %ymm5, %ymm6, %ymm1
+	vinsertf128 $1, 32(%rdx, %r9, 8), %ymm1, %ymm1
+	vinsertf128 $1, 32(%rdx, %r11, 8), %ymm4, %ymm4
+	vpunpckldq %ymm4, %ymm1, %ymm1
 
-	/* And now the very final summation */
-	vaddps	%ymm1, %ymm3, %ymm6
+	vpunpckhqdq %ymm1, %ymm9, %ymm4
+	vpunpcklqdq %ymm1, %ymm9, %ymm9
 
-	/*
-	 *  The end of implementation (LA with huge args reduction)
-	 * End of large arguments path (_HA_, _LA_ and _EP_)
-	 */
 
-	vxorps	%ymm11, %ymm6, %ymm11
+	vmovups	COMMON_DATA(_Neg4096)(%rip), %ymm1
 
-	/* Merge results from main and large paths: */
-	vblendvps %ymm10, %ymm11, %ymm0, %ymm0
+	vfmadd213ps %ymm9, %ymm6, %ymm4
 
-	/* Return to main vector processing path */
-	jmp	L(AUX_BRANCH_RETURN)
-	# LOE r12 r13 r14 r15 eax ymm0 ymm12
-END(_ZGVdN8v_tanf_avx2)
+	vandps	%ymm1, %ymm2, %ymm9
+	vsubps	%ymm9, %ymm2, %ymm2
 
-	.section .rodata, "a"
-	.align	32
+	/* P4 = C3 + C4 * Z.  */
+	vaddps	%ymm2, %ymm0, %ymm0
+	vaddps	%ymm0, %ymm13, %ymm0
 
-.FLT_15:
-	.long	0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000
-	.type	.FLT_15, @object
-	.size	.FLT_15, 32
-	.align	32
+	/* Now compute an approximate reciprocal to mix into the computation
+	   To avoid any danger of nonportability, force it to 12 bits,
+	   though I suspect it always is anyway on current platforms.  */
+	vrcpps	%ymm9, %ymm13
+	vandps	%ymm1, %ymm13, %ymm13
 
-.FLT_16:
-	.long	0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000
-	.type	.FLT_16, @object
-	.size	.FLT_16, 32
-	.align	32
+	/* Now compute the error sEr where sRecip_hi = (1/R_hi) * (1 - sEr)
+	   so that we can compensate for it.  */
+	vfnmadd213ps %ymm15, %ymm13, %ymm9
+
+	/* Get a better approximation to  1/sR_hi (not far short of an ulp)
+	   using a third-order polynomial approximation.  */
+	vmovaps	%ymm13, %ymm2
+	vfmadd231ps %ymm9, %ymm9, %ymm15
+	vfmadd213ps %ymm13, %ymm9, %ymm2
+	vmulps	%ymm15, %ymm2, %ymm1
+
+	/* Multiply by sRecip_ok to make sR_lo relative to sR_hi Since
+	   sR_lo is shifted off by about 12 bits, this is accurate
+	   enough.  */
+	vmulps	%ymm1, %ymm0, %ymm2
+
+	/* Now create a low reciprocal using
+	   (Recip_hi + Er * Recip_ok) * (1 + sR_lo^2 - sR_lo)
+	   =~= Recip_hi + Recip_ok * (Er + sR_lo^2 - sR_lo).  */
+	vsubps	%ymm9, %ymm2, %ymm9
+	vfmsub213ps %ymm9, %ymm2, %ymm2
+	vmulps	%ymm2, %ymm1, %ymm9
+	vmovaps	%ymm14, %ymm1
+	vfmadd213ps %ymm3, %ymm6, %ymm1
+	vsubps	%ymm1, %ymm3, %ymm0
+	vmovaps	%ymm8, %ymm3
+	vfmadd213ps %ymm1, %ymm13, %ymm3
+	vfmadd213ps %ymm0, %ymm6, %ymm14
+	vfmsub213ps %ymm3, %ymm8, %ymm13
+	vfmadd213ps %ymm14, %ymm8, %ymm9
+	vaddps	%ymm13, %ymm1, %ymm2
 
-.FLT_17:
-	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
-	.type	.FLT_17, @object
-	.size	.FLT_17, 32
-	.align	32
+	/* Z2 = Z^2.  */
+	vmulps	%ymm6, %ymm6, %ymm1
+	vaddps	%ymm2, %ymm9, %ymm2
 
-.FLT_18:
-	.long	0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000
-	.type	.FLT_18, @object
-	.size	.FLT_18, 32
-	.align	32
+	/* Stage 2 (with unlimited parallelism)
+	   P6 = C1_lo + C2 * Z + C3 * Z^2 + C4 * Z^3.  */
+	vfmadd213ps %ymm7, %ymm4, %ymm1
 
-.FLT_19:
-	.long	0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff
-	.type	.FLT_19, @object
-	.size	.FLT_19, 32
-	.align	32
+	/* P9 = trail(dominant part) + C0_lo.  */
+	vaddps	%ymm5, %ymm2, %ymm4
 
-.FLT_20:
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
-	.type	.FLT_20, @object
-	.size	.FLT_20, 32
-	.align	32
+	/* Final accumulation of low part.  */
+	vfmadd213ps %ymm4, %ymm6, %ymm1
 
-.FLT_21:
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	.type	.FLT_21, @object
-	.size	.FLT_21, 32
-	.align	32
+	/* And now the very final summation.  */
+	vaddps	%ymm1, %ymm3, %ymm6
 
-.FLT_22:
-	.long	0x47400000, 0x47400000, 0x47400000, 0x47400000, 0x47400000, 0x47400000, 0x47400000, 0x47400000
-	.type	.FLT_22, @object
-	.size	.FLT_22, 32
-	.align	32
+	/* The end of implementation (LA with huge args reduction)
+	   End of large arguments path (_HA_, _LA_ and _EP_).  */
+	vpand	%ymm6, %ymm10, %ymm6
+	/* Merge results from main and large paths.  */
+	vpxor	%ymm6, %ymm12, %ymm0
 
-.FLT_23:
-	.long	0x28800000, 0x28800000, 0x28800000, 0x28800000, 0x28800000, 0x28800000, 0x28800000, 0x28800000
-	.type	.FLT_23, @object
-	.size	.FLT_23, 32
-	.align	32
 
-.FLT_24:
-	.long	0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff
-	.type	.FLT_24, @object
-	.size	.FLT_24, 32
-	.align	32
+	/* `al` has 0 at special values. If all 1s `incb al` will
+	   overflow and set zero flag.  */
+	incb	%al
+	/* Go to special inputs processing branch.  */
+	jne	L(SPECIAL_VALUES_BRANCH)
+	ret
 
-.FLT_25:
-	.long	0x34000000, 0x34000000, 0x34000000, 0x34000000, 0x34000000, 0x34000000, 0x34000000, 0x34000000
-	.type	.FLT_25, @object
-	.size	.FLT_25, 32
-	.align	32
+	/* Cold case. eax has 1s where there was a special value that
+	   needs to be handled by a tanf call. Optimize for code size
+	   more so than speed here.  */
+L(SPECIAL_VALUES_BRANCH):
 
-.FLT_26:
-	.long	0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff
-	.type	.FLT_26, @object
-	.size	.FLT_26, 32
-	.align	32
+	/* Use r13 to save/restore the stack. This allows us to use rbp
+	   as callee save register saving code size.  */
+	pushq	%r13
+	cfi_adjust_cfa_offset (8)
+	cfi_offset (r13, -16)
+	/* Need callee save registers to preserve state across tanf
+	   calls.  */
+	pushq	%rbx
+	cfi_adjust_cfa_offset (8)
+	cfi_offset (rbx, -24)
+	pushq	%rbp
+	cfi_adjust_cfa_offset (8)
+	cfi_offset (rbp, -32)
+	movq	%rsp, %r13
+	cfi_def_cfa_register (r13)
 
-.FLT_27:
-	.long	0x40c90fdb, 0x40c90fdb, 0x40c90fdb, 0x40c90fdb, 0x40c90fdb, 0x40c90fdb, 0x40c90fdb, 0x40c90fdb
-	.type	.FLT_27, @object
-	.size	.FLT_27, 32
-	.align	32
+	/* Align stack and make room for 2x ymm vectors.  */
+	andq	$-32, %rsp
+	addq	$-64, %rsp
 
-.FLT_28:
-	.long	0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e
-	.type	.FLT_28, @object
-	.size	.FLT_28, 32
-	.align	32
 
-.FLT_29:
-	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
-	.type	.FLT_29, @object
-	.size	.FLT_29, 32
-	.align	32
+	/* Save original input (ymm0 unchanged up to this point).  */
+	vmovaps	%ymm11, 32(%rsp)
+	vmovaps	%ymm0, (%rsp)
 
-.FLT_30:
-	.long	0x35800000, 0x35800000, 0x35800000, 0x35800000, 0x35800000, 0x35800000, 0x35800000, 0x35800000
-	.type	.FLT_30, @object
-	.size	.FLT_30, 32
-	.align	32
+	vzeroupper
 
-.FLT_31:
-	.long	0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff
-	.type	.FLT_31, @object
-	.size	.FLT_31, 32
-	.align	32
+	/* eax has 1s where there was a special value that needs to be
+	   handled by a tanf call.  */
+	negb	%al
+	movzbl	%al, %ebx
+L(SPECIAL_VALUES_LOOP):
 
-.FLT_32:
-	.long	0x0000007f, 0x0000007f, 0x0000007f, 0x0000007f, 0x0000007f, 0x0000007f, 0x0000007f, 0x0000007f
-	.type	.FLT_32, @object
-	.size	.FLT_32, 32
-	.align	32
+	/* Use rbp as index for special value that is saved across calls
+	   to tanf. We technically don't need a callee save register
+	   here as offset to rsp is always [0, 28] so we can restore
+	   rsp by realigning to 64. Essentially the tradeoff is 1 extra
+	   save/restore vs 2 extra instructions in the loop. Realigning
+	   also costs more code size.  */
+	xorl	%ebp, %ebp
+	tzcntl	%ebx, %ebp
+
+	/* Scalar math function call to process special input.  */
+	vmovss	32(%rsp, %rbp, 4), %xmm0
+	call	tanf@PLT
 
-.FLT_33:
-	.long	0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000
-	.type	.FLT_33, @object
-	.size	.FLT_33, 32
-	.align	32
+	/* No good way to avoid the store-forwarding fault this will
+	   cause on return. `lfence` avoids the SF fault but at greater
+	   cost as it serializes stack/callee save restoration.  */
+	vmovss	%xmm0, (%rsp, %rbp, 4)
 
-#ifdef __svml_stan_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(32)) VUINT32 _sInvPI_uisa[8][1];
-	__declspec(align(32)) VUINT32 _sPI1_uisa[8][1];
-	__declspec(align(32)) VUINT32 _sPI2_uisa[8][1];
-	__declspec(align(32)) VUINT32 _sPI3_uisa[8][1];
-	__declspec(align(32)) VUINT32 _sPI2_ha_uisa[8][1];
-	__declspec(align(32)) VUINT32 _sPI3_ha_uisa[8][1];
-	__declspec(align(32)) VUINT32 Th_tbl_uisa[32][1];
-	__declspec(align(32)) VUINT32 Tl_tbl_uisa[32][1];
-	__declspec(align(32)) VUINT32 _sPC3_uisa[8][1];
-	__declspec(align(32)) VUINT32 _sPC5_uisa[8][1];
-	__declspec(align(32)) VUINT32 _sRangeReductionVal_uisa[8][1];
-	__declspec(align(32)) VUINT32 _sInvPi[8][1];
-	__declspec(align(32)) VUINT32 _sSignMask[8][1];
-	__declspec(align(32)) VUINT32 _sAbsMask[8][1];
-	__declspec(align(32)) VUINT32 _sRangeVal[8][1];
-	__declspec(align(32)) VUINT32 _sRShifter[8][1];
-	__declspec(align(32)) VUINT32 _sOne[8][1];
-	__declspec(align(32)) VUINT32 _sRangeReductionVal[8][1];
-	__declspec(align(32)) VUINT32 _sPI1[8][1];
-	__declspec(align(32)) VUINT32 _sPI2[8][1];
-	__declspec(align(32)) VUINT32 _sPI3[8][1];
-	__declspec(align(32)) VUINT32 _sPI4[8][1];
-	__declspec(align(32)) VUINT32 _sPI1_FMA[8][1];
-	__declspec(align(32)) VUINT32 _sPI2_FMA[8][1];
-	__declspec(align(32)) VUINT32 _sPI3_FMA[8][1];
-	__declspec(align(32)) VUINT32 _sP0[8][1];
-	__declspec(align(32)) VUINT32 _sP1[8][1];
-	__declspec(align(32)) VUINT32 _sQ0[8][1];
-	__declspec(align(32)) VUINT32 _sQ1[8][1];
-	__declspec(align(32)) VUINT32 _sQ2[8][1];
-	__declspec(align(32)) VUINT32 _sTwo[8][1];
-	__declspec(align(32)) VUINT32 _sCoeffs[128][10][1];
-} __svml_stan_data_internal;
-#endif
-__svml_stan_data_internal:
-	/* UISA */
-	.long	0x4122f983, 0x4122f983, 0x4122f983, 0x4122f983, 0x4122f983, 0x4122f983, 0x4122f983, 0x4122f983 /* _sInvPI_uisa */
-	.align	32
-	.long	0x3dc90fda, 0x3dc90fda, 0x3dc90fda, 0x3dc90fda, 0x3dc90fda, 0x3dc90fda, 0x3dc90fda, 0x3dc90fda /* _sPI1_uisa */
-	.align	32
-	.long	0x31a22168, 0x31a22168, 0x31a22168, 0x31a22168, 0x31a22168, 0x31a22168, 0x31a22168, 0x31a22168 /* _sPI2_uisa */
-	.align	32
-	.long	0x25c234c5, 0x25c234c5, 0x25c234c5, 0x25c234c5, 0x25c234c5, 0x25c234c5, 0x25c234c5, 0x25c234c5 /* _sPI3_uisa */
-	.align	32
-	.long	0x31a22000, 0x31a22000, 0x31a22000, 0x31a22000, 0x31a22000, 0x31a22000, 0x31a22000, 0x31a22000 /* _sPI2_ha_uisa */
-	.align	32
-	.long	0x2a34611a, 0x2a34611a, 0x2a34611a, 0x2a34611a, 0x2a34611a, 0x2a34611a, 0x2a34611a, 0x2a34611a /* _sPI3_ha_uisa */
-	/* Th_tbl_uisa for i from 0 to 31 do printsingle(tan(i*Pi/32)); */
-	.align	32
-	.long	0x80000000, 0x3dc9b5dc, 0x3e4bafaf, 0x3e9b5042
-	.long	0x3ed413cd, 0x3f08d5b9, 0x3f2b0dc1, 0x3f521801
-	.long	0x3f800000, 0x3f9bf7ec, 0x3fbf90c7, 0x3fef789e
-	.long	0x401a827a, 0x4052facf, 0x40a0dff7, 0x41227363
-	.long	0xff7fffff, 0xc1227363, 0xc0a0dff7, 0xc052facf
-	.long	0xc01a827a, 0xbfef789e, 0xbfbf90c7, 0xbf9bf7ec
-	.long	0xbf800000, 0xbf521801, 0xbf2b0dc1, 0xbf08d5b9
-	.long	0xbed413cd, 0xbe9b5042, 0xbe4bafaf, 0xbdc9b5dc
-	/* Tl_tbl_uisa for i from 0 to 31 do printsingle(tan(i*Pi/32)-round(tan(i*Pi/32), SG, RN)); */
-	.align	32
-	.long	0x80000000, 0x3145b2da, 0x2f2a62b0, 0xb22a39c2
-	.long	0xb1c0621a, 0xb25ef963, 0x32ab7f99, 0x32ae4285
-	.long	0x00000000, 0x33587608, 0x32169d18, 0xb30c3ec0
-	.long	0xb3cc0622, 0x3390600e, 0x331091dc, 0xb454a046
-	.long	0xf3800000, 0x3454a046, 0xb31091dc, 0xb390600e
-	.long	0x33cc0622, 0x330c3ec0, 0xb2169d18, 0xb3587608
-	.long	0x00000000, 0xb2ae4285, 0xb2ab7f99, 0x325ef963
-	.long	0x31c0621a, 0x322a39c2, 0xaf2a62b0, 0xb145b2da
-	.align	32
-	.long	0x3eaaaaa6, 0x3eaaaaa6, 0x3eaaaaa6, 0x3eaaaaa6, 0x3eaaaaa6, 0x3eaaaaa6, 0x3eaaaaa6, 0x3eaaaaa6 /* _sPC3_uisa */
-	.align	32
-	.long	0x3e08b888, 0x3e08b888, 0x3e08b888, 0x3e08b888, 0x3e08b888, 0x3e08b888, 0x3e08b888, 0x3e08b888 /* _sPC5_uisa */
-	.align	32
-	.long	0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000 /* _sRangeReductionVal_uisa */
-	.align	32
-	.long	0x3F22F983, 0x3F22F983, 0x3F22F983, 0x3F22F983, 0x3F22F983, 0x3F22F983, 0x3F22F983, 0x3F22F983 /* _sInvPi */
-	.align	32
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSignMask */
-	.align	32
-	.long	0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF /* _sAbsMask */
-	.align	32
-	.long	0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 /* _sRangeVal */
-	.align	32
-	.long	0x4B400000, 0x4B400000, 0x4B400000, 0x4B400000, 0x4B400000, 0x4B400000, 0x4B400000, 0x4B400000 /* _sRShifter */
-	.align	32
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 /* _sOne */
-	.align	32
-	.long	0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000, 0x46010000 /* _sRangeVal */
-	.align	32
-	.long	0x3FC90000, 0x3FC90000, 0x3FC90000, 0x3FC90000, 0x3FC90000, 0x3FC90000, 0x3FC90000, 0x3FC90000 /* _sPI1 */
-	.align	32
-	.long	0x39FDA000, 0x39FDA000, 0x39FDA000, 0x39FDA000, 0x39FDA000, 0x39FDA000, 0x39FDA000, 0x39FDA000 /* _sPI2 */
-	.align	32
-	.long	0x33A22000, 0x33A22000, 0x33A22000, 0x33A22000, 0x33A22000, 0x33A22000, 0x33A22000, 0x33A22000 /* _sPI3 */
-	.align	32
-	.long	0x2C34611A, 0x2C34611A, 0x2C34611A, 0x2C34611A, 0x2C34611A, 0x2C34611A, 0x2C34611A, 0x2C34611A /* _sPI4 */
-	// PI1, PI2, and PI3 when FMA is available
-	.align	32
-	.long	0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB /* _sPI1_FMA */
-	.align	32
-	.long	0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E /* _sPI2_FMA */
-	.align	32
-	.long	0xA6F72CED, 0xA6F72CED, 0xA6F72CED, 0xA6F72CED, 0xA6F72CED, 0xA6F72CED, 0xA6F72CED, 0xA6F72CED /* _sPI3_FMA */
-	.align	32
-	.long	0x3F7FFFFC, 0x3F7FFFFC, 0x3F7FFFFC, 0x3F7FFFFC, 0x3F7FFFFC, 0x3F7FFFFC, 0x3F7FFFFC, 0x3F7FFFFC /* _sP0 */
-	.align	32
-	.long	0xBDC433B4, 0xBDC433B4, 0xBDC433B4, 0xBDC433B4, 0xBDC433B4, 0xBDC433B4, 0xBDC433B4, 0xBDC433B4 /* _sP1 */
-	.align	32
-	.long	0x3F7FFFFC, 0x3F7FFFFC, 0x3F7FFFFC, 0x3F7FFFFC, 0x3F7FFFFC, 0x3F7FFFFC, 0x3F7FFFFC, 0x3F7FFFFC /* _sQ0 */
-	.align	32
-	.long	0xBEDBB7AB, 0xBEDBB7AB, 0xBEDBB7AB, 0xBEDBB7AB, 0xBEDBB7AB, 0xBEDBB7AB, 0xBEDBB7AB, 0xBEDBB7AB /* _sQ1 */
-	.align	32
-	.long	0x3C1F336B, 0x3C1F336B, 0x3C1F336B, 0x3C1F336B, 0x3C1F336B, 0x3C1F336B, 0x3C1F336B, 0x3C1F336B /* _sQ2 */
-	.align	32
-	.long	0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000 /* _sTwo */
-	// _sCoeffs Breakpoint B = 0 * pi/128, function tan(B + x)
-	.align	32
-	.long	0x3FC90FDB // B' = pi/2 - B (high single)
-	.long	0xB33BBD2E // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x00000000 // c0 (high single)
-	.long	0x00000000 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x00000000 // c1 (low single)
-	.long	0x00000000 // c2
-	.long	0x3EAAACDD // c3
-	.long	0x00000000 // c4
-	.long	0x3FC5EB9B // B' = pi/2 - B (high single)
-	.long	0x32DE638C // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3CC91A31 // c0 (high single)
-	.long	0x2F8E8D1A // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3A1DFA00 // c1 (low single)
-	.long	0x3CC9392D // c2
-	.long	0x3EAB1889 // c3
-	.long	0x3C885D3B // c4
-	.long	0x3FC2C75C // B' = pi/2 - B (high single)
-	.long	0xB2CBBE8A // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3D49393C // c0 (high single)
-	.long	0x30A39F5B // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3B1E2B00 // c1 (low single)
-	.long	0x3D49B5D4 // c2
-	.long	0x3EAC4F10 // c3
-	.long	0x3CFD9425 // c4
-	.long	0x3FBFA31C // B' = pi/2 - B (high single)
-	.long	0x33450FB0 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3D9711CE // c0 (high single)
-	.long	0x314FEB28 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3BB24C00 // c1 (low single)
-	.long	0x3D97E43A // c2
-	.long	0x3EAE6A89 // c3
-	.long	0x3D4D07E0 // c4
-	.long	0x3FBC7EDD // B' = pi/2 - B (high single)
-	.long	0xB1800ADD // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3DC9B5DC // c0 (high single)
-	.long	0x3145AD86 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3C1EEF20 // c1 (low single)
-	.long	0x3DCBAAEA // c2
-	.long	0x3EB14E5E // c3
-	.long	0x3D858BB2 // c4
-	.long	0x3FB95A9E // B' = pi/2 - B (high single)
-	.long	0xB3651267 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3DFC98C2 // c0 (high single)
-	.long	0xB0AE525C // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3C793D20 // c1 (low single)
-	.long	0x3E003845 // c2
-	.long	0x3EB5271F // c3
-	.long	0x3DAC669E // c4
-	.long	0x3FB6365E // B' = pi/2 - B (high single)
-	.long	0x328BB91C // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3E17E564 // c0 (high single)
-	.long	0xB1C5A2E4 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3CB440D0 // c1 (low single)
-	.long	0x3E1B3D00 // c2
-	.long	0x3EB9F664 // c3
-	.long	0x3DD647C0 // c4
-	.long	0x3FB3121F // B' = pi/2 - B (high single)
-	.long	0xB30F347D // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3E31AE4D // c0 (high single)
-	.long	0xB1F32251 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3CF6A500 // c1 (low single)
-	.long	0x3E3707DA // c2
-	.long	0x3EBFA489 // c3
-	.long	0x3DFBD9C7 // c4
-	.long	0x3FAFEDDF // B' = pi/2 - B (high single)
-	.long	0x331BBA77 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3E4BAFAF // c0 (high single)
-	.long	0x2F2A29E0 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3D221018 // c1 (low single)
-	.long	0x3E53BED0 // c2
-	.long	0x3EC67E26 // c3
-	.long	0x3E1568E2 // c4
-	.long	0x3FACC9A0 // B' = pi/2 - B (high single)
-	.long	0xB2655A50 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3E65F267 // c0 (high single)
-	.long	0x31B4B1DF // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3D4E8B90 // c1 (low single)
-	.long	0x3E718ACA // c2
-	.long	0x3ECE7164 // c3
-	.long	0x3E2DC161 // c4
-	.long	0x3FA9A560 // B' = pi/2 - B (high single)
-	.long	0x33719861 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3E803FD4 // c0 (high single)
-	.long	0xB2279E66 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3D807FC8 // c1 (low single)
-	.long	0x3E884BD4 // c2
-	.long	0x3ED7812D // c3
-	.long	0x3E4636EB // c4
-	.long	0x3FA68121 // B' = pi/2 - B (high single)
-	.long	0x31E43AAC // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3E8DB082 // c0 (high single)
-	.long	0xB132A234 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3D9CD7D0 // c1 (low single)
-	.long	0x3E988A60 // c2
-	.long	0x3EE203E3 // c3
-	.long	0x3E63582C // c4
-	.long	0x3FA35CE2 // B' = pi/2 - B (high single)
-	.long	0xB33889B6 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3E9B5042 // c0 (high single)
-	.long	0xB22A3AEE // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3DBC7490 // c1 (low single)
-	.long	0x3EA99AF5 // c2
-	.long	0x3EEDE107 // c3
-	.long	0x3E80E9AA // c4
-	.long	0x3FA038A2 // B' = pi/2 - B (high single)
-	.long	0x32E4CA7E // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3EA92457 // c0 (high single)
-	.long	0x30B80830 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3DDF8200 // c1 (low single)
-	.long	0x3EBB99E9 // c2
-	.long	0x3EFB4AA8 // c3
-	.long	0x3E9182BE // c4
-	.long	0x3F9D1463 // B' = pi/2 - B (high single)
-	.long	0xB2C55799 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3EB73250 // c0 (high single)
-	.long	0xB2028823 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3E0318F8 // c1 (low single)
-	.long	0x3ECEA678 // c2
-	.long	0x3F053C67 // c3
-	.long	0x3EA41E53 // c4
-	.long	0x3F99F023 // B' = pi/2 - B (high single)
-	.long	0x33484328 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3EC5800D // c0 (high single)
-	.long	0xB214C3C1 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3E185E54 // c1 (low single)
-	.long	0x3EE2E342 // c2
-	.long	0x3F0DCA73 // c3
-	.long	0x3EB8CC21 // c4
-	.long	0x3F96CBE4 // B' = pi/2 - B (high single)
-	.long	0xB14CDE2E // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3ED413CD // c0 (high single)
-	.long	0xB1C06152 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3E2FB0CC // c1 (low single)
-	.long	0x3EF876CB // c2
-	.long	0x3F177807 // c3
-	.long	0x3ED08437 // c4
-	.long	0x3F93A7A5 // B' = pi/2 - B (high single)
-	.long	0xB361DEEE // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3EE2F439 // c0 (high single)
-	.long	0xB1F4399E // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3E49341C // c1 (low single)
-	.long	0x3F07C61A // c2
-	.long	0x3F22560F // c3
-	.long	0x3EEAA81E // c4
-	.long	0x3F908365 // B' = pi/2 - B (high single)
-	.long	0x3292200D // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3EF22870 // c0 (high single)
-	.long	0x325271F4 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3E65107A // c1 (low single)
-	.long	0x3F1429F0 // c2
-	.long	0x3F2E8AFC // c3
-	.long	0x3F040498 // c4
-	.long	0x3F8D5F26 // B' = pi/2 - B (high single)
-	.long	0xB30C0105 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3F00DC0D // c0 (high single)
-	.long	0xB214AF72 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3E81B994 // c1 (low single)
-	.long	0x3F218233 // c2
-	.long	0x3F3C4531 // c3
-	.long	0x3F149688 // c4
-	.long	0x3F8A3AE6 // B' = pi/2 - B (high single)
-	.long	0x331EEDF0 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3F08D5B9 // c0 (high single)
-	.long	0xB25EF98E // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3E92478D // c1 (low single)
-	.long	0x3F2FEDC9 // c2
-	.long	0x3F4BCD58 // c3
-	.long	0x3F27AE9E // c4
-	.long	0x3F8716A7 // B' = pi/2 - B (high single)
-	.long	0xB2588C6D // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3F1105AF // c0 (high single)
-	.long	0x32F045B0 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3EA44EE2 // c1 (low single)
-	.long	0x3F3F8FDB // c2
-	.long	0x3F5D3FD0 // c3
-	.long	0x3F3D0A23 // c4
-	.long	0x3F83F267 // B' = pi/2 - B (high single)
-	.long	0x3374CBD9 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3F1970C4 // c0 (high single)
-	.long	0x32904848 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3EB7EFF8 // c1 (low single)
-	.long	0x3F50907C // c2
-	.long	0x3F710FEA // c3
-	.long	0x3F561FED // c4
-	.long	0x3F80CE28 // B' = pi/2 - B (high single)
-	.long	0x31FDD672 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3F221C37 // c0 (high single)
-	.long	0xB20C61DC // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3ECD4F71 // c1 (low single)
-	.long	0x3F631DAA // c2
-	.long	0x3F83B471 // c3
-	.long	0x3F7281EA // c4
-	.long	0x3F7B53D1 // B' = pi/2 - B (high single)
-	.long	0x32955386 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3F2B0DC1 // c0 (high single)
-	.long	0x32AB7EBA // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3EE496C2 // c1 (low single)
-	.long	0x3F776C40 // c2
-	.long	0x3F9065C1 // c3
-	.long	0x3F89AFB6 // c4
-	.long	0x3F750B52 // B' = pi/2 - B (high single)
-	.long	0x32EB316F // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3F344BA9 // c0 (high single)
-	.long	0xB2B8B0EA // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3EFDF4F7 // c1 (low single)
-	.long	0x3F86DCA8 // c2
-	.long	0x3F9ED53B // c3
-	.long	0x3F9CBEDE // c4
-	.long	0x3F6EC2D4 // B' = pi/2 - B (high single)
-	.long	0xB2BEF0A7 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3F3DDCCF // c0 (high single)
-	.long	0x32D29606 // c0 (low single)
-	.long	0x40000000 // c1 (high 1 bit)
-	.long	0xBEE6606F // c1 (low single)
-	.long	0x3F9325D6 // c2
-	.long	0x3FAF4E69 // c3
-	.long	0x3FB3080C // c4
-	.long	0x3F687A55 // B' = pi/2 - B (high single)
-	.long	0xB252257B // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3F47C8CC // c0 (high single)
-	.long	0xB200F51A // c0 (low single)
-	.long	0x40000000 // c1 (high 1 bit)
-	.long	0xBEC82C6C // c1 (low single)
-	.long	0x3FA0BAE9 // c2
-	.long	0x3FC2252F // c3
-	.long	0x3FCD24C7 // c4
-	.long	0x3F6231D6 // B' = pi/2 - B (high single)
-	.long	0xB119A6A2 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3F521801 // c0 (high single)
-	.long	0x32AE4178 // c0 (low single)
-	.long	0x40000000 // c1 (high 1 bit)
-	.long	0xBEA72938 // c1 (low single)
-	.long	0x3FAFCC22 // c2
-	.long	0x3FD7BD4A // c3
-	.long	0x3FEBB01B // c4
-	.long	0x3F5BE957 // B' = pi/2 - B (high single)
-	.long	0x3205522A // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3F5CD3BE // c0 (high single)
-	.long	0x31460308 // c0 (low single)
-	.long	0x40000000 // c1 (high 1 bit)
-	.long	0xBE8306C5 // c1 (low single)
-	.long	0x3FC09232 // c2
-	.long	0x3FF09632 // c3
-	.long	0x4007DB00 // c4
-	.long	0x3F55A0D8 // B' = pi/2 - B (high single)
-	.long	0x329886FF // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3F68065E // c0 (high single)
-	.long	0x32670D1A // c0 (low single)
-	.long	0x40000000 // c1 (high 1 bit)
-	.long	0xBE36D1D6 // c1 (low single)
-	.long	0x3FD35007 // c2
-	.long	0x4006A861 // c3
-	.long	0x401D4BDA // c4
-	.long	0x3F4F5859 // B' = pi/2 - B (high single)
-	.long	0x32EE64E8 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0x3F73BB75 // c0 (high single)
-	.long	0x32FC908D // c0 (low single)
-	.long	0x40000000 // c1 (high 1 bit)
-	.long	0xBDBF94B0 // c1 (low single)
-	.long	0x3FE8550F // c2
-	.long	0x40174F67 // c3
-	.long	0x4036C608 // c4
-	.long	0x3F490FDB // B' = pi/2 - B (high single)
-	.long	0xB2BBBD2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE8BE60E // c0 (high single)
-	.long	0x320D8D84 // c0 (low single)
-	.long	0x3F000000 // c1 (high 1 bit)
-	.long	0xBDF817B1 // c1 (low single)
-	.long	0xBD8345EB // c2
-	.long	0x3D1DFDAC // c3
-	.long	0xBC52CF6F // c4
-	.long	0x3F42C75C // B' = pi/2 - B (high single)
-	.long	0xB24BBE8A // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE87283F // c0 (high single)
-	.long	0xB268B966 // c0 (low single)
-	.long	0x3F000000 // c1 (high 1 bit)
-	.long	0xBDFE6529 // c1 (low single)
-	.long	0xBD7B1953 // c2
-	.long	0x3D18E109 // c3
-	.long	0xBC4570B0 // c4
-	.long	0x3F3C7EDD // B' = pi/2 - B (high single)
-	.long	0xB1000ADD // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE827420 // c0 (high single)
-	.long	0x320B8B4D // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DFB9428 // c1 (low single)
-	.long	0xBD7002B4 // c2
-	.long	0x3D142A6C // c3
-	.long	0xBC3A47FF // c4
-	.long	0x3F36365E // B' = pi/2 - B (high single)
-	.long	0x320BB91C // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE7B9282 // c0 (high single)
-	.long	0xB13383D2 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DF5D211 // c1 (low single)
-	.long	0xBD6542B3 // c2
-	.long	0x3D0FE5E5 // c3
-	.long	0xBC31FB14 // c4
-	.long	0x3F2FEDDF // B' = pi/2 - B (high single)
-	.long	0x329BBA77 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE724E73 // c0 (high single)
-	.long	0x3120C3E2 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DF05283 // c1 (low single)
-	.long	0xBD5AD45E // c2
-	.long	0x3D0BAFBF // c3
-	.long	0xBC27B8BB // c4
-	.long	0x3F29A560 // B' = pi/2 - B (high single)
-	.long	0x32F19861 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE691B44 // c0 (high single)
-	.long	0x31F18936 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DEB138B // c1 (low single)
-	.long	0xBD50B2F7 // c2
-	.long	0x3D07BE3A // c3
-	.long	0xBC1E46A7 // c4
-	.long	0x3F235CE2 // B' = pi/2 - B (high single)
-	.long	0xB2B889B6 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE5FF82C // c0 (high single)
-	.long	0xB170723A // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DE61354 // c1 (low single)
-	.long	0xBD46DA06 // c2
-	.long	0x3D0401F8 // c3
-	.long	0xBC14E013 // c4
-	.long	0x3F1D1463 // B' = pi/2 - B (high single)
-	.long	0xB2455799 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE56E46B // c0 (high single)
-	.long	0x31E3F001 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DE15025 // c1 (low single)
-	.long	0xBD3D4550 // c2
-	.long	0x3D00462D // c3
-	.long	0xBC092C98 // c4
-	.long	0x3F16CBE4 // B' = pi/2 - B (high single)
-	.long	0xB0CCDE2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE4DDF41 // c0 (high single)
-	.long	0xB1AEA094 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DDCC85C // c1 (low single)
-	.long	0xBD33F0BE // c2
-	.long	0x3CFA23B0 // c3
-	.long	0xBC01FCF7 // c4
-	.long	0x3F108365 // B' = pi/2 - B (high single)
-	.long	0x3212200D // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE44E7F8 // c0 (high single)
-	.long	0xB1CAA3CB // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DD87A74 // c1 (low single)
-	.long	0xBD2AD885 // c2
-	.long	0x3CF3C785 // c3
-	.long	0xBBF1E348 // c4
-	.long	0x3F0A3AE6 // B' = pi/2 - B (high single)
-	.long	0x329EEDF0 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE3BFDDC // c0 (high single)
-	.long	0xB132521A // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DD464FC // c1 (low single)
-	.long	0xBD21F8F1 // c2
-	.long	0x3CEE3076 // c3
-	.long	0xBBE6D263 // c4
-	.long	0x3F03F267 // B' = pi/2 - B (high single)
-	.long	0x32F4CBD9 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE33203E // c0 (high single)
-	.long	0x31FEF5BE // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DD0869C // c1 (low single)
-	.long	0xBD194E8C // c2
-	.long	0x3CE8DCA9 // c3
-	.long	0xBBDADA55 // c4
-	.long	0x3EFB53D1 // B' = pi/2 - B (high single)
-	.long	0x32155386 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE2A4E71 // c0 (high single)
-	.long	0xB19CFCEC // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DCCDE11 // c1 (low single)
-	.long	0xBD10D605 // c2
-	.long	0x3CE382A7 // c3
-	.long	0xBBC8BD97 // c4
-	.long	0x3EEEC2D4 // B' = pi/2 - B (high single)
-	.long	0xB23EF0A7 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE2187D0 // c0 (high single)
-	.long	0xB1B7C7F7 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DC96A2B // c1 (low single)
-	.long	0xBD088C22 // c2
-	.long	0x3CDE950E // c3
-	.long	0xBBB89AD1 // c4
-	.long	0x3EE231D6 // B' = pi/2 - B (high single)
-	.long	0xB099A6A2 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE18CBB7 // c0 (high single)
-	.long	0xAFE28430 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DC629CE // c1 (low single)
-	.long	0xBD006DCD // c2
-	.long	0x3CDA5A2C // c3
-	.long	0xBBB0B3D2 // c4
-	.long	0x3ED5A0D8 // B' = pi/2 - B (high single)
-	.long	0x321886FF // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE101985 // c0 (high single)
-	.long	0xB02FB2B8 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DC31BF3 // c1 (low single)
-	.long	0xBCF0F04D // c2
-	.long	0x3CD60BC7 // c3
-	.long	0xBBA138BA // c4
-	.long	0x3EC90FDB // B' = pi/2 - B (high single)
-	.long	0xB23BBD2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBE07709D // c0 (high single)
-	.long	0xB18A2A83 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DC03FA2 // c1 (low single)
-	.long	0xBCE15096 // c2
-	.long	0x3CD26472 // c3
-	.long	0xBB9A1270 // c4
-	.long	0x3EBC7EDD // B' = pi/2 - B (high single)
-	.long	0xB0800ADD // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBDFDA0CB // c0 (high single)
-	.long	0x2F14FCA0 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DBD93F7 // c1 (low single)
-	.long	0xBCD1F71B // c2
-	.long	0x3CCEDD2B // c3
-	.long	0xBB905946 // c4
-	.long	0x3EAFEDDF // B' = pi/2 - B (high single)
-	.long	0x321BBA77 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBDEC708C // c0 (high single)
-	.long	0xB14895C4 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DBB181E // c1 (low single)
-	.long	0xBCC2DEA6 // c2
-	.long	0x3CCB5027 // c3
-	.long	0xBB7F3969 // c4
-	.long	0x3EA35CE2 // B' = pi/2 - B (high single)
-	.long	0xB23889B6 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBDDB4F55 // c0 (high single)
-	.long	0x30F6437E // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DB8CB52 // c1 (low single)
-	.long	0xBCB40210 // c2
-	.long	0x3CC82D45 // c3
-	.long	0xBB643075 // c4
-	.long	0x3E96CBE4 // B' = pi/2 - B (high single)
-	.long	0xB04CDE2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBDCA3BFF // c0 (high single)
-	.long	0x311C95EA // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DB6ACDE // c1 (low single)
-	.long	0xBCA55C5B // c2
-	.long	0x3CC5BC04 // c3
-	.long	0xBB63A969 // c4
-	.long	0x3E8A3AE6 // B' = pi/2 - B (high single)
-	.long	0x321EEDF0 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBDB93569 // c0 (high single)
-	.long	0xAFB9ED00 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DB4BC1F // c1 (low single)
-	.long	0xBC96E905 // c2
-	.long	0x3CC2E6F5 // c3
-	.long	0xBB3E10A6 // c4
-	.long	0x3E7B53D1 // B' = pi/2 - B (high single)
-	.long	0x31955386 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBDA83A77 // c0 (high single)
-	.long	0x316D967A // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DB2F87C // c1 (low single)
-	.long	0xBC88A31F // c2
-	.long	0x3CC0E763 // c3
-	.long	0xBB3F1666 // c4
-	.long	0x3E6231D6 // B' = pi/2 - B (high single)
-	.long	0xB019A6A2 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBD974A0D // c0 (high single)
-	.long	0xB14F365B // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DB1616F // c1 (low single)
-	.long	0xBC750CD8 // c2
-	.long	0x3CBEB595 // c3
-	.long	0xBB22B883 // c4
-	.long	0x3E490FDB // B' = pi/2 - B (high single)
-	.long	0xB1BBBD2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBD866317 // c0 (high single)
-	.long	0xAFF02140 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DAFF67D // c1 (low single)
-	.long	0xBC591CD0 // c2
-	.long	0x3CBCBEAD // c3
-	.long	0xBB04BBEC // c4
-	.long	0x3E2FEDDF // B' = pi/2 - B (high single)
-	.long	0x319BBA77 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBD6B08FF // c0 (high single)
-	.long	0xB0EED236 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DAEB739 // c1 (low single)
-	.long	0xBC3D6D51 // c2
-	.long	0x3CBB485D // c3
-	.long	0xBAFFF5BA // c4
-	.long	0x3E16CBE4 // B' = pi/2 - B (high single)
-	.long	0xAFCCDE2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBD495A6C // c0 (high single)
-	.long	0xB0A427BD // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DADA345 // c1 (low single)
-	.long	0xBC21F648 // c2
-	.long	0x3CB9D1B4 // c3
-	.long	0xBACB5567 // c4
-	.long	0x3DFB53D1 // B' = pi/2 - B (high single)
-	.long	0x31155386 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBD27B856 // c0 (high single)
-	.long	0xB0F7EE91 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DACBA4E // c1 (low single)
-	.long	0xBC06AEE3 // c2
-	.long	0x3CB8E5DC // c3
-	.long	0xBAEC00EE // c4
-	.long	0x3DC90FDB // B' = pi/2 - B (high single)
-	.long	0xB13BBD2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBD0620A3 // c0 (high single)
-	.long	0xB0ECAB40 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DABFC11 // c1 (low single)
-	.long	0xBBD7200F // c2
-	.long	0x3CB79475 // c3
-	.long	0xBA2B0ADC // c4
-	.long	0x3D96CBE4 // B' = pi/2 - B (high single)
-	.long	0xAF4CDE2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBCC92278 // c0 (high single)
-	.long	0x302F2E68 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DAB6854 // c1 (low single)
-	.long	0xBBA1214F // c2
-	.long	0x3CB6C1E9 // c3
-	.long	0x3843C2F3 // c4
-	.long	0x3D490FDB // B' = pi/2 - B (high single)
-	.long	0xB0BBBD2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBC861015 // c0 (high single)
-	.long	0xAFD68E2E // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DAAFEEB // c1 (low single)
-	.long	0xBB569F3F // c2
-	.long	0x3CB6A84E // c3
-	.long	0xBAC64194 // c4
-	.long	0x3CC90FDB // B' = pi/2 - B (high single)
-	.long	0xB03BBD2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0xBC060BF3 // c0 (high single)
-	.long	0x2FE251AE // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DAABFB9 // c1 (low single)
-	.long	0xBAD67C60 // c2
-	.long	0x3CB64CA5 // c3
-	.long	0xBACDE881 // c4
-	.long	0x00000000 // B' = pi/2 - B (high single)
-	.long	0x00000000 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x00000000 // c0 (high single)
-	.long	0x00000000 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DAAAAAB // c1 (low single)
-	.long	0x00000000 // c2
-	.long	0x3CB5E28B // c3
-	.long	0x00000000 // c4
-	.long	0xBCC90FDB // B' = pi/2 - B (high single)
-	.long	0x303BBD2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3C060BF3 // c0 (high single)
-	.long	0xAFE251AE // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DAABFB9 // c1 (low single)
-	.long	0x3AD67C60 // c2
-	.long	0x3CB64CA5 // c3
-	.long	0x3ACDE881 // c4
-	.long	0xBD490FDB // B' = pi/2 - B (high single)
-	.long	0x30BBBD2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3C861015 // c0 (high single)
-	.long	0x2FD68E2E // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DAAFEEB // c1 (low single)
-	.long	0x3B569F3F // c2
-	.long	0x3CB6A84E // c3
-	.long	0x3AC64194 // c4
-	.long	0xBD96CBE4 // B' = pi/2 - B (high single)
-	.long	0x2F4CDE2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3CC92278 // c0 (high single)
-	.long	0xB02F2E68 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DAB6854 // c1 (low single)
-	.long	0x3BA1214F // c2
-	.long	0x3CB6C1E9 // c3
-	.long	0xB843C2F2 // c4
-	.long	0xBDC90FDB // B' = pi/2 - B (high single)
-	.long	0x313BBD2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3D0620A3 // c0 (high single)
-	.long	0x30ECAB40 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DABFC11 // c1 (low single)
-	.long	0x3BD7200F // c2
-	.long	0x3CB79475 // c3
-	.long	0x3A2B0ADC // c4
-	.long	0xBDFB53D1 // B' = pi/2 - B (high single)
-	.long	0xB1155386 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3D27B856 // c0 (high single)
-	.long	0x30F7EE91 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DACBA4E // c1 (low single)
-	.long	0x3C06AEE3 // c2
-	.long	0x3CB8E5DC // c3
-	.long	0x3AEC00EE // c4
-	.long	0xBE16CBE4 // B' = pi/2 - B (high single)
-	.long	0x2FCCDE2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3D495A6C // c0 (high single)
-	.long	0x30A427BD // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DADA345 // c1 (low single)
-	.long	0x3C21F648 // c2
-	.long	0x3CB9D1B4 // c3
-	.long	0x3ACB5567 // c4
-	.long	0xBE2FEDDF // B' = pi/2 - B (high single)
-	.long	0xB19BBA77 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3D6B08FF // c0 (high single)
-	.long	0x30EED236 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DAEB739 // c1 (low single)
-	.long	0x3C3D6D51 // c2
-	.long	0x3CBB485D // c3
-	.long	0x3AFFF5BA // c4
-	.long	0xBE490FDB // B' = pi/2 - B (high single)
-	.long	0x31BBBD2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3D866317 // c0 (high single)
-	.long	0x2FF02140 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DAFF67D // c1 (low single)
-	.long	0x3C591CD0 // c2
-	.long	0x3CBCBEAD // c3
-	.long	0x3B04BBEC // c4
-	.long	0xBE6231D6 // B' = pi/2 - B (high single)
-	.long	0x3019A6A2 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3D974A0D // c0 (high single)
-	.long	0x314F365B // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DB1616F // c1 (low single)
-	.long	0x3C750CD8 // c2
-	.long	0x3CBEB595 // c3
-	.long	0x3B22B883 // c4
-	.long	0xBE7B53D1 // B' = pi/2 - B (high single)
-	.long	0xB1955386 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3DA83A77 // c0 (high single)
-	.long	0xB16D967A // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DB2F87C // c1 (low single)
-	.long	0x3C88A31F // c2
-	.long	0x3CC0E763 // c3
-	.long	0x3B3F1666 // c4
-	.long	0xBE8A3AE6 // B' = pi/2 - B (high single)
-	.long	0xB21EEDF0 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3DB93569 // c0 (high single)
-	.long	0x2FB9ED00 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DB4BC1F // c1 (low single)
-	.long	0x3C96E905 // c2
-	.long	0x3CC2E6F5 // c3
-	.long	0x3B3E10A6 // c4
-	.long	0xBE96CBE4 // B' = pi/2 - B (high single)
-	.long	0x304CDE2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3DCA3BFF // c0 (high single)
-	.long	0xB11C95EA // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DB6ACDE // c1 (low single)
-	.long	0x3CA55C5B // c2
-	.long	0x3CC5BC04 // c3
-	.long	0x3B63A969 // c4
-	.long	0xBEA35CE2 // B' = pi/2 - B (high single)
-	.long	0x323889B6 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3DDB4F55 // c0 (high single)
-	.long	0xB0F6437E // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DB8CB52 // c1 (low single)
-	.long	0x3CB40210 // c2
-	.long	0x3CC82D45 // c3
-	.long	0x3B643075 // c4
-	.long	0xBEAFEDDF // B' = pi/2 - B (high single)
-	.long	0xB21BBA77 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3DEC708C // c0 (high single)
-	.long	0x314895C4 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DBB181E // c1 (low single)
-	.long	0x3CC2DEA6 // c2
-	.long	0x3CCB5027 // c3
-	.long	0x3B7F3969 // c4
-	.long	0xBEBC7EDD // B' = pi/2 - B (high single)
-	.long	0x30800ADD // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3DFDA0CB // c0 (high single)
-	.long	0xAF14FCA0 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DBD93F7 // c1 (low single)
-	.long	0x3CD1F71B // c2
-	.long	0x3CCEDD2B // c3
-	.long	0x3B905946 // c4
-	.long	0xBEC90FDB // B' = pi/2 - B (high single)
-	.long	0x323BBD2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E07709D // c0 (high single)
-	.long	0x318A2A83 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DC03FA2 // c1 (low single)
-	.long	0x3CE15096 // c2
-	.long	0x3CD26472 // c3
-	.long	0x3B9A1270 // c4
-	.long	0xBED5A0D8 // B' = pi/2 - B (high single)
-	.long	0xB21886FF // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E101985 // c0 (high single)
-	.long	0x302FB2B8 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DC31BF3 // c1 (low single)
-	.long	0x3CF0F04D // c2
-	.long	0x3CD60BC7 // c3
-	.long	0x3BA138BA // c4
-	.long	0xBEE231D6 // B' = pi/2 - B (high single)
-	.long	0x3099A6A2 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E18CBB7 // c0 (high single)
-	.long	0x2FE28430 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DC629CE // c1 (low single)
-	.long	0x3D006DCD // c2
-	.long	0x3CDA5A2C // c3
-	.long	0x3BB0B3D2 // c4
-	.long	0xBEEEC2D4 // B' = pi/2 - B (high single)
-	.long	0x323EF0A7 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E2187D0 // c0 (high single)
-	.long	0x31B7C7F7 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DC96A2B // c1 (low single)
-	.long	0x3D088C22 // c2
-	.long	0x3CDE950E // c3
-	.long	0x3BB89AD1 // c4
-	.long	0xBEFB53D1 // B' = pi/2 - B (high single)
-	.long	0xB2155386 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E2A4E71 // c0 (high single)
-	.long	0x319CFCEC // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DCCDE11 // c1 (low single)
-	.long	0x3D10D605 // c2
-	.long	0x3CE382A7 // c3
-	.long	0x3BC8BD97 // c4
-	.long	0xBF03F267 // B' = pi/2 - B (high single)
-	.long	0xB2F4CBD9 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E33203E // c0 (high single)
-	.long	0xB1FEF5BE // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DD0869C // c1 (low single)
-	.long	0x3D194E8C // c2
-	.long	0x3CE8DCA9 // c3
-	.long	0x3BDADA55 // c4
-	.long	0xBF0A3AE6 // B' = pi/2 - B (high single)
-	.long	0xB29EEDF0 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E3BFDDC // c0 (high single)
-	.long	0x3132521A // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DD464FC // c1 (low single)
-	.long	0x3D21F8F1 // c2
-	.long	0x3CEE3076 // c3
-	.long	0x3BE6D263 // c4
-	.long	0xBF108365 // B' = pi/2 - B (high single)
-	.long	0xB212200D // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E44E7F8 // c0 (high single)
-	.long	0x31CAA3CB // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DD87A74 // c1 (low single)
-	.long	0x3D2AD885 // c2
-	.long	0x3CF3C785 // c3
-	.long	0x3BF1E348 // c4
-	.long	0xBF16CBE4 // B' = pi/2 - B (high single)
-	.long	0x30CCDE2E // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E4DDF41 // c0 (high single)
-	.long	0x31AEA094 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DDCC85C // c1 (low single)
-	.long	0x3D33F0BE // c2
-	.long	0x3CFA23B0 // c3
-	.long	0x3C01FCF7 // c4
-	.long	0xBF1D1463 // B' = pi/2 - B (high single)
-	.long	0x32455799 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E56E46B // c0 (high single)
-	.long	0xB1E3F001 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DE15025 // c1 (low single)
-	.long	0x3D3D4550 // c2
-	.long	0x3D00462D // c3
-	.long	0x3C092C98 // c4
-	.long	0xBF235CE2 // B' = pi/2 - B (high single)
-	.long	0x32B889B6 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E5FF82C // c0 (high single)
-	.long	0x3170723A // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DE61354 // c1 (low single)
-	.long	0x3D46DA06 // c2
-	.long	0x3D0401F8 // c3
-	.long	0x3C14E013 // c4
-	.long	0xBF29A560 // B' = pi/2 - B (high single)
-	.long	0xB2F19861 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E691B44 // c0 (high single)
-	.long	0xB1F18936 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DEB138B // c1 (low single)
-	.long	0x3D50B2F7 // c2
-	.long	0x3D07BE3A // c3
-	.long	0x3C1E46A7 // c4
-	.long	0xBF2FEDDF // B' = pi/2 - B (high single)
-	.long	0xB29BBA77 // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E724E73 // c0 (high single)
-	.long	0xB120C3E2 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DF05283 // c1 (low single)
-	.long	0x3D5AD45E // c2
-	.long	0x3D0BAFBF // c3
-	.long	0x3C27B8BB // c4
-	.long	0xBF36365E // B' = pi/2 - B (high single)
-	.long	0xB20BB91C // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E7B9282 // c0 (high single)
-	.long	0x313383D2 // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DF5D211 // c1 (low single)
-	.long	0x3D6542B3 // c2
-	.long	0x3D0FE5E5 // c3
-	.long	0x3C31FB14 // c4
-	.long	0xBF3C7EDD // B' = pi/2 - B (high single)
-	.long	0x31000ADD // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E827420 // c0 (high single)
-	.long	0xB20B8B4D // c0 (low single)
-	.long	0x3E800000 // c1 (high 1 bit)
-	.long	0x3DFB9428 // c1 (low single)
-	.long	0x3D7002B4 // c2
-	.long	0x3D142A6C // c3
-	.long	0x3C3A47FF // c4
-	.long	0xBF42C75C // B' = pi/2 - B (high single)
-	.long	0x324BBE8A // B' = pi/2 - B (low single)
-	.long	0x3F800000 // tau (1 for cot path)
-	.long	0x3E87283F // c0 (high single)
-	.long	0x3268B966 // c0 (low single)
-	.long	0x3F000000 // c1 (high 1 bit)
-	.long	0xBDFE6529 // c1 (low single)
-	.long	0x3D7B1953 // c2
-	.long	0x3D18E109 // c3
-	.long	0x3C4570B0 // c4
-	.long	0xBF490FDB // B' = pi/2 - B (high single)
-	.long	0x32BBBD2E // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBF800000 // c0 (high single)
-	.long	0x2B410000 // c0 (low single)
-	.long	0x40000000 // c1 (high 1 bit)
-	.long	0xB3000000 // c1 (low single)
-	.long	0xC0000000 // c2
-	.long	0x402AB7C8 // c3
-	.long	0xC05561DB // c4
-	.long	0xBF4F5859 // B' = pi/2 - B (high single)
-	.long	0xB2EE64E8 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBF73BB75 // c0 (high single)
-	.long	0xB2FC908D // c0 (low single)
-	.long	0x40000000 // c1 (high 1 bit)
-	.long	0xBDBF94B0 // c1 (low single)
-	.long	0xBFE8550F // c2
-	.long	0x40174F67 // c3
-	.long	0xC036C608 // c4
-	.long	0xBF55A0D8 // B' = pi/2 - B (high single)
-	.long	0xB29886FF // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBF68065E // c0 (high single)
-	.long	0xB2670D1A // c0 (low single)
-	.long	0x40000000 // c1 (high 1 bit)
-	.long	0xBE36D1D6 // c1 (low single)
-	.long	0xBFD35007 // c2
-	.long	0x4006A861 // c3
-	.long	0xC01D4BDA // c4
-	.long	0xBF5BE957 // B' = pi/2 - B (high single)
-	.long	0xB205522A // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBF5CD3BE // c0 (high single)
-	.long	0xB1460308 // c0 (low single)
-	.long	0x40000000 // c1 (high 1 bit)
-	.long	0xBE8306C5 // c1 (low single)
-	.long	0xBFC09232 // c2
-	.long	0x3FF09632 // c3
-	.long	0xC007DB00 // c4
-	.long	0xBF6231D6 // B' = pi/2 - B (high single)
-	.long	0x3119A6A2 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBF521801 // c0 (high single)
-	.long	0xB2AE4178 // c0 (low single)
-	.long	0x40000000 // c1 (high 1 bit)
-	.long	0xBEA72938 // c1 (low single)
-	.long	0xBFAFCC22 // c2
-	.long	0x3FD7BD4A // c3
-	.long	0xBFEBB01B // c4
-	.long	0xBF687A55 // B' = pi/2 - B (high single)
-	.long	0x3252257B // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBF47C8CC // c0 (high single)
-	.long	0x3200F51A // c0 (low single)
-	.long	0x40000000 // c1 (high 1 bit)
-	.long	0xBEC82C6C // c1 (low single)
-	.long	0xBFA0BAE9 // c2
-	.long	0x3FC2252F // c3
-	.long	0xBFCD24C7 // c4
-	.long	0xBF6EC2D4 // B' = pi/2 - B (high single)
-	.long	0x32BEF0A7 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBF3DDCCF // c0 (high single)
-	.long	0xB2D29606 // c0 (low single)
-	.long	0x40000000 // c1 (high 1 bit)
-	.long	0xBEE6606F // c1 (low single)
-	.long	0xBF9325D6 // c2
-	.long	0x3FAF4E69 // c3
-	.long	0xBFB3080C // c4
-	.long	0xBF750B52 // B' = pi/2 - B (high single)
-	.long	0xB2EB316F // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBF344BA9 // c0 (high single)
-	.long	0x32B8B0EA // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3EFDF4F7 // c1 (low single)
-	.long	0xBF86DCA8 // c2
-	.long	0x3F9ED53B // c3
-	.long	0xBF9CBEDE // c4
-	.long	0xBF7B53D1 // B' = pi/2 - B (high single)
-	.long	0xB2955386 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBF2B0DC1 // c0 (high single)
-	.long	0xB2AB7EBA // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3EE496C2 // c1 (low single)
-	.long	0xBF776C40 // c2
-	.long	0x3F9065C1 // c3
-	.long	0xBF89AFB6 // c4
-	.long	0xBF80CE28 // B' = pi/2 - B (high single)
-	.long	0xB1FDD672 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBF221C37 // c0 (high single)
-	.long	0x320C61DC // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3ECD4F71 // c1 (low single)
-	.long	0xBF631DAA // c2
-	.long	0x3F83B471 // c3
-	.long	0xBF7281EA // c4
-	.long	0xBF83F267 // B' = pi/2 - B (high single)
-	.long	0xB374CBD9 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBF1970C4 // c0 (high single)
-	.long	0xB2904848 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3EB7EFF8 // c1 (low single)
-	.long	0xBF50907C // c2
-	.long	0x3F710FEA // c3
-	.long	0xBF561FED // c4
-	.long	0xBF8716A7 // B' = pi/2 - B (high single)
-	.long	0x32588C6D // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBF1105AF // c0 (high single)
-	.long	0xB2F045B0 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3EA44EE2 // c1 (low single)
-	.long	0xBF3F8FDB // c2
-	.long	0x3F5D3FD0 // c3
-	.long	0xBF3D0A23 // c4
-	.long	0xBF8A3AE6 // B' = pi/2 - B (high single)
-	.long	0xB31EEDF0 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBF08D5B9 // c0 (high single)
-	.long	0x325EF98E // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3E92478D // c1 (low single)
-	.long	0xBF2FEDC9 // c2
-	.long	0x3F4BCD58 // c3
-	.long	0xBF27AE9E // c4
-	.long	0xBF8D5F26 // B' = pi/2 - B (high single)
-	.long	0x330C0105 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBF00DC0D // c0 (high single)
-	.long	0x3214AF72 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3E81B994 // c1 (low single)
-	.long	0xBF218233 // c2
-	.long	0x3F3C4531 // c3
-	.long	0xBF149688 // c4
-	.long	0xBF908365 // B' = pi/2 - B (high single)
-	.long	0xB292200D // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBEF22870 // c0 (high single)
-	.long	0xB25271F4 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3E65107A // c1 (low single)
-	.long	0xBF1429F0 // c2
-	.long	0x3F2E8AFC // c3
-	.long	0xBF040498 // c4
-	.long	0xBF93A7A5 // B' = pi/2 - B (high single)
-	.long	0x3361DEEE // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBEE2F439 // c0 (high single)
-	.long	0x31F4399E // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3E49341C // c1 (low single)
-	.long	0xBF07C61A // c2
-	.long	0x3F22560F // c3
-	.long	0xBEEAA81E // c4
-	.long	0xBF96CBE4 // B' = pi/2 - B (high single)
-	.long	0x314CDE2E // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBED413CD // c0 (high single)
-	.long	0x31C06152 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3E2FB0CC // c1 (low single)
-	.long	0xBEF876CB // c2
-	.long	0x3F177807 // c3
-	.long	0xBED08437 // c4
-	.long	0xBF99F023 // B' = pi/2 - B (high single)
-	.long	0xB3484328 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBEC5800D // c0 (high single)
-	.long	0x3214C3C1 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3E185E54 // c1 (low single)
-	.long	0xBEE2E342 // c2
-	.long	0x3F0DCA73 // c3
-	.long	0xBEB8CC21 // c4
-	.long	0xBF9D1463 // B' = pi/2 - B (high single)
-	.long	0x32C55799 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBEB73250 // c0 (high single)
-	.long	0x32028823 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3E0318F8 // c1 (low single)
-	.long	0xBECEA678 // c2
-	.long	0x3F053C67 // c3
-	.long	0xBEA41E53 // c4
-	.long	0xBFA038A2 // B' = pi/2 - B (high single)
-	.long	0xB2E4CA7E // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBEA92457 // c0 (high single)
-	.long	0xB0B80830 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3DDF8200 // c1 (low single)
-	.long	0xBEBB99E9 // c2
-	.long	0x3EFB4AA8 // c3
-	.long	0xBE9182BE // c4
-	.long	0xBFA35CE2 // B' = pi/2 - B (high single)
-	.long	0x333889B6 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBE9B5042 // c0 (high single)
-	.long	0x322A3AEE // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3DBC7490 // c1 (low single)
-	.long	0xBEA99AF5 // c2
-	.long	0x3EEDE107 // c3
-	.long	0xBE80E9AA // c4
-	.long	0xBFA68121 // B' = pi/2 - B (high single)
-	.long	0xB1E43AAC // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBE8DB082 // c0 (high single)
-	.long	0x3132A234 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3D9CD7D0 // c1 (low single)
-	.long	0xBE988A60 // c2
-	.long	0x3EE203E3 // c3
-	.long	0xBE63582C // c4
-	.long	0xBFA9A560 // B' = pi/2 - B (high single)
-	.long	0xB3719861 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBE803FD4 // c0 (high single)
-	.long	0x32279E66 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3D807FC8 // c1 (low single)
-	.long	0xBE884BD4 // c2
-	.long	0x3ED7812D // c3
-	.long	0xBE4636EB // c4
-	.long	0xBFACC9A0 // B' = pi/2 - B (high single)
-	.long	0x32655A50 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBE65F267 // c0 (high single)
-	.long	0xB1B4B1DF // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3D4E8B90 // c1 (low single)
-	.long	0xBE718ACA // c2
-	.long	0x3ECE7164 // c3
-	.long	0xBE2DC161 // c4
-	.long	0xBFAFEDDF // B' = pi/2 - B (high single)
-	.long	0xB31BBA77 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBE4BAFAF // c0 (high single)
-	.long	0xAF2A29E0 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3D221018 // c1 (low single)
-	.long	0xBE53BED0 // c2
-	.long	0x3EC67E26 // c3
-	.long	0xBE1568E2 // c4
-	.long	0xBFB3121F // B' = pi/2 - B (high single)
-	.long	0x330F347D // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBE31AE4D // c0 (high single)
-	.long	0x31F32251 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3CF6A500 // c1 (low single)
-	.long	0xBE3707DA // c2
-	.long	0x3EBFA489 // c3
-	.long	0xBDFBD9C7 // c4
-	.long	0xBFB6365E // B' = pi/2 - B (high single)
-	.long	0xB28BB91C // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBE17E564 // c0 (high single)
-	.long	0x31C5A2E4 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3CB440D0 // c1 (low single)
-	.long	0xBE1B3D00 // c2
-	.long	0x3EB9F664 // c3
-	.long	0xBDD647C0 // c4
-	.long	0xBFB95A9E // B' = pi/2 - B (high single)
-	.long	0x33651267 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBDFC98C2 // c0 (high single)
-	.long	0x30AE525C // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3C793D20 // c1 (low single)
-	.long	0xBE003845 // c2
-	.long	0x3EB5271F // c3
-	.long	0xBDAC669E // c4
-	.long	0xBFBC7EDD // B' = pi/2 - B (high single)
-	.long	0x31800ADD // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBDC9B5DC // c0 (high single)
-	.long	0xB145AD86 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3C1EEF20 // c1 (low single)
-	.long	0xBDCBAAEA // c2
-	.long	0x3EB14E5E // c3
-	.long	0xBD858BB2 // c4
-	.long	0xBFBFA31C // B' = pi/2 - B (high single)
-	.long	0xB3450FB0 // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBD9711CE // c0 (high single)
-	.long	0xB14FEB28 // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3BB24C00 // c1 (low single)
-	.long	0xBD97E43A // c2
-	.long	0x3EAE6A89 // c3
-	.long	0xBD4D07E0 // c4
-	.long	0xBFC2C75C // B' = pi/2 - B (high single)
-	.long	0x32CBBE8A // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBD49393C // c0 (high single)
-	.long	0xB0A39F5B // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3B1E2B00 // c1 (low single)
-	.long	0xBD49B5D4 // c2
-	.long	0x3EAC4F10 // c3
-	.long	0xBCFD9425 // c4
-	.long	0xBFC5EB9B // B' = pi/2 - B (high single)
-	.long	0xB2DE638C // B' = pi/2 - B (low single)
-	.long	0x00000000 // tau (1 for cot path)
-	.long	0xBCC91A31 // c0 (high single)
-	.long	0xAF8E8D1A // c0 (low single)
-	.long	0x3F800000 // c1 (high 1 bit)
-	.long	0x3A1DFA00 // c1 (low single)
-	.long	0xBCC9392D // c2
-	.long	0x3EAB1889 // c3
-	.long	0xBC885D3B // c4
-	.align	32
-	.type	__svml_stan_data_internal, @object
-	.size	__svml_stan_data_internal, .-__svml_stan_data_internal
-	.align	32
+	blsrl	%ebx, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+
+	/* All results have been written to (%rsp).  */
+	vmovups	(%rsp), %ymm0
+	/* Restore rsp.  */
+	movq	%r13, %rsp
+	cfi_def_cfa_register (rsp)
+	/* Restore callee save registers.  */
+	popq	%rbp
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (rbp)
+	popq	%rbx
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (rbp)
+	popq	%r13
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (r13)
+	ret
+END(_ZGVdN8v_tanf_avx2)
 
-#ifdef __svml_stan_reduction_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(32)) VUINT32 _sPtable[256][3][1];
-} __svml_stan_reduction_data_internal;
-#endif
-__svml_stan_reduction_data_internal:
-	/*     P_hi                  P_med               P_lo                */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 0 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 1 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 2 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 3 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 4 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 5 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 6 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 7 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 8 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 9 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 10 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 11 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 12 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 13 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 14 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 15 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 16 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 17 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 18 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 19 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 20 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 21 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 22 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 23 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 24 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 25 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 26 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 27 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 28 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 29 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 30 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 31 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 32 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 33 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 34 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 35 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 36 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 37 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 38 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 39 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 40 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 41 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 42 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 43 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 44 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 45 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 46 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 47 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 48 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 49 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 50 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 51 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 52 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 53 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 54 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 55 */
-	.long	0x00000000, 0x00000000, 0x00000000 /* 56 */
-	.long	0x00000000, 0x00000000, 0x00000001 /* 57 */
-	.long	0x00000000, 0x00000000, 0x00000002 /* 58 */
-	.long	0x00000000, 0x00000000, 0x00000005 /* 59 */
-	.long	0x00000000, 0x00000000, 0x0000000A /* 60 */
-	.long	0x00000000, 0x00000000, 0x00000014 /* 61 */
-	.long	0x00000000, 0x00000000, 0x00000028 /* 62 */
-	.long	0x00000000, 0x00000000, 0x00000051 /* 63 */
-	.long	0x00000000, 0x00000000, 0x000000A2 /* 64 */
-	.long	0x00000000, 0x00000000, 0x00000145 /* 65 */
-	.long	0x00000000, 0x00000000, 0x0000028B /* 66 */
-	.long	0x00000000, 0x00000000, 0x00000517 /* 67 */
-	.long	0x00000000, 0x00000000, 0x00000A2F /* 68 */
-	.long	0x00000000, 0x00000000, 0x0000145F /* 69 */
-	.long	0x00000000, 0x00000000, 0x000028BE /* 70 */
-	.long	0x00000000, 0x00000000, 0x0000517C /* 71 */
-	.long	0x00000000, 0x00000000, 0x0000A2F9 /* 72 */
-	.long	0x00000000, 0x00000000, 0x000145F3 /* 73 */
-	.long	0x00000000, 0x00000000, 0x00028BE6 /* 74 */
-	.long	0x00000000, 0x00000000, 0x000517CC /* 75 */
-	.long	0x00000000, 0x00000000, 0x000A2F98 /* 76 */
-	.long	0x00000000, 0x00000000, 0x00145F30 /* 77 */
-	.long	0x00000000, 0x00000000, 0x0028BE60 /* 78 */
-	.long	0x00000000, 0x00000000, 0x00517CC1 /* 79 */
-	.long	0x00000000, 0x00000000, 0x00A2F983 /* 80 */
-	.long	0x00000000, 0x00000000, 0x0145F306 /* 81 */
-	.long	0x00000000, 0x00000000, 0x028BE60D /* 82 */
-	.long	0x00000000, 0x00000000, 0x0517CC1B /* 83 */
-	.long	0x00000000, 0x00000000, 0x0A2F9836 /* 84 */
-	.long	0x00000000, 0x00000000, 0x145F306D /* 85 */
-	.long	0x00000000, 0x00000000, 0x28BE60DB /* 86 */
-	.long	0x00000000, 0x00000000, 0x517CC1B7 /* 87 */
-	.long	0x00000000, 0x00000000, 0xA2F9836E /* 88 */
-	.long	0x00000000, 0x00000001, 0x45F306DC /* 89 */
-	.long	0x00000000, 0x00000002, 0x8BE60DB9 /* 90 */
-	.long	0x00000000, 0x00000005, 0x17CC1B72 /* 91 */
-	.long	0x00000000, 0x0000000A, 0x2F9836E4 /* 92 */
-	.long	0x00000000, 0x00000014, 0x5F306DC9 /* 93 */
-	.long	0x00000000, 0x00000028, 0xBE60DB93 /* 94 */
-	.long	0x00000000, 0x00000051, 0x7CC1B727 /* 95 */
-	.long	0x00000000, 0x000000A2, 0xF9836E4E /* 96 */
-	.long	0x00000000, 0x00000145, 0xF306DC9C /* 97 */
-	.long	0x00000000, 0x0000028B, 0xE60DB939 /* 98 */
-	.long	0x00000000, 0x00000517, 0xCC1B7272 /* 99 */
-	.long	0x00000000, 0x00000A2F, 0x9836E4E4 /* 100 */
-	.long	0x00000000, 0x0000145F, 0x306DC9C8 /* 101 */
-	.long	0x00000000, 0x000028BE, 0x60DB9391 /* 102 */
-	.long	0x00000000, 0x0000517C, 0xC1B72722 /* 103 */
-	.long	0x00000000, 0x0000A2F9, 0x836E4E44 /* 104 */
-	.long	0x00000000, 0x000145F3, 0x06DC9C88 /* 105 */
-	.long	0x00000000, 0x00028BE6, 0x0DB93910 /* 106 */
-	.long	0x00000000, 0x000517CC, 0x1B727220 /* 107 */
-	.long	0x00000000, 0x000A2F98, 0x36E4E441 /* 108 */
-	.long	0x00000000, 0x00145F30, 0x6DC9C882 /* 109 */
-	.long	0x00000000, 0x0028BE60, 0xDB939105 /* 110 */
-	.long	0x00000000, 0x00517CC1, 0xB727220A /* 111 */
-	.long	0x00000000, 0x00A2F983, 0x6E4E4415 /* 112 */
-	.long	0x00000000, 0x0145F306, 0xDC9C882A /* 113 */
-	.long	0x00000000, 0x028BE60D, 0xB9391054 /* 114 */
-	.long	0x00000000, 0x0517CC1B, 0x727220A9 /* 115 */
-	.long	0x00000000, 0x0A2F9836, 0xE4E44152 /* 116 */
-	.long	0x00000000, 0x145F306D, 0xC9C882A5 /* 117 */
-	.long	0x00000000, 0x28BE60DB, 0x9391054A /* 118 */
-	.long	0x00000000, 0x517CC1B7, 0x27220A94 /* 119 */
-	.long	0x00000000, 0xA2F9836E, 0x4E441529 /* 120 */
-	.long	0x00000001, 0x45F306DC, 0x9C882A53 /* 121 */
-	.long	0x00000002, 0x8BE60DB9, 0x391054A7 /* 122 */
-	.long	0x00000005, 0x17CC1B72, 0x7220A94F /* 123 */
-	.long	0x0000000A, 0x2F9836E4, 0xE441529F /* 124 */
-	.long	0x00000014, 0x5F306DC9, 0xC882A53F /* 125 */
-	.long	0x00000028, 0xBE60DB93, 0x91054A7F /* 126 */
-	.long	0x00000051, 0x7CC1B727, 0x220A94FE /* 127 */
-	.long	0x000000A2, 0xF9836E4E, 0x441529FC /* 128 */
-	.long	0x00000145, 0xF306DC9C, 0x882A53F8 /* 129 */
-	.long	0x0000028B, 0xE60DB939, 0x1054A7F0 /* 130 */
-	.long	0x00000517, 0xCC1B7272, 0x20A94FE1 /* 131 */
-	.long	0x00000A2F, 0x9836E4E4, 0x41529FC2 /* 132 */
-	.long	0x0000145F, 0x306DC9C8, 0x82A53F84 /* 133 */
-	.long	0x000028BE, 0x60DB9391, 0x054A7F09 /* 134 */
-	.long	0x0000517C, 0xC1B72722, 0x0A94FE13 /* 135 */
-	.long	0x0000A2F9, 0x836E4E44, 0x1529FC27 /* 136 */
-	.long	0x000145F3, 0x06DC9C88, 0x2A53F84E /* 137 */
-	.long	0x00028BE6, 0x0DB93910, 0x54A7F09D /* 138 */
-	.long	0x000517CC, 0x1B727220, 0xA94FE13A /* 139 */
-	.long	0x000A2F98, 0x36E4E441, 0x529FC275 /* 140 */
-	.long	0x00145F30, 0x6DC9C882, 0xA53F84EA /* 141 */
-	.long	0x0028BE60, 0xDB939105, 0x4A7F09D5 /* 142 */
-	.long	0x00517CC1, 0xB727220A, 0x94FE13AB /* 143 */
-	.long	0x00A2F983, 0x6E4E4415, 0x29FC2757 /* 144 */
-	.long	0x0145F306, 0xDC9C882A, 0x53F84EAF /* 145 */
-	.long	0x028BE60D, 0xB9391054, 0xA7F09D5F /* 146 */
-	.long	0x0517CC1B, 0x727220A9, 0x4FE13ABE /* 147 */
-	.long	0x0A2F9836, 0xE4E44152, 0x9FC2757D /* 148 */
-	.long	0x145F306D, 0xC9C882A5, 0x3F84EAFA /* 149 */
-	.long	0x28BE60DB, 0x9391054A, 0x7F09D5F4 /* 150 */
-	.long	0x517CC1B7, 0x27220A94, 0xFE13ABE8 /* 151 */
-	.long	0xA2F9836E, 0x4E441529, 0xFC2757D1 /* 152 */
-	.long	0x45F306DC, 0x9C882A53, 0xF84EAFA3 /* 153 */
-	.long	0x8BE60DB9, 0x391054A7, 0xF09D5F47 /* 154 */
-	.long	0x17CC1B72, 0x7220A94F, 0xE13ABE8F /* 155 */
-	.long	0x2F9836E4, 0xE441529F, 0xC2757D1F /* 156 */
-	.long	0x5F306DC9, 0xC882A53F, 0x84EAFA3E /* 157 */
-	.long	0xBE60DB93, 0x91054A7F, 0x09D5F47D /* 158 */
-	.long	0x7CC1B727, 0x220A94FE, 0x13ABE8FA /* 159 */
-	.long	0xF9836E4E, 0x441529FC, 0x2757D1F5 /* 160 */
-	.long	0xF306DC9C, 0x882A53F8, 0x4EAFA3EA /* 161 */
-	.long	0xE60DB939, 0x1054A7F0, 0x9D5F47D4 /* 162 */
-	.long	0xCC1B7272, 0x20A94FE1, 0x3ABE8FA9 /* 163 */
-	.long	0x9836E4E4, 0x41529FC2, 0x757D1F53 /* 164 */
-	.long	0x306DC9C8, 0x82A53F84, 0xEAFA3EA6 /* 165 */
-	.long	0x60DB9391, 0x054A7F09, 0xD5F47D4D /* 166 */
-	.long	0xC1B72722, 0x0A94FE13, 0xABE8FA9A /* 167 */
-	.long	0x836E4E44, 0x1529FC27, 0x57D1F534 /* 168 */
-	.long	0x06DC9C88, 0x2A53F84E, 0xAFA3EA69 /* 169 */
-	.long	0x0DB93910, 0x54A7F09D, 0x5F47D4D3 /* 170 */
-	.long	0x1B727220, 0xA94FE13A, 0xBE8FA9A6 /* 171 */
-	.long	0x36E4E441, 0x529FC275, 0x7D1F534D /* 172 */
-	.long	0x6DC9C882, 0xA53F84EA, 0xFA3EA69B /* 173 */
-	.long	0xDB939105, 0x4A7F09D5, 0xF47D4D37 /* 174 */
-	.long	0xB727220A, 0x94FE13AB, 0xE8FA9A6E /* 175 */
-	.long	0x6E4E4415, 0x29FC2757, 0xD1F534DD /* 176 */
-	.long	0xDC9C882A, 0x53F84EAF, 0xA3EA69BB /* 177 */
-	.long	0xB9391054, 0xA7F09D5F, 0x47D4D377 /* 178 */
-	.long	0x727220A9, 0x4FE13ABE, 0x8FA9A6EE /* 179 */
-	.long	0xE4E44152, 0x9FC2757D, 0x1F534DDC /* 180 */
-	.long	0xC9C882A5, 0x3F84EAFA, 0x3EA69BB8 /* 181 */
-	.long	0x9391054A, 0x7F09D5F4, 0x7D4D3770 /* 182 */
-	.long	0x27220A94, 0xFE13ABE8, 0xFA9A6EE0 /* 183 */
-	.long	0x4E441529, 0xFC2757D1, 0xF534DDC0 /* 184 */
-	.long	0x9C882A53, 0xF84EAFA3, 0xEA69BB81 /* 185 */
-	.long	0x391054A7, 0xF09D5F47, 0xD4D37703 /* 186 */
-	.long	0x7220A94F, 0xE13ABE8F, 0xA9A6EE06 /* 187 */
-	.long	0xE441529F, 0xC2757D1F, 0x534DDC0D /* 188 */
-	.long	0xC882A53F, 0x84EAFA3E, 0xA69BB81B /* 189 */
-	.long	0x91054A7F, 0x09D5F47D, 0x4D377036 /* 190 */
-	.long	0x220A94FE, 0x13ABE8FA, 0x9A6EE06D /* 191 */
-	.long	0x441529FC, 0x2757D1F5, 0x34DDC0DB /* 192 */
-	.long	0x882A53F8, 0x4EAFA3EA, 0x69BB81B6 /* 193 */
-	.long	0x1054A7F0, 0x9D5F47D4, 0xD377036D /* 194 */
-	.long	0x20A94FE1, 0x3ABE8FA9, 0xA6EE06DB /* 195 */
-	.long	0x41529FC2, 0x757D1F53, 0x4DDC0DB6 /* 196 */
-	.long	0x82A53F84, 0xEAFA3EA6, 0x9BB81B6C /* 197 */
-	.long	0x054A7F09, 0xD5F47D4D, 0x377036D8 /* 198 */
-	.long	0x0A94FE13, 0xABE8FA9A, 0x6EE06DB1 /* 199 */
-	.long	0x1529FC27, 0x57D1F534, 0xDDC0DB62 /* 200 */
-	.long	0x2A53F84E, 0xAFA3EA69, 0xBB81B6C5 /* 201 */
-	.long	0x54A7F09D, 0x5F47D4D3, 0x77036D8A /* 202 */
-	.long	0xA94FE13A, 0xBE8FA9A6, 0xEE06DB14 /* 203 */
-	.long	0x529FC275, 0x7D1F534D, 0xDC0DB629 /* 204 */
-	.long	0xA53F84EA, 0xFA3EA69B, 0xB81B6C52 /* 205 */
-	.long	0x4A7F09D5, 0xF47D4D37, 0x7036D8A5 /* 206 */
-	.long	0x94FE13AB, 0xE8FA9A6E, 0xE06DB14A /* 207 */
-	.long	0x29FC2757, 0xD1F534DD, 0xC0DB6295 /* 208 */
-	.long	0x53F84EAF, 0xA3EA69BB, 0x81B6C52B /* 209 */
-	.long	0xA7F09D5F, 0x47D4D377, 0x036D8A56 /* 210 */
-	.long	0x4FE13ABE, 0x8FA9A6EE, 0x06DB14AC /* 211 */
-	.long	0x9FC2757D, 0x1F534DDC, 0x0DB62959 /* 212 */
-	.long	0x3F84EAFA, 0x3EA69BB8, 0x1B6C52B3 /* 213 */
-	.long	0x7F09D5F4, 0x7D4D3770, 0x36D8A566 /* 214 */
-	.long	0xFE13ABE8, 0xFA9A6EE0, 0x6DB14ACC /* 215 */
-	.long	0xFC2757D1, 0xF534DDC0, 0xDB629599 /* 216 */
-	.long	0xF84EAFA3, 0xEA69BB81, 0xB6C52B32 /* 217 */
-	.long	0xF09D5F47, 0xD4D37703, 0x6D8A5664 /* 218 */
-	.long	0xE13ABE8F, 0xA9A6EE06, 0xDB14ACC9 /* 219 */
-	.long	0xC2757D1F, 0x534DDC0D, 0xB6295993 /* 220 */
-	.long	0x84EAFA3E, 0xA69BB81B, 0x6C52B327 /* 221 */
-	.long	0x09D5F47D, 0x4D377036, 0xD8A5664F /* 222 */
-	.long	0x13ABE8FA, 0x9A6EE06D, 0xB14ACC9E /* 223 */
-	.long	0x2757D1F5, 0x34DDC0DB, 0x6295993C /* 224 */
-	.long	0x4EAFA3EA, 0x69BB81B6, 0xC52B3278 /* 225 */
-	.long	0x9D5F47D4, 0xD377036D, 0x8A5664F1 /* 226 */
-	.long	0x3ABE8FA9, 0xA6EE06DB, 0x14ACC9E2 /* 227 */
-	.long	0x757D1F53, 0x4DDC0DB6, 0x295993C4 /* 228 */
-	.long	0xEAFA3EA6, 0x9BB81B6C, 0x52B32788 /* 229 */
-	.long	0xD5F47D4D, 0x377036D8, 0xA5664F10 /* 230 */
-	.long	0xABE8FA9A, 0x6EE06DB1, 0x4ACC9E21 /* 231 */
-	.long	0x57D1F534, 0xDDC0DB62, 0x95993C43 /* 232 */
-	.long	0xAFA3EA69, 0xBB81B6C5, 0x2B327887 /* 233 */
-	.long	0x5F47D4D3, 0x77036D8A, 0x5664F10E /* 234 */
-	.long	0xBE8FA9A6, 0xEE06DB14, 0xACC9E21C /* 235 */
-	.long	0x7D1F534D, 0xDC0DB629, 0x5993C439 /* 236 */
-	.long	0xFA3EA69B, 0xB81B6C52, 0xB3278872 /* 237 */
-	.long	0xF47D4D37, 0x7036D8A5, 0x664F10E4 /* 238 */
-	.long	0xE8FA9A6E, 0xE06DB14A, 0xCC9E21C8 /* 239 */
-	.long	0xD1F534DD, 0xC0DB6295, 0x993C4390 /* 240 */
-	.long	0xA3EA69BB, 0x81B6C52B, 0x32788720 /* 241 */
-	.long	0x47D4D377, 0x036D8A56, 0x64F10E41 /* 242 */
-	.long	0x8FA9A6EE, 0x06DB14AC, 0xC9E21C82 /* 243 */
-	.long	0x1F534DDC, 0x0DB62959, 0x93C43904 /* 244 */
-	.long	0x3EA69BB8, 0x1B6C52B3, 0x27887208 /* 245 */
-	.long	0x7D4D3770, 0x36D8A566, 0x4F10E410 /* 246 */
-	.long	0xFA9A6EE0, 0x6DB14ACC, 0x9E21C820 /* 247 */
-	.long	0xF534DDC0, 0xDB629599, 0x3C439041 /* 248 */
-	.long	0xEA69BB81, 0xB6C52B32, 0x78872083 /* 249 */
-	.long	0xD4D37703, 0x6D8A5664, 0xF10E4107 /* 250 */
-	.long	0xA9A6EE06, 0xDB14ACC9, 0xE21C820F /* 251 */
-	.long	0x534DDC0D, 0xB6295993, 0xC439041F /* 252 */
-	.long	0xA69BB81B, 0x6C52B327, 0x8872083F /* 253 */
-	.long	0x4D377036, 0xD8A5664F, 0x10E4107F /* 254 */
-	.long	0x9A6EE06D, 0xB14ACC9E, 0x21C820FF /* 255 */
+	.section .rodata.avx2, "a"
 	.align	32
-	.type	__svml_stan_reduction_data_internal, @object
-	.size	__svml_stan_reduction_data_internal, .-__svml_stan_reduction_data_internal
+LOCAL_DATA_NAME:
+
+	DATA_VEC (LOCAL_DATA_NAME, _sPI2_FMA, 0xB33BBD2E)	// AVX2
+	DATA_VEC (LOCAL_DATA_NAME, _sPI3_FMA, 0xA6F72CED)	// AVX2
+	DATA_VEC (LOCAL_DATA_NAME, _FLT_0, 0x7f800000)	// AVX512, AVX2
+	DATA_VEC (LOCAL_DATA_NAME, _FLT_1, 0x00000080)	// AVX2
+	DATA_VEC (LOCAL_DATA_NAME, _FLT_2, 0x35800000)	// AVX512, AVX2
+	DATA_VEC (LOCAL_DATA_NAME, _FLT_3, 0xb43bbd2e)	// AVX512, AVX2
+
+	.type	LOCAL_DATA_NAME, @object
+	.size	LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v1 18/27] x86/fpu: Optimize svml_s_log10f16_core_avx512.S
  2022-12-07  8:52 [PATCH v1 01/27] x86/fpu: Create helper file for common data macros Noah Goldstein via Libc-alpha
                   ` (15 preceding siblings ...)
  2022-12-07  8:52 ` [PATCH v1 17/27] x86/fpu: Optimize svml_s_tanf8_core_avx2.S Noah Goldstein via Libc-alpha
@ 2022-12-07  8:52 ` Noah Goldstein via Libc-alpha
  2022-12-07  8:52 ` [PATCH v1 19/27] x86/fpu: Optimize svml_s_log10f4_core_sse4.S Noah Goldstein via Libc-alpha
                   ` (9 subsequent siblings)
  26 siblings, 0 replies; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-07  8:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, andrey.kolesov, carlos

1. Improve special values case which ends up covering ~half of all
   float bit patterns.
2. Cleanup some missed optimizations in instruction selection /
   unnecessary repeated rodata references.
3. Remove unused rodata.
4. Use common data definitions where possible.

Code Size Change: -52 Bytes (226 - 278)

Input                                 New Time / Old Time
0F          (0x00000000)           -> 0.9484
0F          (0x0000ffff, Denorm)   -> 0.9668
.1F         (0x3dcccccd)           -> 0.9934
5F          (0x40a00000)           -> 0.9859
2315255808F (0x4f0a0000)           -> 0.9926
-NaN        (0xffffffff)           -> 0.9808
---
 .../multiarch/svml_s_log10f16_core_avx512.S   | 296 ++++++++----------
 1 file changed, 125 insertions(+), 171 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log10f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log10f16_core_avx512.S
index bea2124519..5b68fbea61 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log10f16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log10f16_core_avx512.S
@@ -27,211 +27,165 @@
  *
  */
 
-/* Offsets for data table __svml_slog10_data_internal_avx512
- */
-#define One				0
-#define coeff4				64
-#define coeff3				128
-#define coeff2				192
-#define coeff1				256
-#define L2				320
+#define LOCAL_DATA_NAME	__svml_slog10_data_internal
+#include "svml_s_common_evex512_rodata_offsets.h"
+
+/* Offsets for data table __svml_slog10_data_internal.  */
+#define _Coeff_4	0
+#define _Coeff_3	64
+#define _Coeff_2	128
+#define _Coeff_1	192
+#define _L2	256
 
 #include <sysdep.h>
 
 	.section .text.evex512, "ax", @progbits
 ENTRY(_ZGVeN16v_log10f_skx)
-	pushq	%rbp
-	cfi_def_cfa_offset(16)
-	movq	%rsp, %rbp
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-	andq	$-64, %rsp
-	subq	$192, %rsp
 	vgetmantps $11, {sae}, %zmm0, %zmm3
-	vmovups	__svml_slog10_data_internal_avx512(%rip), %zmm1
+	vmovups	COMMON_DATA(_OneF)(%rip), %zmm1
 	vgetexpps {sae}, %zmm0, %zmm5
-	vmovups	L2+__svml_slog10_data_internal_avx512(%rip), %zmm10
 	vpsrld	$19, %zmm3, %zmm7
 	vgetexpps {sae}, %zmm3, %zmm6
 	vsubps	{rn-sae}, %zmm1, %zmm3, %zmm11
-	vpermps	coeff4+__svml_slog10_data_internal_avx512(%rip), %zmm7, %zmm1
-	vpermps	coeff3+__svml_slog10_data_internal_avx512(%rip), %zmm7, %zmm2
+	vpermps	LOCAL_DATA(_Coeff_4)(%rip), %zmm7, %zmm1
+	vpermps	LOCAL_DATA(_Coeff_3)(%rip), %zmm7, %zmm2
 	vsubps	{rn-sae}, %zmm6, %zmm5, %zmm9
-	vpermps	coeff2+__svml_slog10_data_internal_avx512(%rip), %zmm7, %zmm4
-	vpermps	coeff1+__svml_slog10_data_internal_avx512(%rip), %zmm7, %zmm8
+	vpermps	LOCAL_DATA(_Coeff_2)(%rip), %zmm7, %zmm4
+	vpermps	LOCAL_DATA(_Coeff_1)(%rip), %zmm7, %zmm8
 
-	/* x<=0? */
-	vfpclassps $94, %zmm0, %k0
+	/* x<=0?  */
+	vfpclassps $0x5e, %zmm0, %k0
 	vfmadd213ps {rn-sae}, %zmm2, %zmm11, %zmm1
-	vmulps	{rn-sae}, %zmm10, %zmm9, %zmm12
+	vmulps	LOCAL_DATA(_L2)(%rip), %zmm9, %zmm12
 	vfmadd213ps {rn-sae}, %zmm4, %zmm11, %zmm1
-	kmovw	%k0, %edx
 	vfmadd213ps {rn-sae}, %zmm8, %zmm11, %zmm1
 	vfmadd213ps {rn-sae}, %zmm12, %zmm11, %zmm1
+	kmovd	%k0, %edx
 	testl	%edx, %edx
 
-	/* Go to special inputs processing branch */
+	/* Go to special inputs processing branch.  */
 	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm1
-
-	/* Restore registers
-	 * and exit the function
-	 */
 
-L(EXIT):
+	/* Restore registers and exit the function.  */
 	vmovaps	%zmm1, %zmm0
-	movq	%rbp, %rsp
-	popq	%rbp
-	cfi_def_cfa(7, 8)
-	cfi_restore(6)
 	ret
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
 
-	/* Branch to process
-	 * special inputs
-	 */
 
+	/* Cold case. edx has 1s where there was a special value that
+	   needs to be handled by a log10f call. Optimize for code size
+	   more so than speed here.  */
 L(SPECIAL_VALUES_BRANCH):
-	vmovups	%zmm0, 64(%rsp)
-	vmovups	%zmm1, 128(%rsp)
-	# LOE rbx r12 r13 r14 r15 edx zmm1
 
-	xorl	%eax, %eax
-	# LOE rbx r12 r13 r14 r15 eax edx
+	/* Use r13 to save/restore the stack. This allows us to use rbp
+	   as callee save register saving code size.  */
+	pushq	%r13
+	cfi_def_cfa (rsp, 16)
+	/* Need to callee save registers to preserve state across tanf
+	   calls.  */
+	pushq	%rbx
+	cfi_def_cfa (rsp, 24)
+	pushq	%rbp
+	cfi_def_cfa (rsp, 32)
+	movq	%rsp, %r13
+	cfi_def_cfa (r13, 32)
+
+	/* Align stack and make room for 2x zmm vectors.  */
+	andq	$-64, %rsp
+	addq	$-128, %rsp
+
+	/* Save original input.  */
+	vmovaps	%zmm0, 64(%rsp)
+	/* Save all already computed inputs.  */
+	vmovaps	%zmm1, (%rsp)
 
 	vzeroupper
-	movq	%r12, 16(%rsp)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
-	movl	%eax, %r12d
-	movq	%r13, 8(%rsp)
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-	movl	%edx, %r13d
-	movq	%r14, (%rsp)
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r15 r12d r13d
-
-	/* Range mask
-	 * bits check
-	 */
-
-L(RANGEMASK_CHECK):
-	btl	%r12d, %r13d
-
-	/* Call scalar math function */
-	jc	L(SCALAR_MATH_CALL)
-	# LOE rbx r15 r12d r13d
-
-	/* Special inputs
-	 * processing loop
-	 */
 
+	/* edx has 1s where there was a special value that needs to be
+	   handled by a log10f call.  */
+	movl	%edx, %ebx
 L(SPECIAL_VALUES_LOOP):
-	incl	%r12d
-	cmpl	$16, %r12d
-
-	/* Check bits in range mask */
-	jl	L(RANGEMASK_CHECK)
-	# LOE rbx r15 r12d r13d
-
-	movq	16(%rsp), %r12
-	cfi_restore(12)
-	movq	8(%rsp), %r13
-	cfi_restore(13)
-	movq	(%rsp), %r14
-	cfi_restore(14)
-	vmovups	128(%rsp), %zmm1
-
-	/* Go to exit */
-	jmp	L(EXIT)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r12 r13 r14 r15 zmm1
-
-	/* Scalar math fucntion call
-	 * to process special input
-	 */
-
-L(SCALAR_MATH_CALL):
-	movl	%r12d, %r14d
-	vmovss	64(%rsp, %r14, 4), %xmm0
+
+	/* Use rbp as index for special value that is saved across calls
+	   to log10f. We technically don't need a callee save register
+	   here as offset to rsp is always [0, 56] so we can restore
+	   rsp by realigning to 64. Essentially the tradeoff is 1 extra
+	   save/restore vs 2 extra instructions in the loop. Realigning
+	   also costs more code size.  */
+	xorl	%ebp, %ebp
+	tzcntl	%ebx, %ebp
+
+	/* Scalar math function call to process special input.  */
+	movss	64(%rsp, %rbp, 4), %xmm0
 	call	log10f@PLT
-	# LOE rbx r14 r15 r12d r13d xmm0
 
-	vmovss	%xmm0, 128(%rsp, %r14, 4)
+	/* No good way to avoid the store-forwarding fault this will
+	   cause on return. `lfence` avoids the SF fault but at greater
+	   cost as it serializes stack/callee save restoration.  */
+	movss	%xmm0, (%rsp, %rbp, 4)
 
-	/* Process special inputs in loop */
-	jmp	L(SPECIAL_VALUES_LOOP)
-	# LOE rbx r15 r12d r13d
+	blsrl	%ebx, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+
+
+	/* All results have been written to (%rsp).  */
+	vmovaps	(%rsp), %zmm0
+	/* Restore rsp.  */
+	movq	%r13, %rsp
+	cfi_def_cfa (rsp, 32)
+	/* Restore callee save registers.  */
+	popq	%rbp
+	cfi_def_cfa (rsp, 24)
+	popq	%rbx
+	cfi_def_cfa (rsp, 16)
+	popq	%r13
+	ret
 END(_ZGVeN16v_log10f_skx)
 
-	.section .rodata, "a"
+	.section .rodata.evex512, "a"
 	.align	64
 
-#ifdef __svml_slog10_data_internal_avx512_typedef
-typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(64)) VUINT32 One[16][1];
-	__declspec(align(64)) VUINT32 coeff4[16][1];
-	__declspec(align(64)) VUINT32 coeff3[16][1];
-	__declspec(align(64)) VUINT32 coeff2[16][1];
-	__declspec(align(64)) VUINT32 coeff1[16][1];
-	__declspec(align(64)) VUINT32 L2[16][1];
-} __svml_slog10_data_internal_avx512;
-#endif
-__svml_slog10_data_internal_avx512:
-	/* One */
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	// c4
-	.align	64
-	.long	0xbdc9ae9b, 0xbda6fcf4
-	.long	0xbd8bac76, 0xbd6bca30
-	.long	0xbd48a99b, 0xbd2c0a9f
-	.long	0xbd1480db, 0xbd00faf2
-	.long	0xbe823aa9, 0xbe656348
-	.long	0xbe4afbb9, 0xbe346895
-	.long	0xbe20ffff, 0xbe103a0b
-	.long	0xbe01a91c, 0xbde9e84e
-	// c3
-	.align	64
-	.long	0x3e13d888, 0x3e10a87c
-	.long	0x3e0b95c3, 0x3e057f0b
-	.long	0x3dfde038, 0x3df080d9
-	.long	0x3de34c1e, 0x3dd68333
-	.long	0x3dac6e8e, 0x3dd54a51
-	.long	0x3df30f40, 0x3e04235d
-	.long	0x3e0b7033, 0x3e102c90
-	.long	0x3e12ebad, 0x3e141ff8
-	// c2
-	.align	64
-	.long	0xbe5e5a9b, 0xbe5e2677
-	.long	0xbe5d83f5, 0xbe5c6016
-	.long	0xbe5abd0b, 0xbe58a6fd
-	.long	0xbe562e02, 0xbe5362f8
-	.long	0xbe68e27c, 0xbe646747
-	.long	0xbe619a73, 0xbe5ff05a
-	.long	0xbe5f0570, 0xbe5e92d0
-	.long	0xbe5e662b, 0xbe5e5c08
-	// c1
-	.align	64
-	.long	0x3ede5bd8, 0x3ede5b45
-	.long	0x3ede57d8, 0x3ede4eb1
-	.long	0x3ede3d37, 0x3ede2166
-	.long	0x3eddf9d9, 0x3eddc5bb
-	.long	0x3ede08ed, 0x3ede32e7
-	.long	0x3ede4967, 0x3ede5490
-	.long	0x3ede597f, 0x3ede5b50
-	.long	0x3ede5bca, 0x3ede5bd9
-	/* L2 */
-	.align	64
-	.long	0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b
-	.align	64
-	.type	__svml_slog10_data_internal_avx512, @object
-	.size	__svml_slog10_data_internal_avx512, .-__svml_slog10_data_internal_avx512
+LOCAL_DATA_NAME:
+	float_block (LOCAL_DATA_NAME, _Coeff_4,
+		0xbdc9ae9b, 0xbda6fcf4,
+		0xbd8bac76, 0xbd6bca30,
+		0xbd48a99b, 0xbd2c0a9f,
+		0xbd1480db, 0xbd00faf2,
+		0xbe823aa9, 0xbe656348,
+		0xbe4afbb9, 0xbe346895,
+		0xbe20ffff, 0xbe103a0b,
+		0xbe01a91c, 0xbde9e84e)
+
+	float_block (LOCAL_DATA_NAME, _Coeff_3,
+		0x3e13d888, 0x3e10a87c,
+		0x3e0b95c3, 0x3e057f0b,
+		0x3dfde038, 0x3df080d9,
+		0x3de34c1e, 0x3dd68333,
+		0x3dac6e8e, 0x3dd54a51,
+		0x3df30f40, 0x3e04235d,
+		0x3e0b7033, 0x3e102c90,
+		0x3e12ebad, 0x3e141ff8)
+
+	float_block (LOCAL_DATA_NAME, _Coeff_2,
+		0xbe5e5a9b, 0xbe5e2677,
+		0xbe5d83f5, 0xbe5c6016,
+		0xbe5abd0b, 0xbe58a6fd,
+		0xbe562e02, 0xbe5362f8,
+		0xbe68e27c, 0xbe646747,
+		0xbe619a73, 0xbe5ff05a,
+		0xbe5f0570, 0xbe5e92d0,
+		0xbe5e662b, 0xbe5e5c08)
+
+	float_block (LOCAL_DATA_NAME, _Coeff_1,
+		0x3ede5bd8, 0x3ede5b45,
+		0x3ede57d8, 0x3ede4eb1,
+		0x3ede3d37, 0x3ede2166,
+		0x3eddf9d9, 0x3eddc5bb,
+		0x3ede08ed, 0x3ede32e7,
+		0x3ede4967, 0x3ede5490,
+		0x3ede597f, 0x3ede5b50,
+		0x3ede5bca, 0x3ede5bd9)
+
+	DATA_VEC (LOCAL_DATA_NAME, _L2, 0x3e9a209b)
+
+	.type	LOCAL_DATA_NAME, @object
+	.size	LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v1 19/27] x86/fpu: Optimize svml_s_log10f4_core_sse4.S
  2022-12-07  8:52 [PATCH v1 01/27] x86/fpu: Create helper file for common data macros Noah Goldstein via Libc-alpha
                   ` (16 preceding siblings ...)
  2022-12-07  8:52 ` [PATCH v1 18/27] x86/fpu: Optimize svml_s_log10f16_core_avx512.S Noah Goldstein via Libc-alpha
@ 2022-12-07  8:52 ` Noah Goldstein via Libc-alpha
  2022-12-07  8:52 ` [PATCH v1 20/27] x86/fpu: Optimize svml_s_log10f8_core_avx2.S Noah Goldstein via Libc-alpha
                   ` (8 subsequent siblings)
  26 siblings, 0 replies; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-07  8:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, andrey.kolesov, carlos

1. Improve special values case which ends up covering ~half of all
   float bit patterns.
2. Cleanup some missed optimizations in instruction selection /
   unnecessary repeated rodata references.
3. Remove unused rodata.
4. Use common data definitions where possible.

Code Size Change: -61 Bytes (279 - 340)

Input                                 New Time / Old Time
0F          (0x00000000)           -> 0.9395
0F          (0x0000ffff, Denorm)   -> 0.9729
.1F         (0x3dcccccd)           -> 0.9458
5F          (0x40a00000)           -> 0.9499
2315255808F (0x4f0a0000)           -> 0.9437
-NaN        (0xffffffff)           -> 0.8284
---
 .../fpu/multiarch/svml_s_log10f4_core_sse4.S  | 319 +++++++-----------
 1 file changed, 123 insertions(+), 196 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log10f4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log10f4_core_sse4.S
index 58f54d62a3..faa0e79a24 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log10f4_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log10f4_core_sse4.S
@@ -27,216 +27,143 @@
  *
  */
 
-/* Offsets for data table __svml_slog10_data_internal
- */
-#define MinNorm				0
-#define MaxNorm				16
-#define L2H				32
-#define L2L				48
-#define iBrkValue			64
-#define iOffExpoMask			80
-#define One				96
-#define sPoly				112
-#define L2				256
+#define LOCAL_DATA_NAME	__svml_slog10_data_internal
+#include "svml_s_common_sse4_rodata_offsets.h"
+
+/* Offsets for data table __svml_slog10_data_internal.  */
+#define _L2L	0
+#define _Coeff_9	16
+#define _Coeff_8	32
+#define _Coeff_7	48
+#define _Coeff_6	64
+#define _Coeff_5	80
+#define _Coeff_4	96
+#define _Coeff_3	112
+#define _Coeff_2	128
+#define _Coeff_1	144
+#define _L2H	160
 
 #include <sysdep.h>
 
 	.section .text.sse4, "ax", @progbits
 ENTRY(_ZGVbN4v_log10f_sse4)
-	subq	$72, %rsp
-	cfi_def_cfa_offset(80)
-	movaps	%xmm0, %xmm1
-
-	/* reduction: compute r, n */
-	movdqu	iBrkValue+__svml_slog10_data_internal(%rip), %xmm2
-	movaps	%xmm0, %xmm4
-	movdqu	iOffExpoMask+__svml_slog10_data_internal(%rip), %xmm10
-	psubd	%xmm2, %xmm1
-	pand	%xmm1, %xmm10
-	psrad	$23, %xmm1
-	paddd	%xmm2, %xmm10
+	movdqu	COMMON_DATA(_NotiOffExpoMask)(%rip), %xmm2
 	movaps	%xmm0, %xmm3
-	movups	sPoly+__svml_slog10_data_internal(%rip), %xmm5
-	movups	sPoly+32+__svml_slog10_data_internal(%rip), %xmm6
-	movups	sPoly+64+__svml_slog10_data_internal(%rip), %xmm7
-	movups	sPoly+96+__svml_slog10_data_internal(%rip), %xmm9
-	cvtdq2ps %xmm1, %xmm12
-	cmpltps	MinNorm+__svml_slog10_data_internal(%rip), %xmm4
-	cmpnleps MaxNorm+__svml_slog10_data_internal(%rip), %xmm3
-	subps	One+__svml_slog10_data_internal(%rip), %xmm10
-	mulps	%xmm10, %xmm5
-	movaps	%xmm10, %xmm8
-	mulps	%xmm10, %xmm6
-	mulps	%xmm10, %xmm8
-	addps	sPoly+16+__svml_slog10_data_internal(%rip), %xmm5
-	mulps	%xmm10, %xmm7
-	addps	sPoly+48+__svml_slog10_data_internal(%rip), %xmm6
-	mulps	%xmm10, %xmm9
-	mulps	%xmm8, %xmm5
-	addps	sPoly+80+__svml_slog10_data_internal(%rip), %xmm7
-	addps	sPoly+112+__svml_slog10_data_internal(%rip), %xmm9
-	addps	%xmm5, %xmm6
-	mulps	%xmm8, %xmm6
-	orps	%xmm3, %xmm4
-
-	/* combine and get argument value range mask */
-	movmskps %xmm4, %edx
-	movups	L2L+__svml_slog10_data_internal(%rip), %xmm1
-	addps	%xmm6, %xmm7
-	mulps	%xmm12, %xmm1
-	mulps	%xmm7, %xmm8
-	movups	L2H+__svml_slog10_data_internal(%rip), %xmm11
-	addps	%xmm8, %xmm9
-	mulps	%xmm11, %xmm12
-	mulps	%xmm10, %xmm9
-	addps	sPoly+128+__svml_slog10_data_internal(%rip), %xmm9
-	mulps	%xmm9, %xmm10
-	addps	%xmm10, %xmm1
-	addps	%xmm12, %xmm1
-	testl	%edx, %edx
-
-	/* Go to special inputs processing branch */
+	psubd	%xmm2, %xmm0
+	movaps	COMMON_DATA(_ILoRange)(%rip), %xmm4
+	pcmpgtd	%xmm0, %xmm4
+	/* combine and get argument value range mask.  */
+	movmskps %xmm4, %eax
+	movups	LOCAL_DATA(_L2L)(%rip), %xmm0
+	/* reduction: compute r, n.  */
+	movdqu	COMMON_DATA(_IBrkValue)(%rip), %xmm4
+	movaps	%xmm3, %xmm6
+	psubd	%xmm4, %xmm3
+	pandn	%xmm3, %xmm2
+	paddd	%xmm4, %xmm2
+	subps	COMMON_DATA(_OneF)(%rip), %xmm2
+	psrad	$0x17, %xmm3
+	cvtdq2ps %xmm3, %xmm4
+	mulps	%xmm4, %xmm0
+	movaps	%xmm2, %xmm3
+	mulps	%xmm2, %xmm2
+	movups	LOCAL_DATA(_Coeff_9)(%rip), %xmm1
+	mulps	%xmm3, %xmm1
+	addps	LOCAL_DATA(_Coeff_8)(%rip), %xmm1
+	mulps	%xmm2, %xmm1
+	movups	LOCAL_DATA(_Coeff_7)(%rip), %xmm5
+	mulps	%xmm3, %xmm5
+	addps	LOCAL_DATA(_Coeff_6)(%rip), %xmm5
+	addps	%xmm1, %xmm5
+	mulps	%xmm2, %xmm5
+	movups	LOCAL_DATA(_Coeff_5)(%rip), %xmm1
+	mulps	%xmm3, %xmm1
+	addps	LOCAL_DATA(_Coeff_4)(%rip), %xmm1
+	addps	%xmm5, %xmm1
+	mulps	%xmm1, %xmm2
+	movups	LOCAL_DATA(_Coeff_3)(%rip), %xmm1
+	mulps	%xmm3, %xmm1
+	addps	LOCAL_DATA(_Coeff_2)(%rip), %xmm1
+	addps	%xmm2, %xmm1
+	mulps	%xmm3, %xmm1
+	addps	LOCAL_DATA(_Coeff_1)(%rip), %xmm1
+	mulps	%xmm1, %xmm3
+	addps	%xmm3, %xmm0
+	movups	LOCAL_DATA(_L2H)(%rip), %xmm2
+	mulps	%xmm4, %xmm2
+	addps	%xmm2, %xmm0
+	testl	%eax, %eax
+	/* Go to special inputs processing branch.  */
 	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm1
-
-	/* Restore registers
-	 * and exit the function
-	 */
-
-L(EXIT):
-	movaps	%xmm1, %xmm0
-	addq	$72, %rsp
-	cfi_def_cfa_offset(8)
 	ret
-	cfi_def_cfa_offset(80)
-
-	/* Branch to process
-	 * special inputs
-	 */
 
+	/* Cold case. eax has 1s where there was a special value that
+	   needs to be handled by a log10f call. Optimize for code size
+	   more so than speed here.  */
 L(SPECIAL_VALUES_BRANCH):
-	movups	%xmm0, 32(%rsp)
-	movups	%xmm1, 48(%rsp)
-	# LOE rbx rbp r12 r13 r14 r15 edx
-
-	xorl	%eax, %eax
-	movq	%r12, 16(%rsp)
-	cfi_offset(12, -64)
-	movl	%eax, %r12d
-	movq	%r13, 8(%rsp)
-	cfi_offset(13, -72)
-	movl	%edx, %r13d
-	movq	%r14, (%rsp)
-	cfi_offset(14, -80)
-	# LOE rbx rbp r15 r12d r13d
-
-	/* Range mask
-	 * bits check
-	 */
-
-L(RANGEMASK_CHECK):
-	btl	%r12d, %r13d
-
-	/* Call scalar math function */
-	jc	L(SCALAR_MATH_CALL)
-	# LOE rbx rbp r15 r12d r13d
-
-	/* Special inputs
-	 * processing loop
-	 */
-
+	/* Stack coming in 16-byte aligned. Set 8-byte misaligned so on
+	   call entry will be 16-byte aligned.  */
+
+	subq	$0x38, %rsp
+	movups	%xmm0, 24(%rsp)
+	movups	%xmm6, 40(%rsp)
+
+	/* Use rbx/rbp for callee save registers as they get short
+	   encoding for many instructions (as compared with r12/r13).  */
+	movq	%rbx, (%rsp)
+	cfi_offset (rbx, -64)
+	movq	%rbp, 8(%rsp)
+	cfi_offset (rbp, -56)
+	/* eax has 1s where there was a special value that needs to be
+	   handled by a log10f call.  */
+	movl	%eax, %ebx
 L(SPECIAL_VALUES_LOOP):
-	incl	%r12d
-	cmpl	$4, %r12d
-
-	/* Check bits in range mask */
-	jl	L(RANGEMASK_CHECK)
-	# LOE rbx rbp r15 r12d r13d
-
-	movq	16(%rsp), %r12
-	cfi_restore(12)
-	movq	8(%rsp), %r13
-	cfi_restore(13)
-	movq	(%rsp), %r14
-	cfi_restore(14)
-	movups	48(%rsp), %xmm1
-
-	/* Go to exit */
-	jmp	L(EXIT)
-	cfi_offset(12, -64)
-	cfi_offset(13, -72)
-	cfi_offset(14, -80)
-	# LOE rbx rbp r12 r13 r14 r15 xmm1
-
-	/* Scalar math fucntion call
-	 * to process special input
-	 */
-
-L(SCALAR_MATH_CALL):
-	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
-	call	log10f@PLT
-	# LOE rbx rbp r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 48(%rsp, %r14, 4)
+	/* Use rbp as index for special value that is saved across calls
+	   to log10f. We technically don't need a callee save register
+	   here as offset to rsp is always [0, 12] so we can restore
+	   rsp by realigning to 64. Essentially the tradeoff is 1 extra
+	   save/restore vs 2 extra instructions in the loop.  */
+	xorl	%ebp, %ebp
+	bsfl	%ebx, %ebp
+
+	/* Scalar math function call to process special input.  */
+	movss	40(%rsp, %rbp, 4), %xmm0
+	call	log10f@PLT
 
-	/* Process special inputs in loop */
-	jmp	L(SPECIAL_VALUES_LOOP)
-	# LOE rbx rbp r15 r12d r13d
+	/* No good way to avoid the store-forwarding fault this will
+	   cause on return. `lfence` avoids the SF fault but at greater
+	   cost as it serializes stack/callee save restoration.  */
+	movss	%xmm0, 24(%rsp, %rbp, 4)
+
+	leal	-1(%rbx), %eax
+	andl	%eax, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+
+	/* All results have been written to 24(%rsp).  */
+	movups	24(%rsp), %xmm0
+	movq	(%rsp), %rbx
+	cfi_restore (rbx)
+	movq	8(%rsp), %rbp
+	cfi_restore (rbp)
+	addq	$56, %rsp
+	cfi_def_cfa_offset (8)
+	ret
 END(_ZGVbN4v_log10f_sse4)
 
-	.section .rodata, "a"
+	.section .rodata.sse4, "a"
 	.align	16
 
-#ifdef __svml_slog10_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(16)) VUINT32 MinNorm[4][1];
-	__declspec(align(16)) VUINT32 MaxNorm[4][1];
-	__declspec(align(16)) VUINT32 L2H[4][1];
-	__declspec(align(16)) VUINT32 L2L[4][1];
-	__declspec(align(16)) VUINT32 iBrkValue[4][1];
-	__declspec(align(16)) VUINT32 iOffExpoMask[4][1];
-	__declspec(align(16)) VUINT32 One[4][1];
-	__declspec(align(16)) VUINT32 sPoly[9][4][1];
-	__declspec(align(16)) VUINT32 L2[4][1];
-} __svml_slog10_data_internal;
-#endif
-__svml_slog10_data_internal:
-	/* MinNorm */
-	.long	0x00800000, 0x00800000, 0x00800000, 0x00800000
-	/* MaxNorm */
-	.align	16
-	.long	0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff
-	/* L2H */
-	.align	16
-	.long	0x3e9a2100, 0x3e9a2100, 0x3e9a2100, 0x3e9a2100
-	/* L2L */
-	.align	16
-	.long	0xb64AF600, 0xb64AF600, 0xb64AF600, 0xb64AF600
-	/* iBrkValue = SP 2/3 */
-	.align	16
-	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
-	/* iOffExpoMask = SP significand mask */
-	.align	16
-	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
-	/* sOne = SP 1.0 */
-	.align	16
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	/* spoly[9] */
-	.align	16
-	.long	0x3d8063B4, 0x3d8063B4, 0x3d8063B4, 0x3d8063B4 /* coeff9 */
-	.long	0xbd890073, 0xbd890073, 0xbd890073, 0xbd890073 /* coeff8 */
-	.long	0x3d775317, 0x3d775317, 0x3d775317, 0x3d775317 /* coeff7 */
-	.long	0xbd91FB27, 0xbd91FB27, 0xbd91FB27, 0xbd91FB27 /* coeff6 */
-	.long	0x3dB20B96, 0x3dB20B96, 0x3dB20B96, 0x3dB20B96 /* coeff5 */
-	.long	0xbdDE6E20, 0xbdDE6E20, 0xbdDE6E20, 0xbdDE6E20 /* coeff4 */
-	.long	0x3e143CE5, 0x3e143CE5, 0x3e143CE5, 0x3e143CE5 /* coeff3 */
-	.long	0xbe5E5BC5, 0xbe5E5BC5, 0xbe5E5BC5, 0xbe5E5BC5 /* coeff2 */
-	.long	0x3eDE5BD9, 0x3eDE5BD9, 0x3eDE5BD9, 0x3eDE5BD9 /* coeff1 */
-	/* L2 */
-	.align	16
-	.long	0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b
-	.align	16
-	.type	__svml_slog10_data_internal, @object
-	.size	__svml_slog10_data_internal, .-__svml_slog10_data_internal
+LOCAL_DATA_NAME:
+	DATA_VEC (LOCAL_DATA_NAME, _L2L, 0xb64af600)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_9, 0x3d8063b4)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_8, 0xbd890073)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_7, 0x3d775317)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_6, 0xbd91fb27)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_5, 0x3db20b96)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_4, 0xbdde6e20)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_3, 0x3e143ce5)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_2, 0xbe5e5bc5)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_1, 0x3ede5bd9)
+	DATA_VEC (LOCAL_DATA_NAME, _L2H, 0x3e9a2100)
+	.type	LOCAL_DATA_NAME, @object
+	.size	LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v1 20/27] x86/fpu: Optimize svml_s_log10f8_core_avx2.S
  2022-12-07  8:52 [PATCH v1 01/27] x86/fpu: Create helper file for common data macros Noah Goldstein via Libc-alpha
                   ` (17 preceding siblings ...)
  2022-12-07  8:52 ` [PATCH v1 19/27] x86/fpu: Optimize svml_s_log10f4_core_sse4.S Noah Goldstein via Libc-alpha
@ 2022-12-07  8:52 ` Noah Goldstein via Libc-alpha
  2022-12-07  8:52 ` [PATCH v1 21/27] x86/fpu: Optimize svml_s_log2f16_core_avx512.S Noah Goldstein via Libc-alpha
                   ` (7 subsequent siblings)
  26 siblings, 0 replies; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-07  8:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, andrey.kolesov, carlos

1. Improve special values case which ends up covering ~half of all
   float bit patterns.
2. Cleanup some missed optimizations in instruction selection /
   unnecessary repeated rodata references.
3. Remove unused rodata.
4. Use common data definitions where possible.

Code Size Change: -43 Bytes (268 - 311)

Input                                 New Time / Old Time
0F          (0x00000000)           -> 0.9483
0F          (0x0000ffff, Denorm)   -> 0.9718
.1F         (0x3dcccccd)           -> 0.9253
5F          (0x40a00000)           -> 0.9218
2315255808F (0x4f0a0000)           -> 0.9199
-NaN        (0xffffffff)           -> 0.8258
---
 .../fpu/multiarch/svml_s_log10f8_core_avx2.S  | 316 ++++++++----------
 1 file changed, 132 insertions(+), 184 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log10f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log10f8_core_avx2.S
index 4bdc62e90e..2e9db34f23 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log10f8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log10f8_core_avx2.S
@@ -27,216 +27,164 @@
  *
  */
 
-/* Offsets for data table __svml_slog10_data_internal
- */
-#define MinNorm				0
-#define MaxNorm				32
-#define L2H				64
-#define L2L				96
-#define iBrkValue			128
-#define iOffExpoMask			160
-#define One				192
-#define sPoly				224
-#define L2				512
+#define LOCAL_DATA_NAME	__svml_slog10_data_internal
+#include "svml_s_common_avx2_rodata_offsets.h"
+
+/* Offsets for data table __svml_slog10_data_internal.  */
+#define _Coeff_9	0
+#define _Coeff_8	32
+#define _Coeff_7	64
+#define _Coeff_6	96
+#define _Coeff_5	128
+#define _Coeff_4	160
+#define _Coeff_3	192
+#define _Coeff_2	224
+#define _Coeff_1	256
+#define _L2L	288
+#define _L2H	320
 
 #include <sysdep.h>
 
 	.section .text.avx2, "ax", @progbits
 ENTRY(_ZGVdN8v_log10f_avx2)
-	pushq	%rbp
-	cfi_def_cfa_offset(16)
-	movq	%rsp, %rbp
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-	andq	$-32, %rsp
-	subq	$96, %rsp
-
-	/* reduction: compute r, n */
-	vmovups	iBrkValue+__svml_slog10_data_internal(%rip), %ymm4
-	vmovups	sPoly+__svml_slog10_data_internal(%rip), %ymm15
-	vmovups	sPoly+64+__svml_slog10_data_internal(%rip), %ymm9
-	vmovups	sPoly+128+__svml_slog10_data_internal(%rip), %ymm10
-	vmovups	sPoly+192+__svml_slog10_data_internal(%rip), %ymm12
+	/* reduction: compute r, n.  */
+	vmovups	COMMON_DATA(_IBrkValue)(%rip), %ymm4
 	vpsubd	%ymm4, %ymm0, %ymm1
-	vcmplt_oqps MinNorm+__svml_slog10_data_internal(%rip), %ymm0, %ymm5
-	vcmpnle_uqps MaxNorm+__svml_slog10_data_internal(%rip), %ymm0, %ymm6
-	vpand	iOffExpoMask+__svml_slog10_data_internal(%rip), %ymm1, %ymm3
+	vmovups	COMMON_DATA(_NotiOffExpoMask)(%rip), %ymm7
+	vpandn	%ymm1, %ymm7, %ymm3
 	vpsrad	$23, %ymm1, %ymm2
-	vpaddd	%ymm4, %ymm3, %ymm8
+
+	vpsubd	%ymm7, %ymm0, %ymm5
+	vmovups	COMMON_DATA(_ILoRange)(%rip), %ymm7
+	vpcmpgtd %ymm5, %ymm7, %ymm7
+
+
+	vpaddd	%ymm4, %ymm3, %ymm5
 	vcvtdq2ps %ymm2, %ymm1
-	vsubps	One+__svml_slog10_data_internal(%rip), %ymm8, %ymm13
-	vmulps	L2L+__svml_slog10_data_internal(%rip), %ymm1, %ymm14
-	vfmadd213ps sPoly+32+__svml_slog10_data_internal(%rip), %ymm13, %ymm15
-	vfmadd213ps sPoly+96+__svml_slog10_data_internal(%rip), %ymm13, %ymm9
-	vmulps	%ymm13, %ymm13, %ymm11
-	vfmadd213ps sPoly+160+__svml_slog10_data_internal(%rip), %ymm13, %ymm10
-	vfmadd213ps sPoly+224+__svml_slog10_data_internal(%rip), %ymm13, %ymm12
-	vfmadd213ps %ymm9, %ymm11, %ymm15
-	vfmadd213ps %ymm10, %ymm11, %ymm15
-	vfmadd213ps %ymm12, %ymm11, %ymm15
-	vfmadd213ps sPoly+256+__svml_slog10_data_internal(%rip), %ymm13, %ymm15
-	vfmadd213ps %ymm14, %ymm13, %ymm15
-	vorps	%ymm6, %ymm5, %ymm7
-
-	/* combine and get argument value range mask */
+	vsubps	COMMON_DATA(_OneF)(%rip), %ymm5, %ymm5
+
+	vmovups	LOCAL_DATA(_Coeff_9)(%rip), %ymm2
+	vfmadd213ps LOCAL_DATA(_Coeff_8)(%rip), %ymm5, %ymm2
+	vmovups	LOCAL_DATA(_Coeff_7)(%rip), %ymm3
+	vfmadd213ps LOCAL_DATA(_Coeff_6)(%rip), %ymm5, %ymm3
+	vmulps	%ymm5, %ymm5, %ymm4
 	vmovmskps %ymm7, %edx
-	vfmadd132ps L2H+__svml_slog10_data_internal(%rip), %ymm15, %ymm1
-	testl	%edx, %edx
+	vmovups	LOCAL_DATA(_Coeff_5)(%rip), %ymm7
+	vfmadd213ps LOCAL_DATA(_Coeff_4)(%rip), %ymm5, %ymm7
+	vmovups	LOCAL_DATA(_Coeff_3)(%rip), %ymm6
+	vfmadd213ps LOCAL_DATA(_Coeff_2)(%rip), %ymm5, %ymm6
+	vfmadd213ps %ymm3, %ymm4, %ymm2
+	vfmadd213ps %ymm7, %ymm4, %ymm2
+	vfmadd213ps %ymm6, %ymm4, %ymm2
+	vfmadd213ps LOCAL_DATA(_Coeff_1)(%rip), %ymm5, %ymm2
+	vmulps	LOCAL_DATA(_L2L)(%rip), %ymm1, %ymm7
+	vfmadd213ps %ymm7, %ymm5, %ymm2
+
 
-	/* Go to special inputs processing branch */
+
+	vfmadd132ps LOCAL_DATA(_L2H)(%rip), %ymm2, %ymm1
+	testl	%edx, %edx
+	/* Go to special inputs processing branch.  */
 	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx r12 r13 r14 r15 edx ymm0 ymm1
 
-	/* Restore registers
-	 * and exit the function
-	 */
 
-L(EXIT):
+	/* Restore registers * and exit the function.  */
 	vmovaps	%ymm1, %ymm0
-	movq	%rbp, %rsp
-	popq	%rbp
-	cfi_def_cfa(7, 8)
-	cfi_restore(6)
 	ret
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-
-	/* Branch to process
-	 * special inputs
-	 */
 
+	/* Cold case. edx has 1s where there was a special value that
+	   needs to be handled by a atanhf call. Optimize for code size
+	   more so than speed here.  */
 L(SPECIAL_VALUES_BRANCH):
-	vmovups	%ymm0, 32(%rsp)
-	vmovups	%ymm1, 64(%rsp)
-	# LOE rbx r12 r13 r14 r15 edx ymm1
+	/* Use r13 to save/restore the stack. This allows us to use rbp
+	   as callee save register saving code size.  */
+	pushq	%r13
+	cfi_adjust_cfa_offset (8)
+	cfi_offset (r13, -16)
+	/* Need to callee save registers to preserve state across tanhf
+	   calls.  */
+	pushq	%rbx
+	cfi_adjust_cfa_offset (8)
+	cfi_offset (rbx, -24)
+	pushq	%rbp
+	cfi_adjust_cfa_offset (8)
+	cfi_offset (rbp, -32)
+	movq	%rsp, %r13
+	cfi_def_cfa_register (r13)
 
-	xorl	%eax, %eax
-	# LOE rbx r12 r13 r14 r15 eax edx
+	/* Align stack and make room for 2x ymm vectors.  */
+	andq	$-32, %rsp
+	addq	$-64, %rsp
+
+	/* Save all already computed inputs.  */
+	vmovups	%ymm1, (%rsp)
+	/* Save original input (ymm0 unchanged up to this point).  */
+	vmovups	%ymm0, 32(%rsp)
 
 	vzeroupper
-	movq	%r12, 16(%rsp)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
-	movl	%eax, %r12d
-	movq	%r13, 8(%rsp)
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
-	movl	%edx, %r13d
-	movq	%r14, (%rsp)
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r15 r12d r13d
-
-	/* Range mask
-	 * bits check
-	 */
-
-L(RANGEMASK_CHECK):
-	btl	%r12d, %r13d
-
-	/* Call scalar math function */
-	jc	L(SCALAR_MATH_CALL)
-	# LOE rbx r15 r12d r13d
-
-	/* Special inputs
-	 * processing loop
-	 */
 
+	/* edx has 1s where there was a special value that needs to be
+	   handled by a atanhf call.  */
+	movl	%edx, %ebx
 L(SPECIAL_VALUES_LOOP):
-	incl	%r12d
-	cmpl	$8, %r12d
-
-	/* Check bits in range mask */
-	jl	L(RANGEMASK_CHECK)
-	# LOE rbx r15 r12d r13d
-
-	movq	16(%rsp), %r12
-	cfi_restore(12)
-	movq	8(%rsp), %r13
-	cfi_restore(13)
-	movq	(%rsp), %r14
-	cfi_restore(14)
-	vmovups	64(%rsp), %ymm1
-
-	/* Go to exit */
-	jmp	L(EXIT)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r12 r13 r14 r15 ymm1
-
-	/* Scalar math fucntion call
-	 * to process special input
-	 */
-
-L(SCALAR_MATH_CALL):
-	movl	%r12d, %r14d
-	vmovss	32(%rsp, %r14, 4), %xmm0
+
+	/* use rbp as index for special value that is saved across calls
+	   to atanhf. We technically don't need a callee save register
+	   here as offset to rsp is always [0, 28] so we can restore
+	   rsp by realigning to 64. Essentially the tradeoff is 1 extra
+	   save/restore vs 2 extra instructions in the loop. Realigning
+	   also costs more code size.  */
+	xorl	%ebp, %ebp
+	tzcntl	%ebx, %ebp
+
+	/* Scalar math fucntion call to process special input.  */
+	vmovss	32(%rsp, %rbp, 4), %xmm0
 	call	log10f@PLT
-	# LOE rbx r14 r15 r12d r13d xmm0
 
-	vmovss	%xmm0, 64(%rsp, %r14, 4)
+	/* No good way to avoid the store-forwarding fault this will
+	   cause on return. `lfence` avoids the SF fault but at greater
+	   cost as it serialized stack/callee save restoration.  */
+	vmovss	%xmm0, (%rsp, %rbp, 4)
+
+	blsrl	%ebx, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+
 
-	/* Process special inputs in loop */
-	jmp	L(SPECIAL_VALUES_LOOP)
-	# LOE rbx r15 r12d r13d
+
+	/* All results have been written to (%rsp).  */
+	vmovups	(%rsp), %ymm0
+	/* Restore rsp.  */
+	movq	%r13, %rsp
+	cfi_def_cfa_register (rsp)
+	/* Restore callee save registers.  */
+	popq	%rbp
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (rbp)
+	popq	%rbx
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (rbp)
+	popq	%r13
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (r13)
+	ret
 END(_ZGVdN8v_log10f_avx2)
 
-	.section .rodata, "a"
+	.section .rodata.avx2, "a"
 	.align	32
 
-#ifdef __svml_slog10_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(32)) VUINT32 MinNorm[8][1];
-	__declspec(align(32)) VUINT32 MaxNorm[8][1];
-	__declspec(align(32)) VUINT32 L2H[8][1];
-	__declspec(align(32)) VUINT32 L2L[8][1];
-	__declspec(align(32)) VUINT32 iBrkValue[8][1];
-	__declspec(align(32)) VUINT32 iOffExpoMask[8][1];
-	__declspec(align(32)) VUINT32 One[8][1];
-	__declspec(align(32)) VUINT32 sPoly[9][8][1];
-	__declspec(align(32)) VUINT32 L2[8][1];
-} __svml_slog10_data_internal;
-#endif
-__svml_slog10_data_internal:
-	/* MinNorm */
-	.long	0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000
-	/* MaxNorm */
-	.align	32
-	.long	0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff
-	/* L2H */
-	.align	32
-	.long	0x3e9a2100, 0x3e9a2100, 0x3e9a2100, 0x3e9a2100, 0x3e9a2100, 0x3e9a2100, 0x3e9a2100, 0x3e9a2100
-	/* L2L */
-	.align	32
-	.long	0xb64AF600, 0xb64AF600, 0xb64AF600, 0xb64AF600, 0xb64AF600, 0xb64AF600, 0xb64AF600, 0xb64AF600
-	/* iBrkValue = SP 2/3 */
-	.align	32
-	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
-	/* iOffExpoMask = SP significand mask */
-	.align	32
-	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
-	/* sOne = SP 1.0 */
-	.align	32
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	/* spoly[9] */
-	.align	32
-	.long	0x3d8063B4, 0x3d8063B4, 0x3d8063B4, 0x3d8063B4, 0x3d8063B4, 0x3d8063B4, 0x3d8063B4, 0x3d8063B4 /* coeff9 */
-	.long	0xbd890073, 0xbd890073, 0xbd890073, 0xbd890073, 0xbd890073, 0xbd890073, 0xbd890073, 0xbd890073 /* coeff8 */
-	.long	0x3d775317, 0x3d775317, 0x3d775317, 0x3d775317, 0x3d775317, 0x3d775317, 0x3d775317, 0x3d775317 /* coeff7 */
-	.long	0xbd91FB27, 0xbd91FB27, 0xbd91FB27, 0xbd91FB27, 0xbd91FB27, 0xbd91FB27, 0xbd91FB27, 0xbd91FB27 /* coeff6 */
-	.long	0x3dB20B96, 0x3dB20B96, 0x3dB20B96, 0x3dB20B96, 0x3dB20B96, 0x3dB20B96, 0x3dB20B96, 0x3dB20B96 /* coeff5 */
-	.long	0xbdDE6E20, 0xbdDE6E20, 0xbdDE6E20, 0xbdDE6E20, 0xbdDE6E20, 0xbdDE6E20, 0xbdDE6E20, 0xbdDE6E20 /* coeff4 */
-	.long	0x3e143CE5, 0x3e143CE5, 0x3e143CE5, 0x3e143CE5, 0x3e143CE5, 0x3e143CE5, 0x3e143CE5, 0x3e143CE5 /* coeff3 */
-	.long	0xbe5E5BC5, 0xbe5E5BC5, 0xbe5E5BC5, 0xbe5E5BC5, 0xbe5E5BC5, 0xbe5E5BC5, 0xbe5E5BC5, 0xbe5E5BC5 /* coeff2 */
-	.long	0x3eDE5BD9, 0x3eDE5BD9, 0x3eDE5BD9, 0x3eDE5BD9, 0x3eDE5BD9, 0x3eDE5BD9, 0x3eDE5BD9, 0x3eDE5BD9 /* coeff1 */
-	/* L2 */
-	.align	32
-	.long	0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b
-	.align	32
-	.type	__svml_slog10_data_internal, @object
-	.size	__svml_slog10_data_internal, .-__svml_slog10_data_internal
+LOCAL_DATA_NAME:
+
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_9, 0x3d8063b4)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_8, 0xbd890073)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_7, 0x3d775317)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_6, 0xbd91fb27)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_5, 0x3db20b96)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_4, 0xbdde6e20)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_3, 0x3e143ce5)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_2, 0xbe5e5bc5)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_1, 0x3ede5bd9)
+	DATA_VEC (LOCAL_DATA_NAME, _L2L, 0xb64af600)
+	DATA_VEC (LOCAL_DATA_NAME, _L2H, 0x3e9a2100)
+
+	.type	LOCAL_DATA_NAME, @object
+	.size	LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v1 21/27] x86/fpu: Optimize svml_s_log2f16_core_avx512.S
  2022-12-07  8:52 [PATCH v1 01/27] x86/fpu: Create helper file for common data macros Noah Goldstein via Libc-alpha
                   ` (18 preceding siblings ...)
  2022-12-07  8:52 ` [PATCH v1 20/27] x86/fpu: Optimize svml_s_log10f8_core_avx2.S Noah Goldstein via Libc-alpha
@ 2022-12-07  8:52 ` Noah Goldstein via Libc-alpha
  2022-12-07  8:52 ` [PATCH v1 22/27] x86/fpu: Optimize svml_s_log2f4_core_sse4.S Noah Goldstein via Libc-alpha
                   ` (6 subsequent siblings)
  26 siblings, 0 replies; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-07  8:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, andrey.kolesov, carlos

1. Improve special values case which ends up covering ~half of all
   float bit patterns.
2. Cleanup some missed optimizations in instruction selection /
   unnecessary repeated rodata references.
3. Remove unused rodata.
4. Use common data definitions where possible.

Code Size Change: -46 Bytes (216 - 262)

Input                                 New Time / Old Time
0F          (0x00000000)           -> 0.8641
0F          (0x0000ffff, Denorm)   -> 0.9888
.1F         (0x3dcccccd)           -> 0.9938
5F          (0x40a00000)           -> 0.9892
2315255808F (0x4f0a0000)           -> 0.9953
-NaN        (0xffffffff)           -> 0.9769
---
 .../multiarch/svml_s_log2f16_core_avx512.S    | 288 ++++++++----------
 1 file changed, 124 insertions(+), 164 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log2f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log2f16_core_avx512.S
index 3ada6ed349..61bfde1526 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log2f16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log2f16_core_avx512.S
@@ -27,39 +27,35 @@
  *
  */
 
-/* Offsets for data table __svml_slog2_data_internal_avx512
- */
-#define One				0
-#define coeff4				64
-#define coeff3				128
-#define coeff2				192
-#define coeff1				256
+#define LOCAL_DATA_NAME	__svml_slog2_data_internal
+#include "svml_s_common_evex512_rodata_offsets.h"
+
+/* Offsets for data table __svml_slog2_data_internal.  */
+#define _Coeff_4	0
+#define _Coeff_3	64
+#define _Coeff_2	128
+#define _Coeff_1	192
+
 
 #include <sysdep.h>
 
 	.section .text.evex512, "ax", @progbits
 ENTRY(_ZGVeN16v_log2f_skx)
-	pushq	%rbp
-	cfi_def_cfa_offset(16)
-	movq	%rsp, %rbp
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-	andq	$-64, %rsp
-	subq	$192, %rsp
 	vgetmantps $11, {sae}, %zmm0, %zmm3
-	vmovups	__svml_slog2_data_internal_avx512(%rip), %zmm1
+	vmovups	COMMON_DATA(_OneF)(%rip), %zmm1
 	vgetexpps {sae}, %zmm0, %zmm5
 
-	/* x<=0? */
-	vfpclassps $94, %zmm0, %k0
+
 	vsubps	{rn-sae}, %zmm1, %zmm3, %zmm9
 	vpsrld	$19, %zmm3, %zmm7
 	vgetexpps {sae}, %zmm3, %zmm6
-	vpermps	coeff4+__svml_slog2_data_internal_avx512(%rip), %zmm7, %zmm1
-	vpermps	coeff3+__svml_slog2_data_internal_avx512(%rip), %zmm7, %zmm2
-	vpermps	coeff2+__svml_slog2_data_internal_avx512(%rip), %zmm7, %zmm4
-	vpermps	coeff1+__svml_slog2_data_internal_avx512(%rip), %zmm7, %zmm8
+	vpermps	LOCAL_DATA(_Coeff_4)(%rip), %zmm7, %zmm1
+	vpermps	LOCAL_DATA(_Coeff_3)(%rip), %zmm7, %zmm2
+	vpermps	LOCAL_DATA(_Coeff_2)(%rip), %zmm7, %zmm4
+	vpermps	LOCAL_DATA(_Coeff_1)(%rip), %zmm7, %zmm8
 	vsubps	{rn-sae}, %zmm6, %zmm5, %zmm10
+	/* x<=0?  */
+	vfpclassps $0x5e, %zmm0, %k0
 	vfmadd213ps {rn-sae}, %zmm2, %zmm9, %zmm1
 	kmovw	%k0, %edx
 	vfmadd213ps {rn-sae}, %zmm4, %zmm9, %zmm1
@@ -67,164 +63,128 @@ ENTRY(_ZGVeN16v_log2f_skx)
 	vfmadd213ps {rn-sae}, %zmm10, %zmm9, %zmm1
 	testl	%edx, %edx
 
-	/* Go to special inputs processing branch */
+	/* Go to special inputs processing branch.  */
 	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm1
-
-	/* Restore registers
-	 * and exit the function
-	 */
-
-L(EXIT):
 	vmovaps	%zmm1, %zmm0
-	movq	%rbp, %rsp
-	popq	%rbp
-	cfi_def_cfa(7, 8)
-	cfi_restore(6)
 	ret
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
 
-	/* Branch to process
-	 * special inputs
-	 */
 
+	/* Restore registers * and exit the function.  */
+
+	/* Branch to process * special inputs.  */
+	/* Cold case. edx has 1s where there was a special value that
+	   needs to be handled by a tanf call. Optimize for code size
+	   moreso than speed here.  */
 L(SPECIAL_VALUES_BRANCH):
-	vmovups	%zmm0, 64(%rsp)
-	vmovups	%zmm1, 128(%rsp)
-	# LOE rbx r12 r13 r14 r15 edx zmm1
 
-	xorl	%eax, %eax
-	# LOE rbx r12 r13 r14 r15 eax edx
+	/* Use r13 to save/restore the stack. This allows us to use rbp
+	   as callee save register saving code size.  */
+	pushq	%r13
+	cfi_def_cfa (rsp, 16)
+	/* Need to callee save registers to preserve state across tanf
+	   calls.  */
+	pushq	%rbx
+	cfi_def_cfa (rsp, 24)
+	pushq	%rbp
+	cfi_def_cfa (rsp, 32)
+	movq	%rsp, %r13
+	cfi_def_cfa (r13, 32)
+
+	/* Align stack and make room for 2x zmm vectors.  */
+	andq	$-64, %rsp
+	addq	$-128, %rsp
+
+	/* Save origional input.  */
+	vmovaps	%zmm0, 64(%rsp)
+	/* Save all already computed inputs.  */
+	vmovaps	%zmm1, (%rsp)
 
 	vzeroupper
-	movq	%r12, 16(%rsp)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
-	movl	%eax, %r12d
-	movq	%r13, 8(%rsp)
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-	movl	%edx, %r13d
-	movq	%r14, (%rsp)
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r15 r12d r13d
-
-	/* Range mask
-	 * bits check
-	 */
-
-L(RANGEMASK_CHECK):
-	btl	%r12d, %r13d
-
-	/* Call scalar math function */
-	jc	L(SCALAR_MATH_CALL)
-	# LOE rbx r15 r12d r13d
-
-	/* Special inputs
-	 * processing loop
-	 */
 
+	/* edx has 1s where there was a special value that needs to be
+	   handled by a tanf call.  */
+	movl	%edx, %ebx
 L(SPECIAL_VALUES_LOOP):
-	incl	%r12d
-	cmpl	$16, %r12d
-
-	/* Check bits in range mask */
-	jl	L(RANGEMASK_CHECK)
-	# LOE rbx r15 r12d r13d
-
-	movq	16(%rsp), %r12
-	cfi_restore(12)
-	movq	8(%rsp), %r13
-	cfi_restore(13)
-	movq	(%rsp), %r14
-	cfi_restore(14)
-	vmovups	128(%rsp), %zmm1
-
-	/* Go to exit */
-	jmp	L(EXIT)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r12 r13 r14 r15 zmm1
-
-	/* Scalar math fucntion call
-	 * to process special input
-	 */
-
-L(SCALAR_MATH_CALL):
-	movl	%r12d, %r14d
-	vmovss	64(%rsp, %r14, 4), %xmm0
+
+	/* use rbp as index for special value that is saved across calls
+	   to tanf. We technically don't need a callee save register
+	   here as offset to rsp is always [0, 56] so we can restore
+	   rsp by realigning to 64. Essentially the tradeoff is 1 extra
+	   save/restore vs 2 extra instructions in the loop. Realigning
+	   also costs more code size.  */
+	xorl	%ebp, %ebp
+	tzcntl	%ebx, %ebp
+
+	/* Scalar math fucntion call to process special input.  */
+	movss	64(%rsp, %rbp, 4), %xmm0
 	call	log2f@PLT
-	# LOE rbx r14 r15 r12d r13d xmm0
 
-	vmovss	%xmm0, 128(%rsp, %r14, 4)
+	/* No good way to avoid the store-forwarding fault this will
+	   cause on return. `lfence` avoids the SF fault but at greater
+	   cost as it serialized stack/callee save restoration.  */
+	movss	%xmm0, (%rsp, %rbp, 4)
 
-	/* Process special inputs in loop */
-	jmp	L(SPECIAL_VALUES_LOOP)
-	# LOE rbx r15 r12d r13d
+	blsrl	%ebx, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+
+
+	/* All results have been written to 64(%rsp).  */
+	vmovaps	(%rsp), %zmm0
+	/* Restore rsp.  */
+	movq	%r13, %rsp
+	cfi_def_cfa (rsp, 32)
+	/* Restore callee save registers.  */
+	popq	%rbp
+	cfi_def_cfa (rsp, 24)
+	popq	%rbx
+	cfi_def_cfa (rsp, 16)
+	popq	%r13
+	ret
 END(_ZGVeN16v_log2f_skx)
 
-	.section .rodata, "a"
+	.section .rodata.evex512, "a"
 	.align	64
 
-#ifdef __svml_slog2_data_internal_avx512_typedef
-typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(64)) VUINT32 One[16][1];
-	__declspec(align(64)) VUINT32 coeff4[16][1];
-	__declspec(align(64)) VUINT32 coeff3[16][1];
-	__declspec(align(64)) VUINT32 coeff2[16][1];
-	__declspec(align(64)) VUINT32 coeff1[16][1];
-} __svml_slog2_data_internal_avx512;
-#endif
-__svml_slog2_data_internal_avx512:
-	/* One */
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	// c4
-	.align	64
-	.long	0xbea77e4a, 0xbe8aae3d
-	.long	0xbe67fe32, 0xbe43d1b6
-	.long	0xbe26a589, 0xbe0ee09b
-	.long	0xbdf6a8a1, 0xbdd63b49
-	.long	0xbf584e51, 0xbf3e80a1
-	.long	0xbf2892f0, 0xbf15d377
-	.long	0xbf05b525, 0xbeef8e30
-	.long	0xbed75c8f, 0xbec24184
-	// c3
-	.align	64
-	.long	0x3ef5910c, 0x3ef045a1
-	.long	0x3ee7d87e, 0x3eddbb84
-	.long	0x3ed2d6df, 0x3ec7bbd2
-	.long	0x3ebcc42f, 0x3eb22616
-	.long	0x3e8f3399, 0x3eb1223e
-	.long	0x3ec9db4a, 0x3edb7a09
-	.long	0x3ee79a1a, 0x3eef77cb
-	.long	0x3ef407a4, 0x3ef607b4
-	// c2
-	.align	64
-	.long	0xbf38a934, 0xbf387de6
-	.long	0xbf37f6f0, 0xbf37048b
-	.long	0xbf35a88a, 0xbf33ed04
-	.long	0xbf31df56, 0xbf2f8d82
-	.long	0xbf416814, 0xbf3daf58
-	.long	0xbf3b5c08, 0xbf39fa2a
-	.long	0xbf393713, 0xbf38d7e1
-	.long	0xbf38b2cd, 0xbf38aa62
-	// c1
-	.align	64
-	.long	0x3fb8aa3b, 0x3fb8a9c0
-	.long	0x3fb8a6e8, 0x3fb89f4e
-	.long	0x3fb890cb, 0x3fb879b1
-	.long	0x3fb858d8, 0x3fb82d90
-	.long	0x3fb8655e, 0x3fb8883a
-	.long	0x3fb89aea, 0x3fb8a42f
-	.long	0x3fb8a848, 0x3fb8a9c9
-	.long	0x3fb8aa2f, 0x3fb8aa3b
-	.align	64
-	.type	__svml_slog2_data_internal_avx512, @object
-	.size	__svml_slog2_data_internal_avx512, .-__svml_slog2_data_internal_avx512
+LOCAL_DATA_NAME:
+	float_block (LOCAL_DATA_NAME, _Coeff_4,
+		0xbea77e4a, 0xbe8aae3d,
+		0xbe67fe32, 0xbe43d1b6,
+		0xbe26a589, 0xbe0ee09b,
+		0xbdf6a8a1, 0xbdd63b49,
+		0xbf584e51, 0xbf3e80a1,
+		0xbf2892f0, 0xbf15d377,
+		0xbf05b525, 0xbeef8e30,
+		0xbed75c8f, 0xbec24184)
+
+	float_block (LOCAL_DATA_NAME, _Coeff_3,
+		0x3ef5910c, 0x3ef045a1,
+		0x3ee7d87e, 0x3eddbb84,
+		0x3ed2d6df, 0x3ec7bbd2,
+		0x3ebcc42f, 0x3eb22616,
+		0x3e8f3399, 0x3eb1223e,
+		0x3ec9db4a, 0x3edb7a09,
+		0x3ee79a1a, 0x3eef77cb,
+		0x3ef407a4, 0x3ef607b4)
+
+	float_block (LOCAL_DATA_NAME, _Coeff_2,
+		0xbf38a934, 0xbf387de6,
+		0xbf37f6f0, 0xbf37048b,
+		0xbf35a88a, 0xbf33ed04,
+		0xbf31df56, 0xbf2f8d82,
+		0xbf416814, 0xbf3daf58,
+		0xbf3b5c08, 0xbf39fa2a,
+		0xbf393713, 0xbf38d7e1,
+		0xbf38b2cd, 0xbf38aa62)
+
+	float_block (LOCAL_DATA_NAME, _Coeff_1,
+		0x3fb8aa3b, 0x3fb8a9c0,
+		0x3fb8a6e8, 0x3fb89f4e,
+		0x3fb890cb, 0x3fb879b1,
+		0x3fb858d8, 0x3fb82d90,
+		0x3fb8655e, 0x3fb8883a,
+		0x3fb89aea, 0x3fb8a42f,
+		0x3fb8a848, 0x3fb8a9c9,
+		0x3fb8aa2f, 0x3fb8aa3b)
+
+	.type	LOCAL_DATA_NAME, @object
+	.size	LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v1 22/27] x86/fpu: Optimize svml_s_log2f4_core_sse4.S
  2022-12-07  8:52 [PATCH v1 01/27] x86/fpu: Create helper file for common data macros Noah Goldstein via Libc-alpha
                   ` (19 preceding siblings ...)
  2022-12-07  8:52 ` [PATCH v1 21/27] x86/fpu: Optimize svml_s_log2f16_core_avx512.S Noah Goldstein via Libc-alpha
@ 2022-12-07  8:52 ` Noah Goldstein via Libc-alpha
  2022-12-07  8:52 ` [PATCH v1 23/27] x86/fpu: Optimize svml_s_log2f8_core_avx2.S Noah Goldstein via Libc-alpha
                   ` (5 subsequent siblings)
  26 siblings, 0 replies; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-07  8:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, andrey.kolesov, carlos

1. Improve special values case which ends up covering ~half of all
   float bit patterns.
2. Cleanup some missed optimizations in instruction selection /
   unnecessary repeated rodata references.
3. Remove unused rodata.
4. Use common data definitions where possible.

Code Size Change: -56 Bytes (256 - 312)

Input                                 New Time / Old Time
0F          (0x00000000)           -> 0.9048
0F          (0x0000ffff, Denorm)   -> 0.9910
.1F         (0x3dcccccd)           -> 0.9399
5F          (0x40a00000)           -> 0.9425
2315255808F (0x4f0a0000)           -> 0.9312
-NaN        (0xffffffff)           -> 0.9230
---
 .../fpu/multiarch/svml_s_log2f4_core_sse4.S   | 292 +++++++-----------
 1 file changed, 115 insertions(+), 177 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log2f4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log2f4_core_sse4.S
index bc6a778b75..b877bbb034 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log2f4_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log2f4_core_sse4.S
@@ -27,196 +27,134 @@
  *
  */
 
-/* Offsets for data table __svml_slog2_data_internal
- */
-#define MinNorm				0
-#define MaxNorm				16
-#define iBrkValue			32
-#define iOffExpoMask			48
-#define One				64
-#define sPoly				80
+#define LOCAL_DATA_NAME	__svml_slog2_data_internal
+#include "svml_s_common_sse4_rodata_offsets.h"
+/* Offsets for data table __svml_slog2_data_internal.  */
+#define _Coeff_3	0
+#define _Coeff_2	16
+#define _Coeff_7	32
+#define _Coeff_6	48
+#define _Coeff_9	64
+#define _Coeff_8	80
+#define _Coeff_5	96
+#define _Coeff_4	112
+#define _Coeff_1	128
+
 
 #include <sysdep.h>
 
 	.section .text.sse4, "ax", @progbits
 ENTRY(_ZGVbN4v_log2f_sse4)
-	subq	$72, %rsp
-	cfi_def_cfa_offset(80)
-	movaps	%xmm0, %xmm1
-
-	/* reduction: compute r, n */
-	movdqu	iBrkValue+__svml_slog2_data_internal(%rip), %xmm2
-	movaps	%xmm0, %xmm4
-	movdqu	iOffExpoMask+__svml_slog2_data_internal(%rip), %xmm10
-	psubd	%xmm2, %xmm1
-	pand	%xmm1, %xmm10
-	movaps	%xmm0, %xmm3
-	paddd	%xmm2, %xmm10
-	psrad	$23, %xmm1
-	movups	sPoly+__svml_slog2_data_internal(%rip), %xmm5
-	movups	sPoly+32+__svml_slog2_data_internal(%rip), %xmm6
-	movups	sPoly+64+__svml_slog2_data_internal(%rip), %xmm7
-	movups	sPoly+96+__svml_slog2_data_internal(%rip), %xmm9
-	cmpltps	MinNorm+__svml_slog2_data_internal(%rip), %xmm4
-	cmpnleps MaxNorm+__svml_slog2_data_internal(%rip), %xmm3
-	cvtdq2ps %xmm1, %xmm1
-	subps	One+__svml_slog2_data_internal(%rip), %xmm10
-	mulps	%xmm10, %xmm5
-	movaps	%xmm10, %xmm8
-	mulps	%xmm10, %xmm6
-	mulps	%xmm10, %xmm8
-	addps	sPoly+16+__svml_slog2_data_internal(%rip), %xmm5
-	mulps	%xmm10, %xmm7
-	addps	sPoly+48+__svml_slog2_data_internal(%rip), %xmm6
-	mulps	%xmm10, %xmm9
-	mulps	%xmm8, %xmm5
-	addps	sPoly+80+__svml_slog2_data_internal(%rip), %xmm7
-	addps	sPoly+112+__svml_slog2_data_internal(%rip), %xmm9
-	addps	%xmm5, %xmm6
-	mulps	%xmm8, %xmm6
-	orps	%xmm3, %xmm4
-
-	/* combine and get argument value range mask */
-	movmskps %xmm4, %edx
-	addps	%xmm6, %xmm7
-	mulps	%xmm7, %xmm8
-	addps	%xmm8, %xmm9
-	mulps	%xmm10, %xmm9
-	addps	sPoly+128+__svml_slog2_data_internal(%rip), %xmm9
-	mulps	%xmm9, %xmm10
-	addps	%xmm10, %xmm1
-	testl	%edx, %edx
-
-	/* Go to special inputs processing branch */
+	movdqu	COMMON_DATA(_NotiOffExpoMask)(%rip), %xmm1
+	movaps	%xmm0, %xmm2
+	psubd	%xmm1, %xmm0
+	movaps	COMMON_DATA(_ILoRange)(%rip), %xmm5
+
+	pcmpgtd	%xmm0, %xmm5
+	/* combine and get argument value range mask.  */
+	movmskps %xmm5, %eax
+	/* reduction: compute r, n.  */
+	movdqu	COMMON_DATA(_IBrkValue)(%rip), %xmm0
+	movaps	%xmm2, %xmm5
+	psubd	%xmm0, %xmm2
+	pandn	%xmm2, %xmm1
+	paddd	%xmm0, %xmm1
+	psrad	$0x17, %xmm2
+	cvtdq2ps %xmm2, %xmm0
+	subps	COMMON_DATA(_OneF)(%rip), %xmm1
+	movups	LOCAL_DATA(_Coeff_3)(%rip), %xmm2
+	mulps	%xmm1, %xmm2
+	addps	LOCAL_DATA(_Coeff_2)(%rip), %xmm2
+	movups	LOCAL_DATA(_Coeff_7)(%rip), %xmm4
+	mulps	%xmm1, %xmm4
+	addps	LOCAL_DATA(_Coeff_6)(%rip), %xmm4
+	movups	LOCAL_DATA(_Coeff_9)(%rip), %xmm3
+	mulps	%xmm1, %xmm3
+	addps	LOCAL_DATA(_Coeff_8)(%rip), %xmm3
+	movaps	%xmm1, %xmm6
+	mulps	%xmm1, %xmm1
+	mulps	%xmm1, %xmm3
+	addps	%xmm3, %xmm4
+	mulps	%xmm1, %xmm4
+	movups	LOCAL_DATA(_Coeff_5)(%rip), %xmm3
+	mulps	%xmm6, %xmm3
+	addps	LOCAL_DATA(_Coeff_4)(%rip), %xmm3
+	addps	%xmm4, %xmm3
+	mulps	%xmm3, %xmm1
+	addps	%xmm1, %xmm2
+	mulps	%xmm6, %xmm2
+	addps	LOCAL_DATA(_Coeff_1)(%rip), %xmm2
+	mulps	%xmm2, %xmm6
+	addps	%xmm6, %xmm0
+	testl	%eax, %eax
+	/* Go to special inputs processing branch.  */
 	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm1
-
-	/* Restore registers
-	 * and exit the function
-	 */
-
-L(EXIT):
-	movaps	%xmm1, %xmm0
-	addq	$72, %rsp
-	cfi_def_cfa_offset(8)
 	ret
-	cfi_def_cfa_offset(80)
-
-	/* Branch to process
-	 * special inputs
-	 */
 
+	/* Cold case. edx has 1s where there was a special value that
+	   more so than speed here.  */
 L(SPECIAL_VALUES_BRANCH):
-	movups	%xmm0, 32(%rsp)
-	movups	%xmm1, 48(%rsp)
-	# LOE rbx rbp r12 r13 r14 r15 edx
-
-	xorl	%eax, %eax
-	movq	%r12, 16(%rsp)
-	cfi_offset(12, -64)
-	movl	%eax, %r12d
-	movq	%r13, 8(%rsp)
-	cfi_offset(13, -72)
-	movl	%edx, %r13d
-	movq	%r14, (%rsp)
-	cfi_offset(14, -80)
-	# LOE rbx rbp r15 r12d r13d
-
-	/* Range mask
-	 * bits check
-	 */
-
-L(RANGEMASK_CHECK):
-	btl	%r12d, %r13d
-
-	/* Call scalar math function */
-	jc	L(SCALAR_MATH_CALL)
-	# LOE rbx rbp r15 r12d r13d
-
-	/* Special inputs
-	 * processing loop
-	 */
-
+	/* Stack coming in 16-byte aligned. Set 8-byte misaligned so on
+	   call entry will be 16-byte aligned.  */
+	subq	$0x38, %rsp
+	movups	%xmm0, 24(%rsp)
+	movups	%xmm5, 40(%rsp)
+
+	/* Use rbx/rbp for callee save registers as they get short
+	   encoding for many instructions (as compared with r12/r13).  */
+	movq	%rbx, (%rsp)
+	cfi_offset (rbx, -64)
+	movq	%rbp, 8(%rsp)
+	cfi_offset (rbp, -56)
+	/* edx has 1s where there was a special value that needs to be
+	   handled by a tanhf call.  */
+	movl	%eax, %ebx
 L(SPECIAL_VALUES_LOOP):
-	incl	%r12d
-	cmpl	$4, %r12d
-
-	/* Check bits in range mask */
-	jl	L(RANGEMASK_CHECK)
-	# LOE rbx rbp r15 r12d r13d
-
-	movq	16(%rsp), %r12
-	cfi_restore(12)
-	movq	8(%rsp), %r13
-	cfi_restore(13)
-	movq	(%rsp), %r14
-	cfi_restore(14)
-	movups	48(%rsp), %xmm1
-
-	/* Go to exit */
-	jmp	L(EXIT)
-	cfi_offset(12, -64)
-	cfi_offset(13, -72)
-	cfi_offset(14, -80)
-	# LOE rbx rbp r12 r13 r14 r15 xmm1
-
-	/* Scalar math fucntion call
-	 * to process special input
-	 */
-
-L(SCALAR_MATH_CALL):
-	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
-	call	log2f@PLT
-	# LOE rbx rbp r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 48(%rsp, %r14, 4)
+	/* use rbp as index for special value that is saved across calls
+	   to tanhf. We technically don't need a callee save register
+	   here as offset to rsp is always [0, 12] so we can restore
+	   rsp by realigning to 64. Essentially the tradeoff is 1 extra
+	   save/restore vs 2 extra instructions in the loop.  */
+	xorl	%ebp, %ebp
+	bsfl	%ebx, %ebp
+
+	/* Scalar math fucntion call to process special input.  */
+	movss	40(%rsp, %rbp, 4), %xmm0
+	call	log2f@PLT
 
-	/* Process special inputs in loop */
-	jmp	L(SPECIAL_VALUES_LOOP)
-	# LOE rbx rbp r15 r12d r13d
+	/* No good way to avoid the store-forwarding fault this will
+	   cause on return. `lfence` avoids the SF fault but at greater
+	   cost as it serialized stack/callee save restoration.  */
+	movss	%xmm0, 24(%rsp, %rbp, 4)
+
+	leal	-1(%rbx), %eax
+	andl	%eax, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+
+	/* All results have been written to 24(%rsp).  */
+	movups	24(%rsp), %xmm0
+	movq	(%rsp), %rbx
+	cfi_restore (rbx)
+	movq	8(%rsp), %rbp
+	cfi_restore (rbp)
+	addq	$56, %rsp
+	cfi_def_cfa_offset (8)
+	ret
 END(_ZGVbN4v_log2f_sse4)
 
-	.section .rodata, "a"
+	.section .rodata.sse4, "a"
 	.align	16
 
-#ifdef __svml_slog2_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(16)) VUINT32 MinNorm[4][1];
-	__declspec(align(16)) VUINT32 MaxNorm[4][1];
-	__declspec(align(16)) VUINT32 iBrkValue[4][1];
-	__declspec(align(16)) VUINT32 iOffExpoMask[4][1];
-	__declspec(align(16)) VUINT32 One[4][1];
-	__declspec(align(16)) VUINT32 sPoly[9][4][1];
-} __svml_slog2_data_internal;
-#endif
-__svml_slog2_data_internal:
-	/* MinNorm */
-	.long	0x00800000, 0x00800000, 0x00800000, 0x00800000
-	/* MaxNorm */
-	.align	16
-	.long	0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff
-	/* iBrkValue = SP 2/3 */
-	.align	16
-	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
-	/* iOffExpoMask = SP significand mask */
-	.align	16
-	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
-	/* sOne = SP 1.0 */
-	.align	16
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	/* spoly[9] */
-	.align	16
-	.long	0x3e554012, 0x3e554012, 0x3e554012, 0x3e554012 /* coeff9 */
-	.long	0xbe638E14, 0xbe638E14, 0xbe638E14, 0xbe638E14 /* coeff8 */
-	.long	0x3e4D660B, 0x3e4D660B, 0x3e4D660B, 0x3e4D660B /* coeff7 */
-	.long	0xbe727824, 0xbe727824, 0xbe727824, 0xbe727824 /* coeff6 */
-	.long	0x3e93DD07, 0x3e93DD07, 0x3e93DD07, 0x3e93DD07 /* coeff5 */
-	.long	0xbeB8B969, 0xbeB8B969, 0xbeB8B969, 0xbeB8B969 /* coeff4 */
-	.long	0x3eF637C0, 0x3eF637C0, 0x3eF637C0, 0x3eF637C0 /* coeff3 */
-	.long	0xbf38AA2B, 0xbf38AA2B, 0xbf38AA2B, 0xbf38AA2B /* coeff2 */
-	.long	0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B /* coeff1 */
-	.align	16
-	.type	__svml_slog2_data_internal, @object
-	.size	__svml_slog2_data_internal, .-__svml_slog2_data_internal
+LOCAL_DATA_NAME:
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_3, 0x3ef637c0)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_2, 0xbf38aa2b)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_7, 0x3e4d660b)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_6, 0xbe727824)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_9, 0x3e554012)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_8, 0xbe638e14)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_5, 0x3e93dd07)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_4, 0xbeb8b969)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_1, 0x3fb8aa3b)
+	.type	LOCAL_DATA_NAME, @object
+	.size	LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v1 23/27] x86/fpu: Optimize svml_s_log2f8_core_avx2.S
  2022-12-07  8:52 [PATCH v1 01/27] x86/fpu: Create helper file for common data macros Noah Goldstein via Libc-alpha
                   ` (20 preceding siblings ...)
  2022-12-07  8:52 ` [PATCH v1 22/27] x86/fpu: Optimize svml_s_log2f4_core_sse4.S Noah Goldstein via Libc-alpha
@ 2022-12-07  8:52 ` Noah Goldstein via Libc-alpha
  2022-12-07  8:52 ` [PATCH v1 24/27] x86/fpu: Optimize svml_s_logf16_core_avx512.S Noah Goldstein via Libc-alpha
                   ` (4 subsequent siblings)
  26 siblings, 0 replies; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-07  8:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, andrey.kolesov, carlos

1. Improve special values case which ends up covering ~half of all
   float bit patterns.
2. Cleanup some missed optimizations in instruction selection /
   unnecessary repeated rodata references.
3. Remove unused rodata.
4. Use common data definitions where possible.

Code Size Change: -43 Bytes (251 - 294)

Input                                 New Time / Old Time
0F          (0x00000000)           -> 0.8543
0F          (0x0000ffff, Denorm)   -> 0.9722
.1F         (0x3dcccccd)           -> 0.9009
5F          (0x40a00000)           -> 0.9432
2315255808F (0x4f0a0000)           -> 0.8680
-NaN        (0xffffffff)           -> 0.8154
---
 .../fpu/multiarch/svml_s_log2f8_core_avx2.S   | 290 ++++++++----------
 1 file changed, 123 insertions(+), 167 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log2f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log2f8_core_avx2.S
index 2245d40f84..2c56159323 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log2f8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log2f8_core_avx2.S
@@ -27,199 +27,155 @@
  *
  */
 
-/* Offsets for data table __svml_slog2_data_internal
- */
-#define MinNorm				0
-#define MaxNorm				32
-#define iBrkValue			64
-#define iOffExpoMask			96
-#define One				128
-#define sPoly				160
+#define LOCAL_DATA_NAME	__svml_slog2_data_internal
+#include "svml_s_common_avx2_rodata_offsets.h"
+/* Offsets for data table __svml_slog2_data_internal.  */
+#define _Coeff_9	0
+#define _Coeff_8	32
+#define _Coeff_7	64
+#define _Coeff_6	96
+#define _Coeff_5	128
+#define _Coeff_4	160
+#define _Coeff_3	192
+#define _Coeff_2	224
+#define _Coeff_1	256
 
 #include <sysdep.h>
 
 	.section .text.avx2, "ax", @progbits
 ENTRY(_ZGVdN8v_log2f_avx2)
-	pushq	%rbp
-	cfi_def_cfa_offset(16)
-	movq	%rsp, %rbp
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
-	andq	$-32, %rsp
-	subq	$96, %rsp
+	/* reduction: compute r, n.  */
+	vmovups	COMMON_DATA(_IBrkValue)(%rip), %ymm4
+
 
-	/* reduction: compute r, n */
-	vmovups	iBrkValue+__svml_slog2_data_internal(%rip), %ymm4
-	vmovups	sPoly+64+__svml_slog2_data_internal(%rip), %ymm9
-	vmovups	sPoly+128+__svml_slog2_data_internal(%rip), %ymm10
-	vmovups	sPoly+192+__svml_slog2_data_internal(%rip), %ymm12
 	vpsubd	%ymm4, %ymm0, %ymm1
-	vcmplt_oqps MinNorm+__svml_slog2_data_internal(%rip), %ymm0, %ymm5
-	vcmpnle_uqps MaxNorm+__svml_slog2_data_internal(%rip), %ymm0, %ymm6
-	vpand	iOffExpoMask+__svml_slog2_data_internal(%rip), %ymm1, %ymm3
+	vmovups	COMMON_DATA(_NotiOffExpoMask)(%rip), %ymm2
+	vpsubd	%ymm2, %ymm0, %ymm5
+	vmovups	COMMON_DATA(_ILoRange)(%rip), %ymm6
+	vpcmpgtd %ymm5, %ymm6, %ymm6
+	vpandn	%ymm1, %ymm2, %ymm3
 	vpsrad	$23, %ymm1, %ymm2
-	vmovups	sPoly+__svml_slog2_data_internal(%rip), %ymm1
-	vpaddd	%ymm4, %ymm3, %ymm8
-	vcvtdq2ps %ymm2, %ymm14
-	vsubps	One+__svml_slog2_data_internal(%rip), %ymm8, %ymm13
-	vfmadd213ps sPoly+32+__svml_slog2_data_internal(%rip), %ymm13, %ymm1
-	vfmadd213ps sPoly+96+__svml_slog2_data_internal(%rip), %ymm13, %ymm9
-	vmulps	%ymm13, %ymm13, %ymm11
-	vfmadd213ps sPoly+160+__svml_slog2_data_internal(%rip), %ymm13, %ymm10
-	vfmadd213ps sPoly+224+__svml_slog2_data_internal(%rip), %ymm13, %ymm12
-	vfmadd213ps %ymm9, %ymm11, %ymm1
-	vfmadd213ps %ymm10, %ymm11, %ymm1
-	vfmadd213ps %ymm12, %ymm11, %ymm1
-	vfmadd213ps sPoly+256+__svml_slog2_data_internal(%rip), %ymm13, %ymm1
-	vorps	%ymm6, %ymm5, %ymm7
-
-	/* combine and get argument value range mask */
-	vmovmskps %ymm7, %edx
-	vfmadd213ps %ymm14, %ymm13, %ymm1
+	vmovups	LOCAL_DATA(_Coeff_9)(%rip), %ymm1
+	vpaddd	%ymm4, %ymm3, %ymm3
+
+	vsubps	COMMON_DATA(_OneF)(%rip), %ymm3, %ymm3
+	vfmadd213ps LOCAL_DATA(_Coeff_8)(%rip), %ymm3, %ymm1
+	vmovups	LOCAL_DATA(_Coeff_7)(%rip), %ymm4
+	vfmadd213ps LOCAL_DATA(_Coeff_6)(%rip), %ymm3, %ymm4
+	vmulps	%ymm3, %ymm3, %ymm5
+	vmovups	LOCAL_DATA(_Coeff_5)(%rip), %ymm7
+	vfmadd213ps LOCAL_DATA(_Coeff_4)(%rip), %ymm3, %ymm7
+	vmovmskps %ymm6, %edx
+
+	vmovups	LOCAL_DATA(_Coeff_3)(%rip), %ymm6
+	vfmadd213ps LOCAL_DATA(_Coeff_2)(%rip), %ymm3, %ymm6
+	vfmadd213ps %ymm4, %ymm5, %ymm1
+	vfmadd213ps %ymm7, %ymm5, %ymm1
+	vfmadd213ps %ymm6, %ymm5, %ymm1
+	vfmadd213ps LOCAL_DATA(_Coeff_1)(%rip), %ymm3, %ymm1
+	vcvtdq2ps %ymm2, %ymm2
+
+	/* combine and get argument value range mask.  */
+
+	vfmadd213ps %ymm2, %ymm3, %ymm1
 	testl	%edx, %edx
 
-	/* Go to special inputs processing branch */
+	/* Go to special inputs processing branch.  */
 	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx r12 r13 r14 r15 edx ymm0 ymm1
 
-	/* Restore registers
-	 * and exit the function
-	 */
 
-L(EXIT):
 	vmovaps	%ymm1, %ymm0
-	movq	%rbp, %rsp
-	popq	%rbp
-	cfi_def_cfa(7, 8)
-	cfi_restore(6)
 	ret
-	cfi_def_cfa(6, 16)
-	cfi_offset(6, -16)
+	/* Cold case. edx has 1s where there was a special value that
+	   needs to be handled by a log2f call. Optimize for code size
+	   more so than speed here.  */
+L(SPECIAL_VALUES_BRANCH):
 
-	/* Branch to process
-	 * special inputs
-	 */
+	/* Use r13 to save/restore the stack. This allows us to use rbp
+	   as callee save register saving code size.  */
+	pushq	%r13
+	cfi_adjust_cfa_offset (8)
+	cfi_offset (r13, -16)
+	/* Need to callee save registers to preserve state across
+	   log2f calls.  */
+	pushq	%rbx
+	cfi_adjust_cfa_offset (8)
+	cfi_offset (rbx, -24)
+	pushq	%rbp
+	cfi_adjust_cfa_offset (8)
+	cfi_offset (rbp, -32)
+	movq	%rsp, %r13
+	cfi_def_cfa_register (r13)
 
-L(SPECIAL_VALUES_BRANCH):
-	vmovups	%ymm0, 32(%rsp)
-	vmovups	%ymm1, 64(%rsp)
-	# LOE rbx r12 r13 r14 r15 edx ymm1
+	/* Align stack and make room for 2x ymm vectors.  */
+	andq	$-32, %rsp
+	addq	$-64, %rsp
 
-	xorl	%eax, %eax
-	# LOE rbx r12 r13 r14 r15 eax edx
+	/* Save all already computed inputs.  */
+	vmovups	%ymm1, (%rsp)
+	/* Save original input (ymm0 unchanged up to this point).  */
+	vmovups	%ymm0, 32(%rsp)
 
 	vzeroupper
-	movq	%r12, 16(%rsp)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
-	movl	%eax, %r12d
-	movq	%r13, 8(%rsp)
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
-	movl	%edx, %r13d
-	movq	%r14, (%rsp)
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r15 r12d r13d
-
-	/* Range mask
-	 * bits check
-	 */
-
-L(RANGEMASK_CHECK):
-	btl	%r12d, %r13d
-
-	/* Call scalar math function */
-	jc	L(SCALAR_MATH_CALL)
-	# LOE rbx r15 r12d r13d
-
-	/* Special inputs
-	 * processing loop
-	 */
 
+	/* edx has 1s where there was a special value that needs to be
+	   handled by a log2f call.  */
+	movl	%edx, %ebx
 L(SPECIAL_VALUES_LOOP):
-	incl	%r12d
-	cmpl	$8, %r12d
-
-	/* Check bits in range mask */
-	jl	L(RANGEMASK_CHECK)
-	# LOE rbx r15 r12d r13d
-
-	movq	16(%rsp), %r12
-	cfi_restore(12)
-	movq	8(%rsp), %r13
-	cfi_restore(13)
-	movq	(%rsp), %r14
-	cfi_restore(14)
-	vmovups	64(%rsp), %ymm1
-
-	/* Go to exit */
-	jmp	L(EXIT)
-	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
-	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
-	# LOE rbx r12 r13 r14 r15 ymm1
-
-	/* Scalar math fucntion call
-	 * to process special input
-	 */
-
-L(SCALAR_MATH_CALL):
-	movl	%r12d, %r14d
-	vmovss	32(%rsp, %r14, 4), %xmm0
+
+	/* use rbp as index for special value that is saved across calls
+	   to log2f. We technically don't need a callee save register
+	   here as offset to rsp is always [0, 28] so we can restore
+	   rsp by realigning to 64. Essentially the tradeoff is 1 extra
+	   save/restore vs 2 extra instructions in the loop. Realigning
+	   also costs more code size.  */
+	xorl	%ebp, %ebp
+	tzcntl	%ebx, %ebp
+
+	/* Scalar math function call to process special input.  */
+	vmovss	32(%rsp, %rbp, 4), %xmm0
 	call	log2f@PLT
-	# LOE rbx r14 r15 r12d r13d xmm0
+	/* No good way to avoid the store-forwarding fault this will
+	   cause on return. `lfence` avoids the SF fault but at greater
+	   cost as it serialized stack/callee save restoration.  */
+	vmovss	%xmm0, (%rsp, %rbp, 4)
+
+	blsrl	%ebx, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+
 
-	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
-	/* Process special inputs in loop */
-	jmp	L(SPECIAL_VALUES_LOOP)
-	# LOE rbx r15 r12d r13d
+	/* All results have been written to (%rsp).  */
+	vmovups	(%rsp), %ymm0
+	/* Restore rsp.  */
+	movq	%r13, %rsp
+	cfi_def_cfa_register (rsp)
+	/* Restore callee save registers.  */
+	popq	%rbp
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (rbp)
+	popq	%rbx
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (rbx)
+	popq	%r13
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (r13)
+	ret
 END(_ZGVdN8v_log2f_avx2)
 
-	.section .rodata, "a"
+	.section .rodata.avx2, "a"
 	.align	32
 
-#ifdef __svml_slog2_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct {
-	__declspec(align(32)) VUINT32 MinNorm[8][1];
-	__declspec(align(32)) VUINT32 MaxNorm[8][1];
-	__declspec(align(32)) VUINT32 iBrkValue[8][1];
-	__declspec(align(32)) VUINT32 iOffExpoMask[8][1];
-	__declspec(align(32)) VUINT32 One[8][1];
-	__declspec(align(32)) VUINT32 sPoly[9][8][1];
-} __svml_slog2_data_internal;
-#endif
-__svml_slog2_data_internal:
-	/* MinNorm */
-	.long	0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000
-	/* MaxNorm */
-	.align	32
-	.long	0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff
-	/* iBrkValue = SP 2/3 */
-	.align	32
-	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
-	/* iOffExpoMask = SP significand mask */
-	.align	32
-	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
-	/* sOne = SP 1.0 */
-	.align	32
-	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-	/* spoly[9] */
-	.align	32
-	.long	0x3e554012, 0x3e554012, 0x3e554012, 0x3e554012, 0x3e554012, 0x3e554012, 0x3e554012, 0x3e554012 /* coeff9 */
-	.long	0xbe638E14, 0xbe638E14, 0xbe638E14, 0xbe638E14, 0xbe638E14, 0xbe638E14, 0xbe638E14, 0xbe638E14 /* coeff8 */
-	.long	0x3e4D660B, 0x3e4D660B, 0x3e4D660B, 0x3e4D660B, 0x3e4D660B, 0x3e4D660B, 0x3e4D660B, 0x3e4D660B /* coeff7 */
-	.long	0xbe727824, 0xbe727824, 0xbe727824, 0xbe727824, 0xbe727824, 0xbe727824, 0xbe727824, 0xbe727824 /* coeff6 */
-	.long	0x3e93DD07, 0x3e93DD07, 0x3e93DD07, 0x3e93DD07, 0x3e93DD07, 0x3e93DD07, 0x3e93DD07, 0x3e93DD07 /* coeff5 */
-	.long	0xbeB8B969, 0xbeB8B969, 0xbeB8B969, 0xbeB8B969, 0xbeB8B969, 0xbeB8B969, 0xbeB8B969, 0xbeB8B969 /* coeff4 */
-	.long	0x3eF637C0, 0x3eF637C0, 0x3eF637C0, 0x3eF637C0, 0x3eF637C0, 0x3eF637C0, 0x3eF637C0, 0x3eF637C0 /* coeff3 */
-	.long	0xbf38AA2B, 0xbf38AA2B, 0xbf38AA2B, 0xbf38AA2B, 0xbf38AA2B, 0xbf38AA2B, 0xbf38AA2B, 0xbf38AA2B /* coeff2 */
-	.long	0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B /* coeff1 */
-	.align	32
-	.type	__svml_slog2_data_internal, @object
-	.size	__svml_slog2_data_internal, .-__svml_slog2_data_internal
+LOCAL_DATA_NAME:
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_9, 0x3e554012)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_8, 0xbe638e14)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_7, 0x3e4d660b)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_6, 0xbe727824)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_5, 0x3e93dd07)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_4, 0xbeb8b969)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_3, 0x3ef637c0)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_2, 0xbf38aa2b)
+	DATA_VEC (LOCAL_DATA_NAME, _Coeff_1, 0x3fb8aa3b)
+
+	.type	LOCAL_DATA_NAME, @object
+	.size	LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v1 24/27] x86/fpu: Optimize svml_s_logf16_core_avx512.S
  2022-12-07  8:52 [PATCH v1 01/27] x86/fpu: Create helper file for common data macros Noah Goldstein via Libc-alpha
                   ` (21 preceding siblings ...)
  2022-12-07  8:52 ` [PATCH v1 23/27] x86/fpu: Optimize svml_s_log2f8_core_avx2.S Noah Goldstein via Libc-alpha
@ 2022-12-07  8:52 ` Noah Goldstein via Libc-alpha
  2022-12-07  8:52 ` [PATCH v1 25/27] x86/fpu: Optimize svml_s_logf4_core_sse4.S Noah Goldstein via Libc-alpha
                   ` (3 subsequent siblings)
  26 siblings, 0 replies; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-07  8:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, andrey.kolesov, carlos

1. Improve special values case which ends up covering ~half of all
   float bit patterns.
2. Cleanup some missed optimizations in instruction selection /
   unnecessary repeated rodata references.
3. Remove unused rodata.
4. Use common data definitions where possible.

As well, instead of using the shared `__svml_slogf_data`, just define
the data locally.  This is because: 1) It's not really ideal for
the sse4/avx2 to reuse the avx512 tables as it pollutes the cache
with unnecessarily large blocks.  2) Really only one of the
versions is ever expected to be used by a given process so there
isn't any constructive caching between them. And 3) there is not
enough data shared to make up for the first two reasons.

Lastly, the avx512-skl implementation no longer uses `vpternlogd` (the
only AVX512-DQ instruction) so make the avx512-knl implementation
just an alias to the SKL one.

Code Size Change: -1285 Bytes (267 - 1552) (With removing KNL impl)

Input                                 New Time / Old Time
0F          (0x00000000)           -> 0.7324
0F          (0x0000ffff, Denorm)   -> 0.9708
.1F         (0x3dcccccd)           -> 0.9535
5F          (0x40a00000)           -> 0.9699
2315255808F (0x4f0a0000)           -> 0.9909
-NaN        (0xffffffff)           -> 0.6798
---
 .../fpu/multiarch/svml_s_logf16_core_avx512.S | 553 ++++++------------
 1 file changed, 173 insertions(+), 380 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
index eb830a6579..ebc959aa74 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
@@ -16,388 +16,181 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
+#define LOCAL_DATA_NAME	__svml_slog_data_internal
+#define LOCAL_DATA_NAME_UNALIGNED	__svml_slog_data_internal_unaligned
+#include "svml_s_common_evex512_rodata_offsets.h"
+
+/* Offsets for data table __svml_slog_data_internal_unaligned.
+   4 bytes each.  */
+#define _iLoRange	0
+
+/* Offsets for data table __svml_slog_data_internal
+   64 bytes each.  */
+#define _iBrkValue	0
+#define _sPoly_7	64
+#define _sPoly_6	128
+#define _sPoly_5	192
+#define _sPoly_4	256
+#define _sPoly_3	320
+#define _sPoly_2	384
+#define _sLn2	448
+
 #include <sysdep.h>
-#include "svml_s_logf_data.h"
-#include "svml_s_wrapper_impl.h"
 
 	.section .text.evex512, "ax", @progbits
-ENTRY (_ZGVeN16v_logf_knl)
-/*
-   ALGORITHM DESCRIPTION:
-
-     log(x) = exponent_x*log(2) + log(mantissa_x),         if mantissa_x<4/3
-     log(x) = (exponent_x+1)*log(2) + log(0.5*mantissa_x), if mantissa_x>4/3
-
-     R = mantissa_x - 1,     if mantissa_x<4/3
-     R = 0.5*mantissa_x - 1, if mantissa_x>4/3
-     |R|< 1/3
-
-     log(1+R) is approximated as a polynomial: degree 9 for 1-ulp,
-     degree 7 for 4-ulp, degree 3 for half-precision.  */
-
-        pushq     %rbp
-        cfi_adjust_cfa_offset (8)
-        cfi_rel_offset (%rbp, 0)
-        movq      %rsp, %rbp
-        cfi_def_cfa_register (%rbp)
-        andq      $-64, %rsp
-        subq      $1280, %rsp
-        movq      __svml_slog_data@GOTPCREL(%rip), %rax
-        movl      $-1, %ecx
-
-/* reduction: compute r,n */
-        vpsubd    _iBrkValue(%rax), %zmm0, %zmm2
-        vmovups   _sPoly_7(%rax), %zmm7
-        vpandd    _iOffExpoMask(%rax), %zmm2, %zmm3
-
-/* exponent_x (mantissa_x<4/3) or exponent_x+1 (mantissa_x>4/3) */
-        vpsrad    $23, %zmm2, %zmm4
-
-/* check for working range,
-   set special argument mask (denormals/zero/Inf/NaN)
- */
-        vpaddd    _iHiDelta(%rax), %zmm0, %zmm1
-
-/* mantissa_x (mantissa_x<4/3), or 0.5*mantissa_x (mantissa_x>4/3) */
-        vpaddd    _iBrkValue(%rax), %zmm3, %zmm6
-        vpcmpd    $1, _iLoRange(%rax), %zmm1, %k1
-        vcvtdq2ps {rn-sae}, %zmm4, %zmm1
-
-/* reduced argument R */
-        vsubps       _sOne(%rax), %zmm6, %zmm8
-        vpbroadcastd %ecx, %zmm5{%k1}{z}
-
-/* polynomial evaluation starts here */
-        vfmadd213ps _sPoly_6(%rax), %zmm8, %zmm7
-        vptestmd    %zmm5, %zmm5, %k0
-        kmovw       %k0, %ecx
-        vfmadd213ps _sPoly_5(%rax), %zmm8, %zmm7
-        vfmadd213ps _sPoly_4(%rax), %zmm8, %zmm7
-        vfmadd213ps _sPoly_3(%rax), %zmm8, %zmm7
-        vfmadd213ps _sPoly_2(%rax), %zmm8, %zmm7
-        vfmadd213ps _sPoly_1(%rax), %zmm8, %zmm7
-        vmulps      %zmm8, %zmm7, %zmm9
-
-/* polynomial evaluation end */
-        vfmadd213ps %zmm8, %zmm8, %zmm9
-
-/*
-   final reconstruction:
-   add exponent_value*log2 to polynomial result
- */
-        vfmadd132ps _sLn2(%rax), %zmm9, %zmm1
-        testl       %ecx, %ecx
-        jne         .LBL_1_3
-
-.LBL_1_2:
-        cfi_remember_state
-        vmovaps   %zmm1, %zmm0
-        movq      %rbp, %rsp
-        cfi_def_cfa_register (%rsp)
-        popq      %rbp
-        cfi_adjust_cfa_offset (-8)
-        cfi_restore (%rbp)
-        ret
-
-.LBL_1_3:
-        cfi_restore_state
-        vmovups   %zmm0, 1152(%rsp)
-        vmovups   %zmm1, 1216(%rsp)
-        je        .LBL_1_2
-
-        xorb      %dl, %dl
-        kmovw     %k4, 1048(%rsp)
-        xorl      %eax, %eax
-        kmovw     %k5, 1040(%rsp)
-        kmovw     %k6, 1032(%rsp)
-        kmovw     %k7, 1024(%rsp)
-        vmovups   %zmm16, 960(%rsp)
-        vmovups   %zmm17, 896(%rsp)
-        vmovups   %zmm18, 832(%rsp)
-        vmovups   %zmm19, 768(%rsp)
-        vmovups   %zmm20, 704(%rsp)
-        vmovups   %zmm21, 640(%rsp)
-        vmovups   %zmm22, 576(%rsp)
-        vmovups   %zmm23, 512(%rsp)
-        vmovups   %zmm24, 448(%rsp)
-        vmovups   %zmm25, 384(%rsp)
-        vmovups   %zmm26, 320(%rsp)
-        vmovups   %zmm27, 256(%rsp)
-        vmovups   %zmm28, 192(%rsp)
-        vmovups   %zmm29, 128(%rsp)
-        vmovups   %zmm30, 64(%rsp)
-        vmovups   %zmm31, (%rsp)
-        movq      %rsi, 1064(%rsp)
-        movq      %rdi, 1056(%rsp)
-        movq      %r12, 1096(%rsp)
-        cfi_offset_rel_rsp (12, 1096)
-        movb      %dl, %r12b
-        movq      %r13, 1088(%rsp)
-        cfi_offset_rel_rsp (13, 1088)
-        movl      %ecx, %r13d
-        movq      %r14, 1080(%rsp)
-        cfi_offset_rel_rsp (14, 1080)
-        movl      %eax, %r14d
-        movq      %r15, 1072(%rsp)
-        cfi_offset_rel_rsp (15, 1072)
-        cfi_remember_state
-
-.LBL_1_6:
-        btl       %r14d, %r13d
-        jc        .LBL_1_12
-
-.LBL_1_7:
-        lea       1(%r14), %esi
-        btl       %esi, %r13d
-        jc        .LBL_1_10
-
-.LBL_1_8:
-        addb      $1, %r12b
-        addl      $2, %r14d
-        cmpb      $16, %r12b
-        jb        .LBL_1_6
-
-        kmovw     1048(%rsp), %k4
-        movq      1064(%rsp), %rsi
-        kmovw     1040(%rsp), %k5
-        movq      1056(%rsp), %rdi
-        kmovw     1032(%rsp), %k6
-        movq      1096(%rsp), %r12
-        cfi_restore (%r12)
-        movq      1088(%rsp), %r13
-        cfi_restore (%r13)
-        kmovw     1024(%rsp), %k7
-        vmovups   960(%rsp), %zmm16
-        vmovups   896(%rsp), %zmm17
-        vmovups   832(%rsp), %zmm18
-        vmovups   768(%rsp), %zmm19
-        vmovups   704(%rsp), %zmm20
-        vmovups   640(%rsp), %zmm21
-        vmovups   576(%rsp), %zmm22
-        vmovups   512(%rsp), %zmm23
-        vmovups   448(%rsp), %zmm24
-        vmovups   384(%rsp), %zmm25
-        vmovups   320(%rsp), %zmm26
-        vmovups   256(%rsp), %zmm27
-        vmovups   192(%rsp), %zmm28
-        vmovups   128(%rsp), %zmm29
-        vmovups   64(%rsp), %zmm30
-        vmovups   (%rsp), %zmm31
-        movq      1080(%rsp), %r14
-        cfi_restore (%r14)
-        movq      1072(%rsp), %r15
-        cfi_restore (%r15)
-        vmovups   1216(%rsp), %zmm1
-        jmp       .LBL_1_2
-
-.LBL_1_10:
-        cfi_restore_state
-        movzbl    %r12b, %r15d
-        vmovss    1156(%rsp,%r15,8), %xmm0
-        call      JUMPTARGET(logf)
-        vmovss    %xmm0, 1220(%rsp,%r15,8)
-        jmp       .LBL_1_8
-
-.LBL_1_12:
-        movzbl    %r12b, %r15d
-        vmovss    1152(%rsp,%r15,8), %xmm0
-        call      JUMPTARGET(logf)
-        vmovss    %xmm0, 1216(%rsp,%r15,8)
-        jmp       .LBL_1_7
-END (_ZGVeN16v_logf_knl)
-
 ENTRY (_ZGVeN16v_logf_skx)
-/*
-   ALGORITHM DESCRIPTION:
-
-     log(x) = exponent_x*log(2) + log(mantissa_x),         if mantissa_x<4/3
-     log(x) = (exponent_x+1)*log(2) + log(0.5*mantissa_x), if mantissa_x>4/3
-
-     R = mantissa_x - 1,     if mantissa_x<4/3
-     R = 0.5*mantissa_x - 1, if mantissa_x>4/3
-     |R|< 1/3
-
-     log(1+R) is approximated as a polynomial: degree 9 for 1-ulp,
-     degree 7 for 4-ulp, degree 3 for half-precision.  */
-
-        pushq     %rbp
-        cfi_adjust_cfa_offset (8)
-        cfi_rel_offset (%rbp, 0)
-        movq      %rsp, %rbp
-        cfi_def_cfa_register (%rbp)
-        andq      $-64, %rsp
-        subq      $1280, %rsp
-        movq      __svml_slog_data@GOTPCREL(%rip), %rax
-        vpternlogd $0xff, %zmm6, %zmm6, %zmm6
-        vmovups _iBrkValue(%rax), %zmm4
-        vmovups _sPoly_7(%rax), %zmm8
-
-/*
-   check for working range,
-   set special argument mask (denormals/zero/Inf/NaN)
- */
-        vpaddd _iHiDelta(%rax), %zmm0, %zmm1
-
-/* reduction: compute r,n */
-        vpsubd    %zmm4, %zmm0, %zmm2
-        vpcmpd    $5, _iLoRange(%rax), %zmm1, %k1
-
-/* exponent_x (mantissa_x<4/3) or exponent_x+1 (mantissa_x>4/3) */
-        vpsrad    $23, %zmm2, %zmm5
-        vpandd _iOffExpoMask(%rax), %zmm2, %zmm3
-
-/* mantissa_x (mantissa_x<4/3), or 0.5*mantissa_x (mantissa_x>4/3) */
-        vpaddd    %zmm4, %zmm3, %zmm7
-
-/* reduced argument R */
-        vsubps _sOne(%rax), %zmm7, %zmm9
-
-/* polynomial evaluation starts here */
-        vfmadd213ps _sPoly_6(%rax), %zmm9, %zmm8
-        vfmadd213ps _sPoly_5(%rax), %zmm9, %zmm8
-        vfmadd213ps _sPoly_4(%rax), %zmm9, %zmm8
-        vfmadd213ps _sPoly_3(%rax), %zmm9, %zmm8
-        vfmadd213ps _sPoly_2(%rax), %zmm9, %zmm8
-        vfmadd213ps _sPoly_1(%rax), %zmm9, %zmm8
-        vmulps    %zmm9, %zmm8, %zmm10
-
-/* polynomial evaluation end */
-        vfmadd213ps %zmm9, %zmm9, %zmm10
-        vpandnd   %zmm1, %zmm1, %zmm6{%k1}
-        vptestmd  %zmm6, %zmm6, %k0
-        vcvtdq2ps {rn-sae}, %zmm5, %zmm1
-        kmovw     %k0, %ecx
-
-/*
-   final reconstruction:
-   add exponent_value*log2 to polynomial result
- */
-        vfmadd132ps _sLn2(%rax), %zmm10, %zmm1
-        testl     %ecx, %ecx
-        jne       .LBL_2_3
-
-.LBL_2_2:
-        cfi_remember_state
-        vmovaps   %zmm1, %zmm0
-        movq      %rbp, %rsp
-        cfi_def_cfa_register (%rsp)
-        popq      %rbp
-        cfi_adjust_cfa_offset (-8)
-        cfi_restore (%rbp)
-        ret
-
-.LBL_2_3:
-        cfi_restore_state
-        vmovups   %zmm0, 1152(%rsp)
-        vmovups   %zmm1, 1216(%rsp)
-        je        .LBL_2_2
-
-        xorb      %dl, %dl
-        xorl      %eax, %eax
-        kmovw     %k4, 1048(%rsp)
-        kmovw     %k5, 1040(%rsp)
-        kmovw     %k6, 1032(%rsp)
-        kmovw     %k7, 1024(%rsp)
-        vmovups   %zmm16, 960(%rsp)
-        vmovups   %zmm17, 896(%rsp)
-        vmovups   %zmm18, 832(%rsp)
-        vmovups   %zmm19, 768(%rsp)
-        vmovups   %zmm20, 704(%rsp)
-        vmovups   %zmm21, 640(%rsp)
-        vmovups   %zmm22, 576(%rsp)
-        vmovups   %zmm23, 512(%rsp)
-        vmovups   %zmm24, 448(%rsp)
-        vmovups   %zmm25, 384(%rsp)
-        vmovups   %zmm26, 320(%rsp)
-        vmovups   %zmm27, 256(%rsp)
-        vmovups   %zmm28, 192(%rsp)
-        vmovups   %zmm29, 128(%rsp)
-        vmovups   %zmm30, 64(%rsp)
-        vmovups   %zmm31, (%rsp)
-        movq      %rsi, 1064(%rsp)
-        movq      %rdi, 1056(%rsp)
-        movq      %r12, 1096(%rsp)
-        cfi_offset_rel_rsp (12, 1096)
-        movb      %dl, %r12b
-        movq      %r13, 1088(%rsp)
-        cfi_offset_rel_rsp (13, 1088)
-        movl      %ecx, %r13d
-        movq      %r14, 1080(%rsp)
-        cfi_offset_rel_rsp (14, 1080)
-        movl      %eax, %r14d
-        movq      %r15, 1072(%rsp)
-        cfi_offset_rel_rsp (15, 1072)
-        cfi_remember_state
-
-.LBL_2_6:
-        btl       %r14d, %r13d
-        jc        .LBL_2_12
-
-.LBL_2_7:
-        lea       1(%r14), %esi
-        btl       %esi, %r13d
-        jc        .LBL_2_10
-
-.LBL_2_8:
-        incb      %r12b
-        addl      $2, %r14d
-        cmpb      $16, %r12b
-        jb        .LBL_2_6
-
-        kmovw     1048(%rsp), %k4
-        kmovw     1040(%rsp), %k5
-        kmovw     1032(%rsp), %k6
-        kmovw     1024(%rsp), %k7
-        vmovups   960(%rsp), %zmm16
-        vmovups   896(%rsp), %zmm17
-        vmovups   832(%rsp), %zmm18
-        vmovups   768(%rsp), %zmm19
-        vmovups   704(%rsp), %zmm20
-        vmovups   640(%rsp), %zmm21
-        vmovups   576(%rsp), %zmm22
-        vmovups   512(%rsp), %zmm23
-        vmovups   448(%rsp), %zmm24
-        vmovups   384(%rsp), %zmm25
-        vmovups   320(%rsp), %zmm26
-        vmovups   256(%rsp), %zmm27
-        vmovups   192(%rsp), %zmm28
-        vmovups   128(%rsp), %zmm29
-        vmovups   64(%rsp), %zmm30
-        vmovups   (%rsp), %zmm31
-        vmovups   1216(%rsp), %zmm1
-        movq      1064(%rsp), %rsi
-        movq      1056(%rsp), %rdi
-        movq      1096(%rsp), %r12
-        cfi_restore (%r12)
-        movq      1088(%rsp), %r13
-        cfi_restore (%r13)
-        movq      1080(%rsp), %r14
-        cfi_restore (%r14)
-        movq      1072(%rsp), %r15
-        cfi_restore (%r15)
-        jmp       .LBL_2_2
-
-.LBL_2_10:
-        cfi_restore_state
-        movzbl    %r12b, %r15d
-        vmovss    1156(%rsp,%r15,8), %xmm0
-        vzeroupper
-        vmovss    1156(%rsp,%r15,8), %xmm0
-
-        call      JUMPTARGET(logf)
-
-        vmovss    %xmm0, 1220(%rsp,%r15,8)
-        jmp       .LBL_2_8
-
-.LBL_2_12:
-        movzbl    %r12b, %r15d
-        vmovss    1152(%rsp,%r15,8), %xmm0
-        vzeroupper
-        vmovss    1152(%rsp,%r15,8), %xmm0
-
-        call      JUMPTARGET(logf)
-
-        vmovss    %xmm0, 1216(%rsp,%r15,8)
-        jmp       .LBL_2_7
-
+	/* ALGORITHM DESCRIPTION:
+	   if mantissa_x<4/3
+        log(x) = exponent_x*log(2) + log(mantissa_x)
+	   if mantissa_x>4/3
+        log(x) = (exponent_x+1)*log(2) + log(0.5*mantissa_x)
+
+	   R = mantissa_x - 1,     if mantissa_x<4/3
+	   R = 0.5*mantissa_x - 1, if mantissa_x>4/3
+	   |R|< 1/3
+
+	   log(1+R) is approximated as a polynomial: degree 9 for
+	   1-ulp, degree 7 for 4-ulp, degree 3 for half-precision.  */
+	vmovups	LOCAL_DATA(_iBrkValue)(%rip), %zmm4
+	vmovups	LOCAL_DATA(_sPoly_7)(%rip), %zmm8
+
+	/* check for working range, set special argument mask
+	   (denormals/zero/Inf/NaN).  */
+	vmovups	COMMON_DATA(_NotiOffExpoMask)(%rip), %zmm3
+	vpsubd	%zmm3, %zmm0, %zmm1
+
+	/* reduction: compute r,n.  */
+	vpsubd	%zmm4, %zmm0, %zmm2
+	vpcmpd	$2, LOCAL_DATA_UNALIGNED(_iLoRange)(%rip){1to16}, %zmm1, %k0
+
+	/* exponent_x (mantissa_x<4/3),
+	   or exponent_x+1 (mantissa_x>4/3).  */
+	vpsrad	$23, %zmm2, %zmm5
+	vpandnd	%zmm2, %zmm3, %zmm3
+
+	/* mantissa_x (mantissa_x<4/3),
+	   or 0.5 mantissa_x (mantissa_x>4/3).  */
+	vpaddd	%zmm4, %zmm3, %zmm7
+
+	/* reduced argument R.  */
+	vsubps	COMMON_DATA(_OneF)(%rip), %zmm7, %zmm9
+
+	/* polynomial evaluation starts here.  */
+	vfmadd213ps LOCAL_DATA(_sPoly_6)(%rip), %zmm9, %zmm8
+	vfmadd213ps LOCAL_DATA(_sPoly_5)(%rip), %zmm9, %zmm8
+	vfmadd213ps LOCAL_DATA(_sPoly_4)(%rip), %zmm9, %zmm8
+	vfmadd213ps LOCAL_DATA(_sPoly_3)(%rip), %zmm9, %zmm8
+	vfmadd213ps LOCAL_DATA(_sPoly_2)(%rip), %zmm9, %zmm8
+	vfmadd213ps COMMON_DATA(_Neg5F)(%rip), %zmm9, %zmm8
+	vmulps	%zmm9, %zmm8, %zmm10
+
+	/* polynomial evaluation end.  */
+	vfmadd213ps %zmm9, %zmm9, %zmm10
+	vcvtdq2ps {rn-sae}, %zmm5, %zmm1
+	kmovw	%k0, %ecx
+	/* final reconstruction: add exponent_value
+	   log2 to polynomial result.  */
+	vfmadd132ps LOCAL_DATA(_sLn2)(%rip), %zmm10, %zmm1
+	/* Branch to process special inputs.  */
+	testl	%ecx, %ecx
+	jne	L(SPECIAL_VALUES_BRANCH)
+
+	vmovaps	%zmm1, %zmm0
+	ret
+
+	/* Cold case. ecx has 1s where there was a special value that
+	   needs to be handled by a logf call. Optimize for code size
+	   more so than speed here.  */
+L(SPECIAL_VALUES_BRANCH):
+	/* Use r13 to save/restore the stack. This allows us to use rbp
+	   as callee save register saving code size.  */
+	pushq	%r13
+	cfi_def_cfa (rsp, 16)
+	/* Need to callee save registers to preserve state across
+	   logf calls.  */
+	pushq	%rbx
+	cfi_def_cfa (rsp, 24)
+	pushq	%rbp
+	cfi_def_cfa (rsp, 32)
+	movq	%rsp, %r13
+	cfi_def_cfa (r13, 32)
+
+	/* Align stack and make room for 2x zmm vectors.  */
+	andq	$-64, %rsp
+	addq	$-128, %rsp
+
+	/* Save original input.  */
+	vmovaps	%zmm0, 64(%rsp)
+	/* Save all already computed inputs.  */
+	vmovaps	%zmm1, (%rsp)
+
+	vzeroupper
+
+	/* ecx has 1s where there was a special value that needs to be
+	   handled by a logf call.  */
+	movl	%ecx, %ebx
+L(SPECIAL_VALUES_LOOP):
+
+	/* use rbp as index for special value that is saved across calls
+	   to logf. We technically don't need a callee save register
+	   here as offset to rsp is always [0, 56] so we can restore
+	   rsp by realigning to 64. Essentially the tradeoff is 1 extra
+	   save/restore vs 2 extra instructions in the loop. Realigning
+	   also costs more code size.  */
+	xorl	%ebp, %ebp
+	tzcntl	%ebx, %ebp
+
+	/* Scalar math function call to process special input.  */
+	movss	64(%rsp, %rbp, 4), %xmm0
+	call	logf@PLT
+
+	/* No good way to avoid the store-forwarding fault this will
+	   cause on return. `lfence` avoids the SF fault but at greater
+	   cost as it serialized stack/callee save restoration.  */
+	movss	%xmm0, (%rsp, %rbp, 4)
+
+	blsrl	%ebx, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+
+
+	/* All results have been written to (%rsp).  */
+	vmovaps	(%rsp), %zmm0
+	/* Restore rsp.  */
+	movq	%r13, %rsp
+	cfi_def_cfa (rsp, 32)
+	/* Restore callee save registers.  */
+	popq	%rbp
+	cfi_def_cfa (rsp, 24)
+	popq	%rbx
+	cfi_def_cfa (rsp, 16)
+	popq	%r13
+	ret
 END (_ZGVeN16v_logf_skx)
+
+strong_alias (_ZGVeN16v_logf_skx, _ZGVeN16v_logf_knl)
+
+	.section .rodata.evex512, "a"
+	.align	64
+
+	/* Data table for vector implementations of function logf. The
+	   table may contain polynomial, reduction, lookup coefficients
+	   and other coefficients obtained through different methods of
+	   research and experimental work.  */
+LOCAL_DATA_NAME_UNALIGNED:
+	float_block (LOCAL_DATA_NAME_UNALIGNED, _iLoRange, 0x00ffffff)
+
+	.type	LOCAL_DATA_NAME_UNALIGNED, @object
+	.size	LOCAL_DATA_NAME_UNALIGNED, .-LOCAL_DATA_NAME_UNALIGNED
+
+LOCAL_DATA_NAME:
+	DATA_VEC (LOCAL_DATA_NAME, _iBrkValue, 0x3f2aaaab)
+	DATA_VEC (LOCAL_DATA_NAME, _sPoly_7, 0xbe1b6a22)
+	DATA_VEC (LOCAL_DATA_NAME, _sPoly_6, 0x3e2db86b)
+	DATA_VEC (LOCAL_DATA_NAME, _sPoly_5, 0xbe289358)
+	DATA_VEC (LOCAL_DATA_NAME, _sPoly_4, 0x3e4afb81)
+	DATA_VEC (LOCAL_DATA_NAME, _sPoly_3, 0xbe80061d)
+	DATA_VEC (LOCAL_DATA_NAME, _sPoly_2, 0x3eaaaee7)
+	DATA_VEC (LOCAL_DATA_NAME, _sLn2, 0x3f317218)
+
+	.type	LOCAL_DATA_NAME, @object
+	.size	LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v1 25/27] x86/fpu: Optimize svml_s_logf4_core_sse4.S
  2022-12-07  8:52 [PATCH v1 01/27] x86/fpu: Create helper file for common data macros Noah Goldstein via Libc-alpha
                   ` (22 preceding siblings ...)
  2022-12-07  8:52 ` [PATCH v1 24/27] x86/fpu: Optimize svml_s_logf16_core_avx512.S Noah Goldstein via Libc-alpha
@ 2022-12-07  8:52 ` Noah Goldstein via Libc-alpha
  2022-12-07  8:52 ` [PATCH v1 26/27] x86/fpu: Optimize svml_s_logf8_core_avx2.S Noah Goldstein via Libc-alpha
                   ` (2 subsequent siblings)
  26 siblings, 0 replies; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-07  8:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, andrey.kolesov, carlos

1. Improve special values case which ends up covering ~half of all
   float bit patterns.
2. Cleanup some missed optimizations in instruction selection /
   unnecessary repeated rodata references.
3. Remove unused rodata.
4. Use common data definitions where possible.

As well, instead of using the shared `__svml_slogf_data`, just define
the data locally.  This is because: 1) It's not really ideal for
the sse4/avx2 to reuse the avx512 tables as it pollutes the cache
with unnecessarily large blocks.  2) Really only one of the
versions is ever expected to be used by a given process so there
isn't any constructive caching between them. And 3) there is not
enough data shared to make up for the first two reasons.

Code Size Change: -286 Bytes (235 - 521)

Input                                 New Time / Old Time
0F          (0x00000000)           -> 0.5750
0F          (0x0000ffff, Denorm)   -> 0.9206
.1F         (0x3dcccccd)           -> 0.9389
5F          (0x40a00000)           -> 0.9361
2315255808F (0x4f0a0000)           -> 0.9306
-NaN        (0xffffffff)           -> 0.5226
---
 .../fpu/multiarch/svml_s_logf4_core_sse4.S    | 327 +++++++++---------
 1 file changed, 156 insertions(+), 171 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core_sse4.S
index 20ad054eac..42d09db8df 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core_sse4.S
@@ -16,179 +16,164 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
+
+
+#define LOCAL_DATA_NAME	__svml_slog_data_internal
+#include "svml_s_common_sse4_rodata_offsets.h"
+
+#define _sPoly_7	0
+#define _sPoly_6	16
+#define _sPoly_5	32
+#define _sPoly_4	48
+#define _sPoly_3	64
+#define _sPoly_2	80
+
 #include <sysdep.h>
-#include "svml_s_logf_data.h"
 
 	.section .text.sse4, "ax", @progbits
 ENTRY (_ZGVbN4v_logf_sse4)
-/*
-   ALGORITHM DESCRIPTION:
-
-     log(x) = exponent_x*log(2) + log(mantissa_x),         if mantissa_x<4/3
-     log(x) = (exponent_x+1)*log(2) + log(0.5*mantissa_x), if mantissa_x>4/3
-
-     R = mantissa_x - 1,     if mantissa_x<4/3
-     R = 0.5*mantissa_x - 1, if mantissa_x>4/3
-     |R|< 1/3
-
-     log(1+R) is approximated as a polynomial: degree 9 for 1-ulp,
-     degree 7 for 4-ulp, degree 3 for half-precision.  */
-
-        pushq     %rbp
-        cfi_adjust_cfa_offset (8)
-        cfi_rel_offset (%rbp, 0)
-        movq      %rsp, %rbp
-        cfi_def_cfa_register (%rbp)
-        andq      $-64, %rsp
-        subq      $320, %rsp
-
-/* reduction: compute r,n */
-        movaps    %xmm0, %xmm2
-
-/* check for working range,
-   set special argument mask (denormals/zero/Inf/NaN) */
-        movq      __svml_slog_data@GOTPCREL(%rip), %rax
-        movdqu _iHiDelta(%rax), %xmm1
-        movdqu _iLoRange(%rax), %xmm4
-        paddd     %xmm0, %xmm1
-        movdqu _iBrkValue(%rax), %xmm3
-        pcmpgtd   %xmm1, %xmm4
-        movdqu _iOffExpoMask(%rax), %xmm1
-        psubd     %xmm3, %xmm2
-        pand      %xmm2, %xmm1
-
-/* exponent_x (mantissa_x<4/3) or exponent_x+1 (mantissa_x>4/3) */
-        psrad     $23, %xmm2
-        paddd     %xmm3, %xmm1
-        movups _sPoly_7(%rax), %xmm5
-
-/* mantissa_x (mantissa_x<4/3), or 0.5*mantissa_x (mantissa_x>4/3) */
-        cvtdq2ps  %xmm2, %xmm6
-
-/* reduced argument R */
-        subps _sOne(%rax), %xmm1
-        movmskps  %xmm4, %ecx
-
-/* final reconstruction:
-   add exponent_value*log2 to polynomial result */
-        mulps _sLn2(%rax), %xmm6
-
-/* polynomial evaluation starts here */
-        mulps     %xmm1, %xmm5
-        addps _sPoly_6(%rax), %xmm5
-        mulps     %xmm1, %xmm5
-        addps _sPoly_5(%rax), %xmm5
-        mulps     %xmm1, %xmm5
-        addps _sPoly_4(%rax), %xmm5
-        mulps     %xmm1, %xmm5
-        addps _sPoly_3(%rax), %xmm5
-        mulps     %xmm1, %xmm5
-        addps _sPoly_2(%rax), %xmm5
-        mulps     %xmm1, %xmm5
-        addps _sPoly_1(%rax), %xmm5
-        mulps     %xmm1, %xmm5
-
-/* polynomial evaluation end */
-        mulps     %xmm1, %xmm5
-        addps     %xmm5, %xmm1
-        addps     %xmm6, %xmm1
-        testl     %ecx, %ecx
-        jne       .LBL_1_3
-
-.LBL_1_2:
-        cfi_remember_state
-        movdqa    %xmm1, %xmm0
-        movq      %rbp, %rsp
-        cfi_def_cfa_register (%rsp)
-        popq      %rbp
-        cfi_adjust_cfa_offset (-8)
-        cfi_restore (%rbp)
-        ret
-
-.LBL_1_3:
-        cfi_restore_state
-        movups    %xmm0, 192(%rsp)
-        movups    %xmm1, 256(%rsp)
-        je        .LBL_1_2
-
-        xorb      %dl, %dl
-        xorl      %eax, %eax
-        movups    %xmm8, 112(%rsp)
-        movups    %xmm9, 96(%rsp)
-        movups    %xmm10, 80(%rsp)
-        movups    %xmm11, 64(%rsp)
-        movups    %xmm12, 48(%rsp)
-        movups    %xmm13, 32(%rsp)
-        movups    %xmm14, 16(%rsp)
-        movups    %xmm15, (%rsp)
-        movq      %rsi, 136(%rsp)
-        movq      %rdi, 128(%rsp)
-        movq      %r12, 168(%rsp)
-        cfi_offset_rel_rsp (12, 168)
-        movb      %dl, %r12b
-        movq      %r13, 160(%rsp)
-        cfi_offset_rel_rsp (13, 160)
-        movl      %ecx, %r13d
-        movq      %r14, 152(%rsp)
-        cfi_offset_rel_rsp (14, 152)
-        movl      %eax, %r14d
-        movq      %r15, 144(%rsp)
-        cfi_offset_rel_rsp (15, 144)
-        cfi_remember_state
-
-.LBL_1_6:
-        btl       %r14d, %r13d
-        jc        .LBL_1_12
-
-.LBL_1_7:
-        lea       1(%r14), %esi
-        btl       %esi, %r13d
-        jc        .LBL_1_10
-
-.LBL_1_8:
-        incb      %r12b
-        addl      $2, %r14d
-        cmpb      $16, %r12b
-        jb        .LBL_1_6
-
-        movups    112(%rsp), %xmm8
-        movups    96(%rsp), %xmm9
-        movups    80(%rsp), %xmm10
-        movups    64(%rsp), %xmm11
-        movups    48(%rsp), %xmm12
-        movups    32(%rsp), %xmm13
-        movups    16(%rsp), %xmm14
-        movups    (%rsp), %xmm15
-        movq      136(%rsp), %rsi
-        movq      128(%rsp), %rdi
-        movq      168(%rsp), %r12
-        cfi_restore (%r12)
-        movq      160(%rsp), %r13
-        cfi_restore (%r13)
-        movq      152(%rsp), %r14
-        cfi_restore (%r14)
-        movq      144(%rsp), %r15
-        cfi_restore (%r15)
-        movups    256(%rsp), %xmm1
-        jmp       .LBL_1_2
-
-.LBL_1_10:
-        cfi_restore_state
-        movzbl    %r12b, %r15d
-        movss     196(%rsp,%r15,8), %xmm0
-
-        call      JUMPTARGET(logf)
-
-        movss     %xmm0, 260(%rsp,%r15,8)
-        jmp       .LBL_1_8
-
-.LBL_1_12:
-        movzbl    %r12b, %r15d
-        movss     192(%rsp,%r15,8), %xmm0
-
-        call      JUMPTARGET(logf)
-
-        movss     %xmm0, 256(%rsp,%r15,8)
-        jmp       .LBL_1_7
-
+	movdqu	COMMON_DATA(_ILoRange)(%rip), %xmm1
+	/* ALGORITHM DESCRIPTION:
+	   if mantissa_x<4/3
+        log(x) = exponent_x*log(2) + log(mantissa_x)
+	   if mantissa_x>4/3
+        log(x) = (exponent_x+1)*log(2) + log(0.5*mantissa_x)
+
+	   R = mantissa_x - 1,     if mantissa_x<4/3
+	   R = 0.5*mantissa_x - 1, if mantissa_x>4/3
+	   |R|< 1/3
+
+	   log(1+R) is approximated as a polynomial: degree 9 for
+	   1-ulp, degree 7 for 4-ulp, degree 3 for half-precision.  */
+
+	/* check for working range, set special argument mask
+	   (denormals/zero/Inf/NaN).  */
+	movdqu	COMMON_DATA(_NotiOffExpoMask)(%rip), %xmm2
+	movaps	%xmm0, %xmm3
+	psubd	%xmm2, %xmm3
+	pcmpgtd	%xmm3, %xmm1
+	movmskps %xmm1, %eax
+
+	movdqu	COMMON_DATA(_IBrkValue)(%rip), %xmm1
+	movaps	%xmm0, %xmm3
+	psubd	%xmm1, %xmm0
+	pandn	%xmm0, %xmm2
+	paddd	%xmm1, %xmm2
+	/* reduced argument R.  */
+	subps	COMMON_DATA(_OneF)(%rip), %xmm2
+	/* exponent_x (mantissa_x<4/3),
+	   or exponent_x+1 (mantissa_x>4/3).  */
+	psrad	$0x17, %xmm0
+	/* mantissa_x (mantissa_x<4/3),
+	   or 0.5 mantissa_x (mantissa_x>4/3).  */
+	cvtdq2ps %xmm0, %xmm0
+	/* final reconstruction: add exponent_value * log2 to polynomial
+	   result.  */
+	mulps	COMMON_DATA(_Ln2)(%rip), %xmm0
+	movups	LOCAL_DATA(_sPoly_7)(%rip), %xmm1
+	/* polynomial evaluation starts here.  */
+	mulps	%xmm2, %xmm1
+	addps	LOCAL_DATA(_sPoly_6)(%rip), %xmm1
+	mulps	%xmm2, %xmm1
+	addps	LOCAL_DATA(_sPoly_5)(%rip), %xmm1
+	mulps	%xmm2, %xmm1
+	addps	LOCAL_DATA(_sPoly_4)(%rip), %xmm1
+	mulps	%xmm2, %xmm1
+	addps	LOCAL_DATA(_sPoly_3)(%rip), %xmm1
+	mulps	%xmm2, %xmm1
+	addps	LOCAL_DATA(_sPoly_2)(%rip), %xmm1
+	mulps	%xmm2, %xmm1
+	addps	COMMON_DATA(_Neg5F)(%rip), %xmm1
+	mulps	%xmm2, %xmm1
+	/* polynomial evaluation end.  */
+	mulps	%xmm2, %xmm1
+	addps	%xmm1, %xmm2
+	addps	%xmm2, %xmm0
+	testl	%eax, %eax
+	jne	L(SPECIAL_VALUES_BRANCH)
+	ret
+
+	/* Cold case. eax has 1s where there was a special value that needs
+	   to be handled by a logf call. Optimize for code size over speed.  */
+L(SPECIAL_VALUES_BRANCH):
+	/* Stack coming in 16-byte aligned. Set 8-byte misaligned so on
+	   call entry will be 16-byte aligned.  */
+	subq	$0x38, %rsp
+	movups	%xmm0, 24(%rsp)
+	movups	%xmm3, 40(%rsp)
+
+	/* Use rbx/rbp for callee save registers as they get short
+	   encoding for many instructions (as compared with r12/r13).  */
+	movq	%rbx, (%rsp)
+	cfi_offset (rbx, -64)
+	movq	%rbp, 8(%rsp)
+	cfi_offset (rbp, -56)
+	/* eax has 1s where there was a special value that needs to be
+	   handled by a logf call.  */
+	movl	%eax, %ebx
+L(SPECIAL_VALUES_LOOP):
+
+	/* use rbp as index for special value that is saved across calls
+	   to logf. We technically don't need a callee save register
+	   here as offset to rsp is always [0, 12] so we can restore
+	   rsp by realigning to 64. Essentially the tradeoff is 1 extra
+	   save/restore vs 2 extra instructions in the loop.  */
+	xorl	%ebp, %ebp
+	bsfl	%ebx, %ebp
+
+	/* Scalar math function call to process special input.  */
+	movss	40(%rsp, %rbp, 4), %xmm0
+	call	logf@PLT
+
+	/* No good way to avoid the store-forwarding fault this will
+	   cause on return. `lfence` avoids the SF fault but at greater
+	   cost as it serializes stack/callee save restoration.  */
+	movss	%xmm0, 24(%rsp, %rbp, 4)
+
+	leal	-1(%rbx), %eax
+	andl	%eax, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+
+	/* All results have been written to 24(%rsp).  */
+	movups	24(%rsp), %xmm0
+	movq	(%rsp), %rbx
+	cfi_restore (rbx)
+	movq	8(%rsp), %rbp
+	cfi_restore (rbp)
+	addq	$56, %rsp
+	cfi_def_cfa_offset (8)
+	ret
 END (_ZGVbN4v_logf_sse4)
+
+	.section .rodata.sse4, "a"
+	.align	16
+
+	/* Data table for vector implementations of function logf. The
+	   table may contain polynomial, reduction, lookup coefficients
+	   and other coefficients obtained through different methods of
+	   research and experimental work.  */
+
+	.globl	LOCAL_DATA_NAME
+LOCAL_DATA_NAME:
+	/* Polynomial sPoly[] coefficients:.  */
+	/* -1.5177205204963684082031250e-01.  */
+	DATA_VEC (LOCAL_DATA_NAME, _sPoly_7, 0xbe1b6a22)
+
+	/* 1.6964881122112274169921875e-01.  */
+	DATA_VEC (LOCAL_DATA_NAME, _sPoly_6, 0x3e2db86b)
+
+	/* -1.6462457180023193359375000e-01.  */
+	DATA_VEC (LOCAL_DATA_NAME, _sPoly_5, 0xbe289358)
+
+	/* 1.9822503626346588134765625e-01.  */
+	DATA_VEC (LOCAL_DATA_NAME, _sPoly_4, 0x3e4afb81)
+
+	/* -2.5004664063453674316406250e-01.  */
+	DATA_VEC (LOCAL_DATA_NAME, _sPoly_3, 0xbe80061d)
+
+	/* 3.3336564898490905761718750e-01.  */
+	DATA_VEC (LOCAL_DATA_NAME, _sPoly_2, 0x3eaaaee7)
+
+
+	.type	LOCAL_DATA_NAME, @object
+	.size	LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v1 26/27] x86/fpu: Optimize svml_s_logf8_core_avx2.S
  2022-12-07  8:52 [PATCH v1 01/27] x86/fpu: Create helper file for common data macros Noah Goldstein via Libc-alpha
                   ` (23 preceding siblings ...)
  2022-12-07  8:52 ` [PATCH v1 25/27] x86/fpu: Optimize svml_s_logf4_core_sse4.S Noah Goldstein via Libc-alpha
@ 2022-12-07  8:52 ` Noah Goldstein via Libc-alpha
  2022-12-07  8:52 ` [PATCH v1 27/27] x86/fpu: Remove unused svml_s_logf_data.S file Noah Goldstein via Libc-alpha
  2022-12-07 23:53 ` [PATCH v1 01/27] x86/fpu: Create helper file for common data macros H.J. Lu via Libc-alpha
  26 siblings, 0 replies; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-07  8:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, andrey.kolesov, carlos

1. Improve special values case which ends up covering ~half of all
   float bit patterns.
2. Cleanup some missed optimizations in instruction selection /
   unnecessary repeated rodata references.
3. Remove unused rodata.
4. Use common data definitions where possible.

As well, instead of using the shared `__svml_slogf_data`, just define
the data locally.  This is because: 1) It's not really ideal for
the sse4/avx2 to reuse the avx512 tables as it pollutes the cache
with unnecessarily large blocks.  2) Really only one of the
versions is ever expected to be used by a given process so there
isn't any constructive caching between them. And 3) there is not
enough data shared to make up for the first two reasons.

Code Size Change: -314 Bytes (230 - 544)

Input                                 New Time / Old Time
0F          (0x00000000)           -> 0.6246
0F          (0x0000ffff, Denorm)   -> 0.9525
.1F         (0x3dcccccd)           -> 0.9021
5F          (0x40a00000)           -> 0.9113
2315255808F (0x4f0a0000)           -> 0.8293
-NaN        (0xffffffff)           -> 0.5745
---
 .../fpu/multiarch/svml_s_logf8_core_avx2.S    | 342 +++++++++---------
 1 file changed, 181 insertions(+), 161 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core_avx2.S
index 616312c695..35f63b7879 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core_avx2.S
@@ -16,169 +16,189 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
+#define LOCAL_DATA_NAME	__svml_slog_data_internal
+#include "svml_s_common_avx2_rodata_offsets.h"
+
+/* Offsets for data table __svml_slog_data_internal.  */
+#define _sPoly_7	0
+#define _sPoly_6	32
+#define _sPoly_5	64
+#define _sPoly_4	96
+#define _sPoly_3	128
+#define _sPoly_2	160
+
 #include <sysdep.h>
-#include "svml_s_logf_data.h"
 
 	.section .text.avx2, "ax", @progbits
 ENTRY(_ZGVdN8v_logf_avx2)
-/*
-   ALGORITHM DESCRIPTION:
-
-    log(x) = exponent_x*log(2) + log(mantissa_x),         if mantissa_x<4/3
-    log(x) = (exponent_x+1)*log(2) + log(0.5*mantissa_x), if mantissa_x>4/3
-
-    R = mantissa_x - 1,     if mantissa_x<4/3
-    R = 0.5*mantissa_x - 1, if mantissa_x>4/3
-    |R|< 1/3
-
-    log(1+R) is approximated as a polynomial: degree 9 for 1-ulp,
-    degree 7 for 4-ulp, degree 3 for half-precision.  */
-
-        pushq     %rbp
-        cfi_adjust_cfa_offset (8)
-        cfi_rel_offset (%rbp, 0)
-        movq      %rsp, %rbp
-        cfi_def_cfa_register (%rbp)
-        andq      $-64, %rsp
-        subq      $448, %rsp
-        movq      __svml_slog_data@GOTPCREL(%rip), %rax
-        vmovaps   %ymm0, %ymm2
-        vmovups _iBrkValue(%rax), %ymm6
-        vmovups _iLoRange(%rax), %ymm1
-/* check for working range,
-   set special argument mask (denormals/zero/Inf/NaN) */
-        vpaddd _iHiDelta(%rax), %ymm2, %ymm7
-
-/* reduction: compute r,n */
-        vpsubd    %ymm6, %ymm2, %ymm4
-
-/* exponent_x (mantissa_x<4/3) or exponent_x+1 (mantissa_x>4/3) */
-        vpsrad    $23, %ymm4, %ymm3
-        vpand _iOffExpoMask(%rax), %ymm4, %ymm5
-        vmovups _sPoly_7(%rax), %ymm4
-        vcvtdq2ps %ymm3, %ymm0
-
-/* mantissa_x (mantissa_x<4/3), or 0.5*mantissa_x (mantissa_x>4/3) */
-        vpaddd    %ymm6, %ymm5, %ymm3
-
-/* reduced argument R */
-        vsubps _sOne(%rax), %ymm3, %ymm5
-
-/* polynomial evaluation starts here */
-        vfmadd213ps _sPoly_6(%rax), %ymm5, %ymm4
-        vfmadd213ps _sPoly_5(%rax), %ymm5, %ymm4
-        vfmadd213ps _sPoly_4(%rax), %ymm5, %ymm4
-        vfmadd213ps _sPoly_3(%rax), %ymm5, %ymm4
-        vfmadd213ps _sPoly_2(%rax), %ymm5, %ymm4
-        vfmadd213ps _sPoly_1(%rax), %ymm5, %ymm4
-        vmulps    %ymm5, %ymm4, %ymm6
-
-/* polynomial evaluation end */
-        vfmadd213ps %ymm5, %ymm5, %ymm6
-        vpcmpgtd  %ymm7, %ymm1, %ymm1
-        vmovmskps %ymm1, %ecx
-
-/* final reconstruction:
-   add exponent_value*log2 to polynomial result */
-        vfmadd132ps _sLn2(%rax), %ymm6, %ymm0
-        testl     %ecx, %ecx
-        jne       .LBL_1_3
-
-.LBL_1_2:
-        cfi_remember_state
-        movq      %rbp, %rsp
-        cfi_def_cfa_register (%rsp)
-        popq      %rbp
-        cfi_adjust_cfa_offset (-8)
-        cfi_restore (%rbp)
-        ret
-
-.LBL_1_3:
-        cfi_restore_state
-        vmovups   %ymm2, 320(%rsp)
-        vmovups   %ymm0, 384(%rsp)
-        je        .LBL_1_2
-
-        xorb      %dl, %dl
-        xorl      %eax, %eax
-        vmovups   %ymm8, 224(%rsp)
-        vmovups   %ymm9, 192(%rsp)
-        vmovups   %ymm10, 160(%rsp)
-        vmovups   %ymm11, 128(%rsp)
-        vmovups   %ymm12, 96(%rsp)
-        vmovups   %ymm13, 64(%rsp)
-        vmovups   %ymm14, 32(%rsp)
-        vmovups   %ymm15, (%rsp)
-        movq      %rsi, 264(%rsp)
-        movq      %rdi, 256(%rsp)
-        movq      %r12, 296(%rsp)
-        cfi_offset_rel_rsp (12, 296)
-        movb      %dl, %r12b
-        movq      %r13, 288(%rsp)
-        cfi_offset_rel_rsp (13, 288)
-        movl      %ecx, %r13d
-        movq      %r14, 280(%rsp)
-        cfi_offset_rel_rsp (14, 280)
-        movl      %eax, %r14d
-        movq      %r15, 272(%rsp)
-        cfi_offset_rel_rsp (15, 272)
-        cfi_remember_state
-
-.LBL_1_6:
-        btl       %r14d, %r13d
-        jc        .LBL_1_12
-
-.LBL_1_7:
-        lea       1(%r14), %esi
-        btl       %esi, %r13d
-        jc        .LBL_1_10
-
-.LBL_1_8:
-        incb      %r12b
-        addl      $2, %r14d
-        cmpb      $16, %r12b
-        jb        .LBL_1_6
-
-        vmovups   224(%rsp), %ymm8
-        vmovups   192(%rsp), %ymm9
-        vmovups   160(%rsp), %ymm10
-        vmovups   128(%rsp), %ymm11
-        vmovups   96(%rsp), %ymm12
-        vmovups   64(%rsp), %ymm13
-        vmovups   32(%rsp), %ymm14
-        vmovups   (%rsp), %ymm15
-        vmovups   384(%rsp), %ymm0
-        movq      264(%rsp), %rsi
-        movq      256(%rsp), %rdi
-        movq      296(%rsp), %r12
-        cfi_restore (%r12)
-        movq      288(%rsp), %r13
-        cfi_restore (%r13)
-        movq      280(%rsp), %r14
-        cfi_restore (%r14)
-        movq      272(%rsp), %r15
-        cfi_restore (%r15)
-        jmp       .LBL_1_2
-
-.LBL_1_10:
-        cfi_restore_state
-        movzbl    %r12b, %r15d
-        vmovss    324(%rsp,%r15,8), %xmm0
-        vzeroupper
-
-        call      JUMPTARGET(logf)
-
-        vmovss    %xmm0, 388(%rsp,%r15,8)
-        jmp       .LBL_1_8
-
-.LBL_1_12:
-        movzbl    %r12b, %r15d
-        vmovss    320(%rsp,%r15,8), %xmm0
-        vzeroupper
-
-        call      JUMPTARGET(logf)
-
-        vmovss    %xmm0, 384(%rsp,%r15,8)
-        jmp       .LBL_1_7
-
+	/* ALGORITHM DESCRIPTION:
+	   if mantissa_x<4/3
+        log(x) = exponent_x*log(2) + log(mantissa_x)
+	   if mantissa_x>4/3
+        log(x) = (exponent_x+1)*log(2) + log(0.5*mantissa_x)
+
+	   R = mantissa_x - 1,     if mantissa_x<4/3
+	   R = 0.5*mantissa_x - 1, if mantissa_x>4/3
+	   |R|< 1/3
+
+	   log(1+R) is approximated as a polynomial: degree 9 for
+	   1-ulp, degree 7 for 4-ulp, degree 3 for half-precision.  */
+	vmovups	COMMON_DATA(_IBrkValue)(%rip), %ymm6
+
+
+	vmovups	COMMON_DATA(_NotiOffExpoMask)(%rip), %ymm7
+
+	/* reduction: compute r,n.  */
+	vpsubd	%ymm6, %ymm0, %ymm4
+
+	/* exponent_x (mantissa_x<4/3) or
+	   exponent_x+1 (mantissa_x>4/3).  */
+	vpsrad	$23, %ymm4, %ymm3
+	vpandn	%ymm4, %ymm7, %ymm5
+	vmovups	LOCAL_DATA(_sPoly_7)(%rip), %ymm4
+	vcvtdq2ps %ymm3, %ymm2
+
+	/* mantissa_x (mantissa_x<4/3),
+	   or 0.5*mantissa_x (mantissa_x>4/3).  */
+	vpaddd	%ymm6, %ymm5, %ymm3
+
+	/* reduced argument R.  */
+	vsubps	COMMON_DATA(_OneF)(%rip), %ymm3, %ymm5
+
+	/* polynomial evaluation starts here.  */
+	vfmadd213ps LOCAL_DATA(_sPoly_6)(%rip), %ymm5, %ymm4
+	vfmadd213ps LOCAL_DATA(_sPoly_5)(%rip), %ymm5, %ymm4
+	vfmadd213ps LOCAL_DATA(_sPoly_4)(%rip), %ymm5, %ymm4
+	vfmadd213ps LOCAL_DATA(_sPoly_3)(%rip), %ymm5, %ymm4
+	vfmadd213ps LOCAL_DATA(_sPoly_2)(%rip), %ymm5, %ymm4
+	vfmadd213ps COMMON_DATA(_Neg5F)(%rip), %ymm5, %ymm4
+	vmulps	%ymm5, %ymm4, %ymm6
+
+	/* polynomial evaluation end.  */
+	vfmadd213ps %ymm5, %ymm5, %ymm6
+
+	vmovups	COMMON_DATA(_ILoRange)(%rip), %ymm1
+	/* check for working range, set special argument mask
+	   (denormals/zero/Inf/NaN).  */
+	vpsubd	%ymm7, %ymm0, %ymm7
+
+
+	vpcmpgtd %ymm7, %ymm1, %ymm1
+	vmovmskps %ymm1, %ecx
+
+	/* final reconstruction: add exponent_value*log2 to polynomial
+	   result.  */
+	vfmadd132ps COMMON_DATA(_Ln2)(%rip), %ymm6, %ymm2
+	testl	%ecx, %ecx
+	/* Branch to process special inputs.  */
+	jne	L(SPECIAL_VALUES_BRANCH)
+	vmovaps	%ymm2, %ymm0
+	ret
+
+
+	/* Cold case. ecx has 1s where there was a special value that
+	   needs to be handled by a logf call. Optimize for code size
+	   more so than speed here.  */
+L(SPECIAL_VALUES_BRANCH):
+	/* Use r13 to save/restore the stack. This allows us to use rbp
+	   as callee save register saving code size.  */
+	pushq	%r13
+	cfi_adjust_cfa_offset (8)
+	cfi_offset (r13, -16)
+	/* Need callee save registers to preserve state across logf
+	   calls.  */
+	pushq	%rbx
+	cfi_adjust_cfa_offset (8)
+	cfi_offset (rbx, -24)
+	pushq	%rbp
+	cfi_adjust_cfa_offset (8)
+	cfi_offset (rbp, -32)
+	movq	%rsp, %r13
+	cfi_def_cfa_register (r13)
+
+	/* Align stack and make room for 2x ymm vectors.  */
+	andq	$-32, %rsp
+	addq	$-64, %rsp
+
+	/* Save all already computed inputs.  */
+	vmovups	%ymm2, (%rsp)
+	/* Save original input (ymm2 unchanged up to this point).  */
+	vmovups	%ymm0, 32(%rsp)
+
+	vzeroupper
+
+	/* ecx has 1s where there was a special value that needs to be
+	   handled by a logf call.  */
+	movl	%ecx, %ebx
+L(SPECIAL_VALUES_LOOP):
+
+	/* use rbp as index for special value that is saved across calls
+	   to logf. We technically don't need a callee save register
+	   here as offset to rsp is always [0, 28] so we can restore
+	   rsp by realigning to 64. Essentially the tradeoff is 1 extra
+	   save/restore vs 2 extra instructions in the loop. Realigning
+	   also costs more code size.  */
+	xorl	%ebp, %ebp
+	tzcntl	%ebx, %ebp
+
+	/* Scalar math function call to process special input.  */
+	vmovss	32(%rsp, %rbp, 4), %xmm0
+	call	logf@PLT
+
+	/* No good way to avoid the store-forwarding fault this will
+	   cause on return. `lfence` avoids the SF fault but at greater
+	   cost as it serializes stack/callee save restoration.  */
+	vmovss	%xmm0, (%rsp, %rbp, 4)
+
+	blsrl	%ebx, %ebx
+	jnz	L(SPECIAL_VALUES_LOOP)
+
+
+
+	/* All results have been written to (%rsp).  */
+	vmovups	(%rsp), %ymm0
+	/* Restore rsp.  */
+	movq	%r13, %rsp
+	cfi_def_cfa_register (rsp)
+	/* Restore callee save registers.  */
+	popq	%rbp
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (rbp)
+	popq	%rbx
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (rbx)
+	popq	%r13
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (r13)
+	ret
 END(_ZGVdN8v_logf_avx2)
+
+	.section .rodata.avx2, "a"
+	.align	32
+
+	/* Data table for vector implementations of function logf. The
+	   table may contain polynomial, reduction, lookup coefficients
+	   and other coefficients obtained through different methods of
+	   research and experimental work.  */
+LOCAL_DATA_NAME:
+	/* Polynomial sPoly[] coefficients:.  */
+	/* -1.5177205204963684082031250e-01.  */
+	DATA_VEC (LOCAL_DATA_NAME, _sPoly_7, 0xbe1b6a22)
+
+	/* 1.6964881122112274169921875e-01.  */
+	DATA_VEC (LOCAL_DATA_NAME, _sPoly_6, 0x3e2db86b)
+
+	/* -1.6462457180023193359375000e-01.  */
+	DATA_VEC (LOCAL_DATA_NAME, _sPoly_5, 0xbe289358)
+
+	/* 1.9822503626346588134765625e-01.  */
+	DATA_VEC (LOCAL_DATA_NAME, _sPoly_4, 0x3e4afb81)
+
+	/* -2.5004664063453674316406250e-01.  */
+	DATA_VEC (LOCAL_DATA_NAME, _sPoly_3, 0xbe80061d)
+
+	/* 3.3336564898490905761718750e-01.  */
+	DATA_VEC (LOCAL_DATA_NAME, _sPoly_2, 0x3eaaaee7)
+
+	.type	LOCAL_DATA_NAME, @object
+	.size	LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH v1 27/27] x86/fpu: Remove unused svml_s_logf_data.S file
  2022-12-07  8:52 [PATCH v1 01/27] x86/fpu: Create helper file for common data macros Noah Goldstein via Libc-alpha
                   ` (24 preceding siblings ...)
  2022-12-07  8:52 ` [PATCH v1 26/27] x86/fpu: Optimize svml_s_logf8_core_avx2.S Noah Goldstein via Libc-alpha
@ 2022-12-07  8:52 ` Noah Goldstein via Libc-alpha
  2022-12-07 23:53 ` [PATCH v1 01/27] x86/fpu: Create helper file for common data macros H.J. Lu via Libc-alpha
  26 siblings, 0 replies; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-07  8:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, andrey.kolesov, carlos

The common data definitions inside the file are no longer used so
delete it.
---
 sysdeps/x86_64/fpu/Makefile           |   1 -
 sysdeps/x86_64/fpu/svml_s_logf_data.S | 102 --------------------------
 sysdeps/x86_64/fpu/svml_s_logf_data.h |  48 ------------
 3 files changed, 151 deletions(-)
 delete mode 100644 sysdeps/x86_64/fpu/svml_s_logf_data.S
 delete mode 100644 sysdeps/x86_64/fpu/svml_s_logf_data.h

diff --git a/sysdeps/x86_64/fpu/Makefile b/sysdeps/x86_64/fpu/Makefile
index e7e747e920..72375170a5 100644
--- a/sysdeps/x86_64/fpu/Makefile
+++ b/sysdeps/x86_64/fpu/Makefile
@@ -18,7 +18,6 @@ libmvec-support += \
   svml_s_common_evex512_rodata \
   svml_s_common_sse4_rodata \
   svml_s_expf_data \
-  svml_s_logf_data \
   svml_s_powf_data \
   svml_s_trig_data \
   $(foreach l,$(libmvec-double-func-list), \
diff --git a/sysdeps/x86_64/fpu/svml_s_logf_data.S b/sysdeps/x86_64/fpu/svml_s_logf_data.S
deleted file mode 100644
index 73c05e1734..0000000000
--- a/sysdeps/x86_64/fpu/svml_s_logf_data.S
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Data for vector function logf.
-   Copyright (C) 2014-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include "svml_s_logf_data.h"
-
-	.section .rodata, "a"
-	.align 64
-
-/* Data table for vector implementations of function logf.
-   The table may contain polynomial, reduction, lookup coefficients and
-   other coefficients obtained through different methods of research and
-   experimental work.  */
-
-	.globl __svml_slog_data
-__svml_slog_data:
-
-/* Polynomial sPoly[] coefficients:
- * -5.0000000000000000000000000e-01 */
-float_vector _sPoly_1 0xbf000000
-
-/* 3.3336564898490905761718750e-01 */
-float_vector _sPoly_2 0x3eaaaee7
-
-/* -2.5004664063453674316406250e-01 */
-float_vector _sPoly_3 0xbe80061d
-
-/* 1.9822503626346588134765625e-01 */
-float_vector _sPoly_4 0x3e4afb81
-
-/* -1.6462457180023193359375000e-01 */
-float_vector _sPoly_5 0xbe289358
-
-/* 1.6964881122112274169921875e-01 */
-float_vector _sPoly_6 0x3e2db86b
-
-/* -1.5177205204963684082031250e-01 */
-float_vector _sPoly_7 0xbe1b6a22
-
-/* Constant for work range check: Delta 80000000-7f800000 */
-float_vector _iHiDelta 0x00800000
-
-/* Constant for work range check: 00800000 + Delta */
-float_vector _iLoRange 0x01000000
-
-/* Mantissa break point  SP 2/3 */
-float_vector _iBrkValue 0x3f2aaaab
-
-/* SP significand mask */
-float_vector _iOffExpoMask 0x007fffff
-
-/* 1.0f */
-float_vector _sOne 0x3f800000
-
-/* SP log(2) */
-float_vector _sLn2 0x3f317218
-
-/* SP infinity, +/- */
-.if .-__svml_slog_data != _sInfs
-.err
-.endif
-	.long	0x7f800000
-	.long	0xff800000
-	.rept	56
-	.byte	0
-	.endr
-
-/* SP one, +/- */
-.if .-__svml_slog_data != _sOnes
-.err
-.endif
-	.long	0x3f800000
-	.long	0xbf800000
-	.rept	56
-	.byte	0
-	.endr
-
-/* SP zero +/- */
-.if .-__svml_slog_data != _sZeros
-.err
-.endif
-	.long	0x00000000
-	.long	0x80000000
-	.rept	56
-	.byte	0
-	.endr
-	.type	__svml_slog_data,@object
-	.size __svml_slog_data,.-__svml_slog_data
diff --git a/sysdeps/x86_64/fpu/svml_s_logf_data.h b/sysdeps/x86_64/fpu/svml_s_logf_data.h
deleted file mode 100644
index 72e66081c5..0000000000
--- a/sysdeps/x86_64/fpu/svml_s_logf_data.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Offsets for data table for vectorized function logf.
-   Copyright (C) 2014-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifndef S_LOGF_DATA_H
-#define S_LOGF_DATA_H
-
-#define _sPoly_1                      	0
-#define _sPoly_2                      	64
-#define _sPoly_3                      	128
-#define _sPoly_4                      	192
-#define _sPoly_5                      	256
-#define _sPoly_6                      	320
-#define _sPoly_7                      	384
-#define _iHiDelta                     	448
-#define _iLoRange                     	512
-#define _iBrkValue                    	576
-#define _iOffExpoMask                 	640
-#define _sOne                         	704
-#define _sLn2                         	768
-#define _sInfs                        	832
-#define _sOnes                        	896
-#define _sZeros                       	960
-
-.macro float_vector offset value
-.if .-__svml_slog_data != \offset
-.err
-.endif
-.rept 16
-.long \value
-.endr
-.endm
-
-#endif
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH v1 01/27] x86/fpu: Create helper file for common data macros
  2022-12-07  8:52 [PATCH v1 01/27] x86/fpu: Create helper file for common data macros Noah Goldstein via Libc-alpha
                   ` (25 preceding siblings ...)
  2022-12-07  8:52 ` [PATCH v1 27/27] x86/fpu: Remove unused svml_s_logf_data.S file Noah Goldstein via Libc-alpha
@ 2022-12-07 23:53 ` H.J. Lu via Libc-alpha
  2022-12-08  0:13   ` Noah Goldstein via Libc-alpha
  26 siblings, 1 reply; 38+ messages in thread
From: H.J. Lu via Libc-alpha @ 2022-12-07 23:53 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, andrey.kolesov, carlos

On Wed, Dec 7, 2022 at 12:52 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The macros are useful for creating .rodata definitions and checking
> that the offset is correct.
> ---
>  .../x86_64/fpu/svml_common_data_macros.h.S    | 50 +++++++++++++++++++
>  1 file changed, 50 insertions(+)
>  create mode 100644 sysdeps/x86_64/fpu/svml_common_data_macros.h.S
>
> diff --git a/sysdeps/x86_64/fpu/svml_common_data_macros.h.S b/sysdeps/x86_64/fpu/svml_common_data_macros.h.S
> new file mode 100644
> index 0000000000..31bd66835d
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_common_data_macros.h.S
> @@ -0,0 +1,50 @@
> +/* Helper macros for creating rodata
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   https://www.gnu.org/licenses/.  */
> +
> +#ifndef _SVML_COMMON_DATA_MACROS_H_S
> +#define _SVML_COMMON_DATA_MACROS_H_S   1
> +
> +
> +.macro check_offset data_section offset
> +       .if     .-\data_section != \offset
> +       .err
> +       .endif
> +.endm
> +
> +
> +/* Only used in floating point functions at the moment.  */
> +.macro float_vectorN data_section N offset value
> +       check_offset \data_section \offset
> +       .rept   \N
> +       .long   \value
> +       .endr
> +.endm
> +
> +#define float_block(data_section, offset, ...) \
> +       check_offset data_section offset;       \
> +       .long   __VA_ARGS__
> +
> +
> +#define float_vector16(data_section, offset, value)    \
> +       float_vectorN data_section 4 offset value
> +#define float_vector32(data_section, offset, value)    \
> +       float_vectorN data_section 8 offset value
> +#define float_vector64(data_section, offset, value)    \
> +       float_vectorN data_section 16 offset value
> +
> +#endif
> --
> 2.34.1
>

Please use .h files in fpu directory.

-- 
H.J.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v1 01/27] x86/fpu: Create helper file for common data macros
  2022-12-07 23:53 ` [PATCH v1 01/27] x86/fpu: Create helper file for common data macros H.J. Lu via Libc-alpha
@ 2022-12-08  0:13   ` Noah Goldstein via Libc-alpha
  2022-12-08  0:22     ` H.J. Lu via Libc-alpha
  0 siblings, 1 reply; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-08  0:13 UTC (permalink / raw)
  To: H.J. Lu; +Cc: libc-alpha, andrey.kolesov, carlos

On Wed, Dec 7, 2022 at 3:53 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, Dec 7, 2022 at 12:52 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > The macros are useful for creating .rodata definitions and checking
> > that the offset is correct.
> > ---
> >  .../x86_64/fpu/svml_common_data_macros.h.S    | 50 +++++++++++++++++++
> >  1 file changed, 50 insertions(+)
> >  create mode 100644 sysdeps/x86_64/fpu/svml_common_data_macros.h.S
> >
> > diff --git a/sysdeps/x86_64/fpu/svml_common_data_macros.h.S b/sysdeps/x86_64/fpu/svml_common_data_macros.h.S
> > new file mode 100644
> > index 0000000000..31bd66835d
> > --- /dev/null
> > +++ b/sysdeps/x86_64/fpu/svml_common_data_macros.h.S
> > @@ -0,0 +1,50 @@
> > +/* Helper macros for creating rodata
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   https://www.gnu.org/licenses/.  */
> > +
> > +#ifndef _SVML_COMMON_DATA_MACROS_H_S
> > +#define _SVML_COMMON_DATA_MACROS_H_S   1
> > +
> > +
> > +.macro check_offset data_section offset
> > +       .if     .-\data_section != \offset
> > +       .err
> > +       .endif
> > +.endm
> > +
> > +
> > +/* Only used in floating point functions at the moment.  */
> > +.macro float_vectorN data_section N offset value
> > +       check_offset \data_section \offset
> > +       .rept   \N
> > +       .long   \value
> > +       .endr
> > +.endm
> > +
> > +#define float_block(data_section, offset, ...) \
> > +       check_offset data_section offset;       \
> > +       .long   __VA_ARGS__
> > +
> > +
> > +#define float_vector16(data_section, offset, value)    \
> > +       float_vectorN data_section 4 offset value
> > +#define float_vector32(data_section, offset, value)    \
> > +       float_vectorN data_section 8 offset value
> > +#define float_vector64(data_section, offset, value)    \
> > +       float_vectorN data_section 16 offset value
> > +
> > +#endif
> > --
> > 2.34.1
> >
>
> Please use .h files in fpu directory.

Are .S files globbed somewhere or something?

It uses assembler macros so it's convenient for the
extension to match.
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v1 01/27] x86/fpu: Create helper file for common data macros
  2022-12-08  0:13   ` Noah Goldstein via Libc-alpha
@ 2022-12-08  0:22     ` H.J. Lu via Libc-alpha
  2022-12-08  0:46       ` Noah Goldstein via Libc-alpha
  0 siblings, 1 reply; 38+ messages in thread
From: H.J. Lu via Libc-alpha @ 2022-12-08  0:22 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, andrey.kolesov, carlos

On Wed, Dec 7, 2022 at 4:13 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Wed, Dec 7, 2022 at 3:53 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Wed, Dec 7, 2022 at 12:52 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > The macros are useful for creating .rodata definitions and checking
> > > that the offset is correct.
> > > ---
> > >  .../x86_64/fpu/svml_common_data_macros.h.S    | 50 +++++++++++++++++++
> > >  1 file changed, 50 insertions(+)
> > >  create mode 100644 sysdeps/x86_64/fpu/svml_common_data_macros.h.S
> > >
> > > diff --git a/sysdeps/x86_64/fpu/svml_common_data_macros.h.S b/sysdeps/x86_64/fpu/svml_common_data_macros.h.S
> > > new file mode 100644
> > > index 0000000000..31bd66835d
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/fpu/svml_common_data_macros.h.S
> > > @@ -0,0 +1,50 @@
> > > +/* Helper macros for creating rodata
> > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   https://www.gnu.org/licenses/.  */
> > > +
> > > +#ifndef _SVML_COMMON_DATA_MACROS_H_S
> > > +#define _SVML_COMMON_DATA_MACROS_H_S   1
> > > +
> > > +
> > > +.macro check_offset data_section offset
> > > +       .if     .-\data_section != \offset
> > > +       .err
> > > +       .endif
> > > +.endm
> > > +
> > > +
> > > +/* Only used in floating point functions at the moment.  */
> > > +.macro float_vectorN data_section N offset value
> > > +       check_offset \data_section \offset
> > > +       .rept   \N
> > > +       .long   \value
> > > +       .endr
> > > +.endm
> > > +
> > > +#define float_block(data_section, offset, ...) \
> > > +       check_offset data_section offset;       \
> > > +       .long   __VA_ARGS__
> > > +
> > > +
> > > +#define float_vector16(data_section, offset, value)    \
> > > +       float_vectorN data_section 4 offset value
> > > +#define float_vector32(data_section, offset, value)    \
> > > +       float_vectorN data_section 8 offset value
> > > +#define float_vector64(data_section, offset, value)    \
> > > +       float_vectorN data_section 16 offset value
> > > +
> > > +#endif
> > > --
> > > 2.34.1
> > >
> >
> > Please use .h files in fpu directory.
>
> Are .S files globbed somewhere or something?
>
> It uses assembler macros so its convenient for the
> extension to match.

Use new macros or rename files before commit.

-- 
H.J.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v1 01/27] x86/fpu: Create helper file for common data macros
  2022-12-08  0:22     ` H.J. Lu via Libc-alpha
@ 2022-12-08  0:46       ` Noah Goldstein via Libc-alpha
  0 siblings, 0 replies; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-08  0:46 UTC (permalink / raw)
  To: H.J. Lu; +Cc: libc-alpha, andrey.kolesov, carlos

On Wed, Dec 7, 2022 at 4:22 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, Dec 7, 2022 at 4:13 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Wed, Dec 7, 2022 at 3:53 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Wed, Dec 7, 2022 at 12:52 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > The macros are useful for creating .rodata definitions and checking
> > > > that the offset is correct.
> > > > ---
> > > >  .../x86_64/fpu/svml_common_data_macros.h.S    | 50 +++++++++++++++++++
> > > >  1 file changed, 50 insertions(+)
> > > >  create mode 100644 sysdeps/x86_64/fpu/svml_common_data_macros.h.S
> > > >
> > > > diff --git a/sysdeps/x86_64/fpu/svml_common_data_macros.h.S b/sysdeps/x86_64/fpu/svml_common_data_macros.h.S
> > > > new file mode 100644
> > > > index 0000000000..31bd66835d
> > > > --- /dev/null
> > > > +++ b/sysdeps/x86_64/fpu/svml_common_data_macros.h.S
> > > > @@ -0,0 +1,50 @@
> > > > +/* Helper macros for creating rodata
> > > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > +   This file is part of the GNU C Library.
> > > > +
> > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > +   modify it under the terms of the GNU Lesser General Public
> > > > +   License as published by the Free Software Foundation; either
> > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > +
> > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > +   Lesser General Public License for more details.
> > > > +
> > > > +   You should have received a copy of the GNU Lesser General Public
> > > > +   License along with the GNU C Library; if not, see
> > > > +   https://www.gnu.org/licenses/.  */
> > > > +
> > > > +#ifndef _SVML_COMMON_DATA_MACROS_H_S
> > > > +#define _SVML_COMMON_DATA_MACROS_H_S   1
> > > > +
> > > > +
> > > > +.macro check_offset data_section offset
> > > > +       .if     .-\data_section != \offset
> > > > +       .err
> > > > +       .endif
> > > > +.endm
> > > > +
> > > > +
> > > > +/* Only used in floating point functions at the moment.  */
> > > > +.macro float_vectorN data_section N offset value
> > > > +       check_offset \data_section \offset
> > > > +       .rept   \N
> > > > +       .long   \value
> > > > +       .endr
> > > > +.endm
> > > > +
> > > > +#define float_block(data_section, offset, ...) \
> > > > +       check_offset data_section offset;       \
> > > > +       .long   __VA_ARGS__
> > > > +
> > > > +
> > > > +#define float_vector16(data_section, offset, value)    \
> > > > +       float_vectorN data_section 4 offset value
> > > > +#define float_vector32(data_section, offset, value)    \
> > > > +       float_vectorN data_section 8 offset value
> > > > +#define float_vector64(data_section, offset, value)    \
> > > > +       float_vectorN data_section 16 offset value
> > > > +
> > > > +#endif
> > > > --
> > > > 2.34.1
> > > >
> > >
> > > Please use .h files in fpu directory.
> >
> > Are .S files globbed somewhere or something?
> >
> > It uses assembler macros so its convenient for the
> > extension to match.
>
> Use new macros or rename files before commit.

Okay.
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v1 07/27] x86/fpu: Update rodata usage in svml_s_tanhf16_core_avx512.S
  2022-12-07  8:52 ` [PATCH v1 07/27] x86/fpu: Update rodata usage in svml_s_tanhf16_core_avx512.S Noah Goldstein via Libc-alpha
@ 2022-12-16 17:05   ` H.J. Lu via Libc-alpha
  2022-12-16 18:17     ` Noah Goldstein via Libc-alpha
  0 siblings, 1 reply; 38+ messages in thread
From: H.J. Lu via Libc-alpha @ 2022-12-16 17:05 UTC (permalink / raw)
  To: Noah Goldstein, Sunil K Pandey; +Cc: libc-alpha, andrey.kolesov, carlos

On Wed, Dec 7, 2022 at 12:52 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> No changes to the logic, just change how rodata is handled.
>
> 1. Define the rodatas using the new macros so they check that the
>    offset is correct.
>
> 2. Use common data where applicable.
> ---
>  .../multiarch/svml_s_tanhf16_core_avx512.S    | 450 ++++++++----------
>  1 file changed, 197 insertions(+), 253 deletions(-)
>
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> index d74fc7731d..765e9ed7f7 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> @@ -70,94 +70,99 @@
>   *
>   */
>
> -/* Offsets for data table __svml_stanh_data_internal_avx512. Ordered
> -   by use in the function. On cold-starts this might help the
> -   prefetcher. Possibly a better idea is to interleave start/end so
> -   that the prefetcher is less likely to detect a stream and pull
> -   irrelivant lines into cache.  */
>
> -/* Offsets for data table __svml_stanh_data_internal. 4 bytes each.
> - */
> +
> +#define LOCAL_DATA_NAME        __svml_stanh_data_internal
> +#define LOCAL_DATA_NAME_UNALIGNED      __svml_stanh_data_internal_unaligned
> +#include "svml_s_common_evex512_rodata_offsets.h"
> +
> +/* Offsets for data table __svml_stanh_data_internal_unaligned.
> +   4 bytes each.  */
>  #define _iExpMantMask_UISA             0
>  #define _iMinIdxOfsMask_UISA           4
>  #define _iMaxIdxMask_UISA              8
>  #define _iExpMask                      12
>
> -/* Offsets for data table __svml_stanh_data_internal_al64. 64 bytes
> -   each.  */
> -#define _sC_lo                         0
> -#define _sC_hi                         64
> -#define _sP7_lo                                128
> -#define _sP7_hi                                192
> -#define _sSignMask                     256
> -#define _sP6_lo                                320
> -#define _sP6_hi                                384
> -#define _sP5_lo                                448
> -#define _sP5_hi                                512
> -#define _sP4_lo                                576
> -#define _sP4_hi                                640
> -#define _sP3_lo                                704
> -#define _sP3_hi                                768
> -#define _sP2_lo                                832
> -#define _sP2_hi                                896
> -#define _sP0_lo                                960
> -#define _sP0_hi                                1024
> +/* Offsets for data table __svml_stanh_data_internal. Ordered
> +   by use in the function. On cold-starts this might help the
> +   prefetcher. Possibly a better idea is to interleave start/end so
> +   that the prefetcher is less likely to detect a stream and pull
> +   irrelivant lines into cache.  */
> +
> +/* Offsets for data table __svml_stanh_data_internal.
> +   64 bytes each.  */
> +#define _sC_lo 0
> +#define _sC_hi 64
> +#define _sP7_lo        128
> +#define _sP7_hi        192
> +#define _sP6_lo        256
> +#define _sP6_hi        320
> +#define _sP5_lo        384
> +#define _sP5_hi        448
> +#define _sP4_lo        512
> +#define _sP4_hi        576
> +#define _sP3_lo        640
> +#define _sP3_hi        704
> +#define _sP2_lo        768
> +#define _sP2_hi        832
> +#define _sP0_lo        896
> +#define _sP0_hi        960
> +
>
>  #include <sysdep.h>
> -#define TANHF_DATA(x)                  ((x)+__svml_stanh_data_internal_al64)
> -#define TANHF_DATA_UNALIGNED(x)                ((x)+__svml_stanh_data_internal)
>
>         .section .text.evex512, "ax", @progbits
>  ENTRY(_ZGVeN16v_tanhf_skx)
> -       /* Here huge arguments, INF and NaNs are filtered out to callout. */
> -       vpandd  TANHF_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> -       vpsubd  TANHF_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
> +       /* Here huge arguments, INF and NaNs are filtered out to
> +          callout.  */
> +       vpandd  LOCAL_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> +       vpsubd  LOCAL_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
>
>         /* Selection arguments between [0, 0x03e00000] into zmm3.  */
>         vpxord  %zmm3, %zmm3, %zmm3
>         vpmaxsd %zmm3, %zmm2, %zmm3
> -       vpminsd TANHF_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
> +       vpminsd LOCAL_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
>
>         /* Setup permute indices in zmm3.  */
>         vpsrld  $21, %zmm3, %zmm3
>
>         /* Store if there are any special cases in k1.  */
> -       vpcmpd  $6, TANHF_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
> +       vpcmpd  $6, LOCAL_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
>
> -       vmovaps TANHF_DATA(_sC_lo)(%rip), %zmm5
> -       vpermt2ps TANHF_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> +       vmovaps LOCAL_DATA(_sC_lo)(%rip), %zmm5
> +       vpermt2ps LOCAL_DATA(_sC_hi)(%rip), %zmm3, %zmm5
>
> -       vmovaps TANHF_DATA(_sP7_lo)(%rip), %zmm2
> -       vpermt2ps TANHF_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> +       vmovaps LOCAL_DATA(_sP7_lo)(%rip), %zmm2
> +       vpermt2ps LOCAL_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
>
>         /* Store absolute values of inputs in zmm1.  */
> -       vmovaps TANHF_DATA(_sSignMask)(%rip), %zmm4
> +       vmovaps COMMON_DATA(_SignMask)(%rip), %zmm4
>         vandnps %zmm0, %zmm4, %zmm1
>         vsubps  {rn-sae}, %zmm5, %zmm1, %zmm1
>
> -       vmovaps TANHF_DATA(_sP6_lo)(%rip), %zmm5
> -       vpermt2ps TANHF_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> +       vmovaps LOCAL_DATA(_sP6_lo)(%rip), %zmm5
> +       vpermt2ps LOCAL_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
>
> -       vmovaps TANHF_DATA(_sP5_lo)(%rip), %zmm6
> -       vpermt2ps TANHF_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> +       vmovaps LOCAL_DATA(_sP5_lo)(%rip), %zmm6
> +       vpermt2ps LOCAL_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
>
>         vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm2
>         vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm2
>
> -       vmovaps TANHF_DATA(_sP4_lo)(%rip), %zmm7
> -       vpermt2ps TANHF_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> +       vmovaps LOCAL_DATA(_sP4_lo)(%rip), %zmm7
> +       vpermt2ps LOCAL_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
>
> -       vmovaps TANHF_DATA(_sP3_lo)(%rip), %zmm8
> -       vpermt2ps TANHF_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> +       vmovaps LOCAL_DATA(_sP3_lo)(%rip), %zmm8
> +       vpermt2ps LOCAL_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
>
>         vfmadd213ps {rn-sae}, %zmm7, %zmm1, %zmm2
>         vfmadd213ps {rn-sae}, %zmm8, %zmm1, %zmm2
>
> -       vmovaps TANHF_DATA(_sP2_lo)(%rip), %zmm9
> -       vpermt2ps TANHF_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> +       vmovaps LOCAL_DATA(_sP2_lo)(%rip), %zmm9
> +       vpermt2ps LOCAL_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
>
> -       vmovaps TANHF_DATA(_sP0_lo)(%rip), %zmm10
> -       vpermt2ps TANHF_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> +       vmovaps LOCAL_DATA(_sP0_lo)(%rip), %zmm10
> +       vpermt2ps LOCAL_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
>
>         vfmadd213ps {rn-sae}, %zmm9, %zmm1, %zmm2
>         vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm2
> @@ -167,7 +172,7 @@ ENTRY(_ZGVeN16v_tanhf_skx)
>
>         /* Go to special inputs processing branch.  */
>         jne     L(SPECIAL_VALUES_BRANCH)
> -       # LOE rbx r12 r13 r14 r15 zmm0 zmm2 zmm4
> +
>         /* Wait until after branch of write over zmm0.  */
>         vpternlogd $0xec, %zmm4, %zmm2, %zmm0
>
> @@ -176,24 +181,24 @@ ENTRY(_ZGVeN16v_tanhf_skx)
>
>         /* Cold case. edx has 1s where there was a special value that
>            needs to be handled by a tanhf call. Optimize for code size
> -          more so than speed here. */
> +          more so than speed here.  */
>  L(SPECIAL_VALUES_BRANCH):
> -       # LOE rbx rdx r12 r13 r14 r15 zmm0 zmm2 zmm4
> -    /* Use r13 to save/restore the stack. This allows us to use rbp as
> -       callee save register saving code size. */
> +
> +       /* Use r13 to save/restore the stack. This allows us to use rbp
> +          as callee save register saving code size.  */
>         pushq   %r13
> -       cfi_adjust_cfa_offset(8)
> -       cfi_offset(r13, -16)
> -       /* Need to callee save registers to preserve state across tanhf calls.
> -        */
> +       cfi_adjust_cfa_offset (8)
> +       cfi_offset (r13, -16)
> +       /* Need to callee save registers to preserve state across tanhf
> +          calls.  */
>         pushq   %rbx
> -       cfi_adjust_cfa_offset(8)
> -       cfi_offset(rbx, -24)
> +       cfi_adjust_cfa_offset (8)
> +       cfi_offset (rbx, -24)
>         pushq   %rbp
> -       cfi_adjust_cfa_offset(8)
> -       cfi_offset(rbp, -32)
> +       cfi_adjust_cfa_offset (8)
> +       cfi_offset (rbp, -32)
>         movq    %rsp, %r13
> -       cfi_def_cfa_register(r13)
> +       cfi_def_cfa_register (r13)
>
>         /* Align stack and make room for 2x zmm vectors.  */
>         andq    $-64, %rsp
> @@ -207,16 +212,17 @@ L(SPECIAL_VALUES_BRANCH):
>
>         vzeroupper
>
> -       /* edx has 1s where there was a special value that needs to be handled
> -          by a tanhf call.  */
> +       /* edx has 1s where there was a special value that needs to be
> +          handled by a tanhf call.  */
>         movl    %edx, %ebx
>  L(SPECIAL_VALUES_LOOP):
> -       # LOE rbx rbp r12 r13 r14 r15
> -       /* use rbp as index for special value that is saved across calls to
> -          tanhf. We technically don't need a callee save register here as offset
> -          to rsp is always [0, 56] so we can restore rsp by realigning to 64.
> -          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> -          in the loop. Realigning also costs more code size.  */
> +
> +       /* use rbp as index for special value that is saved across calls
> +          to tanhf. We technically don't need a callee save register
> +          here as offset to rsp is always [0, 56] so we can restore
> +          rsp by realigning to 64. Essentially the tradeoff is 1 extra
> +          save/restore vs 2 extra instructions in the loop. Realigning
> +          also costs more code size.  */
>         xorl    %ebp, %ebp
>         tzcntl  %ebx, %ebp
>
> @@ -224,203 +230,141 @@ L(SPECIAL_VALUES_LOOP):
>         vmovss  64(%rsp, %rbp, 4), %xmm0
>         call    tanhf@PLT
>
> -       /* No good way to avoid the store-forwarding fault this will cause on
> -          return. `lfence` avoids the SF fault but at greater cost as it
> -          serialized stack/callee save restoration.  */
> +       /* No good way to avoid the store-forwarding fault this will
> +          cause on return. `lfence` avoids the SF fault but at greater
> +          cost as it serialized stack/callee save restoration.  */
>         vmovss  %xmm0, (%rsp, %rbp, 4)
>
> -       blsrl   %ebx, %ebx
> +       blsrl   %ebx, %ebx
>         jnz     L(SPECIAL_VALUES_LOOP)
> -       # LOE r12 r13 r14 r15
> +
>
>         /* All results have been written to (%rsp).  */
>         vmovaps (%rsp), %zmm0
>         /* Restore rsp.  */
>         movq    %r13, %rsp
> -       cfi_def_cfa_register(rsp)
> +       cfi_def_cfa_register (rsp)
>         /* Restore callee save registers.  */
>         popq    %rbp
> -       cfi_adjust_cfa_offset(-8)
> -       cfi_restore(rbp)
> +       cfi_adjust_cfa_offset (-8)
> +       cfi_restore (rbp)
>         popq    %rbx
> -       cfi_adjust_cfa_offset(-8)
> -       cfi_restore(rbp)
> +       cfi_adjust_cfa_offset (-8)
> +       cfi_restore (rbp)
>         popq    %r13
> -       cfi_adjust_cfa_offset(-8)
> -       cfi_restore(r13)
> +       cfi_adjust_cfa_offset (-8)
> +       cfi_restore (r13)
>         ret
>  END(_ZGVeN16v_tanhf_skx)
>
> -       .section .rodata, "a"
> +       .section .rodata.evex512, "a"
>         .align  16
> -#ifdef __svml_stanh_data_internal_typedef
> -typedef unsigned int VUINT32;
> -typedef struct
> -       {
> -       __declspec(align(4)) VUINT32 _iExpMantMask_UISA[1][1];
> -       __declspec(align(4)) VUINT32 _iMinIdxOfsMask_UISA[1][1];
> -       __declspec(align(4)) VUINT32 _iMaxIdxMask_UISA[1][1];
> -       __declspec(align(4)) VUINT32 _iExpMask[1][1];
> -       __declspec(align(64)) VUINT32 _sC_lo[16][1];
> -       __declspec(align(64)) VUINT32 _sC_hi[16][1];
> -       __declspec(align(64)) VUINT32 _sP7_lo[16][1];
> -       __declspec(align(64)) VUINT32 _sP7_hi[16][1];
> -       __declspec(align(64)) VUINT32 _sSignMask[16][1];
> -       __declspec(align(64)) VUINT32 _sP6_lo[16][1];
> -       __declspec(align(64)) VUINT32 _sP6_hi[16][1];
> -       __declspec(align(64)) VUINT32 _sP5_lo[16][1];
> -       __declspec(align(64)) VUINT32 _sP5_hi[16][1];
> -       __declspec(align(64)) VUINT32 _sP4_lo[16][1];
> -       __declspec(align(64)) VUINT32 _sP4_hi[16][1];
> -       __declspec(align(64)) VUINT32 _sP3_lo[16][1];
> -       __declspec(align(64)) VUINT32 _sP3_hi[16][1];
> -       __declspec(align(64)) VUINT32 _sP2_lo[16][1];
> -       __declspec(align(64)) VUINT32 _sP2_hi[16][1];
> -       __declspec(align(64)) VUINT32 _sP0_lo[16][1];
> -       __declspec(align(64)) VUINT32 _sP0_hi[16][1];
> -} __svml_stanh_data_internal;
> -#endif
> -
> -__svml_stanh_data_internal:
> -       .align  4
> -       /* _iExpMantMask_UISA */
> -       .long   0x7fe00000
> -
> -       .align  4
> -       /* _iMinIdxOfsMask_UISA */
> -       .long   0x3d400000
> -
> -       .align  4
> -       /* _iMaxIdxMask_UISA */
> -       .long   0x03e00000
> -
> -       .align  4
> -       /* _iExpMask */
> -       .long   0x7f000000
> -
> -       .align  64
> -__svml_stanh_data_internal_al64:
> -       .align  64
> -       /* _sC_lo */
> -       .long   0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
> -       .long   0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
> -       .long   0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
> -       .long   0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
> -
> -       .align  64
> -       /* _sC_hi */
> -       .long   0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
> -       .long   0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
> -       .long   0x40500000, 0x40700000, 0x40900000, 0x40b00000
> -       .long   0x40d00000, 0x40f00000, 0x41100000, 0x00000000
> -
> -       .align  64
> -       /* _sP7_lo */
> -       .long   0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
> -       .long   0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
> -       .long   0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
> -       .long   0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
> -
> -       .align  64
> -       /* _sP7_hi */
> -       .long   0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
> -       .long   0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
> -       .long   0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
> -       .long   0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
>
> -       .align  64
> -       /* _sSignMask */
> -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> -
> -       .align  64
> -       /* _sP6_lo */
> -       .long   0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
> -       .long   0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
> -       .long   0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
> -       .long   0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
> -
> -       .align  64
> -       /* _sP6_hi */
> -       .long   0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
> -       .long   0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
> -       .long   0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
> -       .long   0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
> -
> -       .align  64
> -       /* _sP5_lo */
> -       .long   0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
> -       .long   0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
> -       .long   0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
> -       .long   0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
> -
> -       .align  64
> -       /* _sP5_hi */
> -       .long   0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
> -       .long   0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
> -       .long   0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
> -       .long   0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
> -
> -       .align  64
> -       /* _sP4_lo */
> -       .long   0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
> -       .long   0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
> -       .long   0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
> -       .long   0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
> -
> -       .align  64
> -       /* _sP4_hi */
> -       .long   0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
> -       .long   0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
> -       .long   0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
> -       .long   0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
> -
> -       .align  64
> -       /* _sP3_lo */
> -       .long   0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
> -       .long   0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
> -       .long   0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
> -       .long   0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
> -
> -       .align  64
> -       /* _sP3_hi */
> -       .long   0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
> -       .long   0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
> -       .long   0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
> -       .long   0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
> -
> -       .align  64
> -       /* _sP2_lo */
> -       .long   0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
> -       .long   0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
> -       .long   0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
> -       .long   0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
> -
> -       .align  64
> -       /* _sP2_hi */
> -       .long   0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
> -       .long   0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
> -       .long   0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
> -       .long   0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
> -
> -       .align  64
> -       /* _sP0_lo */
> -       .long   0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
> -       .long   0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
> -       .long   0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
> -       .long   0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
> -
> -       .align  64
> -       /* _sP0_hi */
> -       .long   0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
> -       .long   0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
> -       .long   0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
> -       .long   0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
> +LOCAL_DATA_NAME_UNALIGNED:
> +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMantMask_UISA, 0x7fe00000)
> +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iMinIdxOfsMask_UISA, 0x3d400000)
> +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iMaxIdxMask_UISA, 0x03e00000)
> +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMask, 0x7f000000)
> +       .type   LOCAL_DATA_NAME_UNALIGNED, @object
> +       .size   LOCAL_DATA_NAME_UNALIGNED, .-LOCAL_DATA_NAME_UNALIGNED
>
>         .align  64
> -       .type   __svml_stanh_data_internal_al64, @object
> -       .size   __svml_stanh_data_internal_al64, .-__svml_stanh_data_internal_al64
> -       .type   __svml_stanh_data_internal, @object
> -       .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> +LOCAL_DATA_NAME:
> +       float_block (LOCAL_DATA_NAME, _sC_lo,
> +               0x00000000, 0x3d700000, 0x3d900000, 0x3db00000,
> +               0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000,
> +               0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000,
> +               0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000)
> +
> +       float_block (LOCAL_DATA_NAME, _sC_hi,
> +               0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000,
> +               0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000,
> +               0x40500000, 0x40700000, 0x40900000, 0x40b00000,
> +               0x40d00000, 0x40f00000, 0x41100000, 0x00000000)
> +
> +       float_block (LOCAL_DATA_NAME, _sP7_lo,
> +               0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e,
> +               0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57,
> +               0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f,
> +               0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0)
> +
> +       float_block (LOCAL_DATA_NAME, _sP7_hi,
> +               0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b,
> +               0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22,
> +               0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950,
> +               0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000)
> +
> +       float_block (LOCAL_DATA_NAME, _sP6_lo,
> +               0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756,
> +               0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0,
> +               0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17,
> +               0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad)
> +
> +       float_block (LOCAL_DATA_NAME, _sP6_hi,
> +               0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63,
> +               0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66,
> +               0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3,
> +               0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000)
> +
> +       float_block (LOCAL_DATA_NAME, _sP5_lo,
> +               0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d,
> +               0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670,
> +               0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405,
> +               0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4)
> +
> +       float_block (LOCAL_DATA_NAME, _sP5_hi,
> +               0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9,
> +               0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd,
> +               0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232,
> +               0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000)
> +
> +       float_block (LOCAL_DATA_NAME, _sP4_lo,
> +               0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120,
> +               0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a,
> +               0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88,
> +               0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e)
> +
> +       float_block (LOCAL_DATA_NAME, _sP4_hi,
> +               0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96,
> +               0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67,
> +               0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9,
> +               0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000)
> +
> +       float_block (LOCAL_DATA_NAME, _sP3_lo,
> +               0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d,
> +               0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3,
> +               0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca,
> +               0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92)
> +
> +       float_block (LOCAL_DATA_NAME, _sP3_hi,
> +               0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704,
> +               0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06,
> +               0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2,
> +               0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000)
> +
> +       float_block (LOCAL_DATA_NAME, _sP2_lo,
> +               0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f,
> +               0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580,
> +               0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92,
> +               0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360)
> +
> +       float_block (LOCAL_DATA_NAME, _sP2_hi,
> +               0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2,
> +               0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4,
> +               0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b,
> +               0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000)
> +
> +       float_block (LOCAL_DATA_NAME, _sP0_lo,
> +               0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169,
> +               0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984,
> +               0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163,
> +               0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0)
> +
> +       float_block (LOCAL_DATA_NAME, _sP0_hi,
> +               0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53,
> +               0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85,
> +               0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0,
> +               0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000)
> +
> +       .type   LOCAL_DATA_NAME, @object
> +       .size   LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
> --
> 2.34.1
>

The data movement makes the assembler code much harder to follow.
Sunil, what do you think of this patch series?


-- 
H.J.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v1 07/27] x86/fpu: Update rodata usage in svml_s_tanhf16_core_avx512.S
  2022-12-16 17:05   ` H.J. Lu via Libc-alpha
@ 2022-12-16 18:17     ` Noah Goldstein via Libc-alpha
  2022-12-16 21:37       ` H.J. Lu via Libc-alpha
  0 siblings, 1 reply; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-16 18:17 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Sunil K Pandey, libc-alpha, andrey.kolesov, carlos

On Fri, Dec 16, 2022 at 9:06 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, Dec 7, 2022 at 12:52 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > No changes to the logic, just change how rodata is handled.
> >
> > 1. Define the rodatas using the new macros so they check that the
> >    offset is correct.
> >
> > 2. Use common data where applicable.
> > ---
> >  .../multiarch/svml_s_tanhf16_core_avx512.S    | 450 ++++++++----------
> >  1 file changed, 197 insertions(+), 253 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > index d74fc7731d..765e9ed7f7 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > @@ -70,94 +70,99 @@
> >   *
> >   */
> >
> > -/* Offsets for data table __svml_stanh_data_internal_avx512. Ordered
> > -   by use in the function. On cold-starts this might help the
> > -   prefetcher. Possibly a better idea is to interleave start/end so
> > -   that the prefetcher is less likely to detect a stream and pull
> > -   irrelivant lines into cache.  */
> >
> > -/* Offsets for data table __svml_stanh_data_internal. 4 bytes each.
> > - */
> > +
> > +#define LOCAL_DATA_NAME        __svml_stanh_data_internal
> > +#define LOCAL_DATA_NAME_UNALIGNED      __svml_stanh_data_internal_unaligned
> > +#include "svml_s_common_evex512_rodata_offsets.h"
> > +
> > +/* Offsets for data table __svml_stanh_data_internal_unaligned.
> > +   4 bytes each.  */
> >  #define _iExpMantMask_UISA             0
> >  #define _iMinIdxOfsMask_UISA           4
> >  #define _iMaxIdxMask_UISA              8
> >  #define _iExpMask                      12
> >
> > -/* Offsets for data table __svml_stanh_data_internal_al64. 64 bytes
> > -   each.  */
> > -#define _sC_lo                         0
> > -#define _sC_hi                         64
> > -#define _sP7_lo                                128
> > -#define _sP7_hi                                192
> > -#define _sSignMask                     256
> > -#define _sP6_lo                                320
> > -#define _sP6_hi                                384
> > -#define _sP5_lo                                448
> > -#define _sP5_hi                                512
> > -#define _sP4_lo                                576
> > -#define _sP4_hi                                640
> > -#define _sP3_lo                                704
> > -#define _sP3_hi                                768
> > -#define _sP2_lo                                832
> > -#define _sP2_hi                                896
> > -#define _sP0_lo                                960
> > -#define _sP0_hi                                1024
> > +/* Offsets for data table __svml_stanh_data_internal. Ordered
> > +   by use in the function. On cold-starts this might help the
> > +   prefetcher. Possibly a better idea is to interleave start/end so
> > +   that the prefetcher is less likely to detect a stream and pull
> > +   irrelivant lines into cache.  */
> > +
> > +/* Offsets for data table __svml_stanh_data_internal.
> > +   64 bytes each.  */
> > +#define _sC_lo 0
> > +#define _sC_hi 64
> > +#define _sP7_lo        128
> > +#define _sP7_hi        192
> > +#define _sP6_lo        256
> > +#define _sP6_hi        320
> > +#define _sP5_lo        384
> > +#define _sP5_hi        448
> > +#define _sP4_lo        512
> > +#define _sP4_hi        576
> > +#define _sP3_lo        640
> > +#define _sP3_hi        704
> > +#define _sP2_lo        768
> > +#define _sP2_hi        832
> > +#define _sP0_lo        896
> > +#define _sP0_hi        960
> > +
> >
> >  #include <sysdep.h>
> > -#define TANHF_DATA(x)                  ((x)+__svml_stanh_data_internal_al64)
> > -#define TANHF_DATA_UNALIGNED(x)                ((x)+__svml_stanh_data_internal)
> >
> >         .section .text.evex512, "ax", @progbits
> >  ENTRY(_ZGVeN16v_tanhf_skx)
> > -       /* Here huge arguments, INF and NaNs are filtered out to callout. */
> > -       vpandd  TANHF_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> > -       vpsubd  TANHF_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
> > +       /* Here huge arguments, INF and NaNs are filtered out to
> > +          callout.  */
> > +       vpandd  LOCAL_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> > +       vpsubd  LOCAL_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
> >
> >         /* Selection arguments between [0, 0x03e00000] into zmm3.  */
> >         vpxord  %zmm3, %zmm3, %zmm3
> >         vpmaxsd %zmm3, %zmm2, %zmm3
> > -       vpminsd TANHF_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
> > +       vpminsd LOCAL_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
> >
> >         /* Setup permute indices in zmm3.  */
> >         vpsrld  $21, %zmm3, %zmm3
> >
> >         /* Store if there are any special cases in k1.  */
> > -       vpcmpd  $6, TANHF_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
> > +       vpcmpd  $6, LOCAL_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
> >
> > -       vmovaps TANHF_DATA(_sC_lo)(%rip), %zmm5
> > -       vpermt2ps TANHF_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> > +       vmovaps LOCAL_DATA(_sC_lo)(%rip), %zmm5
> > +       vpermt2ps LOCAL_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> >
> > -       vmovaps TANHF_DATA(_sP7_lo)(%rip), %zmm2
> > -       vpermt2ps TANHF_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> > +       vmovaps LOCAL_DATA(_sP7_lo)(%rip), %zmm2
> > +       vpermt2ps LOCAL_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> >
> >         /* Store absolute values of inputs in zmm1.  */
> > -       vmovaps TANHF_DATA(_sSignMask)(%rip), %zmm4
> > +       vmovaps COMMON_DATA(_SignMask)(%rip), %zmm4
> >         vandnps %zmm0, %zmm4, %zmm1
> >         vsubps  {rn-sae}, %zmm5, %zmm1, %zmm1
> >
> > -       vmovaps TANHF_DATA(_sP6_lo)(%rip), %zmm5
> > -       vpermt2ps TANHF_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> > +       vmovaps LOCAL_DATA(_sP6_lo)(%rip), %zmm5
> > +       vpermt2ps LOCAL_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> >
> > -       vmovaps TANHF_DATA(_sP5_lo)(%rip), %zmm6
> > -       vpermt2ps TANHF_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> > +       vmovaps LOCAL_DATA(_sP5_lo)(%rip), %zmm6
> > +       vpermt2ps LOCAL_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> >
> >         vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm2
> >         vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm2
> >
> > -       vmovaps TANHF_DATA(_sP4_lo)(%rip), %zmm7
> > -       vpermt2ps TANHF_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> > +       vmovaps LOCAL_DATA(_sP4_lo)(%rip), %zmm7
> > +       vpermt2ps LOCAL_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> >
> > -       vmovaps TANHF_DATA(_sP3_lo)(%rip), %zmm8
> > -       vpermt2ps TANHF_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> > +       vmovaps LOCAL_DATA(_sP3_lo)(%rip), %zmm8
> > +       vpermt2ps LOCAL_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> >
> >         vfmadd213ps {rn-sae}, %zmm7, %zmm1, %zmm2
> >         vfmadd213ps {rn-sae}, %zmm8, %zmm1, %zmm2
> >
> > -       vmovaps TANHF_DATA(_sP2_lo)(%rip), %zmm9
> > -       vpermt2ps TANHF_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> > +       vmovaps LOCAL_DATA(_sP2_lo)(%rip), %zmm9
> > +       vpermt2ps LOCAL_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> >
> > -       vmovaps TANHF_DATA(_sP0_lo)(%rip), %zmm10
> > -       vpermt2ps TANHF_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> > +       vmovaps LOCAL_DATA(_sP0_lo)(%rip), %zmm10
> > +       vpermt2ps LOCAL_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> >
> >         vfmadd213ps {rn-sae}, %zmm9, %zmm1, %zmm2
> >         vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm2
> > @@ -167,7 +172,7 @@ ENTRY(_ZGVeN16v_tanhf_skx)
> >
> >         /* Go to special inputs processing branch.  */
> >         jne     L(SPECIAL_VALUES_BRANCH)
> > -       # LOE rbx r12 r13 r14 r15 zmm0 zmm2 zmm4
> > +
> >         /* Wait until after branch of write over zmm0.  */
> >         vpternlogd $0xec, %zmm4, %zmm2, %zmm0
> >
> > @@ -176,24 +181,24 @@ ENTRY(_ZGVeN16v_tanhf_skx)
> >
> >         /* Cold case. edx has 1s where there was a special value that
> >            needs to be handled by a tanhf call. Optimize for code size
> > -          more so than speed here. */
> > +          more so than speed here.  */
> >  L(SPECIAL_VALUES_BRANCH):
> > -       # LOE rbx rdx r12 r13 r14 r15 zmm0 zmm2 zmm4
> > -    /* Use r13 to save/restore the stack. This allows us to use rbp as
> > -       callee save register saving code size. */
> > +
> > +       /* Use r13 to save/restore the stack. This allows us to use rbp
> > +          as callee save register saving code size.  */
> >         pushq   %r13
> > -       cfi_adjust_cfa_offset(8)
> > -       cfi_offset(r13, -16)
> > -       /* Need to callee save registers to preserve state across tanhf calls.
> > -        */
> > +       cfi_adjust_cfa_offset (8)
> > +       cfi_offset (r13, -16)
> > +       /* Need to callee save registers to preserve state across tanhf
> > +          calls.  */
> >         pushq   %rbx
> > -       cfi_adjust_cfa_offset(8)
> > -       cfi_offset(rbx, -24)
> > +       cfi_adjust_cfa_offset (8)
> > +       cfi_offset (rbx, -24)
> >         pushq   %rbp
> > -       cfi_adjust_cfa_offset(8)
> > -       cfi_offset(rbp, -32)
> > +       cfi_adjust_cfa_offset (8)
> > +       cfi_offset (rbp, -32)
> >         movq    %rsp, %r13
> > -       cfi_def_cfa_register(r13)
> > +       cfi_def_cfa_register (r13)
> >
> >         /* Align stack and make room for 2x zmm vectors.  */
> >         andq    $-64, %rsp
> > @@ -207,16 +212,17 @@ L(SPECIAL_VALUES_BRANCH):
> >
> >         vzeroupper
> >
> > -       /* edx has 1s where there was a special value that needs to be handled
> > -          by a tanhf call.  */
> > +       /* edx has 1s where there was a special value that needs to be
> > +          handled by a tanhf call.  */
> >         movl    %edx, %ebx
> >  L(SPECIAL_VALUES_LOOP):
> > -       # LOE rbx rbp r12 r13 r14 r15
> > -       /* use rbp as index for special value that is saved across calls to
> > -          tanhf. We technically don't need a callee save register here as offset
> > -          to rsp is always [0, 56] so we can restore rsp by realigning to 64.
> > -          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > -          in the loop. Realigning also costs more code size.  */
> > +
> > +       /* use rbp as index for special value that is saved across calls
> > +          to tanhf. We technically don't need a callee save register
> > +          here as offset to rsp is always [0, 56] so we can restore
> > +          rsp by realigning to 64. Essentially the tradeoff is 1 extra
> > +          save/restore vs 2 extra instructions in the loop. Realigning
> > +          also costs more code size.  */
> >         xorl    %ebp, %ebp
> >         tzcntl  %ebx, %ebp
> >
> > @@ -224,203 +230,141 @@ L(SPECIAL_VALUES_LOOP):
> >         vmovss  64(%rsp, %rbp, 4), %xmm0
> >         call    tanhf@PLT
> >
> > -       /* No good way to avoid the store-forwarding fault this will cause on
> > -          return. `lfence` avoids the SF fault but at greater cost as it
> > -          serialized stack/callee save restoration.  */
> > +       /* No good way to avoid the store-forwarding fault this will
> > +          cause on return. `lfence` avoids the SF fault but at greater
> > +          cost as it serialized stack/callee save restoration.  */
> >         vmovss  %xmm0, (%rsp, %rbp, 4)
> >
> > -       blsrl   %ebx, %ebx
> > +       blsrl   %ebx, %ebx
> >         jnz     L(SPECIAL_VALUES_LOOP)
> > -       # LOE r12 r13 r14 r15
> > +
> >
> >         /* All results have been written to (%rsp).  */
> >         vmovaps (%rsp), %zmm0
> >         /* Restore rsp.  */
> >         movq    %r13, %rsp
> > -       cfi_def_cfa_register(rsp)
> > +       cfi_def_cfa_register (rsp)
> >         /* Restore callee save registers.  */
> >         popq    %rbp
> > -       cfi_adjust_cfa_offset(-8)
> > -       cfi_restore(rbp)
> > +       cfi_adjust_cfa_offset (-8)
> > +       cfi_restore (rbp)
> >         popq    %rbx
> > -       cfi_adjust_cfa_offset(-8)
> > -       cfi_restore(rbp)
> > +       cfi_adjust_cfa_offset (-8)
> > +       cfi_restore (rbp)
> >         popq    %r13
> > -       cfi_adjust_cfa_offset(-8)
> > -       cfi_restore(r13)
> > +       cfi_adjust_cfa_offset (-8)
> > +       cfi_restore (r13)
> >         ret
> >  END(_ZGVeN16v_tanhf_skx)
> >
> > -       .section .rodata, "a"
> > +       .section .rodata.evex512, "a"
> >         .align  16
> > -#ifdef __svml_stanh_data_internal_typedef
> > -typedef unsigned int VUINT32;
> > -typedef struct
> > -       {
> > -       __declspec(align(4)) VUINT32 _iExpMantMask_UISA[1][1];
> > -       __declspec(align(4)) VUINT32 _iMinIdxOfsMask_UISA[1][1];
> > -       __declspec(align(4)) VUINT32 _iMaxIdxMask_UISA[1][1];
> > -       __declspec(align(4)) VUINT32 _iExpMask[1][1];
> > -       __declspec(align(64)) VUINT32 _sC_lo[16][1];
> > -       __declspec(align(64)) VUINT32 _sC_hi[16][1];
> > -       __declspec(align(64)) VUINT32 _sP7_lo[16][1];
> > -       __declspec(align(64)) VUINT32 _sP7_hi[16][1];
> > -       __declspec(align(64)) VUINT32 _sSignMask[16][1];
> > -       __declspec(align(64)) VUINT32 _sP6_lo[16][1];
> > -       __declspec(align(64)) VUINT32 _sP6_hi[16][1];
> > -       __declspec(align(64)) VUINT32 _sP5_lo[16][1];
> > -       __declspec(align(64)) VUINT32 _sP5_hi[16][1];
> > -       __declspec(align(64)) VUINT32 _sP4_lo[16][1];
> > -       __declspec(align(64)) VUINT32 _sP4_hi[16][1];
> > -       __declspec(align(64)) VUINT32 _sP3_lo[16][1];
> > -       __declspec(align(64)) VUINT32 _sP3_hi[16][1];
> > -       __declspec(align(64)) VUINT32 _sP2_lo[16][1];
> > -       __declspec(align(64)) VUINT32 _sP2_hi[16][1];
> > -       __declspec(align(64)) VUINT32 _sP0_lo[16][1];
> > -       __declspec(align(64)) VUINT32 _sP0_hi[16][1];
> > -} __svml_stanh_data_internal;
> > -#endif
> > -
> > -__svml_stanh_data_internal:
> > -       .align  4
> > -       /* _iExpMantMask_UISA */
> > -       .long   0x7fe00000
> > -
> > -       .align  4
> > -       /* _iMinIdxOfsMask_UISA */
> > -       .long   0x3d400000
> > -
> > -       .align  4
> > -       /* _iMaxIdxMask_UISA */
> > -       .long   0x03e00000
> > -
> > -       .align  4
> > -       /* _iExpMask */
> > -       .long   0x7f000000
> > -
> > -       .align  64
> > -__svml_stanh_data_internal_al64:
> > -       .align  64
> > -       /* _sC_lo */
> > -       .long   0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
> > -       .long   0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
> > -       .long   0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
> > -       .long   0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
> > -
> > -       .align  64
> > -       /* _sC_hi */
> > -       .long   0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
> > -       .long   0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
> > -       .long   0x40500000, 0x40700000, 0x40900000, 0x40b00000
> > -       .long   0x40d00000, 0x40f00000, 0x41100000, 0x00000000
> > -
> > -       .align  64
> > -       /* _sP7_lo */
> > -       .long   0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
> > -       .long   0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
> > -       .long   0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
> > -       .long   0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
> > -
> > -       .align  64
> > -       /* _sP7_hi */
> > -       .long   0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
> > -       .long   0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
> > -       .long   0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
> > -       .long   0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
> >
> > -       .align  64
> > -       /* _sSignMask */
> > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > -
> > -       .align  64
> > -       /* _sP6_lo */
> > -       .long   0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
> > -       .long   0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
> > -       .long   0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
> > -       .long   0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
> > -
> > -       .align  64
> > -       /* _sP6_hi */
> > -       .long   0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
> > -       .long   0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
> > -       .long   0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
> > -       .long   0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
> > -
> > -       .align  64
> > -       /* _sP5_lo */
> > -       .long   0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
> > -       .long   0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
> > -       .long   0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
> > -       .long   0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
> > -
> > -       .align  64
> > -       /* _sP5_hi */
> > -       .long   0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
> > -       .long   0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
> > -       .long   0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
> > -       .long   0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
> > -
> > -       .align  64
> > -       /* _sP4_lo */
> > -       .long   0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
> > -       .long   0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
> > -       .long   0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
> > -       .long   0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
> > -
> > -       .align  64
> > -       /* _sP4_hi */
> > -       .long   0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
> > -       .long   0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
> > -       .long   0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
> > -       .long   0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
> > -
> > -       .align  64
> > -       /* _sP3_lo */
> > -       .long   0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
> > -       .long   0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
> > -       .long   0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
> > -       .long   0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
> > -
> > -       .align  64
> > -       /* _sP3_hi */
> > -       .long   0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
> > -       .long   0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
> > -       .long   0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
> > -       .long   0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
> > -
> > -       .align  64
> > -       /* _sP2_lo */
> > -       .long   0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
> > -       .long   0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
> > -       .long   0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
> > -       .long   0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
> > -
> > -       .align  64
> > -       /* _sP2_hi */
> > -       .long   0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
> > -       .long   0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
> > -       .long   0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
> > -       .long   0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
> > -
> > -       .align  64
> > -       /* _sP0_lo */
> > -       .long   0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
> > -       .long   0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
> > -       .long   0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
> > -       .long   0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
> > -
> > -       .align  64
> > -       /* _sP0_hi */
> > -       .long   0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
> > -       .long   0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
> > -       .long   0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
> > -       .long   0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
> > +LOCAL_DATA_NAME_UNALIGNED:
> > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMantMask_UISA, 0x7fe00000)
> > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iMinIdxOfsMask_UISA, 0x3d400000)
> > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iMaxIdxMask_UISA, 0x03e00000)
> > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMask, 0x7f000000)
> > +       .type   LOCAL_DATA_NAME_UNALIGNED, @object
> > +       .size   LOCAL_DATA_NAME_UNALIGNED, .-LOCAL_DATA_NAME_UNALIGNED
> >
> >         .align  64
> > -       .type   __svml_stanh_data_internal_al64, @object
> > -       .size   __svml_stanh_data_internal_al64, .-__svml_stanh_data_internal_al64
> > -       .type   __svml_stanh_data_internal, @object
> > -       .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> > +LOCAL_DATA_NAME:
> > +       float_block (LOCAL_DATA_NAME, _sC_lo,
> > +               0x00000000, 0x3d700000, 0x3d900000, 0x3db00000,
> > +               0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000,
> > +               0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000,
> > +               0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000)
> > +
> > +       float_block (LOCAL_DATA_NAME, _sC_hi,
> > +               0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000,
> > +               0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000,
> > +               0x40500000, 0x40700000, 0x40900000, 0x40b00000,
> > +               0x40d00000, 0x40f00000, 0x41100000, 0x00000000)
> > +
> > +       float_block (LOCAL_DATA_NAME, _sP7_lo,
> > +               0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e,
> > +               0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57,
> > +               0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f,
> > +               0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0)
> > +
> > +       float_block (LOCAL_DATA_NAME, _sP7_hi,
> > +               0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b,
> > +               0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22,
> > +               0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950,
> > +               0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000)
> > +
> > +       float_block (LOCAL_DATA_NAME, _sP6_lo,
> > +               0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756,
> > +               0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0,
> > +               0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17,
> > +               0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad)
> > +
> > +       float_block (LOCAL_DATA_NAME, _sP6_hi,
> > +               0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63,
> > +               0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66,
> > +               0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3,
> > +               0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000)
> > +
> > +       float_block (LOCAL_DATA_NAME, _sP5_lo,
> > +               0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d,
> > +               0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670,
> > +               0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405,
> > +               0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4)
> > +
> > +       float_block (LOCAL_DATA_NAME, _sP5_hi,
> > +               0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9,
> > +               0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd,
> > +               0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232,
> > +               0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000)
> > +
> > +       float_block (LOCAL_DATA_NAME, _sP4_lo,
> > +               0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120,
> > +               0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a,
> > +               0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88,
> > +               0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e)
> > +
> > +       float_block (LOCAL_DATA_NAME, _sP4_hi,
> > +               0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96,
> > +               0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67,
> > +               0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9,
> > +               0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000)
> > +
> > +       float_block (LOCAL_DATA_NAME, _sP3_lo,
> > +               0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d,
> > +               0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3,
> > +               0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca,
> > +               0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92)
> > +
> > +       float_block (LOCAL_DATA_NAME, _sP3_hi,
> > +               0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704,
> > +               0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06,
> > +               0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2,
> > +               0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000)
> > +
> > +       float_block (LOCAL_DATA_NAME, _sP2_lo,
> > +               0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f,
> > +               0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580,
> > +               0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92,
> > +               0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360)
> > +
> > +       float_block (LOCAL_DATA_NAME, _sP2_hi,
> > +               0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2,
> > +               0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4,
> > +               0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b,
> > +               0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000)
> > +
> > +       float_block (LOCAL_DATA_NAME, _sP0_lo,
> > +               0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169,
> > +               0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984,
> > +               0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163,
> > +               0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0)
> > +
> > +       float_block (LOCAL_DATA_NAME, _sP0_hi,
> > +               0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53,
> > +               0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85,
> > +               0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0,
> > +               0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000)
> > +
> > +       .type   LOCAL_DATA_NAME, @object
> > +       .size   LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
> > --
> > 2.34.1
> >
>
> The data movement makes the assembler codes much harder to follow.
> Sunil, what do you think of this patch series?

What do you mean? The change in how we define rodata or the movement
to multiple files or something else?
>
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v1 07/27] x86/fpu: Update rodata usage in svml_s_tanhf16_core_avx512.S
  2022-12-16 18:17     ` Noah Goldstein via Libc-alpha
@ 2022-12-16 21:37       ` H.J. Lu via Libc-alpha
  2022-12-16 21:51         ` Noah Goldstein via Libc-alpha
  0 siblings, 1 reply; 38+ messages in thread
From: H.J. Lu via Libc-alpha @ 2022-12-16 21:37 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: Sunil K Pandey, libc-alpha, andrey.kolesov, carlos

On Fri, Dec 16, 2022 at 10:18 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Fri, Dec 16, 2022 at 9:06 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Wed, Dec 7, 2022 at 12:52 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > No changes to the logic, just change how rodata is handled.
> > >
> > > 1. Define the rodatas using the new macros so they check that the
> > >    offset is correct.
> > >
> > > 2. Use common data where applicable.
> > > ---
> > >  .../multiarch/svml_s_tanhf16_core_avx512.S    | 450 ++++++++----------
> > >  1 file changed, 197 insertions(+), 253 deletions(-)
> > >
> > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > index d74fc7731d..765e9ed7f7 100644
> > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > @@ -70,94 +70,99 @@
> > >   *
> > >   */
> > >
> > > -/* Offsets for data table __svml_stanh_data_internal_avx512. Ordered
> > > -   by use in the function. On cold-starts this might help the
> > > -   prefetcher. Possibly a better idea is to interleave start/end so
> > > -   that the prefetcher is less likely to detect a stream and pull
> > > -   irrelivant lines into cache.  */
> > >
> > > -/* Offsets for data table __svml_stanh_data_internal. 4 bytes each.
> > > - */
> > > +
> > > +#define LOCAL_DATA_NAME        __svml_stanh_data_internal
> > > +#define LOCAL_DATA_NAME_UNALIGNED      __svml_stanh_data_internal_unaligned
> > > +#include "svml_s_common_evex512_rodata_offsets.h"
> > > +
> > > +/* Offsets for data table __svml_stanh_data_internal_unaligned.
> > > +   4 bytes each.  */
> > >  #define _iExpMantMask_UISA             0
> > >  #define _iMinIdxOfsMask_UISA           4
> > >  #define _iMaxIdxMask_UISA              8
> > >  #define _iExpMask                      12
> > >
> > > -/* Offsets for data table __svml_stanh_data_internal_al64. 64 bytes
> > > -   each.  */
> > > -#define _sC_lo                         0
> > > -#define _sC_hi                         64
> > > -#define _sP7_lo                                128
> > > -#define _sP7_hi                                192
> > > -#define _sSignMask                     256
> > > -#define _sP6_lo                                320
> > > -#define _sP6_hi                                384
> > > -#define _sP5_lo                                448
> > > -#define _sP5_hi                                512
> > > -#define _sP4_lo                                576
> > > -#define _sP4_hi                                640
> > > -#define _sP3_lo                                704
> > > -#define _sP3_hi                                768
> > > -#define _sP2_lo                                832
> > > -#define _sP2_hi                                896
> > > -#define _sP0_lo                                960
> > > -#define _sP0_hi                                1024
> > > +/* Offsets for data table __svml_stanh_data_internal. Ordered
> > > +   by use in the function. On cold-starts this might help the
> > > +   prefetcher. Possibly a better idea is to interleave start/end so
> > > +   that the prefetcher is less likely to detect a stream and pull
> > > +   irrelivant lines into cache.  */
> > > +
> > > +/* Offsets for data table __svml_stanh_data_internal.
> > > +   64 bytes each.  */
> > > +#define _sC_lo 0
> > > +#define _sC_hi 64
> > > +#define _sP7_lo        128
> > > +#define _sP7_hi        192
> > > +#define _sP6_lo        256
> > > +#define _sP6_hi        320
> > > +#define _sP5_lo        384
> > > +#define _sP5_hi        448
> > > +#define _sP4_lo        512
> > > +#define _sP4_hi        576
> > > +#define _sP3_lo        640
> > > +#define _sP3_hi        704
> > > +#define _sP2_lo        768
> > > +#define _sP2_hi        832
> > > +#define _sP0_lo        896
> > > +#define _sP0_hi        960
> > > +
> > >
> > >  #include <sysdep.h>
> > > -#define TANHF_DATA(x)                  ((x)+__svml_stanh_data_internal_al64)
> > > -#define TANHF_DATA_UNALIGNED(x)                ((x)+__svml_stanh_data_internal)
> > >
> > >         .section .text.evex512, "ax", @progbits
> > >  ENTRY(_ZGVeN16v_tanhf_skx)
> > > -       /* Here huge arguments, INF and NaNs are filtered out to callout. */
> > > -       vpandd  TANHF_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> > > -       vpsubd  TANHF_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
> > > +       /* Here huge arguments, INF and NaNs are filtered out to
> > > +          callout.  */
> > > +       vpandd  LOCAL_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> > > +       vpsubd  LOCAL_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
> > >
> > >         /* Selection arguments between [0, 0x03e00000] into zmm3.  */
> > >         vpxord  %zmm3, %zmm3, %zmm3
> > >         vpmaxsd %zmm3, %zmm2, %zmm3
> > > -       vpminsd TANHF_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
> > > +       vpminsd LOCAL_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
> > >
> > >         /* Setup permute indices in zmm3.  */
> > >         vpsrld  $21, %zmm3, %zmm3
> > >
> > >         /* Store if there are any special cases in k1.  */
> > > -       vpcmpd  $6, TANHF_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
> > > +       vpcmpd  $6, LOCAL_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
> > >
> > > -       vmovaps TANHF_DATA(_sC_lo)(%rip), %zmm5
> > > -       vpermt2ps TANHF_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> > > +       vmovaps LOCAL_DATA(_sC_lo)(%rip), %zmm5
> > > +       vpermt2ps LOCAL_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> > >
> > > -       vmovaps TANHF_DATA(_sP7_lo)(%rip), %zmm2
> > > -       vpermt2ps TANHF_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> > > +       vmovaps LOCAL_DATA(_sP7_lo)(%rip), %zmm2
> > > +       vpermt2ps LOCAL_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> > >
> > >         /* Store absolute values of inputs in zmm1.  */
> > > -       vmovaps TANHF_DATA(_sSignMask)(%rip), %zmm4
> > > +       vmovaps COMMON_DATA(_SignMask)(%rip), %zmm4
> > >         vandnps %zmm0, %zmm4, %zmm1
> > >         vsubps  {rn-sae}, %zmm5, %zmm1, %zmm1
> > >
> > > -       vmovaps TANHF_DATA(_sP6_lo)(%rip), %zmm5
> > > -       vpermt2ps TANHF_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> > > +       vmovaps LOCAL_DATA(_sP6_lo)(%rip), %zmm5
> > > +       vpermt2ps LOCAL_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> > >
> > > -       vmovaps TANHF_DATA(_sP5_lo)(%rip), %zmm6
> > > -       vpermt2ps TANHF_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> > > +       vmovaps LOCAL_DATA(_sP5_lo)(%rip), %zmm6
> > > +       vpermt2ps LOCAL_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> > >
> > >         vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm2
> > >         vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm2
> > >
> > > -       vmovaps TANHF_DATA(_sP4_lo)(%rip), %zmm7
> > > -       vpermt2ps TANHF_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> > > +       vmovaps LOCAL_DATA(_sP4_lo)(%rip), %zmm7
> > > +       vpermt2ps LOCAL_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> > >
> > > -       vmovaps TANHF_DATA(_sP3_lo)(%rip), %zmm8
> > > -       vpermt2ps TANHF_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> > > +       vmovaps LOCAL_DATA(_sP3_lo)(%rip), %zmm8
> > > +       vpermt2ps LOCAL_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> > >
> > >         vfmadd213ps {rn-sae}, %zmm7, %zmm1, %zmm2
> > >         vfmadd213ps {rn-sae}, %zmm8, %zmm1, %zmm2
> > >
> > > -       vmovaps TANHF_DATA(_sP2_lo)(%rip), %zmm9
> > > -       vpermt2ps TANHF_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> > > +       vmovaps LOCAL_DATA(_sP2_lo)(%rip), %zmm9
> > > +       vpermt2ps LOCAL_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> > >
> > > -       vmovaps TANHF_DATA(_sP0_lo)(%rip), %zmm10
> > > -       vpermt2ps TANHF_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> > > +       vmovaps LOCAL_DATA(_sP0_lo)(%rip), %zmm10
> > > +       vpermt2ps LOCAL_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> > >
> > >         vfmadd213ps {rn-sae}, %zmm9, %zmm1, %zmm2
> > >         vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm2
> > > @@ -167,7 +172,7 @@ ENTRY(_ZGVeN16v_tanhf_skx)
> > >
> > >         /* Go to special inputs processing branch.  */
> > >         jne     L(SPECIAL_VALUES_BRANCH)
> > > -       # LOE rbx r12 r13 r14 r15 zmm0 zmm2 zmm4
> > > +
> > >         /* Wait until after branch of write over zmm0.  */
> > >         vpternlogd $0xec, %zmm4, %zmm2, %zmm0
> > >
> > > @@ -176,24 +181,24 @@ ENTRY(_ZGVeN16v_tanhf_skx)
> > >
> > >         /* Cold case. edx has 1s where there was a special value that
> > >            needs to be handled by a tanhf call. Optimize for code size
> > > -          more so than speed here. */
> > > +          more so than speed here.  */
> > >  L(SPECIAL_VALUES_BRANCH):
> > > -       # LOE rbx rdx r12 r13 r14 r15 zmm0 zmm2 zmm4
> > > -    /* Use r13 to save/restore the stack. This allows us to use rbp as
> > > -       callee save register saving code size. */
> > > +
> > > +       /* Use r13 to save/restore the stack. This allows us to use rbp
> > > +          as callee save register saving code size.  */
> > >         pushq   %r13
> > > -       cfi_adjust_cfa_offset(8)
> > > -       cfi_offset(r13, -16)
> > > -       /* Need to callee save registers to preserve state across tanhf calls.
> > > -        */
> > > +       cfi_adjust_cfa_offset (8)
> > > +       cfi_offset (r13, -16)
> > > +       /* Need to callee save registers to preserve state across tanhf
> > > +          calls.  */
> > >         pushq   %rbx
> > > -       cfi_adjust_cfa_offset(8)
> > > -       cfi_offset(rbx, -24)
> > > +       cfi_adjust_cfa_offset (8)
> > > +       cfi_offset (rbx, -24)
> > >         pushq   %rbp
> > > -       cfi_adjust_cfa_offset(8)
> > > -       cfi_offset(rbp, -32)
> > > +       cfi_adjust_cfa_offset (8)
> > > +       cfi_offset (rbp, -32)
> > >         movq    %rsp, %r13
> > > -       cfi_def_cfa_register(r13)
> > > +       cfi_def_cfa_register (r13)
> > >
> > >         /* Align stack and make room for 2x zmm vectors.  */
> > >         andq    $-64, %rsp
> > > @@ -207,16 +212,17 @@ L(SPECIAL_VALUES_BRANCH):
> > >
> > >         vzeroupper
> > >
> > > -       /* edx has 1s where there was a special value that needs to be handled
> > > -          by a tanhf call.  */
> > > +       /* edx has 1s where there was a special value that needs to be
> > > +          handled by a tanhf call.  */
> > >         movl    %edx, %ebx
> > >  L(SPECIAL_VALUES_LOOP):
> > > -       # LOE rbx rbp r12 r13 r14 r15
> > > -       /* use rbp as index for special value that is saved across calls to
> > > -          tanhf. We technically don't need a callee save register here as offset
> > > -          to rsp is always [0, 56] so we can restore rsp by realigning to 64.
> > > -          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > > -          in the loop. Realigning also costs more code size.  */
> > > +
> > > +       /* use rbp as index for special value that is saved across calls
> > > +          to tanhf. We technically don't need a callee save register
> > > +          here as offset to rsp is always [0, 56] so we can restore
> > > +          rsp by realigning to 64. Essentially the tradeoff is 1 extra
> > > +          save/restore vs 2 extra instructions in the loop. Realigning
> > > +          also costs more code size.  */
> > >         xorl    %ebp, %ebp
> > >         tzcntl  %ebx, %ebp
> > >
> > > @@ -224,203 +230,141 @@ L(SPECIAL_VALUES_LOOP):
> > >         vmovss  64(%rsp, %rbp, 4), %xmm0
> > >         call    tanhf@PLT
> > >
> > > -       /* No good way to avoid the store-forwarding fault this will cause on
> > > -          return. `lfence` avoids the SF fault but at greater cost as it
> > > -          serialized stack/callee save restoration.  */
> > > +       /* No good way to avoid the store-forwarding fault this will
> > > +          cause on return. `lfence` avoids the SF fault but at greater
> > > +          cost as it serialized stack/callee save restoration.  */
> > >         vmovss  %xmm0, (%rsp, %rbp, 4)
> > >
> > > -       blsrl   %ebx, %ebx
> > > +       blsrl   %ebx, %ebx
> > >         jnz     L(SPECIAL_VALUES_LOOP)
> > > -       # LOE r12 r13 r14 r15
> > > +
> > >
> > >         /* All results have been written to (%rsp).  */
> > >         vmovaps (%rsp), %zmm0
> > >         /* Restore rsp.  */
> > >         movq    %r13, %rsp
> > > -       cfi_def_cfa_register(rsp)
> > > +       cfi_def_cfa_register (rsp)
> > >         /* Restore callee save registers.  */
> > >         popq    %rbp
> > > -       cfi_adjust_cfa_offset(-8)
> > > -       cfi_restore(rbp)
> > > +       cfi_adjust_cfa_offset (-8)
> > > +       cfi_restore (rbp)
> > >         popq    %rbx
> > > -       cfi_adjust_cfa_offset(-8)
> > > -       cfi_restore(rbp)
> > > +       cfi_adjust_cfa_offset (-8)
> > > +       cfi_restore (rbp)
> > >         popq    %r13
> > > -       cfi_adjust_cfa_offset(-8)
> > > -       cfi_restore(r13)
> > > +       cfi_adjust_cfa_offset (-8)
> > > +       cfi_restore (r13)
> > >         ret
> > >  END(_ZGVeN16v_tanhf_skx)
> > >
> > > -       .section .rodata, "a"
> > > +       .section .rodata.evex512, "a"
> > >         .align  16
> > > -#ifdef __svml_stanh_data_internal_typedef
> > > -typedef unsigned int VUINT32;
> > > -typedef struct
> > > -       {
> > > -       __declspec(align(4)) VUINT32 _iExpMantMask_UISA[1][1];
> > > -       __declspec(align(4)) VUINT32 _iMinIdxOfsMask_UISA[1][1];
> > > -       __declspec(align(4)) VUINT32 _iMaxIdxMask_UISA[1][1];
> > > -       __declspec(align(4)) VUINT32 _iExpMask[1][1];
> > > -       __declspec(align(64)) VUINT32 _sC_lo[16][1];
> > > -       __declspec(align(64)) VUINT32 _sC_hi[16][1];
> > > -       __declspec(align(64)) VUINT32 _sP7_lo[16][1];
> > > -       __declspec(align(64)) VUINT32 _sP7_hi[16][1];
> > > -       __declspec(align(64)) VUINT32 _sSignMask[16][1];
> > > -       __declspec(align(64)) VUINT32 _sP6_lo[16][1];
> > > -       __declspec(align(64)) VUINT32 _sP6_hi[16][1];
> > > -       __declspec(align(64)) VUINT32 _sP5_lo[16][1];
> > > -       __declspec(align(64)) VUINT32 _sP5_hi[16][1];
> > > -       __declspec(align(64)) VUINT32 _sP4_lo[16][1];
> > > -       __declspec(align(64)) VUINT32 _sP4_hi[16][1];
> > > -       __declspec(align(64)) VUINT32 _sP3_lo[16][1];
> > > -       __declspec(align(64)) VUINT32 _sP3_hi[16][1];
> > > -       __declspec(align(64)) VUINT32 _sP2_lo[16][1];
> > > -       __declspec(align(64)) VUINT32 _sP2_hi[16][1];
> > > -       __declspec(align(64)) VUINT32 _sP0_lo[16][1];
> > > -       __declspec(align(64)) VUINT32 _sP0_hi[16][1];
> > > -} __svml_stanh_data_internal;
> > > -#endif
> > > -
> > > -__svml_stanh_data_internal:
> > > -       .align  4
> > > -       /* _iExpMantMask_UISA */
> > > -       .long   0x7fe00000
> > > -
> > > -       .align  4
> > > -       /* _iMinIdxOfsMask_UISA */
> > > -       .long   0x3d400000
> > > -
> > > -       .align  4
> > > -       /* _iMaxIdxMask_UISA */
> > > -       .long   0x03e00000
> > > -
> > > -       .align  4
> > > -       /* _iExpMask */
> > > -       .long   0x7f000000
> > > -
> > > -       .align  64
> > > -__svml_stanh_data_internal_al64:
> > > -       .align  64
> > > -       /* _sC_lo */
> > > -       .long   0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
> > > -       .long   0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
> > > -       .long   0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
> > > -       .long   0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
> > > -
> > > -       .align  64
> > > -       /* _sC_hi */
> > > -       .long   0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
> > > -       .long   0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
> > > -       .long   0x40500000, 0x40700000, 0x40900000, 0x40b00000
> > > -       .long   0x40d00000, 0x40f00000, 0x41100000, 0x00000000
> > > -
> > > -       .align  64
> > > -       /* _sP7_lo */
> > > -       .long   0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
> > > -       .long   0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
> > > -       .long   0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
> > > -       .long   0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
> > > -
> > > -       .align  64
> > > -       /* _sP7_hi */
> > > -       .long   0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
> > > -       .long   0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
> > > -       .long   0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
> > > -       .long   0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
> > >
> > > -       .align  64
> > > -       /* _sSignMask */
> > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > -
> > > -       .align  64
> > > -       /* _sP6_lo */
> > > -       .long   0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
> > > -       .long   0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
> > > -       .long   0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
> > > -       .long   0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
> > > -
> > > -       .align  64
> > > -       /* _sP6_hi */
> > > -       .long   0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
> > > -       .long   0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
> > > -       .long   0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
> > > -       .long   0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
> > > -
> > > -       .align  64
> > > -       /* _sP5_lo */
> > > -       .long   0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
> > > -       .long   0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
> > > -       .long   0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
> > > -       .long   0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
> > > -
> > > -       .align  64
> > > -       /* _sP5_hi */
> > > -       .long   0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
> > > -       .long   0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
> > > -       .long   0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
> > > -       .long   0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
> > > -
> > > -       .align  64
> > > -       /* _sP4_lo */
> > > -       .long   0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
> > > -       .long   0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
> > > -       .long   0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
> > > -       .long   0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
> > > -
> > > -       .align  64
> > > -       /* _sP4_hi */
> > > -       .long   0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
> > > -       .long   0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
> > > -       .long   0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
> > > -       .long   0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
> > > -
> > > -       .align  64
> > > -       /* _sP3_lo */
> > > -       .long   0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
> > > -       .long   0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
> > > -       .long   0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
> > > -       .long   0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
> > > -
> > > -       .align  64
> > > -       /* _sP3_hi */
> > > -       .long   0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
> > > -       .long   0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
> > > -       .long   0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
> > > -       .long   0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
> > > -
> > > -       .align  64
> > > -       /* _sP2_lo */
> > > -       .long   0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
> > > -       .long   0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
> > > -       .long   0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
> > > -       .long   0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
> > > -
> > > -       .align  64
> > > -       /* _sP2_hi */
> > > -       .long   0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
> > > -       .long   0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
> > > -       .long   0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
> > > -       .long   0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
> > > -
> > > -       .align  64
> > > -       /* _sP0_lo */
> > > -       .long   0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
> > > -       .long   0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
> > > -       .long   0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
> > > -       .long   0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
> > > -
> > > -       .align  64
> > > -       /* _sP0_hi */
> > > -       .long   0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
> > > -       .long   0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
> > > -       .long   0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
> > > -       .long   0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
> > > +LOCAL_DATA_NAME_UNALIGNED:
> > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMantMask_UISA, 0x7fe00000)
> > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iMinIdxOfsMask_UISA, 0x3d400000)
> > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iMaxIdxMask_UISA, 0x03e00000)
> > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMask, 0x7f000000)
> > > +       .type   LOCAL_DATA_NAME_UNALIGNED, @object
> > > +       .size   LOCAL_DATA_NAME_UNALIGNED, .-LOCAL_DATA_NAME_UNALIGNED
> > >
> > >         .align  64
> > > -       .type   __svml_stanh_data_internal_al64, @object
> > > -       .size   __svml_stanh_data_internal_al64, .-__svml_stanh_data_internal_al64
> > > -       .type   __svml_stanh_data_internal, @object
> > > -       .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> > > +LOCAL_DATA_NAME:
> > > +       float_block (LOCAL_DATA_NAME, _sC_lo,
> > > +               0x00000000, 0x3d700000, 0x3d900000, 0x3db00000,
> > > +               0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000,
> > > +               0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000,
> > > +               0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000)
> > > +
> > > +       float_block (LOCAL_DATA_NAME, _sC_hi,
> > > +               0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000,
> > > +               0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000,
> > > +               0x40500000, 0x40700000, 0x40900000, 0x40b00000,
> > > +               0x40d00000, 0x40f00000, 0x41100000, 0x00000000)
> > > +
> > > +       float_block (LOCAL_DATA_NAME, _sP7_lo,
> > > +               0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e,
> > > +               0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57,
> > > +               0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f,
> > > +               0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0)
> > > +
> > > +       float_block (LOCAL_DATA_NAME, _sP7_hi,
> > > +               0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b,
> > > +               0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22,
> > > +               0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950,
> > > +               0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000)
> > > +
> > > +       float_block (LOCAL_DATA_NAME, _sP6_lo,
> > > +               0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756,
> > > +               0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0,
> > > +               0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17,
> > > +               0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad)
> > > +
> > > +       float_block (LOCAL_DATA_NAME, _sP6_hi,
> > > +               0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63,
> > > +               0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66,
> > > +               0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3,
> > > +               0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000)
> > > +
> > > +       float_block (LOCAL_DATA_NAME, _sP5_lo,
> > > +               0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d,
> > > +               0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670,
> > > +               0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405,
> > > +               0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4)
> > > +
> > > +       float_block (LOCAL_DATA_NAME, _sP5_hi,
> > > +               0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9,
> > > +               0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd,
> > > +               0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232,
> > > +               0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000)
> > > +
> > > +       float_block (LOCAL_DATA_NAME, _sP4_lo,
> > > +               0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120,
> > > +               0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a,
> > > +               0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88,
> > > +               0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e)
> > > +
> > > +       float_block (LOCAL_DATA_NAME, _sP4_hi,
> > > +               0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96,
> > > +               0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67,
> > > +               0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9,
> > > +               0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000)
> > > +
> > > +       float_block (LOCAL_DATA_NAME, _sP3_lo,
> > > +               0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d,
> > > +               0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3,
> > > +               0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca,
> > > +               0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92)
> > > +
> > > +       float_block (LOCAL_DATA_NAME, _sP3_hi,
> > > +               0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704,
> > > +               0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06,
> > > +               0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2,
> > > +               0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000)
> > > +
> > > +       float_block (LOCAL_DATA_NAME, _sP2_lo,
> > > +               0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f,
> > > +               0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580,
> > > +               0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92,
> > > +               0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360)
> > > +
> > > +       float_block (LOCAL_DATA_NAME, _sP2_hi,
> > > +               0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2,
> > > +               0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4,
> > > +               0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b,
> > > +               0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000)
> > > +
> > > +       float_block (LOCAL_DATA_NAME, _sP0_lo,
> > > +               0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169,
> > > +               0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984,
> > > +               0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163,
> > > +               0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0)
> > > +
> > > +       float_block (LOCAL_DATA_NAME, _sP0_hi,
> > > +               0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53,
> > > +               0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85,
> > > +               0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0,
> > > +               0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000)
> > > +
> > > +       .type   LOCAL_DATA_NAME, @object
> > > +       .size   LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
> > > --
> > > 2.34.1
> > >
> >
> > The data movement makes the assembler codes much harder to follow.
> > Sunil, what do you think of this patch series?
>
> What do you mean? The change in how we define rodata or the movement
> to multiple files or something else?

The glibc way to support data files for assembly codes is to define
data in C and use *.sym to generate offsets for assembly files, like

sysdeps/x86/cpu-features-offsets.sym:XSAVE_STATE_SIZE_OFFSET
offsetof (struct cpu_features, xsave_state_size)
sysdeps/x86_64/dl-trampoline.h:  sub
_rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip),
%RSP_LP
sysdeps/x86_64/dl-trampoline.h:  sub
_dl_x86_cpu_features+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP

-- 
H.J.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v1 07/27] x86/fpu: Update rodata usage in svml_s_tanhf16_core_avx512.S
  2022-12-16 21:37       ` H.J. Lu via Libc-alpha
@ 2022-12-16 21:51         ` Noah Goldstein via Libc-alpha
  2022-12-16 22:01           ` H.J. Lu via Libc-alpha
  0 siblings, 1 reply; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2022-12-16 21:51 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Sunil K Pandey, libc-alpha, andrey.kolesov, carlos

On Fri, Dec 16, 2022 at 1:38 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Dec 16, 2022 at 10:18 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Fri, Dec 16, 2022 at 9:06 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Wed, Dec 7, 2022 at 12:52 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > No changes to the logic, just change how rodata is handled.
> > > >
> > > > 1. Define the rodatas using the new macros so they check that the
> > > >    offset is correct.
> > > >
> > > > 2. Use common data where applicable.
> > > > ---
> > > >  .../multiarch/svml_s_tanhf16_core_avx512.S    | 450 ++++++++----------
> > > >  1 file changed, 197 insertions(+), 253 deletions(-)
> > > >
> > > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > > index d74fc7731d..765e9ed7f7 100644
> > > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > > @@ -70,94 +70,99 @@
> > > >   *
> > > >   */
> > > >
> > > > -/* Offsets for data table __svml_stanh_data_internal_avx512. Ordered
> > > > -   by use in the function. On cold-starts this might help the
> > > > -   prefetcher. Possibly a better idea is to interleave start/end so
> > > > -   that the prefetcher is less likely to detect a stream and pull
> > > > -   irrelivant lines into cache.  */
> > > >
> > > > -/* Offsets for data table __svml_stanh_data_internal. 4 bytes each.
> > > > - */
> > > > +
> > > > +#define LOCAL_DATA_NAME        __svml_stanh_data_internal
> > > > +#define LOCAL_DATA_NAME_UNALIGNED      __svml_stanh_data_internal_unaligned
> > > > +#include "svml_s_common_evex512_rodata_offsets.h"
> > > > +
> > > > +/* Offsets for data table __svml_stanh_data_internal_unaligned.
> > > > +   4 bytes each.  */
> > > >  #define _iExpMantMask_UISA             0
> > > >  #define _iMinIdxOfsMask_UISA           4
> > > >  #define _iMaxIdxMask_UISA              8
> > > >  #define _iExpMask                      12
> > > >
> > > > -/* Offsets for data table __svml_stanh_data_internal_al64. 64 bytes
> > > > -   each.  */
> > > > -#define _sC_lo                         0
> > > > -#define _sC_hi                         64
> > > > -#define _sP7_lo                                128
> > > > -#define _sP7_hi                                192
> > > > -#define _sSignMask                     256
> > > > -#define _sP6_lo                                320
> > > > -#define _sP6_hi                                384
> > > > -#define _sP5_lo                                448
> > > > -#define _sP5_hi                                512
> > > > -#define _sP4_lo                                576
> > > > -#define _sP4_hi                                640
> > > > -#define _sP3_lo                                704
> > > > -#define _sP3_hi                                768
> > > > -#define _sP2_lo                                832
> > > > -#define _sP2_hi                                896
> > > > -#define _sP0_lo                                960
> > > > -#define _sP0_hi                                1024
> > > > +/* Offsets for data table __svml_stanh_data_internal. Ordered
> > > > +   by use in the function. On cold-starts this might help the
> > > > +   prefetcher. Possibly a better idea is to interleave start/end so
> > > > +   that the prefetcher is less likely to detect a stream and pull
> > > > +   irrelivant lines into cache.  */
> > > > +
> > > > +/* Offsets for data table __svml_stanh_data_internal.
> > > > +   64 bytes each.  */
> > > > +#define _sC_lo 0
> > > > +#define _sC_hi 64
> > > > +#define _sP7_lo        128
> > > > +#define _sP7_hi        192
> > > > +#define _sP6_lo        256
> > > > +#define _sP6_hi        320
> > > > +#define _sP5_lo        384
> > > > +#define _sP5_hi        448
> > > > +#define _sP4_lo        512
> > > > +#define _sP4_hi        576
> > > > +#define _sP3_lo        640
> > > > +#define _sP3_hi        704
> > > > +#define _sP2_lo        768
> > > > +#define _sP2_hi        832
> > > > +#define _sP0_lo        896
> > > > +#define _sP0_hi        960
> > > > +
> > > >
> > > >  #include <sysdep.h>
> > > > -#define TANHF_DATA(x)                  ((x)+__svml_stanh_data_internal_al64)
> > > > -#define TANHF_DATA_UNALIGNED(x)                ((x)+__svml_stanh_data_internal)
> > > >
> > > >         .section .text.evex512, "ax", @progbits
> > > >  ENTRY(_ZGVeN16v_tanhf_skx)
> > > > -       /* Here huge arguments, INF and NaNs are filtered out to callout. */
> > > > -       vpandd  TANHF_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> > > > -       vpsubd  TANHF_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
> > > > +       /* Here huge arguments, INF and NaNs are filtered out to
> > > > +          callout.  */
> > > > +       vpandd  LOCAL_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> > > > +       vpsubd  LOCAL_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
> > > >
> > > >         /* Selection arguments between [0, 0x03e00000] into zmm3.  */
> > > >         vpxord  %zmm3, %zmm3, %zmm3
> > > >         vpmaxsd %zmm3, %zmm2, %zmm3
> > > > -       vpminsd TANHF_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
> > > > +       vpminsd LOCAL_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
> > > >
> > > >         /* Setup permute indices in zmm3.  */
> > > >         vpsrld  $21, %zmm3, %zmm3
> > > >
> > > >         /* Store if there are any special cases in k1.  */
> > > > -       vpcmpd  $6, TANHF_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
> > > > +       vpcmpd  $6, LOCAL_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
> > > >
> > > > -       vmovaps TANHF_DATA(_sC_lo)(%rip), %zmm5
> > > > -       vpermt2ps TANHF_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> > > > +       vmovaps LOCAL_DATA(_sC_lo)(%rip), %zmm5
> > > > +       vpermt2ps LOCAL_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> > > >
> > > > -       vmovaps TANHF_DATA(_sP7_lo)(%rip), %zmm2
> > > > -       vpermt2ps TANHF_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> > > > +       vmovaps LOCAL_DATA(_sP7_lo)(%rip), %zmm2
> > > > +       vpermt2ps LOCAL_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> > > >
> > > >         /* Store absolute values of inputs in zmm1.  */
> > > > -       vmovaps TANHF_DATA(_sSignMask)(%rip), %zmm4
> > > > +       vmovaps COMMON_DATA(_SignMask)(%rip), %zmm4
> > > >         vandnps %zmm0, %zmm4, %zmm1
> > > >         vsubps  {rn-sae}, %zmm5, %zmm1, %zmm1
> > > >
> > > > -       vmovaps TANHF_DATA(_sP6_lo)(%rip), %zmm5
> > > > -       vpermt2ps TANHF_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> > > > +       vmovaps LOCAL_DATA(_sP6_lo)(%rip), %zmm5
> > > > +       vpermt2ps LOCAL_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> > > >
> > > > -       vmovaps TANHF_DATA(_sP5_lo)(%rip), %zmm6
> > > > -       vpermt2ps TANHF_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> > > > +       vmovaps LOCAL_DATA(_sP5_lo)(%rip), %zmm6
> > > > +       vpermt2ps LOCAL_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> > > >
> > > >         vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm2
> > > >         vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm2
> > > >
> > > > -       vmovaps TANHF_DATA(_sP4_lo)(%rip), %zmm7
> > > > -       vpermt2ps TANHF_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> > > > +       vmovaps LOCAL_DATA(_sP4_lo)(%rip), %zmm7
> > > > +       vpermt2ps LOCAL_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> > > >
> > > > -       vmovaps TANHF_DATA(_sP3_lo)(%rip), %zmm8
> > > > -       vpermt2ps TANHF_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> > > > +       vmovaps LOCAL_DATA(_sP3_lo)(%rip), %zmm8
> > > > +       vpermt2ps LOCAL_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> > > >
> > > >         vfmadd213ps {rn-sae}, %zmm7, %zmm1, %zmm2
> > > >         vfmadd213ps {rn-sae}, %zmm8, %zmm1, %zmm2
> > > >
> > > > -       vmovaps TANHF_DATA(_sP2_lo)(%rip), %zmm9
> > > > -       vpermt2ps TANHF_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> > > > +       vmovaps LOCAL_DATA(_sP2_lo)(%rip), %zmm9
> > > > +       vpermt2ps LOCAL_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> > > >
> > > > -       vmovaps TANHF_DATA(_sP0_lo)(%rip), %zmm10
> > > > -       vpermt2ps TANHF_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> > > > +       vmovaps LOCAL_DATA(_sP0_lo)(%rip), %zmm10
> > > > +       vpermt2ps LOCAL_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> > > >
> > > >         vfmadd213ps {rn-sae}, %zmm9, %zmm1, %zmm2
> > > >         vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm2
> > > > @@ -167,7 +172,7 @@ ENTRY(_ZGVeN16v_tanhf_skx)
> > > >
> > > >         /* Go to special inputs processing branch.  */
> > > >         jne     L(SPECIAL_VALUES_BRANCH)
> > > > -       # LOE rbx r12 r13 r14 r15 zmm0 zmm2 zmm4
> > > > +
> > > >         /* Wait until after branch of write over zmm0.  */
> > > >         vpternlogd $0xec, %zmm4, %zmm2, %zmm0
> > > >
> > > > @@ -176,24 +181,24 @@ ENTRY(_ZGVeN16v_tanhf_skx)
> > > >
> > > >         /* Cold case. edx has 1s where there was a special value that
> > > >            needs to be handled by a tanhf call. Optimize for code size
> > > > -          more so than speed here. */
> > > > +          more so than speed here.  */
> > > >  L(SPECIAL_VALUES_BRANCH):
> > > > -       # LOE rbx rdx r12 r13 r14 r15 zmm0 zmm2 zmm4
> > > > -    /* Use r13 to save/restore the stack. This allows us to use rbp as
> > > > -       callee save register saving code size. */
> > > > +
> > > > +       /* Use r13 to save/restore the stack. This allows us to use rbp
> > > > +          as callee save register saving code size.  */
> > > >         pushq   %r13
> > > > -       cfi_adjust_cfa_offset(8)
> > > > -       cfi_offset(r13, -16)
> > > > -       /* Need to callee save registers to preserve state across tanhf calls.
> > > > -        */
> > > > +       cfi_adjust_cfa_offset (8)
> > > > +       cfi_offset (r13, -16)
> > > > +       /* Need to callee save registers to preserve state across tanhf
> > > > +          calls.  */
> > > >         pushq   %rbx
> > > > -       cfi_adjust_cfa_offset(8)
> > > > -       cfi_offset(rbx, -24)
> > > > +       cfi_adjust_cfa_offset (8)
> > > > +       cfi_offset (rbx, -24)
> > > >         pushq   %rbp
> > > > -       cfi_adjust_cfa_offset(8)
> > > > -       cfi_offset(rbp, -32)
> > > > +       cfi_adjust_cfa_offset (8)
> > > > +       cfi_offset (rbp, -32)
> > > >         movq    %rsp, %r13
> > > > -       cfi_def_cfa_register(r13)
> > > > +       cfi_def_cfa_register (r13)
> > > >
> > > >         /* Align stack and make room for 2x zmm vectors.  */
> > > >         andq    $-64, %rsp
> > > > @@ -207,16 +212,17 @@ L(SPECIAL_VALUES_BRANCH):
> > > >
> > > >         vzeroupper
> > > >
> > > > -       /* edx has 1s where there was a special value that needs to be handled
> > > > -          by a tanhf call.  */
> > > > +       /* edx has 1s where there was a special value that needs to be
> > > > +          handled by a tanhf call.  */
> > > >         movl    %edx, %ebx
> > > >  L(SPECIAL_VALUES_LOOP):
> > > > -       # LOE rbx rbp r12 r13 r14 r15
> > > > -       /* use rbp as index for special value that is saved across calls to
> > > > -          tanhf. We technically don't need a callee save register here as offset
> > > > -          to rsp is always [0, 56] so we can restore rsp by realigning to 64.
> > > > -          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > > > -          in the loop. Realigning also costs more code size.  */
> > > > +
> > > > +       /* use rbp as index for special value that is saved across calls
> > > > +          to tanhf. We technically don't need a callee save register
> > > > +          here as offset to rsp is always [0, 56] so we can restore
> > > > +          rsp by realigning to 64. Essentially the tradeoff is 1 extra
> > > > +          save/restore vs 2 extra instructions in the loop. Realigning
> > > > +          also costs more code size.  */
> > > >         xorl    %ebp, %ebp
> > > >         tzcntl  %ebx, %ebp
> > > >
> > > > @@ -224,203 +230,141 @@ L(SPECIAL_VALUES_LOOP):
> > > >         vmovss  64(%rsp, %rbp, 4), %xmm0
> > > >         call    tanhf@PLT
> > > >
> > > > -       /* No good way to avoid the store-forwarding fault this will cause on
> > > > -          return. `lfence` avoids the SF fault but at greater cost as it
> > > > -          serialized stack/callee save restoration.  */
> > > > +       /* No good way to avoid the store-forwarding fault this will
> > > > +          cause on return. `lfence` avoids the SF fault but at greater
> > > > +          cost as it serialized stack/callee save restoration.  */
> > > >         vmovss  %xmm0, (%rsp, %rbp, 4)
> > > >
> > > > -       blsrl   %ebx, %ebx
> > > > +       blsrl   %ebx, %ebx
> > > >         jnz     L(SPECIAL_VALUES_LOOP)
> > > > -       # LOE r12 r13 r14 r15
> > > > +
> > > >
> > > >         /* All results have been written to (%rsp).  */
> > > >         vmovaps (%rsp), %zmm0
> > > >         /* Restore rsp.  */
> > > >         movq    %r13, %rsp
> > > > -       cfi_def_cfa_register(rsp)
> > > > +       cfi_def_cfa_register (rsp)
> > > >         /* Restore callee save registers.  */
> > > >         popq    %rbp
> > > > -       cfi_adjust_cfa_offset(-8)
> > > > -       cfi_restore(rbp)
> > > > +       cfi_adjust_cfa_offset (-8)
> > > > +       cfi_restore (rbp)
> > > >         popq    %rbx
> > > > -       cfi_adjust_cfa_offset(-8)
> > > > -       cfi_restore(rbp)
> > > > +       cfi_adjust_cfa_offset (-8)
> > > > +       cfi_restore (rbp)
> > > >         popq    %r13
> > > > -       cfi_adjust_cfa_offset(-8)
> > > > -       cfi_restore(r13)
> > > > +       cfi_adjust_cfa_offset (-8)
> > > > +       cfi_restore (r13)
> > > >         ret
> > > >  END(_ZGVeN16v_tanhf_skx)
> > > >
> > > > -       .section .rodata, "a"
> > > > +       .section .rodata.evex512, "a"
> > > >         .align  16
> > > > -#ifdef __svml_stanh_data_internal_typedef
> > > > -typedef unsigned int VUINT32;
> > > > -typedef struct
> > > > -       {
> > > > -       __declspec(align(4)) VUINT32 _iExpMantMask_UISA[1][1];
> > > > -       __declspec(align(4)) VUINT32 _iMinIdxOfsMask_UISA[1][1];
> > > > -       __declspec(align(4)) VUINT32 _iMaxIdxMask_UISA[1][1];
> > > > -       __declspec(align(4)) VUINT32 _iExpMask[1][1];
> > > > -       __declspec(align(64)) VUINT32 _sC_lo[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sC_hi[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sP7_lo[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sP7_hi[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sSignMask[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sP6_lo[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sP6_hi[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sP5_lo[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sP5_hi[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sP4_lo[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sP4_hi[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sP3_lo[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sP3_hi[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sP2_lo[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sP2_hi[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sP0_lo[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sP0_hi[16][1];
> > > > -} __svml_stanh_data_internal;
> > > > -#endif
> > > > -
> > > > -__svml_stanh_data_internal:
> > > > -       .align  4
> > > > -       /* _iExpMantMask_UISA */
> > > > -       .long   0x7fe00000
> > > > -
> > > > -       .align  4
> > > > -       /* _iMinIdxOfsMask_UISA */
> > > > -       .long   0x3d400000
> > > > -
> > > > -       .align  4
> > > > -       /* _iMaxIdxMask_UISA */
> > > > -       .long   0x03e00000
> > > > -
> > > > -       .align  4
> > > > -       /* _iExpMask */
> > > > -       .long   0x7f000000
> > > > -
> > > > -       .align  64
> > > > -__svml_stanh_data_internal_al64:
> > > > -       .align  64
> > > > -       /* _sC_lo */
> > > > -       .long   0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
> > > > -       .long   0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
> > > > -       .long   0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
> > > > -       .long   0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
> > > > -
> > > > -       .align  64
> > > > -       /* _sC_hi */
> > > > -       .long   0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
> > > > -       .long   0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
> > > > -       .long   0x40500000, 0x40700000, 0x40900000, 0x40b00000
> > > > -       .long   0x40d00000, 0x40f00000, 0x41100000, 0x00000000
> > > > -
> > > > -       .align  64
> > > > -       /* _sP7_lo */
> > > > -       .long   0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
> > > > -       .long   0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
> > > > -       .long   0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
> > > > -       .long   0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
> > > > -
> > > > -       .align  64
> > > > -       /* _sP7_hi */
> > > > -       .long   0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
> > > > -       .long   0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
> > > > -       .long   0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
> > > > -       .long   0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
> > > >
> > > > -       .align  64
> > > > -       /* _sSignMask */
> > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > -
> > > > -       .align  64
> > > > -       /* _sP6_lo */
> > > > -       .long   0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
> > > > -       .long   0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
> > > > -       .long   0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
> > > > -       .long   0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
> > > > -
> > > > -       .align  64
> > > > -       /* _sP6_hi */
> > > > -       .long   0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
> > > > -       .long   0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
> > > > -       .long   0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
> > > > -       .long   0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
> > > > -
> > > > -       .align  64
> > > > -       /* _sP5_lo */
> > > > -       .long   0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
> > > > -       .long   0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
> > > > -       .long   0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
> > > > -       .long   0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
> > > > -
> > > > -       .align  64
> > > > -       /* _sP5_hi */
> > > > -       .long   0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
> > > > -       .long   0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
> > > > -       .long   0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
> > > > -       .long   0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
> > > > -
> > > > -       .align  64
> > > > -       /* _sP4_lo */
> > > > -       .long   0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
> > > > -       .long   0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
> > > > -       .long   0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
> > > > -       .long   0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
> > > > -
> > > > -       .align  64
> > > > -       /* _sP4_hi */
> > > > -       .long   0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
> > > > -       .long   0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
> > > > -       .long   0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
> > > > -       .long   0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
> > > > -
> > > > -       .align  64
> > > > -       /* _sP3_lo */
> > > > -       .long   0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
> > > > -       .long   0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
> > > > -       .long   0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
> > > > -       .long   0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
> > > > -
> > > > -       .align  64
> > > > -       /* _sP3_hi */
> > > > -       .long   0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
> > > > -       .long   0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
> > > > -       .long   0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
> > > > -       .long   0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
> > > > -
> > > > -       .align  64
> > > > -       /* _sP2_lo */
> > > > -       .long   0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
> > > > -       .long   0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
> > > > -       .long   0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
> > > > -       .long   0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
> > > > -
> > > > -       .align  64
> > > > -       /* _sP2_hi */
> > > > -       .long   0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
> > > > -       .long   0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
> > > > -       .long   0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
> > > > -       .long   0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
> > > > -
> > > > -       .align  64
> > > > -       /* _sP0_lo */
> > > > -       .long   0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
> > > > -       .long   0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
> > > > -       .long   0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
> > > > -       .long   0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
> > > > -
> > > > -       .align  64
> > > > -       /* _sP0_hi */
> > > > -       .long   0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
> > > > -       .long   0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
> > > > -       .long   0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
> > > > -       .long   0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
> > > > +LOCAL_DATA_NAME_UNALIGNED:
> > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMantMask_UISA, 0x7fe00000)
> > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iMinIdxOfsMask_UISA, 0x3d400000)
> > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iMaxIdxMask_UISA, 0x03e00000)
> > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMask, 0x7f000000)
> > > > +       .type   LOCAL_DATA_NAME_UNALIGNED, @object
> > > > +       .size   LOCAL_DATA_NAME_UNALIGNED, .-LOCAL_DATA_NAME_UNALIGNED
> > > >
> > > >         .align  64
> > > > -       .type   __svml_stanh_data_internal_al64, @object
> > > > -       .size   __svml_stanh_data_internal_al64, .-__svml_stanh_data_internal_al64
> > > > -       .type   __svml_stanh_data_internal, @object
> > > > -       .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> > > > +LOCAL_DATA_NAME:
> > > > +       float_block (LOCAL_DATA_NAME, _sC_lo,
> > > > +               0x00000000, 0x3d700000, 0x3d900000, 0x3db00000,
> > > > +               0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000,
> > > > +               0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000,
> > > > +               0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000)
> > > > +
> > > > +       float_block (LOCAL_DATA_NAME, _sC_hi,
> > > > +               0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000,
> > > > +               0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000,
> > > > +               0x40500000, 0x40700000, 0x40900000, 0x40b00000,
> > > > +               0x40d00000, 0x40f00000, 0x41100000, 0x00000000)
> > > > +
> > > > +       float_block (LOCAL_DATA_NAME, _sP7_lo,
> > > > +               0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e,
> > > > +               0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57,
> > > > +               0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f,
> > > > +               0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0)
> > > > +
> > > > +       float_block (LOCAL_DATA_NAME, _sP7_hi,
> > > > +               0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b,
> > > > +               0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22,
> > > > +               0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950,
> > > > +               0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000)
> > > > +
> > > > +       float_block (LOCAL_DATA_NAME, _sP6_lo,
> > > > +               0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756,
> > > > +               0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0,
> > > > +               0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17,
> > > > +               0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad)
> > > > +
> > > > +       float_block (LOCAL_DATA_NAME, _sP6_hi,
> > > > +               0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63,
> > > > +               0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66,
> > > > +               0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3,
> > > > +               0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000)
> > > > +
> > > > +       float_block (LOCAL_DATA_NAME, _sP5_lo,
> > > > +               0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d,
> > > > +               0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670,
> > > > +               0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405,
> > > > +               0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4)
> > > > +
> > > > +       float_block (LOCAL_DATA_NAME, _sP5_hi,
> > > > +               0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9,
> > > > +               0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd,
> > > > +               0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232,
> > > > +               0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000)
> > > > +
> > > > +       float_block (LOCAL_DATA_NAME, _sP4_lo,
> > > > +               0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120,
> > > > +               0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a,
> > > > +               0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88,
> > > > +               0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e)
> > > > +
> > > > +       float_block (LOCAL_DATA_NAME, _sP4_hi,
> > > > +               0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96,
> > > > +               0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67,
> > > > +               0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9,
> > > > +               0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000)
> > > > +
> > > > +       float_block (LOCAL_DATA_NAME, _sP3_lo,
> > > > +               0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d,
> > > > +               0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3,
> > > > +               0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca,
> > > > +               0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92)
> > > > +
> > > > +       float_block (LOCAL_DATA_NAME, _sP3_hi,
> > > > +               0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704,
> > > > +               0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06,
> > > > +               0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2,
> > > > +               0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000)
> > > > +
> > > > +       float_block (LOCAL_DATA_NAME, _sP2_lo,
> > > > +               0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f,
> > > > +               0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580,
> > > > +               0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92,
> > > > +               0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360)
> > > > +
> > > > +       float_block (LOCAL_DATA_NAME, _sP2_hi,
> > > > +               0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2,
> > > > +               0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4,
> > > > +               0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b,
> > > > +               0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000)
> > > > +
> > > > +       float_block (LOCAL_DATA_NAME, _sP0_lo,
> > > > +               0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169,
> > > > +               0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984,
> > > > +               0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163,
> > > > +               0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0)
> > > > +
> > > > +       float_block (LOCAL_DATA_NAME, _sP0_hi,
> > > > +               0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53,
> > > > +               0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85,
> > > > +               0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0,
> > > > +               0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000)
> > > > +
> > > > +       .type   LOCAL_DATA_NAME, @object
> > > > +       .size   LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
> > > > --
> > > > 2.34.1
> > > >
> > >
> > > The data movement makes the assembler codes much harder to follow.
> > > Sunil, what do you think of this patch series?
> >
> > What do you mean? The change in how we define rodata or the movement
> > to multiple files or something else?
>
> The glibc way to support data files for assembly codes is to define
> data in C and use *.sym to generate offsets for assembly files, like

I see. Although to be fair the entire SVML codebase bucks that trend.

Seems like a more dramatic change to move all the offsets to C.
>
> sysdeps/x86/cpu-features-offsets.sym:XSAVE_STATE_SIZE_OFFSET
> offsetof (struct cpu_features, xsave_state_size)
> sysdeps/x86_64/dl-trampoline.h:  sub
> _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip),
> %RSP_LP
> sysdeps/x86_64/dl-trampoline.h:  sub
> _dl_x86_cpu_features+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v1 07/27] x86/fpu: Update rodata usage in svml_s_tanhf16_core_avx512.S
  2022-12-16 21:51         ` Noah Goldstein via Libc-alpha
@ 2022-12-16 22:01           ` H.J. Lu via Libc-alpha
  2022-12-16 22:54             ` Sunil Pandey via Libc-alpha
  2023-06-27 18:23             ` Noah Goldstein via Libc-alpha
  0 siblings, 2 replies; 38+ messages in thread
From: H.J. Lu via Libc-alpha @ 2022-12-16 22:01 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: Sunil K Pandey, libc-alpha, andrey.kolesov, carlos

On Fri, Dec 16, 2022 at 1:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Fri, Dec 16, 2022 at 1:38 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Fri, Dec 16, 2022 at 10:18 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > On Fri, Dec 16, 2022 at 9:06 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Wed, Dec 7, 2022 at 12:52 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > >
> > > > > No changes to the logic, just change how rodata is handled.
> > > > >
> > > > > 1. Define the rodatas using the new macros so they check that the
> > > > >    offset is correct.
> > > > >
> > > > > 2. Use common data where applicable.
> > > > > ---
> > > > >  .../multiarch/svml_s_tanhf16_core_avx512.S    | 450 ++++++++----------
> > > > >  1 file changed, 197 insertions(+), 253 deletions(-)
> > > > >
> > > > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > > > index d74fc7731d..765e9ed7f7 100644
> > > > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > > > @@ -70,94 +70,99 @@
> > > > >   *
> > > > >   */
> > > > >
> > > > > -/* Offsets for data table __svml_stanh_data_internal_avx512. Ordered
> > > > > -   by use in the function. On cold-starts this might help the
> > > > > -   prefetcher. Possibly a better idea is to interleave start/end so
> > > > > -   that the prefetcher is less likely to detect a stream and pull
> > > > >    irrelevant lines into cache.  */
> > > > >
> > > > > -/* Offsets for data table __svml_stanh_data_internal. 4 bytes each.
> > > > > - */
> > > > > +
> > > > > +#define LOCAL_DATA_NAME        __svml_stanh_data_internal
> > > > > +#define LOCAL_DATA_NAME_UNALIGNED      __svml_stanh_data_internal_unaligned
> > > > > +#include "svml_s_common_evex512_rodata_offsets.h"
> > > > > +
> > > > > +/* Offsets for data table __svml_stanh_data_internal_unaligned.
> > > > > +   4 bytes each.  */
> > > > >  #define _iExpMantMask_UISA             0
> > > > >  #define _iMinIdxOfsMask_UISA           4
> > > > >  #define _iMaxIdxMask_UISA              8
> > > > >  #define _iExpMask                      12
> > > > >
> > > > > -/* Offsets for data table __svml_stanh_data_internal_al64. 64 bytes
> > > > > -   each.  */
> > > > > -#define _sC_lo                         0
> > > > > -#define _sC_hi                         64
> > > > > -#define _sP7_lo                                128
> > > > > -#define _sP7_hi                                192
> > > > > -#define _sSignMask                     256
> > > > > -#define _sP6_lo                                320
> > > > > -#define _sP6_hi                                384
> > > > > -#define _sP5_lo                                448
> > > > > -#define _sP5_hi                                512
> > > > > -#define _sP4_lo                                576
> > > > > -#define _sP4_hi                                640
> > > > > -#define _sP3_lo                                704
> > > > > -#define _sP3_hi                                768
> > > > > -#define _sP2_lo                                832
> > > > > -#define _sP2_hi                                896
> > > > > -#define _sP0_lo                                960
> > > > > -#define _sP0_hi                                1024
> > > > > +/* Offsets for data table __svml_stanh_data_internal. Ordered
> > > > > +   by use in the function. On cold-starts this might help the
> > > > > +   prefetcher. Possibly a better idea is to interleave start/end so
> > > > > +   that the prefetcher is less likely to detect a stream and pull
> > > > > +   irrelevant lines into cache.  */
> > > > > +
> > > > > +/* Offsets for data table __svml_stanh_data_internal.
> > > > > +   64 bytes each.  */
> > > > > +#define _sC_lo 0
> > > > > +#define _sC_hi 64
> > > > > +#define _sP7_lo        128
> > > > > +#define _sP7_hi        192
> > > > > +#define _sP6_lo        256
> > > > > +#define _sP6_hi        320
> > > > > +#define _sP5_lo        384
> > > > > +#define _sP5_hi        448
> > > > > +#define _sP4_lo        512
> > > > > +#define _sP4_hi        576
> > > > > +#define _sP3_lo        640
> > > > > +#define _sP3_hi        704
> > > > > +#define _sP2_lo        768
> > > > > +#define _sP2_hi        832
> > > > > +#define _sP0_lo        896
> > > > > +#define _sP0_hi        960
> > > > > +
> > > > >
> > > > >  #include <sysdep.h>
> > > > > -#define TANHF_DATA(x)                  ((x)+__svml_stanh_data_internal_al64)
> > > > > -#define TANHF_DATA_UNALIGNED(x)                ((x)+__svml_stanh_data_internal)
> > > > >
> > > > >         .section .text.evex512, "ax", @progbits
> > > > >  ENTRY(_ZGVeN16v_tanhf_skx)
> > > > > -       /* Here huge arguments, INF and NaNs are filtered out to callout. */
> > > > > -       vpandd  TANHF_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> > > > > -       vpsubd  TANHF_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
> > > > > +       /* Here huge arguments, INF and NaNs are filtered out to
> > > > > +          callout.  */
> > > > > +       vpandd  LOCAL_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> > > > > +       vpsubd  LOCAL_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
> > > > >
> > > > >         /* Selection arguments between [0, 0x03e00000] into zmm3.  */
> > > > >         vpxord  %zmm3, %zmm3, %zmm3
> > > > >         vpmaxsd %zmm3, %zmm2, %zmm3
> > > > > -       vpminsd TANHF_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
> > > > > +       vpminsd LOCAL_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
> > > > >
> > > > >         /* Setup permute indices in zmm3.  */
> > > > >         vpsrld  $21, %zmm3, %zmm3
> > > > >
> > > > >         /* Store if there are any special cases in k1.  */
> > > > > -       vpcmpd  $6, TANHF_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
> > > > > +       vpcmpd  $6, LOCAL_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
> > > > >
> > > > > -       vmovaps TANHF_DATA(_sC_lo)(%rip), %zmm5
> > > > > -       vpermt2ps TANHF_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> > > > > +       vmovaps LOCAL_DATA(_sC_lo)(%rip), %zmm5
> > > > > +       vpermt2ps LOCAL_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> > > > >
> > > > > -       vmovaps TANHF_DATA(_sP7_lo)(%rip), %zmm2
> > > > > -       vpermt2ps TANHF_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> > > > > +       vmovaps LOCAL_DATA(_sP7_lo)(%rip), %zmm2
> > > > > +       vpermt2ps LOCAL_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> > > > >
> > > > >         /* Store absolute values of inputs in zmm1.  */
> > > > > -       vmovaps TANHF_DATA(_sSignMask)(%rip), %zmm4
> > > > > +       vmovaps COMMON_DATA(_SignMask)(%rip), %zmm4
> > > > >         vandnps %zmm0, %zmm4, %zmm1
> > > > >         vsubps  {rn-sae}, %zmm5, %zmm1, %zmm1
> > > > >
> > > > > -       vmovaps TANHF_DATA(_sP6_lo)(%rip), %zmm5
> > > > > -       vpermt2ps TANHF_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> > > > > +       vmovaps LOCAL_DATA(_sP6_lo)(%rip), %zmm5
> > > > > +       vpermt2ps LOCAL_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> > > > >
> > > > > -       vmovaps TANHF_DATA(_sP5_lo)(%rip), %zmm6
> > > > > -       vpermt2ps TANHF_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> > > > > +       vmovaps LOCAL_DATA(_sP5_lo)(%rip), %zmm6
> > > > > +       vpermt2ps LOCAL_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> > > > >
> > > > >         vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm2
> > > > >         vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm2
> > > > >
> > > > > -       vmovaps TANHF_DATA(_sP4_lo)(%rip), %zmm7
> > > > > -       vpermt2ps TANHF_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> > > > > +       vmovaps LOCAL_DATA(_sP4_lo)(%rip), %zmm7
> > > > > +       vpermt2ps LOCAL_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> > > > >
> > > > > -       vmovaps TANHF_DATA(_sP3_lo)(%rip), %zmm8
> > > > > -       vpermt2ps TANHF_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> > > > > +       vmovaps LOCAL_DATA(_sP3_lo)(%rip), %zmm8
> > > > > +       vpermt2ps LOCAL_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> > > > >
> > > > >         vfmadd213ps {rn-sae}, %zmm7, %zmm1, %zmm2
> > > > >         vfmadd213ps {rn-sae}, %zmm8, %zmm1, %zmm2
> > > > >
> > > > > -       vmovaps TANHF_DATA(_sP2_lo)(%rip), %zmm9
> > > > > -       vpermt2ps TANHF_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> > > > > +       vmovaps LOCAL_DATA(_sP2_lo)(%rip), %zmm9
> > > > > +       vpermt2ps LOCAL_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> > > > >
> > > > > -       vmovaps TANHF_DATA(_sP0_lo)(%rip), %zmm10
> > > > > -       vpermt2ps TANHF_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> > > > > +       vmovaps LOCAL_DATA(_sP0_lo)(%rip), %zmm10
> > > > > +       vpermt2ps LOCAL_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> > > > >
> > > > >         vfmadd213ps {rn-sae}, %zmm9, %zmm1, %zmm2
> > > > >         vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm2
> > > > > @@ -167,7 +172,7 @@ ENTRY(_ZGVeN16v_tanhf_skx)
> > > > >
> > > > >         /* Go to special inputs processing branch.  */
> > > > >         jne     L(SPECIAL_VALUES_BRANCH)
> > > > > -       # LOE rbx r12 r13 r14 r15 zmm0 zmm2 zmm4
> > > > > +
> > > > >         /* Wait until after branch of write over zmm0.  */
> > > > >         vpternlogd $0xec, %zmm4, %zmm2, %zmm0
> > > > >
> > > > > @@ -176,24 +181,24 @@ ENTRY(_ZGVeN16v_tanhf_skx)
> > > > >
> > > > >         /* Cold case. edx has 1s where there was a special value that
> > > > >            needs to be handled by a tanhf call. Optimize for code size
> > > > > -          more so than speed here. */
> > > > > +          more so than speed here.  */
> > > > >  L(SPECIAL_VALUES_BRANCH):
> > > > > -       # LOE rbx rdx r12 r13 r14 r15 zmm0 zmm2 zmm4
> > > > > -    /* Use r13 to save/restore the stack. This allows us to use rbp as
> > > > > -       callee save register saving code size. */
> > > > > +
> > > > > +       /* Use r13 to save/restore the stack. This allows us to use rbp
> > > > > +          as callee save register saving code size.  */
> > > > >         pushq   %r13
> > > > > -       cfi_adjust_cfa_offset(8)
> > > > > -       cfi_offset(r13, -16)
> > > > > -       /* Need to callee save registers to preserve state across tanhf calls.
> > > > > -        */
> > > > > +       cfi_adjust_cfa_offset (8)
> > > > > +       cfi_offset (r13, -16)
> > > > > +       /* Need to callee save registers to preserve state across tanhf
> > > > > +          calls.  */
> > > > >         pushq   %rbx
> > > > > -       cfi_adjust_cfa_offset(8)
> > > > > -       cfi_offset(rbx, -24)
> > > > > +       cfi_adjust_cfa_offset (8)
> > > > > +       cfi_offset (rbx, -24)
> > > > >         pushq   %rbp
> > > > > -       cfi_adjust_cfa_offset(8)
> > > > > -       cfi_offset(rbp, -32)
> > > > > +       cfi_adjust_cfa_offset (8)
> > > > > +       cfi_offset (rbp, -32)
> > > > >         movq    %rsp, %r13
> > > > > -       cfi_def_cfa_register(r13)
> > > > > +       cfi_def_cfa_register (r13)
> > > > >
> > > > >         /* Align stack and make room for 2x zmm vectors.  */
> > > > >         andq    $-64, %rsp
> > > > > @@ -207,16 +212,17 @@ L(SPECIAL_VALUES_BRANCH):
> > > > >
> > > > >         vzeroupper
> > > > >
> > > > > -       /* edx has 1s where there was a special value that needs to be handled
> > > > > -          by a tanhf call.  */
> > > > > +       /* edx has 1s where there was a special value that needs to be
> > > > > +          handled by a tanhf call.  */
> > > > >         movl    %edx, %ebx
> > > > >  L(SPECIAL_VALUES_LOOP):
> > > > > -       # LOE rbx rbp r12 r13 r14 r15
> > > > > -       /* use rbp as index for special value that is saved across calls to
> > > > > -          tanhf. We technically don't need a callee save register here as offset
> > > > > -          to rsp is always [0, 56] so we can restore rsp by realigning to 64.
> > > > > -          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > > > > -          in the loop. Realigning also costs more code size.  */
> > > > > +
> > > > > +       /* use rbp as index for special value that is saved across calls
> > > > > +          to tanhf. We technically don't need a callee save register
> > > > > +          here as offset to rsp is always [0, 56] so we can restore
> > > > > +          rsp by realigning to 64. Essentially the tradeoff is 1 extra
> > > > > +          save/restore vs 2 extra instructions in the loop. Realigning
> > > > > +          also costs more code size.  */
> > > > >         xorl    %ebp, %ebp
> > > > >         tzcntl  %ebx, %ebp
> > > > >
> > > > > @@ -224,203 +230,141 @@ L(SPECIAL_VALUES_LOOP):
> > > > >         vmovss  64(%rsp, %rbp, 4), %xmm0
> > > > >         call    tanhf@PLT
> > > > >
> > > > > -       /* No good way to avoid the store-forwarding fault this will cause on
> > > > > -          return. `lfence` avoids the SF fault but at greater cost as it
> > > > > -          serializes stack/callee save restoration.  */
> > > > > +       /* No good way to avoid the store-forwarding fault this will
> > > > > +          cause on return. `lfence` avoids the SF fault but at greater
> > > > > +          cost as it serializes stack/callee save restoration.  */
> > > > >         vmovss  %xmm0, (%rsp, %rbp, 4)
> > > > >
> > > > > -       blsrl   %ebx, %ebx
> > > > > +       blsrl   %ebx, %ebx
> > > > >         jnz     L(SPECIAL_VALUES_LOOP)
> > > > > -       # LOE r12 r13 r14 r15
> > > > > +
> > > > >
> > > > >         /* All results have been written to (%rsp).  */
> > > > >         vmovaps (%rsp), %zmm0
> > > > >         /* Restore rsp.  */
> > > > >         movq    %r13, %rsp
> > > > > -       cfi_def_cfa_register(rsp)
> > > > > +       cfi_def_cfa_register (rsp)
> > > > >         /* Restore callee save registers.  */
> > > > >         popq    %rbp
> > > > > -       cfi_adjust_cfa_offset(-8)
> > > > > -       cfi_restore(rbp)
> > > > > +       cfi_adjust_cfa_offset (-8)
> > > > > +       cfi_restore (rbp)
> > > > >         popq    %rbx
> > > > > -       cfi_adjust_cfa_offset(-8)
> > > > > -       cfi_restore(rbp)
> > > > > +       cfi_adjust_cfa_offset (-8)
> > > > > +       cfi_restore (rbp)
> > > > >         popq    %r13
> > > > > -       cfi_adjust_cfa_offset(-8)
> > > > > -       cfi_restore(r13)
> > > > > +       cfi_adjust_cfa_offset (-8)
> > > > > +       cfi_restore (r13)
> > > > >         ret
> > > > >  END(_ZGVeN16v_tanhf_skx)
> > > > >
> > > > > -       .section .rodata, "a"
> > > > > +       .section .rodata.evex512, "a"
> > > > >         .align  16
> > > > > -#ifdef __svml_stanh_data_internal_typedef
> > > > > -typedef unsigned int VUINT32;
> > > > > -typedef struct
> > > > > -       {
> > > > > -       __declspec(align(4)) VUINT32 _iExpMantMask_UISA[1][1];
> > > > > -       __declspec(align(4)) VUINT32 _iMinIdxOfsMask_UISA[1][1];
> > > > > -       __declspec(align(4)) VUINT32 _iMaxIdxMask_UISA[1][1];
> > > > > -       __declspec(align(4)) VUINT32 _iExpMask[1][1];
> > > > > -       __declspec(align(64)) VUINT32 _sC_lo[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sC_hi[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sP7_lo[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sP7_hi[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sSignMask[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sP6_lo[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sP6_hi[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sP5_lo[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sP5_hi[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sP4_lo[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sP4_hi[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sP3_lo[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sP3_hi[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sP2_lo[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sP2_hi[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sP0_lo[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sP0_hi[16][1];
> > > > > -} __svml_stanh_data_internal;
> > > > > -#endif
> > > > > -
> > > > > -__svml_stanh_data_internal:
> > > > > -       .align  4
> > > > > -       /* _iExpMantMask_UISA */
> > > > > -       .long   0x7fe00000
> > > > > -
> > > > > -       .align  4
> > > > > -       /* _iMinIdxOfsMask_UISA */
> > > > > -       .long   0x3d400000
> > > > > -
> > > > > -       .align  4
> > > > > -       /* _iMaxIdxMask_UISA */
> > > > > -       .long   0x03e00000
> > > > > -
> > > > > -       .align  4
> > > > > -       /* _iExpMask */
> > > > > -       .long   0x7f000000
> > > > > -
> > > > > -       .align  64
> > > > > -__svml_stanh_data_internal_al64:
> > > > > -       .align  64
> > > > > -       /* _sC_lo */
> > > > > -       .long   0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
> > > > > -       .long   0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
> > > > > -       .long   0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
> > > > > -       .long   0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
> > > > > -
> > > > > -       .align  64
> > > > > -       /* _sC_hi */
> > > > > -       .long   0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
> > > > > -       .long   0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
> > > > > -       .long   0x40500000, 0x40700000, 0x40900000, 0x40b00000
> > > > > -       .long   0x40d00000, 0x40f00000, 0x41100000, 0x00000000
> > > > > -
> > > > > -       .align  64
> > > > > -       /* _sP7_lo */
> > > > > -       .long   0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
> > > > > -       .long   0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
> > > > > -       .long   0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
> > > > > -       .long   0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
> > > > > -
> > > > > -       .align  64
> > > > > -       /* _sP7_hi */
> > > > > -       .long   0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
> > > > > -       .long   0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
> > > > > -       .long   0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
> > > > > -       .long   0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
> > > > >
> > > > > -       .align  64
> > > > > -       /* _sSignMask */
> > > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > > -
> > > > > -       .align  64
> > > > > -       /* _sP6_lo */
> > > > > -       .long   0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
> > > > > -       .long   0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
> > > > > -       .long   0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
> > > > > -       .long   0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
> > > > > -
> > > > > -       .align  64
> > > > > -       /* _sP6_hi */
> > > > > -       .long   0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
> > > > > -       .long   0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
> > > > > -       .long   0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
> > > > > -       .long   0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
> > > > > -
> > > > > -       .align  64
> > > > > -       /* _sP5_lo */
> > > > > -       .long   0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
> > > > > -       .long   0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
> > > > > -       .long   0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
> > > > > -       .long   0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
> > > > > -
> > > > > -       .align  64
> > > > > -       /* _sP5_hi */
> > > > > -       .long   0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
> > > > > -       .long   0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
> > > > > -       .long   0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
> > > > > -       .long   0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
> > > > > -
> > > > > -       .align  64
> > > > > -       /* _sP4_lo */
> > > > > -       .long   0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
> > > > > -       .long   0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
> > > > > -       .long   0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
> > > > > -       .long   0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
> > > > > -
> > > > > -       .align  64
> > > > > -       /* _sP4_hi */
> > > > > -       .long   0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
> > > > > -       .long   0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
> > > > > -       .long   0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
> > > > > -       .long   0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
> > > > > -
> > > > > -       .align  64
> > > > > -       /* _sP3_lo */
> > > > > -       .long   0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
> > > > > -       .long   0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
> > > > > -       .long   0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
> > > > > -       .long   0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
> > > > > -
> > > > > -       .align  64
> > > > > -       /* _sP3_hi */
> > > > > -       .long   0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
> > > > > -       .long   0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
> > > > > -       .long   0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
> > > > > -       .long   0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
> > > > > -
> > > > > -       .align  64
> > > > > -       /* _sP2_lo */
> > > > > -       .long   0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
> > > > > -       .long   0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
> > > > > -       .long   0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
> > > > > -       .long   0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
> > > > > -
> > > > > -       .align  64
> > > > > -       /* _sP2_hi */
> > > > > -       .long   0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
> > > > > -       .long   0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
> > > > > -       .long   0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
> > > > > -       .long   0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
> > > > > -
> > > > > -       .align  64
> > > > > -       /* _sP0_lo */
> > > > > -       .long   0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
> > > > > -       .long   0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
> > > > > -       .long   0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
> > > > > -       .long   0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
> > > > > -
> > > > > -       .align  64
> > > > > -       /* _sP0_hi */
> > > > > -       .long   0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
> > > > > -       .long   0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
> > > > > -       .long   0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
> > > > > -       .long   0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
> > > > > +LOCAL_DATA_NAME_UNALIGNED:
> > > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMantMask_UISA, 0x7fe00000)
> > > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iMinIdxOfsMask_UISA, 0x3d400000)
> > > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iMaxIdxMask_UISA, 0x03e00000)
> > > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMask, 0x7f000000)
> > > > > +       .type   LOCAL_DATA_NAME_UNALIGNED, @object
> > > > > +       .size   LOCAL_DATA_NAME_UNALIGNED, .-LOCAL_DATA_NAME_UNALIGNED
> > > > >
> > > > >         .align  64
> > > > > -       .type   __svml_stanh_data_internal_al64, @object
> > > > > -       .size   __svml_stanh_data_internal_al64, .-__svml_stanh_data_internal_al64
> > > > > -       .type   __svml_stanh_data_internal, @object
> > > > > -       .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> > > > > +LOCAL_DATA_NAME:
> > > > > +       float_block (LOCAL_DATA_NAME, _sC_lo,
> > > > > +               0x00000000, 0x3d700000, 0x3d900000, 0x3db00000,
> > > > > +               0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000,
> > > > > +               0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000,
> > > > > +               0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000)
> > > > > +
> > > > > +       float_block (LOCAL_DATA_NAME, _sC_hi,
> > > > > +               0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000,
> > > > > +               0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000,
> > > > > +               0x40500000, 0x40700000, 0x40900000, 0x40b00000,
> > > > > +               0x40d00000, 0x40f00000, 0x41100000, 0x00000000)
> > > > > +
> > > > > +       float_block (LOCAL_DATA_NAME, _sP7_lo,
> > > > > +               0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e,
> > > > > +               0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57,
> > > > > +               0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f,
> > > > > +               0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0)
> > > > > +
> > > > > +       float_block (LOCAL_DATA_NAME, _sP7_hi,
> > > > > +               0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b,
> > > > > +               0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22,
> > > > > +               0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950,
> > > > > +               0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000)
> > > > > +
> > > > > +       float_block (LOCAL_DATA_NAME, _sP6_lo,
> > > > > +               0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756,
> > > > > +               0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0,
> > > > > +               0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17,
> > > > > +               0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad)
> > > > > +
> > > > > +       float_block (LOCAL_DATA_NAME, _sP6_hi,
> > > > > +               0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63,
> > > > > +               0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66,
> > > > > +               0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3,
> > > > > +               0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000)
> > > > > +
> > > > > +       float_block (LOCAL_DATA_NAME, _sP5_lo,
> > > > > +               0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d,
> > > > > +               0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670,
> > > > > +               0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405,
> > > > > +               0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4)
> > > > > +
> > > > > +       float_block (LOCAL_DATA_NAME, _sP5_hi,
> > > > > +               0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9,
> > > > > +               0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd,
> > > > > +               0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232,
> > > > > +               0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000)
> > > > > +
> > > > > +       float_block (LOCAL_DATA_NAME, _sP4_lo,
> > > > > +               0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120,
> > > > > +               0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a,
> > > > > +               0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88,
> > > > > +               0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e)
> > > > > +
> > > > > +       float_block (LOCAL_DATA_NAME, _sP4_hi,
> > > > > +               0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96,
> > > > > +               0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67,
> > > > > +               0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9,
> > > > > +               0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000)
> > > > > +
> > > > > +       float_block (LOCAL_DATA_NAME, _sP3_lo,
> > > > > +               0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d,
> > > > > +               0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3,
> > > > > +               0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca,
> > > > > +               0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92)
> > > > > +
> > > > > +       float_block (LOCAL_DATA_NAME, _sP3_hi,
> > > > > +               0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704,
> > > > > +               0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06,
> > > > > +               0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2,
> > > > > +               0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000)
> > > > > +
> > > > > +       float_block (LOCAL_DATA_NAME, _sP2_lo,
> > > > > +               0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f,
> > > > > +               0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580,
> > > > > +               0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92,
> > > > > +               0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360)
> > > > > +
> > > > > +       float_block (LOCAL_DATA_NAME, _sP2_hi,
> > > > > +               0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2,
> > > > > +               0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4,
> > > > > +               0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b,
> > > > > +               0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000)
> > > > > +
> > > > > +       float_block (LOCAL_DATA_NAME, _sP0_lo,
> > > > > +               0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169,
> > > > > +               0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984,
> > > > > +               0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163,
> > > > > +               0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0)
> > > > > +
> > > > > +       float_block (LOCAL_DATA_NAME, _sP0_hi,
> > > > > +               0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53,
> > > > > +               0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85,
> > > > > +               0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0,
> > > > > +               0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000)
> > > > > +
> > > > > +       .type   LOCAL_DATA_NAME, @object
> > > > > +       .size   LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
> > > > > --
> > > > > 2.34.1
> > > > >
> > > >
> > > > The data movement makes the assembler codes much harder to follow.
> > > > Sunil, what do you think of this patch series?
> > >
> > > What do you mean? The change in how we define rodata or the movement
> > > to multiple files or something else?
> >
> > The glibc way to support data files for assembly codes is to define
> > data in C and use *.sym to generate offsets for assembly files, like
>
> I see. Although to be fair the entire SVML codebase bucks that trend.

It is because libmvec codes were generated by ICC and processed
by scripts.

> Seems like a more dramatic trend to move all the offsets to C.

Since you are adding data by hand, you should do it in C.

> >
> > sysdeps/x86/cpu-features-offsets.sym:XSAVE_STATE_SIZE_OFFSET
> > offsetof (struct cpu_features, xsave_state_size)
> > sysdeps/x86_64/dl-trampoline.h:  sub
> > _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip),
> > %RSP_LP
> > sysdeps/x86_64/dl-trampoline.h:  sub
> > _dl_x86_cpu_features+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
> >
> > --
> > H.J.



-- 
H.J.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v1 07/27] x86/fpu: Update rodata usage in svml_s_tanhf16_core_avx512.S
  2022-12-16 22:01           ` H.J. Lu via Libc-alpha
@ 2022-12-16 22:54             ` Sunil Pandey via Libc-alpha
  2023-06-27 18:23             ` Noah Goldstein via Libc-alpha
  1 sibling, 0 replies; 38+ messages in thread
From: Sunil Pandey via Libc-alpha @ 2022-12-16 22:54 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Noah Goldstein, libc-alpha, andrey.kolesov, carlos

On Fri, Dec 16, 2022 at 2:01 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Dec 16, 2022 at 1:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Fri, Dec 16, 2022 at 1:38 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Fri, Dec 16, 2022 at 10:18 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > On Fri, Dec 16, 2022 at 9:06 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > On Wed, Dec 7, 2022 at 12:52 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > > >
> > > > > > No changes to the logic, just change how rodata is handled.
> > > > > >
> > > > > > 1. Define the rodatas using the new macros so they check that the
> > > > > >    offset is correct.
> > > > > >
> > > > > > 2. Use common data where applicable.
> > > > > > ---
> > > > > >  .../multiarch/svml_s_tanhf16_core_avx512.S    | 450 ++++++++----------
> > > > > >  1 file changed, 197 insertions(+), 253 deletions(-)
> > > > > >
> > > > > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > > > > index d74fc7731d..765e9ed7f7 100644
> > > > > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > > > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > > > > @@ -70,94 +70,99 @@
> > > > > >   *
> > > > > >   */
> > > > > >
> > > > > > -/* Offsets for data table __svml_stanh_data_internal_avx512. Ordered
> > > > > > -   by use in the function. On cold-starts this might help the
> > > > > > -   prefetcher. Possibly a better idea is to interleave start/end so
> > > > > > -   that the prefetcher is less likely to detect a stream and pull
> > > > > > -   irrelevant lines into cache.  */
> > > > > >
> > > > > > -/* Offsets for data table __svml_stanh_data_internal. 4 bytes each.
> > > > > > - */
> > > > > > +
> > > > > > +#define LOCAL_DATA_NAME        __svml_stanh_data_internal
> > > > > > +#define LOCAL_DATA_NAME_UNALIGNED      __svml_stanh_data_internal_unaligned
> > > > > > +#include "svml_s_common_evex512_rodata_offsets.h"
> > > > > > +
> > > > > > +/* Offsets for data table __svml_stanh_data_internal_unaligned.
> > > > > > +   4 bytes each.  */
> > > > > >  #define _iExpMantMask_UISA             0
> > > > > >  #define _iMinIdxOfsMask_UISA           4
> > > > > >  #define _iMaxIdxMask_UISA              8
> > > > > >  #define _iExpMask                      12
> > > > > >
> > > > > > -/* Offsets for data table __svml_stanh_data_internal_al64. 64 bytes
> > > > > > -   each.  */
> > > > > > -#define _sC_lo                         0
> > > > > > -#define _sC_hi                         64
> > > > > > -#define _sP7_lo                                128
> > > > > > -#define _sP7_hi                                192
> > > > > > -#define _sSignMask                     256
> > > > > > -#define _sP6_lo                                320
> > > > > > -#define _sP6_hi                                384
> > > > > > -#define _sP5_lo                                448
> > > > > > -#define _sP5_hi                                512
> > > > > > -#define _sP4_lo                                576
> > > > > > -#define _sP4_hi                                640
> > > > > > -#define _sP3_lo                                704
> > > > > > -#define _sP3_hi                                768
> > > > > > -#define _sP2_lo                                832
> > > > > > -#define _sP2_hi                                896
> > > > > > -#define _sP0_lo                                960
> > > > > > -#define _sP0_hi                                1024
> > > > > > +/* Offsets for data table __svml_stanh_data_internal. Ordered
> > > > > > +   by use in the function. On cold-starts this might help the
> > > > > > +   prefetcher. Possibly a better idea is to interleave start/end so
> > > > > > +   that the prefetcher is less likely to detect a stream and pull
> > > > > > +   irrelevant lines into cache.  */
> > > > > > +
> > > > > > +/* Offsets for data table __svml_stanh_data_internal.
> > > > > > +   64 bytes each.  */
> > > > > > +#define _sC_lo 0
> > > > > > +#define _sC_hi 64
> > > > > > +#define _sP7_lo        128
> > > > > > +#define _sP7_hi        192
> > > > > > +#define _sP6_lo        256
> > > > > > +#define _sP6_hi        320
> > > > > > +#define _sP5_lo        384
> > > > > > +#define _sP5_hi        448
> > > > > > +#define _sP4_lo        512
> > > > > > +#define _sP4_hi        576
> > > > > > +#define _sP3_lo        640
> > > > > > +#define _sP3_hi        704
> > > > > > +#define _sP2_lo        768
> > > > > > +#define _sP2_hi        832
> > > > > > +#define _sP0_lo        896
> > > > > > +#define _sP0_hi        960
> > > > > > +
> > > > > >
> > > > > >  #include <sysdep.h>
> > > > > > -#define TANHF_DATA(x)                  ((x)+__svml_stanh_data_internal_al64)
> > > > > > -#define TANHF_DATA_UNALIGNED(x)                ((x)+__svml_stanh_data_internal)
> > > > > >
> > > > > >         .section .text.evex512, "ax", @progbits
> > > > > >  ENTRY(_ZGVeN16v_tanhf_skx)
> > > > > > -       /* Here huge arguments, INF and NaNs are filtered out to callout. */
> > > > > > -       vpandd  TANHF_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> > > > > > -       vpsubd  TANHF_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
> > > > > > +       /* Here huge arguments, INF and NaNs are filtered out to
> > > > > > +          callout.  */
> > > > > > +       vpandd  LOCAL_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> > > > > > +       vpsubd  LOCAL_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
> > > > > >
> > > > > >         /* Selection arguments between [0, 0x03e00000] into zmm3.  */
> > > > > >         vpxord  %zmm3, %zmm3, %zmm3
> > > > > >         vpmaxsd %zmm3, %zmm2, %zmm3
> > > > > > -       vpminsd TANHF_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
> > > > > > +       vpminsd LOCAL_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
> > > > > >
> > > > > >         /* Setup permute indices in zmm3.  */
> > > > > >         vpsrld  $21, %zmm3, %zmm3
> > > > > >
> > > > > >         /* Store if there are any special cases in k1.  */
> > > > > > -       vpcmpd  $6, TANHF_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
> > > > > > +       vpcmpd  $6, LOCAL_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sC_lo)(%rip), %zmm5
> > > > > > -       vpermt2ps TANHF_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> > > > > > +       vmovaps LOCAL_DATA(_sC_lo)(%rip), %zmm5
> > > > > > +       vpermt2ps LOCAL_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sP7_lo)(%rip), %zmm2
> > > > > > -       vpermt2ps TANHF_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> > > > > > +       vmovaps LOCAL_DATA(_sP7_lo)(%rip), %zmm2
> > > > > > +       vpermt2ps LOCAL_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> > > > > >
> > > > > >         /* Store absolute values of inputs in zmm1.  */
> > > > > > -       vmovaps TANHF_DATA(_sSignMask)(%rip), %zmm4
> > > > > > +       vmovaps COMMON_DATA(_SignMask)(%rip), %zmm4
> > > > > >         vandnps %zmm0, %zmm4, %zmm1
> > > > > >         vsubps  {rn-sae}, %zmm5, %zmm1, %zmm1
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sP6_lo)(%rip), %zmm5
> > > > > > -       vpermt2ps TANHF_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> > > > > > +       vmovaps LOCAL_DATA(_sP6_lo)(%rip), %zmm5
> > > > > > +       vpermt2ps LOCAL_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sP5_lo)(%rip), %zmm6
> > > > > > -       vpermt2ps TANHF_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> > > > > > +       vmovaps LOCAL_DATA(_sP5_lo)(%rip), %zmm6
> > > > > > +       vpermt2ps LOCAL_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> > > > > >
> > > > > >         vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm2
> > > > > >         vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm2
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sP4_lo)(%rip), %zmm7
> > > > > > -       vpermt2ps TANHF_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> > > > > > +       vmovaps LOCAL_DATA(_sP4_lo)(%rip), %zmm7
> > > > > > +       vpermt2ps LOCAL_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sP3_lo)(%rip), %zmm8
> > > > > > -       vpermt2ps TANHF_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> > > > > > +       vmovaps LOCAL_DATA(_sP3_lo)(%rip), %zmm8
> > > > > > +       vpermt2ps LOCAL_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> > > > > >
> > > > > >         vfmadd213ps {rn-sae}, %zmm7, %zmm1, %zmm2
> > > > > >         vfmadd213ps {rn-sae}, %zmm8, %zmm1, %zmm2
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sP2_lo)(%rip), %zmm9
> > > > > > -       vpermt2ps TANHF_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> > > > > > +       vmovaps LOCAL_DATA(_sP2_lo)(%rip), %zmm9
> > > > > > +       vpermt2ps LOCAL_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sP0_lo)(%rip), %zmm10
> > > > > > -       vpermt2ps TANHF_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> > > > > > +       vmovaps LOCAL_DATA(_sP0_lo)(%rip), %zmm10
> > > > > > +       vpermt2ps LOCAL_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> > > > > >
> > > > > >         vfmadd213ps {rn-sae}, %zmm9, %zmm1, %zmm2
> > > > > >         vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm2
> > > > > > @@ -167,7 +172,7 @@ ENTRY(_ZGVeN16v_tanhf_skx)
> > > > > >
> > > > > >         /* Go to special inputs processing branch.  */
> > > > > >         jne     L(SPECIAL_VALUES_BRANCH)
> > > > > > -       # LOE rbx r12 r13 r14 r15 zmm0 zmm2 zmm4
> > > > > > +
> > > > > >         /* Wait until after branch of write over zmm0.  */
> > > > > >         vpternlogd $0xec, %zmm4, %zmm2, %zmm0
> > > > > >
> > > > > > @@ -176,24 +181,24 @@ ENTRY(_ZGVeN16v_tanhf_skx)
> > > > > >
> > > > > >         /* Cold case. edx has 1s where there was a special value that
> > > > > >            needs to be handled by a tanhf call. Optimize for code size
> > > > > > -          more so than speed here. */
> > > > > > +          more so than speed here.  */
> > > > > >  L(SPECIAL_VALUES_BRANCH):
> > > > > > -       # LOE rbx rdx r12 r13 r14 r15 zmm0 zmm2 zmm4
> > > > > > -    /* Use r13 to save/restore the stack. This allows us to use rbp as
> > > > > > -       callee save register saving code size. */
> > > > > > +
> > > > > > +       /* Use r13 to save/restore the stack. This allows us to use rbp
> > > > > > +          as callee save register saving code size.  */
> > > > > >         pushq   %r13
> > > > > > -       cfi_adjust_cfa_offset(8)
> > > > > > -       cfi_offset(r13, -16)
> > > > > > -       /* Need to callee save registers to preserve state across tanhf calls.
> > > > > > -        */
> > > > > > +       cfi_adjust_cfa_offset (8)
> > > > > > +       cfi_offset (r13, -16)
> > > > > > +       /* Need to callee save registers to preserve state across tanhf
> > > > > > +          calls.  */
> > > > > >         pushq   %rbx
> > > > > > -       cfi_adjust_cfa_offset(8)
> > > > > > -       cfi_offset(rbx, -24)
> > > > > > +       cfi_adjust_cfa_offset (8)
> > > > > > +       cfi_offset (rbx, -24)
> > > > > >         pushq   %rbp
> > > > > > -       cfi_adjust_cfa_offset(8)
> > > > > > -       cfi_offset(rbp, -32)
> > > > > > +       cfi_adjust_cfa_offset (8)
> > > > > > +       cfi_offset (rbp, -32)
> > > > > >         movq    %rsp, %r13
> > > > > > -       cfi_def_cfa_register(r13)
> > > > > > +       cfi_def_cfa_register (r13)
> > > > > >
> > > > > >         /* Align stack and make room for 2x zmm vectors.  */
> > > > > >         andq    $-64, %rsp
> > > > > > @@ -207,16 +212,17 @@ L(SPECIAL_VALUES_BRANCH):
> > > > > >
> > > > > >         vzeroupper
> > > > > >
> > > > > > -       /* edx has 1s where there was a special value that needs to be handled
> > > > > > -          by a tanhf call.  */
> > > > > > +       /* edx has 1s where there was a special value that needs to be
> > > > > > +          handled by a tanhf call.  */
> > > > > >         movl    %edx, %ebx
> > > > > >  L(SPECIAL_VALUES_LOOP):
> > > > > > -       # LOE rbx rbp r12 r13 r14 r15
> > > > > > -       /* use rbp as index for special value that is saved across calls to
> > > > > > -          tanhf. We technically don't need a callee save register here as offset
> > > > > > -          to rsp is always [0, 56] so we can restore rsp by realigning to 64.
> > > > > > -          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > > > > > -          in the loop. Realigning also costs more code size.  */
> > > > > > +
> > > > > > +       /* use rbp as index for special value that is saved across calls
> > > > > > +          to tanhf. We technically don't need a callee save register
> > > > > > +          here as offset to rsp is always [0, 56] so we can restore
> > > > > > +          rsp by realigning to 64. Essentially the tradeoff is 1 extra
> > > > > > +          save/restore vs 2 extra instructions in the loop. Realigning
> > > > > > +          also costs more code size.  */
> > > > > >         xorl    %ebp, %ebp
> > > > > >         tzcntl  %ebx, %ebp
> > > > > >
> > > > > > @@ -224,203 +230,141 @@ L(SPECIAL_VALUES_LOOP):
> > > > > >         vmovss  64(%rsp, %rbp, 4), %xmm0
> > > > > >         call    tanhf@PLT
> > > > > >
> > > > > > -       /* No good way to avoid the store-forwarding fault this will cause on
> > > > > > -          return. `lfence` avoids the SF fault but at greater cost as it
> > > > > > -          serialized stack/callee save restoration.  */
> > > > > > +       /* No good way to avoid the store-forwarding fault this will
> > > > > > +          cause on return. `lfence` avoids the SF fault but at greater
> > > > > > +          cost as it serialized stack/callee save restoration.  */
> > > > > >         vmovss  %xmm0, (%rsp, %rbp, 4)
> > > > > >
> > > > > > -       blsrl   %ebx, %ebx
> > > > > > +       blsrl   %ebx, %ebx
> > > > > >         jnz     L(SPECIAL_VALUES_LOOP)
> > > > > > -       # LOE r12 r13 r14 r15
> > > > > > +
> > > > > >
> > > > > >         /* All results have been written to (%rsp).  */
> > > > > >         vmovaps (%rsp), %zmm0
> > > > > >         /* Restore rsp.  */
> > > > > >         movq    %r13, %rsp
> > > > > > -       cfi_def_cfa_register(rsp)
> > > > > > +       cfi_def_cfa_register (rsp)
> > > > > >         /* Restore callee save registers.  */
> > > > > >         popq    %rbp
> > > > > > -       cfi_adjust_cfa_offset(-8)
> > > > > > -       cfi_restore(rbp)
> > > > > > +       cfi_adjust_cfa_offset (-8)
> > > > > > +       cfi_restore (rbp)
> > > > > >         popq    %rbx
> > > > > > -       cfi_adjust_cfa_offset(-8)
> > > > > > -       cfi_restore(rbp)
> > > > > > +       cfi_adjust_cfa_offset (-8)
> > > > > > +       cfi_restore (rbp)
> > > > > >         popq    %r13
> > > > > > -       cfi_adjust_cfa_offset(-8)
> > > > > > -       cfi_restore(r13)
> > > > > > +       cfi_adjust_cfa_offset (-8)
> > > > > > +       cfi_restore (r13)
> > > > > >         ret
> > > > > >  END(_ZGVeN16v_tanhf_skx)
> > > > > >
> > > > > > -       .section .rodata, "a"
> > > > > > +       .section .rodata.evex512, "a"
> > > > > >         .align  16
> > > > > > -#ifdef __svml_stanh_data_internal_typedef
> > > > > > -typedef unsigned int VUINT32;
> > > > > > -typedef struct
> > > > > > -       {
> > > > > > -       __declspec(align(4)) VUINT32 _iExpMantMask_UISA[1][1];
> > > > > > -       __declspec(align(4)) VUINT32 _iMinIdxOfsMask_UISA[1][1];
> > > > > > -       __declspec(align(4)) VUINT32 _iMaxIdxMask_UISA[1][1];
> > > > > > -       __declspec(align(4)) VUINT32 _iExpMask[1][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sC_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sC_hi[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP7_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP7_hi[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sSignMask[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP6_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP6_hi[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP5_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP5_hi[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP4_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP4_hi[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP3_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP3_hi[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP2_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP2_hi[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP0_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP0_hi[16][1];
> > > > > > -} __svml_stanh_data_internal;
> > > > > > -#endif
> > > > > > -
> > > > > > -__svml_stanh_data_internal:
> > > > > > -       .align  4
> > > > > > -       /* _iExpMantMask_UISA */
> > > > > > -       .long   0x7fe00000
> > > > > > -
> > > > > > -       .align  4
> > > > > > -       /* _iMinIdxOfsMask_UISA */
> > > > > > -       .long   0x3d400000
> > > > > > -
> > > > > > -       .align  4
> > > > > > -       /* _iMaxIdxMask_UISA */
> > > > > > -       .long   0x03e00000
> > > > > > -
> > > > > > -       .align  4
> > > > > > -       /* _iExpMask */
> > > > > > -       .long   0x7f000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -__svml_stanh_data_internal_al64:
> > > > > > -       .align  64
> > > > > > -       /* _sC_lo */
> > > > > > -       .long   0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
> > > > > > -       .long   0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
> > > > > > -       .long   0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
> > > > > > -       .long   0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sC_hi */
> > > > > > -       .long   0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
> > > > > > -       .long   0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
> > > > > > -       .long   0x40500000, 0x40700000, 0x40900000, 0x40b00000
> > > > > > -       .long   0x40d00000, 0x40f00000, 0x41100000, 0x00000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP7_lo */
> > > > > > -       .long   0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
> > > > > > -       .long   0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
> > > > > > -       .long   0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
> > > > > > -       .long   0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP7_hi */
> > > > > > -       .long   0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
> > > > > > -       .long   0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
> > > > > > -       .long   0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
> > > > > > -       .long   0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
> > > > > >
> > > > > > -       .align  64
> > > > > > -       /* _sSignMask */
> > > > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP6_lo */
> > > > > > -       .long   0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
> > > > > > -       .long   0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
> > > > > > -       .long   0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
> > > > > > -       .long   0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP6_hi */
> > > > > > -       .long   0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
> > > > > > -       .long   0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
> > > > > > -       .long   0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
> > > > > > -       .long   0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP5_lo */
> > > > > > -       .long   0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
> > > > > > -       .long   0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
> > > > > > -       .long   0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
> > > > > > -       .long   0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP5_hi */
> > > > > > -       .long   0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
> > > > > > -       .long   0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
> > > > > > -       .long   0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
> > > > > > -       .long   0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP4_lo */
> > > > > > -       .long   0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
> > > > > > -       .long   0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
> > > > > > -       .long   0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
> > > > > > -       .long   0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP4_hi */
> > > > > > -       .long   0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
> > > > > > -       .long   0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
> > > > > > -       .long   0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
> > > > > > -       .long   0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP3_lo */
> > > > > > -       .long   0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
> > > > > > -       .long   0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
> > > > > > -       .long   0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
> > > > > > -       .long   0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP3_hi */
> > > > > > -       .long   0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
> > > > > > -       .long   0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
> > > > > > -       .long   0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
> > > > > > -       .long   0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP2_lo */
> > > > > > -       .long   0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
> > > > > > -       .long   0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
> > > > > > -       .long   0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
> > > > > > -       .long   0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP2_hi */
> > > > > > -       .long   0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
> > > > > > -       .long   0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
> > > > > > -       .long   0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
> > > > > > -       .long   0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP0_lo */
> > > > > > -       .long   0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
> > > > > > -       .long   0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
> > > > > > -       .long   0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
> > > > > > -       .long   0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP0_hi */
> > > > > > -       .long   0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
> > > > > > -       .long   0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
> > > > > > -       .long   0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
> > > > > > -       .long   0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
> > > > > > +LOCAL_DATA_NAME_UNALIGNED:
> > > > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMantMask_UISA, 0x7fe00000)
> > > > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iMinIdxOfsMask_UISA, 0x3d400000)
> > > > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iMaxIdxMask_UISA, 0x03e00000)
> > > > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMask, 0x7f000000)
> > > > > > +       .type   LOCAL_DATA_NAME_UNALIGNED, @object
> > > > > > +       .size   LOCAL_DATA_NAME_UNALIGNED, .-LOCAL_DATA_NAME_UNALIGNED
> > > > > >
> > > > > >         .align  64
> > > > > > -       .type   __svml_stanh_data_internal_al64, @object
> > > > > > -       .size   __svml_stanh_data_internal_al64, .-__svml_stanh_data_internal_al64
> > > > > > -       .type   __svml_stanh_data_internal, @object
> > > > > > -       .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> > > > > > +LOCAL_DATA_NAME:
> > > > > > +       float_block (LOCAL_DATA_NAME, _sC_lo,
> > > > > > +               0x00000000, 0x3d700000, 0x3d900000, 0x3db00000,
> > > > > > +               0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000,
> > > > > > +               0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000,
> > > > > > +               0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sC_hi,
> > > > > > +               0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000,
> > > > > > +               0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000,
> > > > > > +               0x40500000, 0x40700000, 0x40900000, 0x40b00000,
> > > > > > +               0x40d00000, 0x40f00000, 0x41100000, 0x00000000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP7_lo,
> > > > > > +               0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e,
> > > > > > +               0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57,
> > > > > > +               0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f,
> > > > > > +               0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP7_hi,
> > > > > > +               0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b,
> > > > > > +               0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22,
> > > > > > +               0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950,
> > > > > > +               0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP6_lo,
> > > > > > +               0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756,
> > > > > > +               0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0,
> > > > > > +               0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17,
> > > > > > +               0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP6_hi,
> > > > > > +               0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63,
> > > > > > +               0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66,
> > > > > > +               0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3,
> > > > > > +               0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP5_lo,
> > > > > > +               0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d,
> > > > > > +               0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670,
> > > > > > +               0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405,
> > > > > > +               0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP5_hi,
> > > > > > +               0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9,
> > > > > > +               0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd,
> > > > > > +               0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232,
> > > > > > +               0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP4_lo,
> > > > > > +               0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120,
> > > > > > +               0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a,
> > > > > > +               0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88,
> > > > > > +               0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP4_hi,
> > > > > > +               0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96,
> > > > > > +               0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67,
> > > > > > +               0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9,
> > > > > > +               0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP3_lo,
> > > > > > +               0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d,
> > > > > > +               0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3,
> > > > > > +               0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca,
> > > > > > +               0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP3_hi,
> > > > > > +               0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704,
> > > > > > +               0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06,
> > > > > > +               0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2,
> > > > > > +               0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP2_lo,
> > > > > > +               0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f,
> > > > > > +               0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580,
> > > > > > +               0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92,
> > > > > > +               0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP2_hi,
> > > > > > +               0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2,
> > > > > > +               0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4,
> > > > > > +               0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b,
> > > > > > +               0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP0_lo,
> > > > > > +               0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169,
> > > > > > +               0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984,
> > > > > > +               0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163,
> > > > > > +               0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP0_hi,
> > > > > > +               0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53,
> > > > > > +               0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85,
> > > > > > +               0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0,
> > > > > > +               0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000)
> > > > > > +
> > > > > > +       .type   LOCAL_DATA_NAME, @object
> > > > > > +       .size   LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
> > > > > > --
> > > > > > 2.34.1
> > > > > >
> > > > >
> > > > > The data movement makes the assembler codes much harder to follow.
> > > > > Sunil, what do you think of this patch series?
> > > >
> > > > What do you mean? The change in how we define rodata or the movement
> > > > to multiple files or something else?
> > >
> > > The glibc way to support data files for assembly codes is to define
> > > data in C and use *.sym to generate offsets for assembly files, like
> >
> > I see. Although to be fair the entire SVML codebase bucks that trend.
>
> It is because libmvec codes were generated by ICC and processed
> by scripts.
>
> > Seems like a more dramatic trend to move all the offsets to C.
>
> Since you are adding data by hand, you should do it in C.
>
> > >
> > > sysdeps/x86/cpu-features-offsets.sym:XSAVE_STATE_SIZE_OFFSET
> > > offsetof (struct cpu_features, xsave_state_size)
> > > sysdeps/x86_64/dl-trampoline.h:  sub
> > > _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip),
> > > %RSP_LP
> > > sysdeps/x86_64/dl-trampoline.h:  sub
> > > _dl_x86_cpu_features+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
> > >
> > > --
> > > H.J.
>
>

Does this restructuring provide any performance benefit as measured by
the libmvec microbenchmark?




>
> --
> H.J.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v1 07/27] x86/fpu: Update rodata usage in svml_s_tanhf16_core_avx512.S
  2022-12-16 22:01           ` H.J. Lu via Libc-alpha
  2022-12-16 22:54             ` Sunil Pandey via Libc-alpha
@ 2023-06-27 18:23             ` Noah Goldstein via Libc-alpha
  1 sibling, 0 replies; 38+ messages in thread
From: Noah Goldstein via Libc-alpha @ 2023-06-27 18:23 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Sunil K Pandey, libc-alpha, andrey.kolesov, carlos

On Fri, Dec 16, 2022 at 4:01 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Dec 16, 2022 at 1:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Fri, Dec 16, 2022 at 1:38 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Fri, Dec 16, 2022 at 10:18 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > On Fri, Dec 16, 2022 at 9:06 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > On Wed, Dec 7, 2022 at 12:52 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > > >
> > > > > > No changes to the logic, just change how rodata is handled.
> > > > > >
> > > > > > 1. Define the rodatas using the new macros so they check that the
> > > > > >    offset is correct.
> > > > > >
> > > > > > 2. Use common data where applicable.
> > > > > > ---
> > > > > >  .../multiarch/svml_s_tanhf16_core_avx512.S    | 450 ++++++++----------
> > > > > >  1 file changed, 197 insertions(+), 253 deletions(-)
> > > > > >
> > > > > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > > > > index d74fc7731d..765e9ed7f7 100644
> > > > > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > > > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > > > > @@ -70,94 +70,99 @@
> > > > > >   *
> > > > > >   */
> > > > > >
> > > > > > -/* Offsets for data table __svml_stanh_data_internal_avx512. Ordered
> > > > > > -   by use in the function. On cold-starts this might help the
> > > > > > -   prefetcher. Possibly a better idea is to interleave start/end so
> > > > > > -   that the prefetcher is less likely to detect a stream and pull
> > > > > > > -   irrelevant lines into cache.  */
> > > > > >
> > > > > > -/* Offsets for data table __svml_stanh_data_internal. 4 bytes each.
> > > > > > - */
> > > > > > +
> > > > > > +#define LOCAL_DATA_NAME        __svml_stanh_data_internal
> > > > > > +#define LOCAL_DATA_NAME_UNALIGNED      __svml_stanh_data_internal_unaligned
> > > > > > +#include "svml_s_common_evex512_rodata_offsets.h"
> > > > > > +
> > > > > > +/* Offsets for data table __svml_stanh_data_internal_unaligned.
> > > > > > +   4 bytes each.  */
> > > > > >  #define _iExpMantMask_UISA             0
> > > > > >  #define _iMinIdxOfsMask_UISA           4
> > > > > >  #define _iMaxIdxMask_UISA              8
> > > > > >  #define _iExpMask                      12
> > > > > >
> > > > > > -/* Offsets for data table __svml_stanh_data_internal_al64. 64 bytes
> > > > > > -   each.  */
> > > > > > -#define _sC_lo                         0
> > > > > > -#define _sC_hi                         64
> > > > > > -#define _sP7_lo                                128
> > > > > > -#define _sP7_hi                                192
> > > > > > -#define _sSignMask                     256
> > > > > > -#define _sP6_lo                                320
> > > > > > -#define _sP6_hi                                384
> > > > > > -#define _sP5_lo                                448
> > > > > > -#define _sP5_hi                                512
> > > > > > -#define _sP4_lo                                576
> > > > > > -#define _sP4_hi                                640
> > > > > > -#define _sP3_lo                                704
> > > > > > -#define _sP3_hi                                768
> > > > > > -#define _sP2_lo                                832
> > > > > > -#define _sP2_hi                                896
> > > > > > -#define _sP0_lo                                960
> > > > > > -#define _sP0_hi                                1024
> > > > > > +/* Offsets for data table __svml_stanh_data_internal. Ordered
> > > > > > +   by use in the function. On cold-starts this might help the
> > > > > > +   prefetcher. Possibly a better idea is to interleave start/end so
> > > > > > +   that the prefetcher is less likely to detect a stream and pull
> > > > > > > +          irrelevant lines into cache.  */
> > > > > > +
> > > > > > +/* Offsets for data table __svml_stanh_data_internal.
> > > > > > +   64 bytes each.  */
> > > > > > +#define _sC_lo 0
> > > > > > +#define _sC_hi 64
> > > > > > +#define _sP7_lo        128
> > > > > > +#define _sP7_hi        192
> > > > > > +#define _sP6_lo        256
> > > > > > +#define _sP6_hi        320
> > > > > > +#define _sP5_lo        384
> > > > > > +#define _sP5_hi        448
> > > > > > +#define _sP4_lo        512
> > > > > > +#define _sP4_hi        576
> > > > > > +#define _sP3_lo        640
> > > > > > +#define _sP3_hi        704
> > > > > > +#define _sP2_lo        768
> > > > > > +#define _sP2_hi        832
> > > > > > +#define _sP0_lo        896
> > > > > > +#define _sP0_hi        960
> > > > > > +
> > > > > >
> > > > > >  #include <sysdep.h>
> > > > > > -#define TANHF_DATA(x)                  ((x)+__svml_stanh_data_internal_al64)
> > > > > > -#define TANHF_DATA_UNALIGNED(x)                ((x)+__svml_stanh_data_internal)
> > > > > >
> > > > > >         .section .text.evex512, "ax", @progbits
> > > > > >  ENTRY(_ZGVeN16v_tanhf_skx)
> > > > > > -       /* Here huge arguments, INF and NaNs are filtered out to callout. */
> > > > > > -       vpandd  TANHF_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> > > > > > -       vpsubd  TANHF_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
> > > > > > +       /* Here huge arguments, INF and NaNs are filtered out to
> > > > > > +          callout.  */
> > > > > > +       vpandd  LOCAL_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> > > > > > +       vpsubd  LOCAL_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
> > > > > >
> > > > > >         /* Selection arguments between [0, 0x03e00000] into zmm3.  */
> > > > > >         vpxord  %zmm3, %zmm3, %zmm3
> > > > > >         vpmaxsd %zmm3, %zmm2, %zmm3
> > > > > > -       vpminsd TANHF_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
> > > > > > +       vpminsd LOCAL_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
> > > > > >
> > > > > >         /* Setup permute indices in zmm3.  */
> > > > > >         vpsrld  $21, %zmm3, %zmm3
> > > > > >
> > > > > >         /* Store if there are any special cases in k1.  */
> > > > > > -       vpcmpd  $6, TANHF_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
> > > > > > +       vpcmpd  $6, LOCAL_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sC_lo)(%rip), %zmm5
> > > > > > -       vpermt2ps TANHF_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> > > > > > +       vmovaps LOCAL_DATA(_sC_lo)(%rip), %zmm5
> > > > > > +       vpermt2ps LOCAL_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sP7_lo)(%rip), %zmm2
> > > > > > -       vpermt2ps TANHF_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> > > > > > +       vmovaps LOCAL_DATA(_sP7_lo)(%rip), %zmm2
> > > > > > +       vpermt2ps LOCAL_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> > > > > >
> > > > > >         /* Store absolute values of inputs in zmm1.  */
> > > > > > -       vmovaps TANHF_DATA(_sSignMask)(%rip), %zmm4
> > > > > > +       vmovaps COMMON_DATA(_SignMask)(%rip), %zmm4
> > > > > >         vandnps %zmm0, %zmm4, %zmm1
> > > > > >         vsubps  {rn-sae}, %zmm5, %zmm1, %zmm1
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sP6_lo)(%rip), %zmm5
> > > > > > -       vpermt2ps TANHF_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> > > > > > +       vmovaps LOCAL_DATA(_sP6_lo)(%rip), %zmm5
> > > > > > +       vpermt2ps LOCAL_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sP5_lo)(%rip), %zmm6
> > > > > > -       vpermt2ps TANHF_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> > > > > > +       vmovaps LOCAL_DATA(_sP5_lo)(%rip), %zmm6
> > > > > > +       vpermt2ps LOCAL_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> > > > > >
> > > > > >         vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm2
> > > > > >         vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm2
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sP4_lo)(%rip), %zmm7
> > > > > > -       vpermt2ps TANHF_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> > > > > > +       vmovaps LOCAL_DATA(_sP4_lo)(%rip), %zmm7
> > > > > > +       vpermt2ps LOCAL_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sP3_lo)(%rip), %zmm8
> > > > > > -       vpermt2ps TANHF_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> > > > > > +       vmovaps LOCAL_DATA(_sP3_lo)(%rip), %zmm8
> > > > > > +       vpermt2ps LOCAL_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> > > > > >
> > > > > >         vfmadd213ps {rn-sae}, %zmm7, %zmm1, %zmm2
> > > > > >         vfmadd213ps {rn-sae}, %zmm8, %zmm1, %zmm2
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sP2_lo)(%rip), %zmm9
> > > > > > -       vpermt2ps TANHF_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> > > > > > +       vmovaps LOCAL_DATA(_sP2_lo)(%rip), %zmm9
> > > > > > +       vpermt2ps LOCAL_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sP0_lo)(%rip), %zmm10
> > > > > > -       vpermt2ps TANHF_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> > > > > > +       vmovaps LOCAL_DATA(_sP0_lo)(%rip), %zmm10
> > > > > > +       vpermt2ps LOCAL_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> > > > > >
> > > > > >         vfmadd213ps {rn-sae}, %zmm9, %zmm1, %zmm2
> > > > > >         vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm2
> > > > > > @@ -167,7 +172,7 @@ ENTRY(_ZGVeN16v_tanhf_skx)
> > > > > >
> > > > > >         /* Go to special inputs processing branch.  */
> > > > > >         jne     L(SPECIAL_VALUES_BRANCH)
> > > > > > -       # LOE rbx r12 r13 r14 r15 zmm0 zmm2 zmm4
> > > > > > +
> > > > > >         /* Wait until after branch of write over zmm0.  */
> > > > > >         vpternlogd $0xec, %zmm4, %zmm2, %zmm0
> > > > > >
> > > > > > @@ -176,24 +181,24 @@ ENTRY(_ZGVeN16v_tanhf_skx)
> > > > > >
> > > > > >         /* Cold case. edx has 1s where there was a special value that
> > > > > >            needs to be handled by a tanhf call. Optimize for code size
> > > > > > -          more so than speed here. */
> > > > > > +          more so than speed here.  */
> > > > > >  L(SPECIAL_VALUES_BRANCH):
> > > > > > -       # LOE rbx rdx r12 r13 r14 r15 zmm0 zmm2 zmm4
> > > > > > -    /* Use r13 to save/restore the stack. This allows us to use rbp as
> > > > > > -       callee save register saving code size. */
> > > > > > +
> > > > > > +       /* Use r13 to save/restore the stack. This allows us to use rbp
> > > > > > +          as callee save register saving code size.  */
> > > > > >         pushq   %r13
> > > > > > -       cfi_adjust_cfa_offset(8)
> > > > > > -       cfi_offset(r13, -16)
> > > > > > -       /* Need to callee save registers to preserve state across tanhf calls.
> > > > > > -        */
> > > > > > +       cfi_adjust_cfa_offset (8)
> > > > > > +       cfi_offset (r13, -16)
> > > > > > +       /* Need to callee save registers to preserve state across tanhf
> > > > > > +          calls.  */
> > > > > >         pushq   %rbx
> > > > > > -       cfi_adjust_cfa_offset(8)
> > > > > > -       cfi_offset(rbx, -24)
> > > > > > +       cfi_adjust_cfa_offset (8)
> > > > > > +       cfi_offset (rbx, -24)
> > > > > >         pushq   %rbp
> > > > > > -       cfi_adjust_cfa_offset(8)
> > > > > > -       cfi_offset(rbp, -32)
> > > > > > +       cfi_adjust_cfa_offset (8)
> > > > > > +       cfi_offset (rbp, -32)
> > > > > >         movq    %rsp, %r13
> > > > > > -       cfi_def_cfa_register(r13)
> > > > > > +       cfi_def_cfa_register (r13)
> > > > > >
> > > > > >         /* Align stack and make room for 2x zmm vectors.  */
> > > > > >         andq    $-64, %rsp
> > > > > > @@ -207,16 +212,17 @@ L(SPECIAL_VALUES_BRANCH):
> > > > > >
> > > > > >         vzeroupper
> > > > > >
> > > > > > -       /* edx has 1s where there was a special value that needs to be handled
> > > > > > -          by a tanhf call.  */
> > > > > > +       /* edx has 1s where there was a special value that needs to be
> > > > > > +          handled by a tanhf call.  */
> > > > > >         movl    %edx, %ebx
> > > > > >  L(SPECIAL_VALUES_LOOP):
> > > > > > -       # LOE rbx rbp r12 r13 r14 r15
> > > > > > -       /* use rbp as index for special value that is saved across calls to
> > > > > > -          tanhf. We technically don't need a callee save register here as offset
> > > > > > -          to rsp is always [0, 56] so we can restore rsp by realigning to 64.
> > > > > > -          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > > > > > -          in the loop. Realigning also costs more code size.  */
> > > > > > +
> > > > > > +       /* use rbp as index for special value that is saved across calls
> > > > > > +          to tanhf. We technically don't need a callee save register
> > > > > > +          here as offset to rsp is always [0, 56] so we can restore
> > > > > > +          rsp by realigning to 64. Essentially the tradeoff is 1 extra
> > > > > > +          save/restore vs 2 extra instructions in the loop. Realigning
> > > > > > +          also costs more code size.  */
> > > > > >         xorl    %ebp, %ebp
> > > > > >         tzcntl  %ebx, %ebp
> > > > > >
> > > > > > @@ -224,203 +230,141 @@ L(SPECIAL_VALUES_LOOP):
> > > > > >         vmovss  64(%rsp, %rbp, 4), %xmm0
> > > > > >         call    tanhf@PLT
> > > > > >
> > > > > > -       /* No good way to avoid the store-forwarding fault this will cause on
> > > > > > -          return. `lfence` avoids the SF fault but at greater cost as it
> > > > > > -          serialized stack/callee save restoration.  */
> > > > > > +       /* No good way to avoid the store-forwarding fault this will
> > > > > > +          cause on return. `lfence` avoids the SF fault but at greater
> > > > > > +          cost as it serialized stack/callee save restoration.  */
> > > > > >         vmovss  %xmm0, (%rsp, %rbp, 4)
> > > > > >
> > > > > > -       blsrl   %ebx, %ebx
> > > > > > +       blsrl   %ebx, %ebx
> > > > > >         jnz     L(SPECIAL_VALUES_LOOP)
> > > > > > -       # LOE r12 r13 r14 r15
> > > > > > +
> > > > > >
> > > > > >         /* All results have been written to (%rsp).  */
> > > > > >         vmovaps (%rsp), %zmm0
> > > > > >         /* Restore rsp.  */
> > > > > >         movq    %r13, %rsp
> > > > > > -       cfi_def_cfa_register(rsp)
> > > > > > +       cfi_def_cfa_register (rsp)
> > > > > >         /* Restore callee save registers.  */
> > > > > >         popq    %rbp
> > > > > > -       cfi_adjust_cfa_offset(-8)
> > > > > > -       cfi_restore(rbp)
> > > > > > +       cfi_adjust_cfa_offset (-8)
> > > > > > +       cfi_restore (rbp)
> > > > > >         popq    %rbx
> > > > > > -       cfi_adjust_cfa_offset(-8)
> > > > > > -       cfi_restore(rbp)
> > > > > > +       cfi_adjust_cfa_offset (-8)
> > > > > > +       cfi_restore (rbp)
> > > > > >         popq    %r13
> > > > > > -       cfi_adjust_cfa_offset(-8)
> > > > > > -       cfi_restore(r13)
> > > > > > +       cfi_adjust_cfa_offset (-8)
> > > > > > +       cfi_restore (r13)
> > > > > >         ret
> > > > > >  END(_ZGVeN16v_tanhf_skx)
> > > > > >
> > > > > > -       .section .rodata, "a"
> > > > > > +       .section .rodata.evex512, "a"
> > > > > >         .align  16
> > > > > > -#ifdef __svml_stanh_data_internal_typedef
> > > > > > -typedef unsigned int VUINT32;
> > > > > > -typedef struct
> > > > > > -       {
> > > > > > -       __declspec(align(4)) VUINT32 _iExpMantMask_UISA[1][1];
> > > > > > -       __declspec(align(4)) VUINT32 _iMinIdxOfsMask_UISA[1][1];
> > > > > > -       __declspec(align(4)) VUINT32 _iMaxIdxMask_UISA[1][1];
> > > > > > -       __declspec(align(4)) VUINT32 _iExpMask[1][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sC_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sC_hi[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP7_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP7_hi[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sSignMask[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP6_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP6_hi[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP5_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP5_hi[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP4_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP4_hi[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP3_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP3_hi[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP2_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP2_hi[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP0_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP0_hi[16][1];
> > > > > > -} __svml_stanh_data_internal;
> > > > > > -#endif
> > > > > > -
> > > > > > -__svml_stanh_data_internal:
> > > > > > -       .align  4
> > > > > > -       /* _iExpMantMask_UISA */
> > > > > > -       .long   0x7fe00000
> > > > > > -
> > > > > > -       .align  4
> > > > > > -       /* _iMinIdxOfsMask_UISA */
> > > > > > -       .long   0x3d400000
> > > > > > -
> > > > > > -       .align  4
> > > > > > -       /* _iMaxIdxMask_UISA */
> > > > > > -       .long   0x03e00000
> > > > > > -
> > > > > > -       .align  4
> > > > > > -       /* _iExpMask */
> > > > > > -       .long   0x7f000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -__svml_stanh_data_internal_al64:
> > > > > > -       .align  64
> > > > > > -       /* _sC_lo */
> > > > > > -       .long   0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
> > > > > > -       .long   0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
> > > > > > -       .long   0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
> > > > > > -       .long   0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sC_hi */
> > > > > > -       .long   0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
> > > > > > -       .long   0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
> > > > > > -       .long   0x40500000, 0x40700000, 0x40900000, 0x40b00000
> > > > > > -       .long   0x40d00000, 0x40f00000, 0x41100000, 0x00000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP7_lo */
> > > > > > -       .long   0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
> > > > > > -       .long   0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
> > > > > > -       .long   0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
> > > > > > -       .long   0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP7_hi */
> > > > > > -       .long   0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
> > > > > > -       .long   0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
> > > > > > -       .long   0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
> > > > > > -       .long   0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
> > > > > >
> > > > > > -       .align  64
> > > > > > -       /* _sSignMask */
> > > > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP6_lo */
> > > > > > -       .long   0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
> > > > > > -       .long   0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
> > > > > > -       .long   0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
> > > > > > -       .long   0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP6_hi */
> > > > > > -       .long   0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
> > > > > > -       .long   0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
> > > > > > -       .long   0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
> > > > > > -       .long   0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP5_lo */
> > > > > > -       .long   0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
> > > > > > -       .long   0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
> > > > > > -       .long   0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
> > > > > > -       .long   0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP5_hi */
> > > > > > -       .long   0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
> > > > > > -       .long   0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
> > > > > > -       .long   0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
> > > > > > -       .long   0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP4_lo */
> > > > > > -       .long   0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
> > > > > > -       .long   0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
> > > > > > -       .long   0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
> > > > > > -       .long   0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP4_hi */
> > > > > > -       .long   0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
> > > > > > -       .long   0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
> > > > > > -       .long   0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
> > > > > > -       .long   0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP3_lo */
> > > > > > -       .long   0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
> > > > > > -       .long   0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
> > > > > > -       .long   0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
> > > > > > -       .long   0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP3_hi */
> > > > > > -       .long   0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
> > > > > > -       .long   0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
> > > > > > -       .long   0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
> > > > > > -       .long   0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP2_lo */
> > > > > > -       .long   0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
> > > > > > -       .long   0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
> > > > > > -       .long   0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
> > > > > > -       .long   0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP2_hi */
> > > > > > -       .long   0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
> > > > > > -       .long   0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
> > > > > > -       .long   0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
> > > > > > -       .long   0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP0_lo */
> > > > > > -       .long   0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
> > > > > > -       .long   0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
> > > > > > -       .long   0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
> > > > > > -       .long   0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP0_hi */
> > > > > > -       .long   0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
> > > > > > -       .long   0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
> > > > > > -       .long   0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
> > > > > > -       .long   0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
> > > > > > +LOCAL_DATA_NAME_UNALIGNED:
> > > > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMantMask_UISA, 0x7fe00000)
> > > > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iMinIdxOfsMask_UISA, 0x3d400000)
> > > > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iMaxIdxMask_UISA, 0x03e00000)
> > > > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMask, 0x7f000000)
> > > > > > +       .type   LOCAL_DATA_NAME_UNALIGNED, @object
> > > > > > +       .size   LOCAL_DATA_NAME_UNALIGNED, .-LOCAL_DATA_NAME_UNALIGNED
> > > > > >
> > > > > >         .align  64
> > > > > > -       .type   __svml_stanh_data_internal_al64, @object
> > > > > > -       .size   __svml_stanh_data_internal_al64, .-__svml_stanh_data_internal_al64
> > > > > > -       .type   __svml_stanh_data_internal, @object
> > > > > > -       .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> > > > > > +LOCAL_DATA_NAME:
> > > > > > +       float_block (LOCAL_DATA_NAME, _sC_lo,
> > > > > > +               0x00000000, 0x3d700000, 0x3d900000, 0x3db00000,
> > > > > > +               0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000,
> > > > > > +               0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000,
> > > > > > +               0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sC_hi,
> > > > > > +               0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000,
> > > > > > +               0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000,
> > > > > > +               0x40500000, 0x40700000, 0x40900000, 0x40b00000,
> > > > > > +               0x40d00000, 0x40f00000, 0x41100000, 0x00000000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP7_lo,
> > > > > > +               0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e,
> > > > > > +               0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57,
> > > > > > +               0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f,
> > > > > > +               0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP7_hi,
> > > > > > +               0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b,
> > > > > > +               0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22,
> > > > > > +               0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950,
> > > > > > +               0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP6_lo,
> > > > > > +               0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756,
> > > > > > +               0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0,
> > > > > > +               0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17,
> > > > > > +               0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP6_hi,
> > > > > > +               0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63,
> > > > > > +               0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66,
> > > > > > +               0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3,
> > > > > > +               0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP5_lo,
> > > > > > +               0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d,
> > > > > > +               0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670,
> > > > > > +               0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405,
> > > > > > +               0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP5_hi,
> > > > > > +               0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9,
> > > > > > +               0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd,
> > > > > > +               0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232,
> > > > > > +               0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP4_lo,
> > > > > > +               0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120,
> > > > > > +               0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a,
> > > > > > +               0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88,
> > > > > > +               0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP4_hi,
> > > > > > +               0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96,
> > > > > > +               0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67,
> > > > > > +               0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9,
> > > > > > +               0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP3_lo,
> > > > > > +               0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d,
> > > > > > +               0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3,
> > > > > > +               0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca,
> > > > > > +               0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP3_hi,
> > > > > > +               0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704,
> > > > > > +               0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06,
> > > > > > +               0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2,
> > > > > > +               0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP2_lo,
> > > > > > +               0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f,
> > > > > > +               0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580,
> > > > > > +               0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92,
> > > > > > +               0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP2_hi,
> > > > > > +               0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2,
> > > > > > +               0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4,
> > > > > > +               0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b,
> > > > > > +               0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP0_lo,
> > > > > > +               0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169,
> > > > > > +               0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984,
> > > > > > +               0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163,
> > > > > > +               0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP0_hi,
> > > > > > +               0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53,
> > > > > > +               0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85,
> > > > > > +               0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0,
> > > > > > +               0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000)
> > > > > > +
> > > > > > +       .type   LOCAL_DATA_NAME, @object
> > > > > > +       .size   LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
> > > > > > --
> > > > > > 2.34.1
> > > > > >
> > > > >
> > > > > The data movement makes the assembler codes much harder to follow.
> > > > > Sunil, what do you think of this patch series?
> > > >
> > > > What do you mean? The change in how we define rodata or the movement
> > > > to multiple files or something else?
> > >
> > > The glibc way to support data files for assembly codes is to define
> > > data in C and use *.sym to generate offsets for assembly files, like
> >
> > I see. Although to be fair the entire SVML codebase bucks that trend.
>
> It is because libmvec codes were generated by ICC and processed
> by scripts.
>
> > Seems like a more dramatic trend to move all the offsets to C.
>
> Since you are adding data by hand, you should do it in C.

Since the plan is to integrate this piece-meal (function by function), I think
it's easier to integrate into a system that matches the rest of the
as-yet-unchanged
files.

Once all of the SVML functions have been updated it will be simple
enough to script
the change from ASM -> C.
Thoughts?
>
> > >
> > > sysdeps/x86/cpu-features-offsets.sym:XSAVE_STATE_SIZE_OFFSET
> > > offsetof (struct cpu_features, xsave_state_size)
> > > sysdeps/x86_64/dl-trampoline.h:  sub
> > > _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip),
> > > %RSP_LP
> > > sysdeps/x86_64/dl-trampoline.h:  sub
> > > _dl_x86_cpu_features+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
> > >
> > > --
> > > H.J.
>
>
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 38+ messages in thread

end of thread, other threads:[~2023-06-27 18:24 UTC | newest]

Thread overview: 38+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-12-07  8:52 [PATCH v1 01/27] x86/fpu: Create helper file for common data macros Noah Goldstein via Libc-alpha
2022-12-07  8:52 ` [PATCH v1 02/27] x86/fpu: Add file for common data used across svml_s_*_avx2.S files Noah Goldstein via Libc-alpha
2022-12-07  8:52 ` [PATCH v1 03/27] x86/fpu: Add file for common data used across svml_s_*_avx512.S files Noah Goldstein via Libc-alpha
2022-12-07  8:52 ` [PATCH v1 04/27] x86/fpu: Add file for common data used across svml_s_*_sse4.S files Noah Goldstein via Libc-alpha
2022-12-07  8:52 ` [PATCH v1 05/27] x86/fpu: Build common data files for svml_s_*_{avx512, avx2, sse4}.S Noah Goldstein via Libc-alpha
2022-12-07  8:52 ` [PATCH v1 06/27] x86/fpu: Update rodata usage in svml_s_tanhf_*_{avx2, sse4} Noah Goldstein via Libc-alpha
2022-12-07  8:52 ` [PATCH v1 07/27] x86/fpu: Update rodata usage in svml_s_tanhf16_core_avx512.S Noah Goldstein via Libc-alpha
2022-12-16 17:05   ` H.J. Lu via Libc-alpha
2022-12-16 18:17     ` Noah Goldstein via Libc-alpha
2022-12-16 21:37       ` H.J. Lu via Libc-alpha
2022-12-16 21:51         ` Noah Goldstein via Libc-alpha
2022-12-16 22:01           ` H.J. Lu via Libc-alpha
2022-12-16 22:54             ` Sunil Pandey via Libc-alpha
2023-06-27 18:23             ` Noah Goldstein via Libc-alpha
2022-12-07  8:52 ` [PATCH v1 08/27] x86/fpu: Update rodata usage in svml_s_atanhf16_core_avx512.S Noah Goldstein via Libc-alpha
2022-12-07  8:52 ` [PATCH v1 09/27] x86/fpu: Update rodata usage in svml_s_atanhf4_core_sse4.S Noah Goldstein via Libc-alpha
2022-12-07  8:52 ` [PATCH v1 10/27] x86/fpu: Update rodata usage in svml_s_atanhf8_core_avx2.S Noah Goldstein via Libc-alpha
2022-12-07  8:52 ` [PATCH v1 11/27] x86/fpu: Optimize svml_s_atanf16_core_avx512.S Noah Goldstein via Libc-alpha
2022-12-07  8:52 ` [PATCH v1 12/27] x86/fpu: Optimize svml_s_atanf4_core_sse4.S Noah Goldstein via Libc-alpha
2022-12-07  8:52 ` [PATCH v1 13/27] x86/fpu: Optimize svml_s_atanf8_core_avx2.S Noah Goldstein via Libc-alpha
2022-12-07  8:52 ` [PATCH v1 14/27] x86/fpu: Add common rodata file for svml_s_tanf_*_{avx512, avx2, sse4}.S Noah Goldstein via Libc-alpha
2022-12-07  8:52 ` [PATCH v1 15/27] x86/fpu: Optimize svml_s_tanf16_core_avx512.S Noah Goldstein via Libc-alpha
2022-12-07  8:52 ` [PATCH v1 16/27] x86/fpu: Optimize svml_s_tanf4_core_sse4.S Noah Goldstein via Libc-alpha
2022-12-07  8:52 ` [PATCH v1 17/27] x86/fpu: Optimize svml_s_tanf8_core_avx2.S Noah Goldstein via Libc-alpha
2022-12-07  8:52 ` [PATCH v1 18/27] x86/fpu: Optimize svml_s_log10f16_core_avx512.S Noah Goldstein via Libc-alpha
2022-12-07  8:52 ` [PATCH v1 19/27] x86/fpu: Optimize svml_s_log10f4_core_sse4.S Noah Goldstein via Libc-alpha
2022-12-07  8:52 ` [PATCH v1 20/27] x86/fpu: Optimize svml_s_log10f8_core_avx2.S Noah Goldstein via Libc-alpha
2022-12-07  8:52 ` [PATCH v1 21/27] x86/fpu: Optimize svml_s_log2f16_core_avx512.S Noah Goldstein via Libc-alpha
2022-12-07  8:52 ` [PATCH v1 22/27] x86/fpu: Optimize svml_s_log2f4_core_sse4.S Noah Goldstein via Libc-alpha
2022-12-07  8:52 ` [PATCH v1 23/27] x86/fpu: Optimize svml_s_log2f8_core_avx2.S Noah Goldstein via Libc-alpha
2022-12-07  8:52 ` [PATCH v1 24/27] x86/fpu: Optimize svml_s_logf16_core_avx512.S Noah Goldstein via Libc-alpha
2022-12-07  8:52 ` [PATCH v1 25/27] x86/fpu: Optimize svml_s_logf4_core_sse4.S Noah Goldstein via Libc-alpha
2022-12-07  8:52 ` [PATCH v1 26/27] x86/fpu: Optimize svml_s_logf8_core_avx2.S Noah Goldstein via Libc-alpha
2022-12-07  8:52 ` [PATCH v1 27/27] x86/fpu: Remove unused svml_s_logf_data.S file Noah Goldstein via Libc-alpha
2022-12-07 23:53 ` [PATCH v1 01/27] x86/fpu: Create helper file for common data macros H.J. Lu via Libc-alpha
2022-12-08  0:13   ` Noah Goldstein via Libc-alpha
2022-12-08  0:22     ` H.J. Lu via Libc-alpha
2022-12-08  0:46       ` Noah Goldstein via Libc-alpha

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).