From mboxrd@z Thu Jan 1 00:00:00 1970
Received: from noahgold-desk.lan (2603-8080-1301-76c6-feb7-1b9b-f2dd-08f7.res6.spectrum.com.
[2603:8080:1301:76c6:feb7:1b9b:f2dd:8f7]) by smtp.gmail.com with ESMTPSA id k17-20020aa7c051000000b0046bd3b366f9sm1931767edo.32.2022.12.07.00.52.55 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 07 Dec 2022 00:52:56 -0800 (PST) To: libc-alpha@sourceware.org Cc: goldstein.w.n@gmail.com, hjl.tools@gmail.com, andrey.kolesov@intel.com, carlos@systemhalted.org Subject: [PATCH v1 09/27] x86/fpu: Update rodata usage in svml_s_atanhf4_core_sse4.S Date: Wed, 7 Dec 2022 00:52:18 -0800 Message-Id: <20221207085236.1424424-9-goldstein.w.n@gmail.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20221207085236.1424424-1-goldstein.w.n@gmail.com> References: <20221207085236.1424424-1-goldstein.w.n@gmail.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , From: Noah Goldstein via Libc-alpha Reply-To: Noah Goldstein Errors-To: libc-alpha-bounces+e=80x24.org@sourceware.org Sender: "Libc-alpha" No changes to the logic, just change how rodata is handled. 1. Define the rodatas using the new macros so they check that the offset is correct. 2. Use common data where applicable. --- .../fpu/multiarch/svml_s_atanhf4_core_sse4.S | 240 ++++++++---------- 1 file changed, 102 insertions(+), 138 deletions(-) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S index 37200b3601..da5744506f 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S @@ -30,46 +30,45 @@ * */ -/* Offsets for data table __svml_satanh_data_internal_avx512. Ordered - by use in the function. On cold-starts this might help the - prefetcher. Possibly a better idea is to interleave start/end so - that the prefetcher is less likely to detect a stream and pull - irrelivant lines into cache. */ -#define sOne 0 -#define SgnMask 16 -#define sTopMask12 32 -#define iBrkValue 48 -#define iOffExpoMask 64 -#define sPoly 80 -#define sLn2 208 -#define TinyRange 224 +#define LOCAL_DATA_NAME __svml_satanh_data_internal +#include "svml_s_common_sse4_rodata_offsets.h" + +/* Offsets for data table __svml_stan_data_internal. */ +#define _Poly_1 0 +#define _Poly_2 16 +#define _Poly_3 32 +#define _Poly_4 48 +#define _Poly_5 64 +#define _Poly_6 80 +#define _Poly_7 96 +#define _TinyRange 112 #include -#define ATANHF_DATA(x) ((x)+__svml_satanh_data_internal) .section .text.sse4, "ax", @progbits ENTRY(_ZGVbN4v_atanhf_sse4) movaps %xmm0, %xmm5 - /* Load constants including One = 1 */ - movups ATANHF_DATA(sOne)(%rip), %xmm4 + /* Load constants including One = 1. */ + movups COMMON_DATA(_OneF)(%rip), %xmm4 movaps %xmm5, %xmm3 - /* Strip off the sign, so treat X as positive until right at the end */ - movups ATANHF_DATA(SgnMask)(%rip), %xmm1 + /* Strip off the sign, so treat X as positive until right at the + end. */ + movups COMMON_DATA(_AbsMask)(%rip), %xmm1 movaps %xmm4, %xmm2 andps %xmm1, %xmm0 movaps %xmm4, %xmm10 - movups ATANHF_DATA(sTopMask12)(%rip), %xmm11 + movups COMMON_DATA(_Neg4096)(%rip), %xmm11 movaps %xmm4, %xmm14 movaps %xmm11, %xmm9 - /* - * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces, - * the upper part UHi being <= 12 bits long. Then we have - * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)). 
- */ + /* Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two + pieces, the upper part UHi being <= 12 bits long. Then we + have: + atanh(X) = 1/2 * log((1 + X) / (1 - X)) + = 1/2 * log1p(V / (UHi + ULo)). */ movaps %xmm0, %xmm6 mulps %xmm5, %xmm3 subps %xmm0, %xmm2 @@ -80,65 +79,61 @@ ENTRY(_ZGVbN4v_atanhf_sse4) andps %xmm2, %xmm9 - /* - * Check whether |X| < 1, in which case we use the main function. - * Otherwise set the rangemask so that the callout will get used. - * Note that this will also use the callout for NaNs since not(NaN < 1). - */ + /* Check whether |X| < 1, in which case we use the main + function. Otherwise set the rangemask so that the callout + will get used. Note that this will also use the callout for + NaNs since not(NaN < 1). */ rcpps %xmm9, %xmm7 subps %xmm9, %xmm2 andps %xmm11, %xmm7 - /* - * Split V as well into upper 12 bits and lower part, so that we can get - * a preliminary quotient estimate without rounding error. - */ + /* Split V as well into upper 12 bits and lower part, so that we + can get a preliminary quotient estimate without rounding + error. */ andps %xmm6, %xmm11 mulps %xmm7, %xmm9 addps %xmm2, %xmm10 subps %xmm11, %xmm6 - /* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */ + /* Hence get initial quotient estimate: + QHi + QLo = R * VHi + R * VLo. */ mulps %xmm7, %xmm11 mulps %xmm7, %xmm10 subps %xmm9, %xmm14 mulps %xmm6, %xmm7 subps %xmm10, %xmm14 - /* Compute D = E + E^2 */ + /* Compute D = E + E^2. */ movaps %xmm14, %xmm13 movaps %xmm4, %xmm8 mulps %xmm14, %xmm13 - /* reduction: compute r,n */ - movdqu ATANHF_DATA(iBrkValue)(%rip), %xmm9 + /* reduction: compute r,n. */ + movdqu COMMON_DATA(_IBrkValue)(%rip), %xmm9 addps %xmm13, %xmm14 - /* - * Compute R * (VHi + VLo) * (1 + E + E^2) - * = R * (VHi + VLo) * (1 + D) - * = QHi + (QHi * D + QLo + QLo * D) - */ + /* Compute R * (VHi + VLo) * (1 + E + E^2) + = R * (VHi + VLo) * (1 + D) + = QHi + (QHi * D + QLo + QLo * D). */ movaps %xmm14, %xmm2 mulps %xmm7, %xmm14 mulps %xmm11, %xmm2 addps %xmm14, %xmm7 - movdqu ATANHF_DATA(iOffExpoMask)(%rip), %xmm12 + movdqu COMMON_DATA(_NotiOffExpoMask)(%rip), %xmm12 movaps %xmm4, %xmm14 - /* Record the sign for eventual reincorporation. */ + /* Record the sign for eventual reincorporation. */ addps %xmm7, %xmm2 - /* - * Now finally accumulate the high and low parts of the - * argument to log1p, H + L, with a final compensated summation. - */ + /* Now finally accumulate the high and low parts of the + argument to log1p, H + L, with a final compensated summation. */ movaps %xmm2, %xmm6 andnps %xmm5, %xmm1 movaps %xmm4, %xmm7 - /* Or the sign bit in with the tiny result to handle atanh(-0) correctly */ + /* Or the sign bit in with the tiny result to handle atanh(-0) + correctly. */ addps %xmm11, %xmm6 maxps %xmm6, %xmm7 minps %xmm6, %xmm8 @@ -149,43 +144,43 @@ ENTRY(_ZGVbN4v_atanhf_sse4) subps %xmm10, %xmm7 psubd %xmm9, %xmm10 addps %xmm8, %xmm7 - pand %xmm10, %xmm12 + pandn %xmm10, %xmm12 psrad $23, %xmm10 cvtdq2ps %xmm10, %xmm13 addps %xmm7, %xmm2 - /* final reconstruction */ + /* final reconstruction. */ pslld $23, %xmm10 paddd %xmm9, %xmm12 psubd %xmm10, %xmm14 - /* polynomial evaluation */ + /* polynomial evaluation. 
*/ subps %xmm4, %xmm12 mulps %xmm14, %xmm2 - movups ATANHF_DATA(sPoly+0)(%rip), %xmm7 + movups LOCAL_DATA(_Poly_1)(%rip), %xmm7 addps %xmm12, %xmm2 mulps %xmm2, %xmm7 - /* Finally, halve the result and reincorporate the sign */ - addps ATANHF_DATA(sPoly+16)(%rip), %xmm7 + /* Finally, halve the result and reincorporate the sign. */ + addps LOCAL_DATA(_Poly_2)(%rip), %xmm7 mulps %xmm2, %xmm7 - addps ATANHF_DATA(sPoly+32)(%rip), %xmm7 + addps LOCAL_DATA(_Poly_3)(%rip), %xmm7 mulps %xmm2, %xmm7 - addps ATANHF_DATA(sPoly+48)(%rip), %xmm7 + addps LOCAL_DATA(_Poly_4)(%rip), %xmm7 mulps %xmm2, %xmm7 - addps ATANHF_DATA(sPoly+64)(%rip), %xmm7 + addps LOCAL_DATA(_Poly_5)(%rip), %xmm7 mulps %xmm2, %xmm7 - addps ATANHF_DATA(sPoly+80)(%rip), %xmm7 + addps LOCAL_DATA(_Poly_6)(%rip), %xmm7 mulps %xmm2, %xmm7 - addps ATANHF_DATA(sPoly+96)(%rip), %xmm7 + addps LOCAL_DATA(_Poly_7)(%rip), %xmm7 mulps %xmm2, %xmm7 - movaps ATANHF_DATA(sPoly+112)(%rip), %xmm6 + movaps COMMON_DATA(_Neg5F)(%rip), %xmm6 addps %xmm6, %xmm7 mulps %xmm2, %xmm7 mulps %xmm2, %xmm7 - mulps ATANHF_DATA(sLn2)(%rip), %xmm13 - /* We can build `sHalf` with `sPoly & sOne`. */ + mulps COMMON_DATA(_Ln2)(%rip), %xmm13 + /* We can build `sHalf` with `_Poly & sOne`. */ andps %xmm4, %xmm6 orps %xmm1, %xmm3 xorps %xmm6, %xmm1 @@ -197,7 +192,7 @@ ENTRY(_ZGVbN4v_atanhf_sse4) /* Finish check of NaNs. */ cmpleps %xmm0, %xmm4 movmskps %xmm4, %edx - cmpltps ATANHF_DATA(TinyRange)(%rip), %xmm0 + cmpltps LOCAL_DATA(_TinyRange)(%rip), %xmm0 andps %xmm0, %xmm3 andnps %xmm1, %xmm0 @@ -206,115 +201,84 @@ ENTRY(_ZGVbN4v_atanhf_sse4) testl %edx, %edx /* Go to special inputs processing branch. */ jne L(SPECIAL_VALUES_BRANCH) - # LOE rbx rbp r12 r13 r14 r15 xmm0 + /* No registers to restore on fast path. */ ret /* Cold case. edx has 1s where there was a special value that needs to be handled by a atanhf call. Optimize for code size - more so than speed here. */ + more so than speed here. */ L(SPECIAL_VALUES_BRANCH): - # LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm5 + /* Stack coming in 16-byte aligned. Set 8-byte misaligned so on - call entry will be 16-byte aligned. */ + call entry will be 16-byte aligned. */ subq $56, %rsp - cfi_def_cfa_offset(64) + cfi_def_cfa_offset (64) movups %xmm0, 24(%rsp) movups %xmm5, 40(%rsp) /* Use rbx/rbp for callee save registers as they get short - encoding for many instructions (as compared with r12/r13). */ + encoding for many instructions (as compared with r12/r13). */ movq %rbx, (%rsp) - cfi_offset(rbx, -64) + cfi_offset (rbx, -64) movq %rbp, 8(%rsp) - cfi_offset(rbp, -56) - /* edx has 1s where there was a special value that needs to be handled - by a tanhf call. */ + cfi_offset (rbp, -56) + /* edx has 1s where there was a special value that needs to be + handled by a tanhf call. */ movl %edx, %ebx L(SPECIAL_VALUES_LOOP): - # LOE rbx rbp r12 r13 r14 r15 - /* use rbp as index for special value that is saved across calls to - tanhf. We technically don't need a callee save register here as offset - to rsp is always [0, 12] so we can restore rsp by realigning to 64. - Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions - in the loop. */ + + /* use rbp as index for special value that is saved across calls + to tanhf. We technically don't need a callee save register + here as offset to rsp is always [0, 12] so we can restore + rsp by realigning to 64. Essentially the tradeoff is 1 extra + save/restore vs 2 extra instructions in the loop. 
*/ xorl %ebp, %ebp bsfl %ebx, %ebp /* Scalar math fucntion call to process special input. */ movss 40(%rsp, %rbp, 4), %xmm0 call atanhf@PLT - /* No good way to avoid the store-forwarding fault this will cause on - return. `lfence` avoids the SF fault but at greater cost as it - serialized stack/callee save restoration. */ + /* No good way to avoid the store-forwarding fault this will + cause on return. `lfence` avoids the SF fault but at greater + cost as it serialized stack/callee save restoration. */ movss %xmm0, 24(%rsp, %rbp, 4) leal -1(%rbx), %eax andl %eax, %ebx jnz L(SPECIAL_VALUES_LOOP) - # LOE r12 r13 r14 r15 + /* All results have been written to 24(%rsp). */ movups 24(%rsp), %xmm0 movq (%rsp), %rbx - cfi_restore(rbx) + cfi_restore (rbx) movq 8(%rsp), %rbp - cfi_restore(rbp) + cfi_restore (rbp) addq $56, %rsp - cfi_def_cfa_offset(8) + cfi_def_cfa_offset (8) ret END(_ZGVbN4v_atanhf_sse4) - .section .rodata, "a" + .section .rodata.sse4, "a" .align 16 -#ifdef __svml_satanh_data_internal_typedef -typedef unsigned int VUINT32; -typedef struct{ - __declspec(align(16)) VUINT32 sOne[4][1]; - __declspec(align(16)) VUINT32 SgnMask[4][1]; - __declspec(align(16)) VUINT32 sTopMask12[4][1]; - __declspec(align(16)) VUINT32 iBrkValue[4][1]; - __declspec(align(16)) VUINT32 iOffExpoMask[4][1]; - __declspec(align(16)) VUINT32 sPoly[8][4][1]; - __declspec(align(16)) VUINT32 sLn2[4][1]; - __declspec(align(16)) VUINT32 TinyRange[4][1]; -} __svml_satanh_data_internal; -#endif - -__svml_satanh_data_internal: - /* sOne = SP 1.0 */ - .align 16 - .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 - /* SgnMask */ - .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff - /* sTopMask12 */ - .align 16 - .long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000 - /* iBrkValue = SP 2/3 */ - .align 16 - .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab - /* iOffExpoMask = SP significand mask ==*/ - .align 16 - .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff - - /* sPoly[] = SP polynomial */ - .align 16 - .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */ - .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */ - .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */ - .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */ - .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */ - .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */ - .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */ - .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */ - - /* sLn2 = SP ln(2) */ - .align 16 - .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218 - /* TinyRange */ - .align 16 - .long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000 - .align 16 - .type __svml_satanh_data_internal, @object - .size __svml_satanh_data_internal, .-__svml_satanh_data_internal +LOCAL_DATA_NAME: + /* _Poly[] = SP polynomial. */ + /* 1.3820238411426544189453125e-01 P7. */ + DATA_VEC (LOCAL_DATA_NAME, _Poly_1, 0x3e0d84ed) + /* -1.5122179687023162841796875e-01 P6. */ + DATA_VEC (LOCAL_DATA_NAME, _Poly_2, 0xbe1ad9e3) + /* 1.4042308926582336425781250e-01 P5. */ + DATA_VEC (LOCAL_DATA_NAME, _Poly_3, 0x3e0fcb12) + /* -1.6472326219081878662109375e-01 P4. 
*/ + DATA_VEC (LOCAL_DATA_NAME, _Poly_4, 0xbe28ad37) + /* 2.0007920265197753906250000e-01 P3. */ + DATA_VEC (LOCAL_DATA_NAME, _Poly_5, 0x3e4ce190) + /* -2.5004237890243530273437500e-01 P2. */ + DATA_VEC (LOCAL_DATA_NAME, _Poly_6, 0xbe80058e) + /* 3.3333265781402587890625000e-01 P1. */ + DATA_VEC (LOCAL_DATA_NAME, _Poly_7, 0x3eaaaa94) + DATA_VEC (LOCAL_DATA_NAME, _TinyRange, 0x0C000000) + .type LOCAL_DATA_NAME, @object + .size LOCAL_DATA_NAME, .-LOCAL_DATA_NAME -- 2.34.1
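The offset-checking idea behind the new rodata macros can be illustrated with a small C analogue. This is a sketch only: LOCAL_DATA, COMMON_DATA and DATA_VEC are assembler-side macros defined elsewhere in this series (svml_s_common_sse4_rodata_offsets.h and friends), and the struct, field names and checks below are assumptions introduced purely to show how a hard-coded offset can be validated against the actual data layout at build time.

    /* Sketch only: a C analogue of "macros that check the offset is
       correct".  The real checks happen in assembler macros; this struct
       and its _Static_asserts are hypothetical, not part of the patch.  */
    #include <stddef.h>

    typedef struct
    {
      /* _Poly_1 .. _Poly_7: one 16-byte vector (4 floats) per coefficient.  */
      float poly[7][4];
      /* _TinyRange.  */
      float tiny_range[4];
    } satanh_local_data_t;

    /* Offsets as hard-coded by the assembly...  */
    #define _Poly_1     0
    #define _Poly_7     96
    #define _TinyRange  112

    /* ...and build-time checks that they match the layout, so a stale
       offset fails to compile instead of silently loading the wrong
       constant.  */
    _Static_assert (offsetof (satanh_local_data_t, poly[0]) == _Poly_1,
                    "_Poly_1 offset is stale");
    _Static_assert (offsetof (satanh_local_data_t, poly[6]) == _Poly_7,
                    "_Poly_7 offset is stale");
    _Static_assert (offsetof (satanh_local_data_t, tiny_range) == _TinyRange,
                    "_TinyRange offset is stale");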
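For reference, the math the vector body implements (and that its comments describe) reduces atanh to log1p: atanh(x) = 1/2 * log((1 + x) / (1 - x)) = 1/2 * log1p(2x / (1 - x)). Below is a minimal scalar model in C, illustrative only; the real kernel splits 2x and 1 - x into 12-bit high/low pieces, evaluates log1p with the degree-7 polynomial stored above, and branches to the scalar atanhf callout for |x| >= 1 and NaN, none of which the sketch reproduces.

    /* Scalar model of the vector fast path, for illustration only.  Uses
       libm's log1pf instead of the in-line reduction + polynomial, and
       omits the special-value (|x| >= 1, NaN) callout handled by the cold
       branch.  */
    #include <math.h>

    static float
    atanhf_model (float x)
    {
      float ax = fabsf (x);             /* Work on |x|; sign goes back at the end.  */
      float v = 2.0f * ax;              /* V = 2 * X.  */
      float u = 1.0f - ax;              /* UHi + ULo in the real code.  */
      float r = 0.5f * log1pf (v / u);  /* atanh(|x|) = 1/2 * log1p(V / U).  */
      return copysignf (r, x);          /* Reincorporate the sign.  */
    }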