From: Noah Goldstein via Libc-alpha <libc-alpha@sourceware.org>
To: libc-alpha@sourceware.org
Subject: [PATCH v2 3/5] x86_64: Add sse4_1 optimized bcmp implementation in memcmp-sse4.S
Date: Tue, 14 Sep 2021 01:30:37 -0500 [thread overview]
Message-ID: <20210914063039.1126196-3-goldstein.w.n@gmail.com> (raw)
In-Reply-To: <20210914063039.1126196-1-goldstein.w.n@gmail.com>
No bug. This commit does not modify the existing memcmp
implementation. It just adds bcmp ifdefs to skip the obvious cases
where computing the exact positive/negative return value required by
memcmp is unnecessary (bcmp only needs zero vs. non-zero).
test-memcmp, test-bcmp, and test-wmemcmp are all passing.
---
sysdeps/x86_64/multiarch/memcmp-sse4.S | 761 ++++++++++++++++++++++++-
1 file changed, 746 insertions(+), 15 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
index b82adcd5fa..b9528ed58e 100644
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
@@ -72,7 +72,11 @@ L(79bytesormore):
movdqu (%rdi), %xmm2
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(16bytesin256)
+# endif
mov %rsi, %rcx
and $-16, %rsi
add $16, %rsi
@@ -91,34 +95,58 @@ L(less128bytes):
movdqu (%rdi), %xmm2
pxor (%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(16bytesin256)
+# endif
movdqu 16(%rdi), %xmm2
pxor 16(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(32bytesin256)
+# endif
movdqu 32(%rdi), %xmm2
pxor 32(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(48bytesin256)
+# endif
movdqu 48(%rdi), %xmm2
pxor 48(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(64bytesin256)
+# endif
cmp $32, %rdx
jb L(less32bytesin64)
movdqu 64(%rdi), %xmm2
pxor 64(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(80bytesin256)
+# endif
movdqu 80(%rdi), %xmm2
pxor 80(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(96bytesin256)
+# endif
sub $32, %rdx
add $32, %rdi
add $32, %rsi
@@ -140,42 +168,74 @@ L(less256bytes):
movdqu (%rdi), %xmm2
pxor (%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(16bytesin256)
+# endif
movdqu 16(%rdi), %xmm2
pxor 16(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(32bytesin256)
+# endif
movdqu 32(%rdi), %xmm2
pxor 32(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(48bytesin256)
+# endif
movdqu 48(%rdi), %xmm2
pxor 48(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(64bytesin256)
+# endif
movdqu 64(%rdi), %xmm2
pxor 64(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(80bytesin256)
+# endif
movdqu 80(%rdi), %xmm2
pxor 80(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(96bytesin256)
+# endif
movdqu 96(%rdi), %xmm2
pxor 96(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(112bytesin256)
+# endif
movdqu 112(%rdi), %xmm2
pxor 112(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(128bytesin256)
+# endif
add $128, %rsi
add $128, %rdi
@@ -189,12 +249,20 @@ L(less256bytes):
movdqu (%rdi), %xmm2
pxor (%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(16bytesin256)
+# endif
movdqu 16(%rdi), %xmm2
pxor 16(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(32bytesin256)
+# endif
sub $32, %rdx
add $32, %rdi
add $32, %rsi
@@ -208,82 +276,146 @@ L(less512bytes):
movdqu (%rdi), %xmm2
pxor (%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(16bytesin256)
+# endif
movdqu 16(%rdi), %xmm2
pxor 16(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(32bytesin256)
+# endif
movdqu 32(%rdi), %xmm2
pxor 32(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(48bytesin256)
+# endif
movdqu 48(%rdi), %xmm2
pxor 48(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(64bytesin256)
+# endif
movdqu 64(%rdi), %xmm2
pxor 64(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(80bytesin256)
+# endif
movdqu 80(%rdi), %xmm2
pxor 80(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(96bytesin256)
+# endif
movdqu 96(%rdi), %xmm2
pxor 96(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(112bytesin256)
+# endif
movdqu 112(%rdi), %xmm2
pxor 112(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(128bytesin256)
+# endif
movdqu 128(%rdi), %xmm2
pxor 128(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(144bytesin256)
+# endif
movdqu 144(%rdi), %xmm2
pxor 144(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(160bytesin256)
+# endif
movdqu 160(%rdi), %xmm2
pxor 160(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(176bytesin256)
+# endif
movdqu 176(%rdi), %xmm2
pxor 176(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(192bytesin256)
+# endif
movdqu 192(%rdi), %xmm2
pxor 192(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(208bytesin256)
+# endif
movdqu 208(%rdi), %xmm2
pxor 208(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(224bytesin256)
+# endif
movdqu 224(%rdi), %xmm2
pxor 224(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(240bytesin256)
+# endif
movdqu 240(%rdi), %xmm2
pxor 240(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(256bytesin256)
+# endif
add $256, %rsi
add $256, %rdi
@@ -300,12 +432,20 @@ L(less512bytes):
movdqu (%rdi), %xmm2
pxor (%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(16bytesin256)
+# endif
movdqu 16(%rdi), %xmm2
pxor 16(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(32bytesin256)
+# endif
sub $32, %rdx
add $32, %rdi
add $32, %rsi
@@ -346,7 +486,11 @@ L(64bytesormore_loop):
por %xmm5, %xmm1
ptest %xmm1, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(64bytesormore_loop_end)
+# endif
add $64, %rsi
add $64, %rdi
sub $64, %rdx
@@ -380,7 +524,11 @@ L(L2_L3_unaligned_128bytes_loop):
por %xmm5, %xmm1
ptest %xmm1, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(64bytesormore_loop_end)
+# endif
add $64, %rsi
add $64, %rdi
sub $64, %rdx
@@ -404,34 +552,58 @@ L(less128bytesin2aligned):
movdqa (%rdi), %xmm2
pxor (%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(16bytesin256)
+# endif
movdqa 16(%rdi), %xmm2
pxor 16(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(32bytesin256)
+# endif
movdqa 32(%rdi), %xmm2
pxor 32(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(48bytesin256)
+# endif
movdqa 48(%rdi), %xmm2
pxor 48(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(64bytesin256)
+# endif
cmp $32, %rdx
jb L(less32bytesin64in2alinged)
movdqa 64(%rdi), %xmm2
pxor 64(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(80bytesin256)
+# endif
movdqa 80(%rdi), %xmm2
pxor 80(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(96bytesin256)
+# endif
sub $32, %rdx
add $32, %rdi
add $32, %rsi
@@ -454,42 +626,74 @@ L(less256bytesin2alinged):
movdqa (%rdi), %xmm2
pxor (%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(16bytesin256)
+# endif
movdqa 16(%rdi), %xmm2
pxor 16(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(32bytesin256)
+# endif
movdqa 32(%rdi), %xmm2
pxor 32(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(48bytesin256)
+# endif
movdqa 48(%rdi), %xmm2
pxor 48(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(64bytesin256)
+# endif
movdqa 64(%rdi), %xmm2
pxor 64(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(80bytesin256)
+# endif
movdqa 80(%rdi), %xmm2
pxor 80(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(96bytesin256)
+# endif
movdqa 96(%rdi), %xmm2
pxor 96(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(112bytesin256)
+# endif
movdqa 112(%rdi), %xmm2
pxor 112(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(128bytesin256)
+# endif
add $128, %rsi
add $128, %rdi
@@ -503,12 +707,20 @@ L(less256bytesin2alinged):
movdqu (%rdi), %xmm2
pxor (%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(16bytesin256)
+# endif
movdqu 16(%rdi), %xmm2
pxor 16(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(32bytesin256)
+# endif
sub $32, %rdx
add $32, %rdi
add $32, %rsi
@@ -524,82 +736,146 @@ L(256bytesormorein2aligned):
movdqa (%rdi), %xmm2
pxor (%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(16bytesin256)
+# endif
movdqa 16(%rdi), %xmm2
pxor 16(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(32bytesin256)
+# endif
movdqa 32(%rdi), %xmm2
pxor 32(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(48bytesin256)
+# endif
movdqa 48(%rdi), %xmm2
pxor 48(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(64bytesin256)
+# endif
movdqa 64(%rdi), %xmm2
pxor 64(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(80bytesin256)
+# endif
movdqa 80(%rdi), %xmm2
pxor 80(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(96bytesin256)
+# endif
movdqa 96(%rdi), %xmm2
pxor 96(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(112bytesin256)
+# endif
movdqa 112(%rdi), %xmm2
pxor 112(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(128bytesin256)
+# endif
movdqa 128(%rdi), %xmm2
pxor 128(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(144bytesin256)
+# endif
movdqa 144(%rdi), %xmm2
pxor 144(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(160bytesin256)
+# endif
movdqa 160(%rdi), %xmm2
pxor 160(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(176bytesin256)
+# endif
movdqa 176(%rdi), %xmm2
pxor 176(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(192bytesin256)
+# endif
movdqa 192(%rdi), %xmm2
pxor 192(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(208bytesin256)
+# endif
movdqa 208(%rdi), %xmm2
pxor 208(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(224bytesin256)
+# endif
movdqa 224(%rdi), %xmm2
pxor 224(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(240bytesin256)
+# endif
movdqa 240(%rdi), %xmm2
pxor 240(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(256bytesin256)
+# endif
add $256, %rsi
add $256, %rdi
@@ -616,12 +892,20 @@ L(256bytesormorein2aligned):
movdqa (%rdi), %xmm2
pxor (%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(16bytesin256)
+# endif
movdqa 16(%rdi), %xmm2
pxor 16(%rsi), %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(32bytesin256)
+# endif
sub $32, %rdx
add $32, %rdi
add $32, %rsi
@@ -663,7 +947,11 @@ L(64bytesormore_loopin2aligned):
por %xmm5, %xmm1
ptest %xmm1, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(64bytesormore_loop_end)
+# endif
add $64, %rsi
add $64, %rdi
sub $64, %rdx
@@ -697,7 +985,11 @@ L(L2_L3_aligned_128bytes_loop):
por %xmm5, %xmm1
ptest %xmm1, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(64bytesormore_loop_end)
+# endif
add $64, %rsi
add $64, %rdi
sub $64, %rdx
@@ -708,7 +1000,7 @@ L(L2_L3_aligned_128bytes_loop):
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
+# ifndef USE_AS_BCMP
.p2align 4
L(64bytesormore_loop_end):
add $16, %rdi
@@ -791,17 +1083,29 @@ L(32bytesin256):
L(16bytesin256):
add $16, %rdi
add $16, %rsi
+# endif
L(16bytes):
mov -16(%rdi), %rax
mov -16(%rsi), %rcx
cmp %rax, %rcx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin8bytes)
+# endif
L(8bytes):
mov -8(%rdi), %rax
mov -8(%rsi), %rcx
+# ifdef USE_AS_BCMP
+ sub %rcx, %rax
+ mov %rax, %rcx
+ shr $32, %rcx
+ or %ecx, %eax
+# else
cmp %rax, %rcx
jne L(diffin8bytes)
xor %eax, %eax
+# endif
ret
.p2align 4
@@ -809,16 +1113,26 @@ L(12bytes):
mov -12(%rdi), %rax
mov -12(%rsi), %rcx
cmp %rax, %rcx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin8bytes)
+# endif
L(4bytes):
mov -4(%rsi), %ecx
-# ifndef USE_AS_WMEMCMP
+# ifdef USE_AS_BCMP
mov -4(%rdi), %eax
- cmp %eax, %ecx
+ sub %ecx, %eax
+ ret
# else
+# ifndef USE_AS_WMEMCMP
+ mov -4(%rdi), %eax
+ cmp %eax, %ecx
+# else
cmp -4(%rdi), %ecx
-# endif
+# endif
jne L(diffin4bytes)
+# endif
L(0bytes):
xor %eax, %eax
ret
@@ -832,31 +1146,51 @@ L(65bytes):
mov $-65, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(49bytes):
movdqu -49(%rdi), %xmm1
movdqu -49(%rsi), %xmm2
mov $-49, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(33bytes):
movdqu -33(%rdi), %xmm1
movdqu -33(%rsi), %xmm2
mov $-33, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(17bytes):
mov -17(%rdi), %rax
mov -17(%rsi), %rcx
cmp %rax, %rcx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin8bytes)
+# endif
L(9bytes):
mov -9(%rdi), %rax
mov -9(%rsi), %rcx
cmp %rax, %rcx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin8bytes)
+# endif
movzbl -1(%rdi), %eax
movzbl -1(%rsi), %edx
sub %edx, %eax
@@ -867,12 +1201,23 @@ L(13bytes):
mov -13(%rdi), %rax
mov -13(%rsi), %rcx
cmp %rax, %rcx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin8bytes)
+# endif
mov -8(%rdi), %rax
mov -8(%rsi), %rcx
+# ifdef USE_AS_BCMP
+ sub %rcx, %rax
+ mov %rax, %rcx
+ shr $32, %rcx
+ or %ecx, %eax
+# else
cmp %rax, %rcx
jne L(diffin8bytes)
xor %eax, %eax
+# endif
ret
.p2align 4
@@ -880,7 +1225,11 @@ L(5bytes):
mov -5(%rdi), %eax
mov -5(%rsi), %ecx
cmp %eax, %ecx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin4bytes)
+# endif
movzbl -1(%rdi), %eax
movzbl -1(%rsi), %edx
sub %edx, %eax
@@ -893,37 +1242,59 @@ L(66bytes):
mov $-66, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(50bytes):
movdqu -50(%rdi), %xmm1
movdqu -50(%rsi), %xmm2
mov $-50, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(34bytes):
movdqu -34(%rdi), %xmm1
movdqu -34(%rsi), %xmm2
mov $-34, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(18bytes):
mov -18(%rdi), %rax
mov -18(%rsi), %rcx
cmp %rax, %rcx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin8bytes)
+# endif
L(10bytes):
mov -10(%rdi), %rax
mov -10(%rsi), %rcx
cmp %rax, %rcx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin8bytes)
+# endif
movzwl -2(%rdi), %eax
movzwl -2(%rsi), %ecx
+# ifndef USE_AS_BCMP
cmp %cl, %al
jne L(end)
and $0xffff, %eax
and $0xffff, %ecx
+# endif
sub %ecx, %eax
ret
@@ -932,12 +1303,23 @@ L(14bytes):
mov -14(%rdi), %rax
mov -14(%rsi), %rcx
cmp %rax, %rcx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin8bytes)
+# endif
mov -8(%rdi), %rax
mov -8(%rsi), %rcx
+# ifdef USE_AS_BCMP
+ sub %rcx, %rax
+ mov %rax, %rcx
+ shr $32, %rcx
+ or %ecx, %eax
+# else
cmp %rax, %rcx
jne L(diffin8bytes)
xor %eax, %eax
+# endif
ret
.p2align 4
@@ -945,14 +1327,20 @@ L(6bytes):
mov -6(%rdi), %eax
mov -6(%rsi), %ecx
cmp %eax, %ecx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin4bytes)
+# endif
L(2bytes):
movzwl -2(%rsi), %ecx
movzwl -2(%rdi), %eax
+# ifndef USE_AS_BCMP
cmp %cl, %al
jne L(end)
and $0xffff, %eax
and $0xffff, %ecx
+# endif
sub %ecx, %eax
ret
@@ -963,36 +1351,60 @@ L(67bytes):
mov $-67, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(51bytes):
movdqu -51(%rdi), %xmm2
movdqu -51(%rsi), %xmm1
mov $-51, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(35bytes):
movdqu -35(%rsi), %xmm1
movdqu -35(%rdi), %xmm2
mov $-35, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(19bytes):
mov -19(%rdi), %rax
mov -19(%rsi), %rcx
cmp %rax, %rcx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin8bytes)
+# endif
L(11bytes):
mov -11(%rdi), %rax
mov -11(%rsi), %rcx
cmp %rax, %rcx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin8bytes)
+# endif
mov -4(%rdi), %eax
mov -4(%rsi), %ecx
+# ifdef USE_AS_BCMP
+ sub %ecx, %eax
+# else
cmp %eax, %ecx
jne L(diffin4bytes)
xor %eax, %eax
+# endif
ret
.p2align 4
@@ -1000,12 +1412,23 @@ L(15bytes):
mov -15(%rdi), %rax
mov -15(%rsi), %rcx
cmp %rax, %rcx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin8bytes)
+# endif
mov -8(%rdi), %rax
mov -8(%rsi), %rcx
+# ifdef USE_AS_BCMP
+ sub %rcx, %rax
+ mov %rax, %rcx
+ shr $32, %rcx
+ or %ecx, %eax
+# else
cmp %rax, %rcx
jne L(diffin8bytes)
xor %eax, %eax
+# endif
ret
.p2align 4
@@ -1013,12 +1436,20 @@ L(7bytes):
mov -7(%rdi), %eax
mov -7(%rsi), %ecx
cmp %eax, %ecx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin4bytes)
+# endif
mov -4(%rdi), %eax
mov -4(%rsi), %ecx
+# ifdef USE_AS_BCMP
+ sub %ecx, %eax
+# else
cmp %eax, %ecx
jne L(diffin4bytes)
xor %eax, %eax
+# endif
ret
.p2align 4
@@ -1026,7 +1457,11 @@ L(3bytes):
movzwl -3(%rdi), %eax
movzwl -3(%rsi), %ecx
cmp %eax, %ecx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin2bytes)
+# endif
L(1bytes):
movzbl -1(%rdi), %eax
movzbl -1(%rsi), %ecx
@@ -1041,38 +1476,58 @@ L(68bytes):
mov $-68, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(52bytes):
movdqu -52(%rdi), %xmm2
movdqu -52(%rsi), %xmm1
mov $-52, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(36bytes):
movdqu -36(%rdi), %xmm2
movdqu -36(%rsi), %xmm1
mov $-36, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(20bytes):
movdqu -20(%rdi), %xmm2
movdqu -20(%rsi), %xmm1
mov $-20, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
mov -4(%rsi), %ecx
-
-# ifndef USE_AS_WMEMCMP
+# ifdef USE_AS_BCMP
mov -4(%rdi), %eax
- cmp %eax, %ecx
+ sub %ecx, %eax
# else
+# ifndef USE_AS_WMEMCMP
+ mov -4(%rdi), %eax
+ cmp %eax, %ecx
+# else
cmp -4(%rdi), %ecx
-# endif
+# endif
jne L(diffin4bytes)
xor %eax, %eax
+# endif
ret
# ifndef USE_AS_WMEMCMP
@@ -1084,32 +1539,52 @@ L(69bytes):
mov $-69, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(53bytes):
movdqu -53(%rsi), %xmm1
movdqu -53(%rdi), %xmm2
mov $-53, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(37bytes):
movdqu -37(%rsi), %xmm1
movdqu -37(%rdi), %xmm2
mov $-37, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(21bytes):
movdqu -21(%rsi), %xmm1
movdqu -21(%rdi), %xmm2
mov $-21, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
mov -8(%rdi), %rax
mov -8(%rsi), %rcx
cmp %rax, %rcx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin8bytes)
+# endif
xor %eax, %eax
ret
@@ -1120,32 +1595,52 @@ L(70bytes):
mov $-70, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(54bytes):
movdqu -54(%rsi), %xmm1
movdqu -54(%rdi), %xmm2
mov $-54, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(38bytes):
movdqu -38(%rsi), %xmm1
movdqu -38(%rdi), %xmm2
mov $-38, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(22bytes):
movdqu -22(%rsi), %xmm1
movdqu -22(%rdi), %xmm2
mov $-22, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
mov -8(%rdi), %rax
mov -8(%rsi), %rcx
cmp %rax, %rcx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin8bytes)
+# endif
xor %eax, %eax
ret
@@ -1156,32 +1651,52 @@ L(71bytes):
mov $-71, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(55bytes):
movdqu -55(%rdi), %xmm2
movdqu -55(%rsi), %xmm1
mov $-55, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(39bytes):
movdqu -39(%rdi), %xmm2
movdqu -39(%rsi), %xmm1
mov $-39, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(23bytes):
movdqu -23(%rdi), %xmm2
movdqu -23(%rsi), %xmm1
mov $-23, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
mov -8(%rdi), %rax
mov -8(%rsi), %rcx
cmp %rax, %rcx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin8bytes)
+# endif
xor %eax, %eax
ret
# endif
@@ -1193,33 +1708,53 @@ L(72bytes):
mov $-72, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(56bytes):
movdqu -56(%rdi), %xmm2
movdqu -56(%rsi), %xmm1
mov $-56, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(40bytes):
movdqu -40(%rdi), %xmm2
movdqu -40(%rsi), %xmm1
mov $-40, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(24bytes):
movdqu -24(%rdi), %xmm2
movdqu -24(%rsi), %xmm1
mov $-24, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
mov -8(%rsi), %rcx
mov -8(%rdi), %rax
cmp %rax, %rcx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin8bytes)
+# endif
xor %eax, %eax
ret
@@ -1232,32 +1767,52 @@ L(73bytes):
mov $-73, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(57bytes):
movdqu -57(%rdi), %xmm2
movdqu -57(%rsi), %xmm1
mov $-57, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(41bytes):
movdqu -41(%rdi), %xmm2
movdqu -41(%rsi), %xmm1
mov $-41, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(25bytes):
movdqu -25(%rdi), %xmm2
movdqu -25(%rsi), %xmm1
mov $-25, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
mov -9(%rdi), %rax
mov -9(%rsi), %rcx
cmp %rax, %rcx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin8bytes)
+# endif
movzbl -1(%rdi), %eax
movzbl -1(%rsi), %ecx
sub %ecx, %eax
@@ -1270,35 +1825,60 @@ L(74bytes):
mov $-74, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(58bytes):
movdqu -58(%rdi), %xmm2
movdqu -58(%rsi), %xmm1
mov $-58, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(42bytes):
movdqu -42(%rdi), %xmm2
movdqu -42(%rsi), %xmm1
mov $-42, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(26bytes):
movdqu -26(%rdi), %xmm2
movdqu -26(%rsi), %xmm1
mov $-26, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
mov -10(%rdi), %rax
mov -10(%rsi), %rcx
cmp %rax, %rcx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin8bytes)
+# endif
movzwl -2(%rdi), %eax
movzwl -2(%rsi), %ecx
+# ifdef USE_AS_BCMP
+ sub %ecx, %eax
+ ret
+# else
jmp L(diffin2bytes)
+# endif
.p2align 4
L(75bytes):
@@ -1307,37 +1887,61 @@ L(75bytes):
mov $-75, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(59bytes):
movdqu -59(%rdi), %xmm2
movdqu -59(%rsi), %xmm1
mov $-59, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(43bytes):
movdqu -43(%rdi), %xmm2
movdqu -43(%rsi), %xmm1
mov $-43, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(27bytes):
movdqu -27(%rdi), %xmm2
movdqu -27(%rsi), %xmm1
mov $-27, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
mov -11(%rdi), %rax
mov -11(%rsi), %rcx
cmp %rax, %rcx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin8bytes)
+# endif
mov -4(%rdi), %eax
mov -4(%rsi), %ecx
+# ifdef USE_AS_BCMP
+ sub %ecx, %eax
+# else
cmp %eax, %ecx
jne L(diffin4bytes)
xor %eax, %eax
+# endif
ret
# endif
.p2align 4
@@ -1347,41 +1951,66 @@ L(76bytes):
mov $-76, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(60bytes):
movdqu -60(%rdi), %xmm2
movdqu -60(%rsi), %xmm1
mov $-60, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(44bytes):
movdqu -44(%rdi), %xmm2
movdqu -44(%rsi), %xmm1
mov $-44, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(28bytes):
movdqu -28(%rdi), %xmm2
movdqu -28(%rsi), %xmm1
mov $-28, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
mov -12(%rdi), %rax
mov -12(%rsi), %rcx
cmp %rax, %rcx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin8bytes)
+# endif
mov -4(%rsi), %ecx
-# ifndef USE_AS_WMEMCMP
+# ifdef USE_AS_BCMP
mov -4(%rdi), %eax
- cmp %eax, %ecx
+ sub %ecx, %eax
# else
+# ifndef USE_AS_WMEMCMP
+ mov -4(%rdi), %eax
+ cmp %eax, %ecx
+# else
cmp -4(%rdi), %ecx
-# endif
+# endif
jne L(diffin4bytes)
xor %eax, %eax
+# endif
ret
# ifndef USE_AS_WMEMCMP
@@ -1393,38 +2022,62 @@ L(77bytes):
mov $-77, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(61bytes):
movdqu -61(%rdi), %xmm2
movdqu -61(%rsi), %xmm1
mov $-61, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(45bytes):
movdqu -45(%rdi), %xmm2
movdqu -45(%rsi), %xmm1
mov $-45, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(29bytes):
movdqu -29(%rdi), %xmm2
movdqu -29(%rsi), %xmm1
mov $-29, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
mov -13(%rdi), %rax
mov -13(%rsi), %rcx
cmp %rax, %rcx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin8bytes)
+# endif
mov -8(%rdi), %rax
mov -8(%rsi), %rcx
cmp %rax, %rcx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin8bytes)
+# endif
xor %eax, %eax
ret
@@ -1435,36 +2088,60 @@ L(78bytes):
mov $-78, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(62bytes):
movdqu -62(%rdi), %xmm2
movdqu -62(%rsi), %xmm1
mov $-62, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(46bytes):
movdqu -46(%rdi), %xmm2
movdqu -46(%rsi), %xmm1
mov $-46, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(30bytes):
movdqu -30(%rdi), %xmm2
movdqu -30(%rsi), %xmm1
mov $-30, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
mov -14(%rdi), %rax
mov -14(%rsi), %rcx
cmp %rax, %rcx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin8bytes)
+# endif
mov -8(%rdi), %rax
mov -8(%rsi), %rcx
cmp %rax, %rcx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin8bytes)
+# endif
xor %eax, %eax
ret
@@ -1475,36 +2152,60 @@ L(79bytes):
mov $-79, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(63bytes):
movdqu -63(%rdi), %xmm2
movdqu -63(%rsi), %xmm1
mov $-63, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(47bytes):
movdqu -47(%rdi), %xmm2
movdqu -47(%rsi), %xmm1
mov $-47, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(31bytes):
movdqu -31(%rdi), %xmm2
movdqu -31(%rsi), %xmm1
mov $-31, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
mov -15(%rdi), %rax
mov -15(%rsi), %rcx
cmp %rax, %rcx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin8bytes)
+# endif
mov -8(%rdi), %rax
mov -8(%rsi), %rcx
cmp %rax, %rcx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin8bytes)
+# endif
xor %eax, %eax
ret
# endif
@@ -1515,37 +2216,58 @@ L(64bytes):
mov $-64, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(48bytes):
movdqu -48(%rdi), %xmm2
movdqu -48(%rsi), %xmm1
mov $-48, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
L(32bytes):
movdqu -32(%rdi), %xmm2
movdqu -32(%rsi), %xmm1
mov $-32, %dl
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
+# ifdef USE_AS_BCMP
+ jnc L(return_not_equals)
+# else
jnc L(less16bytes)
+# endif
mov -16(%rdi), %rax
mov -16(%rsi), %rcx
cmp %rax, %rcx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin8bytes)
+# endif
mov -8(%rdi), %rax
mov -8(%rsi), %rcx
cmp %rax, %rcx
+# ifdef USE_AS_BCMP
+ jne L(return_not_equals)
+# else
jne L(diffin8bytes)
+# endif
xor %eax, %eax
ret
/*
* Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block.
*/
+# ifndef USE_AS_BCMP
.p2align 3
L(less16bytes):
movsbq %dl, %rdx
@@ -1561,16 +2283,16 @@ L(diffin8bytes):
shr $32, %rcx
shr $32, %rax
-# ifdef USE_AS_WMEMCMP
+# ifdef USE_AS_WMEMCMP
/* for wmemcmp */
cmp %eax, %ecx
jne L(diffin4bytes)
xor %eax, %eax
ret
-# endif
+# endif
L(diffin4bytes):
-# ifndef USE_AS_WMEMCMP
+# ifndef USE_AS_WMEMCMP
cmp %cx, %ax
jne L(diffin2bytes)
shr $16, %ecx
@@ -1589,7 +2311,7 @@ L(end):
and $0xff, %ecx
sub %ecx, %eax
ret
-# else
+# else
/* for wmemcmp */
mov $1, %eax
@@ -1601,6 +2323,15 @@ L(end):
L(nequal_bigger):
ret
+L(unreal_case):
+ xor %eax, %eax
+ ret
+# endif
+# else
+ .p2align 4
+L(return_not_equals):
+ mov $1, %eax
+ ret
L(unreal_case):
xor %eax, %eax
ret
--
2.25.1
next prev parent reply other threads:[~2021-09-14 6:33 UTC|newest]
Thread overview: 51+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-09-13 23:05 [PATCH 1/5] x86_64: Add support for bcmp using sse2, sse4_1, avx2, and evex Noah Goldstein via Libc-alpha
2021-09-13 23:05 ` [PATCH 2/5] x86_64: Add sse2 optimized bcmp implementation in memcmp.S Noah Goldstein via Libc-alpha
2021-09-13 23:05 ` [PATCH 3/5] x86_64: Add sse4_1 optimized bcmp implementation in memcmp-sse4.S Noah Goldstein via Libc-alpha
2021-09-13 23:05 ` [PATCH 4/5] x86_64: Add avx2 optimized bcmp implementation in bcmp-avx2.S Noah Goldstein via Libc-alpha
2021-09-13 23:05 ` [PATCH 5/5] x86_64: Add evex optimized bcmp implementation in bcmp-evex.S Noah Goldstein via Libc-alpha
2021-09-14 1:18 ` Carlos O'Donell via Libc-alpha
2021-09-14 2:05 ` Noah Goldstein via Libc-alpha
2021-09-14 2:35 ` Carlos O'Donell via Libc-alpha
2021-09-14 2:55 ` DJ Delorie via Libc-alpha
2021-09-14 3:24 ` Noah Goldstein via Libc-alpha
2021-09-14 3:40 ` Noah Goldstein via Libc-alpha
2021-09-14 4:21 ` DJ Delorie via Libc-alpha
2021-09-14 5:29 ` Noah Goldstein via Libc-alpha
2021-09-14 5:42 ` DJ Delorie via Libc-alpha
2021-09-14 5:55 ` Noah Goldstein via Libc-alpha
2021-09-13 23:22 ` [PATCH 1/5] x86_64: Add support for bcmp using sse2, sse4_1, avx2, and evex Noah Goldstein via Libc-alpha
2021-09-14 6:30 ` [PATCH v2 " Noah Goldstein via Libc-alpha
2021-09-14 6:30 ` [PATCH v2 2/5] x86_64: Add sse2 optimized bcmp implementation in memcmp.S Noah Goldstein via Libc-alpha
2021-09-14 6:30 ` Noah Goldstein via Libc-alpha [this message]
2021-09-14 6:30 ` [PATCH v2 4/5] x86_64: Add avx2 optimized bcmp implementation in bcmp-avx2.S Noah Goldstein via Libc-alpha
2021-09-14 6:30 ` [PATCH v2 5/5] x86_64: Add evex optimized bcmp implementation in bcmp-evex.S Noah Goldstein via Libc-alpha
2021-09-14 14:40 ` [PATCH v2 1/5] x86_64: Add support for bcmp using sse2, sse4_1, avx2, and evex H.J. Lu via Libc-alpha
2021-09-14 19:23 ` Noah Goldstein via Libc-alpha
2021-09-14 20:30 ` Florian Weimer via Libc-alpha
2021-09-15 0:00 ` [PATCH " Joseph Myers
2021-09-15 13:37 ` Zack Weinberg via Libc-alpha
2021-09-15 14:01 ` Re: [PATCH 1/5] x86_64: Add support for bcmp using sse2, sse4_1, " Florian Weimer via Libc-alpha
2021-09-15 18:06 ` Noah Goldstein via Libc-alpha
2021-09-15 18:30 ` Joseph Myers
2021-09-27 1:35 ` Noah Goldstein via Libc-alpha
2021-09-27 7:29 ` Florian Weimer via Libc-alpha
2021-09-27 16:49 ` Noah Goldstein via Libc-alpha
2021-09-27 16:54 ` Florian Weimer via Libc-alpha
2021-09-27 17:54 ` Noah Goldstein via Libc-alpha
2021-09-27 17:56 ` Florian Weimer via Libc-alpha
2021-09-27 18:05 ` Noah Goldstein via Libc-alpha
2021-09-27 18:10 ` Florian Weimer via Libc-alpha
2021-09-27 18:15 ` Noah Goldstein via Libc-alpha
2021-09-27 18:22 ` Florian Weimer via Libc-alpha
2021-09-27 18:34 ` Noah Goldstein via Libc-alpha
2021-09-27 18:56 ` Florian Weimer via Libc-alpha
2021-09-27 19:20 ` Noah Goldstein via Libc-alpha
2021-09-27 19:34 ` Florian Weimer via Libc-alpha
2021-09-27 19:43 ` Noah Goldstein via Libc-alpha
2021-09-27 19:59 ` Florian Weimer via Libc-alpha
2021-09-27 20:22 ` Noah Goldstein via Libc-alpha
2021-09-27 20:24 ` Florian Weimer via Libc-alpha
2021-09-27 20:38 ` Noah Goldstein via Libc-alpha
2021-09-28 0:07 ` Noah Goldstein via Libc-alpha
2021-09-27 17:42 ` Joseph Myers
2021-09-27 17:48 ` Noah Goldstein via Libc-alpha
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://www.gnu.org/software/libc/involved.html
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210914063039.1126196-3-goldstein.w.n@gmail.com \
--to=libc-alpha@sourceware.org \
--cc=goldstein.w.n@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).